|
| View previous topic :: View next topic |
| Author |
Message |
mkcolg
Joined: 30 Jun 2004 Posts: 4996 Location: The Portland Group Inc.
|
Posted: Fri Dec 14, 2012 2:39 pm Post subject: |
|
|
Hi bofang,
Odd. For some reason the runtime can't find the device copy of temp2. I have filed a problem report (TPR#19044) and sent it to our compiler engineers. The workaround is to add a data region:
| Code: | % cat test.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T 1024*1024
/*
 * Returns a pseudo-random double obtained by scaling one rand() draw
 * linearly onto the interval from fMin to fMax (both ends reachable).
 */
double fRand(double fMin, double fMax)
{
    const double unit = (double)rand() / RAND_MAX;  /* in [0, 1] */
    const double width = fMax - fMin;

    return fMin + unit * width;
}
/*
 * Test driver: fills a T-entry 0/1 mask (temp1) and T rows of 32 random
 * doubles (temp2), then accumulates (a+b)*log(c + a*x) over every element
 * of each row whose mask entry is 1, and prints the total.
 *
 * The explicit "acc data" region wrapped around the kernels region is the
 * workaround for the runtime not finding the device copy of temp2.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(void){
    int status = EXIT_FAILURE;
    int rows = 0;                 /* number of temp2 rows allocated so far */
    double a = 1.0;
    double b = 2.0;
    double c = 3.0;
    double reduction_1 = 0;
    int *temp1 = calloc(T, sizeof *temp1);
    double **restrict temp2 = calloc(T, sizeof *temp2);

    if (temp1 == NULL || temp2 == NULL)
        goto cleanup;
    for (; rows < T; rows++) {
        temp2[rows] = calloc(32, sizeof **temp2);
        if (temp2[rows] == NULL)
            goto cleanup;
    }

    srand((unsigned)time(NULL));
    for (int i = 0; i < T; i++) {
        temp1[i] = rand() % 2;
    }
    for (int i = 0; i < T; i++) {
        for (int j = 0; j < 32; j++) {
            temp2[i][j] = fRand(1, 10);
        }
    }

    /* OpenMP alternative (disabled): #pragma omp parallel for reduction(+:reduction_1) */
#pragma acc data copyin(temp1[0:T],temp2[0:T][0:32])
    {
#pragma acc kernels
        for (int n = 0; n < T; n++) {
            double tem = 0;
            if (temp1[n] == 1) {
                for (int f = 0; f < 32; f++)
                    tem += (a+b)*log(c+a*temp2[n][f]);
            }
            reduction_1 += tem;
        }
    }
    printf("Result is %f\n", reduction_1);
    status = 0;   /* the original returned 1, which reports failure to the shell */

cleanup:
    if (status != 0)
        fputs("test: out of memory\n", stderr);
    /* rows is 0 when temp2 itself could not be allocated, so this is safe. */
    for (int i = 0; i < rows; i++)
        free(temp2[i]);
    free(temp2);
    free(temp1);
    return status;
}
% pgcc -Minfo=accel test.c -V12.9 -acc
main:
38, Generating copyin(temp2[0:1048576][0:32])
Generating copyin(temp1[0:1048576])
40, Generating present_or_copyin(temp2[0:1048576][0:32])
Generating present_or_copyin(temp1[0:1048576])
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
41, Loop is parallelizable
Accelerator kernel generated
41, #pragma acc loop gang /* blockIdx.x */
CC 1.3 : 31 registers; 96 shared, 52 constant, 0 local memory bytes
CC 2.0 : 28 registers; 32 shared, 104 constant, 0 local memory bytes
44, #pragma acc loop vector(32) /* threadIdx.x */
47, Sum reduction generated for reduction_1
44, Loop is parallelizable
% a.out
Result is 105127285.494059
Accelerator Kernel Timing data
./test.c
main
40: region entered 1 time
time(us): total=32,520
kernels=25,658
41: kernel launched 1 times
grid: [65535] block: [32]
time(us): total=25,552 max=25,552 min=25,552 avg=25,552
47: kernel launched 1 times
grid: [1] block: [256]
time(us): total=106 max=106 min=106 avg=106
/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c
__pgi_cu_fill
26: region entered 1 time
time(us): total=32,227
kernels=83
27: kernel launched 1 times
grid: [8192] block: [128]
time(us): total=83 max=83 min=83 avg=83
./test.c
main
38: region entered 1 time
time(us): total=5,909,665 init=67,314 region=5,842,351
data=5,773,474
w/o init: total=5,842,351 max=5,842,351 min=5,842,351 avg=5,842,351
|
One thing to note is that your performance will be very poor since it takes almost 6 seconds to copy your data to the GPU (at least on my system). The problem being that data needs to be copied in contiguous blocks so your temp2 can't be copied in one large chunk. Instead, you have 1048576 copies of 32 elements each. To fix, you should move to using a 1-D array. For example:
| Code: | % cat test2.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T 1024*1024
#define TT 32
#ifdef _OPENACC
#include <accelmath.h>
#endif
/*
 * Maps one rand() draw onto the closed interval between fMin and fMax
 * and returns it.
 */
double fRand(double fMin, double fMax)
{
    double sample = rand();

    sample /= RAND_MAX;   /* normalise to [0, 1] */
    return fMin + sample * (fMax - fMin);
}
/*
 * Same computation as the 2-D version, but temp2 is a single contiguous
 * array of T*TT doubles indexed as temp2[row*TT + col], so the copyin
 * clause names one contiguous range instead of T separate rows.
 *
 * Sums (a+b)*log(c + a*x) over every element of each row whose temp1
 * entry is 1 and prints the total.
 *
 * Returns 0 on success, EXIT_FAILURE if an allocation fails.
 */
int main(void){
    int *temp1 = calloc(T, sizeof *temp1);
    double *restrict temp2 = calloc(T*TT, sizeof *temp2);

    if (temp1 == NULL || temp2 == NULL) {
        fputs("test2: out of memory\n", stderr);
        free(temp2);
        free(temp1);
        return EXIT_FAILURE;
    }

    srand((unsigned)time(NULL));
    for (int i = 0; i < T; i++) {
        temp1[i] = rand() % 2;
    }
    for (int i = 0; i < T; i++) {
        for (int j = 0; j < TT; j++) {
            temp2[i*TT+j] = fRand(1, 10);
        }
    }

    double a = 1.0;
    double b = 2.0;
    double c = 3.0;
    double reduction_1 = 0;
    /* OpenMP alternative (disabled): #pragma omp parallel for reduction(+:reduction_1) */
#pragma acc data copyin(a,b,c,temp1[0:T],temp2[0:T*TT])
    {
        reduction_1 = 0.0;   /* re-zeroed inside the data region, as in the original listing */
#pragma acc kernels loop
        for (int n = 0; n < T; n++) {
            double tem = 0;
            if (temp1[n] == 1) {
                for (int f = 0; f < TT; f++)
                    tem += (a+b)*log(c+a*temp2[n*TT+f]);
            }
            reduction_1 += tem;
        }
    }
    printf("Result is %f\n", reduction_1);

    free(temp2);
    free(temp1);
    return 0;   /* the original returned 1, which reports failure to the shell */
}
% pgcc -Minfo=accel test2.c -V12.9 -acc
main:
44, Generating copyin(temp2[0:33554432])
Generating copyin(temp1[0:1048576])
Generating copyin(c)
Generating copyin(b)
Generating copyin(a)
47, Generating present_or_copyin(temp2[0:33554432])
Generating present_or_copyin(temp1[0:1048576])
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
48, Loop is parallelizable
Accelerator kernel generated
48, #pragma acc loop gang /* blockIdx.x */
CC 1.3 : 33 registers; 96 shared, 52 constant, 0 local memory bytes
CC 2.0 : 30 registers; 32 shared, 104 constant, 0 local memory bytes
51, #pragma acc loop vector(32) /* threadIdx.x */
54, Sum reduction generated for reduction_1
51, Loop is parallelizable
% a.out
Result is 105064474.847449
Accelerator Kernel Timing data
./test2.c
main
47: region entered 1 time
time(us): total=119,004
kernels=25,406
48: kernel launched 1 times
grid: [65535] block: [32]
time(us): total=25,300 max=25,300 min=25,300 avg=25,300
54: kernel launched 1 times
grid: [1] block: [256]
time(us): total=106 max=106 min=106 avg=106
./test2.c
main
44: region entered 1 time
time(us): total=280,812 init=98,856 region=181,956
data=58,649
w/o init: total=181,956 max=181,956 min=181,956 avg=181,956
|
With this change, the data transfer time goes from 5,773,474 us down to 58,649 us (the profile reports times in microseconds).
- Mat |
|
| Back to top |
|
 |
bofang
Joined: 06 Dec 2012 Posts: 6
|
Posted: Fri Dec 14, 2012 4:22 pm Post subject: |
|
|
Hi Mat,
Those are very good suggestions! Thank you.
One interesting thing is that after I applied all tricks you mentioned in the code, here is my time profile:
| Code: |
Accelerator Kernel Timing data
/home/bo/test1.c
main
42: region entered 1 time
time(us): total=28,935 init= region=28,934
kernels=27,537
w/o init: total=28,934 max=28,934 min=28,934 avg=28,934
43: kernel launched 1 times
grid: [65535] block: [32]
time(us): total=27,402 max=27,402 min=27,402 avg=27,402
49: kernel launched 1 times
grid: [1] block: [256]
time(us): total=135 max=135 min=135 avg=135
/home/bo/test1.c
main
39: region entered 1 time
time(us): total=6,079,576 init=5,953,711 region=125,865
data=90,984
w/o init: total=125,865 max=125,865 min=125,865 avg=125,865
|
As you can see, the data transfer time is 90,984 us, which is OK. But the init time is 5,953,711 us, which is quite large compared to your 98,856 us. Any ideas?
Thanks a lot!
Bo |
|
| Back to top |
|
 |
mkcolg
Joined: 30 Jun 2004 Posts: 4996 Location: The Portland Group Inc.
|
Posted: Fri Dec 14, 2012 5:19 pm Post subject: |
|
|
Hi Bo,
Linux will power down the GPU when it is not in use, and it takes about 1 second per device to start it back up. You can run the PGI utility pgcudainit as a background task to hold the devices open so that you minimize the initialization cost.
Hope this helps,
Mat |
|
| Back to top |
|
 |
bofang
Joined: 06 Dec 2012 Posts: 6
|
Posted: Sat Dec 15, 2012 10:47 pm Post subject: |
|
|
Oh, that's good to know.
Thank you
Bo |
|
| Back to top |
|
 |
bo_fang
Joined: 21 Dec 2012 Posts: 2
|
Posted: Fri Dec 21, 2012 1:42 pm Post subject: |
|
|
Hi Mat,
I found an interesting thing here:
| Code: |
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#define T 4756
/**
 * Starts the stopwatch by storing the current time of day in it.
 * @param[out] stopwatch receives the current gettimeofday() time, in milliseconds
 */
void stopwatch_start(double* stopwatch) {
    struct timeval tval;
    gettimeofday(&tval, NULL);
    /* Multiply in double: tv_sec * 1000 in integer arithmetic overflows
     * on platforms where time_t is only 32 bits wide. */
    *stopwatch = tval.tv_sec * 1000.0 + tval.tv_usec / 1000.0;
}
/**
 * Returns the elapsed time since the stopwatch was started via stopwatch_start.
 * @param[in] stopwatch the stopwatch handle (start time in milliseconds)
 * @return elapsed time in milliseconds
 */
double stopwatch_elapsed(double* stopwatch) {
    struct timeval tval;
    gettimeofday(&tval, NULL);
    /* Multiply in double: tv_sec * 1000 in integer arithmetic overflows
     * on platforms where time_t is only 32 bits wide. */
    const double now_ms = tval.tv_sec * 1000.0 + tval.tv_usec / 1000.0;
    return now_ms - *stopwatch;
}
/*
 * Produces a pseudo-random double between fMin and fMax by scaling a
 * single rand() result.
 */
double fRand(double fMin, double fMax)
{
    const double fraction = (double)rand() / RAND_MAX;
    const double offset = fraction * (fMax - fMin);

    return fMin + offset;
}
/*
 * Timing driver: repeats len1 times the sequence
 *   1. fill temp1 with an alternating 0/1 mask and temp2 with 1.5,
 *   2. open an OpenACC data region copying both arrays in,
 *   3. run two kernels regions that add (a+b)*log(c + a*x) over the rows
 *      whose mask is 1 and then over the rows whose mask is 0,
 * and finally prints the accumulated sum and the elapsed wall-clock time
 * in milliseconds.
 *
 * The host-side fills are deliberately left inside the outer loop so the
 * measured work matches the original listing.
 *
 * Returns 0 on success, EXIT_FAILURE if an allocation fails.
 */
int main(void){
    const unsigned int len1 = 3182;
    int *temp1 = calloc(T, sizeof *temp1);
    /* Element size is sizeof(double); the original passed sizeof(double *),
     * which under-allocates wherever pointers are narrower than double. */
    double *temp2 = calloc(T*32, sizeof *temp2);

    if (temp1 == NULL || temp2 == NULL) {
        fputs("test1: out of memory\n", stderr);
        free(temp2);
        free(temp1);
        return EXIT_FAILURE;
    }

    double a = 1.0;
    double b = 2.0;
    double c = 3.0;
    double reduction_1 = 0;
    double timer;
    stopwatch_start(&timer);

    /* high level loop; the counter is unsigned to match len1 */
    for (unsigned int r_est = 0; r_est < len1; r_est++) {
        for (int i = 0; i < T; i++) {
            temp1[i] = (i % 2 == 0) ? 0 : 1;
        }
        for (int i = 0; i < T; i++) {
            for (int j = 0; j < 32; j++) {
                temp2[i*32+j] = 1.5;
            }
        }
#pragma acc data copyin(temp1[0:T],temp2[0:T*32])
        {
#pragma acc kernels
            for (int n = 0; n < T; n++) {
                double tem = 0;
                if (temp1[n] == 1) {
                    for (int f = 0; f < 32; f++)
                        tem += (a+b)*log(c+a*temp2[n*32+f]);
                }
                reduction_1 += tem;
            }
#pragma acc kernels
            for (int n = 0; n < T; n++) {
                double tem = 0;
                if (temp1[n] == 0) {
                    for (int f = 0; f < 32; f++)
                        tem += (a+b)*log(c+a*temp2[n*32+f]);
                }
                reduction_1 += tem;
            }
        }
    }
    printf("Result is %f, time is %f\n",reduction_1,stopwatch_elapsed(&timer));

    /* Freed after the timing is printed so cleanup is not measured. */
    free(temp2);
    free(temp1);
    return 0;   /* the original returned 1, which reports failure to the shell */
}
|
If I compile the above code with g++, it needs 22 s to run on my machine. If I compile it with pgcc, it takes 8 s to finish. If I compile it with pgcc + OpenACC, it takes 7.5 s to finish. I am wondering how pgcc optimizes the code so that it achieves such good performance. I have attached my OpenACC timing below. 50% of the time is spent in initialization, even though I kept pgcudainit running in the background. Is there anything I can do to minimize that time? Thank you
Please note that I commented out the OpenACC pragmas when I compiled it with g++ and pgcc.
| Code: |
Accelerator Kernel Timing data
/home/bo/test1.c
main
78: region entered 3182 times
time(us): total=630,282 init=216 region=630,066
kernels=493,211
w/o init: total=630,066 max=329 min=196 avg=198
79: kernel launched 3182 times
grid: [4756] block: [32]
time(us): total=453,832 max=149 min=142 avg=142
85: kernel launched 3182 times
grid: [1] block: [256]
time(us): total=39,379 max=19 min=12 avg=12
/home/bo/test1.c
main
69: region entered 3182 times
time(us): total=656,918 init=479 region=656,439
kernels=509,233
w/o init: total=656,439 max=1,105 min=204 avg=206
70: kernel launched 3182 times
grid: [4756] block: [32]
time(us): total=458,228 max=157 min=143 avg=144
76: kernel launched 3182 times
grid: [1] block: [256]
time(us): total=51,005 max=30 min=15 avg=16
/home/bo/test1.c
main
67: region entered 3182 times
time(us): total=7,424,109 init=3,092,275 region=4,331,834
data=2,019,706
w/o init: total=4,331,834 max=4,352 min=1,313 avg=1,361
|
|
| Back to top |
|
 |
|
|
You cannot post new topics in this forum You cannot reply to topics in this forum You cannot edit your posts in this forum You cannot delete your posts in this forum You cannot vote in polls in this forum
|
Powered by phpBB © 2001, 2002 phpBB Group
|