PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

cuModuleGetGlobal error
Goto page Previous  1, 2, 3  Next
 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
mkcolg



Joined: 30 Jun 2004
Posts: 5952
Location: The Portland Group Inc.

PostPosted: Fri Dec 14, 2012 2:39 pm    Post subject: Reply with quote

Hi bofang,

Odd. For some reason the runtime can't find the device copy of temp2. I have filed a problem report (TPR#19044) and sent it to our compiler engineers. The workaround is to add a data region:

Code:
% cat test.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T (1024*1024)   /* element count of temp1 / row count of temp2; parenthesized so T stays one value inside larger expressions */


double fRand(double fMin, double fMax)
{   /* Maps a single rand() draw linearly onto the closed range [fMin, fMax]. */
    const double span = fMax - fMin;
    return fMin + ((double)rand() / RAND_MAX) * span;
}

int main(void){
    /* Sums (a+b)*log(c+a*temp2[n][f]) over every row n whose flag temp1[n] is 1, then prints the total. */
    int *temp1 = (int *)calloc(T, sizeof *temp1);
    double **restrict temp2 = (double **)calloc(T, sizeof *temp2);
    if (temp1 == NULL || temp2 == NULL) { perror("calloc"); return EXIT_FAILURE; }
    for (int i = 0; i < T; i++) {
        temp2[i] = (double *)calloc(32, sizeof **temp2);   /* every row is its own allocation */
        if (temp2[i] == NULL) { perror("calloc"); return EXIT_FAILURE; }
    }
    srand(time(NULL));
    for (int i = 0; i < T; i++) {
        temp1[i] = rand() % 2;   /* 0/1 flag per row; only rows flagged 1 contribute to the sum */
        // printf("Temp1  is %d\n",temp1[i]);
    }
    for (int i = 0; i < T; i++) {
        for (int j = 0; j < 32; j++) {
            temp2[i][j] = fRand(1, 10);
        }
    }
    double a = 1.0;
    double b = 2.0;
    double c = 3.0;
    double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)
#pragma acc data copyin(temp1[0:T],temp2[0:T][0:32])
    {   /* explicit data region enclosing the kernels region: both arrays are copied in before the compute loop */
#pragma acc kernels
        for (int n = 0; n < T; n++) {
            double tem = 0;   /* partial sum for row n */
            if (temp1[n] == 1) {
                for (int f = 0; f < 32; f++)
                    tem += (a+b)*log(c+a*temp2[n][f]);
            }
            reduction_1 += tem;
        }
    }
    printf("Result is %f\n",reduction_1);
    /* Release the per-row buffers first, then the row table and the flag array. */
    for (int i = 0; i < T; i++) {
        free(temp2[i]);
    }
    free(temp2);
    free(temp1);
    return 0;   /* 0 reports success; allocation failures above return EXIT_FAILURE */
}
% pgcc -Minfo=accel test.c -V12.9 -acc
main:
     38, Generating copyin(temp2[0:1048576][0:32])
         Generating copyin(temp1[0:1048576])
     40, Generating present_or_copyin(temp2[0:1048576][0:32])
         Generating present_or_copyin(temp1[0:1048576])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     41, Loop is parallelizable
         Accelerator kernel generated
         41, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 31 registers; 96 shared, 52 constant, 0 local memory bytes
             CC 2.0 : 28 registers; 32 shared, 104 constant, 0 local memory bytes
         44, #pragma acc loop vector(32) /* threadIdx.x */
         47, Sum reduction generated for reduction_1
     44, Loop is parallelizable
% a.out
Result is 105127285.494059

Accelerator Kernel Timing data
./test.c
  main
    40: region entered 1 time
        time(us): total=32,520
                  kernels=25,658
        41: kernel launched 1 times
            grid: [65535]  block: [32]
            time(us): total=25,552 max=25,552 min=25,552 avg=25,552
        47: kernel launched 1 times
            grid: [1]  block: [256]
            time(us): total=106 max=106 min=106 avg=106
/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c
  __pgi_cu_fill
    26: region entered 1 time
        time(us): total=32,227
                  kernels=83
        27: kernel launched 1 times
            grid: [8192]  block: [128]
            time(us): total=83 max=83 min=83 avg=83
./test.c
  main
    38: region entered 1 time
        time(us): total=5,909,665 init=67,314 region=5,842,351
                  data=5,773,474
        w/o init: total=5,842,351 max=5,842,351 min=5,842,351 avg=5,842,351



One thing to note is that your performance will be very poor since it takes almost 6 seconds to copy your data to the GPU (at least on my system). The problem being that data needs to be copied in contiguous blocks so your temp2 can't be copied in one large chunk. Instead, you have 1048576 copies of 32 elements each. To fix, you should move to using a 1-D array. For example:

Code:
% cat test2.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T (1024*1024)   /* number of rows; parenthesized so T stays one value inside larger expressions */
#define TT 32

#ifdef _OPENACC
#include <accelmath.h>
#endif

double fRand(double fMin, double fMax)
{   /* Maps a single rand() draw linearly onto the closed range [fMin, fMax]. */
    const double span = fMax - fMin;
    return fMin + ((double)rand() / RAND_MAX) * span;
}

int main(void){
    /* Same sum as the 2-D version, with temp2 stored as one flat array of T*TT doubles. */
    int *temp1 = (int *)calloc(T, sizeof *temp1);
    double *restrict temp2 = (double *)calloc(T*TT, sizeof *temp2);
    if (temp1 == NULL || temp2 == NULL) { perror("calloc"); return EXIT_FAILURE; }
    /* temp2 is a single contiguous block; row i starts at index i*TT.
     * One allocation (instead of one calloc per row) is what allows the
     * copyin clause below to describe the array as one range, temp2[0:T*TT].
     */

    srand(time(NULL));
    for (int i = 0; i < T; i++) {
        temp1[i] = rand() % 2;   /* 0/1 flag per row; only rows flagged 1 contribute to the sum */
        // printf("Temp1  is %d\n",temp1[i]);
    }
    for (int i = 0; i < T; i++) {
        for (int j = 0; j < TT; j++) {
            temp2[i*TT+j] = fRand(1, 10);
        }
    }
    double a = 1.0;
    double b = 2.0;
    double c = 3.0;
    double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)

#pragma acc data copyin(a,b,c,temp1[0:T],temp2[0:T*TT])
    {
        reduction_1 = 0.0;
#pragma acc kernels loop
        for (int n = 0; n < T; n++) {
            double tem = 0;   /* partial sum for row n */
            if (temp1[n] == 1) {
                for (int f = 0; f < TT; f++)
                    tem += (a+b)*log(c+a*temp2[n*TT+f]);
            }
            reduction_1 += tem;
        }
    }
    printf("Result is %f\n",reduction_1);
    free(temp2);
    free(temp1);
    return 0;   /* 0 reports success; allocation failure above returns EXIT_FAILURE */
}

% pgcc -Minfo=accel test2.c -V12.9 -acc
main:
     44, Generating copyin(temp2[0:33554432])
         Generating copyin(temp1[0:1048576])
         Generating copyin(c)
         Generating copyin(b)
         Generating copyin(a)
     47, Generating present_or_copyin(temp2[0:33554432])
         Generating present_or_copyin(temp1[0:1048576])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     48, Loop is parallelizable
         Accelerator kernel generated
         48, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 33 registers; 96 shared, 52 constant, 0 local memory bytes
             CC 2.0 : 30 registers; 32 shared, 104 constant, 0 local memory bytes
         51, #pragma acc loop vector(32) /* threadIdx.x */
         54, Sum reduction generated for reduction_1
     51, Loop is parallelizable

% a.out
Result is 105064474.847449

Accelerator Kernel Timing data
./test2.c
  main
    47: region entered 1 time
        time(us): total=119,004
                  kernels=25,406
        48: kernel launched 1 times
            grid: [65535]  block: [32]
            time(us): total=25,300 max=25,300 min=25,300 avg=25,300
        54: kernel launched 1 times
            grid: [1]  block: [256]
            time(us): total=106 max=106 min=106 avg=106
./test2.c
  main
    44: region entered 1 time
        time(us): total=280,812 init=98,856 region=181,956
                  data=58,649
        w/o init: total=181,956 max=181,956 min=181,956 avg=181,956


With this change, the data transfer time goes from 5,773,474 us down to 58,649 us (the profile reports times in microseconds).

- Mat
Back to top
View user's profile
bofang



Joined: 06 Dec 2012
Posts: 6

PostPosted: Fri Dec 14, 2012 4:22 pm    Post subject: Reply with quote

Hi Mat,

Those are very good suggestions! Thank you.

One interesting thing is that after I applied all the tricks you mentioned to the code, here is my time profile:
Code:

Accelerator Kernel Timing data
/home/bo/test1.c
  main
    42: region entered 1 time
        time(us): total=28,935 init= region=28,934
                  kernels=27,537
        w/o init: total=28,934 max=28,934 min=28,934 avg=28,934
        43: kernel launched 1 times
            grid: [65535]  block: [32]
            time(us): total=27,402 max=27,402 min=27,402 avg=27,402
        49: kernel launched 1 times
            grid: [1]  block: [256]
            time(us): total=135 max=135 min=135 avg=135
/home/bo/test1.c
  main
    39: region entered 1 time
        time(us): total=6,079,576 init=5,953,711 region=125,865
                  data=90,984
        w/o init: total=125,865 max=125,865 min=125,865 avg=125,865


As you can see, the data transfer time is 90,984, which is OK. But the init time is 5,953,711, which is quite large compared to your 98,856. Any ideas?

Thanks a lot!

Bo
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 5952
Location: The Portland Group Inc.

PostPosted: Fri Dec 14, 2012 5:19 pm    Post subject: Reply with quote

Hi Bo,

Linux will power down the GPU when it is not in use, and it takes about 1 second per device to start it back up. You can run the PGI utility pgcudainit as a background task to hold the devices open so that you minimize the initialization cost.

Hope this helps,
Mat
Back to top
View user's profile
bofang



Joined: 06 Dec 2012
Posts: 6

PostPosted: Sat Dec 15, 2012 10:47 pm    Post subject: Reply with quote

Oh, that's good to know.

Thank you

Bo
Back to top
View user's profile
bo_fang



Joined: 21 Dec 2012
Posts: 2

PostPosted: Fri Dec 21, 2012 1:42 pm    Post subject: Reply with quote

Hi Mat,

I found an interesting thing here:
Code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#define T 4756

void stopwatch_start(double* stopwatch) {   /* stores the current gettimeofday() time, in milliseconds, in *stopwatch */
  struct timeval tval;
  gettimeofday(&tval, NULL);
  *stopwatch = (tval.tv_sec * 1000.0 + tval.tv_usec/1000.0);  /* 1000.0: multiply in double, so a narrow tv_sec type cannot overflow */
}

/**
 * Returns the time elapsed since the stopwatch was started via stopwatch_start.
 * @param[in] stopwatch the stopwatch handle (start time in milliseconds)
 * @return elapsed time in milliseconds
 */
double stopwatch_elapsed(double* stopwatch) {
  struct timeval tval;
  gettimeofday(&tval, NULL);
  return ((tval.tv_sec * 1000.0 + tval.tv_usec/1000.0) - *stopwatch);  /* 1000.0: multiply in double, so a narrow tv_sec type cannot overflow */
}


double fRand(double fMin, double fMax)
{   /* Maps a single rand() draw linearly onto the closed range [fMin, fMax]. */
    const double span = fMax - fMin;
    return fMin + ((double)rand() / RAND_MAX) * span;
}

int main(void){
    unsigned int len1 = 3182;   /* number of times the fill + offload pass below is repeated */
    int *temp1 = (int *)calloc(T, sizeof *temp1);
    double *temp2 = (double *)calloc(T*32, sizeof *temp2);   /* T rows of 32 doubles, stored flat */
    if (temp1 == NULL || temp2 == NULL) { perror("calloc"); return EXIT_FAILURE; }
    /* The element size comes from *temp2 (a double). Sizing the array with
     * sizeof(double *) only gives the right byte count on targets where a
     * pointer and a double happen to have the same size.
     */
    double a = 1.0;
    double b = 2.0;
    double c = 3.0;
    double reduction_1 = 0;

    double timer = 0.0;
    stopwatch_start(&timer);
    // high level loop
    for (unsigned int r_est = 0; r_est < len1; r_est++) {   /* unsigned index matches len1's type */
        /* Both inputs are refilled with the same values on every pass. */

        for (int i = 0; i < T; i++) {
            if (i % 2 == 0) {
                temp1[i] = 0;
            } else {
                temp1[i] = 1;
            }
            // printf("Temp1  is %d\n",temp1[i]);
        }
        for (int i = 0; i < T; i++) {
            for (int j = 0; j < 32; j++) {
                temp2[i*32+j] = 1.5;
            }
        }

#pragma acc data  copyin(temp1[0:T],temp2[0:T*32])
{// printf("enter %d times\n",r_est);
    #pragma acc kernels
    for (int n = 0; n < T; n++) {   /* first kernel: rows flagged 1 */
        double tem = 0;
        if (temp1[n] == 1) {
            for (int f = 0; f < 32; f++)
                tem += (a+b)*log(c+a*temp2[n*32+f]);
        }
        reduction_1 += tem;
    }
    #pragma acc kernels
    for (int n = 0; n < T; n++) {   /* second kernel: rows flagged 0 */
        double tem = 0;
        if (temp1[n] == 0) {
            for (int f = 0; f < 32; f++)
                tem += (a+b)*log(c+a*temp2[n*32+f]);
        }
        reduction_1 += tem;
    }


}

    }
    printf("Result is %f,  time is %f\n",reduction_1,stopwatch_elapsed(&timer));
    free(temp2);
    free(temp1);
    return 0;   /* 0 reports success; allocation failure above returns EXIT_FAILURE */
}


If I compile the above code with g++, it takes 22 s to run on my machine. If I compile it with pgcc, it takes 8 s to finish. If I compile it with pgcc + OpenACC, it takes 7.5 s to finish. I am wondering how pgcc optimizes the code so that it gets such good performance? I have attached my OpenACC timing data below. 50% of the time is spent in initialization, even though I kept pgcudainit running in the background. Is there anything I can do to minimize that time? Thank you

Please note that I commented out the OpenACC pragmas when I compiled it with g++ and with plain pgcc.

Code:

Accelerator Kernel Timing data
/home/bo/test1.c
  main
    78: region entered 3182 times
        time(us): total=630,282 init=216 region=630,066
                  kernels=493,211
        w/o init: total=630,066 max=329 min=196 avg=198
        79: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=453,832 max=149 min=142 avg=142
        85: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=39,379 max=19 min=12 avg=12
/home/bo/test1.c
  main
    69: region entered 3182 times
        time(us): total=656,918 init=479 region=656,439
                  kernels=509,233
        w/o init: total=656,439 max=1,105 min=204 avg=206
        70: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=458,228 max=157 min=143 avg=144
        76: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=51,005 max=30 min=15 avg=16
/home/bo/test1.c
  main
    67: region entered 3182 times
        time(us): total=7,424,109 init=3,092,275 region=4,331,834
                  data=2,019,706
        w/o init: total=4,331,834 max=4,352 min=1,313 avg=1,361
[/code]
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Goto page Previous  1, 2, 3  Next
Page 2 of 3

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group