PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

Inlining with pragmas
Goto page Previous  1, 2
 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
mkcolg



Joined: 30 Jun 2004
Posts: 5952
Location: The Portland Group Inc.

PostPosted: Wed Apr 23, 2014 12:57 pm    Post subject: Reply with quote

Quote:
Should I be concerned if when I compile it, I don't get the same messages? I.e. I don't get notified of the reductions in the loops

The loop at line 155 is getting a "gang vector" schedule and the inner loops aren't getting scheduled, hence no reduction is needed. So why is the compiler ignoring the schedule you provided? It's because of the dependency reported in the message "Accelerator restriction: size of the GPU copy of 'rr' is unknown".

The main difference between what you have and what I did is that I was using the C compiler, but you're using C++. C++ doesn't have true VLAs, so "rr[N][N][3]" gets turned into a pointer to a pointer to a fixed size array, while in C this becomes a fixed size array. Fixed size arrays have a known size while pointers don't. To fix this, either use C or linearize rr (see below).

Quote:
source/n_body.cpp", line 65: warning: invalid text in pragma
#pragma acc routine
Sorry, I didn't realize you were using C++. C++ support for "routine" will be added in 14.3 along with several other C++ improvements (like accessing class member variables in compute regions and data clauses, and single dimension VLAs).

Here's the linearized "rr" version of the example:
Code:

% cat test_fixed.cpp

// Flattens the index (i, j, k) of a logical N x N x 3 array into an offset
// into a linear buffer. Expands to an expression that uses a variable named
// N, which must be in scope wherever the macro is used. Arguments are
// parenthesized so that expression arguments (e.g. i+1) expand correctly.
#define IDX3D(i,j,k)  (((i)*N*3)+((j)*3)+(k))

// Reduced OpenACC test case using a linearized scratch array rr in place of
// a 3-D array.
//
// Parameters:
//   r    - not referenced by the active code (only by a commented-out call).
//   v    - N x 3 input; each row is copied into the matching row of ro.
//   ro   - N x 3 output; receives a copy of v for rows 0..N-1.
//   vo   - N x 3 output; row 0 is zeroed unconditionally (so it must have at
//          least one row even when N == 0), then the per-row sums vo0/vo1/vo2
//          are added to rows 0..N-1.
//   mass - N values, read as weights in the accumulation loops.
//   eta  - not referenced by the active code.
//   N    - number of rows.
//
// NOTE: the calls that would compute rp and temp (rij / norm3) are commented
// out, and temp and temp3 are never assigned, so the values accumulated into
// vo are not meaningful in this reduced form.
void rhs(double r[][3], double v[][3], double ro[][3], double vo[][3],double * mass, double * eta, int N)
 {
     // register int m;
     double vo0,vo1,vo2;
#ifdef PGI_144
     double rr[N*N*3],rp[3],temp,temp2,temp3;
#else
     double *rr,rp[3],temp,temp2,temp3;
     // Heap allocation stands in for the variable-length array used above.
     rr = new double[N*N*3];
#endif
     #pragma acc data copyin(N, mass[:N]) create(rr[0:N*N*3])
{
     #pragma acc kernels
     {
         #pragma acc loop independent //for private(rp[:3])
         for (int j = 0; j < N; j++) {
             #pragma acc loop independent
             for (int i = 0; i < N; i++) {
                 // Only entries with i < j are filled in.
                 if (i >= j) continue;
               ///  rij(r,mass,eta,i,j,rp);
               ///  temp = norm3(rp);

                 // rp[0]*temp^3
                 // Each component of rp goes to its own slot (0, 1, 2);
                 // the loops below read all three components back.
                 rr[IDX3D(i,j,0)] = (1/temp)*rp[0];
                 rr[IDX3D(i,j,1)] = (1/temp)*rp[1];
                 rr[IDX3D(i,j,2)] = (1/temp)*rp[2];
             }
         }
     }


///     A3(ro[0],v[0]);
     vo[0][0] = 0; vo[0][1] = 0; vo[0][2] = 0;
     //#pragma acc kernels copyin(mass[:N])
     //{
      #pragma acc kernels loop independent copy(vo[0:N][0:3], ro[0:N][0:3])
      for (int i = 0; i < N; i++) {
             ro[i][0] = v[i][0];//A3(ro[i],v[i]);
             vo0 = (temp*mass[0])*rr[IDX3D(0,i,0)];//A3(vo[i],(eta[i]/eta[i-1]*mass[0])*rr[0][i]);
             ro[i][1] = v[i][1];//A3(ro[i],v[i]);
             vo1 = (temp*mass[0])*rr[IDX3D(0,i,1)];
             ro[i][2] = v[i][2];//A3(ro[i],v[i]);
             vo2 = (temp*mass[0])*rr[IDX3D(0,i,2)];

             // Contributions from j in [1, i-1].
             #pragma acc loop reduction(+:vo0,vo1,vo2) independent
             for (int j = 1; j < N; j++) {
                 if (j > i-1) continue;
                 vo0 += (temp*mass[j])*rr[IDX3D(j,i,0)];//A3P(vo[i],temp*mass[j]*rr[j][i]);
                 vo1 += (temp*mass[j])*rr[IDX3D(j,i,1)];
                 vo2 += (temp*mass[j])*rr[IDX3D(j,i,2)];
             }
             // Contributions from j in [i+1, N-1].
             #pragma acc loop reduction(+:vo0,vo1,vo2) independent
             for (int j=i+1; j<=N-1; j++) {
                 vo0 += -(mass[j])*(rr[IDX3D(j,i,0)]); //A3P(vo[i],-mass[j]*rr[i][j]);
                 vo1 += -(mass[j])*(rr[IDX3D(j,i,1)]);
                 vo2 += -(mass[j])*(rr[IDX3D(j,i,2)]);
             }
             // Pairwise contributions for j in [0, i-1], k in [i+1, N-1].
             #pragma acc loop reduction(+:vo0,vo1,vo2) independent
             for (int j=0; j <= i-1; j++) {
             #pragma acc loop reduction(+:vo0,vo1,vo2) independent
                 for (int k = i+1; k <= N-1; k++) {
                     vo0 += (mass[k]*mass[j]*temp3)*(rr[IDX3D(j,i,0)]);//A3P(vo[i],mass[j]*mass[k]*temp*rr[j][k]);
                     vo1 += (mass[k]*mass[j]*temp3)*(rr[IDX3D(j,i,1)]);
                     vo2 += (mass[k]*mass[j]*temp3)*(rr[IDX3D(j,i,2)]);
                 }
             }
             vo[i][0] +=  vo0;
             vo[i][1] +=  vo1;
             vo[i][2] +=  vo2;
        }
    }
#ifndef PGI_144
   // rr was allocated with new[], so it must be released with delete[].
   delete[] rr;
#endif

 }


% pgcpp -acc -Minfo -V14.3 test_fixed.cpp -Msafeptr -c -w
rhs(double (*)[3], double (*)[3], double (*)[3], double (*)[3], double *, double *, int):
     14, Generating copyin(N)
         Generating copyin(mass[:N])
         Generating create(rr[:(N*N)*3])
     16, Generating present_or_copyin(rp[:])
         Generating NVIDIA code
     18, Loop is parallelizable
     20, Loop is parallelizable
         Accelerator kernel generated
         18, #pragma acc loop gang /* blockIdx.y */
         20, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     35, Generating copy(vo[:N][:])
         Generating copy(ro[:N][:])
         Generating present_or_copyin(v[:N][:])
         Generating NVIDIA code
     39, Loop is parallelizable
         Accelerator kernel generated
         39, #pragma acc loop gang /* blockIdx.x */
         48, #pragma acc loop vector(128) /* threadIdx.x */
             Sum reduction generated for vo0
             Sum reduction generated for vo1
             Sum reduction generated for vo2
         55, #pragma acc loop vector(128) /* threadIdx.x */
             Sum reduction generated for vo0
             Sum reduction generated for vo1
             Sum reduction generated for vo2
         61, Sum reduction generated for vo0
             Sum reduction generated for vo1
             Sum reduction generated for vo2
         63, #pragma acc loop vector(128) /* threadIdx.x */
             Sum reduction generated for vo0
             Sum reduction generated for vo1
             Sum reduction generated for vo2
     48, Loop is parallelizable
     55, Loop is parallelizable
     61, Loop is parallelizable
     63, Loop is parallelizable
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Goto page Previous  1, 2
Page 2 of 2

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group