PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

HOW TO USE 'UPDATE'

 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
Teslalady



Joined: 16 Mar 2012
Posts: 75

PostPosted: Fri Sep 07, 2012 9:03 am    Post subject: HOW TO USE 'UPDATE' Reply with quote

I have code as below:

Code:
for (k=0;k<n;k++)
        {   
                temp=k;
                       for(p=k;p<n;p++)
                {
                         if( fabs(a[p*n+k])> fabs(a[temp*n+k]))
                                   temp=p;   
                }
                for(i=0;i<n;i++)
                {
                        change=a[k*n+i];
                         a[k*n+i]=a[temp*n+i];
                         a[temp*n+i]=change;
                }   
             
               

                    if(a[k*n+k]==0)
                        return 0;
                [color=Red]for(j=k+1;j<n;j++)
                {
                        a[j*n+k]=a[j*n+k]/a[k*n+k];
                }
                         
                for(j=k+1;j<n;j++)
                {           
                       
                        for(i=k+1;i<n;i++)
                        {
                                a[j*n+i]=a[j*n+i]-a[j*n+k]*a[k*n+i];   
                        }                       
                }[/color]
               
          }

I want to accelerate the red area that I marked, so I optimized the code as below:

Code:
for (k=0;k<n;k++)
        {   
                temp=k;
                       for(p=k;p<n;p++)
                {
                         if( fabs(a[p*n+k])> fabs(a[temp*n+k]))
                                   temp=p;   
                }
                for(i=0;i<n;i++)
                {
                        change=a[k*n+i];
                         a[k*n+i]=a[temp*n+i];
                         a[temp*n+i]=change;
                }   
             
               

                    if(a[k*n+k]==0)
                        return 0;
                [color=Red]#pragma acc data  copy(a[:N*N])
                {       
                        #pragma acc kernels
                        {
                                #pragma acc loop independent
                                for(j=k+1;j<n;j++)
                                {
                                        a[j*n+k]=a[j*n+k]/a[k*n+k];
                                }
                        }
                         #pragma acc kernels
                        {
                                #pragma acc loop independent
                                for(j=k+1;j<n;j++)
                                {           
                                        #pragma acc loop independent
                                        for(i=k+1;i<n;i++)
                                        {
                                                a[j*n+i]=a[j*n+i]-a[j*n+k]*a[k*n+i];   
                                        }                       
                                }
                        }
                }[/color]
         }

It can be compiled successfully, but it seems not to be accelerated. I found there should be something wrong with #pragma acc data copy(a[:N*N]): it needs to copy the data for each iteration of the k loop. I know that 'update' can be used for updating data, so how can I use it in my code?
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 5952
Location: The Portland Group Inc.

PostPosted: Fri Sep 07, 2012 3:24 pm    Post subject: Reply with quote

Hi Sisiy,

The update directive would be used within a data region to synchronize the device and host copies of the data. Typically for this type of code I would recommend you put the data region outside so that you're not copying "a" back and forth between the GPU and host for every iteration of the "k" loop. However, if you put the update directives both before and after the compute regions, you're no better off than using the inner data region.

Ideally, you'd put all computation performed on "a" into a compute region. Even in this code, where you have sequential regions, it's better to put these sequential regions on the GPU rather than copy "a" back and forth each time. To illustrate, I wrote the example below. Note that without the outer data region (like what you have), my runtime on the GPU was 448 seconds with 440 of those spent copying data (versus 65 seconds when compiled for the host). After moving the data region outside of the "k" loop and adding the sequential region, the GPU time dropped to 9 seconds with the data being 0.008 seconds of that.

The one caveat is that I removed the inner "if" check. While you can add an update directive here to return this element of "a", your runtime will degrade. Also I question if this if statement is effective given it's a comparison of absolute equality to a floating point number.

Here's my example code:
Code:

% cat test_update_1.c

#include <math.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef _OPENACC
#include <accelmath.h>
#endif
/*
 * LU factorization with partial pivoting of an n x n matrix of random
 * doubles, accelerated with OpenACC.  The matrix "a" is kept resident on
 * the device for the entire outer "k" loop via a single data region, so
 * no host/device copies of "a" occur per iteration.
 *
 * Returns 0 on success, 1 if the matrix allocation fails.
 */
int main()
{
    double *a;
    double change;
    int i, j, k, temp, p, n;

    n = 5000;

    /* n*n doubles is ~200 MB for n=5000 — check the allocation. */
    a = malloc((size_t)n * (size_t)n * sizeof *a);
    if (a == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrix\n", n, n);
        return 1;
    }

    srand(123);
    for (i = 0; i < n * n; ++i) {
        a[i] = (double) rand() / (double) RAND_MAX;
    }

/* One data region around the whole factorization: "a" is copied to the
   device once and back once, instead of per k iteration. */
#pragma acc data copy(a[0:n*n])
    {
        for (k = 0; k < n; k++) {
/* Pivot search and row swap also run on the device — a sequential
   search kernel followed by a parallel swap — so "a" never has to
   come back to the host inside the loop. */
#pragma acc kernels
            {
                for (p = temp = k; p < n; p++) {
                    if (fabs(a[p*n + k]) > fabs(a[temp*n + k]))
                        temp = p;
                }
#pragma acc loop gang vector independent
                for (i = 0; i < n; i++) {
                    change = a[k*n + i];
                    a[k*n + i] = a[temp*n + i];
                    a[temp*n + i] = change;
                }
            }

            /* NOTE(review): the original singularity check
               (if (a[k*n+k] == 0) ...) is intentionally omitted: it would
               force a device->host update every iteration, and exact
               floating-point equality is a fragile test anyway. */

/* Scale the pivot column below the diagonal. */
#pragma acc kernels loop independent
            for (j = k + 1; j < n; j++) {
                a[j*n + k] = a[j*n + k] / a[k*n + k];
            }

/* Rank-1 update of the trailing (n-k-1) x (n-k-1) submatrix. */
#pragma acc kernels loop independent
            for (j = k + 1; j < n; j++) {
#pragma acc loop independent
                for (i = k + 1; i < n; i++) {
                    a[j*n + i] = a[j*n + i] - a[j*n + k] * a[k*n + i];
                }
            }
        }
    }

    /* Spot-check: print one element from every (n/10)-th row. */
    for (i = 0; i < n * n; i += (n / 10) * n) {
        printf("%d: %g \n", i, a[i]);
    }

    free(a);
    return 0;
}

% pgcc -fast test_update_1.c -Minfo=accel -o test_update_gpu1.out -acc -V12.8 -Msafeptr
main:
     23, Generating copy(a[0:n*n])
     29, Generating copy(a[0:n*n])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     31, Loop carried scalar dependence for 'temp' at line 33
         Scalar last value needed after loop for 'temp' at line 40
         Scalar last value needed after loop for 'temp' at line 41
         Accelerator restriction: scalar variable live-out from loop: temp
         Accelerator kernel generated
         31, CC 1.3 : 12 registers; 36 shared, 8 constant, 0 local memory bytes
             CC 2.0 : 2 registers; 0 shared, 60 constant, 0 local memory bytes
     37, Loop is parallelizable
         Accelerator kernel generated
         37, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             CC 1.3 : 11 registers; 48 shared, 4 constant, 0 local memory bytes
             CC 2.0 : 16 registers; 0 shared, 72 constant, 0 local memory bytes
     50, Generating copy(a[0:n*n])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     51, Loop is parallelizable
         Accelerator kernel generated
         51, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 23 registers; 44 shared, 0 constant, 0 local memory bytes
             CC 2.0 : 24 registers; 0 shared, 76 constant, 0 local memory bytes
     56, Generating copy(a[0:n*n])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     57, Loop is parallelizable
     60, Loop is parallelizable
         Accelerator kernel generated
         57, #pragma acc loop gang /* blockIdx.y */
         60, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             CC 1.3 : 14 registers; 52 shared, 0 constant, 0 local memory bytes
             CC 2.0 : 20 registers; 0 shared, 68 constant, 0 local memory bytes
% time test_update_gpu1.out
0: 0.0600514
2500000: 15.6055
5000000: 0.469559
7500000: 12.7688
10000000: 5.39385
12500000: 14.9757
15000000: 11.6148
17500000: 2.55971
20000000: 10.9453
22500000: 15.9231

Accelerator Kernel Timing data
/proj/qa/support/sisiy/test_update_1.c
  main
    56: region entered 5000 times
        time(us): total=7,525,391 init=317 region=7,525,074
                  kernels=7,480,182
        w/o init: total=7,525,074 max=4,487 min= avg=1,505
        60: kernel launched 4999 times
            grid: [1-40x1-4999]  block: [128]
            time(us): total=7,480,182 max=4,477 min=5 avg=1,496
/proj/qa/support/sisiy/test_update_1.c
  main
    50: region entered 5000 times
        time(us): total=237,549 init=370 region=237,179
                  kernels=193,538
        w/o init: total=237,179 max=145 min= avg=47
        51: kernel launched 4999 times
            grid: [1-4999]  block: [1]
            time(us): total=193,538 max=78 min=5 avg=38
/proj/qa/support/sisiy/test_update_1.c
  main
    29: region entered 5000 times
        time(us): total=200,767 init=324 region=200,443
                  kernels=47,476
        w/o init: total=200,443 max=762 min=37 avg=40
        31: kernel launched 5000 times
            grid: [1]  block: [1]
            time(us): total=15,473 max=13 min=3 avg=3
        37: kernel launched 5000 times
            grid: [40]  block: [128]
            time(us): total=32,003 max=16 min=6 avg=6
/proj/qa/support/sisiy/test_update_1.c
  main
    23: region entered 1 time
        time(us): total=9,212,921 init=1,158,163 region=8,054,758
                  data=76,850
        w/o init: total=8,054,758 max=8,054,758 min=8,054,758 avg=8,054,758
4.651u 5.057s 0:09.87 98.2%   0+0k 0+0io 0pf+0w


Hope this helps,
Mat
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group