PGI User Forum
 Search   Memberlist   Register   Profile   Log in

CUDA-x86.

Reduction clause

 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
JPMN



Joined: 23 Oct 2012
Posts: 8

Posted: Mon Apr 22, 2013 3:34 am    Post subject: Reduction clause    Reply with quote

Hi there.

I've been testing out a small program in OpenACC and it seems I have a small problem with it.

I wrote this small program:
Code:
#define M_X 10
#define M_Y 10

#include <stdio.h>
#include <stdlib.h>
//#include <openacc.h>

float matrix[M_Y][M_X];

/*
 * Print the first `height` rows and `lenght` columns of the global matrix.
 * Each row is written on its own line as "<row>> " followed by its elements
 * in "%f " format.
 */
void printMatrix(int height, int lenght)
{
    int row = 0;

    while (row < height) {
        int col = 0;

        printf("%d> ", row);
        while (col < lenght) {
            printf("%f ", matrix[row][col]);
            ++col;
        }
        printf("\n");
        ++row;
    }
}

/*
 * Serial reference version of the program.
 *
 * Fills the global matrix with row*col/10, divides every row by its largest
 * element, prints the matrix and the per-row maxima, then prints the sum of
 * the maxima after scaling each by 0.25.
 */
int main(int argc, char* argv[])
{
    float scaler[M_Y];
    float result = 0.0;
    int row, col;

    printf("Hello world\n");

    /* matrix[row][col] = row*col/10 */
    for (row = 0; row < M_Y; ++row) {
        for (col = 0; col < M_X; ++col) {
            matrix[row][col] = (float)(row * col) / 10.0;
        }
    }

    for (row = 0; row < M_Y; ++row) {
        /* largest element of this row; the search starts from 0.0 */
        scaler[row] = 0.0;
        for (col = 0; col < M_X; ++col) {
            if (matrix[row][col] > scaler[row]) {
                scaler[row] = matrix[row][col];
            }
        }

        /* a row whose maximum is 0 is left untouched (avoids dividing by 0) */
        if (scaler[row] != 0) {
            for (col = 0; col < M_X; ++col) {
                matrix[row][col] /= scaler[row];
            }
        }
    }

    printf("Matrix:\n");
    printMatrix(M_Y, M_X);

    printf("\nscaler:\n");
    for (row = 0; row < M_Y; ++row) {
        printf("%f ", scaler[row]);
    }
    printf("\n");

    /* scale every row maximum by 0.25 and accumulate the total */
    for (row = 0; row < M_Y; ++row) {
        scaler[row] *= 0.25;
        result += scaler[row];
    }

    printf("result -> %f\n", result);

    exit(0);
}


Which gives me the following output:
Quote:
Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000

scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result -> 10.125000



And then I tried to parallelize it using OpenACC, and I changed the code to the following:
Code:
#define M_X 10
#define M_Y 10

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

float matrix[M_Y][M_X];

/*
 * Write rows 0..height-1 of the global matrix to stdout, `lenght` elements
 * per row. Output format per row: "<row index>> " then each element as "%f ",
 * terminated by a newline.
 */
void printMatrix(int height, int lenght)
{
    int r, c;

    for (r = 0; r < height; ++r) {
        printf("%d> ", r);
        for (c = 0; c < lenght; ++c)
            printf("%f ", matrix[r][c]);
        printf("\n");
    }
}

/*
 * OpenACC version of the program as originally posted; this is the variant
 * that prints "result -> 0.000000".
 *
 * Steps: fill the global matrix with i*j/10, compute each row's maximum into
 * scaler[] and divide each row by it inside an acc data region, print both,
 * then accumulate scaler[i]*0.25 into result with an OpenACC sum reduction.
 */
int main(int argc, char* argv[])
{
    float scaler[M_Y], aux_scaler, result=0.0;
    int i, j;
    printf("Hello world\n");
   
    // create matrix: matrix[i][j] = i*j/10
    for(i=0; i<M_Y; i++)
        for(j=0; j<M_X; j++)
            matrix[i][j]=(float)(i*j)/10.0;
   
    // Data region enclosing both kernels below: matrix is listed in copy(),
    // scaler in copyout(), and the scalar aux_scaler in create().
    // NOTE: aux_scaler is also the reduction variable of the inner loop. The
    // reply in this thread advises removing data clauses that name reduction
    // variables; its corrected listing drops create(aux_scaler) and
    // private(aux_scaler).
#pragma acc data copy(matrix), copyout(scaler[M_Y]), create(aux_scaler)
{
    // First kernel: per-row maximum. aux_scaler restarts at 0.0 for every row
    // and the inner loop is a max reduction over that row's elements.
    #pragma acc kernels loop private(aux_scaler)
    for(i=0; i<M_Y; i++)
    {
        aux_scaler=0.0;
        #pragma acc loop reduction(max : aux_scaler)
        for(j=0; j<M_X; j++)
        {
            if(matrix[i][j]>aux_scaler)
                aux_scaler=matrix[i][j];
        }
        scaler[i]=aux_scaler;
    }
   
    // Second kernel: divide each element by its row maximum. Rows whose
    // maximum is 0 are skipped. Both loops are declared independent.
    #pragma acc kernels loop independent
    for(i=0; i<M_Y; i++)
    {
        #pragma acc loop independent
        for(j=0; j<M_X; j++)
        {
            if(scaler[i] !=0)
                matrix[i][j] /= scaler[i];
        }
    }
}
   
   
    // Host side: print the normalised matrix and the row maxima.
    printf("Matrix:\n");
    printMatrix(M_Y, M_X);
    printf("\nscaler:\n");
    for (i=0; i<M_Y; i++)
        printf("%f ", scaler[i]);
    printf("\n");
   
    // NOTE: result appears both in copy() here and in reduction(+:result)
    // below. Per the reply in this thread, this combination is what produces
    // the wrong output with the PGI compiler: the reduced value is overwritten
    // when result is copied back to the host. The corrected listing removes
    // copy(result) and prints 10.125000.
    #pragma acc data copyin(scaler[M_Y]) copy(result)
    {
    #pragma acc kernels loop reduction(+:result)
    for(i=0; i<M_Y; i++)
    {
        scaler[i] *= 0.25;
        result+=scaler[i];
    }
    }
   
    printf("result -> %f\n", result);
   
   
   
    exit(0);
}


But now I get this output:
Quote:
Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000

scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result -> 0.000000


It seems that everything is fine until the reduction clause I use at the end. Either the result is not being computed or the value is not being transferred back to the CPU.

What am I doing wrong with the reduction clause?

Thanks in advance!
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 5871
Location: The Portland Group Inc.

Posted: Tue Apr 23, 2013 12:07 am    Post subject: Reply with quote

Hi JPMN,

Remove the data clauses that contain your reduction variables. We've seen users do this, especially those who started with the Cray compiler, so we are looking at adding support for this style of syntax. For now, though, reduction variables are treated differently, and putting them in copy clauses interferes with how the compiler generates the reduction. Essentially, you are overwriting "result" when it is copied back to the host.

Here's the modified code:
Code:
 % cat red.c
#define M_X 10
#define M_Y 10

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

float matrix[M_Y][M_X];

void printMatrix(int height, int lenght)
{
    /* Dump the first height rows / lenght columns of the global matrix to */
    /* stdout: one line per row, "<row>> " prefix, each element as "%f ".  */
    int row = 0;

    while (row < height) {
        int col;

        printf("%d> ", row);
        for (col = 0; col < lenght; ++col)
            printf("%f ", matrix[row][col]);
        printf("\n");
        ++row;
    }
}

int main(int argc, char* argv[])
{   /* Corrected listing from the reply: no reduction variable appears in a data clause. */
    float scaler[M_Y], aux_scaler, result=0.0;   /* scaler[i]: largest element of row i, never below 0.0 */
    int i, j;
    printf("Hello world\n");
    /* Notes sit on existing or formerly blank lines so the -Minfo line numbers after this listing still match. */
    // create matrix: matrix[i][j] = i*j/10
    for(i=0; i<M_Y; i++)
        for(j=0; j<M_X; j++)
            matrix[i][j]=(float)(i*j)/10.0;
    /* Data region enclosing both kernels: matrix is in copy(), scaler in copyout(). */
#pragma acc data copy(matrix), copyout(scaler[M_Y])
{
    #pragma acc kernels loop
    for(i=0; i<M_Y; i++)
    {
        aux_scaler=0.0;   /* starting value for the max reduction over row i */
        #pragma acc loop reduction(max : aux_scaler)
        for(j=0; j<M_X; j++)
        {
            if(matrix[i][j]>aux_scaler)
                aux_scaler=matrix[i][j];
        }
        scaler[i]=aux_scaler;   /* store the row maximum */
    }
    /* Second kernel: divide each element by its row maximum; rows whose maximum is 0 are skipped. */
    #pragma acc kernels loop independent
    for(i=0; i<M_Y; i++)
    {
        #pragma acc loop independent
        for(j=0; j<M_X; j++)
        {
            if(scaler[i] !=0)
                matrix[i][j] /= scaler[i];
        }
    }
}
    /* Host side: print the normalised matrix and the row maxima. */

    printf("Matrix:\n");
    printMatrix(M_Y, M_X);
    printf("\nscaler:\n");
    for (i=0; i<M_Y; i++)
        printf("%f ", scaler[i]);
    printf("\n");
    /* Sum reduction: result is named only in reduction(+:result), not in any copy/copyin clause. */
    #pragma acc data copyin(scaler[M_Y])
    {
    #pragma acc kernels loop reduction(+:result)
    for(i=0; i<M_Y; i++)
    {
        scaler[i] *= 0.25;
        result+=scaler[i];
    }
    }

    printf("result -> %f\n", result);   /* 10.125000 in the run shown after this listing */



    exit(0);
}

% pgcc -acc -Minfo=accel red.c -V13.4 ; a.out
main:
     38, Generating copyout(scaler[0:])
         Generating copy(matrix[0:][0:])
     40, Generating present_or_copyout(scaler[0:])
         Generating present_or_copy(matrix[0:][0:])
         Generating NVIDIA code
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     41, Loop is parallelizable
         Accelerator kernel generated
         41, #pragma acc loop gang /* blockIdx.x */
         45, #pragma acc loop vector(32) /* threadIdx.x */
         Loop is parallelizable
     53, Generating present_or_copy(matrix[0:][0:])
         Generating present_or_copyout(scaler[0:])
         Generating NVIDIA code
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     54, Loop is parallelizable
     57, Loop is parallelizable
         Accelerator kernel generated
         54, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         57, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     73, Generating copyin(scaler[0:])
     75, Generating present_or_copyin(scaler[0:])
         Generating NVIDIA code
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     76, Loop is parallelizable
         Accelerator kernel generated
         76, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000

scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result -> 10.125000


Hope this helps,
Mat
Back to top
View user's profile
JPMN



Joined: 23 Oct 2012
Posts: 8

Posted: Tue Apr 23, 2013 6:39 am    Post subject: Reply with quote

Hi Mat.

That was exactly it! Thank you so much, you solved my problem!

This was the second time I had to work with the reduction clause; the first time I got this error, I just made a workaround to solve it.

Thank you once again!

JPMN.
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group