PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

cuModuleGetGlobal error
Goto page 1, 2, 3  Next
 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
bofang



Joined: 06 Dec 2012
Posts: 6

PostPosted: Thu Dec 13, 2012 1:27 am    Post subject: cuModuleGetGlobal error Reply with quote

Hi,

I am new to OpenACC, and here is an error that I encountered when I tried to use OpenACC on my code.

Code:

#define T 1024*1024

int *temp1=(int *)calloc(T,sizeof(int));
double **restrict temp2 = (double **) calloc(T, sizeof(double *));

// initializing temp1 and temp2
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)

#pragma acc kernels  copyin(a,b,c,temp1[0:T],temp2[0:T][0:32])
for(int n = 0; n < T; n++){
            double tem = 0;
        if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n][f]);
        }
    reduction_1 += tem;
}



The compile seems fine:

$pgcc -ta=nvidia,cc20 -Minfo test1.c
main:
39, Generating present_or_copyin(temp2[0:1048576][0:32])
Generating present_or_copyin(temp1[0:1048576])
Generating present_or_copyin(c)
Generating present_or_copyin(b)
Generating present_or_copyin(a)
Generating compute capability 2.0 binary
40, Loop is parallelizable
Accelerator kernel generated
40, #pragma acc loop gang /* blockIdx.x */
CC 2.0 : 30 registers; 32 shared, 104 constant, 0 local memory bytes
43, #pragma acc loop vector(32) /* threadIdx.x */
46, Sum reduction generated for reduction_1
43, Loop is parallelizable

But it fails with the following error (I enabled NVDEBUG):

__pgi_cu_init() found 3 devices
__pgi_cu_init( file=/home/bo/test1.c, function=main, line=39, startline=14, endline=50 )
__pgi_cu_init() will use device 0 (V2.0)
__pgi_cu_init() compute context created
__pgi_cu_module3( lineno=39 )
__pgi_cu_module3 module loaded at 0x12e99040
__pgi_cu_module_function( name=0x41db08=main_40_gpu, lineno=40, argname=(nil)=, argsize=52, varname=0x41db26=b1, varsize=8, SWcachesize=2048 )
Function handle is 0x12ea6e70
__pgi_cu_module_function( name=0x41db15=main_46_gpu_red, lineno=40, argname=(nil)=, argsize=0, varname=(nil)=, varsize=0, SWcachesize=2048 )
Function handle is 0x12ea2bf0
pgi_acc_dataon(devptr=0x401d70,hostptr=0x7f1f3d20f010,offset=0,0,stride=1,-1,size=32x1048576,extent=-1x-1,eltsize=8,lineno=39,name=temp2,flags=0x701=sync+create+present+copyin)
NO map for host:0x7f1f3d20f010
__pgi_cu_alloc(size=276824064,lineno=39,name=temp2)
__pgi_cu_alloc(276824064) returns 0x200300000 (address=0x7fff8878f008)
__pgi_cu_init( file=/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c, function=__pgi_cu_fill, line=26, startline=22, endline=28 )
__pgi_cu_module3( lineno=26 )
__pgi_cu_module3 module loaded at 0x12ea0970
__pgi_cu_module_function( name=0x420539=__pgi_cu_fill_27_gpu, lineno=27, argname=(nil)=, argsize=32, varname=(nil)=, varsize=0, SWcachesize=0 )
Function handle is 0x12ea1250
__pgi_cu_launch_a(func=0x12ea1250, grid=8192x1x1, block=128x1x1, lineno=27)
__pgi_cu_launch_a(func=0x12ea1250, params=0x7fff8878eba0, bytes=32, sharedbytes=0)
First arguments are:
1048576 48 11534336 2 3145728 2 256 0

0x00100000 0x00000030 0x00b00000 0x00000002 0x00300000 0x00000002 0x00000100 0x00000000
launch kernel file=/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c function=__pgi_cu_fill line=27 device=0 grid=8192 block=128 queue=0
__pgi_cu_close()
map dev:0x200300000 host:0x7f1f3d20f010 size:8388608 offset:0 data[dev:0x200b00000 host:0x1d0d010 size:268435456] (line:39 name:temp2) dims=32x1048576
alloc done with devptr at 0x200300000 (address=0x7fff8878f008)
__pgi_acc_dataupx(devptr=0x200b00000,hostptr=0x7f1f3d20f010,offset=0,0,stride=1,-1,size=32x1048576,extent=-1x-1,eltsize=8,lineno=39,name=temp2,flags=0x0)
pgi_acc_dataon(devptr=0x0,hostptr=0x7f1f3da10010,offset=0,stride=1,size=1048576,extent=-1,eltsize=4,lineno=39,name=temp1,flags=0x701=sync+create+present+copyin)
NO map for host:0x7f1f3da10010
__pgi_cu_alloc(size=4194304,lineno=39,name=temp1)
__pgi_cu_alloc(4194304) returns 0x210f00000 (address=0x7fff8878f000)
map dev:0x210f00000 host:0x7f1f3da10010 size:4194304 offset:0 data[dev:0x210f00000 host:0x7f1f3da10010 size:4194304] (line:39 name:temp1)
alloc done with devptr at 0x210f00000 (address=0x7fff8878f000)
__pgi_acc_dataupx(devptr=0x210f00000,hostptr=0x7f1f3da10010,offset=0,stride=1,size=1048576,extent=-1,eltsize=4,lineno=39,name=temp1,flags=0x0)
pgi_acc_dataon(devptr=0x41d9e0,hostptr=0x7fff8878eeb8,eltsize=8,lineno=39,name=c,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eeb8
__pgi_cu_alloc(size=8,lineno=39,name=c)
__pgi_cu_alloc(8) returns 0x211300000 (address=0x7fff8878eff8)
map dev:0x211300000 host:0x7fff8878eeb8 size:8 offset:0 data[dev:0x211300000 host:0x7fff8878eeb8 size:8] (line:39 name:c)
alloc done with devptr at 0x211300000 (address=0x7fff8878eff8)
__pgi_acc_dataupx(devptr=0x211300000,hostptr=0x7fff8878eeb8,eltsize=8,lineno=39,name=c,flags=0x0)
pgi_acc_dataon(devptr=0x30fcd91730,hostptr=0x7fff8878eec0,eltsize=8,lineno=39,name=b,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eec0
__pgi_cu_alloc(size=8,lineno=39,name=b)
__pgi_cu_alloc(8) returns 0x211300200 (address=0x7fff8878eff0)
map dev:0x211300200 host:0x7fff8878eec0 size:8 offset:0 data[dev:0x211300200 host:0x7fff8878eec0 size:8] (line:39 name:b)
alloc done with devptr at 0x211300200 (address=0x7fff8878eff0)
__pgi_acc_dataupx(devptr=0x211300200,hostptr=0x7fff8878eec0,eltsize=8,lineno=39,name=b,flags=0x0)
pgi_acc_dataon(devptr=0x41da25,hostptr=0x7fff8878eec8,eltsize=8,lineno=39,name=a,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eec8
__pgi_cu_alloc(size=8,lineno=39,name=a)
__pgi_cu_alloc(8) returns 0x211300400 (address=0x7fff8878efe8)
map dev:0x211300400 host:0x7fff8878eec8 size:8 offset:0 data[dev:0x211300400 host:0x7fff8878eec8 size:8] (line:39 name:a)
alloc done with devptr at 0x211300400 (address=0x7fff8878efe8)
__pgi_acc_dataupx(devptr=0x211300400,hostptr=0x7fff8878eec8,eltsize=8,lineno=39,name=a,flags=0x0)
__pgi_cu_alloc(size=524280,lineno=40,name=)
__pgi_cu_alloc(524280) returns 0x211400000
__pgi_cu_uploadc( "b1", size=8, offset=0, lineno=40 )
call to cuModuleGetGlobal returned error 500: Not found
CUDA driver version: 4010



Please give me a hint about what I should do or what the problem could be.

Thank you in advance.
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 6141
Location: The Portland Group Inc.

PostPosted: Thu Dec 13, 2012 10:11 am    Post subject: Reply with quote

Hi bofang,

Can you double check that your code works in host code (i.e. don't compile with -acc)? I don't see where you allocate temp2's second dimension.

You can also try using the C99 VLA syntax to declare temp2.
Code:
double * restrict temp2[32] = (double *) calloc(T*32, sizeof(double));


- Mat
Back to top
View user's profile
bofang



Joined: 06 Dec 2012
Posts: 6

PostPosted: Thu Dec 13, 2012 1:31 pm    Post subject: Reply with quote

Sorry that I didn't paste that part of the code here. This piece of code works on the host.

Code:

 for (int i = 0; i < T; i++){

        temp2[i] = (double *) calloc(32, sizeof(double));
    }
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 6141
Location: The Portland Group Inc.

PostPosted: Thu Dec 13, 2012 5:21 pm    Post subject: Reply with quote

Ok, then I don't see anything obvious. Please post a complete reproducing example or send one to PGI Customer Service (trs@pgroup.com) and ask them to forward it to me.

Note that, while it shouldn't cause this error, you don't need to copyin your scalar variables. These will be passed as arguments to your kernels, so they don't need to be explicitly copied.

- Mat
Back to top
View user's profile
bofang



Joined: 06 Dec 2012
Posts: 6

PostPosted: Fri Dec 14, 2012 12:15 am    Post subject: Reply with quote

OK. Here is my complete code:

Code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
/* Element count used for the arrays and loop bounds in this program.
 * Parenthesized so the macro expands as a single value inside larger
 * expressions (e.g. T / T or x % T). */
#define T (1024*1024)


/*
 * Return a pseudo-random double between fMin and fMax.
 *
 * One call to rand() is scaled by RAND_MAX to a fraction in [0, 1], which is
 * then mapped linearly onto the requested interval.
 */
double fRand(double fMin, double fMax)
{
    const double span = fMax - fMin;
    const double unit = (double)rand() / RAND_MAX;

    return fMin + unit * span;
}

/*
 * Reproducer: builds a flag array (temp1) and a table of T separately
 * allocated rows of 32 doubles (temp2), fills both with pseudo-random data,
 * then accumulates (a+b)*log(c+a*temp2[n][f]) over every row whose flag is 1
 * inside an OpenACC "kernels" region and prints the total.
 */
int main(void){

/* T zero-initialized flags, one per row. */
/* NOTE(review): none of the calloc results in this function are checked. */
int *temp1=(int *)calloc(T,sizeof(int));
    /* Table of T row pointers; each row is its own 32-double allocation,
     * so the rows are not one contiguous block. */
    double **restrict temp2 = (double **) calloc(T, sizeof(double *));
    for (int i = 0; i < T; i++){

        temp2[i] = (double *) calloc(32, sizeof(double));
    }

/* Seed from the current time, so the data differ from run to run. */
srand(time(NULL));
/* Each flag becomes 0 or 1. */
for(int i = 0; i < T;i++)
{    temp1[i] = rand()%2;
    // printf("Temp1  is %d\n",temp1[i]);
}
/* Every table entry is drawn from fRand(1,10). */
for(int i = 0; i < T; i++)
{
    for(int j = 0; j < 32; j++)
    temp2[i][j] = fRand(1,10);
}
/* Scalar coefficients used by the kernel below. */
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
/* Running total over all rows. */
double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)

/* The explicit copyin data clause is commented out in this version, so the
 * kernels directive carries no data clauses. */
#pragma acc kernels  /*copyin(a,b,c,temp1[0:1024*1024],temp2[0:T][0:32])*/
for(int n = 0; n < T; n++){
            /* Per-row partial sum; stays 0 when the row's flag is not 1. */
            double tem = 0;
        if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n][f]);
        }
    reduction_1 += tem;
}
printf("Result is %f\n",reduction_1);
 /* NOTE(review): returns 1 (a nonzero exit status) on the normal path, and
  * the allocations above are never freed - confirm whether intended. */
 return 1;
}


The command I am using to compile is

pgcc -ta=nvidia,cc20 test1.c -Minfo


Thank you Mat
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Goto page 1, 2, 3  Next
Page 1 of 3

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group