PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

Urgent Help Needed: Constant memory issue

 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
crip_crop



Joined: 28 Jul 2010
Posts: 68

PostPosted: Thu Aug 26, 2010 12:20 pm    Post subject: Urgent Help Needed: Constant memory issue Reply with quote

Hi there,

I have a deadline tomorrow at 12 and I've accidentally messed up my code, so I need urgent help from anyone who can.

The code's basically supposed to use constant memory for the devifact array, but when I run it, it says copyin Memcpy FAILED:17.

The copy from host to constant memory is devifact=ifact(1:292)

Please please please can anyone help.

Host code
Code:
      subroutine matmul_host(lowt,a,b,d,ifact,nbasis)
!     Host driver for the matmulcuf kernel.
!     Copies a, b and ifact to the device, launches matmulcuf on a 1-D grid,
!     copies the result back into d and prints two elapsed times.
!
!     Arguments
!        lowt   - length of a, b and d
!        a, b   - host input arrays
!        d      - host array that receives the kernel result
!        ifact  - 292 integers copied to devifact and used by the kernel
!                 as index offsets
!        nbasis - number of threads that do work in the kernel
!
!     NOTE(review): this listing has no 'use cudafor'; PGI 10.8 reports
!     "Derived type has not been declared - dim3" for it.

!     Declare host arrays
      Implicit none
      Integer:: threadblocks
!     error and size are declared but never referenced in this subroutine
      Integer::  error, istat,size
!     blocksize is not in the argument list, so it is a local variable here
!     NOTE(review): VALUE is normally reserved for dummy arguments - confirm
!     that the compiler accepts it on blocksize
      Integer, value:: lowt, nbasis,blocksize
      double precision, dimension(1:lowt):: a, b,d
      integer::ifact(292)

!     Declare device arrays
      Double precision, device,allocatable, dimension(:)::adev,bdev,ddev
!     NOTE(review): PGI 10.8 rejects the next line with PGF90-S-0134
!     ("Illegal attribute constant not allowed in host subprograms").
!     The suggested remedy is to declare devifact in the module's data
!     section and stop passing it to the kernel.
      integer, constant::devifact(292)

!     Declare grid and block
      type(dim3):: dimGrid, dimBlock

!     Declare variables for timer
      real ctimeall, ctimekernel
      integer c1, c2, c3, c4

!     Start time for kernel and data transfers
      call system_clock( count=c1 )

!     Allocate memory on the device
      allocate(adev(1:lowt),bdev(1:lowt),ddev(1:lowt))
      istat = cudathreadsynchronize()

!     Define the grid and block shape:
!     threadblocks is nbasis/blocksize rounded up, so that
!     threadblocks*blocksize >= nbasis
      blocksize=3
      if (mod(nbasis, blocksize)==0) then
         threadblocks=nbasis/blocksize
      else
         threadblocks=nbasis/blocksize+1
      end if
      dimGrid= dim3(threadblocks, 1, 1)
      dimBlock= dim3 (blocksize, 1, 1)
     
!     Copy a, b, and ifact to the device
      Adev = A(1:lowt)
      bdev = b(1:lowt)
!     This is the assignment the poster reports failing at run time with
!     "copyin Memcpy FAILED:17"
      devifact=ifact(1:292)
     
!     Second timer to record kernel time without data transfers
      call system_clock( count=c2 )

!     Kernel invocation
      call matmulcuf<<<dimGrid,dimBlock>>>(lowt,adev,bdev,ddev,devifact,nbasis,blocksize)

!     Wait for the kernel to finish so that c3 includes its run time
!     (istat is assigned but never tested)
      istat = cudathreadsynchronize()

!     Stop second timer
      call system_clock( count=c3 )

!     Copy results back to host array
      d = ddev(1:lowt)
     
!     Stop first timer
      call system_clock( count=c4 )

!     Freeing arrays on device(may need to declare as integer function)
      deallocate(adev,bdev,ddev)
     
!     Print out kernel times
!     NOTE(review): dividing by 1000000 assumes system_clock ticks in
!     microseconds; count_rate is not queried - TODO confirm
      ctimekernel = c3 - c2
      ctimeall = c4 - c1
      print *, 'Kernel time excluding data xfer:', ctimekernel/1000000
      print *, 'Total time including data xfer: ', ctimeall/1000000
      end

!     NOTE(review): this second 'end' has no matching opener in this listing;
!     presumably it closes the module opened in the device-code listing -
!     confirm
      end


Device code

Code:
      Module matmul_mod
      use cudafor
      contains
     
!     Device code
      attributes(global) subroutine matmulcuf(lowt, a, b, d, ifact, nbasis, blocksize)
!     Kernel: thread i = blocksize*(blockidx%x-1) + threadidx%x loops over
!     j = 1..i and stores
!        d(m+j) = S(b,a) - S(a,b),   m = 0 + 1 + ... + (i-1)
!     where, with ii = ifact(i), jj = ifact(j), kk = ifact(k),
!        S(x,y) =  sum over k=1..j        of x(ii+k)*y(jj+k)
!                + sum over k=j+1..i      of x(ii+k)*y(kk+j)
!                + sum over k=i+1..nbasis of x(kk+i)*y(kk+j)
!
!     Arguments
!        lowt      - length of a, b and d (passed by value)
!        a, b      - input arrays
!        d         - output array
!        ifact     - table of index offsets
!        nbasis    - threads with i > nbasis do nothing (passed by value)
!        blocksize - threads per block, used to form i (passed by value)
      implicit none
      double precision :: a(lowt), b(lowt), d(lowt)
      integer:: i, j, k, ii, jj, kk, l, m, tx, bx
      double precision ::  sum, temp
      integer, value:: lowt, nbasis, blocksize
!     one is a local scratch variable, not a dummy argument
!     NOTE(review): VALUE is normally reserved for dummy arguments - confirm
!     that the compiler accepts it here
      double precision, value:: one

!     Declare devifact to reside in constant memory
!     NOTE(review): ifact is a dummy argument here; the remedy suggested in
!     the thread is a module-level constant array that is not passed in
      Integer, constant:: ifact(292)

!     Initialising variables
      kk=0
      ii=0
      jj=0

!     Setting local thread index and block index
      tx= threadidx%x
      bx= blockidx%x

!     Decomposing into threads: i is the 1-based global thread number
      i= blocksize*(bx-1)+(tx)


!     Conditional statement to prevent threads working outside of array boundaries
     if (i<(nbasis+1))then

!     Ifact array used as an index
         ii=ifact(i)

!     Outer j loop
         do j=1,i
            jj=ifact(j)
            sum=0.0d0
            one=0.0d0;

!     First vector multiplication, indexing set by ifact(j) and ifact(i),
!     the result is accumulated in sum
            do k=1,j
               sum=sum+a(ii+k)*b(jj+k)
            end do 

!     Second vector multiplication, indexing set by ifact(k) and ifact(i),
!     the result is accumulated in sum
            do k=j+1,i
               kk=ifact(k)
               sum=sum+a(ii+k)*b(kk+j)
            end do

!     Third vector multiplication, indexing set by ifact(k),
!     the result is accumulated in sum
            do k=i+1,nbasis
               kk=ifact(k)
               sum=sum+a(kk+i)*b(kk+j)
            end do
 
!     Storing intermediate variables: temp keeps the a*b total, and
!     one=-1 makes the final statement subtract it
            temp=sum;
            one=-1.0d0;
            sum=0.0d0

!     Combined section of kernel
!     Multiplying the opposite part of the a and b arrays,
!     notice the b and a arrays have swapped positions.

!     First vector multiplication, indexing set by ifact(j) and ifact(i),
!     the result is accumulated in sum
            do k=1,j
               sum=sum+b(ii+k)*a(jj+k)
            end do 

!     Second vector multiplication, indexing set by ifact(k) and ifact(i),
!     the result is accumulated in sum
            do k=j+1,i
               kk=ifact(k)
               sum=sum+b(ii+k)*a(kk+j)
            end do

!     Third vector multiplication, indexing set by ifact(k),
!     the result is accumulated in sum
            do k=i+1,nbasis
               kk=ifact(k)
               sum=sum+b(kk+i)*a(kk+j)
            end do

!     Calculating the counter: m = (i-1) + (i-2) + ... + 0.
!     It depends only on i, yet is recomputed for every j.
            m=0
            do l=i-1,0,-1
               m=m+l
            end do

!     Calculating the result: (b*a total) - (a*b total)
            d(m+j)=sum+one*temp   
         end do
      end if
     
     
      end


Thank you in advance,
Crip_crop
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 6125
Location: The Portland Group Inc.

PostPosted: Thu Aug 26, 2010 4:20 pm    Post subject: Reply with quote

Hi Crip_crop,

It's actually illegal to use the constant qualifier in host code. Unfortunately, the compiler didn't start flagging this as a syntax error until the 10.8 version. So if I compile your code using 10.8 I get:

Code:
% pgf90 -V10.8 test_org.cuf -c
PGF90-S-0134-Illegal attribute constant not allowed in host subprograms (test_org.cuf: 13)
PGF90-S-0155-Derived type has not been declared - dim3 (test_org.cuf: 16)
  0 inform,   0 warnings,   2 severes, 0 fatal for matmul_host

The second error is due to a missing 'use cudafor'.

To fix this, move 'devifact' into your module's data section and do not pass it into the device kernel. Granted, I have not run the code, so there could be additional issues, but hopefully it gets you further.

Code:
     Module matmul_mod

      use cudafor
      integer, constant::devifact(292)
      contains

!     Device code
      attributes(global) subroutine matmulcuf(lowt, a, b, d, nbasis, blocksize)
      implicit none
      double precision :: a(lowt), b(lowt), d(lowt)
      integer:: i, j, k, ii, jj, kk, l, m, tx, bx
      double precision ::  sum, temp
      integer, value:: lowt, nbasis, blocksize
      double precision, value:: one


!     Initialising variables
      kk=0
      ii=0
      jj=0

!     Setting local thread index and block index
      tx= threadidx%x
      bx= blockidx%x

!     Decomposing into threads
      i= blocksize*(bx-1)+(tx)


!     Conditional statement to prevent threads working outside of array boundaries
     if (i<(nbasis+1))then

!     Ifact array used as an index
         ii=devifact(i)

!     Outer j loop
         do j=1,i
            jj=devifact(j)
            sum=0.0d0
            one=0.0d0;

!     First vecotr multiplication, indexing set by ifact(j) and ifact(i),
!     the results is accumulated in sum
            do k=1,j
               sum=sum+a(ii+k)*b(jj+k)
            end do

!     Second vecotr multiplication, indexing set by ifact(k) and ifact(i),
!     the results is accumulated in sum
            do k=j+1,i
               kk=devifact(k)
               sum=sum+a(ii+k)*b(kk+j)
            end do

!     Third vecotr multiplication, indexing set by ifact(k),
!     the results is accumulated in sum
            do k=i+1,nbasis
               kk=devifact(k)
               sum=sum+a(kk+i)*b(kk+j)
            end do

!     Storing intermediate variables
            temp=sum;
            one=-1.0d0;
            sum=0.0d0

!     Combined section of kernel
!    Multiplying the opposite part of the a and b arrays,
!     notice the b and a arrays have swapped positions.

!     First vecotr multiplication, indexing set by ifact(j) and ifact(i),
!     the results is accumulated in sum
            do k=1,j
               sum=sum+b(ii+k)*a(jj+k)
            end do

!     Second vecotr multiplication, indexing set by ifact(k) and ifact(i),
!     the results is accumulated in sum
            do k=j+1,i
               kk=devifact(k)
               sum=sum+b(ii+k)*a(kk+j)
            end do

!     Third vecotr multiplication, indexing set by ifact(k),
!     the results is accumulated in sum
            do k=i+1,nbasis
               kk=devifact(k)
               sum=sum+b(kk+i)*a(kk+j)
            end do

!     Calculating the counter
            m=0
            do l=i-1,0,-1
               m=m+l
            end do

!     Calculating the result
            d(m+j)=sum+one*temp
         end do
      end if


      end




      subroutine matmul_host(lowt,a,b,d,ifact,nbasis)
      use cudafor
!     Declare host arrays
      Implicit none
      Integer:: threadblocks
      Integer::  error, istat,size
      Integer, value:: lowt, nbasis,blocksize
      double precision, dimension(1:lowt):: a, b,d
      integer::ifact(292)

!     Declare device arrays
      Double precision, device,allocatable, dimension(:)::adev,bdev,ddev

!     Declare grid and block
      type(dim3):: dimGrid, dimBlock

!     Declare variables for timer
      real ctimeall, ctimekernel
      integer c1, c2, c3, c4

!     Start time for kernel and data transfers
      call system_clock( count=c1 )

!     Allocate memory on the device
      allocate(adev(1:lowt),bdev(1:lowt),ddev(1:lowt))
      istat = cudathreadsynchronize()

!     Define the grid and block shape
      blocksize=3
      if (mod(nbasis, blocksize)==0) then
         threadblocks=nbasis/blocksize
      else
         threadblocks=nbasis/blocksize+1
      end if
      dimGrid= dim3(threadblocks, 1, 1)
      dimBlock= dim3 (blocksize, 1, 1)

!     Copy a, b, and ifact to the device
      Adev = A(1:lowt)
      bdev = b(1:lowt)
      devifact=ifact(1:292)

!     Second timer to record kernel time without data transfers
      call system_clock( count=c2 )

!     Kernel invocation
      call matmulcuf<<<dimGrid,dimBlock>>>(lowt,adev,bdev,ddev,nbasis,blocksize)

!     Synchronise threads
      istat = cudathreadsynchronize()

!     Stop second timer
      call system_clock( count=c3 )

!     Copy results back to host array
      d = ddev(1:lowt)

!     Stop first timer
      call system_clock( count=c4 )

!     Freeing arrays on device(may need to declare as integer function)
      deallocate(adev,bdev,ddev)

!     Print out kernel times
      ctimekernel = c3 - c2
      ctimeall = c4 - c1
      print *, 'Kernel time excluding data xfer:', ctimekernel/1000000
      print *, 'Total time including data xfer: ', ctimeall/1000000
      end

      end module matmul_mod



- Mat
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group