|
| View previous topic :: View next topic |
| Author |
Message |
SWL_EGGBABY
Joined: 16 Dec 2009 Posts: 29
|
Posted: Fri Feb 26, 2010 2:25 am Post subject: GPU does not work why? |
|
|
| I put the data into the GPU to run it, the result is wrong,why? |
|
| Back to top |
|
 |
mkcolg
Joined: 30 Jun 2004 Posts: 4996 Location: The Portland Group Inc.
|
Posted: Fri Feb 26, 2010 10:03 am Post subject: |
|
|
Hi SWL_EGGBABY,
There could be a number of possibilities. Can you please provide more details including sample code (if possible), compiler version, and OS? Also, please post your response to this topic. Posting new topics to each response makes it difficult for other users to follow the thread.
Thanks,
Mat |
|
| Back to top |
|
 |
SWL_EGGBABY
Joined: 16 Dec 2009 Posts: 29
|
Posted: Sun Feb 28, 2010 7:19 pm Post subject: GPU does not work why? |
|
|
Hi Mat,
The version is PGI Visual Fortran 2008 x64 10.2
The OS is windows xp 64-bit
The code is:
module mmul_mod
use cudafor
contains
attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
real :: A(N,M), B(M,L), C(N,L)
integer, value :: N, M, L
integer :: i, j, kb, k, tx, ty
real, shared :: Asub(16,16), Bsub(16,16)
real :: Cij
tx = threadidx%x
ty = threadidx%y
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.0
do kb = 1, M, 16
Asub(tx,ty) = A(i,kb+ty-1)
Bsub(tx,ty) = B(kb+tx-1,j)
call syncthreads()
do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo
call syncthreads()
enddo
C(i,j) = Cij
end subroutine mmul_kernel
subroutine mmul( A, B, C )
real, dimension(:,:) :: A, B, C
real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
type(dim3) :: dimGrid, dimBlock
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
Adev = A(1:N,1:M)
Bdev(:,:) = B(1:M,1:L)
dimGrid = dim3( N/16, M/16, 1 )
dimBlock = dim3( 16, 16, 1 )
call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, &
N, M, L )
C(1:N,1:L) = Cdev
deallocate( Adev, Bdev, Cdev )
end subroutine mmul
end module mmul_mod
program swl_eggbaby
use mmul_mod
implicit none
integer,parameter :: N=16,M=16,L=4
integer :: i
real :: A(N,M)=0
real :: B(M,L)=0
real :: C(N,L)
call mmul(A,B,C)
write(*,*) "计算结果:"
do i=1,N
write(*,*) C(N,:)
end do
end program
run it
计算结果:
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
请按任意键继续. . .[/img]
thank you |
|
| Back to top |
|
 |
SWL_EGGBABY
Joined: 16 Dec 2009 Posts: 29
|
Posted: Mon Mar 01, 2010 1:26 am Post subject: GPU does not work why? |
|
|
Device Number: 0
Device Name: Quadro FX 5800
Total Global Memory: 0.000 Gbytes
sharedMemPerBlock: 16384 bytes
regsPerBlock: 16384
warpSize: 32
maxThreadsPerBlock: 512
maxThreadsDim: 512 x 512 x 64
maxGridSize: 65535 x 65535 x 1
ClockRate: 1.296 GHz
Total Const Memory: 65536 bytes
Compute Capability Revision: 1.3
TextureAlignment: 256 bytes
deviceOverlap: T
multiProcessorCount: 30
integrated: T
canMapHostMemory: T
Device Number: 1
Device Name: Tesla C1060
Total Global Memory: 0.000 Gbytes
sharedMemPerBlock: 16384 bytes
regsPerBlock: 16384
warpSize: 32
maxThreadsPerBlock: 512
maxThreadsDim: 512 x 512 x 64
maxGridSize: 65535 x 65535 x 1
ClockRate: 1.296 GHz
Total Const Memory: 65536 bytes
Compute Capability Revision: 1.3
TextureAlignment: 256 bytes
deviceOverlap: T
multiProcessorCount: 30
integrated: T
canMapHostMemory: T |
|
| Back to top |
|
 |
mkcolg
Joined: 30 Jun 2004 Posts: 4996 Location: The Portland Group Inc.
|
Posted: Mon Mar 01, 2010 5:29 pm Post subject: |
|
|
Hi SWL_EGGBABY,
Your code looks fine so I suspect that there is some issue with your cards. Try adding error checking to give us some ideas:
| Code: | module mmul_mod
use cudafor
contains
attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
real :: A(N,M), B(M,L), C(N,L)
integer, value :: N, M, L
integer :: i, j, kb, k, tx, ty
real, shared :: Asub(16,16), Bsub(16,16)
real :: Cij
tx = threadidx%x
ty = threadidx%y
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.0
do kb = 1, M, 16
Asub(tx,ty) = A(i,kb+ty-1)
Bsub(tx,ty) = B(kb+tx-1,j)
call syncthreads()
do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo
call syncthreads()
enddo
C(i,j) = Cij
end subroutine mmul_kernel
subroutine mmul( A, B, C )
real, dimension(:,:) :: A, B, C
real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
type(dim3) :: dimGrid, dimBlock
integer :: errCode
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
Adev = A(1:N,1:M)
Bdev(:,:) = B(1:M,1:L)
dimGrid = dim3( N/16, M/16, 1 )
dimBlock = dim3( 16, 16, 1 )
call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, &
N, M, L )
errCode = cudaGetLastError()
print *, cudaGetErrorString(errCode)
C(1:N,1:L) = Cdev
errCode = cudaGetLastError()
print *, cudaGetErrorString(errCode)
deallocate( Adev, Bdev, Cdev )
end subroutine mmul
end module mmul_mod
program swl_eggbaby
use mmul_mod
implicit none
integer,parameter :: N=16,M=16,L=4
integer :: i
real :: A(N,M)=0
real :: B(M,L)=0
real :: C(N,L)
call mmul(A,B,C)
write(*,*) "Results:"
do i=1,N
write(*,*) C(N,:)
end do
end program
|
- Mat |
|
| Back to top |
|
 |
|
|
You cannot post new topics in this forum You cannot reply to topics in this forum You cannot edit your posts in this forum You cannot delete your posts in this forum You cannot vote in polls in this forum
|
Powered by phpBB © 2001, 2002 phpBB Group
|