! start the module containing the matrix multiply kernel module mmul_mod use cudafor contains ! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL attributes(global) subroutine mmul_kernel( A, B, C, N, M, L ) real,device :: A(N,M), B(M,L), C(N,L) integer, value :: N, M, L integer :: i, j, kb, k, tx, ty ! submatrices are declared to be in CUDA shared memory real, shared :: Asub(16,16), Bsub(16,16) ! the value of C(i,j) being computed, a temporary scalar real :: Cij ! Start execution, first get my thread indices tx = threadidx%x ty = threadidx%y ! This thread computes C(i,j) = sum(A(i,:) * B(:,j)) i = (blockidx%x-1) * 16 + tx j = (blockidx%y-1) * 16 + ty Cij = 0.0 ! Do the k loop in chunks of 16, the block size do kb = 1, M, 16 ! Fill the submatrices; each of 16x16 threads in the thread block ! loads one element of Asub and Bsub Asub(tx,ty) = A(i,kb+ty-1) Bsub(tx,ty) = B(kb+tx-1,j) ! Wait until all elements are filled call syncthreads() ! Multiply the two submatrices; ! Each of the 16x16 threads accumulates the ! dot product for its element of C(i,j) do k = 1,16 Cij = Cij + Asub(tx,k) * Bsub(k,ty) enddo ! Synchronize to make sure all threads are done reading the submatrices before ! overwriting them in the next iteration of the kb loop call syncthreads() enddo ! Each of the 16x16 threads stores its element to the global C array C(i,j) = Cij end subroutine mmul_kernel ! The host routine to drive the matrix multiplication subroutine mmul( A, B, C ) ! assumed shape input arrays real, dimension(:,:) :: A, B, C ! Array dimensions integer :: N, M, L ! allocatable device arrays real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev ! dim3 variables to define the grid and block shapes type(dim3) :: dimGrid, dimBlock integer :: r ! Get the array sizes real ctimeall, ctimekernel, flops, mflopskernel, mflopsall integer c1, c2, c3, c4 ! Begin execution, first determine the sizes of the input arrays N = size( A, 1 ) M = size( A, 2 ) L = size( B, 2 ) ! Start data xfer-inclusive timer and allocate the device arrays using ! F90 ALLOCATE call system_clock( count=c1 ) allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) ) ! Copy A and B to the device using F90 array assignments Adev = A(1:N,1:M) Bdev = B(1:M,1:L) ! Create the grid and block dimensions dimGrid = dim3( N/16, L/16, 1 ) dimBlock = dim3( 16, 16, 1 ) ! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion call system_clock( count=c2 ) call mmul_kernel<<>>( Adev, Bdev, Cdev, N, M, L ) r = cudathreadsynchronize() ! Stop data xfer-exlusive timer, copy the results back, stop data xfer- ! inclusive timer call system_clock( count=c3 ) C(1:N,1:L) = Cdev call system_clock( count=c4 ) ! Calculate inclusive/exclusive execution times, and report MFLOPS flops = float(N) * float(M) * float(L) ctimekernel = c3 - c2 mflopskernel = flops / ctimekernel ctimeall = c4 - c1 mflopsall = flops / ctimeall ! Print out results print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds' print *, 'Megaflops excluding data xfer: ', mflopskernel print *, 'Total time including data xfer: ', ctimeall, ' microseconds' print *, 'Megaflops including data xfer: ', mflopsall ! Deallocate device arrays and exit deallocate( Adev, Bdev, Cdev ) end subroutine mmul end module mmul_mod ! Main program to initialize arrays, invoke mmul, check results program matmul use mmul_mod real,dimension(:,:),allocatable :: A,B,C,CC integer N, M, L integer idevice, istat ! Begin execution N = 512 M = 1024 L = 512 idevice = 0 print *,' arrays sized ', N, ' by ', M, ' by ', L allocate(A(N,M),B(M,L),C(N,L),CC(N,L)) ! Initialize the A and B arrays; zero out the C array to be computed ! on the GPU, and the CC array to be computed on the host do j = 1,M do i = 1,N A(i,j) = i*10 + j*1000 enddo enddo do j = 1,L do i = 1,M B(i,j) = i-j enddo enddo do j = 1,L do i = 1,N CC(i,j) = 0.0 C(i,j) = 0.0 enddo enddo ! Initialize CPU device istat = cudaSetDevice(idevice) ! Call matrix multiply subroutine to execute on the GPU to compute C print *,'calling mmul' call mmul( A, B, C ) print *,' C(1,1) = ', C(1,1) print *,' C(2,2) = ', C(2,2) ! Perform matrix multiply on host to compute CC do i = 1,N do j = 1,L do k = 1,M CC(i,j) = CC(i,j) + A(i,k)*B(k,j) enddo enddo enddo ! Check for errors ierr = 0 do j = 1,L do i = 1,N diff = abs(C(i,j) - CC(i,j)) denom = CC(i,j) if ( denom == 0.0 ) denom = 1.0 error = diff / denom if ( error > 2.0e-5 ) then ierr = ierr + 1 if ( ierr <= 10 ) then print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error endif endif enddo enddo if( ierr == 0 )then print *, ' No errors found' else print *, ierr, ' ERRORS FOUND!!!' endif end program