PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

About two or more GPUs
Goto page Previous  1, 2
 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Programming and Compiling
View previous topic :: View next topic  
Author Message
Balkrishna



Joined: 22 Mar 2010
Posts: 6

PostPosted: Mon Jul 30, 2012 11:58 pm    Post subject: OpenMP + Multi-GPU + CUDA Fortran not working Reply with quote

Hi,

Can somebody please post a sample example of OpenMP + Multi-GPU + CUDA Fortran? I have seen enough examples on MPI, but not any single running example on OpenMP.

Thanks in advance.
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 6141
Location: The Portland Group Inc.

PostPosted: Tue Jul 31, 2012 2:32 pm    Post subject: Reply with quote

Hi Balkrishna,

Below is some code that uses OpenMP+CUDA Fortran. I had originally written it for my article on Multi-GPU programming (https://www.pgroup.com/lit/articles/insider/v3n3a2.htm) but decided to just focus on MPI+CUDA Fortran.

The biggest things to watch out for are your device allocatables since they need to be managed by separate threads. Be sure to put to put them in a "private" clause and don't allocate them till after the GPU context has been created (i.e. after cudaSetDevice has been called). Also, unless you're using a Kepler, each OpenMP thread should have their own GPU.

Let me know if you have questions,
Mat

Code:
% cat life.F90
#ifdef _CUDA
#define DEVICE device,
#else
#define DEVICE
#endif

module life_mod
 
    integer :: NXSIZE, NYSIZE, SEED
    parameter(NXSIZE=128, NYSIZE=128, SEED=123)
    real :: initPercent=0.4

contains

#ifdef _CUDA
attributes(global) subroutine life_kernel(dOld, dNew, Nx, Ny)

   implicit none
   integer, dimension(Nx,Ny), device :: dOld, dNew
   integer, value :: Nx,Ny
   integer :: i, j, ix, iy, neighbors, state

   ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
   iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

   if (ix .gt. 1 .and. iy.gt.1 .and. &
            ix .lt. Nx .and. iy .lt. Ny) then
   
            neighbors = dOld(ix,iy-1) + &
                   dOld(ix,iy+1) +   &
                   dOld(ix+1,iy-1) + &   
                   dOld(ix+1,iy+1) + &   
                   dOld(ix-1,iy-1) + &   
                   dOld(ix-1,iy+1) + &   
                   dOld(ix-1,iy) +   &
                   dOld(ix+1,iy)
   state = dOld(ix,iy)
   
   if (state .eq. 0 .and. neighbors .eq. 3) then
      dNew(ix,iy) = 1  ! birth
   else if (state.eq.1.and.neighbors.ne.2.and.neighbors.ne.3) then
      dNew(ix,iy) = 0  ! death
   else
      dNew(ix,iy) = state ! no change
        end if
   end if
   
end subroutine life_kernel

#else
subroutine life(dOld, dNew, Nx, Ny)

   implicit none
   integer, dimension(Nx,Ny) :: dOld, dNew
   integer, value :: Nx,Ny,tid
   integer :: i, j,ix,iy, neighbors, state

        do ix=2,Nx-1
   do iy=2,Ny-1
     
            neighbors = dOld(ix,iy-1) + &
                   dOld(ix,iy+1) +   &
                   dOld(ix+1,iy-1) + &   
                   dOld(ix+1,iy+1) + &   
                   dOld(ix-1,iy-1) + &   
                   dOld(ix-1,iy+1) + &   
                   dOld(ix-1,iy) +   &
                   dOld(ix+1,iy)

       state = dOld(ix,iy)

       if (state .eq. 0 .and. neighbors .eq. 3) then
         dNew(ix,iy) = 1  ! birth
       else if (state.gt.0.and.neighbors.ne.2.and.neighbors.ne.3) then
         dNew(ix,iy) = 0  ! death 0
       else
              dNew(ix,iy) = state ! no change
             end if

      end do
   end do

end subroutine life

#endif

subroutine lifeMain ()
   
#ifdef _CUDA
   use cudafor
#endif
#ifdef _OMP
   use omp_lib
#endif
   implicit none

   integer :: Nx, Ny, Nt, temp, count, steps, i, j, angle, nthds, tnum
        integer :: numdev, devnum, istat, myseed, blocksize
   integer, allocatable, DEVICE dimension(:,:) :: dOld, dNew
   integer, allocatable, dimension(:,:) :: A, B
   integer :: start, end
   real, allocatable, dimension(:,:) :: rand
   character(len=80) :: rfile
   integer :: alive
#ifdef _CUDA
       type(dim3) :: dimGrid, dimBlock
#else
        integer,dimension(3) :: dimGrid,dimBlock
#endif
   ! account for the halo
   Nx = NXSIZE+2
   Ny = NYSIZE+2
   myseed = SEED
   allocate(A(Nx,Ny),B(Nx,Ny),rand(Nx,Ny))
   call random_seed(myseed)
   call random_number(rand)
   A=0
   do i=2,Nx-1
      do j=2,Ny-1
        if (rand(i,j) < initPercent) then
           A(i,j) = 1
        else
                A(i,j) = 0
             endif
           enddo
        enddo
        B=A

   ! Set the inital conditions
   count = 0
   steps = 0
   temp = 0
   alive = 1      
#ifdef _CUDA
        istat = cudaGetDeviceCount(numdev)
#endif

!$omp parallel &
!$omp   shared(A,B,count,Nx,Ny,numdev,alive,steps), &
!$omp   private(tnum,dOld,dNew,dimGrid,dimBlock,devnum,istat,Nt, &
!$omp           i,j,blocksize,start,end)

#ifdef _OMP
   nthds = omp_get_num_threads()   
   tnum = omp_get_thread_num()
#else
        nthds = 1
        tnum=0
#endif

!$omp master
   
!$omp end master
!$omp barrier

   blocksize =  NXSIZE/nthds
   Nt = blocksize+2
   start = (tnum*blocksize)+1
   end = start + blocksize + 1

#ifdef _CUDA
        devnum = mod(tnum,numdev)
        istat = cudaSetDevice(devnum)
#endif
      allocate(dNew(Nt,Ny), dOld(Nt,Ny))   

        dOld=0
   dNew=0
#ifdef _CUDA
   dimBlock = dim3(16,16,1)
        dimGrid = dim3((Nt+15)/16,(ny+15)/16,1)
#endif

   do, while (steps.lt.16384.and.count.lt.10.and.alive.gt.0)

       dOld(1:Nt,1:Ny) = A(start:end,1:Ny)
            dNew=dOld
#ifdef _CUDA
            call life_kernel<<<dimGrid,dimBlock>>>(dOld,dNew,Nt,Ny)
#else
            call life(dOld,dNew,Nt,Ny)
#endif
       B(start+1:end-1,1:Ny) = dNew(2:Nt-1,1:Ny)
!$omp barrier

!$omp master
            A=B
       alive = sum(A)
!            if (mod(steps,100).eq.0) then
            print *, 'Step:', steps, ' Alive:', alive
!            endif
            if (temp.lt.(alive+2).and.temp.gt.(alive-2)) then
               count = count + 1
       else
          count = 0
            endif
            temp = alive
       steps = steps + 1

!$omp end master
!$omp barrier
    
   end do
   
   deallocate(dOld)
   deallocate(dNew)

!$omp end parallel
 
   alive = sum(A)
   print *, 'Step:', steps, ' Alive:',alive
   print *, 'Init%:', initPercent, ' Actual%:', real(alive)/real(NXSIZE*NYSIZE)
   close(21)
   deallocate(A,B)
      
   stop

end subroutine lifeMain
end module life_mod


program life
use life_mod
implicit none
call lifeMain()
end program life

% pgfortran -Mpreprocess -mp -fast -Minfo=accel -Mcuda life.F90 -o life.out
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Programming and Compiling All times are GMT - 7 Hours
Goto page Previous  1, 2
Page 2 of 2

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group