PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

Data visualization
Goto page 1, 2, 3  Next
 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
Tuan



Joined: 11 Jun 2009
Posts: 233

PostPosted: Wed Dec 22, 2010 7:41 am    Post subject: Data visualization Reply with quote

Do we have any example showing how to use the CUDA Fortran data (say 2D array) for visualization purpose, i.e. calling OpenGL interoperability

Thanks,
Tuan
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 5815
Location: The Portland Group Inc.

PostPosted: Mon Jan 03, 2011 1:07 pm    Post subject: Reply with quote

Hi Tuan,

I don't know if it's exactly what you want, but below is an example CUDA Fortran program that includes some simple OpenGL calls. You'll need to build the F90gl library ( http://math.nist.gov/f90gl/), see http://www.pgroup.com/resources/f90gl/f90gl129_pgi60.htm for details. Although I originally wrote this guide for use with the 6.0 compilers so may be dated, the basic steps should be the same.

Auxiliary files: delay.f90 and dclock_64.s
Code:

#ifndef DELAY
#define DELAY 1000
#endif

subroutine delay

    double precision :: s,t, dclock
    s=dclock()*1.0e+6
    t=s+DELAY
    !print *, s, t
    do while (s<t)
       s=dclock()*1.0e+6
       !print *, s
    enddo
   
end subroutine delay


Code:
        .file   "dclock-hammer.s"
        .align    8
        .data
 .clock:  .double 0.000000000376        # 2.66 GHz
# .clock:  .double 0.000000000357        # 2.8 GHz
# .clock:  .double 0.0000000003333       # 3.0 GHz
# .clock:  .double 0.0000000003125       # 3.2 GHz

.low:   .long 0x00000000
.high:  .long 0x00000000
        .text
       
        .globl   _DCLOCK, dclock, _dclock, _dclock_, dclock_
_DCLOCK:
dclock:
_dclock:
_dclock_:
dclock_:
        .byte   0x0f, 0x31

        movl    %eax, .low(%RIP)
        movl    %edx, .high(%RIP)

        fildll  .low(%RIP)
        fmull   .clock(%RIP)
        fstpl   -24(%rsp)
        movsd   -24(%rsp), %xmm0
        ret


Main code for the game of life "lifeD.F90"
Code:
#ifdef _CUDA
#define DEVICE device,
#else
#define DEVICE
#endif

module lifeD
 
    integer :: NXSIZE, NYSIZE, SEED
    parameter(NXSIZE=128, NYSIZE=128, SEED=2)
    real :: initPercent=0.2

contains

#ifdef _CUDA
attributes(global) subroutine life1_kernel(dOld, dNew, Nx, Ny)

   implicit none
   integer, dimension(Nx,Ny), device :: dOld, dNew
   integer, value :: Nx,Ny
   integer :: i, j, ix, iy, neighbors, state

   ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
   iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

   if (ix .gt. 1 .and. iy.gt.1 .and. &
            ix .lt. Nx .and. iy .lt. Ny) then
   
            neighbors = dOld(ix,iy-1) + &
                   dOld(ix,iy+1) +   &
                   dOld(ix+1,iy-1) + &   
                   dOld(ix+1,iy+1) + &   
                   dOld(ix-1,iy-1) + &   
                   dOld(ix-1,iy+1) + &   
                   dOld(ix-1,iy) +   &
                   dOld(ix+1,iy)
   state = dOld(ix,iy)
   
   if (state .eq. 0 .and. neighbors .eq. 3) then
      dNew(ix,iy) = 1  ! birth
   else if (state.eq.1.and.neighbors.ne.2.and.neighbors.ne.3) then
      dNew(ix,iy) = 0  ! death
   else
      dNew(ix,iy) = state ! no change
        end if
   end if
   
end subroutine life1_kernel

#else
subroutine life1(dOld, dNew, Nx, Ny)

   implicit none
   integer, dimension(Nx,Ny) :: dOld, dNew
   integer, value :: Nx,Ny,tid
   integer :: i, j,ix,iy, neighbors, state

        do ix=2,Nx-1
   do iy=2,Ny-1
     
            neighbors = dOld(ix,iy-1) + &
                   dOld(ix,iy+1) +   &
                   dOld(ix+1,iy-1) + &   
                   dOld(ix+1,iy+1) + &   
                   dOld(ix-1,iy-1) + &   
                   dOld(ix-1,iy+1) + &   
                   dOld(ix-1,iy) +   &
                   dOld(ix+1,iy)

       state = dOld(ix,iy)

       if (state .eq. 0 .and. neighbors .eq. 3) then
         dNew(ix,iy) = 1  ! birth
       else if (state.gt.0.and.neighbors.ne.2.and.neighbors.ne.3) then
         dNew(ix,iy) = 0  ! death 0
       else
              dNew(ix,iy) = state ! no change
             end if

      end do
   end do

end subroutine life1

#endif

subroutine lifeDisplay ()
   
        use opengl_gl
        use opengl_glut
#ifdef _CUDA
   use cudafor
#endif
   use omp_lib
   implicit none

   integer :: Nx, Ny, Nt, temp, count, steps, i, j, angle, nthds, tnum
        integer :: numdev, devnum, istat, myseed, blocksize
   integer, allocatable, DEVICE dimension(:,:) :: dOld, dNew
   integer, allocatable, dimension(:,:) :: A, B
   integer :: start, end
   real, allocatable, dimension(:,:) :: rand
   character(len=80) :: rfile
   integer :: alive
#ifdef _CUDA
       type(dim3) :: dimGrid, dimBlock
#else
        integer,dimension(3) :: dimGrid,dimBlock
#endif
        real :: x,y
        real :: wd,hg,rad

   ! account for the halo
   Nx = NXSIZE+2
   Ny = NYSIZE+2
   myseed = SEED
   allocate(A(Nx,Ny),B(Nx,Ny),rand(Nx,Ny))
   call random_seed(myseed)
   call random_number(rand)
   A=0
   do i=2,Nx-1
      do j=2,Ny-1
        if (rand(i,j) < initPercent) then
           A(i,j) = 1
        else
                A(i,j) = 0
             endif
           enddo
        enddo
        B=A

       ! set the width and height of each box
        wd = 1.0/real(Ny)
        hg = 1.0/real(Nx)
        rad= wd/2.0;
        !print *, 'Nx:', Nx, 'Ny:', Ny, 'WD:', wd, 'HG:', hg
   ! Set the inital conditions
   count = 0
   steps = 0
   temp = 0
   alive = 1      
#ifdef _CUDA
        istat = cudaGetDeviceCount(numdev)
#endif

!$omp parallel &
!$omp   shared(A,B,count,Nx,Ny,numdev,alive,steps), &
!$omp   private(tnum,dOld,dNew,dimGrid,dimBlock,devnum,istat,Nt,x,y, &
!$omp           angle,i,j,blocksize,start,end)

#ifdef _OMP
   nthds = omp_get_num_threads()   
   tnum = omp_get_thread_num()
#else
        nthds = 1
        tnum=0
#endif

!$omp master
   
!$omp end master
!$omp barrier

   blocksize =  NXSIZE/nthds
   Nt = blocksize+2
   start = (tnum*blocksize)+1
   end = start + blocksize + 1

!   if (tnum .eq. nthds-1) then
!      Nt = Nt + mod(blocksize,nthds)
!      end = Nx
!        endif

#ifdef _CUDA
        devnum = mod(tnum,numdev)
        istat = cudaSetDevice(devnum)
#endif
      allocate(dNew(Nt,Ny), dOld(Nt,Ny))   

        dOld=0
   dNew=0
#ifdef _CUDA
   dimBlock = dim3(16,16,1)
        dimGrid = dim3((Nt+15)/16,(ny+15)/16,1)
#endif

   do, while (steps.lt.16384.and.count.lt.10.and.alive.gt.0)

       dOld(1:Nt,1:Ny) = A(start:end,1:Ny)
            dNew=dOld
#ifdef _CUDA
            call life1_kernel<<<dimGrid,dimBlock>>>(dOld,dNew,Nt,Ny)
#else
            call life1(dOld,dNew,Nt,Ny)
#endif
       B(start+1:end-1,1:Ny) = dNew(2:Nt-1,1:Ny)
!$omp barrier


!$omp master
            A=B
       alive = sum(A)
            print *, 'Step:', steps, ' Alive:', alive
       call glclear(GL_COLOR_BUFFER_BIT)
            call glcolor3f(1.0_glfloat, 1.0_glfloat, 1.0_glfloat)
            do i=1,Nx
          do j=Ny,1,-1
                   x=(i-1)*wd
                   y=(j-1)*hg                 
                   
            if (A(i,j).gt.0) then
                    call glBegin(GL_TRIANGLE_FAN)
                    call glVertex2f(x, y)
                    do angle=1,360,5
                        call glVertex2f(x+sin(real(angle))*rad,y+cos(real(angle))*rad)
                    enddo
                   call glEnd()
                 endif
              enddo
            enddo

            call glflush()
            call delay() 

            if (temp.lt.(alive+2).and.temp.gt.(alive-2)) then
               count = count + 1
       else
          count = 0
            endif
            temp = alive
       steps = steps + 1

!   pause   

!$omp end master
!$omp barrier
    
   end do
   
   deallocate(dOld)
   deallocate(dNew)
#ifdef _CUDA
   istat=cudaThreadExit()
#endif

!$omp end parallel
 
   alive = sum(A)
   print *, 'Step:', steps, ' Alive:',alive
   print *, 'Init%:', initPercent, ' Actual%:', real(alive)/real(NXSIZE*NYSIZE)
   close(21)
   deallocate(A,B)
      
!   stop

end subroutine lifeDisplay


end module lifeD


      
program life
 use opengl_gl
use opengl_glut
use lifeD
implicit none

real(kind=glfloat),  parameter :: zero  = 0.0_glfloat
real(kind=gldouble), parameter :: dzero = 0.0_gldouble, &
                                  one   = 1.0_gldouble
integer(kind=glcint) :: iwin,windW,windH
integer(kind=glenum) :: itemp

windW=1024
windH=1024

! Declare initial window size, position, and display mode
! (single buffer and RGBA).  Open window with "LIFE"
! in its title bar.
call glutInitWindowSize(windW,windH)
call glutinit()
call glutinitdisplaymode(ior(GLUT_SINGLE,GLUT_RGB))
iwin = glutcreatewindow("LIFE")

! select clearing color
call glclearcolor(zero, zero, zero, zero)

! initialize viewing values
itemp = GL_PROJECTION
call glmatrixmode(itemp)
call glloadidentity()
call glortho(dzero, one, dzero, one, -one, one)

! Register callback function to display graphics.
! Enter main loop and process events.
call glutdisplayfunc(lifeDisplay)
call glutmainloop()
end program life


Compilation commands:
Code:
pgfortran -I./f90gl-1.2.15/include/GL  -mp -fast -c -Mpreprocess   -Mcuda=cuda3.1  ./lifeD.F90 -o .//lifeD.o
pgfortran -g -c -Mpreprocess -DDELAY=10000 ./delay.f90 -o ./delay.o
pgfortran -c ./dclock_64.s -o ./dclock_64.o
pgfortran -mp -fast   -Mcuda=cuda3.1  ./lifeD.o ./delay.o ./dclock_64.o -L./f90gl-1.2.15/lib  -lglut1 -L/lib -lGLU -lGL -lf90glut -lf90GLU -lf90GL -lglut1 -L/usr/X11R6/lib -lXaw -lXt -lXmu -lXi -lXext -lX11 -o lifeD.out

Also, the code can be compiled with or without OpenMP (add or remove -mp) and with or without CUDA (add remove -Mcuda).

Hope this helps,
Mat
Back to top
View user's profile
Tuan



Joined: 11 Jun 2009
Posts: 233

PostPosted: Mon Jan 03, 2011 2:23 pm    Post subject: Reply with quote

Thanks a lot Mat.
I'll check it.

Tuan
Back to top
View user's profile
Tuan



Joined: 11 Jun 2009
Posts: 233

PostPosted: Mon Jan 03, 2011 2:46 pm    Post subject: Reply with quote

Hi Mat.

I don't think this sample code use CUDA-OpenGL interop. Indeed, what the it does is to copy the data from device back to host before calling the rendering functions. It would be great if you have a similar one that use CUDA-OpenGL interop in Fortran.

Also, when I try to write one by myself. I use the cudaGLSetGLDevice(devID), rather than cudaSetDevice(devID) and the sample code crash, with "segmentation fault", when I try to allocate data on device memory.

Here, I created an explicit interface to cudaGLSetGLDevice as CUDA Fortran has no binding for that.

Code:
PROGRAM ABC
  USE CUDAFOR
  USE CUDA_INTEROP

  REAL, DIMENSION(:,:), ALLOCATABLE, device :: mat_dev
  integer :: gpuID

  gpuID = 0
  IF (cudaGLSetGLDevice( gpuID ) ) THEN
    PRINT *, "Cannot set GPU device: ", gpuID
    WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
    CALL cleanup()
    STOP
  ELSE
    PRINT *, "We use device ID ", gpuID
  ENDIF

   allocate(mat_dev(100, 100))
   
END PROGRAM

MODULE CUDA_INTEROP
 USE CUDAFOR
 USE ISO_C_BINDING

 INTERFACE
    FUNCTION cudaGLSetGLDevice (device)  &
          BIND(C, name="cudaGLSetGLDevice") RESULT(res)
      USE iso_c_binding
      INTEGER(c_int), VALUE :: device
      INTEGER(KIND(cudaSuccess)) :: res
    END FUNCTION cudaGLSetGLDevice

  END INTERFACE
 
CONTAINS
END MODULE


What is your suggestion? I'm using Fortran 10.9, CUDA 3.1.

Tuan
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 5815
Location: The Portland Group Inc.

PostPosted: Mon Jan 03, 2011 3:13 pm    Post subject: Reply with quote

Quote:
I don't think this sample code use CUDA-OpenGL interop. Indeed, what the it does is to copy the data from device back to host before calling the rendering functions
Correct and why I didn't think it was what you wanted, it's just what I had on hand.

Quote:
What is your suggestion? I'm using Fortran 10.9, CUDA 3.1.
This is new for me so don't have much for you here. I'll ask my contacts at NVIDIA to see if they can help.

I was able to work around the seg fault by allocating the array before calling cudaGLSetGLDevice. I'm sure why the segv occurred but hopefully it will get you farther along as you blaze trail here.

Code:
% cat test.cuf

MODULE CUDA_INTEROP
 USE CUDAFOR
 USE ISO_C_BINDING

 INTERFACE
    FUNCTION cudaGLSetGLDevice (device)  &
          BIND(C, name="cudaGLSetGLDevice") RESULT(res)
      USE iso_c_binding
      INTEGER(c_int), VALUE :: device
      INTEGER(KIND(cudaSuccess)) :: res
    END FUNCTION cudaGLSetGLDevice

  END INTERFACE
 
CONTAINS
END MODULE

  PROGRAM ABC
  USE CUDAFOR
  USE CUDA_INTEROP

  REAL, DIMENSION(:,:), ALLOCATABLE, device :: mat_dev
  REAL, DIMENSION(:,:), ALLOCATABLE :: mat
  integer :: gpuID
 

  gpuID = 0
  istat = cudaSetDevice(gpuID) 
  allocate(mat(100, 100), mat_dev(100, 100))

  IF (cudaGLSetGLDevice( gpuID ) ) THEN
    PRINT *, "Cannot set GPU device: ", gpuID
    WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
 !   CALL cleanup()
    STOP
  ELSE
    PRINT *, "We use device ID ", gpuID
  ENDIF
  mat_dev=1.0
  mat = mat_dev

  print *, mat(1,1)
END PROGRAM

Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Goto page 1, 2, 3  Next
Page 1 of 3

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group