PGI User Forum

OpenACC: Problem with present directive and module array
xlapillonne
Posted: Tue May 08, 2012 8:11 am    Post subject: OpenACC: Problem with present directive and module array

Dear PGI support,

While translating some of my PGI Accelerator test codes to OpenACC, I have found a significant performance decrease (about 5x slower). After some investigation, it seems to be related to the present directive.

Here is the original PGI acc code:

----------------------------
Code:


! test programme PGI acc
module data_field
  implicit none
  real*8, allocatable :: a(:),b(:)
  !$acc mirror(a,b)
END module data_field

module work_array
  implicit none
  real*8, allocatable :: a1(:), a2(:),  a3(:), a4(:), a5(:),  a6(:)
  real*8, allocatable :: zparam(:)
  !$acc mirror(a1,a2,a3,a4,a5,a6)
  !$acc mirror(zparam)
END module work_array

module computation
  implicit none
 
contains

subroutine gpu_routine(nvec,a,b)
  USE work_array
  integer, intent(in) :: nvec
  real*8, intent(inout) :: a(nvec)
  real*8, intent(in) :: b(nvec)
  integer :: i,k, iparam, il
  !$acc reflected(a,b) 
 

  !$acc region
  DO iparam=1,8
     zparam(iparam)=0.1D0*iparam
  END DO
  !$acc end region


  !$acc region
  Do i=1,nvec       
     a1(i)=0.1D0*(1.0D0+1.0D0/i)
     a2(i)=0.2D0*(1.0D0+1.0D0/i)
     a3(i)=0.3D0*(1.0D0+1.0D0/i)
     a4(i)=0.4D0*(1.0D0+1.0D0/i)
     a5(i)=0.5D0*(1.0D0+1.0D0/i)
     a6(i)=0.6D0*(1.0D0+1.0D0/i)
  END do
  !$acc end region

  !$acc region do kernel
  do i=1,nvec       
     do iparam=1,8 ! just to imitate many operations
        a(i)=zparam(iparam)*(1+cos(a(i)))+b(i)*(1.0D0+sin(1.0D0+a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)))
     end do
  end do !i
  !$acc end region

end subroutine gpu_routine

end module computation
 
 


program main
  USE data_field, only: a,b
  USE work_array
  USE computation, only: gpu_routine
  implicit none
  integer :: n1, n2
  integer :: nargs,i,j,k,nt, dummy(20), niter
  character*10 arg
  integer :: nvec,nblock
  real*8 :: rt
  INTEGER ::  icountnew, icountold, icountrate, icountmax
  INTEGER :: z_sync(2) ! used for synchronization
  !$acc local(z_sync)

  nargs = command_argument_count()
  niter=10

  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,*) n1   ! list-directed read; a bare '(i)' format is non-standard
     call getarg( 2, arg )
     read(arg,*) n2
  else
     stop 'usage: ./test n1 n2'   ! stop-code takes no parentheses in standard Fortran
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec),b(nvec))
 allocate(a1(nvec), a2(nvec),  a3(nvec), a4(nvec), a5(nvec),  a6(nvec))
 allocate(zparam(8))
 z_sync(:)=1


 !$acc region
 do i=1,nvec
    a(i)=0.0D0
    b(i)=0.1D0
 end do
 !$acc end region

 
 !$acc update device(z_sync)
 CALL SYSTEM_CLOCK(COUNT=icountold,COUNT_RATE=icountrate,COUNT_MAX=icountmax)
 
 do nt=1,niter
       call gpu_routine(nvec,a,b)
 end do

 !$acc update device(z_sync)
  CALL SYSTEM_CLOCK(COUNT=icountnew)
  !$acc update host(a)

rt = ( REAL(icountnew) - REAL(icountold) ) / REAL(icountrate)
 print*, 'n1 =', n1, 'n2=', n2, sum(a), sum(z_sync)
 write(*,20) rt*1.0e3/niter
20 format( ' time/step=', f10.5, ' ms' )

 DEALLOCATE(a,b,a1,a2,a3,a4,a5,a6,zparam)


end program main



and the OpenACC version:

Code:

! test programme OpenACC
module data_field
  implicit none
  real*8, allocatable :: a(:),b(:)
END module data_field

module work_array
  implicit none
  real*8, allocatable :: a1(:), a2(:),  a3(:), a4(:), a5(:),  a6(:)
  real*8, allocatable :: zparam(:)
END module work_array

module computation
  implicit none 
contains

subroutine gpu_routine(nvec,a,b)
  USE work_array, only: a1,a2,a3,a4,a5,a6,zparam
  integer, intent(in) :: nvec
  real*8, intent(inout) :: a(nvec)
  real*8, intent(in) :: b(nvec)
  integer :: i, iparam

  !$acc data present(a,b) &
  !$acc& present(a1,a2,a3,a4,a5,a6) &
  !$acc& present(zparam)
 
  !$acc kernels
  DO iparam=1,8
     zparam(iparam)=0.1D0*iparam
  END DO
  !$acc end kernels
 

  !$acc kernels
  Do i=1,nvec       
     a1(i)=0.1D0*(1.0D0+1.0D0/i)
     a2(i)=0.2D0*(1.0D0+1.0D0/i)
     a3(i)=0.3D0*(1.0D0+1.0D0/i)
     a4(i)=0.4D0*(1.0D0+1.0D0/i)
     a5(i)=0.5D0*(1.0D0+1.0D0/i)
     a6(i)=0.6D0*(1.0D0+1.0D0/i)
  END do
  !$acc end kernels

  !$acc kernels loop
  do i=1,nvec       
     do iparam=1,8 ! just to imitate several operations
        a(i)=zparam(iparam)*(1+cos(a(i)))+b(i)*(1.0D0+sin(1.0D0+a1(i)+a2(i)+a3(i)+a4(i)+a5(i)+a6(i)))
     end do
  end do !i
  !$acc end kernels

  !$acc end data
 
end subroutine gpu_routine

end module computation
 
program main
  USE data_field, only: a,b
  USE work_array, only: a1,a2,a3,a4,a5,a6,zparam
  USE computation, only: gpu_routine
  implicit none
  integer :: n1,n2
  integer :: nargs,i,j,k,nt, niter
  character*10 arg
  integer :: nvec,nblock
  real*8 :: rt
  INTEGER ::  icountnew, icountold, icountrate, icountmax
  INTEGER :: z_sync(2) ! used for synchronization

  nargs = command_argument_count()
  niter=10
  if( nargs == 2 ) then
     call getarg( 1, arg )
     read(arg,*) n1   ! list-directed read; a bare '(i)' format is non-standard
     call getarg( 2, arg )
     read(arg,*) n2
  else
     stop 'usage: ./test n1 n2'   ! stop-code takes no parentheses in standard Fortran
  endif
 
 nvec=n1*n2
 
 allocate(a(nvec),b(nvec))
 allocate(a1(nvec), a2(nvec), a3(nvec),a4(nvec),a5(nvec),a6(nvec))
 allocate(zparam(8))
 z_sync(:)=1

!$acc data create(z_sync) &
!$acc& create(a,b) &
!$acc& create(a1,a2,a3,a4,a5,a6,zparam)

 !$acc kernels
 do i=1,nvec
    a(i)=0.0D0
    b(i)=0.1D0
 end do
 !$acc end kernels
 
 !$acc update device(z_sync)
 CALL SYSTEM_CLOCK(COUNT=icountold,COUNT_RATE=icountrate,COUNT_MAX=icountmax)
 
 do nt=1,niter
       call gpu_routine(nvec,a,b)
 end do

 !$acc update device(z_sync)
  CALL SYSTEM_CLOCK(COUNT=icountnew)
  !$acc update host(a)


 rt = ( REAL(icountnew) - REAL(icountold) ) / REAL(icountrate)
 print*, 'n1 =', n1, 'n2=', n2, sum(a), sum(z_sync)
  write(*,20) rt*1.0e3/niter
20 format( ' time/step=', f10.5, ' ms' )

 DEALLOCATE(a,b,a1,a2,a3,a4,a5,a6,zparam)
  !$acc end data

end program main



------------------------------

If I now compile and run the two codes I get:

> pgf90 -ta=nvidia -o test test.f90
> ./test 100 100
 n1 = 100 n2= 100 12313.39881122256 2
 time/step= 0.10040 ms

> pgf90 -ta=nvidia -o test_openacc test_openacc.f90
> ./test_openacc 100 100
 n1 = 100 n2= 100 12313.39881122256 2
 time/step= 0.56890 ms


I have investigated the problem with the NVIDIA profiler, and in the OpenACC version I see several memcpyHtoD transfers between the three kernels in subroutine "gpu_routine".
This seems incorrect, as all arrays involved at this point should already be on the GPU (unless I am doing something wrong with the data region in the main program).


Best regards,

Xavier

PGI version: 12.4
mkcolg (The Portland Group Inc.)
Posted: Tue May 08, 2012 4:58 pm

Hi Xavier,

As you know, our support for OpenACC is still in development, so your feedback is very much appreciated. In this case, what's happening is that the section descriptors for these arrays are being copied over to the GPU each time the present clause is encountered. Michael is aware of the issue and will have his team raise the priority of removing the need to copy over the section descriptors; this work had been scheduled for later this year.

Note that the better OpenACC equivalent of "mirror" is the "device_resident" clause, which is currently scheduled to be available in the 12.6 release.
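
For illustration, a minimal sketch of what that declaration might look like, assuming the OpenACC 1.0 "declare device_resident" syntax (not yet supported in 12.4):

Code:

! sketch only -- assumes the OpenACC 1.0 declare directive
! with the device_resident clause (scheduled for 12.6)
module data_field
  implicit none
  real*8, allocatable :: a(:), b(:)
  ! allocate a and b in device memory, taking the role of
  ! !$acc mirror(a,b) in the PGI Accelerator version
  !$acc declare device_resident(a, b)
END module data_field


One caveat: device_resident arrays have no host copy, so a full port along these lines would also need to rework host-side references such as "!$acc update host(a)" and the final sum(a).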

Best Regards,
Mat
xlapillonne
Posted: Wed May 09, 2012 12:45 am

Hi Mat,

OK, thanks for the info. I'll wait then before testing OpenACC further.

Best regards,

Xavier
mkcolg (The Portland Group Inc.)
Posted: Wed May 09, 2012 8:08 am

Quote:
I'll wait then before testing OpenACC further.
Please do continue testing, since we need the feedback as we implement the full standard.

Thanks!
Mat
xlapillonne
Posted: Fri May 11, 2012 6:26 am

Hi Mat,

After your last comment, I also went back and analysed the NVIDIA profiler output for some of my other PGI Accelerator codes (not OpenACC).

In several subroutines I have also found many memcpyHtoD transfers between kernels, although the arrays used there are embedded in a data region and passed using the reflected keyword (and I don't get any copy messages from -Minfo).

In fact, in one subroutine (which is made up of a few different kernels), memcpyHtoD accounts for half of the total time (looking at the CPU time column).

What do these memcpyHtoD transfers correspond to? Are they also related to section descriptors, as in the OpenACC case, or could they be related to scalar parameters required in the kernels?

Is there a way to get a list of what is being sent to the device? (-Minfo apparently says nothing about it.)
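
One thing that may help here, as a sketch: the PGI runtime honours the ACC_NOTIFY environment variable, which prints a line to stderr for each kernel launch; later releases also document a PGI_ACC_NOTIFY bitmask whose bit 2 reports data transfers. Whether the transfer bit is available in 12.4 is an assumption here.

Code:

> ACC_NOTIFY=1 ./test 100 100       # one line printed per kernel launch
> PGI_ACC_NOTIFY=3 ./test 100 100   # launches + data transfers (later PGI releases)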

Thanks,

Xavier
Page 1 of 3

 