Correct manually deep copy of user defined data structure

OpenACC and CUDA Fortran
Post Reply
Peter85
Posts: 48
Joined: Mar 06 2018

Correct manually deep copy of user defined data structure

Post by Peter85 » Thu May 14, 2020 1:28 am

Hi,

I'm trying to use a user defined datatype inside a kernel, but the program fails.
When I compile with the deepcopy flag it works but I would like to know how to manually create a deep copy.

How do I make a correct copy of the comm_data datatype?
I tried to create a small example program; it builds a library and then links it to a test program.

Thank you for your help.

type.f90

Code: Select all

! Derived types for the solver example plus a placeholder setup routine.
! The types nest: KSP holds a comm, which holds an allocatable array of nrk,
! each of which holds its own allocatable integer array.
module krylov

  ! One record: a counter and an allocatable list of integer offsets.
  type nrk
     integer :: ndata
     integer, allocatable :: offset(:)
  end type nrk

  ! A counter and an allocatable array of nrk records.
  type comm
     integer :: ndata
     type(nrk), allocatable :: nrk_array(:)
  end type comm

  ! Solver state: nested comm data and three allocatable real*8 vectors
  ! (presumably residual, solution and right-hand side -- confirm with author).
  type KSP
     type(comm) :: comm_data
     real*8, allocatable :: r(:)
     real*8, allocatable :: x(:)
     real*8, allocatable :: b(:)
  end type KSP

contains

  ! Placeholder: only prints a marker line; neither argument is used.
  subroutine set_KSP_solver(method, size_n)
    implicit none
    type(KSP) :: method
    integer :: size_n

    print *, "Set_ksp_solver"
  end subroutine set_KSP_solver

end module krylov
libmain.f90

Code: Select all

    ! Test driver as originally posted: builds a KSP object via init_KSP_CRS,
    ! copies parts of it to the device and reads the nested components inside
    ! an OpenACC parallel loop. This is the version reported as failing.
    subroutine  solver(vec_size)
        use krylov
        implicit none
        type(KSP) method
        integer n,j,i,vec_size
        real*8,bnorm
        n=vec_size
        bnorm=0.0d0

        write(*,*)"Start testprogram"
        ! NOTE(review): the size passed here is the literal 10, not n.
        call init_KSP_CRS(method,10)
        ! NOTE(review): the data clause names method, comm_data and nrk_array
        ! only; the per-element offset arrays and method%b, both read in the
        ! loop below, are not listed.
        !$acc data copyin(method, method%comm_data,&
        !$acc method%comm_data%nrk_array)
        !$acc parallel loop
        do j=1,n
           ! NOTE(review): i is never assigned in this routine (the loop index
           ! is j), and method%b is not allocated by init_KSP_CRS as posted.
           ! bnorm is accumulated across iterations without a reduction clause.
           bnorm = bnorm+method%b(i)
           write(*,*)method%comm_data%nrk_array(j)%ndata
           write(*,*)method%comm_data%nrk_array(j)%offset(j)
        enddo
        !$acc end data

        write(*,*)"End testprogram"
      end subroutine solver

      ! Host-side setup as originally posted: allocates method%comm_data%nrk_array
      ! with n elements and, for element i, sets ndata = i and allocates an
      ! offset array of length i.
      ! NOTE(review): method%b, method%r and method%x are not allocated here,
      ! and the contents of the offset arrays are never assigned.
      subroutine init_KSP_CRS(method,n)
        use krylov
        implicit none
        type(KSP) method
        integer n,i
        allocate(method%comm_data%nrk_array(n))
        do i=1,n
          method%comm_data%nrk_array(i)%ndata = i
          ! offset for element i has i entries (triangular layout)
          allocate(method%comm_data%nrk_array(i)%offset(i))
        enddo
end subroutine init_KSP_CRS
main.f90

Code: Select all

! Main program: prints a start marker, runs the external routine solver
! with a problem size of 10, then prints an end marker.
program test
  implicit none

  print *, "Start testprogram"
  call solver(10)
  print *, "End testprogram"
end program test
makefile

Code: Select all

# Builds the OpenACC example: type.f90 and libmain.f90 are compiled and
# archived into libtest.a, and main.f90 is linked against it as myProgram.
# NOTE: every recipe line below must start with a TAB character.
CC=pgf90

OBJS=type.o libmain.o
OPTS=-ta=tesla:cc70 -acc -Minfo=accel -Minfo

.PHONY: all clean myProg

%.o: %.f90
	${CC} ${OPTS} -c $<

all: myProgram
myProgram: main.o libtest.a
	${CC} ${OPTS} -o myProgram main.o -L. -ltest

# Kept as an alias for backward compatibility. The original rule ran
# "-c" on main.o itself, which cannot produce anything useful.
myProg: myProgram

# libmain.f90 uses module krylov, so type.f90 has to be compiled first
# (matters for parallel builds and for rebuilds after type.f90 changes).
libmain.o: type.o

libtest.a:${OBJS}
	ar rc libtest.a ${OBJS}

# Also remove the generated module files and the executable.
clean:
	rm -f libtest.a *.o *.mod myProgram

mkcolg
Posts: 8382
Joined: Jun 30 2004

Re: Correct manually deep copy of user defined data structure

Post by mkcolg » Thu May 14, 2020 9:46 am

Hi Peter,

I typically use unstructured data regions which are inserted as part of the allocation and deallocation of the type. Then use update directives in between to synchronize the data. This way the device copy of the type has the same lifetime and scope as the host copy of the type.

Your example wasn't runnable (it segfaults on the host), so I fixed a few things to get it to work and also added how I would perform the deep copy.

Code: Select all

% cat libmain.f90
    ! Test driver: builds a KSP object (host and device copies are both set up
    ! inside init_KSP_CRS), then sums method%b on the device and prints the
    ! nested ndata/offset values from inside the OpenACC loop.
    !
    ! vec_size : number of elements to allocate and to loop over.
    !
    ! NOTE(review): this routine contains no "exit data" directives, so the
    ! device copies created in init_KSP_CRS are not released here.
    subroutine  solver(vec_size)
        use krylov
        implicit none
        integer, intent(in) :: vec_size
        type(KSP) :: method
        integer :: n, j
        real*8 :: bnorm

        n = vec_size
        bnorm = 0.0d0

        write(*,*)"Start testprogram"
        call init_KSP_CRS(method,n)
        ! method and its components are already on the device, so only assert
        ! presence; bnorm is summed across iterations via the reduction clause.
        !$acc parallel loop present(method) reduction(+:bnorm)
        do j=1,n
           bnorm = bnorm+method%b(j)
           write(*,*)method%comm_data%nrk_array(j)%ndata, method%comm_data%nrk_array(j)%offset(j)
        enddo

        write(*,*)"End testprogram", bnorm
    end subroutine solver

      ! Allocates the KSP object on the host and creates the matching device
      ! copies level by level (manual deep copy), then fills in the data and
      ! pushes it to the device with update directives.
      !
      ! method : KSP object whose allocatable components are allocated here.
      ! n      : length of b, r, x and of comm_data%nrk_array; element i of
      !          nrk_array gets an offset array of length i.
      subroutine init_KSP_CRS(method,n)
        use krylov
        implicit none
        type(KSP), intent(inout) :: method
        integer, intent(in) :: n
        integer :: i

        allocate(method%b(n))
        allocate(method%r(n))
        allocate(method%x(n))
        allocate(method%comm_data%nrk_array(n))
        ! Parent object first, then its direct allocatable components.
        ! x must be the whole section (:n) like b and r; x(n) would create
        ! only the last element on the device.
!$acc enter data create(method,method%b(:n),method%r(:n),method%x(:n), &
!$acc                   method%comm_data%nrk_array(:n))
        do i=1,n
          method%comm_data%nrk_array(i)%ndata = i
          ! The device element was only created, not copied, so push the scalar.
!$acc update device(method%comm_data%nrk_array(i)%ndata)
          allocate(method%comm_data%nrk_array(i)%offset(i))
          ! Innermost level: create this element's offset array on the device.
!$acc enter data create(method%comm_data%nrk_array(i)%offset(:i))
        enddo

!This can be someplace else in the program, i.e. when the data is assigned

        method%b=1
!$acc update device(method%b)
        do i=1,n
           method%comm_data%nrk_array(i)%offset = i
!$acc update device(method%comm_data%nrk_array(i)%offset)
        enddo

      end subroutine init_KSP_CRS
% pgfortran type.f90 libmain.f90 main.f90 -o cpu.out
type.f90:
libmain.f90:
main.f90:
% pgfortran type.f90 libmain.f90 main.f90 -ta=tesla -o gpu.out
type.f90:
libmain.f90:
main.f90:
% ./cpu.out
 Start testprogram
 Start testprogram
            1            1
            2            2
            3            3
            4            4
            5            5
            6            6
            7            7
            8            8
            9            9
           10           10
 End testprogram    10.00000000000000
 End testprogram
% setenv PGI_ACC_TIME 1
% ./gpu.out
 Start testprogram
 Start testprogram
            1            1
            2            2
            3            3
            4            4
            5            5
            6            6
            7            7
            8            8
            9            9
           10           10
 End testprogram    10.00000000000000
 End testprogram

Accelerator Kernel Timing data
libmain.f90
  solver  NVIDIA  devicenum=0
    time(us): 131
    12: compute region reached 1 time
        12: kernel launched 1 time
            grid: [1]  block: [128]
             device time(us): total=96 max=96 min=96 avg=96
            elapsed time(us): total=1,341 max=1,341 min=1,341 avg=1,341
        12: reduction kernel launched 1 time
            grid: [1]  block: [256]
             device time(us): total=4 max=4 min=4 avg=4
            elapsed time(us): total=29 max=29 min=29 avg=29
    12: data region reached 4 times
        12: data copyin transfers: 1
             device time(us): total=6 max=6 min=6 avg=6
        18: data copyout transfers: 1
             device time(us): total=25 max=25 min=25 avg=25
libmain.f90
  init_ksp_crs  NVIDIA  devicenum=0
    time(us): 217
    30: data region reached 1 time
        30: data copyin transfers: 4
             device time(us): total=40 max=16 min=6 avg=10
    34: update directive reached 10 times
        34: data copyin transfers: 10
             device time(us): total=59 max=9 min=5 avg=5
    36: data region reached 10 times
        36: data copyin transfers: 10
             device time(us): total=62 max=7 min=6 avg=6
    42: update directive reached 1 time
        42: data copyin transfers: 1
             device time(us): total=6 max=6 min=6 avg=6
    45: update directive reached 10 times
        45: data copyin transfers: 10
             device time(us): total=50 max=5 min=5 avg=5
Hope this helps,
Mat

Peter85
Posts: 48
Joined: Mar 06 2018

Re: Correct manually deep copy of user defined data structure

Post by Peter85 » Fri May 15, 2020 1:23 am

mkcolg wrote:
Thu May 14, 2020 9:46 am
Hi Peter,

I typically use unstructured data regions which are inserted as part of the allocation and deallocation of the type. Then use update directives in between to synchronize the data. This way the device copy of the type has the same lifetime and scope as the host copy of the type.

Your example wasn't runnable (it segfaults on the host), so I fixed a few things to get it to work and also added how I would perform the deep copy.

Code: Select all

% cat libmain.f90
    ! Test driver: builds a KSP object (host and device copies are both set up
    ! inside init_KSP_CRS), then sums method%b on the device and prints the
    ! nested ndata/offset values from inside the OpenACC loop.
    !
    ! vec_size : number of elements to allocate and to loop over.
    !
    ! NOTE(review): this routine contains no "exit data" directives, so the
    ! device copies created in init_KSP_CRS are not released here.
    subroutine  solver(vec_size)
        use krylov
        implicit none
        integer, intent(in) :: vec_size
        type(KSP) :: method
        integer :: n, j
        real*8 :: bnorm

        n = vec_size
        bnorm = 0.0d0

        write(*,*)"Start testprogram"
        call init_KSP_CRS(method,n)
        ! method and its components are already on the device, so only assert
        ! presence; bnorm is summed across iterations via the reduction clause.
        !$acc parallel loop present(method) reduction(+:bnorm)
        do j=1,n
           bnorm = bnorm+method%b(j)
           write(*,*)method%comm_data%nrk_array(j)%ndata, method%comm_data%nrk_array(j)%offset(j)
        enddo

        write(*,*)"End testprogram", bnorm
    end subroutine solver

      ! Allocates the KSP object on the host and creates the matching device
      ! copies level by level (manual deep copy), then fills in the data and
      ! pushes it to the device with update directives.
      !
      ! method : KSP object whose allocatable components are allocated here.
      ! n      : length of b, r, x and of comm_data%nrk_array; element i of
      !          nrk_array gets an offset array of length i.
      subroutine init_KSP_CRS(method,n)
        use krylov
        implicit none
        type(KSP), intent(inout) :: method
        integer, intent(in) :: n
        integer :: i

        allocate(method%b(n))
        allocate(method%r(n))
        allocate(method%x(n))
        allocate(method%comm_data%nrk_array(n))
        ! Parent object first, then its direct allocatable components.
        ! x must be the whole section (:n) like b and r; x(n) would create
        ! only the last element on the device.
!$acc enter data create(method,method%b(:n),method%r(:n),method%x(:n), &
!$acc                   method%comm_data%nrk_array(:n))
        do i=1,n
          method%comm_data%nrk_array(i)%ndata = i
          ! The device element was only created, not copied, so push the scalar.
!$acc update device(method%comm_data%nrk_array(i)%ndata)
          allocate(method%comm_data%nrk_array(i)%offset(i))
          ! Innermost level: create this element's offset array on the device.
!$acc enter data create(method%comm_data%nrk_array(i)%offset(:i))
        enddo

!This can be someplace else in the program, i.e. when the data is assigned

        method%b=1
!$acc update device(method%b)
        do i=1,n
           method%comm_data%nrk_array(i)%offset = i
!$acc update device(method%comm_data%nrk_array(i)%offset)
        enddo

      end subroutine init_KSP_CRS
% pgfortran type.f90 libmain.f90 main.f90 -o cpu.out
type.f90:
libmain.f90:
main.f90:
% pgfortran type.f90 libmain.f90 main.f90 -ta=tesla -o gpu.out
type.f90:
libmain.f90:
main.f90:
% ./cpu.out
 Start testprogram
 Start testprogram
            1            1
            2            2
            3            3
            4            4
            5            5
            6            6
            7            7
            8            8
            9            9
           10           10
 End testprogram    10.00000000000000
 End testprogram
% setenv PGI_ACC_TIME 1
% ./gpu.out
 Start testprogram
 Start testprogram
            1            1
            2            2
            3            3
            4            4
            5            5
            6            6
            7            7
            8            8
            9            9
           10           10
 End testprogram    10.00000000000000
 End testprogram

Accelerator Kernel Timing data
libmain.f90
  solver  NVIDIA  devicenum=0
    time(us): 131
    12: compute region reached 1 time
        12: kernel launched 1 time
            grid: [1]  block: [128]
             device time(us): total=96 max=96 min=96 avg=96
            elapsed time(us): total=1,341 max=1,341 min=1,341 avg=1,341
        12: reduction kernel launched 1 time
            grid: [1]  block: [256]
             device time(us): total=4 max=4 min=4 avg=4
            elapsed time(us): total=29 max=29 min=29 avg=29
    12: data region reached 4 times
        12: data copyin transfers: 1
             device time(us): total=6 max=6 min=6 avg=6
        18: data copyout transfers: 1
             device time(us): total=25 max=25 min=25 avg=25
libmain.f90
  init_ksp_crs  NVIDIA  devicenum=0
    time(us): 217
    30: data region reached 1 time
        30: data copyin transfers: 4
             device time(us): total=40 max=16 min=6 avg=10
    34: update directive reached 10 times
        34: data copyin transfers: 10
             device time(us): total=59 max=9 min=5 avg=5
    36: data region reached 10 times
        36: data copyin transfers: 10
             device time(us): total=62 max=7 min=6 avg=6
    42: update directive reached 1 time
        42: data copyin transfers: 1
             device time(us): total=6 max=6 min=6 avg=6
    45: update directive reached 10 times
        45: data copyin transfers: 10
             device time(us): total=50 max=5 min=5 avg=5
Hope this helps,
Mat
Thank you for your answer! It worked!

Post Reply