mkcolg
Joined: 30 Jun 2004 Posts: 4996 Location: The Portland Group Inc.
Posted: Mon Aug 10, 2009 12:00 pm
Hi Matt,
I went ahead and wrote a small matmul program using MPI and accelerator regions. The main thing to note is that each process should set the device number to avoid too much contention (though processes can share a device).
Code:
#ifdef _ACCEL
   numdevices = acc_get_num_devices(acc_device_nvidia)
   mydevice = mod(myRank,numdevices)
   call acc_set_device_num(mydevice,acc_device_nvidia)
#endif
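One caveat: acc_get_num_devices returns 0 when no NVIDIA device is visible, and mod(myRank,0) isn't defined, so you may want a guard around the device selection. An untested sketch, using the same variables as above:
Code:
#ifdef _ACCEL
   numdevices = acc_get_num_devices(acc_device_nvidia)
   ! only select a device if one was actually found
   if (numdevices .gt. 0) then
      mydevice = mod(myRank,numdevices)
      call acc_set_device_num(mydevice,acc_device_nvidia)
   endif
#endif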
Here's the full source. Please let me know if you spot any bugs, as I haven't done MPI programming since grad school. ;-)
- Mat
Code:
% cat mpi_mm_acc.f90
! Simple matmul example
module mymm
contains
   subroutine mm_acc( a, b, c, m, start, end )
      implicit none
      real, dimension(:,:), intent(out) :: a
      real, dimension(:,:), intent(in) :: b, c
      integer m, start, end, i, j, k
!$acc region copyin(b(:m,:m), c(:m,:m)), &
!$acc& copyout(a(start:end,:m))
      do j = 1, m
         do i = start, end
            a(i,j) = 0.0
         enddo
         do i = start, end
            do k = 1, m
               a(i,j) = a(i,j) + b(i,k) * c(k,j)
            enddo
         enddo
      enddo
!$acc end region
   end subroutine
end module

PROGRAM mpi_matmul_acc
   use mymm
   use mpi
#ifdef _ACCEL
   use accel_lib
#endif
   IMPLICIT NONE
   INTEGER :: myRank, ierr, status(MPI_STATUS_SIZE)
   INTEGER :: count, dest, source, tag, size, icode
   INTEGER :: nn, chunk, start, end, a_chunk_type
   INTEGER :: i, j, n, mydevice, numdevices
   REAL, allocatable :: a(:,:)
   REAL, allocatable :: b(:,:), c(:,:)

   ! set up MPI
   CALL MPI_INIT(ierr)
   CALL MPI_COMM_RANK( MPI_COMM_WORLD, myRank, ierr )
   CALL MPI_COMM_SIZE( MPI_COMM_WORLD, size, ierr )

   ! have task 0 get the size of the array from the user
   IF (myRank.EQ.0) THEN
      print *, 'enter array size to run'
      read *, nn
   END IF

   ! send the size to the other processes
   call MPI_BCAST(nn,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)

   ! set the number of rows each process should work on
   chunk = nn / size

#ifdef _ACCEL
   ! set the device for each process
   numdevices = acc_get_num_devices(acc_device_nvidia)
   mydevice = mod(myRank,numdevices)
   call acc_set_device_num(mydevice,acc_device_nvidia)
#endif

   ! set up the vector type used to exchange one row of 'a'
   CALL MPI_TYPE_VECTOR(1, nn, nn, MPI_REAL, a_chunk_type, ierr)
   CALL MPI_TYPE_COMMIT(a_chunk_type,ierr)

   ! allocate our arrays
   allocate(a(nn,nn))
   allocate(b(nn,nn))
   allocate(c(nn,nn))
   tag = 99
   a = 0

   ! have task 0 initialize the b and c arrays with random numbers
   IF (myRank.EQ.0) THEN
      call RANDOM_NUMBER(b)
      call RANDOM_NUMBER(c)
   end if

   ! sync the b and c arrays
   call MPI_BCAST(b,nn*nn,MPI_REAL,0,MPI_COMM_WORLD,ierr)
   call MPI_BCAST(c,nn*nn,MPI_REAL,0,MPI_COMM_WORLD,ierr)

   ! determine the starting row
   start = (myRank*chunk)+1

   ! call the MM routine
   call mm_acc( a, b, c, nn, start, start+chunk-1 )

   ! have process 0 gather the results
   IF (size .gt. 1 .and. myRank.EQ.0) THEN
      do source=1,size-1
         start = (source*chunk)+1
         do i=start,start+chunk-1
            CALL MPI_RECV(a(i,:), 1, a_chunk_type, source, tag, &
                          MPI_COMM_WORLD, status, ierr)
         end do
      end do
   END IF

   ! have the other processes send their rows to process 0
   IF (size .gt. 1 .and. myRank.ne.0) THEN
      dest = 0
      do i=start,start+chunk-1
         CALL MPI_SEND(a(i,:), 1, a_chunk_type, dest, tag, &
                       MPI_COMM_WORLD, ierr)
      end do
   END IF

   ! have task 0 print a few of the values to make sure it got the results back
   IF (myRank.EQ.0) THEN
      do i=1,size
         PRINT*, "TASK #", myRank, " ", a((i*chunk)-1,34)
      end do
   end if

   ! clean up
   CALL MPI_TYPE_FREE(a_chunk_type,ierr)
   deallocate(a)
   deallocate(b)
   deallocate(c)
   CALL MPI_FINALIZE(ierr)
END
% mpif90 -Mpreprocess -o mmacc.out mpi_mm_acc.f90 -fastsse -Minfo=accel -V9.0-3 -ta=nvidia
mm_acc:
12, Generating copyout(a(start:end,:m))
Generating copyin(c(:m,:m))
Generating copyin(b(:m,:m))
15, Loop is parallelizable
16, Loop is parallelizable
Accelerator kernel generated
15, !$acc do parallel, vector(16)
16, !$acc do parallel, vector(16)
20, Loop is parallelizable
21, Complex loop carried dependence of a prevents parallelization
Loop carried dependence of a prevents parallelization
Loop carried backward dependence of a prevents vectorization
Inner sequential loop scheduled on accelerator
Accelerator kernel generated
15, !$acc do parallel, vector(16)
20, !$acc do parallel, vector(16)
Using register for 'a'
21, !$acc do seq
Cached references to size [16x16] block of 'b'
Cached references to size [16x16] block of 'c'
% mpirun -np 4 mmacc.out
enter array size to run
1024
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=16 grid=64x16 block=16x16
launch kernel file=mpi_mm_acc.f90 function=mm_acc line=21 grid=64x16 block=16x16
TASK # 0 264.2788
TASK # 0 267.8382
TASK # 0 268.5576
TASK # 0 263.5367
TheMatt
Joined: 06 Jul 2009 Posts: 263 Location: Greenbelt, MD
Posted: Tue Aug 11, 2009 8:32 am
mkcolg wrote:
I went ahead and wrote a small matmul program using MPI and accelerator regions. The main thing to note is that each process should set the device number to avoid too much contention (though processes can share a device).
Code:
#ifdef _ACCEL
   numdevices = acc_get_num_devices(acc_device_nvidia)
   mydevice = mod(myRank,numdevices)
   call acc_set_device_num(mydevice,acc_device_nvidia)
#endif
Thanks! This is exactly what I was hoping for: a "transparent" sort of hook. I'll be sure to let you know when I get a chance to try it out! (At the moment, I'm aiming to try one CPU and one GPU. Get that going first!)
One more (of many, to be sure) question: if you had 8 CPU processes using 4 GPUs, would you incur initialization penalties each time you cycled through the mod? That is, if rank 0 asks for GPU 0, then rank 4 does, and then rank 0 asks for it again, do you incur the reinitialization time again?
Matt
mkcolg
Joined: 30 Jun 2004 Posts: 4996 Location: The Portland Group Inc.
Posted: Tue Aug 11, 2009 8:57 am
Quote:
One more (of many, to be sure) question: if you had 8 CPU processes using 4 GPUs, would you incur initialization penalties each time you cycled through the mod? That is, if rank 0 asks for GPU 0, then rank 4 does, and then rank 0 asks for it again, do you incur the reinitialization time again?
I don't believe so. Once a device is initialized, it doesn't need to be initialized again, even if it was initialized by another program.
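If you'd rather pay the initialization cost up front instead of at each rank's first accelerator region, you can call acc_init from accel_lib once per rank right after selecting the device. An untested sketch:
Code:
#ifdef _ACCEL
   numdevices = acc_get_num_devices(acc_device_nvidia)
   mydevice = mod(myRank,numdevices)
   call acc_set_device_num(mydevice,acc_device_nvidia)
   ! initialize the selected device now, once per rank, rather
   ! than paying the cost at the first accelerator region
   call acc_init(acc_device_nvidia)
#endif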
- Mat