PGI User Forum
 SearchSearch   MemberlistMemberlist     RegisterRegister   ProfileProfile    Log inLog in 

CUDA-x86.

Output incorrect when using accelerator

 
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming
View previous topic :: View next topic  
Author Message
Eno_Khem



Joined: 17 Aug 2010
Posts: 3

PostPosted: Fri Aug 27, 2010 2:45 am    Post subject: Output incorrect when using accelerator Reply with quote

I'm using VS2010 and PGI Visual Fortran 10.8

Current working old code is

Code:
      SUBROUTINE NLMMT(TH,IFN,JFN,Z,M,N,DZ,DM,DN,HZ,HM,HN,RX,DT,FM)
C
      PARAMETER   (GX=1.0E-5, GG=9.81)
C
      INTEGER   IFN, JFN
      REAL      TH, Z(IFN,JFN,2), M(IFN,JFN,2), N(IFN,JFN,2)
      REAL      DZ(IFN,JFN,2), DM(IFN,JFN,2), DN(IFN,JFN,2)
      REAL      HZ(IFN,JFN), HM(IFN,JFN), HN(IFN,JFN), RX, DT, FM
   REAL      DM1, DM2, DN1, DN2, FN
C
      DO 10 K=1,2
        DO 10 J=1,JFN
          DO 10 I=1,IFN
            DM(I,J,K)=0.0
            DN(I,J,K)=0.0
   10 CONTINUE   
C
      DO J=1, JFN
         DO I=1, IFN   
         IF (I .LT. IFN) THEN
               DM1 = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I+1,J,1)+Z(I+1,J,2))
     &             + 0.5*(HZ(I,J)+HZ(I+1,J))
               DM2 = 0.5*(Z(I,J,2)+Z(I+1,J,2)+HZ(I,J)+HZ(I+1,J))
            ELSE
               DM1 = Z(I,J,2) + HZ(I,J)
               DM2 = Z(I,J,2) + HZ(I,J)
            END IF
            IF (DM1 .GE. GX) DM(I,J,1) = DM1
            IF (DM2 .GE. GX) DM(I,J,2) = DM2
C
            IF (J .LT. JFN) THEN
               DN1 = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I,J+1,1)+Z(I,J+1,2))
     &             + 0.5*(HZ(I,J)+HZ(I,J+1))
               DN2 = 0.5*(Z(I,J,2)+Z(I,J+1,2)+HZ(I,J)+HZ(I,J+1))
            ELSE
               DN1 = Z(I,J,2)+HZ(I,J)
               DN2 = Z(I,J,2)+HZ(I,J)
            END IF
            IF (DN1 .GE. GX) DN(I,J,1) = DN1
            IF (DN2 .GE. GX) DN(I,J,2) = DN2
         END DO
      END DO 

      FN=0.5*DT*GG*FM**2
C
C   ------- X-DIRECTION -------
C
      DO J=1, JFN
         DO I=1, IFN
            IF (HZ(I,J).GT.TH .AND. HM(I,J).GT.TH) THEN
               CALL XMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DM,
     &                                      M, N, RX, FN)
            END IF
C
C   ------- Y-DIRECTION -------
C
            IF (HZ(I,J).GT.TH .AND. HN(I,J).GT.TH) THEN
               CALL YMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DN,
     &                                      M, N, RX, FN)
            END IF
         END DO
      END DO

      RETURN
      END


This is the code after modification, but it doesn't work correctly. Although some local variables are changed to arrays, it works correctly without !$acc region.

Code:
      SUBROUTINE NLMMT(TH,IFN,JFN,Z,M,N,DZ,DM,DN,HZ,HM,HN,RX,DT,FM)
C
      PARAMETER   (GX=1.0E-5, GG=9.81)
C
      INTEGER   IFN, JFN
      REAL      TH, Z(IFN,JFN,2), M(IFN,JFN,2), N(IFN,JFN,2)
      REAL      DZ(IFN,JFN,2), DM(IFN,JFN,2), DN(IFN,JFN,2)
      REAL      HZ(IFN,JFN), HM(IFN,JFN), HN(IFN,JFN), RX, DT, FM
   REAL      DM1(IFN,JFN), DM2(IFN,JFN), DN1(IFN,JFN), DN2(IFN,JFN), FN
C
!$acc region
      DO 10 K=1,2
        DO 10 J=1,JFN
          DO 10 I=1,IFN
            DM(I,J,K)=0.0
            DN(I,J,K)=0.0
   10 CONTINUE   
C
      DO J=1, JFN
         DO I=1, IFN   
         IF (I .LT. IFN) THEN
               DM1(I,J) = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I+1,J,1)+Z(I+1,J,2))
     &             + 0.5*(HZ(I,J)+HZ(I+1,J))
               DM2(I,J) = 0.5*(Z(I,J,2)+Z(I+1,J,2)+HZ(I,J)+HZ(I+1,J))
            ELSE
               DM1(I,J) = Z(I,J,2) + HZ(I,J)
               DM2(I,J) = Z(I,J,2) + HZ(I,J)
            END IF
C
            IF (J .LT. JFN) THEN
               DN1(I,J) = 0.25*(Z(I,J,1)+Z(I,J,2)+Z(I,J+1,1)+Z(I,J+1,2))
     &             + 0.5*(HZ(I,J)+HZ(I,J+1))
               DN2(I,J) = 0.5*(Z(I,J,2)+Z(I,J+1,2)+HZ(I,J)+HZ(I,J+1))
            ELSE
               DN1(I,J) = Z(I,J,2)+HZ(I,J)
               DN2(I,J) = Z(I,J,2)+HZ(I,J)
            END IF
         END DO
      END DO
      DO J=1, JFN
         DO I=1, IFN   
           IF (DM1(I,J) .GE. GX) DM(I,J,1) = DM1(I,J)
            IF (DM2(I,J) .GE. GX) DM(I,J,2) = DM2(I,J)
            IF (DN1(I,J) .GE. GX) DN(I,J,1) = DN1(I,J)
            IF (DN2(I,J) .GE. GX) DN(I,J,2) = DN2(I,J)
      END DO
     END DO
!$acc end region

      FN=0.5*DT*GG*FM**2
C
C   ------- X-DIRECTION -------
C
      DO J=1, JFN
         DO I=1, IFN
            IF (HZ(I,J).GT.TH .AND. HM(I,J).GT.TH) THEN
               CALL XMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DM,
     &                                      M, N, RX, FN)
            END IF
C
C   ------- Y-DIRECTION -------
C
            IF (HZ(I,J).GT.TH .AND. HN(I,J).GT.TH) THEN
               CALL YMMT(GG, I, J, IFN, JFN, HZ, Z, DZ, DN,
     &                                      M, N, RX, FN)
            END IF
         END DO
      END DO

      RETURN
      END


The size of arrays M,N,Z are all 2293416x2293416x2
The main program call this function on only parts of the M,N,Z as around IFN=721 and JFN=841 per call


Last edited by Eno_Khem on Sun Aug 29, 2010 12:09 am; edited 1 time in total
Back to top
View user's profile
mkcolg



Joined: 30 Jun 2004
Posts: 5952
Location: The Portland Group Inc.

PostPosted: Fri Aug 27, 2010 11:42 am    Post subject: Reply with quote

Hi Eno_Khem,

Can you post a reproduce able example which shows the problem?

The code compiles fine and does create the several device kernels. Though, I did need add "-Mextend" since one line is longer than 72 columns in length.

- Mat
Back to top
View user's profile
Eno_Khem



Joined: 17 Aug 2010
Posts: 3

PostPosted: Wed Sep 01, 2010 11:07 am    Post subject: Reply with quote

well, another example is...
Code:
      SUBROUTINE INTERQT (IFN,JFN,KK,NT,M,N,M_T,N_T)
C
      INTEGER IFN,JFN,KK,NT
      REAL M(IFN,JFN,2),N(IFN,JFN,2)
      REAL M_T(IFN,JFN,2),N_T(IFN,JFN,2)
C
!$acc region
      DO 10 J=1,JFN
      DO 10 I=1,IFN
         M_T(I,J,1)=M(I,J,1)
         N_T(I,J,1)=N(I,J,1)

         M_T(I,J,2)=M(I,J,1)+(M(I,J,2)-M(I,J,1))*KK/NT
         N_T(I,J,2)=N(I,J,1)+(N(I,J,2)-N(I,J,1))*KK/NT
   10 CONTINUE
!$acc end region
      RETURN
      END

and
Code:
      SUBROUTINE INTERQT (IFN,JFN,KK,NT,M,N,M_T,N_T)
C
      INTEGER IFN,JFN,KK,NT
      REAL M(IFN,JFN,2),N(IFN,JFN,2)
      REAL M_T(IFN,JFN,2),N_T(IFN,JFN,2)
C
!$acc region
      DO 10 J=1,JFN
      DO 10 I=1,IFN
         M_T(I,J,1)=M(I,J,1)
         N_T(I,J,1)=N(I,J,1)
   10 CONTINUE
     DO 20 J=1,JFN
      DO 20 I=1,IFN
         M_T(I,J,2)=M(I,J,1)+(M(I,J,2)-M(I,J,1))*KK/NT
         N_T(I,J,2)=N(I,J,1)+(N(I,J,2)-N(I,J,1))*KK/NT
   20 CONTINUE
!$acc end region
      RETURN
      END

I think these 2 should have same output but they're not
the 1st subroutine's output is wrong (ONLY when using accelerator)

1st subroutine's compiler message
Code:
interqt:
   2376, Generating copyout(n_t(1:ifn,1:jfn,1:2))
         Generating copyin(n(1:ifn,1:jfn,1:2))
         Generating copyout(m_t(1:ifn,1:jfn,1:2))
         Generating copyin(m(1:ifn,1:jfn,1:2))
         Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
   2377, Loop is parallelizable
   2378, Loop is parallelizable
         Accelerator kernel generated
       2377, !$acc do parallel, vector(16)
       2378, !$acc do parallel, vector(16)
             Cached references to size [16x16x2] block of 'n'
             Cached references to size [16x16x2] block of 'm'
             CC 1.0 : 20 registers; 4120 shared, 92 constant, 0 local memory bytes; 33 occupancy
             CC 1.3 : 20 registers; 4120 shared, 92 constant, 0 local memory bytes; 75 occupancy

2nd subroutine's compiler message
Code:
interqt:
   2376, Generating copyin(m(1:ifn,1:jfn,1:2))
         Generating copyout(m_t(1:ifn,1:jfn,1:2))
         Generating copyin(n(1:ifn,1:jfn,1:2))
         Generating copyout(n_t(1:ifn,1:jfn,1:2))
         Generating compute capability 1.0 binary
         Generating compute capability 1.3 binary
   2377, Loop is parallelizable
   2378, Loop is parallelizable
         Accelerator kernel generated
       2377, !$acc do parallel, vector(16)
       2378, !$acc do parallel, vector(16)
             CC 1.0 : 6 registers; 24 shared, 80 constant, 0 local memory bytes; 100 occupancy
             CC 1.3 : 6 registers; 24 shared, 80 constant, 0 local memory bytes; 100 occupancy
   2382, Loop is parallelizable
   2383, Loop is parallelizable
         Accelerator kernel generated
       2382, !$acc do parallel, vector(16)
       2383, !$acc do parallel, vector(16)
             CC 1.0 : 18 registers; 24 shared, 124 constant, 0 local memory bytes; 33 occupancy
             CC 1.3 : 18 registers; 24 shared, 124 constant, 0 local memory bytes; 75 occupancy
Back to top
View user's profile
Eno_Khem



Joined: 17 Aug 2010
Posts: 3

PostPosted: Sat Sep 18, 2010 8:15 am    Post subject: Reply with quote

well I found that I have to set private clase for variables
Code:
!$acc region local(I,J,K)
!$acc do private(J)
      DO 10 K=1,2
!$acc do private(I)
        DO 10 J=1,JFN
          DO 10 I=1,IFN
            DM(I,J,K)=0.0
            DN(I,J,K)=0.0
   10 CONTINUE   
C
!$acc do private(I)
      DO J=1, JFN
!$acc do private(DM1,DM2,DN1,DN2)
         DO I=1, IFN
             ...
Back to top
View user's profile
Display posts from previous:   
Post new topic   Reply to topic    PGI User Forum Forum Index -> Accelerator Programming All times are GMT - 7 Hours
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © phpBB Group