SUBROUTINE JACOBI(ANS,RHS,NN,MM,P,Q, ME,ME_P,ME_Q, TOL)
      IMPLICIT NONE
      INTEGER NN,MM,P,Q, ME,ME_P,ME_Q
      REAL    ANS(0:NN+1,0:MM+1)[0:P-1,0:*], &
              RHS(0:NN+1,0:MM+1)[0:P-1,0:*]
      REAL    TOL
!
!     Solve Helmholtz's equation using Jacobi iteration.
!
!     See: http://www.netlib.org/linalg/html_templates/Templates.html
!          R. Barrett et. al. (1994).  Templates for the solution of 
!          Linear Systems: Building Blocks for Iterative Methods, 2nd
!          Edition.  SIAM.  Philadelphia, PA.
!
!     Equation solved is the 5-point stencil:
!           -1
!        -1  6 -1
!           -1
!     over a doubly periodic region.
!
!     SPMD domain decomposition - each image "owns" a (1:NN,1:MM) piece
!     of the (1:NN*P,1:MM*Q) array which is therefore distributed across
!     all images.  A halo is added around (1:NN,1:MM), so computations
!     can always use local arrays.
!
      INTEGER I,ITERATIONS,J,ME_PM,ME_PP,ME_QM,ME_QP,NIEGHBORS(5)
      REAL    PMAXI,RESID_MAX
      REAL    WRK(1:NN,1:MM)
      REAL, SAVE :: PMAX[*]

      ME_QP = MOD(ME_Q+1+Q,Q)  ! north
      ME_QM = MOD(ME_Q-1+Q,Q)  ! south
      ME_PP = MOD(ME_P+1+P,P)  ! east
      ME_PM = MOD(ME_P-1+P,P)  ! west
      IF (MIN(P,Q) >= 3) THEN  ! list of nieghbouring images
        NIEGHBORS = (/ ME, ME_P  + 1 + P*ME_QM, ME_P  + 1 + P*ME_QP, &
                           ME_PM + 1 + P*ME_Q,  ME_PP + 1 + P*ME_Q   /)
      ENDIF

      DO ITERATIONS= 1,999
!
!       update halo.
!
        IF (MIN(P,Q) >= 3) THEN
          CALL SYNC_ALL( WAIT=NIEGHBORS )
        ELSE
          CALL SYNC_ALL()
        ENDIF  ! nieghbouring images have ANS(1:NN,1:MM) up to date
        ANS(1:NN,MM+1) = ANS(1:NN,1   )[ME_P, ME_QP]  ! north
        ANS(1:NN,   0) = ANS(1:NN,  MM)[ME_P, ME_QM]  ! south
        ANS(NN+1,1:MM) = ANS(1,   1:MM)[ME_PP,ME_Q ]  ! east
        ANS(   0,1:MM) = ANS(  NN,1:MM)[ME_PM,ME_Q ]  ! west
!
!       5-point stencil is correct everywhere, since halo is up to date.
!
        DO J= 1,MM
          DO I= 1,NN
            WRK(I,J) = (1.0/6.0) * (RHS(I,  J  ) + &
                                    ANS(I-1,J  ) + &
                                    ANS(I+1,J  ) + &
                                    ANS(I,  J-1) + &
                                    ANS(I,  J+1)   )
          ENDDO
        ENDDO
!
!       calculate global maximum residual error.
!
        PMAX = MAXVAL( ABS( WRK(1:NN,1:MM) - ANS(1:NN,1:MM) ) )
        CALL SYNC_ALL()  ! protects both PMAX and ANS
        IF (ME == 1) THEN
          DO I= 2,NUM_IMAGES()
            PMAXI = PMAX[I]
            PMAX  = MAX( PMAX, PMAXI )
          ENDDO
        ENDIF
        CALL SYNC_ALL( WAIT=(/1/) )  ! protects PMAX[1]
        RESID_MAX = PMAX[1]
!
!       update the result, note that above SYNC_ALL() guarentees that
!       the old ANS(1:NN,1:MM) is no longer needed for halo update.
!
        ANS(1:NN,1:MM) = WRK(1:NN,1:MM)
!
!       exit if converged.
!
        IF (RESID_MAX <= TOL) THEN
          EXIT
        ENDIF
      ENDDO
      IF (ME == 1) THEN
        WRITE(6,"('After',I4,' iterations, maximum residual is',F12.8)") &
           ITERATIONS,PMAX
        CALL SYNC_FILE(6)
      ENDIF
      RETURN
      END SUBROUTINE JACOBI

      PROGRAM TEST_JACOBI
      IMPLICIT NONE
!
! Example implementation of Jacobi iterative method for Helmholtz's equation
! using SPMD 2-D domain decomposition and Co-Array Fortran.  Note that
! Jacobi has very poor convergence properties, but it is scalable and easy
! to understand and program.  Use a better scheme, such as Red-Black SOR or 
! Preconditioned Conjugate Gradients, for solving real problems.
!
! See: http://www.netlib.org/linalg/html_templates/Templates.html
!      R. Barrett et. al. (1994).  Templates for the solution of 
!      Linear Systems: Building Blocks for Iterative Methods, 2nd
!      Edition.  SIAM.  Philadelphia, PA.
!
! Equation solved is the 5-point stencil:
!       -1
!    -1  6 -1
!       -1
! over a doubly periodic region with a point source right hand side.
!
! Input: N,M,P,Q
!
!        N = 1st dimension of entire rectangular grid
!        M = 2nd dimension of entire rectangular grid
!        P = number of nodes in 1st dimension, N/P an integer
!        Q = number of nodes in 2nd dimension, M/Q an integer
!        Total number of nodes = P*Q, must be NUM_IMAGES()
!
! Correctness is confirmed by printing the solution for two locations
! of the point source on the right hand side (solutions should be
! identical after a shift).
!
! Alan. J. Wallcraft, Naval Research Laboratory, October 1998.
!
      INTERFACE  ! needed because jacobi has co-array dummy arguments
         SUBROUTINE JACOBI(ANS,RHS,NN,MM,P,Q, ME,ME_P,ME_Q, TOL)
         INTEGER NN,MM,P,Q, ME,ME_P,ME_Q
         REAL    ANS(0:NN+1,0:MM+1)[0:P-1,0:*], &
                 RHS(0:NN+1,0:MM+1)[0:P-1,0:*]
         REAL    TOL
         END SUBROUTINE JACOBI
      END INTERFACE

      INTEGER :: N,M,P,Q
      INTEGER :: I1,II1,IP,IQ,J1,JJ1,ME,ME_P,ME_Q,MM,NN

      INTEGER, SAVE     :: IN(4)[*]
      REAL, ALLOCATABLE :: ANS(:,:)[:,:],RHS(:,:)[:,:]
      REAL, ALLOCATABLE :: GLOBAL_ANS(:,:)

      ME = THIS_IMAGE()
!
!     input.
!
      IF (ME == 1) THEN
        READ(5,*) IN(1:4)
      ENDIF
      CALL SYNC_ALL( WAIT=(/1/) )
      N = IN(1)[1]
      M = IN(2)[1]
      P = IN(3)[1]
      Q = IN(4)[1]
      IF ((N/P)*P /= N .OR. (M/Q)*Q /= M .OR. P*Q /= NUM_IMAGES()) THEN
        IF (ME == 1) THEN
          WRITE(6,*) 'BAD INPUT VALUES'
          CALL SYNC_FILE(6)
        ENDIF
        CALL SYNC_ALL()
        STOP
      ENDIF
      NN   = N/P
      MM   = M/Q
      ME_Q = (ME-1)/P         ! ANS(1:NN,1:MM)[ME_P,ME_Q] holds
      ME_P = (ME-1) - ME_Q*P  ! the local piece of the global array
      CALL SYNC_ALL()
!
!     allocate arrays and co-arrays.
!
      IF (ME == 1) THEN
        ALLOCATE(GLOBAL_ANS(N,M))
      ENDIF
      ALLOCATE(ANS(0:NN+1,0:MM+1)[0:P-1,0:*])  ! include a halo
      ALLOCATE(RHS(0:NN+1,0:MM+1)[0:P-1,0:*])  ! around (1:NN,1:MM)
!
!     point source in center of global array.
!
      I1  = N/2
      J1  = M/2
      II1 = I1 - ME_P*NN
      JJ1 = J1 - ME_Q*MM
      RHS(:,:) = 0.0  ! rsh is mostly zero
      IF (II1 >= 1 .AND. II1 <= NN .AND. &
          JJ1 >= 1 .AND. JJ1 <= MM       ) THEN
        RHS(II1,JJ1) = 1.0  ! with single non-zero element
      ENDIF
      ANS(:,:) = 0.0  ! 1st guess answer is zero
      CALL JACOBI(ANS,RHS,NN,MM,P,Q, ME,ME_P,ME_Q, 1.E-6)
      CALL SYNC_ALL()
      IF (ME == 1) THEN
        DO IQ= 0,Q-1
          DO IP= 0,P-1
            GLOBAL_ANS(IP*NN+1:IP*NN+NN,IQ*MM+1:IQ*MM+MM) = &
                     ANS(1:NN,1:MM)[IP,IQ]
          ENDDO
        ENDDO
        WRITE(6,*) 'Point source in center of global array'
        WRITE(6,"('ANS.I = ',9F8.5)") &
           GLOBAL_ANS(I1,MOD(J1-1+M-M/8,M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M-5,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M-3,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M-1,  M)+1), &
           GLOBAL_ANS(I1,J1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+1,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+3,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+5,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+M/8,M)+1)
        WRITE(6,"('ANS.J = ',9F8.5)") &
           GLOBAL_ANS(MOD(I1-1+N-N/8,N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N-5,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N-3,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N-1,  N)+1,J1), &
           GLOBAL_ANS(I1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+1,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+3,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+5,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+N/8,N)+1,J1)
        CALL SYNC_FILE(6)
      ENDIF
      CALL SYNC_ALL( WAIT=(/1/) )
!
!     point source at first point of global array.
!
      I1  = 1
      J1  = 1 
      II1 = I1 - ME_P*NN
      JJ1 = J1 - ME_Q*MM
      RHS(:,:) = 0.0  ! rsh is mostly zero
      IF (II1 >= 1 .AND. II1 <= NN .AND. &
          JJ1 >= 1 .AND. JJ1 <= MM       ) THEN
        RHS(II1,JJ1) = 1.0  ! with single non-zero element
      ENDIF
      ANS(:,:) = 0.0  ! 1st guess answer is zero
      CALL JACOBI(ANS,RHS,NN,MM,P,Q, ME,ME_P,ME_Q, 1.E-6)
      CALL SYNC_ALL()
      IF (ME == 1) THEN
        DO IQ= 0,Q-1
          DO IP= 0,P-1
            GLOBAL_ANS(IP*NN+1:IP*NN+NN,IQ*MM+1:IQ*MM+MM) = &
                     ANS(1:NN,1:MM)[IP,IQ]
          ENDDO
        ENDDO
        WRITE(6,*) 'Point source at first point of global array'
        WRITE(6,"('ANS.I = ',9F8.5)") &
           GLOBAL_ANS(I1,MOD(J1-1+M-M/8,M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M-5,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M-3,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M-1,  M)+1), &
           GLOBAL_ANS(I1,J1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+1,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+3,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+5,  M)+1), &
           GLOBAL_ANS(I1,MOD(J1-1+M+M/8,M)+1)
        WRITE(6,"('ANS.J = ',9F8.5)") &
           GLOBAL_ANS(MOD(I1-1+N-N/8,N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N-5,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N-3,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N-1,  N)+1,J1), &
           GLOBAL_ANS(I1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+1,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+3,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+5,  N)+1,J1), &
           GLOBAL_ANS(MOD(I1-1+N+N/8,N)+1,J1)
        CALL SYNC_FILE(6)
      ENDIF
      CALL SYNC_ALL( WAIT=(/1/) )
      END PROGRAM TEST_JACOBI