C HPF 2D Jacobi Kernel
C Generated Code


      program jacobi
        integer n, m
        parameter (n = 1024)
CHPF$   processors p (4, 8) 
CHPF$   template t (n, n) 
CHPF$   align a (i, j) with t(i, j)
CHPF$   align b (i, j) with t(i, j)
CHPF$   distribute t(block, block) onto p
C       --------------------------------------------------------------
C       declarations for heap-based runtime dynamic storage allocation
C       --------------------------------------------------------------
        common /hpf$heap$common/ hpf$heap
        dimension hpf$heap$integer(0:0)
        equivalence (hpf$heap$integer(0), hpf$heap)
        dimension hpf$heap$double(0:0)
        equivalence (hpf$heap$double(0), hpf$heap)
#include <mpif.h>
        integer status(MPI_STATUS_SIZE)
C       -----------------------------------------------
C       declarations for compiler-generated temporaries
C       -----------------------------------------------
        external hpf_arrayrtd_get_lextent, hpf_arrayrtd_get_gextent
        save a, b
        logical p$wrap
        integer counter$b$24, send$buf$b$24$index, i1, i2, p$q1, p$q2
        integer counter$b$25, send$buf$b$25$index, counter$b$26
        integer send$buf$b$26$index, counter$b$27
        integer send$buf$b$27$index, recv$buf$b$24$index
        integer recv$buf$b$25$index, recv$buf$b$26$index
        integer recv$buf$b$27$index, a$coord$0, a$coord$1
        integer hpf_arrayrtd_get_lextent, hpf_arrayrtd_get_gextent
        integer a$local$extent$0, a$global$extent$0, b$coord$0
        integer b$coord$1, b$local$extent$0, b$global$extent$0
        integer p$myid1, p$myid2, hpf$heap$integer, p$cmap, p$dims
        integer sendproc, recvproc, myid, ierr, request, j, i
        integer*4 a$data, a$coord, b$data, b$coord, p$coord
        real*8 hpf$heap$double, lnltmp1, hpf_nonlocal_lookupd
        real*8 lnltmp2, lnltmp3, lnltmp4, lnltmp5, lnltmp6, lnltmp7
        real*8 lnltmp8, lnltmp9, lnltmp10, lnltmp11, lnltmp12
        real*8 lnltmp13, lnltmp14, lnltmp15, lnltmp16, lnltmp17
        real*8 lnltmp18, lnltmp19, lnltmp20, lnltmp21, lnltmp22
        real*8 lnltmp23, lnltmp24, a, b, hpf$heap
        integer*4 hash$nonlocals, send$buf$b$24, send$buf$b$25
        integer*4 send$buf$b$26, send$buf$b$27, recv$buf$b$24
        integer*4 recv$buf$b$25, recv$buf$b$26, recv$buf$b$27, a$align
        integer*4 a$dist, a$tmpl, a$proc, a$align$new, a$dist$new
        integer*4 a$tmpl$new, a$proc$new, a$desc, b$align, b$dist
        integer*4 b$tmpl, b$proc, b$align$new, b$dist$new, b$tmpl$new
        integer*4 b$proc$new, b$desc, t$template, t$proc, t$dist
        integer*4 p$processors
        dimension a(0:0), b(0:0), p$dims(1:2), p$wrap(1:2)
C       
C       
C       -----------------------------
C       runtime system initialization
C       -----------------------------
        call mpi_init(ierr)
        call mpi_comm_rank(MPI_COMM_WORLD, myid, ierr)
C       -----------------------------------------------
C       initializations for run-time descriptor indices
C       -----------------------------------------------
        a$align$new = 0
        a$align = 0
        b$align$new = 0
        b$align = 0
        t$dist = 0
        a$dist$new = 0
        a$dist = 0
        b$dist$new = 0
        b$dist = 0
        a$desc = 0
        b$desc = 0
C       
C       
C       -------------------------------------
C       building array descriptor for array a
C       -------------------------------------
        call hpf_arrayrtd_alloc(2, 2, a$desc)
        call hpf_arrayrtd_setdim(a$desc, 0, 1, 1024)
        call hpf_arrayrtd_setdim(a$desc, 1, 1, 1024)
C       
C       -------------------------------------
C       building array descriptor for array b
C       -------------------------------------
        call hpf_arrayrtd_alloc(2, 2, b$desc)
        call hpf_arrayrtd_setdim(b$desc, 0, 1, 1024)
        call hpf_arrayrtd_setdim(b$desc, 1, 1, 1024)
C       
C       ------------------------------------
C       building tmpl descriptor: template t
C       ------------------------------------
        call hpf_tmplrtd_alloc(2, t$template)
        call hpf_tmplrtd_setdim(t$template, 0, 1, 1024)
        call hpf_tmplrtd_setdim(t$template, 1, 1, 1024)
C       
C       ---------------------------------------
C       building procs descriptor: processors p
C       ---------------------------------------
        call hpf_procrtd_alloc(2, p$processors)
        call hpf_procrtd_setdim(p$processors, 0, 1, 4)
        call hpf_procrtd_setdim(p$processors, 1, 1, 8)
C       
C       ----------------------------------------------------
C       initialize processor topology for processors array p
C       ----------------------------------------------------
        p$dims(1) = 4
        p$wrap(1) = .false.
        p$dims(2) = 8
        p$wrap(2) = .false.
        call mpi_cart_create(MPI_COMM_WORLD, 2, p$dims, p$wrap, .false.,
     * p$cmap, ierr)
        call hpf_procrtd_get_coords(hpf$heap, p$processors, p$coord)
        call mpi_cart_coords(p$cmap, myid, 2, hpf$heap$integer(p$coord),
     * ierr)
        p$myid1 = hpf$heap$integer(p$coord + 0)
        p$myid2 = hpf$heap$integer(p$coord + 1)
        call hpf_procrtd_set_chandle(p$processors, p$cmap)
C       
C       -----------------------------------------------
C       building dist descriptor: distribute template t
C       -----------------------------------------------
        call hpf_distrtd_alloc(2, t$dist)
        call hpf_distrtd_setdim(t$dist, 0, -1, 256, 0)
        call hpf_distrtd_setdim(t$dist, 1, -1, 128, 1)
        t$proc = p$processors
C       
C       --------------------------------------------
C       building dist descriptor: distribute array a
C       --------------------------------------------
        call hpf_distrtd_clone(t$dist, a$dist$new)
        a$proc$new = t$proc
C       
C       --------------------------------------------
C       building dist descriptor: distribute array b
C       --------------------------------------------
        call hpf_distrtd_clone(t$dist, b$dist$new)
        b$proc$new = t$proc
C       
C       -----------------------------------------
C       building align descriptor: align b with t
C       -----------------------------------------
        call hpf_alignrtd_alloc(2, 2, b$align$new)
        call hpf_alignrtd_setdim_src(b$align$new, 0, 0)
        call hpf_alignrtd_setdim_src(b$align$new, 1, 1)
        call hpf_alignrtd_setdim_tmpl(b$align$new, 0, 0, 1, 0, 0)
        call hpf_alignrtd_setdim_tmpl(b$align$new, 1, 1, 1, 0, 0)
C       
C       ------------------------------------------------------------
C       establish template and processor association: align b with t
C       ------------------------------------------------------------
        b$tmpl$new = t$template
        b$proc$new = t$proc
C       
C       -----------------------------------------
C       building align descriptor: align a with t
C       -----------------------------------------
        call hpf_alignrtd_alloc(2, 2, a$align$new)
        call hpf_alignrtd_setdim_src(a$align$new, 0, 0)
        call hpf_alignrtd_setdim_src(a$align$new, 1, 1)
        call hpf_alignrtd_setdim_tmpl(a$align$new, 0, 0, 1, 0, 0)
        call hpf_alignrtd_setdim_tmpl(a$align$new, 1, 1, 1, 0, 0)
C       
C       ------------------------------------------------------------
C       establish template and processor association: align a with t
C       ------------------------------------------------------------
        a$tmpl$new = t$template
        a$proc$new = t$proc
C       
C       ------------------------------------------------------------------------
C       allocate or redistribute array a; compute array-indexed processor coords
C       ------------------------------------------------------------------------
        call hpf_array_remap(a, a$align, a$dist, a$tmpl, a$proc, a$align
     *$new, a$dist$new, a$tmpl$new, a$proc$new, a$desc, a$data)
C       
C       -----------------------------------------------------------------------
C       map array-indexed processor coordinates to partitioned array dimensions
C       -----------------------------------------------------------------------
        call hpf_arrayrtd_get_coords(hpf$heap, a$desc, a$coord)
        a$coord$0 = hpf$heap$integer(a$coord + 0)
        a$coord$1 = hpf$heap$integer(a$coord + 1)
C       
C       -----------------------------------------------------------------
C       initializing scalar extent vars used for linearization of array a
C       -----------------------------------------------------------------
        a$local$extent$0 = hpf_arrayrtd_get_lextent(a$desc, 0)
        a$global$extent$0 = hpf_arrayrtd_get_gextent(a$desc, 0)
C       
C       ------------------------------------------------------------------------
C       allocate or redistribute array b; compute array-indexed processor coords
C       ------------------------------------------------------------------------
        call hpf_array_remap(b, b$align, b$dist, b$tmpl, b$proc, b$align
     *$new, b$dist$new, b$tmpl$new, b$proc$new, b$desc, b$data)
C       
C       -----------------------------------------------------------------------
C       map array-indexed processor coordinates to partitioned array dimensions
C       -----------------------------------------------------------------------
        call hpf_arrayrtd_get_coords(hpf$heap, b$desc, b$coord)
        b$coord$0 = hpf$heap$integer(b$coord + 0)
        b$coord$1 = hpf$heap$integer(b$coord + 1)
C       
C       -----------------------------------------------------------------
C       initializing scalar extent vars used for linearization of array b
C       -----------------------------------------------------------------
        b$local$extent$0 = hpf_arrayrtd_get_lextent(b$desc, 0)
        b$global$extent$0 = hpf_arrayrtd_get_gextent(b$desc, 0)
C       
        call hpf_nonlocals_alloc(hash$nonlocals)
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$24 = 0
              if (p$q2 .eq. p$myid2 .and. max(256 * p$myid1 + 1, 256 * p
     *$q1) .le. min(256 * p$myid1 + 256, 256 * p$q1 + 255, 1022) .and. m
     *ax(128 * p$myid2 + 1, 2) .le. min(128 * p$myid2 + 128, 1023)) then
                counter$b$24 = counter$b$24 + (min(256 * p$myid1 + 256, 
     *256 * p$q1 + 255, 1022) - max(256 * p$myid1 + 1, 256 * p$q1) + 1) 
     ** (min(128 * p$myid2 + 128, 1023) - max(128 * p$myid2 + 1, 2) + 1)
              endif
              call hpf_buffer_alloc(counter$b$24 * 8, send$buf$b$24)
              call hpf_ptr_to_index(hpf$heap, send$buf$b$24, 8, send$buf
     *$b$24$index)
C             --< Pack Loop For Send For Nonlocal Read >--
              counter$b$24 = 0
C             
C             Loop section ---[
C               max(((128 * p$myid2) + 1), 2) <= i2 <=
C               min(((128 * p$myid2) + 128), 1023),
C               max(((256 * p$myid1) + 1), (256 * p$q1)) <= i1 <=
C               min(((256 * p$myid1) + 256), ((256 * p$q1) + 255), 1022) ]---
C             
              if (p$q2 .eq. p$myid2) then
                do i1 = max(256 * p$myid1 + 1, 256 * p$q1), min(256 * p$
     *myid1 + 256, 256 * p$q1 + 255, 1022)
                  do i2 = max(128 * p$myid2 + 1, 2), min(128 * p$myid2 +
     * 128, 1023)
                    hpf$heap$double(send$buf$b$24$index + counter$b$24) 
     *= b(b$data + i1 - (b$coord$0 * 256 + 1) + (i2 - (b$coord$1 * 128 +
     * 1)) * b$local$extent$0)
                    counter$b$24 = counter$b$24 + 1
                  enddo
                enddo
              endif
              if (counter$b$24 .gt. 0) then
                call mpi_send(hpf$heap$double(send$buf$b$24$index), coun
     *ter$b$24, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 1, p$cmap, reques
     *t, ierr)
              endif
              call hpf_buffer_free(send$buf$b$24)
            endif
          enddo
        enddo
        continue
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$25 = 0
              if (p$q2 .eq. p$myid2 .and. max(64 * p$q1 + 3, 256 * p$myi
     *d1 + 1, 256 * p$q1 + 2) .le. min(256 * p$myid1 + 256, 256 * p$q1 +
     * 257) .and. max(128 * p$myid2 + 1, 2) .le. min(128 * p$myid2 + 128
     *, 1023)) then
                counter$b$25 = counter$b$25 + (min(256 * p$myid1 + 256, 
     *256 * p$q1 + 257) - max(64 * p$q1 + 3, 256 * p$myid1 + 1, 256 * p$
     *q1 + 2) + 1) * (min(128 * p$myid2 + 128, 1023) - max(128 * p$myid2
     * + 1, 2) + 1)
              endif
              call hpf_buffer_alloc(counter$b$25 * 8, send$buf$b$25)
              call hpf_ptr_to_index(hpf$heap, send$buf$b$25, 8, send$buf
     *$b$25$index)
C             --< Pack Loop For Send For Nonlocal Read >--
              counter$b$25 = 0
C             
C             Loop section ---[
C               max(((128 * p$myid2) + 1), 2) <= i2 <=
C               min(((128 * p$myid2) + 128), 1023),
C               max(((64 * p$q1) + 3), ((256 * p$myid1) + 1),
C                   ((256 * p$q1) + 2)) <= i1 <=
C               min(((256 * p$myid1) + 256), ((256 * p$q1) + 257)) ]---
C             
              if (p$q2 .eq. p$myid2) then
                do i1 = max(64 * p$q1 + 3, 256 * p$myid1 + 1, 256 * p$q1
     * + 2), min(256 * p$myid1 + 256, 256 * p$q1 + 257)
                  do i2 = max(128 * p$myid2 + 1, 2), min(128 * p$myid2 +
     * 128, 1023)
                    hpf$heap$double(send$buf$b$25$index + counter$b$25) 
     *= b(b$data + i1 - (b$coord$0 * 256 + 1) + (i2 - (b$coord$1 * 128 +
     * 1)) * b$local$extent$0)
                    counter$b$25 = counter$b$25 + 1
                  enddo
                enddo
              endif
              if (counter$b$25 .gt. 0) then
                call mpi_send(hpf$heap$double(send$buf$b$25$index), coun
     *ter$b$25, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 2, p$cmap, reques
     *t, ierr)
              endif
              call hpf_buffer_free(send$buf$b$25)
            endif
          enddo
        enddo
        continue
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$26 = 0
              if (p$q1 .eq. p$myid1 .and. p$myid2 .le. p$q2 .and. p$q2 .
     *le. 1 + p$myid2 .and. max(256 * p$myid1 + 1, 2) .le. min(256 * p$m
     *yid1 + 256, 1023) .and. max(128 * p$q2, 128 * p$myid2 + 1) .le. mi
     *n(128 * p$q2 + 127, 128 * p$myid2 + 128, 1022)) then
                counter$b$26 = counter$b$26 + (min(256 * p$myid1 + 256, 
     *1023) - max(256 * p$myid1 + 1, 2) + 1) * (min(128 * p$q2 + 127, 12
     *8 * p$myid2 + 128, 1022) - max(128 * p$q2, 128 * p$myid2 + 1) + 1)
              endif
              call hpf_buffer_alloc(counter$b$26 * 8, send$buf$b$26)
              call hpf_ptr_to_index(hpf$heap, send$buf$b$26, 8, send$buf
     *$b$26$index)
C             --< Pack Loop For Send For Nonlocal Read >--
              counter$b$26 = 0
C             
C             Loop section ---[
C               max((128 * p$q2), ((128 * p$myid2) + 1)) <= i2 <=
C               min(((128 * p$q2) + 127), ((128 * p$myid2) + 128), 1022),
C               max(((256 * p$myid1) + 1), 2) <= i1 <=
C               min(((256 * p$myid1) + 256), 1023) ]---
C             
              if (p$q1 .eq. p$myid1 .and. p$myid2 .le. p$q2 .and. p$q2 .
     *le. 1 + p$myid2) then
                do i1 = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 2
     *56, 1023)
                  do i2 = max(128 * p$q2, 128 * p$myid2 + 1), min(128 * 
     *p$q2 + 127, 128 * p$myid2 + 128, 1022)
                    hpf$heap$double(send$buf$b$26$index + counter$b$26) 
     *= b(b$data + i1 - (b$coord$0 * 256 + 1) + (i2 - (b$coord$1 * 128 +
     * 1)) * b$local$extent$0)
                    counter$b$26 = counter$b$26 + 1
                  enddo
                enddo
              endif
              if (counter$b$26 .gt. 0) then
                call mpi_send(hpf$heap$double(send$buf$b$26$index), coun
     *ter$b$26, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 3, p$cmap, reques
     *t, ierr)
              endif
              call hpf_buffer_free(send$buf$b$26)
            endif
          enddo
        enddo
        continue
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$27 = 0
              if (p$q1 .eq. p$myid1 .and. p$myid2 .le. 1 + p$q2 .and. p$
     *q2 .le. p$myid2 .and. max(256 * p$myid1 + 1, 2) .le. min(256 * p$m
     *yid1 + 256, 1023) .and. max(128 * p$q2 + 2, 128 * p$myid2 + 1, 3) 
     *.le. min(128 * p$q2 + 129, 128 * p$myid2 + 128)) then
                counter$b$27 = counter$b$27 + (min(256 * p$myid1 + 256, 
     *1023) - max(256 * p$myid1 + 1, 2) + 1) * (min(128 * p$q2 + 129, 12
     *8 * p$myid2 + 128) - max(128 * p$q2 + 2, 128 * p$myid2 + 1, 3) + 1
     *)
              endif
              call hpf_buffer_alloc(counter$b$27 * 8, send$buf$b$27)
              call hpf_ptr_to_index(hpf$heap, send$buf$b$27, 8, send$buf
     *$b$27$index)
C             --< Pack Loop For Send For Nonlocal Read >--
              counter$b$27 = 0
C             
C             Loop section ---[
C               max(((128 * p$q2) + 2), ((128 * p$myid2) + 1), 3) <= i2 <=
C               min(((128 * p$q2) + 129), ((128 * p$myid2) + 128)),
C               max(((256 * p$myid1) + 1), 2) <= i1 <=
C               min(((256 * p$myid1) + 256), 1023) ]---
C             
              if (p$q1 .eq. p$myid1 .and. p$myid2 .le. 1 + p$q2 .and. p$
     *q2 .le. p$myid2) then
                do i1 = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 2
     *56, 1023)
                  do i2 = max(128 * p$q2 + 2, 128 * p$myid2 + 1, 3), min
     *(128 * p$q2 + 129, 128 * p$myid2 + 128)
                    hpf$heap$double(send$buf$b$27$index + counter$b$27) 
     *= b(b$data + i1 - (b$coord$0 * 256 + 1) + (i2 - (b$coord$1 * 128 +
     * 1)) * b$local$extent$0)
                    counter$b$27 = counter$b$27 + 1
                  enddo
                enddo
              endif
              if (counter$b$27 .gt. 0) then
                call mpi_send(hpf$heap$double(send$buf$b$27$index), coun
     *ter$b$27, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 4, p$cmap, reques
     *t, ierr)
              endif
              call hpf_buffer_free(send$buf$b$27)
            endif
          enddo
        enddo
        continue
        continue
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$24 = 0
              if (p$q2 .eq. p$myid2 .and. max(256 * p$q1 + 1, 256 * p$my
     *id1) .le. min(256 * p$q1 + 256, 256 * p$myid1 + 255, 1022) .and. m
     *ax(128 * p$q2 + 1, 2) .le. min(128 * p$q2 + 128, 1023)) then
                counter$b$24 = counter$b$24 + (min(256 * p$q1 + 256, 256
     * * p$myid1 + 255, 1022) - max(256 * p$q1 + 1, 256 * p$myid1) + 1) 
     ** (min(128 * p$q2 + 128, 1023) - max(128 * p$q2 + 1, 2) + 1)
              endif
              call hpf_buffer_alloc(counter$b$24 * 8, recv$buf$b$24)
              call hpf_ptr_to_index(hpf$heap, recv$buf$b$24, 8, recv$buf
     *$b$24$index)
              if (counter$b$24 .gt. 0) then
                call mpi_recv(hpf$heap$double(recv$buf$b$24$index), coun
     *ter$b$24, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 1, p$cmap, reques
     *t, ierr)
              endif
C             --< Unpack Loop From Recv For Nonlocal Read >--
              counter$b$24 = 0
C             
C             Loop section ---[
C               max(((128 * p$q2) + 1), 2) <= i2 <=
C               min(((128 * p$q2) + 128), 1023),
C               max(((256 * p$q1) + 1), (256 * p$myid1)) <= i1 <=
C               min(((256 * p$q1) + 256), ((256 * p$myid1) + 255), 1022) ]---
C             
              if (p$q2 .eq. p$myid2) then
                do i1 = max(256 * p$q1 + 1, 256 * p$myid1), min(256 * p$
     *q1 + 256, 256 * p$myid1 + 255, 1022)
                  do i2 = max(128 * p$q2 + 1, 2), min(128 * p$q2 + 128, 
     *1023)
                    call hpf_nonlocal_insertd(hash$nonlocals, b$data, i1
     * + i2 * b$global$extent$0, hpf$heap$double(recv$buf$b$24$index + c
     *ounter$b$24))
                    counter$b$24 = counter$b$24 + 1
                  enddo
                enddo
              endif
              call hpf_buffer_free(recv$buf$b$24)
            endif
          enddo
        enddo
        continue
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$25 = 0
              if (p$q2 .eq. p$myid2 .and. max(64 * p$q1 + 3, 256 * p$q1 
     *+ 1, 256 * p$myid1 + 2) .le. min(256 * p$q1 + 256, 256 * p$myid1 +
     * 257) .and. max(128 * p$q2 + 1, 2) .le. min(128 * p$q2 + 128, 1023
     *)) then
                counter$b$25 = counter$b$25 + (min(256 * p$q1 + 256, 256
     * * p$myid1 + 257) - max(64 * p$q1 + 3, 256 * p$q1 + 1, 256 * p$myi
     *d1 + 2) + 1) * (min(128 * p$q2 + 128, 1023) - max(128 * p$q2 + 1, 
     *2) + 1)
              endif
              call hpf_buffer_alloc(counter$b$25 * 8, recv$buf$b$25)
              call hpf_ptr_to_index(hpf$heap, recv$buf$b$25, 8, recv$buf
     *$b$25$index)
              if (counter$b$25 .gt. 0) then
                call mpi_recv(hpf$heap$double(recv$buf$b$25$index), coun
     *ter$b$25, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 2, p$cmap, reques
     *t, ierr)
              endif
C             --< Unpack Loop From Recv For Nonlocal Read >--
              counter$b$25 = 0
C             
C             Loop section ---[
C               max(((128 * p$q2) + 1), 2) <= i2 <=
C               min(((128 * p$q2) + 128), 1023),
C               max(((64 * p$q1) + 3), ((256 * p$q1) + 1),
C                   ((256 * p$myid1) + 2)) <= i1 <=
C               min(((256 * p$q1) + 256), ((256 * p$myid1) + 257)) ]---
C             
              if (p$q2 .eq. p$myid2) then
                do i1 = max(64 * p$q1 + 3, 256 * p$q1 + 1, 256 * p$myid1
     * + 2), min(256 * p$q1 + 256, 256 * p$myid1 + 257)
                  do i2 = max(128 * p$q2 + 1, 2), min(128 * p$q2 + 128, 
     *1023)
                    call hpf_nonlocal_insertd(hash$nonlocals, b$data, i1
     * + i2 * b$global$extent$0, hpf$heap$double(recv$buf$b$25$index + c
     *ounter$b$25))
                    counter$b$25 = counter$b$25 + 1
                  enddo
                enddo
              endif
              call hpf_buffer_free(recv$buf$b$25)
            endif
          enddo
        enddo
        continue
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$26 = 0
              if (p$q1 .eq. p$myid1 .and. p$q2 .le. p$myid2 .and. p$myid
     *2 .le. 1 + p$q2 .and. max(256 * p$myid1 + 1, 2) .le. min(256 * p$m
     *yid1 + 256, 1023) .and. max(128 * p$myid2, 128 * p$q2 + 1) .le. mi
     *n(128 * p$myid2 + 127, 128 * p$q2 + 128, 1022)) then
                counter$b$26 = counter$b$26 + (min(256 * p$myid1 + 256, 
     *1023) - max(256 * p$myid1 + 1, 2) + 1) * (min(128 * p$myid2 + 127,
     * 128 * p$q2 + 128, 1022) - max(128 * p$myid2, 128 * p$q2 + 1) + 1)
              endif
              call hpf_buffer_alloc(counter$b$26 * 8, recv$buf$b$26)
              call hpf_ptr_to_index(hpf$heap, recv$buf$b$26, 8, recv$buf
     *$b$26$index)
              if (counter$b$26 .gt. 0) then
                call mpi_recv(hpf$heap$double(recv$buf$b$26$index), coun
     *ter$b$26, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 3, p$cmap, reques
     *t, ierr)
              endif
C             --< Unpack Loop From Recv For Nonlocal Read >--
              counter$b$26 = 0
C             
C             Loop section ---[
C               max((128 * p$myid2), ((128 * p$q2) + 1)) <= i2 <=
C               min(((128 * p$myid2) + 127), ((128 * p$q2) + 128), 1022),
C               max(((256 * p$myid1) + 1), 2) <= i1 <=
C               min(((256 * p$myid1) + 256), 1023) ]---
C             
              if (p$q1 .eq. p$myid1 .and. p$q2 .le. p$myid2 .and. p$myid
     *2 .le. 1 + p$q2) then
                do i1 = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 2
     *56, 1023)
                  do i2 = max(128 * p$myid2, 128 * p$q2 + 1), min(128 * 
     *p$myid2 + 127, 128 * p$q2 + 128, 1022)
                    call hpf_nonlocal_insertd(hash$nonlocals, b$data, i1
     * + i2 * b$global$extent$0, hpf$heap$double(recv$buf$b$26$index + c
     *ounter$b$26))
                    counter$b$26 = counter$b$26 + 1
                  enddo
                enddo
              endif
              call hpf_buffer_free(recv$buf$b$26)
            endif
          enddo
        enddo
        continue
C       
C       --<< Iterations that access only local values >>-- 
C       
C       
C       Loop section ---[ ((256 * p$myid1) + 2) <= i <= ((256 * p$myid1) + 255),
C ((128 * p$myid2) + 2) <= j <= ((128 * p$myid2) + 127) ]---
C       
        do j = 128 * p$myid2 + 2, 128 * p$myid2 + 127
          do i = 256 * p$myid1 + 2, 256 * p$myid1 + 255
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (b(b$data + i - 1 - (b$coord$0
     * * 256 + 1) + (j - (b$coord$1 * 128 + 1)) * b$local$extent$0) + b(
     *b$data + i + 1 - (b$coord$0 * 256 + 1) + (j - (b$coord$1 * 128 + 1
     *)) * b$local$extent$0) + b(b$data + i - (b$coord$0 * 256 + 1) + (j
     * - 1 - (b$coord$1 * 128 + 1)) * b$local$extent$0) + b(b$data + i -
     * (b$coord$0 * 256 + 1) + (j + 1 - (b$coord$1 * 128 + 1)) * b$local
     *$extent$0))
          enddo
        enddo
C       
C       Loop section ---[ 0 <= p$q2 <= 7, 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          do p$q2 = 0, 7
            if (p$myid1 .ne. p$q1 .or. p$myid2 .ne. p$q2) then
C             --< Loop Counters >--
              counter$b$27 = 0
              if (p$q1 .eq. p$myid1 .and. p$q2 .le. 1 + p$myid2 .and. p$
     *myid2 .le. p$q2 .and. max(256 * p$myid1 + 1, 2) .le. min(256 * p$m
     *yid1 + 256, 1023) .and. max(128 * p$myid2 + 2, 128 * p$q2 + 1, 3) 
     *.le. min(128 * p$myid2 + 129, 128 * p$q2 + 128)) then
                counter$b$27 = counter$b$27 + (min(256 * p$myid1 + 256, 
     *1023) - max(256 * p$myid1 + 1, 2) + 1) * (min(128 * p$myid2 + 129,
     * 128 * p$q2 + 128) - max(128 * p$myid2 + 2, 128 * p$q2 + 1, 3) + 1
     *)
              endif
              call hpf_buffer_alloc(counter$b$27 * 8, recv$buf$b$27)
              call hpf_ptr_to_index(hpf$heap, recv$buf$b$27, 8, recv$buf
     *$b$27$index)
              if (counter$b$27 .gt. 0) then
                call mpi_recv(hpf$heap$double(recv$buf$b$27$index), coun
     *ter$b$27, MPI_DOUBLE_PRECISION, p$q2 + 8 * p$q1, 4, p$cmap, reques
     *t, ierr)
              endif
C             --< Unpack Loop From Recv For Nonlocal Read >--
              counter$b$27 = 0
C             
C             Loop section ---[
C               max(((128 * p$myid2) + 2), ((128 * p$q2) + 1), 3) <= i2 <=
C               min(((128 * p$myid2) + 129), ((128 * p$q2) + 128)),
C               max(((256 * p$myid1) + 1), 2) <= i1 <=
C               min(((256 * p$myid1) + 256), 1023) ]---
C             
              if (p$q1 .eq. p$myid1 .and. p$q2 .le. 1 + p$myid2 .and. p$
     *myid2 .le. p$q2) then
                do i1 = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 2
     *56, 1023)
                  do i2 = max(128 * p$myid2 + 2, 128 * p$q2 + 1, 3), min
     *(128 * p$myid2 + 129, 128 * p$q2 + 128)
                    call hpf_nonlocal_insertd(hash$nonlocals, b$data, i1
     * + i2 * b$global$extent$0, hpf$heap$double(recv$buf$b$27$index + c
     *ounter$b$27))
                    counter$b$27 = counter$b$27 + 1
                  enddo
                enddo
              endif
              call hpf_buffer_free(recv$buf$b$27)
            endif
          enddo
        enddo
C       
C       --<< Iterations that read (but do not compute) non-local values >>-- 
C       
C       
C       Loop section ---[
C         max(((256 * p$myid1) + 1), 2) <= i <=
C         min(((256 * p$myid1) + 256), 1023),
C         j = ((128 * p$myid2) + 1) ]---
C       
        if (1 .le. p$myid2) then
          do i = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 256, 102
     *3)
            j = 128 * p$myid2 + 1
            if (p$myid1 * 256 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp3 = b(b$data + i - 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp3 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i -
     * 1 + j * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i + 1 .and. i + 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp9 = b(b$data + i + 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp9 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i +
     * 1 + j * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j - 1 .and. j - 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp15 = b(b$data + i - (b$coord$0 * 256 + 1) + (j - 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp15 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j - 1) * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j + 1 .and. j + 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp21 = b(b$data + i - (b$coord$0 * 256 + 1) + (j + 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp21 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j + 1) * b$global$extent$0)
            endif
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (lnltmp3 + lnltmp9 + lnltmp15 
     *+ lnltmp21)
          enddo
        endif
C       
C       Loop section ---[
C         i = ((256 * p$myid1) + 1),
C         ((128 * p$myid2) + 2) <= j <= ((128 * p$myid2) + 127) ]---
C       
C       
C       Loop section ---[
C         i = ((256 * p$myid1) + 256),
C         ((128 * p$myid2) + 2) <= j <= ((128 * p$myid2) + 127) ]---
C       
        if (1 .le. p$myid1 .and. p$myid1 .le. 2) then
          do j = 128 * p$myid2 + 2, 128 * p$myid2 + 127
            i = 256 * p$myid1 + 1
            if (p$myid1 * 256 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp4 = b(b$data + i - 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp4 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i -
     * 1 + j * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i + 1 .and. i + 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp10 = b(b$data + i + 1 - (b$coord$0 * 256 + 1) + (j -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp10 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ 1 + j * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j - 1 .and. j - 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp16 = b(b$data + i - (b$coord$0 * 256 + 1) + (j - 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp16 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j - 1) * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j + 1 .and. j + 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp22 = b(b$data + i - (b$coord$0 * 256 + 1) + (j + 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp22 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j + 1) * b$global$extent$0)
            endif
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (lnltmp4 + lnltmp10 + lnltmp16
     * + lnltmp22)
            i = 256 * p$myid1 + 256
            if (p$myid1 * 256 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp5 = b(b$data + i - 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp5 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i -
     * 1 + j * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i + 1 .and. i + 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp11 = b(b$data + i + 1 - (b$coord$0 * 256 + 1) + (j -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp11 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ 1 + j * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j - 1 .and. j - 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp17 = b(b$data + i - (b$coord$0 * 256 + 1) + (j - 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp17 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j - 1) * b$global$extent$0)
            endif
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j + 1 .and. j + 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp23 = b(b$data + i - (b$coord$0 * 256 + 1) + (j + 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp23 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j + 1) * b$global$extent$0)
            endif
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (lnltmp5 + lnltmp11 + lnltmp17
     * + lnltmp23)
          enddo
        endif
C       
C       Loop section ---[ i = 769, ((128 * p$myid2) + 2) <= j <= ((128 * p$myid2
C) + 127) ]---
C       
C       Purpose: when p$myid1 >= 3, update row i = 769 of a for the
C       columns j = 128*p$myid2 + 2 .. 128*p$myid2 + 127, setting each
C       point to 0.25 times the sum of b(i-1,j), b(i+1,j), b(i,j-1)
C       and b(i,j+1).
C
C       For p$myid1 = 3 the range test on i spans 769 .. 1024, so
C       i - 1 = 768 fails it and lnltmp6 is taken from
C       hpf_nonlocal_lookupd; the other three operands pass and are
C       read from b directly.  (Assumes p$myid1 never exceeds 3, as
C       the 4-row processor arrangement in the file header suggests
C       -- confirm.)
        if (3 .le. p$myid1) then
          do j = 128 * p$myid2 + 2, 128 * p$myid2 + 127
            i = 769
C           lnltmp6 <- b(i-1, j)
            if (p$myid1 * 256 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp6 = b(b$data + i - 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp6 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i -
     * 1 + j * b$global$extent$0)
            endif
C           lnltmp12 <- b(i+1, j)
            if (p$myid1 * 256 + 1 .le. i + 1 .and. i + 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp12 = b(b$data + i + 1 - (b$coord$0 * 256 + 1) + (j -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp12 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ 1 + j * b$global$extent$0)
            endif
C           lnltmp18 <- b(i, j-1)
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j - 1 .and. j - 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp18 = b(b$data + i - (b$coord$0 * 256 + 1) + (j - 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp18 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j - 1) * b$global$extent$0)
            endif
C           lnltmp24 <- b(i, j+1)
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j + 1 .and. j + 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp24 = b(b$data + i - (b$coord$0 * 256 + 1) + (j + 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp24 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j + 1) * b$global$extent$0)
            endif
C           a(i, j) <- average of the four neighbours
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (lnltmp6 + lnltmp12 + lnltmp18
     * + lnltmp24)
          enddo
        endif
C       
C       Loop section ---[ i = 256, ((128 * p$myid2) + 2) <= j <= ((128 * p$myid2
C) + 127) ]---
C       
C       Purpose: when p$myid1 <= 0, update row i = 256 of a for the
C       columns j = 128*p$myid2 + 2 .. 128*p$myid2 + 127, setting each
C       point to 0.25 times the sum of b(i-1,j), b(i+1,j), b(i,j-1)
C       and b(i,j+1).
C
C       For p$myid1 = 0 the range test on i spans 1 .. 256, so
C       i + 1 = 257 fails it and lnltmp7 is taken from
C       hpf_nonlocal_lookupd; the other three operands pass and are
C       read from b directly.  (Assumes p$myid1 is never negative
C       -- confirm.)
        if (p$myid1 .le. 0) then
          do j = 128 * p$myid2 + 2, 128 * p$myid2 + 127
            i = 256
C           lnltmp1 <- b(i-1, j)
            if (p$myid1 * 256 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp1 = b(b$data + i - 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp1 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i -
     * 1 + j * b$global$extent$0)
            endif
C           lnltmp7 <- b(i+1, j)
            if (p$myid1 * 256 + 1 .le. i + 1 .and. i + 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp7 = b(b$data + i + 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp7 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i +
     * 1 + j * b$global$extent$0)
            endif
C           lnltmp13 <- b(i, j-1)
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j - 1 .and. j - 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp13 = b(b$data + i - (b$coord$0 * 256 + 1) + (j - 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp13 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j - 1) * b$global$extent$0)
            endif
C           lnltmp19 <- b(i, j+1)
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j + 1 .and. j + 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp19 = b(b$data + i - (b$coord$0 * 256 + 1) + (j + 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp19 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j + 1) * b$global$extent$0)
            endif
C           a(i, j) <- average of the four neighbours
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (lnltmp1 + lnltmp7 + lnltmp13 
     *+ lnltmp19)
          enddo
        endif
C       
C       Loop section ---[ max(((256 * p$myid1) + 1), 2) <= i <= min(((256 * p$my
Cid1) + 256), 1023), j = ((128 * p$myid2) + 128) ]---
C       
C       Purpose: when p$myid2 <= 6, update column
C       j = 128*p$myid2 + 128 of a for every i from
C       max(256*p$myid1 + 1, 2) to min(256*p$myid1 + 256, 1023),
C       setting each point to 0.25 times the sum of b(i-1,j),
C       b(i+1,j), b(i,j-1) and b(i,j+1).
C
C       j + 1 = 128*p$myid2 + 129 is not below the upper bound of the
C       range test on j, so lnltmp20 always comes from
C       hpf_nonlocal_lookupd.  The i - 1 and i + 1 operands also take
C       the lookup branch at the ends of the i range whenever
C       i - 1 = 256*p$myid1 or i + 1 = 256*p$myid1 + 257; all other
C       operands are read from b directly.
        if (p$myid2 .le. 6) then
          do i = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 256, 102
     *3)
            j = 128 * p$myid2 + 128
C           lnltmp2 <- b(i-1, j)
            if (p$myid1 * 256 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp2 = b(b$data + i - 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp2 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i -
     * 1 + j * b$global$extent$0)
            endif
C           lnltmp8 <- b(i+1, j)
            if (p$myid1 * 256 + 1 .le. i + 1 .and. i + 1 .lt. p$myid1 * 
     *256 + 257 .and. p$myid2 * 128 + 1 .le. j .and. j .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp8 = b(b$data + i + 1 - (b$coord$0 * 256 + 1) + (j - 
     *(b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp8 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i +
     * 1 + j * b$global$extent$0)
            endif
C           lnltmp14 <- b(i, j-1)
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j - 1 .and. j - 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp14 = b(b$data + i - (b$coord$0 * 256 + 1) + (j - 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp14 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j - 1) * b$global$extent$0)
            endif
C           lnltmp20 <- b(i, j+1)
            if (p$myid1 * 256 + 1 .le. i .and. i .lt. p$myid1 * 256 + 25
     *7 .and. p$myid2 * 128 + 1 .le. j + 1 .and. j + 1 .lt. p$myid2 * 12
     *8 + 129) then
              lnltmp20 = b(b$data + i - (b$coord$0 * 256 + 1) + (j + 1 -
     * (b$coord$1 * 128 + 1)) * b$local$extent$0)
            else
              lnltmp20 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i 
     *+ (j + 1) * b$global$extent$0)
            endif
C           a(i, j) <- average of the four neighbours
            a(a$data + i - (a$coord$0 * 256 + 1) + (j - (a$coord$1 * 128
     * + 1)) * a$local$extent$0) = 0.25 * (lnltmp2 + lnltmp8 + lnltmp14 
     *+ lnltmp20)
          enddo
        endif
C       
C       --<< Iterations that access only local values >>-- 
C       
C       
C       Loop section ---[ max(((256 * p$myid1) + 1), 2) <= i <= min(((256 * p$my
Cid1) + 256), 1023), max(((128 * p$myid2) + 1), 2) <= j <= min(((128 * p$myid2) 
C+ 128), 1023) ]---
C       
C       Purpose: copy a into b, element by element, for every (i, j)
C       with max(256*p$myid1 + 1, 2) <= i <= min(256*p$myid1 + 256,
C       1023) and max(128*p$myid2 + 1, 2) <= j <= min(128*p$myid2 +
C       128, 1023).  Both arrays are addressed through their own
C       data offset, coordinate and local-extent values; no range
C       test or hpf_nonlocal_lookupd call is involved.
C       i is the inner loop and has unit coefficient in both index
C       expressions, so consecutive iterations touch adjacent
C       elements.
        do j = max(128 * p$myid2 + 1, 2), min(128 * p$myid2 + 128, 1023)
          do i = max(256 * p$myid1 + 1, 2), min(256 * p$myid1 + 256, 102
     *3)
            b(b$data + i - (b$coord$0 * 256 + 1) + (j - (b$coord$1 * 128
     * + 1)) * b$local$extent$0) = a(a$data + i - (a$coord$0 * 256 + 1) 
     *+ (j - (a$coord$1 * 128 + 1)) * a$local$extent$0)
          enddo
        enddo
C       Hand the hash$nonlocals handle, which was passed to every
C       hpf_nonlocal_lookupd call in the loop sections above, to
C       hpf_nonlocals_free (presumably releasing the lookup table --
C       confirm against the runtime library).
        call hpf_nonlocals_free(hash$nonlocals)
C       -----------------------------
C       finalize run-time descriptors
C       -----------------------------
C       Teardown order as emitted: the processor and template
C       descriptors first, then for each of a and b its two alignment
C       descriptors, its two distribution descriptors, the array
C       itself (hpf_array_free receives the array, its data offset and
C       its descriptor) and finally the array descriptor.
C       NOTE(review): the statement order is kept exactly as
C       generated; whether the runtime requires this order is not
C       visible here.
        call hpf_procrtd_free(p$processors)
        call hpf_tmplrtd_free(t$template)
        call hpf_alignrtd_free(a$align)
        call hpf_alignrtd_free(a$align$new)
        call hpf_distrtd_free(a$dist)
        call hpf_distrtd_free(a$dist$new)
        call hpf_array_free(a, a$data, a$desc)
        call hpf_arrayrtd_free(a$desc)
        call hpf_alignrtd_free(b$align)
        call hpf_alignrtd_free(b$align$new)
        call hpf_distrtd_free(b$dist)
        call hpf_distrtd_free(b$dist$new)
        call hpf_array_free(b, b$data, b$desc)
        call hpf_arrayrtd_free(b$desc)
C       ---------------------------
C       runtime system finalization
C       ---------------------------
C       Shut down MPI as the last executable statement.  The status
C       returned in ierr is not examined before the program ends.
        call mpi_finalize(ierr)
C       
      end