HPF 1D Shift Kernel
Generated Code


      program simpletest
        integer i
CHPF$   processors p (4) 
CHPF$   template t (100) 
CHPF$   align a (i) with t(i)
CHPF$   align b (i) with t(i)
CHPF$   distribute t(block) onto p
C       --------------------------------------------------------------
C       declarations for heap-based runtime dynamic storage allocation
C       --------------------------------------------------------------
        common /hpf$heap$common/ hpf$heap
        dimension hpf$heap$integer(0:0)
        equivalence (hpf$heap$integer(0), hpf$heap)
        dimension hpf$heap$double(0:0)
        equivalence (hpf$heap$double(0), hpf$heap)
#include 
        integer status(MPI_STATUS_SIZE)
C       -----------------------------------------------
C       declarations for compiler-generated temporaries
C       -----------------------------------------------
        save a, b
        logical p$wrap
        integer counter$b$3, send$buf$b$3$index, i1, p$q1
        integer recv$buf$b$3$index, a$coord$0, b$coord$0, p$myid1
        integer hpf$heap$integer, p$cmap, p$dims, sendproc, recvproc
        integer myid, ierr, request
        integer*4 a$data, a$coord, b$data, b$coord, p$coord
        real*8 hpf$heap$double, lnltmp1, hpf_nonlocal_lookupd, a, b
        real*8 hpf$heap
        integer*4 hash$nonlocals, send$buf$b$3, recv$buf$b$3, a$align
        integer*4 a$dist, a$tmpl, a$proc, a$align$new, a$dist$new
        integer*4 a$tmpl$new, a$proc$new, a$desc, b$align, b$dist
        integer*4 b$tmpl, b$proc, b$align$new, b$dist$new, b$tmpl$new
        integer*4 b$proc$new, b$desc, t$template, t$proc, t$dist
        integer*4 p$processors
        dimension a(0:0), b(0:0), p$dims(1:1), p$wrap(1:1)
C       
C       
C       -----------------------------
C       runtime system initialization
C       -----------------------------
        call mpi_init(ierr)
        call mpi_comm_rank(MPI_COMM_WORLD, myid, ierr)
C       -----------------------------------------------
C       initializations for run-time descriptor indices
C       -----------------------------------------------
        a$align$new = 0
        a$align = 0
        b$align$new = 0
        b$align = 0
        t$dist = 0
        a$dist$new = 0
        a$dist = 0
        b$dist$new = 0
        b$dist = 0
        a$desc = 0
        b$desc = 0
C       
C       
C       -------------------------------------
C       building array descriptor for array a
C       -------------------------------------
        call hpf_arrayrtd_alloc(1, 2, a$desc)
        call hpf_arrayrtd_setdim(a$desc, 0, 1, 100)
C       
C       -------------------------------------
C       building array descriptor for array b
C       -------------------------------------
        call hpf_arrayrtd_alloc(1, 2, b$desc)
        call hpf_arrayrtd_setdim(b$desc, 0, 1, 100)
C       
C       ------------------------------------
C       building tmpl descriptor: template t
C       ------------------------------------
        call hpf_tmplrtd_alloc(1, t$template)
        call hpf_tmplrtd_setdim(t$template, 0, 1, 100)
C       
C       ---------------------------------------
C       building procs descriptor: processors p
C       ---------------------------------------
        call hpf_procrtd_alloc(1, p$processors)
        call hpf_procrtd_setdim(p$processors, 0, 1, 4)
C       
C       ----------------------------------------------------
C       initialize processor topology for processors array p
C       ----------------------------------------------------
        p$dims(1) = 4
        p$wrap(1) = .false.
        call mpi_cart_create(MPI_COMM_WORLD, 1, p$dims, p$wrap, .false.,
     * p$cmap, ierr)
        call hpf_procrtd_get_coords(hpf$heap, p$processors, p$coord)
        call mpi_cart_coords(p$cmap, myid, 1, hpf$heap$integer(p$coord),
     * ierr)
        p$myid1 = hpf$heap$integer(p$coord + 0)
        call hpf_procrtd_set_chandle(p$processors, p$cmap)
C       
C       -----------------------------------------------
C       building dist descriptor: distribute template t
C       -----------------------------------------------
        call hpf_distrtd_alloc(1, t$dist)
        call hpf_distrtd_setdim(t$dist, 0, -1, 25, 0)
        t$proc = p$processors
C       
C       --------------------------------------------
C       building dist descriptor: distribute array a
C       --------------------------------------------
        call hpf_distrtd_clone(t$dist, a$dist$new)
        a$proc$new = t$proc
C       
C       --------------------------------------------
C       building dist descriptor: distribute array b
C       --------------------------------------------
        call hpf_distrtd_clone(t$dist, b$dist$new)
        b$proc$new = t$proc
C       
C       -----------------------------------------
C       building align descriptor: align b with t
C       -----------------------------------------
        call hpf_alignrtd_alloc(1, 1, b$align$new)
        call hpf_alignrtd_setdim_src(b$align$new, 0, 0)
        call hpf_alignrtd_setdim_tmpl(b$align$new, 0, 0, 1, 0, 0)
C       
C       ------------------------------------------------------------
C       establish template and processor association: align b with t
C       ------------------------------------------------------------
        b$tmpl$new = t$template
        b$proc$new = t$proc
C       
C       -----------------------------------------
C       building align descriptor: align a with t
C       -----------------------------------------
        call hpf_alignrtd_alloc(1, 1, a$align$new)
        call hpf_alignrtd_setdim_src(a$align$new, 0, 0)
        call hpf_alignrtd_setdim_tmpl(a$align$new, 0, 0, 1, 0, 0)
C       
C       ------------------------------------------------------------
C       establish template and processor association: align a with t
C       ------------------------------------------------------------
        a$tmpl$new = t$template
        a$proc$new = t$proc
C       
C       ------------------------------------------------------------------------
C       allocate or redistribute array a; compute array-indexed processor coords
C       ------------------------------------------------------------------------
        call hpf_array_remap(a, a$align, a$dist, a$tmpl, a$proc, a$align
     *$new, a$dist$new, a$tmpl$new, a$proc$new, a$desc, a$data)
C       
C       -----------------------------------------------------------------------
C       map array-indexed processor coordinates to partitioned array dimensions
C       -----------------------------------------------------------------------
        call hpf_arrayrtd_get_coords(hpf$heap, a$desc, a$coord)
        a$coord$0 = hpf$heap$integer(a$coord + 0)
C       
C       ------------------------------------------------------------------------
C       allocate or redistribute array b; compute array-indexed processor coords
C       ------------------------------------------------------------------------
        call hpf_array_remap(b, b$align, b$dist, b$tmpl, b$proc, b$align
     *$new, b$dist$new, b$tmpl$new, b$proc$new, b$desc, b$data)
C       
C       -----------------------------------------------------------------------
C       map array-indexed processor coordinates to partitioned array dimensions
C       -----------------------------------------------------------------------
        call hpf_arrayrtd_get_coords(hpf$heap, b$desc, b$coord)
        b$coord$0 = hpf$heap$integer(b$coord + 0)
C       
        call hpf_nonlocals_alloc(hash$nonlocals)
C       
C       Loop section ---[ 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          if (p$myid1 .ne. p$q1) then
C           --< Loop Counters >--
            counter$b$3 = 0
            if (max(25 * p$myid1 + 1, 25 * p$q1) .le. min(25 * p$q1 + 24
     *, 25 * p$myid1 + 25, 98)) then
              counter$b$3 = counter$b$3 + min(25 * p$q1 + 24, 25 * p$myi
     *d1 + 25, 98) - max(25 * p$myid1 + 1, 25 * p$q1) + 1
            endif
            call hpf_buffer_alloc(counter$b$3 * 8, send$buf$b$3)
            call hpf_ptr_to_index(hpf$heap, send$buf$b$3, 8, send$buf$b$
     *3$index)
C           --< Pack Loop For Send For Nonlocal Read >--
            counter$b$3 = 0
C           
C           Loop section ---[ max(((25 * p$myid1) + 1), (25 * p$q1)) <= i1 <= mi
Cn(((25 * p$q1) + 24), ((25 * p$myid1) + 25), 98) ]---
C           
            do i1 = max(25 * p$myid1 + 1, 25 * p$q1), min(25 * p$q1 + 24
     *, 25 * p$myid1 + 25, 98)
              hpf$heap$double(send$buf$b$3$index + counter$b$3) = b(b$da
     *ta + i1 - (b$coord$0 * 25 + 1))
              counter$b$3 = counter$b$3 + 1
            enddo
            if (counter$b$3 .gt. 0) then
              call mpi_send(hpf$heap$double(send$buf$b$3$index), counter
     *$b$3, MPI_DOUBLE_PRECISION, p$q1, 1, p$cmap, request, ierr)
            endif
            call hpf_buffer_free(send$buf$b$3)
          endif
        enddo
        continue
C       
C       Loop section ---[  ]---
C       
        if (p$myid1 .le. 0) then
          a(a$data + 1 - (a$coord$0 * 25 + 1)) = 0
        endif
        continue
C       
C       --<< Iterations that access only local values >>-- 
C       
C       
C       Loop section ---[ ((25 * p$myid1) + 2) <= i <= min(((25 * p$myid1) + 25)
C, 99) ]---
C       
        do i = 25 * p$myid1 + 2, min(25 * p$myid1 + 25, 99)
          a(a$data + i - (a$coord$0 * 25 + 1)) = 0.25 * b(b$data + i - 1
     * - (b$coord$0 * 25 + 1))
        enddo
C       
C       Loop section ---[ 0 <= p$q1 <= 3 ]---
C       
        do p$q1 = 0, 3
          if (p$myid1 .ne. p$q1) then
C           --< Loop Counters >--
            counter$b$3 = 0
            if (max(25 * p$q1 + 1, 25 * p$myid1) .le. min(25 * p$myid1 +
     * 24, 25 * p$q1 + 25, 98)) then
              counter$b$3 = counter$b$3 + min(25 * p$myid1 + 24, 25 * p$
     *q1 + 25, 98) - max(25 * p$q1 + 1, 25 * p$myid1) + 1
            endif
            call hpf_buffer_alloc(counter$b$3 * 8, recv$buf$b$3)
            call hpf_ptr_to_index(hpf$heap, recv$buf$b$3, 8, recv$buf$b$
     *3$index)
            if (counter$b$3 .gt. 0) then
              call mpi_recv(hpf$heap$double(recv$buf$b$3$index), counter
     *$b$3, MPI_DOUBLE_PRECISION, p$q1, 1, p$cmap, request, ierr)
            endif
C           --< Unpack Loop From Recv For Nonlocal Read >--
            counter$b$3 = 0
C           
C           Loop section ---[ max(((25 * p$q1) + 1), (25 * p$myid1)) <= i1 <= mi
Cn(((25 * p$myid1) + 24), ((25 * p$q1) + 25), 98) ]---
C           
            do i1 = max(25 * p$q1 + 1, 25 * p$myid1), min(25 * p$myid1 +
     * 24, 25 * p$q1 + 25, 98)
              call hpf_nonlocal_insertd(hash$nonlocals, b$data, i1, hpf$
     *heap$double(recv$buf$b$3$index + counter$b$3))
              counter$b$3 = counter$b$3 + 1
            enddo
            call hpf_buffer_free(recv$buf$b$3)
          endif
        enddo
C       
C       --<< Iterations that read (but do not compute) non-local values >>-- 
C       
C       
C       Loop section ---[ i = ((25 * p$myid1) + 1) ]---
C       
        if (1 .le. p$myid1) then
          i = 25 * p$myid1 + 1
          if (p$myid1 * 25 + 1 .le. i - 1 .and. i - 1 .lt. p$myid1 * 25 
     *+ 26) then
            lnltmp1 = b(b$data + i - 1 - (b$coord$0 * 25 + 1))
          else
            lnltmp1 = hpf_nonlocal_lookupd(hash$nonlocals, b$data, i - 1
     *)
          endif
          a(a$data + i - (a$coord$0 * 25 + 1)) = 0.25 * lnltmp1
        endif
        call hpf_nonlocals_free(hash$nonlocals)
C       -----------------------------
C       finalize run-time descriptors
C       -----------------------------
        call hpf_procrtd_free(p$processors)
        call hpf_tmplrtd_free(t$template)
        call hpf_alignrtd_free(a$align)
        call hpf_alignrtd_free(a$align$new)
        call hpf_distrtd_free(a$dist)
        call hpf_distrtd_free(a$dist$new)
        call hpf_array_free(a, a$data, a$desc)
        call hpf_arrayrtd_free(a$desc)
        call hpf_alignrtd_free(b$align)
        call hpf_alignrtd_free(b$align$new)
        call hpf_distrtd_free(b$dist)
        call hpf_distrtd_free(b$dist$new)
        call hpf_array_free(b, b$data, b$desc)
        call hpf_arrayrtd_free(b$desc)
C       ---------------------------
C       runtime system finalization
C       ---------------------------
        call mpi_finalize(ierr)
C       
      end