/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */
#include "MemTest.h"
#include "BlackJackTimer.h"

/* Scratch index sets used while BuildBCPerm assembles a permutation.
   BuildBCPerm allocates them, fills them, and frees them again before
   it returns; the Size variables track the current element counts. */
static int *ColumnIndexSet   = (int *) NULL;
static int ColumnIndexSize   = 0;
static int *RowIndexSet      = (int *) NULL;
static int RowIndexSize      = 0;
/* Index into MemArray at which the most recently built permutation
   starts; set by BuildBCPerm and handed to TimePermPtr by BCTest. */
static int StartingPoint     = -1;

/* NOTE(review): nothing visible in this file reads or updates this
   file-scope counter -- confirm before removing. */
static int count  = 0;
/* Nonzero once BuildBCPerm has logged its "BlockSize is small" warning;
   BCTrial resets it to 0 at the start of each trial series. */
static int complain = 0;

/* Array holding the pointer-chain permutation; (re)allocated by
   BuildBCPerm on every call and released by BCTrial. */
static void **MemArray = (void *) NULL;


/* BuildBCPerm
 *
 * Constructs, in global array MemArray, a block cyclic permutation
 * and records the index of its starting element in StartingPoint.
 *
 * PARAMETERS: ArraySize - footprint of the test, in words (see BCTest)
 *             BlockSize - size of each block (row), in words;
 *                         must be >= Stride and <= ArraySize
 *             Stride    - distance between columns, in words; must be >= 1
 *
 * RETURNS: void
 *
 * Structural problems with the parameters are written to LogFile and
 * reported through Abort.
 *
 */

void BuildBCPerm( int ArraySize, /* see BCTest */
		  int BlockSize, /* see BCTest */
		  int Stride     /* see BCTest */ )
{
  int i, j, col, ThisElt, LastElt, CycleLen;
  int NColumns, NRows, RowPitch;
  void **p, **Old;

  /* A non-positive Stride would divide by zero below, and a BlockSize
   * smaller than Stride would leave zero columns, so that the starting
   * point computation would read ColumnIndexSet[0] out of bounds.
   */
  if (Stride < 1 || BlockSize < Stride)
  {
    fprintf(LogFile,"\"Stride (%s w) must be positive and no larger than BlockSize (%s w).\"\n",
	    PrintNum(Stride),PrintNum(BlockSize));
    Abort("Structural problem in use of BuildBCPerm, see logfile.",-1);
    return;  /* Abort is not expected to return; do not continue if it does */
  }

  NColumns = BlockSize / Stride;
  NRows    = ArraySize / BlockSize;
  RowPitch = NColumns * Stride;   /* words from one row to the next */

  /* Warn (once, until 'complain' is reset) about very narrow blocks */
  if (NColumns < 8 && complain == 0)
  {
    fprintf(LogFile,"\"BlockSize (%s w) is small relative to Stride (%s w).\"\n",
	    PrintNum(BlockSize),PrintNum(Stride));
    /* Abort("Structural problem with BlockSize and Stride.",-1); */
    complain++;
  }

  if (NRows < 1)
  {
    fprintf(LogFile,"\"BlockSize (%s w) is larger than ArraySize (%s w).\"\n",
	    PrintNum(BlockSize),PrintNum(ArraySize));
    Abort("Structural problem in use of BuildBCPerm, see logfile.",-1);
    return;  /* Abort is not expected to return; do not continue if it does */
  }

  /* (Re)allocate the two index sets at their new sizes */
  if (ColumnIndexSet != (int *) NULL)
    PACE_FreeMem(ColumnIndexSet);

  ColumnIndexSet   = (int *) PACE_AllocMem( NColumns * sizeof(int) );
  ColumnIndexSize  =         NColumns;

  if (RowIndexSet != (int *) NULL)
    PACE_FreeMem(RowIndexSet);

  RowIndexSet  = (int *) PACE_AllocMem( NRows * sizeof(int) );
  RowIndexSize =         NRows;

  (void) GenerateLinearSet( ColumnIndexSet, NColumns, Stride );
  (void) Shuffle( ColumnIndexSet, NColumns );

  (void) GenerateLinearSet( RowIndexSet, NRows, 1 );
  /*  (void) Shuffle( RowIndexSet, NRows, 1 ); */

  /* The new array is allocated before the old one is released */
  Old = MemArray;
  MemArray = (void **) PACE_AllocMem(ArraySize * UnitSize );
  if (Old != NULL)
  {
    PACE_FreeMem(Old);
    Old = (void **) NULL;
  }

  /*  Assemble the permutation ...
   *
   *  The right way to think of this permutation is as an array
   *  in row major order.  The first index set picks columns at
   *  random, separated by Stride.  The second index set picks
   *  rows, with unit stride among rows.  (E.g., we use all the
   *  rows but space among the columns by Stride to avoid spatial
   *  locality.  Spatial locality is bad for the measurements
   *  because it simply decreases the magnitude of a transition.)
   *
   *  The starting point for the permutation is
   *
   *     MemArray[RowIndexSet[0]][ColumnIndexSet[0]]
   *
   *  We link together the elements of MemArray in the permutation
   *  by choosing a column (ColumnIndexSet[i]) and linking together
   *  the row entries for that column in the order dictated by
   *  RowIndexSet[0 ... NRows-1].  We link the last one of those to
   *  the first element of the next column, and continue.
   *
   *  To simplify the loop (and complicate the explanation), we link
   *  element a to element b by making MemArray[b] contain the address
   *  of a.  Thus, when TestPerm walks the array, it will walk it in
   *  the opposite of the order in which it is constructed.  (In other
   *  words, it will move from b to a rather than from a to b.)
   *
   */

  LastElt = -1;
  for (i=0;i<NColumns;i++)
  {
    col = ColumnIndexSet[i];
    for (j=0;j<NRows;j++)
    {
      ThisElt = col + (RowIndexSet[j] * RowPitch);

      /* The very first element visited is the starting point; it has no
       * predecessor yet and is linked to the final element after the
       * loops.  Skipping it here avoids forming &MemArray[-1].
       */
      if (LastElt >= 0)
      {
	MemArray[ThisElt] = &MemArray[LastElt];  /* MemArray[j][i] <-  LastElt */
      }

      if (Debug>1)
	fprintf(LogFile,"M[%s] <- %s.\t\t(%d,%d)\t%d + (%d * %d)\n",
		PrintNum(ThisElt),PrintNum(LastElt),j,i,
		col,RowIndexSet[j],RowPitch);
      LastElt = ThisElt;
    }

  }
  /* and, finally, MemArray[0][0] <- MemArray[NRows-1][NCols-1] */
  StartingPoint = ColumnIndexSet[0] + (RowIndexSet[0] * RowPitch);
  MemArray[StartingPoint] = &MemArray[LastElt];
  if (Debug>1)
    fprintf(LogFile,"M[%s] <- %s. ** starting point **\n",
	    PrintNum(StartingPoint),PrintNum(LastElt));

  /* verify permutation: follow the chain from the starting point and
   * make sure it comes back within NColumns * NRows hops
   */
  p = MemArray[StartingPoint];
  j = 0;
  CycleLen = NColumns * NRows;
  while (p != &MemArray[StartingPoint])
  {
    p = *p;
    if (j++ > CycleLen)
    {
      fprintf(stderr,"Cycle did not return to starting point.\n");
      fprintf(stderr,"Cycle length is %s of %s.\n",
	      PrintNum(j),PrintNum(CycleLen));
      break;
    }
  }
  if ((Debug) && (j == CycleLen-1))
    fprintf(stderr,"Maximal length permutation.\n");

  /* The index sets are only needed during construction */
  PACE_FreeMem(ColumnIndexSet);
  ColumnIndexSet = (int *) NULL;
  ColumnIndexSize = 0;
  PACE_FreeMem(RowIndexSet);
  RowIndexSet = (int *) NULL;
  RowIndexSize = 0;
}



/* BCTest
 *
 * Conducts a single block-cyclic memory test: rebuilds the permutation
 * in MemArray with BuildBCPerm, then times a walk of it with TimePermPtr.
 * When HeartBeat > 1, the trial size and the result are echoed to stderr.
 *
 * PARAMETERS:  ArraySize, BlockSize, Stride, NA
 *
 * RETURNS:     An elapsed time, in microseconds, as a double
 *
 */

double BCTest( int ArraySize,  /* footprint for test, in WORDS            */
	       int BlockSize,  /* size of each randomized block, in WORDS */
	       int Stride,     /* distance between unrandomized accesses  */
	       struct AccessCount NA          /* number of iterations     */
	       )
{
  double result;

  if (HeartBeat > 1)
    fprintf(stderr,"Trial @ %s b: ",PrintNum(ArraySize*UnitSize));

  /* Initialize MemArray (also sets StartingPoint) */
  BuildBCPerm(ArraySize,BlockSize,Stride);

  /* Run the test */
  result = TimePermPtr( MemArray, StartingPoint, NA);

  if (HeartBeat > 1)
    fprintf(stderr,"%s usec\n",PrintDNum(result));

  return result;
}

/* BCTrial
 *
 * Runs the block-cyclic test over every size in Sizes[], keeping the
 * minimum time seen for each size.  A size keeps being re-run until its
 * counter (reloaded with TRIALS_FOR_MIN whenever a new minimum is found)
 * reaches zero.  The minimum times are logged, then normalized to cycles
 * (using the access counts and AddCostInNSec) and logged again.
 *
 * PARAMETERS: Sizes[], Times[], Count, BlockSize, Stride
 *
 * RETURNS: void; on return Times[] holds the normalized values.
 *          If Count < 1 nothing is run and Times[] is untouched.
 *
 */

void BCTrial ( int    Sizes[],    /* Array of trial sizes to run */
	       double Times[],    /* Minimum times from the runs */
	       int    Count,      /* number of entries in Sizes & Times */
	       int    BlockSize,  /* Block size for the test     */
	       int    Stride )    /* Access stride for the test  */
{
  int i, j, SaveHeartBeat, NotDone;
  double Trial;

  struct AccessCount NAccesses;

  /* With no sizes there is nothing to measure; bail out before the
   * variable-length array below is declared with a non-positive size
   * and before Sizes[0] / Sizes[Count-1] are indexed.
   */
  if (Count < 1)
  {
    fprintf(LogFile,"\nBlock Cyclic Test skipped: no trial sizes given.\n");
    return;
  }

  int Counters[Count];   /* remaining trials for each size */

  complain = 0;

  /* Silence the per-trial heartbeat while the access counts are tuned */
  SaveHeartBeat = HeartBeat;
  HeartBeat = 0;

  FindNA( Sizes[0], BlockSize, Stride, &NAccesses );

  HeartBeat = SaveHeartBeat;

  fprintf(LogFile,"\nBlock Cyclic Test of %s points between %s b to %s b.\n",
	  PrintNum(Count), PrintNum(Sizes[0]*UnitSize),
	  PrintNum(Sizes[Count-1]*UnitSize));
  fprintf(LogFile,"Blocks of %s b, Stride of %s b.\n",
	  PrintNum(BlockSize*UnitSize),PrintNum(Stride*UnitSize));
  fprintf(LogFile,"( %s ; %s ) accesses.\n",
	  PrintNum(NAccesses.outer),PrintNum(NAccesses.inner));
  fprintf(LogFile,"\nSize\tTime\n");

  for (i=0;i<Count;i++)
    Counters[i] = TRIALS_FOR_MIN;

  /* i counts the series; NotDone counts the trials run in this series */
  i = 1;
  NotDone = 1;
  while(NotDone)
  {
    if (HeartBeat)
      fprintf(stderr,"Starting L1 Test series %3d, ",i);

    NotDone = 0;
    for (j=0; j<Count;j++)
    {
      if (Counters[j])
      {
	Trial = BCTest(Sizes[j],BlockSize,Stride,NAccesses);
	NotDone++;
	Counters[j] --;

	if (i == 1)
	  Times[j] = Trial;       /* first series seeds the minimum */
	else if (Trial < Times[j])
	{
	  Times[j] = Trial;
	  Counters[j] = TRIALS_FOR_MIN;  /* new minimum: start counting again */
	}
      }
    }
    if (HeartBeat)
      fprintf(stderr," %3d trials.\n",NotDone);
    i++;
  }

  /* free the MemArray */
  if (MemArray != (void **) NULL)
  {
    PACE_FreeMem(MemArray);
    MemArray = (void **) NULL;
  }

  /* Log the results */
  for (j=0; j<Count; j++)
  {
    fprintf(LogFile,"%s\t%s\n",PrintNum(Sizes[j]*UnitSize),PrintDNum(Times[j]));
  }

  /* Normalize to cycles */
  for (j=0; j<Count; j++)
  {
    Times[j] = Times[j] * 1000.0 / (double) NAccesses.inner;
    Times[j] = Times[j] / (double) NAccesses.outer;
    Times[j] = round( Times[j] / AddCostInNSec );
  }

  /* and re-log the results */
  fprintf(LogFile,"\nNormalized to Cycles\nSize\tTime\n");

  for (j=0; j<Count; j++)
  {
    fprintf(LogFile,"%s\t%s\n",PrintNum(Sizes[j]*UnitSize),PrintDNum(Times[j]));
  }
}

/* FindNA
 *
 * Chooses the access counts for a test.  Starts from one outer
 * iteration and 2 * (Size / Stride) inner accesses, bumped to the next
 * higher multiple of ten, then repeatedly doubles the counts and re-runs
 * BCTest until a run takes at least MinTime.  The inner count is doubled
 * while it is below BigInt; after that the outer count is doubled.
 * Note that the counts are doubled once before the first timing run.
 *
 * PARAMETERS: Size, BlockSize, Stride - passed through to BCTest
 *             NA - receives the chosen (outer ; inner) counts
 *
 * RETURNS: void
 *
 */

void FindNA( int Size, int BlockSize, int Stride,
		      struct AccessCount *NA)
{
  double Trial;

  if (Verbose > 1)
    fprintf(LogFile,"FindNA( %s, %s, %s (? ; ?)\n",
	    PrintNum(Size),PrintNum(BlockSize),PrintNum(Stride));

  NA->outer = 1;
  NA->inner = 2 * (Size / Stride);

  /* TimePerm expects NA.inner to be a multiple of ten */
  NA->inner = NA->inner + 10 - (NA->inner % 10);

  Trial = 0;
  while(Trial < MinTime)
  {
    if (NA->inner < BigInt)
      NA->inner = NA->inner + NA->inner;
    else
      NA->outer = NA->outer + NA->outer;
    Trial = BCTest(Size, BlockSize, Stride, *NA);
  }

  if (Verbose > 1)
    fprintf(LogFile,"->FindNA returns (%s ; %s).\n",
	    PrintNum(NA->outer),PrintNum(NA->inner));
}
