/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */
/* This version modified to use 'int *' */
#include "MemTest.h"
#include "BlackJackTimer.h"

/* Scratch index sets used while a permutation is being assembled.  They
 * are allocated inside BuildSmallPerm / BuildBCPerm and released again
 * before those functions return.
 */
static int *ColumnIndexSet   = (int *) NULL;  /* shuffled column offsets for one row (page)   */
static int *LOColumnIndexSet = (int *) NULL;  /* index set generated for the leftover columns */
static int *RowIndexSet      = (int *) NULL;  /* row numbers; only BuildBCPerm uses this      */
/* Index into MemArray at which the pointer chain starts.  Set by the
 * permutation builders and passed to TimePermPtr by BCTest.
 */
static int StartingPoint     = -1;

/* NOTE(review): no use of the file-scope 'count' or 'paramwarning' is
 * visible in this section of the file -- confirm whether they are needed.
 */
static int count  = 0;
static int paramwarning = 0;
/* Nonzero once BuildBCPerm has logged its "page size is small" message;
 * reset to 0 at the start of SingleTrial and SetOfTrials.
 */
static int complain = 0;

/* The array that holds the pointer chain: each linked slot stores the
 * address of another slot.  The builders allocate it (freeing any previous
 * array); SingleTrial and SetOfTrials free it when they finish.
 */
static void **MemArray = (void **) NULL;

/* Hand-written prototype for round(), which is used when converting times
 * into add counts.
 * NOTE(review): normally <math.h> supplies this -- confirm whether one of
 * the project headers already includes it.
 */
double round ( double x );


/* Defined at the end of the file; SetOfTrials passes it to IntAnalyze. */
static int AlwaysConfirm( int s );

/* BuildSmallPerm
 *
 * Constructs, in global array MemArray, a single randomized cycle of
 * pointers.  For use when ArraySize is small relative to PageSize; it is
 * the degenerate (one row) form of the permutation built by BuildBCPerm.
 *
 * PARAMETERS: ArraySize -- footprint of the array, in words
 *             Stride    -- spacing used to generate the element indices,
 *                          in words
 *
 * RETURNS: void.  Replaces MemArray (the previous array, if any, is
 *          freed) and sets StartingPoint.
 *
 * NOTE(review): when ArraySize < Stride, NColumns is 0 and
 * ColumnIndexSet[0] is read from an empty index set.  Callers are
 * presumably expected to avoid that case -- confirm.
 */
void BuildSmallPerm( int ArraySize,
                     int Stride )
{
  int i, ThisElt, LastElt;
  void **p, **Old;

  int NColumns = ArraySize / Stride ;

  /* pick some reasonable parameters and fall back to the simpler 
   * randomized permutation. 
   */
  if (Debug>1)
  {
    fprintf(stderr,"BuildSmallPerm( AS: %s, Stride: %s )\n",
            PrintNum(ArraySize*UnitSize), PrintNum(Stride*UnitSize));

    fprintf(stderr,"-> 1 row by %s columns of %s bytes (= %s b).\n",
            PrintNum(NColumns),PrintNum(Stride*UnitSize),
            PrintNum(NColumns*Stride*UnitSize));
  }

  ColumnIndexSet  = PACE_AllocMem( NColumns * sizeof(int) );

  (void) GenerateLinearSet( ColumnIndexSet, NColumns, Stride );
  (void) Shuffle( ColumnIndexSet, NColumns );

  /* Allocate the new array before releasing the old one. */
  Old = MemArray;
  MemArray = (void **) PACE_AllocMem(ArraySize * UnitSize);
  if (Old != (void **) NULL)
  {
    PACE_FreeMem(Old);
    Old = (void **) NULL;
  }

  /* Assemble the permutation ...
   *
   * Walk the shuffled index set; every element is made to point at the
   * element visited just before it.
   */

  LastElt = -1;
  for (i=0; i<NColumns; i++)
  {
    ThisElt = ColumnIndexSet[i];
    if (LastElt != -1)
      MemArray[ThisElt] = &MemArray[LastElt];
    if (Debug>1)
      fprintf(LogFile,"M[%s] <- %s.\n",PrintNum(ThisElt),PrintNum(LastElt));

    LastElt = ThisElt;
  }

  /* Close the cycle: the first element visited points at the last one. */
  StartingPoint = ColumnIndexSet[0];
  MemArray[StartingPoint] = &MemArray[LastElt];

  if (Debug>1)
    fprintf(LogFile,"M[%s] <- %s ** starting point**.\n",
            PrintNum(StartingPoint),PrintNum(LastElt));

  /* verify permutation: chase the pointers until we are back at the start */
  p = MemArray[StartingPoint];   
  i = 0;
  while( p != &MemArray[StartingPoint] )
  {
    p = *p;
    i++;
    if (i > NColumns)
    {
      fprintf(stderr,"Cycle did not return to starting point.\n");
      fprintf(stderr,"Cycle length is %s of %s.\n",
              PrintNum(i),PrintNum(NColumns));
      break;  /* a broken chain would otherwise keep us here forever */
    }
  }
  if (Debug>1)
  {
    /* One hop is taken before the loop, so a full cycle needs
     * NColumns-1 further hops. */
    if (i == (NColumns-1))
      fprintf(stderr,"Maximal length cycle.\n");
    else 
      fprintf(stderr,"%s vs %s.\n",PrintNum(i),PrintNum(NColumns));
  }

  PACE_FreeMem(ColumnIndexSet);
  ColumnIndexSet = (int *) NULL;
}

/* BuildBCPerm
 *
 * Constructs, in global array MemArray, a block cyclic permutation:
 * a single cycle of pointers that makes all of its references within one
 * row (page) before moving on to the next row.  Arrays smaller than two
 * pages are handed to BuildSmallPerm instead.
 *
 * PARAMETERS: ArraySize -- footprint of the array, in words
 *             Stride    -- spacing used to generate the column indices
 *                          within a page, in words
 *
 * RETURNS: void.  Replaces MemArray (the previous array, if any, is
 *          freed) and sets StartingPoint.
 *
 */

void BuildBCPerm( int ArraySize, /* see BCTest */
                  int Stride     /* see BCTest */ )
{
  int i, j, row, ThisElt, LastElt, Expected;
  int LeftOver;
  int LOColumns = 0;              /* stays 0 when there is no partial page */
  void **p, **Old;

  int NColumns = PageSize / Stride;
  int NRows    = ArraySize / PageSize;

  if (ArraySize < 2 * PageSize)
  {
    BuildSmallPerm( ArraySize, Stride );
    return;
  }

  /* From here on ArraySize >= 2 * PageSize, so there are at least two
   * full rows; no further "too small" fallback is needed. */

  LeftOver = ArraySize % PageSize;

  if (Debug > 1  && LeftOver != 0)
  {
    fprintf(LogFile,"@ %s b, using %s full pages and %s b leftover.\n",
            PrintNum(ArraySize*UnitSize),
            PrintNum(NRows),PrintNum(LeftOver*UnitSize));
  }

  /* Warn only once per set of trials; the callers reset 'complain'. */
  if (NColumns < 8 && complain == 0)
  {
    fprintf(LogFile,"\"Page size (%s w) is small relative to Stride (%s w).\"\n",
            PrintNum(PageSize),PrintNum(Stride));

    complain++;
  }

  ColumnIndexSet   = PACE_AllocMem( (NColumns+1) * sizeof(int) );
  RowIndexSet      = PACE_AllocMem( (NRows+1)    * sizeof(int) );

  (void) GenerateLinearSet( ColumnIndexSet, NColumns, Stride );
  (void) Shuffle( ColumnIndexSet, NColumns );

  /* Rows are deliberately left in linear order (not shuffled). */
  (void) GenerateLinearSet( RowIndexSet, NRows, 1 );

  Old = MemArray;
  /* This call allocates at least NRows + LeftOver, page aligned  */
  MemArray = (void **) PACE_AllocMem((ArraySize+PageSize) * UnitSize); 
  if (Old != (void **) NULL)
  {
    PACE_FreeMem(Old);
    Old = (void **) NULL;
  }

  /*  Assemble the permutation ...
   *
   *  This code generates the "cache only" reference stream.
   *  It runs down a row (page) & makes all of its references, then
   *  switches to the next row (page).
   *
   *  Every element is made to point at the element visited just
   *  before it.  Leftovers are handled in the last row.
   */ 

  LastElt = -1;
  for (i=0;i<NRows;i++)
  {
    row = RowIndexSet[i] * NColumns * Stride;    

    for (j=0;j<NColumns;j++)
    {
      ThisElt = row + ColumnIndexSet[j];
      if (LastElt != -1)
        MemArray[ThisElt] = &MemArray[LastElt];
      if (Debug>1)
        fprintf(LogFile,"M[%s] <- %s.\t\t(%d,%d)\t(%d * %d) + %d\n",
                PrintNum(ThisElt),PrintNum(LastElt),i,j,
                RowIndexSet[i],NColumns*Stride,ColumnIndexSet[j]);
      LastElt = ThisElt;
    }
  }

  if (LeftOver > 0)
  {
    /* create a permuted index of the leftover columns */
    LOColumns = LeftOver / Stride;
    LOColumnIndexSet = PACE_AllocMem( (LOColumns+1) * sizeof(int) );
    (void) GenerateLinearSet( LOColumnIndexSet, LOColumns, Stride );
    (void) Shuffle( LOColumnIndexSet, LOColumns);

    row = NRows * PageSize;
    for (i=0; i<LOColumns; i++)
    {
      /* Index with the leftover set so the partial row stays inside
       * the LeftOver words that belong to the requested footprint. */
      ThisElt = row + LOColumnIndexSet[i];
      if (LastElt != -1)
        MemArray[ThisElt] = &MemArray[LastElt];
      if (Debug > 1)
        fprintf(LogFile,"M[%s] <- %s.\t\t(%d,%d)\t(%d * %d) + %d\n",
                PrintNum(ThisElt),PrintNum(LastElt),NRows,i,
                NRows,PageSize,LOColumnIndexSet[i]);
      LastElt = ThisElt;
    }
    PACE_FreeMem(LOColumnIndexSet);
    LOColumnIndexSet = (int *) NULL;
  }

  /* and, finally, close the cycle: the first element visited points at
   * the last one. */
  StartingPoint = ColumnIndexSet[0] + (RowIndexSet[0] * NColumns * Stride);
  MemArray[StartingPoint] = &MemArray[LastElt];
  if (Debug>1)
    fprintf(LogFile,"M[%s] <- %s. ** starting point **\n",
            PrintNum(StartingPoint),PrintNum(LastElt));

  /* verify permutation: chase the pointers until we are back at the start */
  p = MemArray[StartingPoint];
  j = 0;
  Expected = NColumns * NRows + LOColumns;   /* elements linked above */
  while (p != &MemArray[StartingPoint])
  {
    p = *p;
    if (j++ > Expected)
    {
      fprintf(stderr,"Cycle did not return to starting point.\n");
      fprintf(stderr,"Cycle length is %s of %s.\n",
              PrintNum(j),PrintNum(Expected));
      break;
    }
  }
  /* One hop is taken before the loop, so a full cycle needs
   * Expected-1 further hops. */
  if ((Debug) && (j == Expected-1))
    fprintf(stderr,"Maximal length permutation.\n");

  PACE_FreeMem(ColumnIndexSet);
  ColumnIndexSet = (int *) NULL;
  PACE_FreeMem(RowIndexSet);
  RowIndexSet = (int *) NULL;
}



/* BCTest 
 *
 * Runs one block-cyclic memory test: (re)builds the permutation in
 * MemArray for the requested footprint, then times a walk over it.
 *
 * PARAMETERS:  ArraySize, Stride, NA
 *
 * RETURNS:     The value produced by TimePermPtr -- an elapsed time, in
 *              microseconds, as a double
 *
 */

double BCTest( int ArraySize,  /* footprint for test, in WORDS            */
               int Stride,     /* distance between unrandomized accesses  */
               struct AccessCount NA          /* number of iterations     */
               )
{
  double Elapsed;

  /* Progress line is split in two: size before the run, time after. */
  if (HeartBeat > 1)
  {
    fprintf(stderr,"Trial @ %s b: ",PrintNum(ArraySize*UnitSize));
  }

  /* Lay out the pointer chain, then chase it under the timer. */
  BuildBCPerm(ArraySize,Stride);
  Elapsed = TimePermPtr(MemArray, StartingPoint, NA);

  if (HeartBeat > 1)
  {
    fprintf(stderr,"%s usec\n",PrintDNum(Elapsed));
  }

  return Elapsed;
}

/* SingleTrial
 *
 * Measures the latency at one array size.  BCTest is repeated until
 * TRIALS_FOR_MIN consecutive trials fail to improve on the best (minimum)
 * time seen; that minimum is reported in nanoseconds per access and as a
 * number of integer adds.
 *
 * HeartBeat output is suppressed while the trials run and restored
 * before returning.  MemArray is freed on exit.
 *
 * PARAMETERS: Size -- array size to test, in words
 *
 * RETURNS:    the latency expressed in integer adds (time per access
 *             divided by AddCostInNsecs, rounded)
 *
 */

int SingleTrial ( int    Size )   /* Array size to test */
{
  int Trials, Adds, SaveHeartBeat;
  int Counter;
  int Stride;
  double Trial, Time;

  struct AccessCount NAccesses;

  complain = 0;

  /* Silence the per-trial progress output for the duration of this call */
  SaveHeartBeat = HeartBeat;
  HeartBeat = 0;
  
  NAccesses.inner = 1000000;
  NAccesses.outer = 1;

  Stride    = L1LineSize;

  if (Debug>0)
  {
    fprintf(LogFile,"\nTesting Cache Latency at %s b.\n",
            PrintNum(Size*UnitSize));
    fprintf(LogFile,"Blocks of %s b, Stride of %s b.\n",
            PrintNum(PageSize*UnitSize),PrintNum(Stride*UnitSize));
    fprintf(LogFile,"( %s ; %s ) accesses.\n",
            PrintNum(NAccesses.outer),PrintNum(NAccesses.inner));
  }

  Counter = TRIALS_FOR_MIN; /* set the counter for this entry */

  Trials = 1;
  Time = BCTest(Size,Stride,NAccesses);
  while(Counter > 0)
  {
    Trial = BCTest(Size,Stride,NAccesses);
    Trials++;

    if (Trial < Time)
    {
      /* new minimum: start the countdown over */
      Time = Trial;
      Counter = TRIALS_FOR_MIN;
    }
    else
    {
      Counter--;
    }
  }
  /* Convert Time to nsecs */
  Time = Time * 1000 / NAccesses.inner;

  fprintf(stderr,"\nLatency at %s b is %f ns\t(%d trials)\n",
          PrintNum(Size*UnitSize),Time,Trials);
  fprintf(LogFile,"%10s\t%f\t",PrintNum(Size*UnitSize),Time);

  /* Convert Time to adds */
  Trial = round(Time / AddCostInNsecs);
  Adds = (int) Trial;
  fprintf(stderr,"Latency at %s b is %d int32 adds \t(%f).\n",
          PrintNum(Size*UnitSize),Adds,AddCostInNsecs);
  fprintf(LogFile,"%d\n",Adds);

  if (MemArray != (void **) NULL)
  {
    PACE_FreeMem(MemArray);
    MemArray = (void **) NULL;
  }

  /* Put the caller's HeartBeat setting back; it was only saved, never
   * restored, which left progress output disabled after this call. */
  HeartBeat = SaveHeartBeat;

  return Adds;
}


/* SetOfTrials:  Runs a set of trials at different array sizes, 
 *               computing the memory latency in both nanoseconds
 *               and integer adds (roughly speaking, cycles) at each
 *               size.
 *
 *               The sizes are tested in repeated passes ("series").  A
 *               size stays active until TRIALS_FOR_MIN consecutive
 *               trials fail to improve on its minimum time; the passes
 *               end when no size is active.  The results are logged and
 *               then handed to IntAnalyze.  MemArray is freed once the
 *               measurements are complete.
 *
 * PARAMETERS: Sizes  -- in:  array sizes to test, in words
 *             Cycles -- out: latency at each size, in integer adds
 *             Times  -- out: latency at each size, in nanoseconds
 *             Count  -- number of entries in each of the three arrays
 *
 * RETURNS:    void
 *
 */

void SetOfTrials( int Sizes[], 
                  int Cycles[], 
                  double Times[],
                  int Count ) 
{
  int i, Tested, NotDone, series;
  int Counters[Count];
  int Stride;
  double Trial;

  struct AccessCount NAccesses;

  fprintf(stderr,"In SetOfTrials.\n");

  complain = 0;

  NAccesses.inner = 1000000;
  NAccesses.outer = 1;

  Stride    = L1LineSize;

  if (Debug>0)
  {
    fprintf(LogFile,"\nTesting Cache Latencies from %s b to %s b.\n",
            PrintNum(Sizes[0]*UnitSize),PrintNum(Sizes[Count-1]*UnitSize));
    fprintf(LogFile,"Using L1 Line Size of %s b, Add Cost of %f\n",
            PrintNum(L1LineSize*UnitSize),AddCostInNsecs);
    fprintf(LogFile,"Block size is min( array size, page size ).\n");	    
  }

  for (i=0;i<Count;i++)           /* initialize arrays */
  {
    Counters[i] = TRIALS_FOR_MIN; 
    Times[i]    = -1.0;           /* sentinel: no measurement yet */
  }

  NotDone = 1;
  series = 0;
  while(NotDone)
  {
    /* Number the pass unconditionally so that the log shows the right
     * series number even when HeartBeat is off. */
    series++;

    if (HeartBeat)
      fprintf(stderr,"Cache Only series %3d.",series);

    fprintf(LogFile,"\nCache Only series %3d.\n",series);

    NotDone = 0;
    Tested = 0;
    for (i=0;i<Count;i++)
    {
      if (Counters[i])
      {
        Tested++;
        Trial = BCTest(Sizes[i],Stride,NAccesses);
        Counters[i]--;

        if (Times[i] == -1.0)
          Times[i] = Trial;
        else if (Trial < Times[i])
          {
            /* new minimum: start this size's countdown over */
            Times[i] = Trial;
            Counters[i] = TRIALS_FOR_MIN;
          }
        if (Counters[i] > 0)
          NotDone = 1;
      }
    }
    if (HeartBeat)
      fprintf(stderr," Tested %3d points.\n",Tested);
  }

  if (MemArray != (void **) NULL)
  {
    PACE_FreeMem(MemArray);
    MemArray = (void **) NULL;
  }


  /* Convert Times to nsecs, and from there to integer adds */
  for (i=0;i<Count;i++)
  {
    Times[i] = Times[i] * 1000 / NAccesses.inner ;
    Trial = round( Times[i] / AddCostInNsecs );
    Cycles[i] = (int) Trial;
  }

  fprintf(LogFile,"%12s\tTime\tCycles\n","Size");
  for (i=0;i<Count;i++)
  {
    fprintf(LogFile,"%12s\t%f\t%d\n",
            PrintNum(Sizes[i]*UnitSize),Times[i],Cycles[i]);
  }

  IntAnalyze( Sizes, Cycles, Count, "CL", /* smoothing */ 1,
              AlwaysConfirm );
}

/* AlwaysConfirm
 *
 * Callback that SetOfTrials hands to IntAnalyze.  It ignores its
 * argument and answers "yes" unconditionally.
 *
 * PARAMETERS: s -- ignored
 *
 * RETURNS:    1, always
 *
 */
static int AlwaysConfirm( int s )
{
  const int Confirmed = 1;

  (void) s;   /* the argument plays no part in the answer */
  return Confirmed;
}

