/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */
/*******************************************************************************
 * $HeadURL: svn+ssh://orion.cs.rice.edu/home/jasandov/phd/svn/code/characterization/cache_test/dcache/dcache_linesize.c $
 * $Revision: 2031 $
 * $Author: jasandov $
 * $Date: 2010-06-22 14:35:54 -0500 (Tue, 22 Jun 2010) $
 *
 * The purpose of this benchmark is to determine the cache linesize
 * for a physically mapped cache.  The original linesize benchmark
 * only worked for virtually mapped caches because it relied on the data
 * being mapped contiguously into the cache.  Data does not
 * necessarily map contiguously into a physically mapped cache, except
 * for within a single page.  The idea behind this benchmark, then, is
 * to leverage the contiguous mapping within a page along with
 * randomization and the theory of probability.  Since we cannot
 * guarantee that the pages are contiguously mapped, we must sample a
 * large number of random page orderings.  If the predicted linesize
 * is too small then the cache lines will thrash, no matter the actual
 * page mapping.  If the predicted linesize is large enough, then at
 * least some of the mappings will align properly and result in
 * improved performance.  The only remaining matter is determining how
 * many samples must be collected to draw a confident conclusion.
 *
 *
 * TODO: what impact does a large page (1MB) have on this
 *       microbenchmark?  Can we modify the test so that it uses
 *       smaller "chunks" (i.e., smaller than the actual pagesize) but
 *       still works as expected?  I think we could, since the
 *       benchmark is designed to handle random physical-to-virtual
 *       mappings for each page, but it assumes that within a page the
 *       mapping is contiguous (it also assumes that if two pages
 *       overlap in the cache then they will overlap perfectly).  If
 *       we use a chunk that's smaller than the actual pagesize, we
 *       still get contiguous mapping within the chunk, random
 *       mappings across chunks (partially because *we* randomize the
 *       chunk placement), and (as long as the chunk evenly divides
 *       the pagesize) we are guaranteed that chunks will only ever
 *       overlap perfectly.
 *
 *
 * TODO: what happens if the linesize that we're testing for becomes a
 *       large fraction of the pagesize?  For example, as we test for
 *       a 256b linesize on a 1kb pagesize, we are really only
 *       touching each page twice.  Worse yet, we are really using
 *       only a fraction of the available cache (since the lines
 *       within a page that we use will always map to the same
 *       cachelines, while the lines that we don't use within a page
 *       will remain unused in the cache).
 *
 * TODO: are we drawing samples (i.e., sets of pages) from a large
 *       enough pool?  Right now the pool is 2x the cache capacity,
 *       but I wonder if we should make it a larger pool to allow for
 *       the possibility that the pages within the pool don't cover
 *       all of the cache lines.
 *
 ******************************************************************************/
#include "dcache.h"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <unistd.h>


/* File-level debug switch; it is initialized to 0 (off). */
static const unsigned int debug = 0;

/* Size in bytes of one word, defined as the size of a pointer.  Sizes
   that this file keeps as word counts are multiplied by this value
   to obtain bytes. */
static const unsigned int wordsize = sizeof(void*);

/* Names of the gnuplot script, the CSV data file and the PDF that the
   gnuplot script writes.  They start out NULL and are assigned in
   main() once the cache level is known. */
static const char *GP_FILENAME = NULL;
static const char *CSV_FILENAME = NULL;
static const char *PDF_FILENAME = NULL;

/* Return the system page size in bytes as reported by
 * sysconf(_SC_PAGESIZE).  The value is queried on the first call and
 * cached in a function-local static for later calls.
 *
 * If sysconf() fails, or reports a value that is not a positive
 * number representable as an int, a message is printed to stderr and
 * the process exits with status -1.
 */
static int get_pagesize(void) {
  static int pagesize = -1;
  if (pagesize == -1) {
    /* sysconf() returns long: keep the full width until the value has
       been validated, so an oversized result is rejected rather than
       silently truncated. */
    const long rc = sysconf(_SC_PAGESIZE);
    if (rc < 1 || rc != (long)(int)rc) {
      fprintf(stderr, "error with sysconf(_SC_PAGESIZE)\n");
      exit(-1);
    }
    pagesize = (int)rc;
  }
  return pagesize;
}

/* Choose the chunk size, in bytes, for a region of `capacity` bytes.
 * The search starts at the system page size and halves the candidate
 * for as long as two chunks of that size would not fit in `capacity`,
 * so the result never exceeds the page size.
 */
unsigned int get_chunksize(const unsigned int capacity) {
  unsigned int size;
  for (size = get_pagesize() ; capacity < size*2 ; size /= 2) {
    /* nothing to do: the loop header performs the halving */
  }
  return size;
}



/*   The purpose of this function is to generate a set of accesses for
 *   which, at the page granularity, half of the pages contain a comb
 *   access pattern and the other half of the pages contain the
 *   opposite comb pattern.  If the comb pattern is too small (i.e.,
 *   less than the actual cache linesize), then the cache lines will
 *   always thrash no matter how the pages map into the cache.  If the
 *   comb pattern is greater than or equal to the actual linesize,
 *   then there will be some page mappings that will result in better
 *   performance -- we should be able to detect this if we sample
 *   enough random page mappings.
 *
 *   This routine uses the idea of a chunk (which is less than or
 *   equal to the pagesize), however, to handle the case where the
 *   pagesize is larger than the data_count for which we are testing.
 *   We can still shrink the chunk size less than the pagesize to
 *   preserve the contiguous property of the chunk, but it allows us
 *   to test for caches that are smaller than the pagesize.
 *
 */
void generate_random_striped_pages_indices(unsigned int *indices_count,
                                           unsigned int indices [],
                                           const unsigned int data_count,
                                           const unsigned int stripe_width)
{
  unsigned int debug = 0;
  const unsigned int max_indices_count = (*indices_count);
  const unsigned int chunksize = get_chunksize(data_count * wordsize);
  const unsigned int chunks = (data_count * sizeof(void*)) / chunksize;
  const unsigned int words = chunksize / sizeof(void*);
  unsigned int count = 0;
  unsigned int i;
  unsigned int j;

  unsigned int *chunks_array = malloc(sizeof(unsigned int) * chunks);
  unsigned int *words_array = malloc(sizeof(unsigned int) * words);

  if (debug) fprintf(stderr, "chunks = %u, words = %u, stripe = %u\n", chunks, words, stripe_width);

  /* if (stripe_width * wordsize >= 1024) debug = 1; */
  
  assert(chunks_array);
  assert(words_array);

  assert(stripe_width > 0);
  assert(data_count > 0);
  assert(chunks > 1);
  assert(words > 0);

  assert(stripe_width*sizeof(void*) < chunksize);

  for (i = 0 ; i < chunks ; i++) chunks_array[i] = i;
  shuffle_indices(chunks, chunks_array);

  for (i = 0 ; i < words/(2*stripe_width) ; i++) words_array[i] = i*(2*stripe_width);
  shuffle_indices(words/(2*stripe_width), words_array);

  if (debug) fprintf(stderr, "first set of chunks:\n");

  for (j = 0 ; j < words/(2*stripe_width) ; j++) {
    const unsigned int word = words_array[j];
    for (i = 0 ; i < chunks / 2 ; i++) {
      const unsigned int chunk = chunks_array[i];
      const unsigned int index = chunk * words + word;
      assert(count < max_indices_count);
      indices[count++] = index;
      if (debug) fprintf(stderr, "chunk = %u word = %u index = %u\n", chunk, word, index);
    }
  }
  
  if (debug) fprintf(stderr, "second set of chunks:\n");

  for (i = 0 ; i < words/(2*stripe_width) ; i++) words_array[i] = i*(2*stripe_width) + stripe_width;
  shuffle_indices((words-stripe_width)/(2*stripe_width), words_array);

  for (j = 0 ; j < words/(2*stripe_width) ; j++) {
    const unsigned int word = words_array[j];
    for (i = chunks / 2 ; i < chunks ; i++) {
      const unsigned int chunk = chunks_array[i];
      const unsigned int index = chunk * words + word;
      assert(count < max_indices_count);
      indices[count++] = index;
      if (debug) fprintf(stderr, "chunk = %u word = %u index = %u\n", chunk, word, index);
    }
  }

  /* shuffle_indices(count, indices); */

  if (debug)
    for (i = 0 ; i < count ; i++)
      fprintf(stderr, "indices[%u] = %u\n", i, indices[i]);

  (*indices_count) = count;

  if (debug) fprintf(stderr, "indices_count = %u\n", count);

  free(chunks_array);
  free(words_array);
  return;
}

/* Pick the detected cache linesize, in bytes, from the timing samples.
 *
 *   linesize_count  number of candidate linesizes.
 *   linesizes       the candidates, in units of pointer-sized words.
 *   samples         number of timing samples taken per candidate.
 *   duration        duration[i][j] is sample j for linesizes[i].
 *
 * Each candidate is scored by its fastest (minimum) sample.  The
 * result is the first candidate after linesizes[0] whose score is
 * strictly better than the score of linesizes[0]; if there is none,
 * linesizes[0] itself.  The returned value is the candidate multiplied
 * by the pointer size.
 *
 * With no candidates the function returns 0; with no samples it
 * returns linesizes[0] in bytes, since nothing can be compared.
 */
static unsigned int analysis(const unsigned int linesize_count,
                             const unsigned int linesizes [],
                             const unsigned int samples,
                             const double **duration)
{
  /* linesizes[] counts pointer-sized words; results are in bytes. */
  const unsigned int bytes_per_word = sizeof(void*);
  double baseline = 0.0;
  unsigned int i;
  unsigned int j;

  if (linesize_count == 0) return 0;
  if (samples == 0) return linesizes[0] * bytes_per_word;

  /* No scratch array is needed: only the score of candidate 0 has to
     be remembered while the later candidates are scanned in order. */
  for (i = 0 ; i < linesize_count ; i++) {
    double best = duration[i][0];
    for (j = 1 ; j < samples ; j++) {
      if (duration[i][j] < best)
        best = duration[i][j];
    }
    if (i == 0) {
      baseline = best;
    } else if (best < baseline) {
      return linesizes[i] * bytes_per_word;
    }
  }
  return linesizes[0] * bytes_per_word;
}
                             

/* Write every raw sample to the CSV stream: a "# Capacity=" header, a
 * column-title line, then one row per sample holding one column per
 * tested linesize.  The section ends with three newlines. */
static void linesize_write_sample_table(FILE *csv_out,
                                        const unsigned int linesize_count,
                                        const unsigned int linesizes [],
                                        const unsigned int data_size,
                                        const unsigned int samples,
                                        double **duration)
{
  unsigned int row;
  unsigned int col;

  fprintf(csv_out, "# Capacity=%s\n", pp_size(data_size*wordsize));
  fprintf(csv_out, "# Sample");
  for (col = 0 ; col < linesize_count ; col++) {
    fprintf(csv_out, ", %s", pp_size(linesizes[col]*wordsize));
  }
  fprintf(csv_out, "\n");

  for (row = 0 ; row < samples ; row++) {
    fprintf(csv_out, "%u", row);
    for (col = 0 ; col < linesize_count ; col++) {
      fprintf(csv_out, ", %f", duration[col][row]);
    }
    fprintf(csv_out, "\n");
  }
  fprintf(csv_out, "\n\n\n");
}

/* Write the fastest sample of each tested linesize to the CSV stream
 * as "<linesize in bytes>, <duration>" rows under a
 * "# Best-for-Capacity=" header.  The section ends with three
 * newlines. */
static void linesize_write_best_table(FILE *csv_out,
                                      const unsigned int linesize_count,
                                      const unsigned int linesizes [],
                                      const unsigned int data_size,
                                      const unsigned int samples,
                                      double **duration)
{
  unsigned int row;
  unsigned int col;

  fprintf(csv_out, "# Best-for-Capacity=%s\n", pp_size(data_size*wordsize));
  fprintf(csv_out, "# Linesize, Duration\n");
  for (col = 0 ; col < linesize_count ; col++) {
    const unsigned int bytes = linesizes[col]*wordsize;
    double fastest = duration[col][0];
    for (row = 1 ; row < samples ; row++) {
      if (duration[col][row] < fastest) fastest = duration[col][row];
    }
    fprintf(csv_out, "%u, %g\n", bytes, fastest);
  }
  fprintf(csv_out, "\n\n\n");
}

/* Write the gnuplot commands for one capacity: axis setup, one point
 * series per tested linesize read from the CSV file named by
 * CSV_FILENAME, and a line through the per-linesize best values. */
static void linesize_write_gnuplot(FILE *gp_out,
                                   const unsigned int linesize_count,
                                   const unsigned int linesizes [],
                                   const unsigned int data_size)
{
  unsigned int col;

  fprintf(gp_out, "set auto fix\n");
  fprintf(gp_out, "set offset graph 0.05, graph 0.05, graph 0.05, graph 0.05\n");

  fprintf(gp_out, "set ylabel \"Time (s)\"\n");
  fprintf(gp_out, "set xlabel \"Chunk Stride (bytes)\"\n");
  fprintf(gp_out, "set title \"Linesize test for cache capacity=%s", pp_size(data_size*wordsize));
  fprintf(gp_out, "\"\n");
  fprintf(gp_out, "set logscale x 2\n");
  fprintf(gp_out, "set xrange [:]\n");
  fprintf(gp_out, "set yrange [:]\n");
  set_pretty_log_xtics(gp_out, linesizes[0], linesizes[linesize_count-1]);
  fprintf(gp_out, "\n");

  fprintf(gp_out, "plot");
  for (col = 0 ; col < linesize_count ; col++) {
    const unsigned int words_per_line = linesizes[col];
    /* Series after the first are joined with a gnuplot line continuation. */
    if (col > 0) fprintf(gp_out, ",\\\n");
    /* CSV column 1 is the sample number, so linesize `col` is column col+2. */
    fprintf(gp_out, " '%s' index 'Capacity=%s' using (%u):%u with points notitle",
            CSV_FILENAME,
            pp_size(data_size*wordsize),
            (words_per_line*wordsize), col+2);
  }
  fprintf(gp_out, ",\\\n '%s' index 'Best-for-Capacity=%s' using 1:2 with linespoints notitle",
          CSV_FILENAME,
          pp_size(data_size*wordsize));
  fprintf(gp_out, "\n");
}

/* Run the linesize experiment for one cache capacity.
 *
 *   n               length passed to traverse_list_abs() for each timed
 *                   run; must exceed four times the number of generated
 *                   indices (asserted).
 *   linesize_count  number of entries of linesizes[] to test.
 *   linesizes       candidate linesizes, in words.
 *   data_size       cache capacity, in words.
 *   samples         number of timed runs per candidate.
 *   csv_out         stream receiving the sample data.
 *   gp_out          stream receiving the gnuplot commands.
 *
 * For every candidate and every sample a fresh random striped index
 * set is generated over a region of twice data_size words, linked into
 * the data array, traversed once to warm up and then traversed again
 * to obtain the recorded duration.  Afterwards the samples and plot
 * commands are written, and the value chosen by analysis() is printed
 * to stderr and returned.
 */
static unsigned int run_test(const unsigned int n,
                             const unsigned int linesize_count,
                             const unsigned int linesizes [],
                             const unsigned int data_size,
                             const unsigned int samples,
                             FILE *csv_out,
                             FILE *gp_out)
{
  const unsigned int chunk_count = 2;
  const unsigned int chunk_size = data_size;
  const unsigned int total_words = chunk_count*chunk_size;
  const unsigned int max_indices_count = data_size;

  const void **data = calloc(sizeof(const void *), total_words);
  unsigned int *indices = malloc(sizeof(int)*max_indices_count);
  double **duration = malloc(sizeof(double*) * linesize_count);
  unsigned int result;
  unsigned int li;
  unsigned int s;

  /* One row of samples per candidate linesize. */
  assert(duration);
  for (li = 0 ; li < linesize_count ; li++) {
    duration[li] = malloc(sizeof(double) * samples);
    assert(duration[li]);
  }

  if (data == 0) { fprintf(stderr, "malloc failure for data\n"); abort(); }
  if (indices == 0) { fprintf(stderr, "malloc failure indices\n"); abort(); }

  /* Measurement phase */
  for (li = 0 ; li < linesize_count ; li++) {
    for (s = 0 ; s < samples ; s++) {
      const unsigned int stripe = linesizes[li];
      unsigned int indices_count = max_indices_count;

      generate_random_striped_pages_indices(&indices_count,
                                            indices,
                                            total_words,
                                            stripe);

      link_indices_abs(total_words,
                       data,
                       indices_count,
                       indices);

      assert(n > indices_count * 4);

      /* One untimed pass over the list, then the timed pass. */
      traverse_list_abs(indices_count, data);
      duration[li][s] = traverse_list_abs(n, data);

      if (filtered_print(1.0))
        fprintf(stderr, "chunk_size=%s, chunk_stripe=%s/%s, sample=%u/%u, duration=%f\n",
                pp_size((double)(chunk_size * wordsize)),
                pp_size(stripe*wordsize), pp_size(linesizes[linesize_count-1]*wordsize),
                s+1, samples, duration[li][s]);
    }
  }

  /* Reporting phase */
  linesize_write_sample_table(csv_out, linesize_count, linesizes, data_size, samples, duration);
  linesize_write_best_table(csv_out, linesize_count, linesizes, data_size, samples, duration);
  linesize_write_gnuplot(gp_out, linesize_count, linesizes, data_size);

  result = analysis(linesize_count, linesizes, samples, (const double **)duration);
  fprintf(stderr, "linesize = %s\n", pp_size(result));

  for (li = 0 ; li < linesize_count ; li++) {
    free(duration[li]);
  }
  free(duration);
  free(data);
  free(indices);
  return result;
}

/* Program entry point.
 *
 *   usage:  <program> <CacheLevel> <CacheCapacity>
 *
 * CacheLevel must parse to an integer >= 1 and CacheCapacity must be
 * accepted by parse_size_string() (a return of 0 is treated as a parse
 * failure).  The candidate linesizes are the powers of two, in words,
 * up to half of the chunk size chosen for the capacity.
 *
 * Output goes to dcache_linesize.L<level>.csv and
 * dcache_linesize.L<level>.gp; the gnuplot script names
 * dcache_linesize.L<level>.pdf as its output.  The detected linesize is
 * printed to stdout and passed to PACE_RCDB_write_int().
 *
 * Returns 0 on success and -1 on a usage error, when the capacity is
 * too small to test any linesize, or when an output file cannot be
 * opened.
 */
int main(const int argc, const char *argv[]) {
  const unsigned int samples = 1000;
  /* Candidate linesizes in words; entry k is 2^k. */
  const unsigned int linesizes [] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
  const unsigned int max_linesize_count = sizeof(linesizes) / sizeof(linesizes[0]);

  unsigned int data_size = 0;
  int cache_level = 0;
  unsigned int max_test_linesize;
  unsigned int linesize_count = 0;
  unsigned int linesize;
  unsigned int n;
  FILE *csv_out;
  FILE *gp_out;

  char gp_filename_buffer[128];
  char csv_filename_buffer[128];
  char pdf_filename_buffer[128];

  if (3 != argc ||
      (cache_level = atoi(argv[1])) < 1 ||
      0 == parse_size_string(argv[2], &data_size))
    {
      fprintf(stderr, "usage:  %s <CacheLevel> <CacheCapacity>\n", argv[0]);
      return -1;
    }

  /* Count the candidates that do not exceed the largest testable
     linesize.  For a non-zero limit this equals floor(log2(limit)) + 1
     capped at the table length, without going through floating point;
     for a zero limit it yields 0, where the logarithm is undefined. */
  max_test_linesize = get_chunksize(data_size)/2/wordsize;
  while (linesize_count < max_linesize_count &&
         linesizes[linesize_count] <= max_test_linesize) {
    linesize_count++;
  }
  if (linesize_count == 0) {
    fprintf(stderr, "%s: cache capacity '%s' is too small to test any linesize\n",
            argv[0], argv[2]);
    return -1;
  }

  /* Initialize Filenames */
  snprintf(gp_filename_buffer, sizeof(gp_filename_buffer), "dcache_linesize.L%d.gp", cache_level);
  snprintf(csv_filename_buffer, sizeof(csv_filename_buffer), "dcache_linesize.L%d.csv", cache_level);
  snprintf(pdf_filename_buffer, sizeof(pdf_filename_buffer), "dcache_linesize.L%d.pdf", cache_level);
  GP_FILENAME = gp_filename_buffer;
  CSV_FILENAME = csv_filename_buffer;
  PDF_FILENAME = pdf_filename_buffer;

  /* Open both outputs before measuring, so that a bad working
     directory is reported up front rather than after the experiment. */
  csv_out = fopen(CSV_FILENAME, "w");
  if (csv_out == NULL) {
    perror(CSV_FILENAME);
    return -1;
  }
  gp_out = fopen(GP_FILENAME, "w");
  if (gp_out == NULL) {
    perror(GP_FILENAME);
    fclose(csv_out);
    return -1;
  }

  fprintf(gp_out, "set term pdf\n");
  fprintf(gp_out, "set output \"%s\"\n", PDF_FILENAME);

  n = traverse_empty_list_abs_until_threshold(data_size*5/wordsize, 0.002);
  linesize = run_test(n, linesize_count, linesizes, data_size/wordsize, samples, csv_out, gp_out);
  fprintf(stderr, "linesize=%s\n", pp_size(linesize));
  fprintf(stdout, "%u\n", linesize);

  /* Write to PACE_RCDB */
  {
    char key[512];
    snprintf(key, sizeof(key), "cache.kind.data.level.%i.linesize", cache_level);
    PACE_RCDB_write_int(key, linesize);
  }
  /* Write to PACE_TEMP (disabled) */
  if (0) {
    char key[512];
    snprintf(key, sizeof(key), "sandoval.dcache.L%d.linesize", cache_level);
    PACE_TEMP_write_int(key, linesize);
  }

  /* fclose() flushes buffered output, so a failure here means the
     results may be incomplete on disk; report it. */
  if (fclose(gp_out) != 0) perror(GP_FILENAME);
  if (fclose(csv_out) != 0) perror(CSV_FILENAME);

  return 0;
}
