/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */
/* Getting good timings can be pretty annoying.  The version of membench
 * at
 *
 *   www.cs.berkeley.edu/~richie/bebop/notes/matmul/files/membench/
 *
 * uses timing routines which are available on the Sun, but not on Linux
 * boxen.  This version provides two timing options:
 *
 * 1) Use the clock() function.  This function is part of the standard
 *    C library, and is pretty universally available.  Unfortunately,
 *    the granularity available from clock() is generally not
 *    that great.  You can get around that somewhat by running the
 *    timing trial many times in a loop (a good idea regardless),
 *    but it's still sort of annoying.
 *
 *    The clock routine is supposed to return only the
 *    processor time used by the program.  In this case, that's what we
 *    want (I think), but often you'd like to use wall clock time for
 *    programming more complicated things.  The functions "time"
 *    and "difftime" are useful for that, though once again you must
 *    be careful about the resolution.
 *
 * 2) Use the intel_timer package, included, which uses the cycle counters
 *    available on the PPro and beyond.  As far as I know, the cycle counters
 *    aren't saved/restored on context switch, so this should report
 *    wall clock time.
 *
 *    intel_timer is something I put together from a snippet of code
 *    that I got from Rich Vuduc.  I'm sure there must be more intelligent
 *    (and portable) timing packages out there, but this seems to at least
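 *    work.  There are probably timers in the PAPI library, for instance.
 *
 * NOTE: main() in this file uses neither of these options; it takes its
 * timestamps with gettimeofday().
 */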
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <sys/types.h>
#include <sys/time.h>
#include <sys/times.h>


/* Not referenced in the visible part of this file -- NOTE(review): may be
 * used by the generated code in calls_membench.h; confirm before removing. */
#define SAMPLE    10
#define CACHE_MIN (1024)
/* Element count of the global array x[] defined further down. */
#define CACHE_MAX (16*1024*1024)

/* Microseconds per second: main divides a struct timeval's tv_usec field
 * by this value to express it in seconds. */
#define TICKS_PER_SECOND 1000000.0

/* Operation selectors.  The foo_* functions in this file compare their
 * `operation` argument against MUL_OP and DIV_OP to pick a return value. */
enum operations {ADD_OP, SUB_OP, MUL_OP, DIV_OP, XOR_OP};

/*
 * Nanoseconds elapsed from s to f, as a double.  Both arguments must be
 * values with tv_sec and tv_nsec members (e.g. struct timespec).
 * The arguments are parenthesized so that expressions such as *ptr expand
 * correctly; each argument is evaluated twice, so avoid side effects.
 */
#define TIME_DIF_TO_NS(s,f) \
    (((f).tv_sec-(s).tv_sec)*1000000000.0 + ((f).tv_nsec-(s).tv_nsec))


/*
 * Integer value source.  Yields 0xAAAAAAAA (converted to int) when
 * operation is DIV_OP and 3 for every other operation.  The x and dummy
 * parameters are accepted but never read.
 */
int foo_int(int x, int operation, int dummy)
{
    int value = 3;

    if (operation == DIV_OP)
	value = 0xAAAAAAAA;

    return value;
} /* foo_int */


/*
 * 32-bit value source.  Yields INT32_C(0xAAAAAAAA) when operation is
 * DIV_OP and 3 for every other operation.  The x and dummy parameters
 * are accepted but never read.
 */
int32_t foo_int32_t(int x, int operation, int32_t dummy)
{
    switch (operation)
    {
    case DIV_OP:
	return INT32_C(0xAAAAAAAA);
    default:
	return 3;
    }
} /* foo_int32_t */



/*
 * 64-bit value source.  Every operation other than DIV_OP gets 3;
 * DIV_OP gets INT64_C(0xAAAAAAAAAAAAAAAA).  The x and dummy parameters
 * are accepted but never read.
 */
int64_t foo_int64_t(int x, int operation, int64_t dummy)
{
    /* Guard clause: handle the common (non-division) case first. */
    if (operation != DIV_OP)
	return 3;

    return INT64_C(0xAAAAAAAAAAAAAAAA);
} /* foo_int64_t */



/*
 * Single-precision value source.
 *
 *   operation == MUL_OP          -> 1.0, regardless of x
 *   other operation, x == 1      -> 4.607182E-9
 *   other operation, x != 1      -> 9.218868E-9
 *
 * The dummy parameter is accepted but never read.
 */
float foo_float(int x, int operation, float dummy)
{
    /* Multiplication gets the same value on both x branches. */
    if (operation == MUL_OP)
	return 1.0E+00;

    return (x == 1) ? 4.607182E-9 : 9.218868E-9;
} /* foo_float */



/*
 * Double-precision value source.
 *
 *   operation == MUL_OP          -> 1.0, regardless of x
 *   other operation, x == 1      -> 4.607182E+18
 *   other operation, x != 1      -> 9.218868E+18
 *
 * The dummy parameter is accepted but never read.
 */
double foo_double(int x, int operation, double dummy)
{
    double non_mul_value;

    /* Multiplication gets the same value on both x branches. */
    if (operation == MUL_OP)
	return 1.0E+00;

    non_mul_value = (x == 1) ? 4.607182E+18 : 9.218868E+18;
    return non_mul_value;
} /* foo_double */



/* File-scope array of CACHE_MAX (16*1024*1024) ints.  Nothing in the
 * visible part of this file reads or writes it.
 * NOTE(review): presumably referenced by the generated code pulled in from
 * calls_membench.h -- confirm before changing its name, type or size. */
int x[CACHE_MAX];


/* this gives a #define called NUMBER_OF_FUNCTIONS */
#include "calls_membench/calls_membench.h"


/*
 * Parse a command-line argument as a base-10 int.
 *
 * Returns 0 and stores the value in *out on success.  Returns -1, leaving
 * *out untouched, if text is empty, has trailing characters after the
 * number, or does not fit in an int.
 */
static int parse_int_arg(const char *text, int *out)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE
	|| value < INT_MIN || value > INT_MAX)
	return -1;

    *out = (int)value;
    return 0;
} /* parse_int_arg */


/*
 * Time one invocation of calls[func_index](arg, total_calls) with
 * gettimeofday() and return the elapsed time in seconds.
 */
static double time_one_call(int func_index, int arg, int total_calls)
{
    struct timeval before, after;

    gettimeofday(&before, NULL);
    (calls[func_index])(arg, total_calls);
    gettimeofday(&after, NULL);

    /* Subtract field by field BEFORE converting to double.  Converting
     * each absolute timestamp first (seconds since the epoch, ~1e9) rounds
     * it to roughly a quarter of a microsecond, which is the same order
     * as the resolution we are trying to measure. */
    return (double)(after.tv_sec - before.tv_sec)
	+ (double)(after.tv_usec - before.tv_usec) / TICKS_PER_SECOND;
} /* time_one_call */


/*
 * Benchmark driver.
 *
 *   argv[1] (optional)  multiplier applied to total_calls (initially 1);
 *                       the result is passed as the second argument of
 *                       every timed call.
 *   argv[2] (optional)  how many entries of calls[] to time; values above
 *                       NUMBER_OF_FUNCTIONS are clamped with a notice on
 *                       stderr.  Defaults to NUMBER_OF_FUNCTIONS.
 *
 * For every non-null calls[i] this prints "<i+1>\t<lowest time in seconds>"
 * on stdout, followed by one empty line at the end.
 *
 * Returns 0 on success, EXIT_FAILURE if an argument is not a valid int.
 */
int main(int argc, char **argv)
{
    const int threshold = 50; /* baseline samples, and length of the quiet streak */
    const int call_arg = 3;   /* first argument handed to every timed call */
    int total_calls = 1;
    int trials = NUMBER_OF_FUNCTIONS;
    int i;

    if (argc > 1) /* this is a multiplier */
    {
	int multiplier;

	if (parse_int_arg(argv[1], &multiplier) != 0)
	{
	    fprintf(stderr, "%s: invalid call multiplier '%s'\n", argv[0], argv[1]);
	    return EXIT_FAILURE;
	}
	total_calls *= multiplier;
    }

    if (argc > 2)
    {
	if (parse_int_arg(argv[2], &trials) != 0)
	{
	    fprintf(stderr, "%s: invalid trial count '%s'\n", argv[0], argv[2]);
	    return EXIT_FAILURE;
	}
	if (trials > NUMBER_OF_FUNCTIONS)
	{
	    trials = NUMBER_OF_FUNCTIONS;
	    fprintf(stderr, "There are only %d functions, so that's all we'll do.\n", trials);
	}
    }

    for (i = 0; i < trials; i++)
    {
	double lowest_time = 1000000.0; /* bigger than any expected time */
	double elapsed;
	int k, quiet_runs;

	if (!calls[i])
	    continue;

	/* get the baseline: best of a fixed number of samples */
	for (k = 0; k < threshold; k++)
	{
	    elapsed = time_one_call(i, call_arg, total_calls);
	    if (elapsed < lowest_time)
		lowest_time = elapsed;
	}

	/* Keep sampling until `threshold` consecutive runs fail to beat
	 * the best time seen so far.  The streak counter is separate from
	 * any loop increment, so a reset really does restart the full
	 * window. */
	quiet_runs = 0;
	while (quiet_runs < threshold)
	{
	    elapsed = time_one_call(i, call_arg, total_calls);
	    if (elapsed < lowest_time)
	    {
		lowest_time = elapsed;
		quiet_runs = 0; /* found a lower time: start the streak over */
	    }
	    else
		quiet_runs++;
	}

	fprintf(stdout, "%d\t%1.12f\n", i+1, lowest_time);
    }
    fprintf(stdout, "\n");

    return 0;
} /* main */
