/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */

/*
 *  
 *  Measure the number of memory-bound threads that can run concurrently before performance significantly deteriorates.
 *  This version uses pthread barrier synchronization.
 *
 *  Created by Anna Youssefi on 1/13/10.
 *  Copyright 2010 __MyCompanyName__. All rights reserved.
 *
 */
 
#include "affinity.h"

#include <limits.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>


/* Set to nonzero to enable the debug traces that main prints to stderr. */
#define debug 0

/* Upper bound on the number of threads main tries for one data size;
   also the size of main's thread and argument arrays. */
#define MAX_THREADS 10000

/* Base per-thread data size in KB; main scales it by a per-size multiplier. */
#define BASE_SIZE 8
/* Number of different data sizes main tries. */
#define NUM_SIZES 6 
/* Number of consecutive measurements inside the threshold band that ends
   the thread-count search for one data size. */
#define MAX_FLAT 5

/* Minimum run time of the calibration loop in setITERS, in microseconds
   (the unit returned by getticks). */
#define TIME_THRESHOLD 1000000
/* Iteration count the calibration in setITERS starts from. */
#define START_ITERS 10000000
/* Extra iterations main adds to ITERS for every full 10 threads. */
#define ITERS_ADD 10000000

/* Relative bounds used by main when comparing a new measurement against the
   best one so far: above UPPER is an improvement, below LOWER is a
   deterioration, anything in between counts as "flat". */
#define UPPER_THRESHOLD 1.03
#define LOWER_THRESHOLD 0.90

/* Timestamp type: microseconds stored as a double. */
typedef double ticks;

/*
 * Return the current time of day, as reported by gettimeofday(),
 * converted to microseconds.
 */
static inline ticks getticks(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return (ticks)now.tv_sec * (1000*1000) + (ticks)now.tv_usec;
}

/*
 * Return the difference t1 - t0 between two timestamps, in the same
 * unit as the timestamps themselves (microseconds for getticks values).
 */
static inline double elapsed(ticks t1, ticks t0)
{
	double later = (double)t1;
	double earlier = (double)t0;

	return later - earlier;
}

/*
 * Benchmark state shared between main and the worker threads.
 *   DATA_SIZE       - per-thread data size in bytes (set by main).
 *   ITERS           - number of chain steps each thread performs
 *                     (calibrated by setITERS, then adjusted by main).
 *   NUM_INTS        - number of ints in one thread's data segment.
 *   FLAG            - value of argv[1]; when nonzero the loops print their
 *                     final index so the compiler cannot drop them.
 *   NUM_THREADS     - number of threads in the current trial.
 *   THREAD_AFFINITY - not referenced anywhere in this file.
 */
int DATA_SIZE, ITERS, NUM_INTS, FLAG, NUM_THREADS, THREAD_AFFINITY;
/* Duration in seconds of the single-threaded calibration run in setITERS. */
double SINGLE_THREAD_TIME;

/* initbarrier: workers and main meet here once every worker has started.
   barrier: released by main to start the timed computation. */
pthread_barrier_t *barrier, *initbarrier;

/* One buffer of NUM_INTS * NUM_THREADS ints; thread i uses the segment
   starting at data[i * NUM_INTS]. */
int *data;

/* Per-thread argument block handed to ThreadProc. */
struct vars{
	int *data;    /* start of this thread's segment of the shared buffer */
	int cpunum;   /* CPU number passed to set_affinity() */
};


/*
 * Calibrate the global ITERS on the calling thread.
 *
 * Walks the chain stored in the first NUM_INTS entries of `data`, starting
 * with START_ITERS steps and multiplying the step count by 5 until one run
 * takes longer than TIME_THRESHOLD microseconds.  On return ITERS holds that
 * step count and SINGLE_THREAD_TIME the duration of the run in seconds.
 *
 * If the step count can no longer be multiplied by 5 without overflowing an
 * int, the current count is accepted even though the run was shorter than
 * the threshold; previously the multiplication overflowed (undefined
 * behaviour).
 *
 * Requires `data` to hold an initialised chain (see scramble_array) and
 * NUM_INTS to be set.
 */
void setITERS(void){
	int i, j, index;
	int n = START_ITERS;
	ticks start, finish;
	double elapsed_us;

	ITERS = 0;

	//warm cache
	for (index = 0, j = 0; j < NUM_INTS; j++){
		index = data[j];
	}
	//printing the value keeps the warm-up loop from being optimised away
	if (FLAG) fprintf(stderr, "index = %d\n", index);

	while (ITERS == 0){
		index = 0;
		start = getticks();
		for (i = 0; i < n; i++){
			index = data[index];
		}
		finish = getticks();
		elapsed_us = elapsed(finish, start);

		if (FLAG) fprintf(stderr, "index = %d\n", index);

		//accept n when the run was long enough, or when n cannot grow any
		//further without overflowing
		if (elapsed_us > TIME_THRESHOLD || n > INT_MAX / 5){
			ITERS = n;
			SINGLE_THREAD_TIME = elapsed_us/(1000*1000);
			break;
		}
		n *= 5;
	}

}


void * ThreadProc(void *param){
	int i, index, ret;
	struct vars v=*((struct vars *)param);
	int *mydata = v.data;
	
	
	/* THREAD AFFINITY */
	ret = set_affinity(v.cpunum);	

	if (ret) {
		fprintf(stderr, "ERROR:  set thread affinity failed on cpu %d with error code %d\n", v.cpunum, ret);
		//exit(1);
	}
	
	///////////////////////////////////////////////
	//wait for all threads to finish initializing
	ret = pthread_barrier_wait(initbarrier);
	//if (debug) fprintf(stderr, "thread passed init barrier\n");
	
	//wait to start computation
	ret = pthread_barrier_wait(barrier);
	
	//if (debug) fprintf(stderr, "thread passed start barrier\n");
	
	index = 0;
	
	for (index = 0, i=0; i < ITERS; i++){
		index = mydata[index];
	}
	
	if (FLAG) fprintf(stderr, "index = %d\n", index);

	pthread_exit(NULL);

}

/*
 * Scramble the array to prevent prefetching.
 *
 * Fills every thread's segment of `data` (NUM_THREADS segments of NUM_INTS
 * ints each) with the same randomly ordered chain: each entry holds the
 * index, relative to the start of the segment, of the entry to visit next.
 * The chain starts at entry 0, visits every entry exactly once and returns
 * to 0.  The random sequence is seeded with srand(0), so every call produces
 * the same order for a given NUM_INTS.
 *
 * Reports an error on stderr and exits if `data` is missing, NUM_INTS is not
 * positive, or the temporary candidates array cannot be allocated.
 */
void scramble_array(void)
{
    int remaining, j;
    int *candidates;

    if (data == NULL || NUM_INTS <= 0) {
        fprintf(stderr, "ERROR: scramble_array needs an allocated data buffer (NUM_INTS = %d)\n", NUM_INTS);
        exit(-1);
    }

    candidates = malloc((size_t)NUM_INTS * sizeof *candidates);
    if (candidates == NULL) {
        fprintf(stderr, "ERROR: out of memory allocating %d scramble candidates\n", NUM_INTS);
        exit(-1);
    }

    /* initialize the candidates array with the indices 1 .. NUM_INTS-1;
       the loop leaves `remaining` equal to the number of candidates */
    for (remaining = 0; remaining < NUM_INTS-1; remaining++)
        candidates[remaining] = remaining+1;

    /* the invariant is that entry 0 of a segment will always point to the
       first element in the list, and the last element in the
       list will point to 0 -- each step will pick a new random
       index and assign that entry with the value of entry 0,
       and then we replace entry 0 with that new index; the
       candidates array ensures that we can't get a cycle, b/c
       each index can only be picked once (and then it gets
       removed from the candidates array) */
    for (j = 0; j < NUM_THREADS; j++)
        data[j * NUM_INTS] = 0;

    /* link the guys together */
    srand(0);
    while (remaining > 0)
    {
        int next_index = rand() % remaining;
        int next = candidates[next_index];

        /* remove the next index and move the last index in the list
           into the "next_index"'s place */
        candidates[next_index] = candidates[remaining-1];

        /* link it up: all segments hold the same chain, so the head stored
           in data[0] is the value to copy into every segment */
        for (j = 0; j < NUM_THREADS; j++)
            data[j * NUM_INTS + next] = data[0];
        for (j = 0; j < NUM_THREADS; j++)
            data[j * NUM_INTS] = next;
        remaining--;
    }

    free(candidates);

} /* scramble_array */

/*
 * Benchmark driver.
 *
 * Usage: prog FLAG [v]
 *   FLAG - integer; should be 0.  A nonzero value makes the measurement
 *          loops print their final index, which only exists to keep the
 *          compiler from eliminating them.
 *   v    - optional; selects verbose output.
 *
 * For each of NUM_SIZES per-thread data sizes the thread count is raised
 * from 1 until throughput (threads per second of wall time) drops below
 * LOWER_THRESHOLD times the best seen, or stays flat for MAX_FLAT trials in
 * a row.  The search over data sizes stops at the first size that does not
 * raise the best thread count, and the best thread count found is printed
 * on stdout (with throughput and data size when verbose).
 *
 * If the first MAX_FLAT trials all take about NUM_THREADS times the
 * single-thread time, "1" is printed and the program stops.
 *
 * Returns 0 on success and 1 on a usage error; exits with -1 on allocation,
 * barrier or thread-creation failures.
 */
int main(int argc, char ** argv){

	int i, j, n, rc, multiplier;
	int cpucount, res;
	ticks start, finish;
	double seconds, slowdown;
	pthread_t threads[MAX_THREADS];
	struct vars vars_structs[MAX_THREADS];
	int count, dsize, sequential_count;
	double throughput, t, answer_throughput;
	int bestanswer, answer_threads;
	int verbose = 0;

	/* the first param is the dead-code-elimination flag */
	if (argc < 2 ) {
		fprintf(stderr, "ERROR: Program takes input 0 (flag to prevent dead code elim of loops) and optionally v for verbose output.\n");
		return 1;
	}

	FLAG = atoi(argv[1]);  //this prevents dead code elim of loops, should be 0
	if (argc == 3){
		if (argv[2][0] == 'v') verbose = 1;
	}

	//TRY INITIATING AFFINITY
	res = init_affinity(&cpucount);
	if (res || cpucount < 1){
		//fall back to one CPU; this also keeps "i % cpucount" below well defined
		cpucount = 1;
	}

	answer_throughput = -1.0;
	answer_threads = 0;
	dsize = 0;
	//NOTE(review): sequential_count is neither reset per data size nor when
	//a trial falls outside the band -- confirm that this is intended.
	sequential_count = 0;

	for (j = 0; j < NUM_SIZES; j++){

		multiplier = (j > 0) ? (j * 4) : 1;
		//try this data size
		DATA_SIZE = BASE_SIZE * multiplier  * 1024;
		NUM_INTS = DATA_SIZE/sizeof(int);

		if (debug) fprintf(stderr, "Data size %d KB\n", DATA_SIZE/1024);

		//reset
		count = 0;
		throughput = - 1.0;
		bestanswer = 0;

		for (n = 0; n < MAX_THREADS; n++){

			NUM_THREADS = n+1;

			if (debug) fprintf(stderr, "Num threads %d: ", NUM_THREADS);

			//both barriers count the workers plus the main thread
			initbarrier = malloc(sizeof *initbarrier);
			if (initbarrier == NULL){
				fprintf(stderr, "ERROR; out of memory allocating initbarrier, num threads %d\n", NUM_THREADS);
				exit(-1);
			}
			rc = pthread_barrier_init(initbarrier, NULL, NUM_THREADS+1);
			if (rc){
				fprintf(stderr, "ERROR; return code %d from pthread_barrier_init() for initbarrier, num threads %d\n", rc, NUM_THREADS);
				exit(-1);
			}

			barrier = malloc(sizeof *barrier);
			if (barrier == NULL){
				fprintf(stderr, "ERROR; out of memory allocating barrier, num threads %d\n", NUM_THREADS);
				exit(-1);
			}
			rc = pthread_barrier_init(barrier, NULL, NUM_THREADS+1);
			if (rc){
				fprintf(stderr, "ERROR; return code %d from pthread_barrier_init() for num threads %d\n", rc, NUM_THREADS);
				exit(-1);
			}

			//one segment of NUM_INTS ints per thread
			data = calloc((size_t)NUM_INTS * (size_t)NUM_THREADS, sizeof *data);
			if (data == NULL){
				fprintf(stderr, "ERROR; out of memory allocating data for num threads %d\n", NUM_THREADS);
				exit(-1);
			}
			scramble_array();
			setITERS();
			ITERS += ITERS_ADD * (NUM_THREADS/10);

			for (i=0; i < NUM_THREADS; i++){
				vars_structs[i].data = &(data[i * NUM_INTS]);  //block distrib among threads
				vars_structs[i].cpunum = i % cpucount;

				rc = pthread_create(&(threads[i]), NULL, ThreadProc, &(vars_structs[i]));
				if (rc){
					 fprintf(stderr, "ERROR; return code from pthread_create() is %d for thread %d.\n", rc, i);
					 exit(-1);
				}

			}

			//wait for all threads to initialize
			(void)pthread_barrier_wait(initbarrier);

			start = getticks();

			// start threads
			(void)pthread_barrier_wait(barrier);

			//wait for all threads to complete
			for (i=0; i < NUM_THREADS; i++){
				pthread_join(threads[i], NULL);
			}

			finish = getticks();

			//every worker has been joined, so the per-trial resources can
			//be released here (they used to leak on every trial)
			(void)pthread_barrier_destroy(initbarrier);
			(void)pthread_barrier_destroy(barrier);
			free(initbarrier);
			initbarrier = NULL;
			free(barrier);
			barrier = NULL;
			free(data);
			data = NULL;

			seconds = elapsed(finish , start);
			seconds /= 1000*1000;

			t = ((double)NUM_THREADS)/seconds;

			if (debug) fprintf(stderr, "Throughput %f, Time %f\n", t, seconds);

			//run time growing in step with the thread count means the
			//threads are effectively running one after another
			slowdown = seconds/SINGLE_THREAD_TIME;
			if ((slowdown <= (NUM_THREADS * UPPER_THRESHOLD)) && (slowdown >= (NUM_THREADS * LOWER_THRESHOLD))){
				sequential_count ++;
				if (sequential_count == MAX_FLAT && sequential_count == NUM_THREADS){
					fprintf(stdout, "1\n");
					return 0;
				}
			}

			if (t > (throughput * UPPER_THRESHOLD)){
				throughput = t;
				bestanswer = NUM_THREADS;
				count = 0;
			}
			else if (t < (throughput * LOWER_THRESHOLD)){
				bestanswer = NUM_THREADS - 1;
				break;

			}
			else {
				count ++;  //must be MAX_FLAT in a row!
				if (count == MAX_FLAT) {
					break;
				}
			}
		}
		//end of search for this data size
		if (bestanswer > answer_threads){
			answer_threads = bestanswer;
			answer_throughput = throughput;
			dsize = DATA_SIZE/1024;
		}
		else{
			//answer found
			break;
		}
	}

	//report the answer; doing it after the loop also covers the case where
	//every data size improved on the previous one, which used to end the
	//program without printing anything
	if (verbose) fprintf(stdout, "threads:  %d \t throughput:  %f \t datasize: %d KB\n", answer_threads, answer_throughput, dsize);
	else  fprintf(stdout, "%d\n", answer_threads);

	return 0;
}
	
	
	
	
	
	
