/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#include <float.h>

#include "BJTimer.h"
#include "barrier.h"
#include "affinity.h"

/* Defined outside this file; work_loop calls it once per loop iteration
   (presumably a double-precision multiply, judging by the name -- confirm
   against its definition). */
extern double e_double_mul_2(double a, double b);
/* Defined outside this file; main calls it once, after the measurement loop,
   with the timing array, the last thread count and the first two
   command-line values. */
extern void analyse_performance_degradation(double*, int, int, int);

/* Forward declaration: the worker routines below call this before its
   definition appears later in the file. */
void set_affinity_wrapped(int tid);

/* Barrier object shared by all threads of one measurement; main creates and
   destroys it around every repetition. */
dissem_barrier g_barrier;
/* Iteration count handed to work_loop; assigned by set_loop_count, which main
   calls before the first measurement. */
int g_loop_len;

/*
 * The benchmark kernel.
 *
 * Calls e_double_mul_2(1.0, 1.0) loop_len times, accumulating the returned
 * values in a double, and returns that total converted to int.  A
 * non-positive loop_len performs no calls and returns 0.
 */
int work_loop(int loop_len) {
  const double lhs = 1.0E+0;
  const double rhs = 1.0E+0;
  double total = 0;
  int remaining;

  for (remaining = loop_len; remaining > 0; --remaining) {
    total += e_double_mul_2(lhs, rhs);
  }

  return (int)total;
}

/*
 * Thread start routine handed to pthread_create by main.
 *
 * a points at this thread's dissem_barrier_thread_local_data.  The thread
 * requests its affinity, waits on the barrier, runs work_loop with the
 * iteration count read from g_loop_len on entry, waits on the barrier a
 * second time and returns the work result packed into the void* return value.
 */
void * thread_work(void* a) {
  dissem_barrier_thread_local_data * const self = a;
  const int iterations = g_loop_len;
  int32_t result;

  /* Request binding based on this thread's id */
  set_affinity_wrapped(self->thread_id);

  /* First barrier: taken before the work starts */
  dissem_barrier_wait(self);

  result = work_loop(iterations);

  /* Second barrier: taken after the work is done */
  dissem_barrier_wait(self);

  /* The result travels through the thread's exit value */
  return (void*)(intptr_t)result;
}


/*
 * Same sequence as thread_work, run on the calling thread, with the span
 * between the two barrier waits measured via getticks().
 *
 * bd : barrier data of the calling thread (main passes the first element of
 *      its array).
 * s  : receives the value returned by work_loop.
 *
 * Returns the elapsed tick count divided by 1e6 (seconds if a tick is a
 * microsecond -- confirm against BJTimer.h).
 */
double timed_thread_work(dissem_barrier_thread_local_data * bd, int32_t * s) {
  const int iterations = g_loop_len;
  int32_t result;
  ticks started;
  ticks elapsed;

  /* Request binding based on this thread's id */
  set_affinity_wrapped(bd->thread_id);

  /* Clock starts right after the first barrier wait returns */
  dissem_barrier_wait(bd);
  started = getticks();

  result = work_loop(iterations);

  /* Clock stops right after the second barrier wait returns */
  dissem_barrier_wait(bd);
  elapsed = getticks() - started;

  *s = result;
  return elapsed/1000000.0;
}

/* Number of timed repetitions main runs for every thread count; the reported
   time is the average over these runs.  (The name is spelled this way
   wherever it is used in this file.) */
#define TEST_REPETION 5
/*
 * Arithmetic mean of the first t_size entries of t.
 *
 * The entries are summed front to back and the sum is divided by t_size
 * converted to double; no special handling is done for t_size == 0.
 */
double avg(double * t, int t_size) {
  double total = 0.0;
  int idx = 0;

  while (idx < t_size) {
    total += t[idx];
    ++idx;
  }

  return total / (double)t_size;
}

/*
 * Smallest of the first t_size entries of t.
 *
 * The search starts from DBL_MAX, so that value is returned when t_size is
 * zero or negative.
 */
double min(double * t, int t_size) {
  double smallest = DBL_MAX;
  int idx = 0;

  while (idx < t_size) {
    smallest = (t[idx] < smallest) ? t[idx] : smallest;
    ++idx;
  }

  return smallest;
}

/*
 * Calibrate g_loop_len.
 *
 * Starting from 1000, the count is doubled and work_loop is timed with
 * getticks() until one run takes at least 1e6 ticks.  The first timed run
 * therefore uses 2000 iterations, and g_loop_len is left at the first count
 * that met the threshold.
 */
void set_loop_count() {
  ticks elapsed;

  g_loop_len = 1000;
  do {
    ticks started;

    g_loop_len *= 2;
    started = getticks();
    work_loop(g_loop_len);
    elapsed = getticks() - started;
  } while (elapsed / 1000000.0 < 1.0);
}

/* Filled in by init_affinity (called from main) through its out-parameter;
   used as the modulus when a thread id is mapped to a compute context
   (presumably the number of available contexts -- confirm in affinity.h). */
int max_affinity = 0;
/* Return value of init_affinity; set_affinity_wrapped only binds when this
   is 0. */
int affinity_error = 0;

/*
 * Bind the caller to a compute context derived from its thread id.
 *
 * tid is reduced modulo max_affinity before it is passed to set_affinity, so
 * thread ids at or beyond max_affinity wrap around to earlier contexts.
 *
 * Nothing is done when init_affinity reported an error (affinity_error != 0)
 * or when max_affinity is not positive.  Both globals start out as 0, so
 * without the second check a call made before init_affinity has run -- or
 * after it reported success with zero contexts -- would evaluate `tid % 0`,
 * which is undefined behaviour.
 */
void set_affinity_wrapped(int tid) {
  if (affinity_error != 0 || max_affinity <= 0) {
    return;
  }
  set_affinity(tid % max_affinity);
}

/*
 * Benchmark driver.
 *
 * Usage: prog INIT_PERF_DROP STEPS MAX_DROP  (all parsed as base-10 integers)
 *
 * For num_threads = 1, 2, ... the fixed-size work loop is run on num_threads
 * threads at once (the main thread plus num_threads-1 pthreads), repeated
 * TEST_REPETION times, and the average time measured on the main thread is
 * stored in ts[num_threads] and printed to stderr.  The sweep ends once the
 * ratio ts[num_threads]/ts[1] has reached MAX_DROP/100 on more than five
 * thread counts (counted cumulatively, not consecutively), or when the hard
 * limit on the thread count is hit.  The collected timings are then handed
 * to analyse_performance_degradation together with INIT_PERF_DROP and STEPS.
 *
 * Returns 0 on success and 1 on a usage error, an allocation failure or a
 * failed pthread_create.  Thread creation failure is fatal because the
 * threads already started are waiting on a barrier sized for num_threads
 * participants, and joining a thread that was never created is undefined.
 */
int main(int argc, char * argv[]) {
  int max_num_threads;
  int num_threads;
  int i,r;
  int rc;
  int32_t sum;

  double *ts, *ts_grown;
  double time_data[TEST_REPETION];
  int init_perf_drop;
  int perf_drop_steps;
  int max_perf_drop;
  int seen_sig_drop;
  dissem_barrier_thread_local_data * bds;
  pthread_t * threads;

  affinity_error = init_affinity(&max_affinity);

  if (argc < 4) {
    printf("Usage: %s [INIT_PERF_DROP] [STEPS] [MAX_DROP]\n", argv[0]);
    return 1;
  }

  max_num_threads = 1000000;

  init_perf_drop = (int)strtol(argv[1], NULL, 10);
  perf_drop_steps = (int)strtol(argv[2], NULL, 10);
  max_perf_drop = (int)strtol(argv[3], NULL, 10);

  /* Calibrate the loop count (see set_loop_count for the threshold used) */
  set_loop_count();
  fprintf(stderr, "loop count: %d\n", g_loop_len);

  /* ts is indexed by thread count; slot 0 is unused, slot 1 is the baseline */
  ts = calloc(2, sizeof *ts);
  if (ts == NULL) {
    fprintf(stderr, "out of memory allocating timing array\n");
    return 1;
  }

  seen_sig_drop = 0;

  for (num_threads=1; num_threads < max_num_threads; num_threads++) {

    bds = calloc(num_threads+1, sizeof *bds);
    threads = calloc(num_threads+1, sizeof *threads);
    if (bds == NULL || threads == NULL) {
      fprintf(stderr, "out of memory allocating data for %d threads\n", num_threads);
      free(bds);
      free(threads);
      free(ts);
      return 1;
    }

    for (r=0; r<TEST_REPETION; r++) {
      dissem_barrier_create(num_threads, &g_barrier);

      for (i=0; i<num_threads;i++) {
        dissem_barrier_thread_local_data_init(bds+i, &g_barrier, i, num_threads);
      }

      for (i=1;i<num_threads;i++) {
        /* Thread 1 to N-1 work; handle i-1 belongs to thread i */
        rc = pthread_create(threads+i-1, NULL, thread_work, bds+i);
        if (rc != 0) {
          /* Cannot continue: the threads already running wait on a barrier
             that will never be completed, and the join loop below would
             touch handles that were never created. */
          fprintf(stderr, "pthread_create failed for thread %d of %d (error %d)\n",
                  i, num_threads, rc);
          return 1;
        }
      }

      /* Thread 0 work */
      time_data[r] = timed_thread_work(bds, &sum);

      /* Ensure threads have finished */
      for (i=1; i < num_threads; i++)
        pthread_join(*(threads+i-1), NULL);

      dissem_barrier_destroy(&g_barrier);
    }

    ts[num_threads] = avg(time_data, TEST_REPETION);

    /* Output time and prevent IPO */
    fprintf(stderr, "%d\t%f\t%d\n", num_threads, ts[num_threads], sum);

    free(bds);
    free(threads);

    /* Stop once we reached the maximum performance drop often enough */
    if (num_threads>1 &&
        ts[num_threads]/ts[1] >= max_perf_drop/100.0) {
      if (++seen_sig_drop > 5) {
        break;
      }
    }

    /* Make room for the next thread count.  The new slot is zeroed so the
       array contents match what a fresh zero-filled allocation would hold. */
    ts_grown = realloc(ts, (size_t)(num_threads+2) * sizeof *ts);
    if (ts_grown == NULL) {
      fprintf(stderr, "out of memory growing timing array\n");
      free(ts);
      return 1;
    }
    ts_grown[num_threads+1] = 0.0;
    ts = ts_grown;
  }

  fprintf(stderr, "analysing\n");

  analyse_performance_degradation(ts, num_threads, init_perf_drop, perf_drop_steps);

  free(ts);
  return 0;
}


