/* Copyright 2011, Rice University.  All rights reserved.
   No warranty of usability express or implied.  Have a lovely day! */
#include <stdio.h>
#include <stdlib.h>
/*#include <libc.h>*/
#include "primes.h"

#ifndef bzero
extern void bzero();
#endif





/* Arithmetic operation selected on the command line.  The numeric values
   are also written into the generated code as an argument to foo_<type>(),
   and this has to be the same as in find_issue_slots! */
enum operations
{
    ADD_OP = 0,   /* '+' */
    SUB_OP = 1,   /* '-' */
    MUL_OP = 2,   /* '*' */
    DIV_OP = 3,   /* '/' */
    XOR_OP = 4    /* '^' */
};


/* Cap on how many instructions a dependent stream may run before main()
   emits a re-initialization of its operands.  Indexed as
   [operation][type slot]: init_maximum_insts_per_stream() fills type slots
   1 (ints), 2 (floats), 3 (doubles) and 4 (longs) for the five operations.
   Type slot 0 and operation row 5 are never written by the code in this
   file, so they keep their static zero initialization. */
int maximum_insts_per_stream[6][5];

/* Fill maximum_insts_per_stream[operation][1..4] with the per-type stream
   length limits.  Column 0 of the global table is left untouched. */
void init_maximum_insts_per_stream(void)
{
    /* one row per operation; columns are the type slots 1..4 of the
       global table, in order: ints, floats, doubles, longs */
    static const int limits[XOR_OP + 1][4] = {
        /*            ints    floats doubles  longs  */
        [ADD_OP] = { 100000,    200,  1019, 100000 },
        [SUB_OP] = { 100000,    200,  1019, 100000 },
        [MUL_OP] = { 100000,   1019,  1019, 100000 },
        [DIV_OP] = { 100000,    200,  1019, 100000 },
        [XOR_OP] = { 100000,      0,     0, 100000 }
    };
    int op, slot;

    for (op = 0; op <= XOR_OP; op++)
        for (slot = 1; slot <= 4; slot++)
            maximum_insts_per_stream[op][slot] = limits[op][slot - 1];
} /* init_maximum_insts_per_stream */




/* Add the prime factorization of target into prime_set: for every prime
   factor p of target, prime_set[p] is incremented once per power of p.
   Existing counts are added to, not reset, so the caller must zero
   prime_set first if it wants the factorization of target alone.

   prime_set must have at least (target+1) elements, because it is indexed
   by the prime itself, and target must be less than the highest prime in
   the primes.h array (per the original note, currently about 1 million).

   Returns the number of prime factors recorded, counted with
   multiplicity (0 when target <= 1). */
static long long find_prime_factorization(int target, int *prime_set)
{
    long long factors_found = 0;
    int i = 0;

    while (target > 1)
    {
        int prime = primes[i];

        if ((target % prime) == 0)
        {
            /* stay on the same prime: it may divide target again */
            prime_set[prime]++;
            target /= prime;
            factors_found++;
        }
        else
            i++;
    }

    return factors_found;
} /* find_prime_factorization */



/* Compare the prime factorization of candidate against target_factors.

   target_factors and modulo_factors are arrays indexed by prime; both must
   have at least (candidate+1) elements.  For every prime p where candidate
   has more powers of p than target_factors[p], modulo_factors[p] is set to
   the excess.  Entries of modulo_factors for other primes are not touched.

   Returns 1 when every prime power of candidate is already present in
   target_factors, 0 otherwise.  Terminates the program if the scratch
   array cannot be allocated. */
static int is_multiple(int candidate, int *target_factors, int *modulo_factors)
{
    int *candidate_factors;
    int subset = 1;
    int i, prime;

    /* 1 and below have no prime factors, so there is nothing to compare */
    if (candidate < 2)
        return 1;

    /* find_prime_factorization() increments counters, so they must start
       at zero */
    candidate_factors = calloc((size_t)candidate + 1, sizeof *candidate_factors);
    if (candidate_factors == NULL)
    {
        fprintf(stderr, "is_multiple: out of memory\n");
        exit(1);
    }

    find_prime_factorization(candidate, candidate_factors);

    /* visit every prime up to candidate exactly once */
    for (i = 0, prime = primes[0]; prime <= candidate; i++, prime = primes[i])
    {
        if (candidate_factors[prime] > target_factors[prime])
        {
            subset = 0;
            modulo_factors[prime] = candidate_factors[prime] - target_factors[prime];
        }
    }

    free(candidate_factors);

    return subset;
} /* is_multiple */



/* Choose a total instruction count close to max that is evenly divisible
   by as many thread counts in [2, threads] as possible.

   max        - requested instruction count; the result is kept below 110%
                of this value while factors are being added
   threads    - largest thread count; always a divisor of the result
   candidates - array of at least (threads+1) ints; candidates[i] for
                i in [2, threads] is set to 1 when none of the prime powers
                i needed was refused for pushing the total past the 110%
                limit, and to 0 otherwise.  candidates[0] and candidates[1]
                are not written here.

   Returns the chosen instruction count.  Terminates the program if the
   scratch arrays cannot be allocated. */
static long long find_divisors(int max, int threads, int *candidates)
{
    long long i, j;
    long long running_max = threads;
    int *trial_factors = malloc(sizeof *trial_factors * (size_t)(threads + 1));
    int *factors = calloc((size_t)(threads + 1), sizeof *factors);

    if (trial_factors == NULL || factors == NULL)
    {
        fprintf(stderr, "find_divisors: out of memory\n");
        exit(1);
    }

    /* start out with the base at the maximum number of threads; this
       guarantees that that number will be a divisor, as we just assume
       that if we say we want x threads, the total number of instructions
       had better be divisible by (at the least) x */
    find_prime_factorization(threads, factors);

    /* for every number of threads, compute the factorization, and see
       if this is a subset of the prime factors already computed, in which
       case, do nothing; if it's not a subset, see how many of the prime
       factors can be used */
    for (i = 2; i <= threads; i++)
    {
        int prime, divides_evenly = 1;

        bzero(trial_factors, sizeof(int)*(threads+1));
        find_prime_factorization(i, trial_factors);

        for (j = 0, prime = primes[j]; prime < threads; j++, prime = primes[j])
        {
            if (trial_factors[prime] > factors[prime])
            {
                int k;

                for (k = 0; k < trial_factors[prime]-factors[prime]; k++)
                {
                    /* if within 110% of max */
                    if ((prime*running_max) < (max*1.1))
                    {
                        factors[prime]++;
                        running_max *= prime;
                    }
                    else
                        divides_evenly = 0;
                }
            }
        }
        candidates[i] = divides_evenly;
    }

    /* still well short of the request: pad with the smallest primes that
       keep the total under the 110% limit */
    if (running_max < (max*.9))
    {
        int prime;

        for (i = 0, prime = primes[i]; prime < threads; i++, prime = primes[i])
        {
            if ((running_max*prime) < (max*1.1))
            {
                running_max *= prime;
                i = -1; /* reset i (it gets incremented at the end of the loop) */
            }
        }
    }

    free(trial_factors);
    free(factors);

    return running_max;

} /* find_divisors */



main(int argc, char **argv)
{
    int threads, i, j;
    long long total, insts;
    int max_threads = 32;
    FILE *output_file;
    char sign = '*';
    enum operations operation_name;
    int how_to_declare = 1;
    int declarations_per_thread;
    int *candidates;
    char *type_of_operations = "int";
    char **variable_names = NULL;
    int fixed_length = 0;
    int maximum_insts_in_the_stream;
    int lengthen_code = 0;
    int inst_loop_max;

    if (argc < 8)
    {
	fprintf(stderr, "Arg:\tmax # of insts\n\tmax # of threads\n\t[1,2,(3, 4)] use/dont-use interleaved declarations\n\t\t(or use cycling declarations w/ or w/o interleaving)\n\t1 for int32s, 2 for int64s, 3 for floats, or 4 for doubles\n\tsign:\t1 for '+'\n\t\t2 for '-'\n\t\t3 for '*'\n\t\t4 for '/'\n\t\t5 for '^'\n\t1 for lcm, 2 for fixed length\n\t1 for dividing code by number of threads, 2 for increasing the total length\n");
	exit(0);
    }

    insts = atoi(argv[1]);
    max_threads = atoi(argv[2]);
    if (argc > 3)
	how_to_declare = atoi(argv[3]);
    /* five and four are out of order, b/c it's easier to set
       maximum_insts_in_the_stream */
    if (argc > 5)
	switch (atoi(argv[5]))
	{
	    case 1:
		sign = '+';
		operation_name = ADD_OP;
	        break;
	    case 2:
		sign = '-';
		operation_name = SUB_OP;
	        break;
	    case 3:
		sign = '*';
		operation_name = MUL_OP;
	        break;
	    case 4:
		sign = '/';
		operation_name = DIV_OP;
		break;
	    case 5:
		sign = '^';
		operation_name = XOR_OP;
		break;
	}
    if (argc > 4)
    {
	int arg4 = atoi(argv[4]);

	init_maximum_insts_per_stream();
	if (arg4 == 1)
	{
	    type_of_operations = "int32_t";
	    maximum_insts_in_the_stream = maximum_insts_per_stream[operation_name][1];
	}
	else if (arg4 == 2)
	{
	    type_of_operations = "int64_t";
	    maximum_insts_in_the_stream = maximum_insts_per_stream[operation_name][1];
	}
	else if (arg4 == 3)
	{
	    type_of_operations = "float";
	    maximum_insts_in_the_stream = maximum_insts_per_stream[operation_name][2];
	}
	else if (arg4 == 4)
	{
	    type_of_operations = "double";
	    maximum_insts_in_the_stream = maximum_insts_per_stream[operation_name][3];
	}
	else
	{
	    type_of_operations = "long";
	    maximum_insts_in_the_stream = maximum_insts_per_stream[operation_name][4];
	}

									   }
    if (argc > 6)
	fixed_length = (atoi(argv[6]) == 2)?1:0;
    if (argc > 7)
	lengthen_code = (atoi(argv[6]) == 2)?1:0;

    candidates = (int *)malloc(sizeof(int)*(max_threads+1));
    bzero(candidates, sizeof(int)*(max_threads+1));
    if (!fixed_length)
    {
	candidates[1] = 1;
	insts = find_divisors(insts, max_threads, candidates);
    }
    else /* fixed length */
	for(i=0;i<=max_threads;i++)
	    candidates[i] = 1;
    variable_names = (char **)malloc(sizeof(char*)*max_threads+1);
    for(i=0;i<max_threads;i++)
    {
	variable_names[i] = (char*)malloc(sizeof(char)*3);
	sprintf(variable_names[i], "%c%c", 'A'+(i/26), 'A'+(i%26));
    }

    /*    fprintf(stdout, "insts = %lld\n", insts);*/

    /* first, put out the header info */
    output_file = fopen("calls_membench/calls_membench.h", "w");
    fprintf(output_file, "#include <stdint.h>\n#define NUMBER_OF_FUNCTIONS %d\n", max_threads);
    for(threads=1; threads<=max_threads; threads++)
	if (candidates[threads])
	    fprintf(output_file, "extern %s call_%d (int, int);\n", type_of_operations, threads);
    fprintf(output_file, "extern %s foo_%s(int, int, %s);\n\n", type_of_operations, type_of_operations, type_of_operations);
    fprintf(output_file, "%s (*calls[%d])() = {", type_of_operations, max_threads);
    for(threads=1; threads<=max_threads; threads++)
	if (candidates[threads])
	    fprintf(output_file, "call_%d%s", threads, (threads == max_threads)?"};\n":",");
        else
	    /* we don't have to check if this is the last guy in the
	       list, b/c we've made the assumption that the maximum number
	       of threads will always be in the set of candidates */
	    fprintf(output_file, "NULL,");
    fclose(output_file);

    /* now, put out each function in its own file */
    for(threads=1; threads<=max_threads; threads++)
    {
	char next_filename[1000];
	char next_compile_command[1000];

	if (!candidates[threads])
	    continue;

	/* this is the value to use for mod to cycle through variables */
	/* if the value is three or four, then we want to use the smaller
	   number of declarations, but we still want to be able to experiment
	   with separate lines of declarations and intertwingled declarations,
	   so if we've specified the shorter set of declarations, we then
	   divide that flag by two to figure out how to write out
	   the declarations */
	if (how_to_declare > 2)
	    declarations_per_thread = (max_threads*2)+1;
	else /* we add two to the total, b/c the stream is actually
		two declarations longer than "max_threads"; and we add
		threads-1 to insts b/c we're using integer division, and
		this makes sure that we have enough declarations */
	    declarations_per_thread = ((insts+(threads-1))/threads)+2;

	sprintf(next_filename, "calls_membench/calls_membench%03d.c", threads);
	output_file = fopen(next_filename, "w");
	fprintf(output_file, "#include <stdint.h>\nint call_%d (%s y, int iterations)\n{\n    int i;\n", threads, "int"/*type_of_operations*/);
	if (how_to_declare%2 == 0)
	{
	    fprintf(output_file, "    %s ", type_of_operations);
	    for(i=0;i<declarations_per_thread;i++)
		for(j=0;j<threads;j++)
		{
		    fprintf(output_file, "%s%d, ", variable_names[j], i);
		}
	    fprintf(output_file, "x;\n");
	}
	else if (how_to_declare%2 == 1)
	{
	    for(j=0;j<threads;j++)
	    {
		fprintf(output_file, "    %s dummy%d", type_of_operations, j);
		for(i=0;i<declarations_per_thread;i++)
		{
		    fprintf(output_file, ", %s%d", variable_names[j], i);
		}
		fprintf(output_file, ";\n");
	    }
	    fprintf(output_file, "\t%s x;\n", type_of_operations);
	}

	/* put out initializations */
	for(j=0;j<threads;j++)
	    fprintf(output_file, "    %s0 = foo_%s(1, %d, (%s)0); %s1 = foo_%s(2, %d, (%s)0);\n", variable_names[j], type_of_operations, operation_name, type_of_operations, variable_names[j], type_of_operations, operation_name, type_of_operations);
	fprintf(output_file, "\n    for(i=0;i<iterations;i++)\n    {\n");

	/* put out the sequence of instructions */
	/* if we're putting out a fixed length of instructions, then the
	   total number might not be evenly divisible by the number of threads,
	   so the test for the inner loop becomes: do it threads number of
	   times, except in the case of where we're in the last iteration of
	   the i loop, in which case do it only the leftover number of times
	   (which is just inst%threads) */
	if (lengthen_code)
	    inst_loop_max = insts+2;
	else
	    inst_loop_max = (insts/threads)+2;

        /* in the case of division, we set up a repeating sequence that
           is three instructions long; since we want the names at the
           end of the loop to wrap around to the beginning of the loop,
           then the total length of the sequence has to be divisible by
           three */
        if (sign == '/')
            inst_loop_max = (((inst_loop_max+2)/3)*3)+2;

	for(i=2;i<inst_loop_max;i++)
	    for(j=0;(!fixed_length)?j<threads:(lengthen_code)?j<threads:(i==(insts/threads)+1)?j<insts%threads:j<threads;j++)
	    {
		int first_arg = (i-2)%declarations_per_thread;
		int second_arg = (i-1)%declarations_per_thread;
		int defined_arg;
                if (i<(inst_loop_max-2))
                    defined_arg = i%declarations_per_thread;
                else
                {
                    defined_arg = i-(inst_loop_max-2);
                    if (defined_arg == 1)
                        second_arg = 0;
                }

		if (sign == '/')
		{
		    if (!((i-1)%3))
		    {
			int temp = second_arg;

			second_arg = first_arg;
			first_arg = temp;
		    }
		}
		/* certain of the patterns can only go on for so many
		   instructions before we have to reinitialize the stream */
		if (!(i%maximum_insts_in_the_stream))
		{
		    char arg1[1000], arg2[1000];

		    sprintf(arg1, "%s%d", variable_names[j], first_arg);
		    sprintf(arg2, "%s%d", variable_names[j], second_arg);
		    fprintf(output_file, "\t%s = foo_%s(1, %d, %s); %s = foo_%s(2, %d, %s);\n", arg1, type_of_operations, operation_name, arg1, arg2, type_of_operations, operation_name, arg2);
		}
/*		if (sign == '-')
/*		{
/*		    first_arg = (rand()%2)?(i-1)%declarations_per_thread:(i-2)%declarations_per_thread;
/*		    second_arg = (first_arg==(i-1)%declarations_per_thread)?(i-2)%declarations_per_thread:(i-1)%declarations_per_thread;
/*		}
*/
		fprintf(output_file, "\t%s%d = %s%d %c %s%d;\n", variable_names[j], defined_arg, variable_names[j], first_arg, sign, variable_names[j], second_arg);
	    }

	/* use all of the final values so they're not DCE'd away */
	fprintf(output_file, "    }\n    x = ");
	for(j=0;j< threads;j++)
	    fprintf(output_file, "%s1 + ", variable_names[j]);
	fprintf(output_file, "1;\n\treturn x;\n} /* call_%d */\n", threads);
	fclose(output_file);
    }

} /* main */
