/*

pgive.c - Monte Carlo test of covariable contribution to inconsistent mapping

Copyright © 2003 by Brett Kessler   

This file is part of CondCons.

CondCons is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

CondCons is distributed in the hope that it will be useful,
but without any warranty; without even the implied warranty of
merchantability or fitness for a particular purpose.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with CondCons; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


Requirements:
ANSI C compiler, such as GNU's gcc (I used version 3.2).
Standard math library libm.a

The input file contains a set of records, each record representing one
object with categorical levels for a source variable, a target
variable, and a "given" variable.  The test sees whether the source
variable plus the given variable, working in tandem, predict the
target variable significantly more often than they would if there is
only a chance association between the source and given variables. This
is determined by randomly reassigning the extent given variables
across all the records, and seeing what proportion of those random
rearrangements predicts the target variable at least as well as the
original arrangement. 

Input format:

[Header line:]
n_records
[Each subsequent line (n_records of them) is a record:]
source HT given HT target

Source, given, and target are all categorical variables, their values 
expressed as integers.

Output are tag-value pairs, separated by tabs:

Base  FLOAT    //Conditional Consistency of source->target mapping with given
ITER  INT      //Number of rearrangements in Monte Carlo test
p     FLOAT    //probability that contribution of given is due to chance;
                i.e., proportion of rearrangements whose conditional
                consistency is at least as high as Base
PMEAN FLOAT    //Average conditional consistency of the rearrangements

Options:

--in=input_file       Defaults to extracted-data

--iter=n_iterations   Defaults to 10000

--verbose=1           Type a lot of stuff.


Consistency is much like entropy, just scaled differently. If the same value 
of a TO variable occurs every time a FROM variable has a certain value, then 
the consistency is 1.0. Otherwise, the consistency of FROM level 'x' is the 
weighted average of the proportions of the objects with FROM 'x' that have each
of the various TO levels.  E.g., if TO 'y' occurs 10 times, 3 times with
FROM='x1' and 7 times with FROM='x2', the consistency is 
(3 x (3/10) + 7 * 7/10) / 10, or, more simply, (3/10)^2 + (7/10)^2.
That is, for each TO level, we count what proportion of the FROM level that
is found with, and square it, summing those squares.

 */

#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char* in_file = "extracted-data";
static char* prog_name;
static int verbose = 0;
static int iter = 10000;

//One observation. All fields are integers representing levels of
// categorical variables.
typedef struct record {
  int source; //main predictor variable
  int given;  //secondary (conditioning) predictor variable
  int target; //response variable
} Record;

//All the observations.
typedef struct data {
  int n_records;
  double base_consistency;
  Record** record;
} Data;
  
static Data data;

//Just a malloc, but dies if out of memory.
static void*
Malloc(int n_bytes, char* msg) {
  void* mem = malloc(n_bytes);
  if (mem == NULL) {
    (void)
      fprintf(stderr, 
	      "Out of memory, trying to malloc %d bytes for %s in signif.c\n",
	      n_bytes, msg);
    exit(1);
  }
  return mem;
}

static void
Read_Options(int argc, char* argv[]) {
  static struct option options[] = {
    {"in", 1, 0, 'i'},      //file containing the data
    {"iter", 1, 0, 'I'},    //number of rearrangementes in Monte Carlo test
    {"verbose", 1, 0, 'v'}, //enables extra output to stderr
    {NULL, 0, 0, 0},
  };
  int fatal = 0;
  prog_name = argv[0];
  while (1) {
    int option_index;
    int c = getopt_long(argc, argv, "", options, &option_index);
    if (c == -1) {break;}
    switch (c) {
    case 'i':
      in_file = optarg;
      break;
    case 'I':
      iter = strtol(optarg, NULL, 10);
      break;
    case 'v':
      if (optarg[0] != '0') {
        verbose = 1;
      }
      break;
    }
  }
  if (fatal) {exit(1);}
}

//Really don't need more than 9 or so...
#define MAX_LINE 1024

//Read one line of data from file
static char*
Read_Line(FILE* in) {
  char* result;
  static char line[MAX_LINE];
  result = fgets(line, MAX_LINE, in);
  if (result == NULL) {
    if (ferror(in)) {
      perror("Reading line from file.");
      exit(1);
    } else {
      return NULL;
    }
  }
  return line;
}

//Read the file header, which tells how many records there are
static void
Read_Header(FILE* in) {
  char* count_field;
  char* line = Read_Line(in);
  char* next;
  if (line == NULL) {
    (void) fprintf(stderr, "Truncated data file.\n");
    exit(1);
  }
  data.n_records = strtol(line, &next, 10);
  if (data.n_records < 1 || *next != '\n') {
    (void) fprintf(stderr, "Error in data file header.\n");
    exit(1);
  }
  data.record = Malloc(data.n_records * sizeof(Record*), "data.record");
}

//Reads characters in LINE up to DELIM as an integer and returns the number.
// The LINE string is modified to point past that DELIM.
static int
Read_Int(char** line, char* delim) {
  char* string = strsep(line, delim);
  char* next;
  int num;
  if (string == NULL) {
    (void) fprintf(stderr, "Ill-formed line, missing a delimited.\n"); exit(1);
  }
  num = strtol(string, &next, 10);
  if (num < 0) {
    (void) fprintf(stderr, "Error in reading integer from '%s'.\n", string);
    exit(1);
  }
  return num;
}

//Reads one line of data from file and stores data as next record in
// "data".
static int
Read_Record(FILE* in, int record_number) {
  char* line = Read_Line(in);
  Record* record;
  if (line == NULL) {
    return 0;
  }
  data.record[record_number] = Malloc(sizeof (Record), "record");
  record = data.record[record_number];
  record->source = Read_Int(&line, "\t");
  record->given = Read_Int(&line, "\t");
  record->target = Read_Int(&line, "\n");
  return 1;
}

//Reads entire data file into "data".
static void
Read_Data(void) {
  FILE* in = fopen(in_file, "r");
  int i;
  if (in == NULL) {
    (void) fprintf(stderr, "Failed to open file `%s'\n", in_file);
    exit(1);
  }
  Read_Header(in);
  for (i = 0; i < data.n_records; i++) {Read_Record(in, i);}
  fclose(in);
}

//Dump all the data back out again. Invoked only if verbose>0
static void
Dump_Data(void) {
  int r;
  (void) printf("Base consistency %.3f\n", 
		data.base_consistency);
  (void) printf("%d records:\n", data.n_records);
  for (r = 0; r < data.n_records; r++) {
    Record* record = data.record[r];
    printf("%d -> %d | %d\n", 
	   record->source, record->target, record->given);
  }
}

/*
 * These two functions sort the "data", first by "source" field;
 * minor sort by "given" field; and then by "target" field.
 * The idea is that all records with the same predictor values (source and
 * given) will be next to each other. Within that block, all records
 * with the same target (result) values will be grouped together.
 */
static int
Compare_Records(const void* arg1, const void* arg2) {
  const Record* r1 = *((Record**) arg1);
  const Record* r2 = *((Record**) arg2);
  int cmp = r1->source - r2->source;
  /* fprintf(stderr, "Compare_Records (%d, %d, %d, %f) vs (%d, %d, %d, %f)\n",
     r1->source, r1->given, r1->target, r1->freq, 
     r2->source, r2->given, r2->target, r2->freq);
   */
  if (cmp) {return cmp;}
  cmp = r1->given - r2->given;
  if (cmp) {return cmp;}
  return (r1->target - r2->target);
}

static void
Sort_Records(void) {
  qsort(data.record, data.n_records, sizeof (Record*), Compare_Records);
  /* Dump_Data();*/
}

/*
 * Returns the index to the next record that has a different "source" or
 * "given" field from the current one. The function is so named because
 * all records with identical values in the source and given fields are
 * sorted next to each other, forming blocks with uniform predictor
 * variables -- the "from" variables in a mapping.
 *
 * If no next block, returns an index past the end of the data.
 */
static int
Find_Next_From_Block(int rec1) {
  int rec;
  for (rec = rec1 + 1; rec < data.n_records; rec++) {
    if (data.record[rec1]->source != data.record[rec]->source) {
      return rec;
    }
    if (data.record[rec1]->given != data.record[rec]->given) {
      return rec;
    }
  }
  return data.n_records;
}

/*
 * Returns the index to the next record in the current "from" block (see 
 * above function) that has a different "target" from the current record.
 * All records that have the same target value for the current predictors
 * have already been sorted together into one "to" block -- i.e., the target
 * variables in a mapping.
 */
static int
Find_Next_To_Block(int rec1, int next_from_block) {
  int r;
  int first = data.record[rec1]->target;
  for (r = rec1 + 1; r < next_from_block; r++) {
    if (first != data.record[r]->target) {return r;}
  }
  return next_from_block;
}

/*
 * The Consistency measure contributed by a particular from->to mapping:
 * the square of the proportion of the FROM observations that have the
 * value of the TO observations.
 */
static double
Consistency_Contrib(int to_count, int from_count) {
  double proportion = (double) to_count / (double) from_count;
  return proportion * proportion;
}

/*
 * We're within a FROM block (records with same FROM = predictor) variables.
 * We know how many records have this FROM value (FROM_SUM).
 * We advance to the next TO block within this FROM block (advancing the
 * record pointer REC1) which also lets us know how many records are in this
 * TO block. So we can compute the consistency contribution made by this
 * particular FROM->TO mapping.
 */
static double
Compute_To_Block(int* rec1, int next_from_block, int from_sum) {
  int rec_bound = Find_Next_To_Block(*rec1, next_from_block);
  int r = *rec1;
  int to_sum = rec_bound - r;
  *rec1 = rec_bound;
  return Consistency_Contrib(to_sum, from_sum);
}

/*
 * The current FROM block contains all the records that have the same
 * predictor values -- identical values for both "source" and "given".
 * We advance REC1 pointer to end of this block, so we know the denominator
 * of our consistency computation.  We now proceed through each TO
 * block in turn -- sets of records that have the same result variable as
 * each other -- and sum up their contribution to the consistency 
 * measure for this FROM block.
 * In the end, consistency of the whole data set will be the weighted
 * average of the consistency of each of these FROM blocks, so we 
 * return this consistency measure multiplied by the number of records
 * in this FROM block.
 */
static double
Compute_From_Block(int* rec1) {
  int from_sum = 0;
  double consistency = 0.0;
  int rec_bound = Find_Next_From_Block(*rec1);
  int rec = *rec1;
  *rec1 = rec_bound;
  from_sum = rec_bound - rec;
  while (rec < rec_bound) {
    consistency += Compute_To_Block(&rec, rec_bound, from_sum);
  }
  #ifdef NO_COMPILE
    (void) fprintf(stderr, "Compute_From_Block: C = %.3f, from_sum = %d\n", 
		 consistency, from_sum);
  #endif
  return consistency * from_sum;
}

/*
 * Computes conditional consistency of the current arrangment of the data.
 * We sort the records so that all records with the same predictor
 * values (source and given) are grouped together, and within those 
 * predictor ("FROM") blocks, all records with the same target variable value
 * are grouped together ("TO") blocks. Now consistency can be computed
 * by using the size of those blocks.
 */
static double
Compute(void) {
  int sum = 0;
  int first_record_in_block = 0;
  double weighted_sum = 0.0;
  Sort_Records();
  while (first_record_in_block < data.n_records) {
    weighted_sum += Compute_From_Block(&first_record_in_block);
  }
  #ifdef NO_COMPILE
  if (verbose) {
    (void) fprintf(stderr, "Weighted sum = %.3f\n", weighted_sum);
  }
  #endif
  //This division turns the weighted sum of the contributions of the FROM 
  // blocks into a real overall (weighted average) consistency measure.
  // Of course, since data.n_records is a constant across all rearrangements,
  // it's really otiose; we really should only do this division at the 
  // very end.
  return weighted_sum / (double) data.n_records;
}

/*
 * Return random integer in range [0..limit)
 */
static int Random_Number(int limit) {
  int r = (int) (((double) limit * rand()) / (RAND_MAX+1.0));
  return (r);
}

/*
 * Randomly redistribute "given" across the records.
 */
static void
Shuffle(void) {
  /* Fisher_Yates_Shuffle, from Perl Cookbook, p. 121  */
  int r;
  for (r = data.n_records - 1; r; r--) {
    int j = Random_Number(r + 1);
    if (r != j) {
      int temp = data.record[r]->given;
      data.record[r]->given = data.record[j]->given;
      data.record[j]->given = temp;
    }
  }
}

/*
 * Print the answers. "p" is probability that contribution of "given"
 * is due to chance. "PMEAN" tells what conditional consistency would be
 * if "given" were unassociated with source->target mapping.
 */
static void
Print_p(int tally, double sum) {
  double denom = (double) iter;
  double mean = sum / denom;
  (void) printf("p\t%f\n", (double) tally / denom);
  (void) printf("PMEAN\t%.3f\n", mean);
}

/*
 * Does the permutation test itself:
 * (1) Compute the conditional consistency of the data ("base").
 * (2) Randomly reassign "given" across all the records.
 * (3) Compute the conditional consistency now.
 * (4) If the new conditional consistency is at least as high as "base",
 *     increment "tally".
 * (5) Go to 2, repeating "iter" times.
 * In the end, "tally" gives us an idea of how likely our "base" results are
 *  due to chance. tally/iter is "p".
 */
static void
Permute_Test(void) {
  double base = Compute();
  int base_mil = (int) (base * 1000.0); //for comparing to 4-digit accuracy
  printf("Base\t%.3f\n", base);
  if (iter > 0) {
    int i;
    double sum = 0.0; //so we can also compute mean conditional consistency
    double permuted;
    int tally = 0;
    printf("ITER\t%d\n", iter);
    for (i = 0; i < iter; i++) {
      int permuted_mil;
      Shuffle();
      permuted = Compute();
      permuted_mil = (int) (permuted * 1000.0);
      if (!(i % 100) && verbose) { //let people know we're still here
	fprintf(stderr, "%5d: %.5f\n", i, permuted);
      }
      if (base_mil <= permuted_mil) {tally++;}
      sum += permuted;
    }
    Print_p(tally, sum);
  }
}

extern int
main(int argc, char* argv[]) {
  Read_Options(argc, argv);
  Read_Data();
  if (verbose > 1) {Dump_Data();}
  Permute_Test();
  return(0);
}
