/* Program to accompany 
   "Monte Carlo simulation for contingency tables in SAS"
   by Rick Wicklin, published 28OCT2015 on The DO Loop blog
   http://blogs.sas.com/content/iml/2015/10/28/simulation-exact-tables.html
*/

/******************************************************************/
/* The following modules are described in the article
   "Simulate contingency tables with fixed row and column sums in SAS"
   by Rick Wicklin, published 21OCT2015 on The DO Loop blog
   http://blogs.sas.com/content/iml/2015/10/21/simulate-contingency-tables-fixed-sums-sas.html
*/
proc iml;
/* output levels and frequencies for categories in x, including all 
   levels in the reference set
   http://blogs.sas.com/content/iml/2015/10/07/tabulate-counts-ref.html */
start TabulateLevels(OutLevels, OutFreq, x, refSet);
   call tabulate(levels, freq, x);        /* compute the observed frequencies */
   OutLevels = union(refSet, levels);     /* union of data and reference set */ 
   idx = loc(element(OutLevels, levels)); /* find the observed categories */
   OutFreq = j(1, ncol(OutLevels), 0);    /* set all frequencies to 0 */
   OutFreq[idx] = freq;                   /* overwrite observed frequencies */
finish;

/* http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.226.103&rep=rep1&type=pdf */
/* AGRESTI,  WACKERLY, AND BOYETT (1979) Psychometrika */
start RandContingency(_c, _r);
   c = rowvec(_c);   m = ncol(c);
   r = colvec(_r);   n = nrow(r);
   tbl = j(n, m, 0);
   refSet = 1:m;
   vals = repeat(refSet, c);  /* NOTE: vector replication */
   perm = sample(vals, ,"WOR"); /* n=number of elements of vals */
   e = cusum(r);              /* ending indices */
   s = 1 // (1+e[1:n-1]);     /* starting indices */
   do j = 1 to n;
      v = perm[s[j]:e[j]];
      call TabulateLevels(lev, freq, v, refSet);
      tbl[j,] = freq;
   end;
   return( tbl );
finish;
store module=(TabulateLevels RandContingency);
quit;

/********************************************************/
/* MC estimates for exact tests  */
/********************************************************/
/* Agresti & Wackerly, http://www.stat.ufl.edu/~aa/articles/agresti_wackerly.pdf
   Table has exact p-value of p=0.0038 for null hypotheses (p. 115) */
title "Data from Agresti, Wackerly, and Boyett";
data TestTable;
do Row = 1 to 3;
   do Col = 1 to 3;
      input Count @@;
      output;
   end;
end;
datalines;
10  1  6
 3  5  0
 5  0  1
;

proc freq data=TestTable;
weight Count;
table Row*Col / norow nocol nopct chisq;
exact pchi;  /* run an exact chi-square test */
run;

ods select PearsonChiSq PearsonChiSqMC;
proc freq data=TestTable;
weight Count;
table Row*Col / norow nocol nopct chisq;
exact pchi / MC; /* use MC option to get Monte Carlo estimate */
run;

title "Monte Carlo Simulation for Chi-Square Test";

proc iml;
load module=(RandContingency);
T = {10 1 6,
      3 5 0,
      5 0 1};
c = T[+, ];  r = T[ ,+];

/**************************************************************/
/* 1. Compute the chi-square statistic on the observed table. */
/**************************************************************/

E = r*c / c[+];         /* expected under null model of independence */
q0 = sum((T-E)##2 / E); /* observed chi-square statistic */
print q0;

/**************************************************************/
/* 2. Simulate random tables from the null distribution and 
      evaluate the chi-square statistic on each. */
/**************************************************************/

start MCChiSq(tbl, NRep);
   c = tbl[+, ];
   r = tbl[ ,+];
   E = r*c / c[+];   /* expected under null model of independence */
   q = j(NRep,1);
   do i = 1 to nRep;
      A = RandContingency(c, r);
      q[i] = sum((A-E)##2 / E);
   end;
   return( q );
finish;

call randseed(54321);
N = 10000;
q = MCChiSq(T, N);

/**************************************************************/
/* 3. Compute the p-value as the proportion of statistics that 
      are at least as extreme as the observed statistic. */
/**************************************************************/

x = (q>=q0);                 /* binary vector */
pValue = mean(x);
print pValue;

/**************************************************************/
/* 4. Optionally, you can compute a Wald confidence interval 
      for the p-value. */
/**************************************************************/

/* Let x be binary 0/1 vector. Compute estimate for 
   proportion. Use asymptotic standard error to 
   construct two-sided Wald confidence interval. For a Monte Carlo
   estimate, the standard error has N-1 in the denominator. */
start BinomialCI(x, alpha=0.05);
   p = mean(x);                       /* estimate proportion of 1s */
   se = sqrt(p*(1-p) / (nrow(x)-1));  /* standard error for MC estimate */
   z = quantile("Normal", 1-alpha/2); /* two-sided */
   LowerCL = p - z*se;
   UpperCL = p + z*se;
   return( p || max(LowerCL, 0) || min(UpperCL, 1) );
finish;

est = BinomialCI(x, 0.01);   /* compute 99% CL to match PROC FREQ */
print est[L="Binomial 99% CI" c={"Est" "LowerCL" "UpperCL"}];


/**************************************************************/
/* 5. Optionally, create a histogram of the 10,000 statistics, 
      which approximate the sampling distribution of the 
      statistic.  Mark the value of the observed statistic. */
/**************************************************************/

reflineStmt = "refline " + char(q0) + " / axis=x;";
call Histogram(q) other=reflineStmt;