/* Program to accompany "Monte Carlo simulation for contingency tables in SAS" by Rick Wicklin, published 28OCT2015 on The DO Loop blog http://blogs.sas.com/content/iml/2015/10/28/simulation-exact-tables.html */ /******************************************************************/ /* The following modules are described in the article "Simulate contingency tables with fixed row and column sums in SAS" by Rick Wicklin, published 21OCT2015 on The DO Loop blog http://blogs.sas.com/content/iml/2015/10/21/simulate-contingency-tables-fixed-sums-sas.html */ proc iml; /* output levels and frequencies for categories in x, including all levels in the reference set http://blogs.sas.com/content/iml/2015/10/07/tabulate-counts-ref.html */ start TabulateLevels(OutLevels, OutFreq, x, refSet); call tabulate(levels, freq, x); /* compute the observed frequencies */ OutLevels = union(refSet, levels); /* union of data and reference set */ idx = loc(element(OutLevels, levels)); /* find the observed categories */ OutFreq = j(1, ncol(OutLevels), 0); /* set all frequencies to 0 */ OutFreq[idx] = freq; /* overwrite observed frequencies */ finish; /* http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.226.103&rep=rep1&type=pdf */ /* AGRESTI, WACKERLY, AND BOYETT (1979) Psychometrika */ start RandContingency(_c, _r); c = rowvec(_c); m = ncol(c); r = colvec(_r); n = nrow(r); tbl = j(n, m, 0); refSet = 1:m; vals = repeat(refSet, c); /* NOTE: vector replication */ perm = sample(vals, ,"WOR"); /* n=number of elements of vals */ e = cusum(r); /* ending indices */ s = 1 // (1+e[1:n-1]); /* starting indices */ do j = 1 to n; v = perm[s[j]:e[j]]; call TabulateLevels(lev, freq, v, refSet); tbl[j,] = freq; end; return( tbl ); finish; store module=(TabulateLevels RandContingency); quit; /********************************************************/ /* MC estimates for exact tests */ /********************************************************/ /* Agresti & Wackerly, http://www.stat.ufl.edu/~aa/articles/agresti_wackerly.pdf Table has exact p-value of p=0.0038 for null hypotheses (p. 115) */ title "Data from Agresti, Wackerly, and Boyett"; data TestTable; do Row = 1 to 3; do Col = 1 to 3; input Count @@; output; end; end; datalines; 10 1 6 3 5 0 5 0 1 ; proc freq data=TestTable; weight Count; table Row*Col / norow nocol nopct chisq; exact pchi; /* run an exact chi-square test */ run; ods select PearsonChiSq PearsonChiSqMC; proc freq data=TestTable; weight Count; table Row*Col / norow nocol nopct chisq; exact pchi / MC; /* use MC option to get Monte Carlo estimate */ run; title "Monte Carlo Simulation for Chi-Square Test"; proc iml; load module=(RandContingency); T = {10 1 6, 3 5 0, 5 0 1}; c = T[+, ]; r = T[ ,+]; /**************************************************************/ /* 1. Compute the chi-square statistic on the observed table. */ /**************************************************************/ E = r*c / c[+]; /* expected under null model of independence */ q0 = sum((T-E)##2 / E); /* observed chi-square statistic */ print q0; /**************************************************************/ /* 2. Simulate random tables from the null distribution and evaluate the chi-square statistic on each. */ /**************************************************************/ start MCChiSq(tbl, NRep); c = tbl[+, ]; r = tbl[ ,+]; E = r*c / c[+]; /* expected under null model of independence */ q = j(NRep,1); do i = 1 to nRep; A = RandContingency(c, r); q[i] = sum((A-E)##2 / E); end; return( q ); finish; call randseed(54321); N = 10000; q = MCChiSq(T, N); /**************************************************************/ /* 3. Compute the p-value as the proportion of statistics that are at least as extreme as the observed statistic. */ /**************************************************************/ x = (q>=q0); /* binary vector */ pValue = mean(x); print pValue; /**************************************************************/ /* 4. Optionally, you can compute a Wald confidence interval for the p-value. */ /**************************************************************/ /* Let x be binary 0/1 vector. Compute estimate for proportion. Use asymptotic standard error to construct two-sided Wald confidence interval. For a Monte Carlo estimate, the standard error has N-1 in the denominator. */ start BinomialCI(x, alpha=0.05); p = mean(x); /* estimate proportion of 1s */ se = sqrt(p*(1-p) / (nrow(x)-1)); /* standard error for MC estimate */ z = quantile("Normal", 1-alpha/2); /* two-sided */ LowerCL = p - z*se; UpperCL = p + z*se; return( p || max(LowerCL, 0) || min(UpperCL, 1) ); finish; est = BinomialCI(x, 0.01); /* compute 99% CL to match PROC FREQ */ print est[L="Binomial 99% CI" c={"Est" "LowerCL" "UpperCL"}]; /**************************************************************/ /* 5. Optionally, create a histogram of the 10,000 statistics, which approximate the sampling distribution of the statistic. Mark the value of the observed statistic. */ /**************************************************************/ reflineStmt = "refline " + char(q0) + " / axis=x;"; call Histogram(q) other=reflineStmt;