proc iml; /* output levels and frequencies for categories in x, including all levels in the reference set http://blogs.sas.com/content/iml/2015/10/07/tabulate-counts-ref.html */ start TabulateLevels(OutLevels, OutFreq, x, refSet); call tabulate(levels, freq, x); /* compute the observed frequencies */ OutLevels = union(refSet, levels); /* union of data and reference set */ idx = loc(element(OutLevels, levels)); /* find the observed categories */ OutFreq = j(1, ncol(OutLevels), 0); /* set all frequencies to 0 */ OutFreq[idx] = freq; /* overwrite observed frequencies */ finish; /* There are K[i] balls of color i in an urn. Draw N balls without replacement and report the number of each color. N[1] = sample size N[2] = number of replications (optional. Default is 1) K = vector that gives the number balls of each color. The total number of balls is sum(K). */ start RandMVHyper(N, _K); if nrow(N)*ncol(N)>1 then nRep = N[2]; else nRep = 1; K = rowvec(_K); /* K[i] is number of items for category i */ nDraw = j(nRep, 1, N[1]); /* run nRep sims at once */ ItemsLeft = j(nRep, 1, sum(K)); x = j(nRep, ncol(K), 0); /* each row is draw from MV hyper */ h = j(nRep, 1, 0); /* vec for hypergeometric values */ do i = 1 to ncol(K)-1; Kvec = j(nRep, 1, K[i]); idx0 = loc(nDraw=0); if ncol(idx0)=0 then do; /* usual case */ call randgen(h, "Hyper", ItemsLeft, Kvec, nDraw); x[,i] = h; ItemsLeft = ItemsLeft - K[i]; /* update parameters */ nDraw = nDraw - h; end; else do; /* for some replicate, all balls have been drawn but there are still more colors to draw (which will be zero). This can happen when the last few colors are sparsely represented, or in small samples such as drawing 5 balls from K = {5 2 2} */ x[idx0,i] = 0; idx1 = loc(nDraw>0); if ncol(idx)>0 then do; hh = idx1`; /* allocate */ call randgen(hh, "Hyper", ItemsLeft[idx1], Kvec[idx1], nDraw[idx1]); x[idx1,i] = hh; ItemsLeft = ItemsLeft - K[i]; /* update parameters */ nDraw[idx1] = nDraw[idx1] - hh; end; end; end; x[,ncol(K)] = nDraw; return (x); finish; /* Generate random contingency table with given row and column marginal sums. Each column is a draw from a multivariate hypergeometric distribution. Algorithm based on a paragraph in Gentle, 2003, p. 206. Is there a better reference? */ start RandContingencyHyper(_c, _r); c = rowvec(_c); m = ncol(c); r = colvec(_r); n = nrow(r); tbl = j(n, m, 0); do j = 1 to m-1; idx = loc(r > 0); /* in rare cases, row sums complete when still more columns to fill */ if ncol(idx)>0 then do; v = T( RandMVHyper(c[j], r[idx]) ); /* transpose to col vector */ tbl[idx,j] = v; r[idx] = r[idx] - v; end; end; tbl[,m] = r; /* last column is constrained */ return( tbl ); finish; /* http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.226.103&rep=rep1&type=pdf */ /* AGRESTI, WACKERLY, AND BOYETT (1979) Psychometrika */ start RandContingency(_c, _r); c = rowvec(_c); m = ncol(c); r = colvec(_r); n = nrow(r); tbl = j(n, m, 0); refSet = 1:m; vals = repeat(refSet, c); /* NOTE: vector replication */ perm = sample(vals, ,"WOR"); /* n=number of elements of vals */ e = cusum(r); /* ending indices */ s = 1 // (1+e[1:n-1]); /* starting indices */ do j = 1 to n; v = perm[s[j]:e[j]]; call TabulateLevels(lev, freq, v, refSet); tbl[j,] = freq; end; return( tbl ); finish; /*******************************************************/ title "Data from Agresti, Wackerly, and Boyett, p. 79"; call randseed(12345); Table = {10 1 6, 3 5 0, 5 0 1}; print Table; c = Table[+, ]; r = Table[ ,+]; do i = 1 to 3; RandTbl = RandContingency(c, r); print RandTbl; end; t0=time(); do i = 1 to 10000; tbl = RandContingency(c, r); end; t1 = time()-t0; print t1[L="Time for 10,000 Agresti, Wackerly, Boyett"]; /**********************************************/ /* compare with MV hypergeometric */ do i = 1 to 3; tblMVH = RandContingencyHyper(c, r); print tblMVH; end; t0=time(); do i = 1 to 10000; tblMVH = RandContingencyHyper(c, r); end; t1 = time()-t0; print t1[L="Time for 10,000 MV Hypergeometric"]; /***********************************************/ /* try a big 500 x 20 table */ r = j(500, 1, 60); c = j( 1, 20, 1500); print (sum(c)) (sum(r)); t0 = time(); tbl = RandContingency(c, r); t1 = time()-t0; print t1[L="Agresti algorithm"]; ods graphics / width=600px height=2000px; call heatmapcont(tbl); cc = tbl[+, ]; rr = tbl[ ,+]; if any(r^=rr) then print "row sums do not match";else print "row sums OK"; if any(c^=cc) then print "col sums do not match";else print "col sums OK"; call tabulate(lev, freq, tbl); /* tabulate counts for the 500x20=10,000 cells */ print lev, freq; print (min(tbl))[L="min"] (max(tbl))[L="max"]; t0=time(); do i = 1 to 1000; tbl = RandContingency(c, r); end; t1 = time()-t0; print t1[L="Time for 1000 Large (Agresti, Wackerly, Boyett)"];