/* Program to accompany the article "Approximating a distribution from published quantiles" 18JUN2014 Rick Wicklin http://blogs.sas.com/content/iml/2014/06/18/distribution-from-quantiles */ proc iml; /* quantiles of total cholesterol from NHANES study */ Quantile = {0 , .01, .05, .10, .25, .50, .75, .90, .95, .99, 1}; Estimate = {80, 128, 151, 162, 184, 209, 236, 265, 284, 333, 727}; print Quantile[format=percent6.] Estimate; title "Approximate Empirical Distribution Function"; footnote justify=left "NHANES 1999-2002: Males, age 40-59"; /* use 12.3 function to graph ECDF */ call Series(Estimate,Quantile) grid={X Y} label="Total Cholesterol" option="Markers"; /*****************************************************************/ /* interpolation code from http://blogs.sas.com/content/iml/2012/03/16/linear-interpolation-in-sas/ */ start Last(x); /* Helper module: return last element in a vector */ return( x[nrow(x)*ncol(x)] ); finish; /* Interpolate such that (x,y) is on line segment between (x1,y1) and (x2,y2) */ start LinInterpPt(x1, x2, y1, y2, x); m = (y2-y1)/(x2-x1); /* slope */ return( m#(x-x1) + y1 ); /* point-slope formula */ finish; /* Linear interpolation: simple version */ start LinInterp1(x, y, v); /* Given column vectors (x, y), interpolate values for column vector v. Assume: 1. no missing values in x, y, or v 2. the values of x are unique and sorted 3. each element of v is in the interval [minX, maxX) */ fv = j(nrow(v),1); /* allocate return vector */ do i = 1 to nrow(v); k = Last( loc(x <= v[i] )); /* largest x less than v[i] */ fv[i] = LinInterpPt(x[k], x[k+1], y[k], y[k+1], v[i]); end; return( fv ); finish; /*****************************************************************/ /* simulate data from the piecewise linear approximate ECDF */ N = 10000; call randseed(1); U = j(N,1); call randgen(U, "uniform"); x = LinInterp1(Quantile, Estimate, U); /* inverse function is also PL linear */ x = round(x); /* cholesterol is integer valued */ title "Simulated Values of Total Cholesterol"; footnote justify=left "Simulated Males, age 40-59"; /* use 12.3 function to graph distribution of simulated data */ call histogram(x) label="Total Cholesterol"; call qntl(SimEst, x, Quantile); print Quantile[F=percent6.] Estimate SimEst[F=5.1];