/* SAS program to accompany the article "Compute highest density regions in SAS" by Rick Wicklin published 01Aug2016 on The DO Loop blog: http://blogs.sas.com/content/iml/2016/08/01/highest-density-regions-sas.html */ title "Jittered Scatter Plot of Data"; proc sgplot data=sashelp.iris; scatter x=SepalLength y=SepalWidth / jitter markerattrs=(symbol=CircleFilled size=10) transparency=0.5; xaxis grid; yaxis grid; run; /* define rectangular area on which to estimate density */ %let xmin = 35; %let xmax = 85; /* horizontal range */ %let ymin = 15; %let ymax = 45; /* vertical range */ title "Bivariate Kernel Density Estimate"; proc kde data=sashelp.iris; bivar SepalLength(gridL=&xmin gridU=&xmax) /* optional: bwm=0.8 ngrid=100 */ SepalWidth(gridL=&ymin gridU=&ymax) / /* optional: bwm=0.8 ngrid=100 */ out=KDEOut plots=ContourScatter; ods output Controls=Controls; run; /* What is the maximum value of the density on the boundary of the rectangle defined by (xmin, ymin) and (xmax, ymax)? It needs to be close to zero. */ proc means data=KDEOut max; where value1=&xmin OR value1=&xmax OR value2=&ymin OR value2=&ymax; var density; run; /* The "Levels" table provides bounding rectangles for the levels sets of (1-pct) or density */ /* data Poly; ID="95%"; x=39.24; y=20.59; output; x=79.92; y=20.59; output; x=79.92; y=42.46; output; x=39.24; y=42.46; output; ID="50%"; x=46.86; y=25.17; output; x=71.44; y=25.17; output; x=71.44; y=36.36; output; x=46.86; y=36.36; output; run; data All; set sashelp.iris Poly; run; proc sgplot data=All; polygon id=id x=x y=y / fill group=ID label labelpos=ymax labelattrs=(color=black weight=bold); scatter x=SepalLength y=SepalWidth; run; */ /* Compute area of each cell in grid */ proc iml; use Controls; read all var _NUM_ into Z[rowname=Descr]; close Controls; N = Z[1,]; /* Grid Points */ L = Z[2,]; /* Lower Grid Limit */ U = Z[3,]; /* Upper Grid Limit */ delta = (U - L) / (N-1); /* (dx, dy) */ CellArea = delta[#]; /* area of each cell in grid */ print CellArea; call symputx("CellArea", CellArea); /* create macro variable */ quit; proc sort data=KDEOut; by descending density; run; /* ensure that 0 is the lower limit of the density scale */ data FakeObs; value1=.; value2=.; density=0; run; /* accumulate probability and area */ data KDEOut; set FakeObs KDEOut; /* add fake observation */ Area + &CellArea; /* cumulative area */ prob + &CellArea * Density; /* cumulative probability */ run; title "Approximate 50% HDR"; proc sgplot data=KDEOut aspect=1; where prob <= 0.5; label value1="Sepal Length (mm)" value2="Sepal Width (mm)"; styleattrs wallcolor=CXF0F0F0; /* light gray background */ scatter x=Value1 y=Value2 / markerattrs=(symbol=SquareFilled size=7) colorresponse=Density colormodel=ThreeColorRamp; xaxis min=&xmin max=&xmax grid; yaxis min=&ymin max=&ymax grid; run; proc means data=KDEOut max; where prob <= 0.5; /* 0.5 is for the 50% HDR */ var Area; run; title "Approximate 95% HDR"; proc sgplot data=KDEOut aspect=1; where prob <= 0.95; label value1="Sepal Length (mm)" value2="Sepal Width (mm)"; styleattrs wallcolor=CXF0F0F0; /* light gray background */ scatter x=Value1 y=Value2 / markerattrs=(symbol=SquareFilled size=7) colorresponse=Density colormodel=ThreeColorRamp; xaxis min=&xmin max=&xmax grid; yaxis min=&ymin max=&ymax grid; run; proc means data=KDEOut max; where prob <= 0.95; /* 0.95 is for the 50% HDR */ var Area; run; /******************************************************/ /******************************************************/ /* Compare HDRs for bivariate normal data with */ /* prediciton ellipses. */ /******************************************************/ title "Bivariate Normal Data"; /******************************************************/ /* http://blogs.sas.com/content/iml/2014/07/23/prediction-ellipses-from-covariance.html */ /******************************************************/ proc iml; start PredEllipseFromCov(m, S, n, ConfLevel=0.95, nPts=128); /* Compute prediction ellipses centered at m from the 2x2 covariance matrix S. The return matrix is a matrix with three columns. Col 1: The X coordinate of the ellipse for the confidence level. Col 2: The Y coordinate of the ellipse for the confidence level. Col 3: The confidence level. Use this column to extract a single ellipse. Input parameters: m a 1x2 vector that specifies the center of the ellipse. This can be the sample mean or median. S a 2x2 symmetric positive definite matrix. This can be the sample covariance matrix or a robust estimate of the covariance. n The number of nonmissing observations in the data. ConfLevel a 1 x k vector of (1-alpha) confidence levels that determine the ellipses. Each entry is 0 < ConfLevel[i] < 1. For example, 0.95 produces the 95% confidence ellipse for alpha=0.05. nPts the number of points used to draw the ellipse. The default is 0.95. */ /* parameterize standard ellipse in coords of eigenvectors */ call eigen(lambda, evec, S); /* eigenvectors are columns of evec */ t = 2*constant("Pi") * (0:nPts-1) / nPts; xy = sqrt(lambda[1])*cos(t) // sqrt(lambda[2])*sin(t); stdEllipse = T( evec * xy ); /* transpose for convenience */ /* scale the ellipse; see documentation for PROC CORR */ c = 2 * (n-1)/n * (n+1)/(n-2); /* adjust for finite sample size */ p = rowvec(ConfLevel); /* row vector of confidence levels */ F = sqrt(c * quantile("F", p, 2, n-2)); /* p = 1-alpha */ ellipse = j(ncol(p)*nPts, 3); /* 3 cols: x y p */ startIdx = 1; /* starting index for next ellipse */ do i = 1 to ncol(p); /* scale and translate */ idx = startIdx:startIdx+nPts-1; ellipse[idx, 1:2] = F[i] # stdEllipse + m; ellipse[idx, 3] = p[i]; /* use confidence level as ID */ startIdx = startIdx + nPts; end; return( ellipse ); finish; /* Bivariate normal parameters */ N = 1000; mean = {60 86}; cov = {84 24, 24 28}; /* generate data from MVN(mean, cov) */ call randseed(1234); X = randnormal(N, mean, cov); *call scatter(X[,1], X[,2]); create MVN from X[c={"X" "Y"}]; append from X; close MVN; mean = mean(X); /* use empirical estimates instead of population parameters */ cov = cov(X); ConfLevel = {0.5 0.90}; classicalEllipse = PredEllipseFromCov(mean, cov, N, ConfLevel); /* write the ellipse data to a SAS data set */ E = classicalEllipse; create Ellipse from E[c={"classX" "classY" "ConfLevel"}]; append from E; close Ellipse; /* compute area of ellipses */ package load polygon; A = PolyArea(E); print A[r=(char(ConfLevel)) L="Area"]; call symputx("A50", A[1]); /* create macro variable */ call symputx("A90", A[2]); /* create macro variable */ quit; data All; set MVN Ellipse; run; %let xmin=30; %let xmax=95; %let ymin=70; %let ymax=105; /*****************************************/ title "Bivariate Kernel Density Estimate"; ods exclude all; proc kde data=MVN; bivar x(gridL=&xmin gridU=&xmax bwm=0.8) y(gridL=&ymin gridU=&ymax bwm=0.8) / out=KDEOut plots=NONE /*ContourScatter*/; ods output Controls=Controls; run; ods exclude none; /* Compute area of each cell in grid */ proc iml; use Controls; read all var _NUM_ into Z[rowname=Descr]; close Controls; N = Z[1,]; /* Grid Points */ L = Z[2,]; /* Lower Grid Limit */ U = Z[3,]; /* Upper Grid Limit */ delta = (U - L) / (N-1); /* (dx, dy) */ CellArea = delta[#]; /* area of each cell in grid */ call symputx("CellArea", CellArea); /* create macro variable */ quit; proc sort data=KDEOut; by descending density; run; /* ensure that 0 is the lower limit of the density scale */ data FakeObs; value1=.; value2=.; density=0; run; /* accumulate probability and area */ data KDEOut; set FakeObs KDEOut; /* add fake observation */ Area + &CellArea; /* cumulative area */ prob + &CellArea * Density; /* cumulative probability */ run; data K1; set KDEOut(where=(prob < 0.5)) Ellipse(where=(ConfLevel=0.5)); run; data K2; set KDEOut(where=(prob < 0.90)) Ellipse(where=(ConfLevel=0.90)); run; title "Compare Areas"; proc means data=K1 max; var Area; run; proc means data=K2 max; var Area; run; title "Prediction Ellipse for Bivariate Data"; title2 "Areas: A50=&A50 and A90=&A90"; proc sgplot data=All aspect=1 noautolegend; polygon x=classX y=classY id=ConfLevel / lineattrs=GraphData1 label labelpos=ymax; scatter x=x y=y / transparency=0.9 markerattrs=(symbol=CircleFilled size=12); xaxis min=&xmin max=&xmax grid; yaxis min=&ymin max=&ymax grid; run; title "Approximate 50% HDR"; title2 "Overlay with 50% Prediction Ellipse for Bivariate Normal Data"; proc sgplot data=K1 aspect=1 noautolegend; styleattrs wallcolor=CXF0F0F0; /* light gray background */ scatter x=Value1 y=Value2 / markerattrs=(symbol=SquareFilled size=7) colorresponse=Density colormodel=ThreeColorAltRamp; polygon x=classX y=classY id=ConfLevel / lineattrs=GraphData1 label labelpos=ymax; xaxis min=&xmin max=&xmax grid; yaxis min=&ymin max=&ymax grid; run; title "Approximate 90% HDR"; title2 "Overlay with 90% Prediction Ellipse for Bivariate Normal Data"; proc sgplot data=K2 aspect=1 noautolegend; styleattrs wallcolor=CXF0F0F0; /* light gray background */ scatter x=Value1 y=Value2 / markerattrs=(symbol=SquareFilled size=7) colorresponse=Density colormodel=ThreeColorRamp; polygon x=classX y=classY id=ConfLevel / lineattrs=GraphData1 label labelpos=ymax; xaxis min=&xmin max=&xmax grid; yaxis min=&ymin max=&ymax grid; run;