/* SAS program to accompany the article "A zipper plot for visualizing coverage probability in simulation studies" by Rick Wicklin, published 26MAR2018 on The DO Loop blog: https://blogs.sas.com/content/iml/2018/03/26/zipper-plot.html This program shows how to create a zipper plot for simulation studies that generate confidence intervals. The zipper plots help to display the empirical coverage probabilities for various data generating mechanisms, those that satisfy the assumptions of the null hypothesis and those that violate the assumption. The zipper plot is shown in Morris, White, and Crowther, 2017, p. 22, "Using simulation studies to evaluate statistical methods" https://arxiv.org/pdf/1712.03198.pdf although the Morris et al. call the plot a "zip plot." */ /* Simulate the data. X ~ N(0,1) and Y+1 ~ Exp(1) */ %let N = 50; /* size of each sample */ %let NumSamples = 10000; /* number of samples */ /* 1. Simulate samples from N(0,1) */ data Sim(drop=i); call streaminit(12345); do SampleID = 1 to &NumSamples; /* simulation loop */ do i = 1 to &N; /* N obs in each sample */ x = rand("Normal"); /* X ~ N(0,1) */ y = rand("Expo") - 1; /* Y + 1 ~ Exp(1) */ output; end; end; run; /**********************************************************************/ %macro ZipperPlot(VarName=, InDSName=, OutDSName=, FractionMaxDisplay=); /* 2. Compute normal CI for each sample */ proc means data=&InDSName noprint; by SampleID; var &varName; output out=OutStats mean=SampleMean lclm=Lower uclm=Upper probt=pvalue; run; /* 3. Compute MC estimate of coverage and CI for estimate */ /* 3a. Create indicator variable: Does CI include parameter mu=0? */ data OutStats; set OutStats; drop _TYPE_; ParamInCI = (Lower <= 0 <= Upper); /* indicator variable for empirical coverage */ run; /* 3b. Nominal coverage probability is 95%. Estimate true coverage. */ proc freq data=OutStats ; tables ParamInCI / nocum binomial(level='1' p=0.95); output out=Est binomial; run; /* 4. Create macro variables and auxiliary data to use in zipper plot */ /* 4a. Define macro variables for min(CI), max(CI), MC Estimate of coverage prob, and MC CI for coverage */ proc sql noprint; select min(Lower), max(Upper) into :min, :max from OutStats; quit; data _null_; set Est; call symput("pHat", put(_BIN_,5.3)); /* Empirical estimate of coverage */ call symput("pL", put(L_BIN,5.3)); /* 95% CI for estimate: Lower limit */ call symput("pU", put(U_BIN,5.3)); /* Upper limit */ run; /* 4b. Rank p-values and sort by rank */ proc rank data=OutStats out=Ranks fraction; var pvalue; ranks Fraction; run; /* if you want to reproduce the graphs in Morris et al 2017, set FractionMaxDisplay=1.0 and use the following procedure steps: */ /* proc rank data=OutStats out=Ranks group=100; var pvalue; ranks Fraction; run; data Ranks; set Ranks; fraction = fraction / 100; run; */ proc sort data=Ranks; by Fraction; run; /* Define a data set to support the BAND statement that shows the MC CI for the coverage */ data Band; rangeMin = &min - (&max - &min); /* make band extend beyond [min, max] */ rangeMax = &max + (&max - &min); Fraction = 1 - &pU; output; Fraction = 1 - &pL; output; run; /* Define a data set to support the REFLINE statement that shows the MC estimate of coverage and the coverage CI */ data Refline; Refline = 1 - &pL; output; /* 1 - covereage */ Refline = 1 - &pHat; output; Refline = 1 - &pU; output; run; /* 5. Create the zipper plot. Show each CI in simulation with p-value in range (0, &PValueMaxDisplay] */ /* define format for 0/1 indicator variable */ proc format; value YorN 0="Non-Coverer" 1="Coverer"; run; data &OutDSName; set Ranks Band Refline; label Fraction="Proportion of Ranked p-Values for Null"; format ParamInCI YorN.; run; title2 "Estimate: &pHat; CI: [&pL, &pU]"; proc sgplot data=&OutDSName; where Fraction < &FractionMaxDisplay; band y=Fraction lower=rangeMin upper=rangeMax; highlow y=Fraction low=Lower high=Upper / group=ParamInCI nomissinggroup transparency=0.5 name="CI"; refline 0 / axis=x; refline Refline/ axis=y lineattrs=(color=black); xaxis label="95% CI for Mean" min=&min max=&max; yaxis reverse offsetmin=0.02 offsetmax=0.01 grid; keylegend "CI" / title=""; run; /* Or you can create a horizontal zipper plot */ /* proc sgplot data=&OutDSName; where Fraction < &FractionMaxDisplay; band x=Fraction lower=rangeMin upper=rangeMax; highlow x=Fraction low=Lower high=Upper / group=ParamInCI nomissinggroup transparency=0.5 name="CI"; refline 0 / axis=y; refline Refline/ axis=x lineattrs=(color=black); yaxis label="95% CI for Mean" min=&min max=&max reverse; xaxis offsetmin=0.02 offsetmax=0.01 grid; keylegend "CI" / title=""; run; */ %mend; /* create zipper plot for x (normal data) */ ods graphics / width=350px height=400px; title "Empirical Coverage: Normal Data, All Intervals"; %ZipperPlot(VarName = x, InDSName = Sim, OutDSName = ZipperX, FractionMaxDisplay = 1.0); title "Empirical Coverage of Normal Data"; %ZipperPlot(VarName = x, InDSName = Sim, OutDSName = ZipperX, FractionMaxDisplay = 0.20); /* now do the same thing for y (exponential data) */ title "Empirical Coverage of Exponential Data"; %ZipperPlot(VarName = y, InDSName = Sim, OutDSName = ZipperY, FractionMaxDisplay = 0.20); /******************************************************/ /* combine both zipper plots into a panel */ %let FractionMaxDisplay = 0.2; data All; length Distribution $12; /* name of population distribution */ set ZipperX(in=Normal) ZipperY; if Normal then Distribution = "Normal"; else Distribution = "Exponential"; run; ods graphics / width=400px height=400px;; title "Comparison of Coverage Probabilities"; proc sgpanel data=All; where Fraction < &FractionMaxDisplay; panelby Distribution / sort=descformat; band y=Fraction lower=rangeMin upper=rangeMax; highlow y=Fraction low=Lower high=Upper / group=ParamInCI nomissinggroup transparency=0.5 name="CI"; refline 0 / axis=x; refline Refline / axis=y lineattrs=(color=black); colaxis label="95% CI for Mean" min=-1 max=1; rowaxis reverse offsetmin=0.02 offsetmax=0.01 grid; keylegend "CI" / title=""; run;