/* SAS program to accompany the article "Use a fringe plot to visualize binary data in logistic models" by Rick Wicklin, published 31MAY2018 on The DO Loop blog: https://blogs.sas.com/content/iml/2018/05/31/fringe-plot-binary-logistic.html This program shows how to construct a double-sided fringe plot that shows the locations of a binary response when plotted against a continuous covariate or the predicted probabilities. The resulting panel is better than the usual "overlay a scatter plot" method for visualizing the distribution of the horizontal variable in the plot. */ /************************************************ Simulate data, run logistic regression, and display usual predicted probability plot and calibration plot. ************************************************/ ods graphics / reset; /* simulate a sample that follows a quadratic logistic regression model */ ODS NOPROCTITLE; /* suppress procedure titles */ title; %let N = 500; /* sample size */ data LogiSim(keep= Y x); call streaminit(1234); /* set the random number seed */ do i = 1 to &N; /* for each observation in the sample */ /* Hosmer et al. chose x ~ U(-3,3). Use rand("Uniform",-3,3) for modern versions of SAS */ x = -3 + 6*rand("Uniform"); eta = -2.337 + 0.8569*x + 0.3011*x**2; /* quadratic model used by Hosmer et al. */ mu = logistic(eta); Y = rand("Bernoulli", mu); output; end; run; /* Optional: What is proportion of Y=0 and Y=1? */ /* proc freq data=LogiSim; tables y; run; */ /* 1. Use PROC LOGISTIC and output the predicted probabilities and CLs */ ods graphics / width=400px height=300px; proc logistic data=LogiSim plots(only)=FitPlot ; model Y(event='1') = x; ods select EffectPlot; output out=LogiOut predicted=PredProb lower=PredLower upper=PredUpper; run; proc sort data=LogiOut; by PredProb; run; ods graphics / width=320px height=300px; title "Calibration Plot with Binary Response"; proc sgplot data=LogiOut noautolegend aspect=1; loess x=PredProb y=y / interpolation=cubic clm; scatter x=PredProb y=y; lineparm x=0 y=0 slope=1 / lineattrs=(color=grey pattern=dash); run; ods graphics / width=640px height=480px; /*****************************************/ /* Define format used for butterfly plot */ /*****************************************/ proc format; picture positive low-<0="000,000" 0<-high="000,000"; run; /************************************************ A predicted probability plot with binary fringe Compute predicted probability plot and binary (butterfly) fringe plot for X. Plot in a panel. ************************************************/ /* Create panel that overlays FitPlot and Butterfly Fringe Plot */ %let XVar = X; /* continuous explanatory vaiable */ %let YVar = Y; /* binary response variable */ /* 2. Compute the min and max of the continuous explanatory variable */ proc sql noprint; select min(&XVar), max(&XVar), range(&XVar)/100 into :min, :max, :dx from LogiOut; quit; /* 3. Use PROC UNIVARIATE to count the number of X values in each of 100 bins in the range [min, max] for Y=0 and Y=1. */ proc univariate data=LogiOut; class &YVar; var &XVar; histogram &XVar / midpoints=(&min to &max by &dx) outhist=OutHist; /* 100 subdivisions */ ods select histogram; run; /* convert data from long format to wide format */ data ButterflyFringe; keep _MIDPT_ y0 y1; label y0= y1= _MIDPT_=; /* remove labels */ merge OutHist(where=(&YVar=1) rename=(_COUNT_=y1)) OutHist(where=(&YVar=0) rename=(_COUNT_=y0)); by _MIDPT_; /* merge on 'x' */ y0 = -y0; /* trick: reverse the direction */ run; /* 4. Merge the counts with the predicted probabilities. */ proc sort data=LogiOut; by &XVar; run; data LogiPlot; set LogiOut ButterflyFringe; format y0 y1 positive.; /* display counts as positive values */ label y0 = "Count" y1 = "Count"; run; /* 5. Define a GTL template to define a panel plot. The main panel showsn the predicted probabilities and the lower panel shows the binary fringe plot. */ proc template; define statgraph FitPlotFringePanel; dynamic _X _Y _YHIGH _YLOW _XCENTER _COUNTHIGH _COUNTLOW _TITLE; begingraph; entrytitle _TITLE; layout lattice / rowdatarange=data columndatarange=union rows=2 rowgutter=2 rowweights=(0.85 0.15); layout overlay / yaxisopts=( griddisplay=on); bandplot x=_X limitupper=_YHIGH limitlower=_YLOW / name='band' connectorder=axis extend=true; seriesplot x=_X y=_Y / name='Pred' connectorder=xaxis; endlayout; layout overlay / yaxisopts=(label="Count" griddisplay=on); highlowplot x=_XCENTER low=_COUNTLOW high=_COUNTHIGH / name='count' /*type=bar barwidth=0.5*/; /* use TYPE=BAR if you want a histogram look */ referenceline y=0; endlayout; columnaxes; columnaxis / griddisplay=ON; endcolumnaxes; endlayout; endgraph; end; run; /* 6. Use PROC SGRENDER to display the panel. */ title; proc sgrender data=LogiPlot template=FitPlotFringePanel; dynamic _X="&XVar" _Y="PredProb" _YHIGH="PredUpper" _YLOW="PredLower" _XCENTER="_MIDPT_" _COUNTLOW="y0" _COUNTHIGH="y1" _TITLE="Fit Plot with Butterfly Fringe Plot"; run; /************************************************ A calibration plot with binary fringe Compute calibration plot with loess curve and binary (butterfly) fringe plot for X. Plot in a panel. ************************************************/ /* Create panel that overlays FitPlot and Butterfly Fringe Plot */ %let XVar = PredProb; /* predicted probabilities */ %let YVar = Y; /* binary response variable */ /* 2. Use PROC LOESS to regress Y onto the predicted probability. This estimates the empirical probability for each value of the predicted probability. */ proc sort data=LogiOut; by &XVar; run; ods exclude all; proc loess data=LogiOut; model &YVar = &XVar / interp=cubic; output out=LoessOut Predicted LCLM UCLM; run; ods exclude none; /* 3. Use PROC UNIVARIATE to count the number of predicted probabilities for each of 100 bins in the range [0, 1] for Y=0 and Y=1. */ proc univariate data=LogiOut; class &YVar; var &XVar; histogram &XVar / midpoints=(0.005 to 0.995 by 0.01) outhist=OutHist; /* 100 points in [0,1] */ ods select histogram; run; /* convert data from long format to wide format */ data ButterflyFringe; keep _MIDPT_ y0 y1; label y0= y1= _MIDPT_=; /* remove labels */ merge OutHist(where=(&YVar=1) rename=(_COUNT_=y1)) OutHist(where=(&YVar=0) rename=(_COUNT_=y0)); by _MIDPT_; /* merge on 'x' */ y0 = -y0; /* trick: reverse the direction */ run; /* 4. Merge the counts with the predicted probabilities. */ data LogiPlot2; set LoessOut(keep=&XVar &YVar Predicted LowerCL UpperCL) ButterflyFringe; format y0 y1 positive.; /* display counts as positive values */ label y0 = "Count" y1 = "Count"; run; /* 5. Define a GTL template to define a panel plot. The main panel shows the calibration plot and the lower panel shows the binary fringe plot. */ /* Use LAYOUT LATTICE to define a panel plot that shows the butterfly fringe plot */ proc template; define statgraph CalibrationFringePanel; dynamic _X _Y _YHIGH _YLOW _XCENTER _COUNTHIGH _COUNTLOW _TITLE; begingraph; entrytitle _TITLE; layout lattice / rowdatarange=data columndatarange=union rows=2 rowgutter=2 rowweights=(0.85 0.15); layout overlay / yaxisopts=( griddisplay=on); bandplot x=_X limitupper=_YHIGH limitlower=_YLOW / name='band' connectorder=axis extend=true; seriesplot x=_X y=_Y / name='Pred' connectorder=xaxis; lineparm x=0 y=0 slope=1 / clip=TRUE; endlayout; layout overlay / yaxisopts=(label="Count" griddisplay=on); highlowplot x=_XCENTER low=_COUNTLOW high=_COUNTHIGH / name='count' /*type=bar barwidth=0.5*/; /* use TYPE=BAR if you want a histogram look */ referenceline y=0; endlayout; columnaxes; columnaxis / griddisplay=ON; endcolumnaxes; endlayout; endgraph; end; run; title; ods graphics / width=480px height=480px; proc sgrender data=LogiPlot2 template=CalibrationFringePanel; dynamic _X="&XVar" _Y="Predicted" _YHIGH="UpperCL" _YLOW="LowerCL" _XCENTER="_MIDPT_" _COUNTLOW="y0" _COUNTHIGH="y1" _TITLE="Calibration Plot with Butterfly Fringe Plot"; run;