/* Program to accompany
   "3 ways to visualize prediction regions for classification problems"
   by Rick Wicklin. Published 17JUL2017 on The DO Loop blog
   http://blogs.sas.com/content/iml/2017/07/15/prediction-regions-classification.html
*/
title; footnote;                            /* clear any titles and footnotes already in effect */
ods graphics / reset antialias subpixel;

/* Simulated data from an example in _Simulating Data with SAS_, p. 226--229
   and http://blogs.sas.com/content/iml/2014/06/25/simulate-logistic-data.html
   followed by macro call
      %data2datastep(LogisticData,work,c:/temp/ds.txt,);
   from http://blogs.sas.com/content/sastraining/2016/03/11/jedi-sas-tricks-data-to-data-step-macro/
*/
/* y  = binary response (0 or 1)
   x1 = first explanatory variable
   x2 = second explanatory variable
   The trailing @@ holds the input line so that several (y, x1, x2)
   triples can be read from each data line. */
data LogisticData;
input y x1 x2 @@;
datalines;
0 0.88 0.37 1 0.97 3.09 0 0.51 -1.72 0 0.89 -0.75 1 0.69 -0.04 0 0.94 2.21
0 0.93 0.66 1 0.49 -2.13 0 0.78 -2.06 0 0.88 0.90 1 0.57 4.04 0 0.67 -0.78
0 0.82 -4.06 0 0.93 -0.33 1 0.79 2.02 1 0.11 -0.62 1 0.89 4.99 0 0.54 -2.79
1 0.28 -0.43 0 0.93 -3.93 1 0.61 0.78 1 0.86 0.89 0 0.61 0.12 0 0.47 1.46
0 0.59 0.58 0 0.77 -0.72 0 0.44 -1.45 1 0.08 0.16 1 0.91 3.17 1 0.55 1.64
0 0.67 1.98 1 0.24 -2.53 1 0.14 3.88 1 0.33 2.03 0 0.33 -2.00 0 0.75 0.68
1 0.69 2.91 0 0.62 2.03 0 0.79 0.25 1 0.99 5.99 1 0.16 2.84 1 0.38 0.42
1 0.32 -0.14 1 0.07 -1.06 1 0.48 -2.08 0 0.12 -0.57 1 0.39 1.14 1 0.70 -0.16
1 0.67 -0.69 1 0.48 1.16 0 0.92 2.48 1 0.05 -0.14 1 0.41 0.15 0 0.13 -0.39
0 0.53 -1.70 1 0.56 1.97 0 0.63 3.52 0 0.71 -1.85 1 0.36 -1.45 0 0.30 -1.52
0 0.32 -1.91 0 0.38 -2.68 0 0.01 -1.88 0 0.65 0.00 1 0.75 -2.31 1 0.02 1.98
0 0.82 0.91 1 0.40 0.74 1 0.43 -0.93 1 0.52 1.04 1 0.37 -0.52 1 0.18 -1.39
1 0.31 2.50 0 0.42 -0.32 0 0.84 0.67 1 0.31 2.75 0 0.39 -2.78 0 0.49 -3.27
0 0.26 -2.51 0 0.88 -0.15 1 0.44 1.91 0 1.00 0.33 0 0.84 -1.83 0 0.77 -1.23
0 0.43 -0.89 1 0.58 2.06 0 0.95 -1.52 0 0.37 -4.53 0 0.83 -1.37 1 0.51 1.89
1 0.30 0.52 1 0.40 4.49 0 0.40 -2.67 0 0.86 -0.77 1 0.04 -0.65 0 0.99 1.13
1 0.43 1.50 1 0.43 -0.25 1 0.85 1.63 1 0.01 1.59 1 0.53 1.40 1 0.76 2.41
0 0.61 -6.87 0 0.36 -0.60 0 0.88 -0.45 0 0.46 -0.31 1 0.65 5.73 1 0.25 3.13
1 0.46 2.38 1 0.21 1.43 1 0.92 2.71 1 0.23 2.32 1 0.51 1.68 0 0.70 -1.21
0 0.19 -2.25 1 0.69 3.03 0 0.23 -1.56 1 0.02 -2.85 1 0.10 0.59 1 0.62 0.92
1 0.43 1.16 0 0.74 -2.42 0 0.97 2.85 1 0.04 0.27 0 0.66 0.20 1 0.06 -0.18
0 0.27 -0.26 0 0.72 0.00 1 0.36 0.99 1 0.22 4.02 1 0.15 -1.90 1 0.14 -3.73
0 0.12 0.67 0 0.38 -1.21 1 0.30 -0.34 0 0.81 0.81 0 0.80 -2.81 0 0.91 0.31
0 0.38 -3.38 1 0.12 1.60 0 0.99 -0.93 1 0.19 2.13 0 0.65 -0.92 1 0.50 -0.45
0 0.19 -2.46 0 0.61 0.24 0 0.36 -0.53 1 0.29 -1.59 0 0.05 -3.07 0 0.44 1.51
;

/* plot raw data, colored by response */
title "Data with Binary Response";
proc sgplot data=LogisticData;
   scatter x=x1 y=x2 / group=y markerattrs=(symbol=CircleFilled);
run;

/* Three ways to visualize prediction regions for classification problems
   1: Polygon regions from parametric model
   2: Contour plot with scatterplot overlay
   3: Evaluate model on dense grid. Plot colored markers to indicate regions
*/

/* fit predictive model */
proc logistic data=LogisticData;
   model y(Event='1') = x1 x2;
   store work.LogiModel;                    /* save model to item store */
   effectplot contour(x=x1 y=x2);           /* 2. contour plot with scatter plot overlay */
   ods select ParameterEstimates            /* contains parametric formula for boundary */
              ContourFitPlot;               /* the contour plot and overlay */
run;

/* No matching ODS TRACE ON is visible in this file; the statement is kept so
   that tracing is switched off if it was enabled interactively. */
ods trace off;

/* Write the source of a graph template to the log; presumably this is the
   template behind the ContourFitPlot requested above. */
proc template;
   source Stat.Lmr.Graphics.ContourFit;
run;

/* Create a regular 51 x 51 grid in (x1,x2) coordinates:
      x1 = 0, 0.02, ..., 1      x2 = -7.5, -7.2, ..., 7.5
   The loops run over integer counters and derive each coordinate from the
   counter. Looping directly with a fractional BY increment (0.02 or 0.3)
   accumulates binary rounding error, so grid values drift away from the
   intended ones and the final row or column can be skipped when the running
   sum overshoots the stop value. */
data Grid;
   drop i j;                                /* counters are not part of the grid */
   do i = 0 to 50;
      x1 = i / 50;
      do j = -25 to 25;
         x2 = 3 * j / 10;
         output;
      end;
   end;
run;

/* use PROC PLM to score model on a grid.
See http://blogs.sas.com/content/iml/2014/02/19/scoring-a-regression-model-in-sas.html */
proc plm restore=work.LogiModel;
   /* Evaluate the stored model at each grid point. ILINK requests predictions
      on the probability scale. The grid coordinates are renamed to (gx, gy)
      so they do not collide with x1 and x2 of the observed data. */
   score data=Grid
         out=Pred(rename=(x1=gx x2=gy))
         / ilink;
run;

/* Stack the observed data on top of the scored grid.
   Rows from LogisticData carry y, x1, x2; rows from Pred carry gx, gy, Predicted. */
data All;
   set LogisticData Pred;
   /* 0 or 1 indicator for most probable outcome.
      Predicted is missing on the LogisticData rows, and a missing value
      compares as smaller than 0.5, so those rows get PredCat=0. */
   PredCat = (Predicted > 0.5);
   /* for more than two outcomes, choose outcome that has highest probability */
run;

/************************************************/
/* 1. Polygon region.
   For regions that are bounded by a parametric curve, you can use the
   POLYGON statement to form the regions. The parameter estimates from
   PROC LOGISTIC indicate that the boundary is the level set
      2.3565 -4.7618*x1 + 0.7959*x2 = 0
   (the probability is 0.5 on this curve), or
      x2 = (-2.3565 + 4.7618*x1) / 0.7959
*/
data polygon;
   drop b0 b1 b2 yLo yHi;                   /* helper constants, not polygon vertices */
   b0 = 2.3565;   b1 = -4.7618;   b2 = 0.7959;   /* estimates: intercept, x1, x2 */
   yLo = -7.5;    yHi = 7.5;                     /* vertical extent of the regions */

   /* ID=0: quadrilateral between x2 = yLo and the boundary line */
   ID = 0;
   px1 = 0;   px2 = yLo;                    output;   /* bottom left  */
   px1 = 1;   px2 = yLo;                    output;   /* bottom right */
   px1 = 1;   px2 = (-b0 - b1*px1) / b2;    output;   /* top right    */
   px1 = 0;   px2 = (-b0 - b1*px1) / b2;    output;   /* top left     */

   /* ID=1: quadrilateral between the boundary line and x2 = yHi */
   ID = 1;
   px1 = 0;   px2 = (-b0 - b1*px1) / b2;    output;   /* bottom left  */
   px1 = 1;   px2 = (-b0 - b1*px1) / b2;    output;   /* bottom right */
   px1 = 1;   px2 = yHi;                    output;   /* top right    */
   px1 = 0;   px2 = yHi;                    output;   /* top left     */
run;

/* polygon vertices first, then the observations to overlay on them */
data PolyRegion;
   set polygon LogisticData;
run;

proc sgplot data=PolyRegion;
   /* semi-transparent filled regions, one per value of ID */
   polygon x=px1 y=px2 id=ID /
           fill outline transparency=0.5 group=ID;
   /* observed data drawn on top, colored by the actual response */
   scatter x=x1 y=x2 /
           group=y
           markerattrs=(symbol=CircleFilled size=12)
           filledoutlinedmarkers
           markeroutlineattrs=(color=black);
   label ID="Predicted Regions";
run;

/************************************************/
/* 2. Score model on fine grid.
Use contour plot and scatter overlay */

/* GTL template: a filled contour plot of gridded (x,y,z) values in the
   background with a grouped scatter plot of the data on top. All column
   names and the title are supplied at render time through dynamics. */
proc template;
   define statgraph ContourPlotScat;
      dynamic _X _Y _GROUP            /* (x,y) and group for data */
              _CX _CY _CZ _TITLE;     /* (x,y,z) for background contour plot */
      begingraph;
         entrytitle _TITLE;
         layout overlay;
            contourplotparm x=_CX y=_CY z=_CZ /
               name="Contour"
               contourtype=linefill
               lineattrs=(color=black)
               /* levels=(0.2 0.3 0.4 0.5 0.6 0.7 0.8) */
               nhint=11
               colormodel=ThreeColorRamp;
            scatterplot x=_X y=_Y /
               group=_GROUP
               filledoutlinedmarkers=true
               markeroutlineattrs=(color=black)
               markerattrs=(symbol=CircleFilled size=12);
            /* gradient legend for the contour; its title is the value of _CZ */
            continuouslegend "Contour" / title=_CZ;
         endlayout;
      endgraph;
   end;
run;

/* render the template: grid columns feed the contour, data columns the scatter */
proc sgrender data=All template=ContourPlotScat;
   dynamic _TITLE="Predicted Probabilities"
           _X="x1"   _Y="x2"   _GROUP="y"
           _CX="gx"  _CY="gy"  _CZ="Predicted";
   label gx="x1" gy="x2";
run;

/************************************************/
/* 3: Score data on a fine grid. Plot markers colored by predicted regions */
/* This method is used in _The Elements of Statistical Learning_
   by Hastie, Tibshirani, and Friedman */

/* use tiny markers, like Hastie, Tibshirani, and Friedman */
ods graphics / width=640px height=480px;
proc sgplot data=All nocycleattrs noautolegend;
   /* background: one small square per grid point, colored by predicted class */
   scatter x=gx y=gy /
           group=PredCat
           markerattrs=(symbol=SquareFilled size=3);
   /* foreground: observed data, colored by actual response */
   scatter x=x1 y=x2 /
           group=y
           markerattrs=(symbol=CircleFilled size=12)
           filledoutlinedmarkers
           markeroutlineattrs=(color=black);
   label gx="x1" gy="x2";
run;

/* or enlarge marker sizes to tile the background, as in
   http://www.wildml.com/2015/09/implementing-a-neural-network-from-scratch/ */
ods graphics / width=550px height=540px;
proc sgplot data=All nocycleattrs noautolegend;
   /* background: larger semi-transparent squares */
   scatter x=gx y=gy /
           group=PredCat
           transparency=0.5
           markerattrs=(symbol=SquareFilled size=9);
   scatter x=x1 y=x2 /
           group=y
           markerattrs=(symbol=CircleFilled size=12)
           filledoutlinedmarkers
           markeroutlineattrs=(color=black);
   label gx="x1" gy="x2";
run;
ods graphics / width=640px height=480px;    /* back to the size used for the tiny-marker plot */

/* You can use the scoring techniques (2 and 3) with
nonparametric models, such as splines */

/* Same response and data as the linear fit, but the linear predictor is a
   spline effect built from x1 and x2. */
proc logistic data=LogisticData;
   effect spl = spline(x1 x2);
   model y(Event='1') = spl;
   store work.LogiSplModel;            /* save the spline model to an item store */
   effectplot contour(x=x1 y=x2);
   ods select ContourFitPlot;          /* the contour plot and overlay */
run;