/* SAS program to accompany the article
"Model assessment and selection in machine learning"
by Rick Wicklin, published 30JAN2019 on The DO Loop blog:
https://blogs.sas.com/content/iml/2019/01/30/model-validation-machine-learning.html
This program shows how to use a validation data set to
assess the goodness of fit for a model that was
fit to data in a training data set. The idea is adapted from
Rogers and Girolami, 2016,
_A First Course in Machine Learning_, Second Edition, p. 31-36 and 86.
*/
/* 1. Simulate data from a cubic polynomial regression model.
nTrain = 50; nValidate = 200
*/
/* Simulate 250 observations from a cubic regression model with N(0,1) noise.
   The first 50 observations are flagged for training; the other 200 for validation. */
data Have;
   length Type $10.;
   call streaminit(54321);                    /* fixed seed for reproducibility */
   do i = 1 to 250;
      Type = ifc(i <= 50, "Train", "Validate");
      x = rand("uniform", -3, 3);
      /* factored form of 2 - 1.105*x - 0.2*x**2 + 0.5*x**3 */
      y = 2 + 0.5*x*(x+1.3)*(x-1.7) + rand("Normal");
      output;
   end;
run;
/* Plot the simulated data, colored by the training/validation role. */
title "Training and Validation Data";
title2 "True Model Is Cubic Polynomial";
proc sgplot data=Have;
   xaxis grid;
   yaxis grid;
   scatter x=x y=y / group=Type grouporder=data;
run;
/* 2. You can use the EFFECT statement to define a POLYNOMIAL effect
of degree=d. See
https://blogs.sas.com/content/iml/2017/09/07/polynomial-effects-regression-sas.html
*/
%let Degree = 3;                               /* fit the (true) cubic model */
proc glmselect data=Have;
   partition rolevar=Type(train="Train" validate="Validate"); /* assign roles from the Type variable */
   effect poly = polynomial(x / degree=&Degree);              /* polynomial effect of the given degree */
   model y = poly / selection=NONE;                           /* no selection: fit on the training data */
   ods select FitStatistics ParameterEstimates;
   *output out=glmout P=pred R=resid;
run;
/* Of course, in reality, we don't know the true model!
Let's fit many polynomial models of different degrees and
choose the best one according to:
(A) A classical information criterion such as AICC or SBC
(B) The average square error of the predicted model
when evaluated on the validation data.
*/
/* Fit polynomial models of degree 1..&MaxDegree to the training data and
   collect the fit statistics (ASE, AIC, AICC, SBC, ...) for every degree
   into the data set PolyFit.
   Parameter: MaxDegree = highest polynomial degree to fit (positive integer). */
%MACRO DoPolyFit(MaxDegree);
/* delete any data sets with the prefix 'FitStats' left over from a previous run;
   NOLIST is the documented option to suppress the directory listing */
proc datasets lib=work nolist nowarn;
   delete FitStats:;
quit;

options nonotes;                /* suppress log notes while looping */
%DO Degree = 1 %TO &MaxDegree;
   /* use a POLYNOMIAL effect to fit a polynomial of degree=&Degree */
   title "Fit Polynomial of Degree = &Degree.";
   proc glmselect data=Have;
      partition rolevar=Type(train="Train" validate="Validate");
      effect poly = polynomial(x / degree=&Degree);
      model y = poly / selection=NONE;
      ods output FitStatistics = Stats;
      ods select FitStatistics ParameterEstimates;
   run;
   /* add the Degree variable to the fit statistics for this model */
   data FitStats&Degree.;
      Degree = &Degree.;
      set Stats(drop=cValue1);
   run;
%END;
options notes;                  /* NOTE(review): not restored if an error aborts the loop */

/* concatenate all the FitStats data sets into one; rename some variables */
data PolyFit;
   set FitStats:;
   rename Label1 = Statistic nValue1 = Value;
run;
/* a name-prefix SET list expands alphabetically, so FitStats10 would precede
   FitStats2; restore numeric order (a stable no-op when MaxDegree <= 9) */
proc sort data=PolyFit;
   by Degree;
run;
%MEND DoPolyFit;

%DoPolyFit(7);
/* 3. Visualize the goodness of fit as a function of the degree of
the polynomial models. Plot ASE on training and validation
data sets versus the degree of the polynomial. Note that
the minimum ASE on the validation data occurs when d=3. */
/* Plot ASE versus polynomial degree for the training and validation data.
   The validation ASE attains its minimum at the cubic (d=3) model. */
title "Fit Polynomial Models to Data";
title2 "nTrain = 50; nValidation = 200";
proc sgplot data=PolyFit;
   where Statistic in ('ASE (Train)', 'ASE (Validate)');
   xaxis grid;
   yaxis grid type=log;
   series x=Degree y=Value / markers group=Statistic;
run;
/* Compare with the classical method: AIC and SBC also choose d=3 */
/* Panel the classical information criteria versus degree; they also favor d=3. */
title "Fit Statistics for Polynomial Models";
title2 "Sample Size = 50";
proc sgpanel data=PolyFit;
   where Statistic in ('AIC', 'AICC', 'SBC');
   panelby Statistic / columns=1 uniscale=column onepanel sort=data;
   rowaxis grid;
   colaxis grid;
   series x=Degree y=Value / markers;
run;
/* PROC GLMSELECT can actually automate this process by using
variable selection techniques. Use validation data to choose
effects to enter and leave the model. Effects chosen from
Intercept, x, x**2, ..., x**7 */
/* Automate the search: stepwise selection among Intercept, x, x**2, ..., x**7,
   using the validation ASE both to select effects and to choose the final model. */
proc glmselect data=Have seed=1 plots=(ASEPlot Coefficients);
   partition rolevar=Type(train="Train" validate="Validate");
   effect poly = polynomial(x / degree=7);
   model y = poly / selection=stepwise(choose=validate select=validate);
run;