/* SAS program to accompany the article 
   "Model assessment and selection in machine learning"
   by Rick Wicklin, published 30JAN2019 on The DO Loop blog:
   https://blogs.sas.com/content/iml/2019/01/30/model-validation-machine-learning.html

   This program shows how to use a validation data set to 
   assess the goodness of fit for a model that was 
   fit to data in a training data set. The idea is adapted from
   Rogers and Girolami, 2016, 
   _A First Course in Machine Learning_, Second Edition, p. 31-36 and 86.
*/

/* 1. Simulate data from a cubic polynomial regression model.
      nTrain = 50; nValidate = 200
*/
data Have;
   length Type $10;
   drop signal;                        /* helper variable; not written to Have */
   call streaminit(54321);             /* fixed seed so the sample is reproducible */
   do i = 1 to 250;
      /* the first 50 rows are training data; the remaining 200 are validation data */
      Type = ifc(i <= 50, "Train", "Validate");
      x = rand("uniform", -3, 3);
      /* cubic mean function: 2 - 1.105 x - 0.2 x^2 + 0.5 x^3 */
      signal = 2 + 0.5*x*(x+1.3)*(x-1.7);
      y = signal + rand("Normal");     /* add random normal noise */
      output;
   end;
run;

/* Scatter plot of the simulated data; marker attributes distinguish
   the training observations from the validation observations. */
title  "Training and Validation Data";
title2 "True Model Is Cubic Polynomial";
proc sgplot data=Have;
   yaxis grid;
   xaxis grid;
   scatter y=y x=x / grouporder=data group=Type;   /* groups appear in data order */
run;


/* 2. You can use the EFFECT statement to define a POLYNOMIAL effect 
   of degree d. See
   https://blogs.sas.com/content/iml/2017/09/07/polynomial-effects-regression-sas.html
*/
%let Degree = 3;
proc glmselect data=Have;
   /* the Type variable assigns each observation to the training or validation role */
   partition rolevar=Type(train="Train" validate="Validate");
   /* POLY is a polynomial effect in x of the degree given by the Degree macro variable */
   effect poly = polynomial(x / degree=&Degree);
   /* SELECTION=NONE: fit the specified model without effect selection */
   model y = poly / selection=NONE;
   /* display only the fit statistics and the parameter estimates */
   ods select FitStatistics ParameterEstimates;
   /* disabled statement kept for reference:
      output out=glmout P=pred R=resid; */
run;

/* Of course, in reality, we don't know the true model! 
   Let's fit many polynomial models of different degrees and 
   choose the best one according to either
   (A) a classical information criterion such as AICC or SBC, or
   (B) the average square error of the predicted model 
       when evaluated on the validation data.
*/
%MACRO DoPolyFit(MaxDegree);
/* Fit polynomial regression models of degree 1, 2, ..., MaxDegree to the
   data set Have (roles are given by the Type variable) and stack the
   FitStatistics table of every fit into the data set PolyFit.

   Parameter:
     MaxDegree  largest polynomial degree to fit (positive integer)

   Output data set PolyFit has the variables
     Degree     degree of the fitted polynomial
     Statistic  label of the fit statistic (renamed from Label1)
     Value      numeric value of the statistic (renamed from nValue1)

   Side effects: deletes every data set whose name starts with FitStats,
   then creates Stats, FitStats1, FitStats2, ..., and PolyFit.
*/
/* Keep the loop index and the saved option local so that a global macro
   variable named Degree in the calling program is not overwritten. */
%local Degree savedNotes;
/* remember whether NOTES or NONOTES is in effect so it can be restored */
%let savedNotes = %sysfunc(getoption(notes));
proc datasets noprint nowarn;   /* delete any data sets with prefix 'FitStats' */
   delete FitStats:;
quit;
options nonotes;                /* suppress log notes while looping */
%DO Degree = 1 %TO &MaxDegree;
   /* use POLYNOMIAL effect to fit polynomial of degree=D */
   title "Fit Polynomial of Degree = &Degree.";
   proc glmselect data=Have;
      partition rolevar=Type(train="Train" validate="Validate");
      effect poly = polynomial(x / degree=&Degree);
      model y = poly / selection=NONE;
      ods output FitStatistics = Stats;
      ods select FitStatistics ParameterEstimates;
   run;
   /* add Degree variable to data set */
   data FitStats&Degree.;
      Degree = &Degree.;   set Stats(drop=cValue1);
   run;
%END;
options &savedNotes;            /* restore the setting that was in effect on entry */
/* Concatenate all data sets into one data set. Rename some variables.
   A numbered range list reads the data sets in numeric order, so the
   rows stay sorted by Degree even when MaxDegree is 10 or more. */
data PolyFit;
   set FitStats1-FitStats%eval(&MaxDegree);
   rename Label1 = Statistic nValue1 = Value;
run;
%MEND;

/* Fit polynomials of degree 1 through 7; the macro gathers the fit statistics in the PolyFit data set. */
%DoPolyFit(7);

/* 3. Visualize the goodness of fit as a function of the degree of 
      the polynomial models. Plot ASE on training and validation 
      data sets versus the degree of the polynomial. Note that 
      the minimum ASE on the validation data occurs when d=3. */
title  "Fit Polynomial Models to Data";
title2 "nTrain = 50; nValidation = 200";
proc sgplot data=PolyFit;
   /* keep only the average square error rows (training and validation) */
   where Statistic in ('ASE (Train)', 'ASE (Validate)');
   xaxis grid;
   yaxis type=log grid;                                   /* logarithmic scale for ASE */
   series y=Value x=Degree / group=Statistic markers;     /* one curve per statistic */
run;


/* Classical approach for comparison: plot the information criteria
   against the degree. AIC and SBC also choose d=3. */
title  "Fit Statistics for Polynomial Models";
title2 "Sample Size = 50";
proc sgpanel data=PolyFit;
   where Statistic in ('AIC', 'AICC', 'SBC');
   /* one row per statistic, stacked in a single column on one page */
   panelby Statistic / onepanel columns=1 sort=data uniscale=column;
   rowaxis grid;
   colaxis grid;
   series y=Value x=Degree / markers;
run;


/* PROC GLMSELECT can automate the search by using variable selection.
   Here the validation data decide which effects enter and leave the
   model (SELECT=VALIDATE) and which step is chosen as the final model
   (CHOOSE=VALIDATE). Candidate effects are
   Intercept, x, x**2, ..., x**7. */
proc glmselect data=Have seed=1 plots=(ASEPlot Coefficients);
   partition rolevar=Type(train="Train" validate="Validate");
   effect poly = polynomial(x / degree=7);
   model y = poly / selection=stepwise(select=validate choose=validate);
run;