/* SAS program to accompany the following blog posts from The DO Loop:
"Mean imputation in SAS"
https://blogs.sas.com/content/iml/2017/12/04/mean-imputation-sas.html
by Rick Wicklin, published 04DEC2017
and
"3 reasons to avoid mean imputation"
https://blogs.sas.com/content/iml/2017/12/06/problems-mean-imputation.html
by Rick Wicklin, published 06DEC2017.
Purpose: Show how to replace missing data by mean, median, etc.,
by using PROC STDIZE in SAS.
*/
/* Build the "original data": copy sashelp.class and set Height
   to missing for a randomly chosen subset of the rows.
   Replaced is the 0/1 flag that marks the rows that were blanked. */
data Have;
   set sashelp.class;
   call streaminit(12345);              /* fixed seed for the RAND stream */
   Replaced = rand("Bernoulli", 0.4);   /* 1 with probability 0.4, else 0 */
   if Replaced = 1 then call missing(height);
run;
/* Mean imputation with PROC STDIZE:
   missing values of Height are replaced by the mean.
   In the output data set Imputed,
     Orig_Height is the original variable and
     Height      is the imputed variable. */
proc stdize data=Have
            out=Imputed
            method=MEAN      /* or MEDIAN, MINIMUM, MIDRANGE, etc. */
            reponly          /* only replace; do not standardize */
            oprefix=Orig_;   /* prefix for original variables */
   var Height;
run;
/* List the first 9 observations; both height columns use BESTD8.1 */
proc print data=Imputed(obs=9);
   var Name Orig_Height Height Weight Replaced;
   format Orig_Height BESTD8.1
          Height      BESTD8.1;
run;
/*******************************************/
/* Drawbacks of mean imputation */
/*******************************************/
/* Compare the means and standard deviations of the original and
   imputed variables. Notice that the means are the same, but
   1. The variance of the imputed variable is less than
      the variance of the original variable.
   2. The standard error of the imputed variable is smaller
      than the standard error of the original variable. */
proc means data=Imputed
           maxdec=2      /* documented option: display at most 2 decimals */
           N NMiss Mean StdErr StdDev;
   var Orig_Height Height;
run;
/* Draw both histograms on shared axes (same bin start and width) to
   emphasize that every imputed value equals the mean value.
   61.5 is the hard-coded mean value: it is marked by the reference
   line and is added as an extra tick on the X axis. */
title "Comparison of Original and Imputed Data";
proc sgplot data=Imputed;
   histogram Height      / binstart=52 binwidth=4 scale=count
                           legendlabel="Imputed";
   histogram Orig_Height / binstart=52 binwidth=4 scale=count
                           transparency=0.5 legendlabel="Original";
   refline 61.5 / axis=x label="Mean" lineattrs=(color=black); /* mean value */
   xaxis label="Height" valueshint values=(52 to 70 by 4 61.5);
   yaxis grid;
run;
/* Mean imputation corrupts multivariate relationships:
   correlate Weight and Age with the original and the imputed height */
proc corr data=Imputed noprob;
   with Orig_Height Height;
   var Weight Age;
run;
/* Single-variable regression (imputed EXPLANATORY variable):
   For a one-variable regression you can show mathematically that
   the slopes are the same, but the intercepts will usually differ.
   This step fits the original data (with missing values);
   only the ParameterEstimates table is displayed. */
title "Original Data with Missing Values";
proc reg data=Imputed;
   ods select ParameterEstimates;
   OriginalData: model Weight = Orig_Height;
run;
quit;
/* Same model, fit to the mean-imputed Height;
   only the ParameterEstimates table is displayed. */
title "Mean-Imputed Data";
proc reg data=Imputed;
   ods select ParameterEstimates;
   ImputedData: model Weight = Height;
run;
quit;
title;   /* clear the title */
/* Overlay the two regression lines (original vs. imputed Height) on a
   scatter plot of the imputed data. Markers are grouped by Replaced,
   using the Circle and X symbols.
   NOTE: ODS GRAPHICS is a global statement, so ATTRPRIORITY=NONE
   remains in effect after this step until it is changed again. */
ods graphics / attrpriority=NONE;
title "Simple Linear Regression with Mean Imputation";
proc sgplot data=Imputed;
   styleattrs datasymbols=(Circle X);
   /* fit lines only: NOMARKERS suppresses the points of the REG statements */
   reg x=Orig_Height y=Weight / nomarkers curvelabel="Original";
   reg x=Height      y=Weight / nomarkers curvelabel="Imputed";
   scatter x=Height y=Weight / group=Replaced;
   xaxis grid;
   yaxis grid;
run;
title;   /* clear the title so that it does not carry over to later output */
/* If you impute the RESPONSE variable, the fit to the imputed data has a slope
that is smaller in magnitude (flatter), and the standard errors of the
parameter estimates for the imputed data are smaller than for the original data.
*/
/*
title "Original Data with Missing Values";
proc reg data=Imputed plots(only)=Predictions(X=Weight);
OriginalData: model Orig_Height = Weight;
ods select ParameterEstimates PredictionPanel;
quit;
title "Mean-Imputed Data";
proc reg data=Imputed plots(only)=Predictions(X=Weight);
ImputedData: model Height = Weight;
ods select ParameterEstimates PredictionPanel;
quit;
title;
*/