/* SAS program to accompany the following blog posts from The DO Loop:
     "Mean imputation in SAS"
     https://blogs.sas.com/content/iml/2017/12/04/mean-imputation-sas.html
     by Rick Wicklin, published 04DEC2017
   and
     "3 reasons to avoid mean imputation"
     https://blogs.sas.com/content/iml/2017/12/06/problems-mean-imputation.html
     by Rick Wicklin, published 06DEC2017.

   Purpose: Show how to replace missing data by the mean, median, etc.,
   by using PROC STDIZE in SAS, and demonstrate why mean imputation
   distorts variances, correlations, and regression estimates.
*/

/* Create "original data" by randomly inserting missing values for some heights.
   Replaced=1 flags the observations whose Height was set to missing. */
data Have;
   set sashelp.class;
   /* Seed the random number stream once; only the first call in a
      DATA step has any effect. */
   if _N_ = 1 then call streaminit(12345);
   Replaced = rand("Bernoulli", 0.4);
   if Replaced then height = .;
run;

/* How to compute mean imputation:
   Use PROC STDIZE to replace missing data with the mean.
   The original variable is Orig_Height. The imputed variable is Height. */
proc stdize data=Have out=Imputed
     oprefix=Orig_      /* prefix for original variables */
     reponly            /* only replace; do not standardize */
     method=MEAN;       /* or MEDIAN, MINIMUM, MIDRANGE, etc. */
   var Height;
run;

proc print data=Imputed(obs=9);
   format Orig_Height Height BESTD8.1;
   var Name Orig_Height Height Weight Replaced;
run;

/*******************************************/
/* Drawbacks of mean imputation            */
/*******************************************/

/* Compare means and std dev for original and imputed vars.
   Notice that the means are the same, but
   1. The variance of the imputed variable is less than for the original variable
   2. The standard error of the imputed variable is smaller than for the
      original variable
   MAXDEC= is the documented PROC MEANS option that limits displayed decimals. */
proc means data=Imputed maxdec=2 N NMiss Mean StdErr StdDev;
   var Orig_Height Height;
run;

/* Store the mean of the nonmissing original heights (rounded to 0.1) in a
   macro variable so the axis tick and reference line below follow the data
   instead of relying on a hard-coded value (the script previously used 61.5). */
proc sql noprint;
   select round(mean(Orig_Height), 0.1) into :MeanHeight trimmed
   from Imputed;
quit;

/* Overlay histograms on the same axis to emphasize that the imputed values
   are all equal to the mean value */
title "Comparison of Original and Imputed Data";
proc sgplot data=Imputed;
   xaxis label="Height" values=(52 to 70 by 4 &MeanHeight) valueshint;
   yaxis grid;
   histogram Height      / scale=count binstart=52 binwidth=4
                           legendlabel="Imputed";
   histogram Orig_Height / transparency=0.5 scale=count binstart=52 binwidth=4
                           legendlabel="Original";
   refline &MeanHeight / axis=x lineattrs=(color=black) label="Mean";  /* mean value */
run;

/* Mean imputation corrupts multivariate relationships */
proc corr data=Imputed noprob;
   var Weight Age;
   with Orig_Height Height;
run;

/* Single-variable regression:
   You can show mathematically for 1-var regression that the slopes are the
   same, but the intercept will usually be different. */
title "Original Data with Missing Values";
proc reg data=Imputed;
   OriginalData: model Weight = Orig_Height;
   ods select ParameterEstimates;
quit;

title "Mean-Imputed Data";
proc reg data=Imputed;
   ImputedData: model Weight = Height;
   ods select ParameterEstimates;
quit;
title;

/* ATTRPRIORITY=NONE lets the group markers vary by symbol (Circle, X)
   rather than by color only. */
ods graphics / attrpriority=NONE;
title "Simple Linear Regression with Mean Imputation";
proc sgplot data=Imputed;
   styleattrs datasymbols=(Circle X);
   reg x=Orig_Height y=Weight / nomarkers curvelabel="Original";
   reg x=Height      y=Weight / nomarkers curvelabel="Imputed";
   scatter x=Height y=Weight / group=Replaced;
   xaxis grid;
   yaxis grid;
run;

/* Restore session state so later steps are not affected by this script. */
title;
ods graphics / reset=attrpriority;

/* If you impute the RESPONSE variable, the imputed data has a slope that is
   smaller in magnitude (more flat) and the standard errors for the estimates
   of the imputed data are smaller than for the original data.
   The code below is intentionally left disabled. */
/*
title "Original Data with Missing Values";
proc reg data=Imputed plots(only)=Predictions(X=Weight);
   OriginalData: model Orig_Height = Weight;
   ods select ParameterEstimates PredictionPanel;
quit;

title "Mean-Imputed Data";
proc reg data=Imputed plots(only)=Predictions(X=Weight);
   ImputedData: model Height = Weight;
   ods select ParameterEstimates PredictionPanel;
quit;
title;
*/