/* Data and program for the articles published by Rick Wicklin on The DO Loop blog: "Plot the conditional distribution of the response in a linear regression model", published 19SEP2015, http://blogs.sas.com/content/iml/2015/09/10/plot-distrib-reg-model.html "Error distributions and exponential regression models", published 16SEP2015, http://blogs.sas.com/content/iml/2015/09/16/plot-distrib-exp.html */ /* Data and ideas from Arthur Charpentier, "GLM, non-linearity and heteroscedasticity," Freakonomics blog, published 22Oct2013, http://freakonometrics.hypotheses.org/9593 Also see Markus Gesmann, "Visualising theoretical distributions of GLMs," Mages' Blog, published 18Aug2015, http://www.magesblog.com/2015/08/visualising-theoretical-distributions.html */ /* Data on the distance (in feet) that a car required to stop when driving at a certain speed (in mph). The 'cars' data is a common data set in R. */ data MyData(label="cars data from R"); input x y @@; logY = log(y); label x = "Speed" y = "Distance" logY = "log(Distance)"; datalines; 4 2 4 10 7 4 7 22 8 16 9 10 10 18 10 26 10 34 11 17 11 28 12 14 12 20 12 24 12 28 13 26 13 34 13 34 13 46 14 26 14 36 14 60 14 80 15 20 15 26 15 54 16 32 16 40 17 32 17 40 17 50 18 42 18 56 18 76 18 84 19 36 19 46 19 68 20 32 20 48 20 52 20 56 20 64 22 66 23 54 24 70 24 92 24 93 24 120 25 85 ; /* 1. Create scoring data set */ /* 1a. Put min and max into macro variables */ proc sql noprint; select min(x), max(x) into :min_x, :max_x from MyData; quit; /* 1b. Create scoring data set */ data ScoreX; do x = &min_x to &max_x by (&max_x-&min_x)/100; /* min(X) to max(X) */ output; end; run; /************************************************/ /* Model 1: linear regression of Y */ /************************************************/ /* Equivalent model: proc glm data=MyData plots=FitPlot; model y = x; ods select FitStatistics ParameterEstimates FitPlot; run; */ title "Linear Regression of Y"; /* Create regression model. Save model to item store. Score with PROC PLM. */ proc genmod data=MyData; model y = x / dist=normal link=identity; ods select ParameterEstimates; store work.YModel / label='Normal distribution for Y; identity link'; run; proc plm restore=YModel noprint; score data=ScoreX out=Model pred=Fit; run; /* specify parameters for model */ %let std = 15.3796; %let b0 = -17.5791; %let b1 = 3.9324; %let ILINK = ; /* blank for identity */ /* create values to plot the assumed response distribution */ data NormalResponse(keep = s t distrib); /* compute scale factor for height of density distributions */ ds = (&max_x - &min_x) / 6; /* spacing for 5 distributions */ maxHt = ds / 2; /* max ht is 1/2 width */ maxPdf = pdf("Normal", 0, 0, &std); /* max density */ ScaleFactor = maxHt / maxPDF; /* scale density into data coords */ /* place distribs horizontally along X scale */ do s = (&min_x+ds) to (&max_x-ds) by ds; eta = &b0 + &b1*s; /* linear predictor */ pred = &ILINK(eta); /* apply inverse link function */ /* compute distribution of Y | X. Assume +/-3 std dev is appropriate */ do t = pred - 3*&std to pred + 3*&std by 6*&std/50; distrib = s + ScaleFactor * pdf("Normal", t, pred, &std); output; end; end; run; /* merge data, fit, and error distribution information */ data All; set MyData Model NormalResponse; run; title "Conditional Distribution of Response in Regression Model"; title2 "y = &b0 + &b1*x + eps, eps ~ N(0, &std)"; title3 "Normal Distribution, Identity Link"; proc sgplot data=All noautolegend; scatter x=x y=y; series x=x y=Fit; band y=t lower=s upper=distrib / group=s transparency=0.4 fillattrs=GraphData1; xaxis grid; yaxis grid; run; title; /****************************************************/ /* Model 2: Gen'lzd linear model of Y with log link */ /* GENMOD models y ~ N(exp(x`b), sigma_2), or */ /* log(E(Y)) = x`b */ /****************************************************/ title "Generalized Linear Model of Y with log link"; proc genmod data=MyData; model y = x / dist=normal link=log; ods select ParameterEstimates; store work.GenYModel / label='Normal distribution for Y; log link'; run; proc plm restore=GenYModel noprint; score data=ScoreX out=Model pred=Fit / ILINK; run; /* specify parameters for linear model */ %let std = 14.7679; %let b0 = 2.2412; %let b1 = 0.0917; %let ILINK = exp; /* create values to plot the assumed response distribution */ data NormalResponse(keep = s t distrib); /* compute scale factor for height of density distributions */ ds = (&max_x - &min_x) / 6; /* spacing for 5 distributions */ maxHt = ds / 2; /* max ht is 1/2 width */ maxPdf = pdf("Normal", 0, 0, &std); /* max density */ ScaleFactor = maxHt / maxPDF; /* scale density into data coords */ /* place distribs horizontally along X scale */ do s = (&min_x+ds) to (&max_x-ds) by ds; eta = &b0 + &b1*s; /* linear predictor */ pred = &ILINK(eta); /* apply inverse link function */ /* compute distribution of Y | X. Assume +/-3 std dev is appropriate */ do t = pred - 3*&std to pred + 3*&std by 6*&std/50; distrib = s + ScaleFactor * pdf("Normal", t, pred, &std); output; end; end; run; /* merge data, fit, and error distribution information */ data All; set MyData Model NormalResponse; run; title "Conditional Distribution of Response in Regression Model"; title2 "y ~ N(exp(&b0 + &b1*x), &std)"; title3 "Generalized linear model of Y, log link"; proc sgplot data=All noautolegend; scatter x=x y=y; series x=x y=Fit; band y=t lower=s upper=distrib / group=s transparency=0.4 fillattrs=GraphData1; xaxis grid; yaxis grid; run; title; /* Notice that in the previous model the covariates affect the response in a multiplicative fashion, whereas the error is additive. */ /* save for comparison with the next model */ data FitGenMod(keep= FitGenMod); set All; rename Fit = FitGenMod; run; /************************************************/ /* Model 3: linear regression of log(Y) */ /* GLM models log(y) ~ N(x`b, sigma_1), or */ /* y ~ Lognormal(x`b, sigma_1) */ /* and E(log(Y)) = x`b. */ /************************************************/ /* Equivalent models: proc glm data=MyData plots=FitPlot; model logY = x; *and exponentiate; ods select FitStatistics ParameterEstimates FitPlot; run; proc fmm data=MyData; model y = x / dist=lognormal; ods select ParameterEstimates; run; *FMM estimates the VARIANCE instead of scale: sqrt(0.1912) = 0.4373; */ title "Linear Model of log(Y)"; /* Create regression model. Save model to item store. */ proc genmod data=MyData; model logY = x / dist=normal link=identity; ods select ParameterEstimates; store work.LogYModel / label='Normal distribution for log(Y); identity link'; run; proc plm restore=LogYModel noprint; score data=ScoreX out=Model pred=Fit; run; /* specify parameters for model */ %let b0 = 1.6761; %let b1 = 0.1208; %let std = 0.4373; %let ILINK = ; /* blank for identity */ /* create values to plot the assumed response distribution */ data NormalResponse(keep = s t distrib); /* compute scale factor for height of density distributions */ ds = (&max_x - &min_x) / 6; /* spacing for 5 distributions */ maxHt = ds / 2; /* max ht is 1/2 width */ maxPdf = pdf("Normal", 0, 0, &std); /* max density */ ScaleFactor = maxHt / maxPDF; /* scale density into data coords */ /* place distribs horizontally along X scale */ do s = (&min_x+ds) to (&max_x-ds) by ds; eta = &b0 + &b1*s; /* linear predictor */ pred = &ILINK(eta); /* apply inverse link function */ /* compute distribution of Y | X. Assume +/-3 std dev is appropriate */ do t = pred - 3*&std to pred + 3*&std by 6*&std/50; distrib = s + ScaleFactor * pdf("Normal", t, pred, &std); output; end; end; run; /***************************************************************/ /* merge data, fit, and error distribution information. */ /* Exponentiate predicted values to obtain model on orig scale */ /***************************************************************/ data All; set MyData Model NormalResponse; expT = exp(t); expFit = exp(Fit); run; title "Conditional Distribution of Response in Regression Model"; title2 "log(y) = &b0 + &b1*x + eps, eps ~ N(0, &std)"; title3 "Exponential transformation of linear model of log(Y)"; proc sgplot data=All noautolegend; scatter x=x y=y; series x=x y=expFit; band y=expT lower=s upper=distrib / group=s transparency=0.4 fillattrs=GraphData1; xaxis grid; yaxis grid max=160; run; /*******************************************/ /* compare results of models 2 and 3 */ /*******************************************/ data Compare; merge All FitGenMod; run; title; title "Compare Log Link and log(Y) models"; proc sgplot data=Compare; scatter x=x y=y / legendlabel="Data"; series x=x y=FitGenMod / legendlabel="Y + Log Link"; series x=x y=expFit / legendlabel="log(Y) + Exp" lineattrs=GraphData2; xaxis grid; yaxis grid; run;