/* Program to accompany Wicklin (2017) "Simulate many samples from a linear regression model" published 01FEB2017 on The DO Loop blog: http://blogs.sas.com/content/iml/2017/02/01/simulate-samples-linear-regression.html */ /* SAS program to simulate many random samples from a regression model. Each sample has p=&nCont explanatory variables and &N observations. The explanatory variables are independent and x[i] ~ N(0,1). The error term is epsilon ~ N(0,1.5). The response variable Y satisfies the linear regression model Y = beta[0] + sum( beta[i]*x[i] ) + epsilon */ /* Simulate many samples from a linear regression model */ %let N = 50; /* N = sample size */ %let nCont = 10; /* p = number of continuous variables */ %let NumSamples = 100; /* number of samples */ data SimReg(keep= SampleID i Y x:); call streaminit(54321); array x[&nCont]; /* explanatory variables are named x1-x&nCont */ /* 1. Specify model coefficients. You can hard-code values such as array beta[0:&nCont] _temporary_ (-4 2 -1.33 1 -0.8 0.67 -0.57 0.5 -0.44 0.4 -0.36); or you can use a formula such as the following */ array beta[0:&nCont] _temporary_; do j = 0 to &nCont; beta[j] = 4 * (-1)**(j+1) / (j+1); /* formula for beta[j] */ end; do i = 1 to &N; /* for each observation in the sample */ do j = 1 to &nCont; x[j] = rand("Normal"); /* 2. Simulate explanatory variables */ end; eta = beta[0]; /* model = intercept term */ do j = 1 to &nCont; eta = eta + beta[j] * x[j]; /* + sum(beta[j]*x[j]) */ end; /* 5. simulate response for each sample */ do SampleID = 1 to &NumSamples; /* <== LOOP OVER SAMPLES */ epsilon = rand("Normal", 0, 1.5); /* 3. Specify error distrib*/ Y = eta + epsilon; /* 4. Y = model + error */ output; end; end; run; proc sort data=SimReg; by SampleID i; run; /* analysis of all &NumSample samples */ proc reg data=SimReg outest=PE NOPRINT; by SampleID; model y = x:; quit; /* correlation of the parameter estimates (covariance of the bets) */ /* proc corr data=PE outp=corrout; var Intercept x1-x&nCont; run; */ /* visualize sampling distribution; overlay parameter values */ data Coeffs; Intercept = -4; array x[&nCont]; /* explanatory variables are named x1-x&nCont */ do j = 1 to &nCont; x[j] = (-1)**(j+1) * 4 / (j+1); /* parameter values */ end; run; data PE2; set PE Coeffs(in=param); /* append parameter values */ if param then Beta = "Parameter"; /* add ID variable */ else Beta = "Estimate "; run; ods graphics / reset ATTRPRIORITY=none; title "Parameter Estimates for Linear Regression"; title2 "&NumSamples Simulated Data Sets, N = &N"; proc sgscatter data=PE2 datasymbols=(Circle StarFilled); matrix Intercept x1-x3 / group=Beta markerattrs=(size=11); run;