/* Program to accompany the article "Overlay plots on a box plot in SAS: Continuous X axis" by Rick Wicklin, published 15JUN2016 http://blogs.sas.com/content/iml/2016/06/15/overlay-box-plot-sas-continuous.html ? Average life expectancy data from World Bank http://data.worldbank.org/indicator/SP.DYN.LE00.IN downloaded 20May2016. */ /* Step 1: Download two CSV file The life expectancy data: http://blogs.sas.com/content/iml/files/2016/06/LE2.csv Country information: http://blogs.sas.com/content/iml/files/2016/06/LifeExpectancyCountries.csv /* /* Step 2: Import to SAS data sets */ PROC IMPORT OUT= WORK.LifeExpectancy DATAFILE= "C:\Users\frwick\Downloads\LifeExpectancy\LE2.csv" DBMS=CSV REPLACE; GETNAMES=YES; DATAROW=2; RUN; PROC IMPORT OUT= WORK.CountryCodes DATAFILE= "C:\Users\frwick\Downloads\LifeExpectancy\LifeExpectancyCountries.csv" DBMS=CSV REPLACE; GETNAMES=YES; DATAROW=2; RUN; /* Step 3: Prepare data for plotting. Add formats. Convert from Wide to Long form */ proc sort data=LifeExpectancy(drop=Indicator_Code Indicator_Name) out=LEsort; by Country_Code; run; proc format; value IncomeFmt 1='High OECD' 2='High nonOECD' 3='Upper Middle' 4='Lower Middle' 5='Low'; run; data LL2; merge LEsort CountryCodes(drop=SpecialNotes); by Country_Code; if IncomeGroup="High income: OECD" then Income=1; else if IncomeGroup="High income: nonOECD" then Income=2; else if IncomeGroup="Upper middle income" then Income=3; else if IncomeGroup="Lower middle income" then Income=4; else if IncomeGroup="Low income" then Income=5; else delete; format Income IncomeFmt.; run; proc sort data=LL2; by Income Country_Name; run; /* transpose from wide to long */ data LE; set LL2; array Yr[*] Y1960-Y2014; do i = 1 to dim(Yr); Year = 1960 + (i-1); Expected = Yr[i]; output; end; label Expected = "Life Expectancy at Birth (years)"; drop i Y1960-Y2015; run; /**************************************************************************/ /******************************************************************/ /* overlay box plots and series or scatter plots */ /******************************************************************/ /* series overlay */ /* See also http://www.r-bloggers.com/box-plot-as-goal-post/ */ data Decade / view=decade; set LE; if 1960 <= Year <= 1969 then decade=1965; else if 1970 <= Year <= 1979 then decade=1975; else if 1980 <= Year <= 1989 then decade=1985; else if 1990 <= Year <= 1999 then decade=1995; else if 2000 <= Year <= 2009 then decade=2005; else if 2010 <= Year then decade=2015; run; /* box plots of life expectancy by decade */ title "Distribution of Life Expectancy by Decade"; title2 "Raw Data for 207 Countries by Year"; proc sgplot data=Decade noautolegend; vbox Expected / category=Decade; scatter x=year y=Expected / transparency=0.9; xaxis type=linear display=(nolabel) values=(1965 to 2015 by 10) valuesdisplay=('60s' '70s' '80s' '90s' '2000s' '2010s'); run; title; /******************************************************************/ /* you can also add regression lines, such as quantile regression */ /******************************************************************/ proc quantreg data=Decade outest=ParamEst(rename=(Year=Slope)) plots=none; model Expected = Year / quantile=0.1,0.25,0.5,0.75,0.9; *model Expected = Year / quantile=0.25,0.5,0.75; run; proc print data=ParamEst; var _quantile_ Intercept Slope; run; data PlotIt; set Decade ParamEst(keep=_quantile_ Intercept Slope); x0 = 1960; y0 = Intercept + Slope*x0; select(_quantile_); when(0.10) QLabel="10th Qntl"; when(0.25) QLabel="25th Qntl"; when(0.50) QLabel="50th Qntl"; when(0.75) QLabel="75th Qntl"; when(0.90) QLabel="90th Qntl"; otherwise QLabel="N/A"; end; run; /* overlay data by year, quantile curves, and box plots */ title "Quantile Regression of Life Expectancy"; title2 "Box Plots Show Schematic Distribution by Decade"; proc sgplot data=PlotIt noautolegend; vbox Expected / category=Decade; scatter x=year y=Expected / transparency=0.9; lineparm x=x0 y=y0 slope=Slope / lineattrs=GraphFit2(thickness=2) group=QLabel curvelabel curvelabelloc=outside; xaxis type=linear display=(nolabel) values=(1965 to 2015 by 10) valuesdisplay=('60s' '70s' '80s' '90s' '2000s' '2010s'); run;