/* SAS program to generate results for blog post. Reference
   Wicklin, Rick, "Visualizing the distribution of ACT scores",
   The DO Loop, published July 17, 2015,
   http://blogs.sas.com/content/iml/2015/07/17/viz-act-scores.html
*/

/* Data from Table 2.2 of
   www.act.org/newsroom/data/2012/pdf/profile/National2012.pdf
   The raw data is in a wide format, with 36 rows: one row per score N,
   for 1 <= N <= 36. Each row holds the score followed by the number of
   students who achieved that score in English, Math, Reading, Science,
   and Composite. Rows are listed from score 36 down to score 1; the
   data set is sorted by Score in the next step.
*/
data ACTRaw2012;
   input Score English Math Reading Science Composite;
   /* In-stream data must begin on the line after DATALINES and is
      terminated by the line that contains only a semicolon. */
   datalines;
36 4833 5541 11995 6735 781
35 19398 10713 13534 7930 4457
34 25434 16041 22895 5766 9604
33 24374 18424 33313 9651 14864
32 26845 18162 36052 15543 21438
31 28762 17479 47059 20881 28154
30 36179 28213 44194 24584 36676
29 35760 34229 45999 24782 43285
28 39859 51457 49830 37695 54167
27 57640 69512 54276 67617 64084
26 62011 91709 56394 60560 73443
25 72434 92545 59929 96207 85920
24 99058 116052 91854 130760 97383
23 87707 102611 91121 118479 105317
22 95187 86506 94101 123374 111566
21 123133 58145 117496 140633 116199
20 108067 85525 97076 149855 114325
19 88852 90792 91843 111678 112583
18 73176 110340 117242 112991 110415
17 69791 139380 84106 77293 103221
16 86766 197530 73784 71271 94775
15 94124 140725 89923 59260 84274
14 69967 54812 76681 51623 71373
13 46314 21241 53479 47200 54925
12 40259 5222 55772 34541 33931
11 43182 1885 27605 25616 13524
10 43054 673 15788 21080 3589
9 26252 218 6169 7439 1151
8 22949 140 2983 2595 400
7 9522 36 1799 1159 128
6 3299 89 832 661 38
5 1216 5 455 230 16
4 398 41 192 120 6
3 143 0 128 145 2
2 46 0 87 6 2
1 26 24 31 57 1
;

/* Use PROC TRANSPOSE to convert the data to long format.
   See http://blogs.sas.com/content/iml/2015/02/25/plotting-multiple-series-transforming-data-from-wide-to-long.html
*/
/* BY-group processing in PROC TRANSPOSE needs the data ordered by Score */
proc sort data=ACTRaw2012;
   by Score;
run;

/* One output row per (Score, Subject): the original column name goes into
   Subject and the count (COL1) is renamed to Freq. */
proc transpose data=ACTRaw2012
               out=Act2012(rename=(col1=Freq))
               name=Subject;
   by Score;
   var English Math Reading Science Composite;
run;

/* Use SAS procedures to analyze the data in the "long" format. */

/* Summarize median, upper quartile, 90th percentile, mean, and std dev
   of Score for each subject area. Freq holds the number of students at
   each score, so it is used as the frequency variable. */
proc means data=Act2012 Median Q3 P90 Mean Std order=data maxdec=2;
   label Subject=;
   class Subject;
   freq Freq;
   var Score;
run;

/***********************************************************/

/* The published graphs aggregate the data into seven bins.
   You can define a format and use PROC FREQ to form the aggregate
   counts for each bin. */
proc format;
   value ACT
       1 - 12 ="01-12"
      13 - 15 ="13-15"
      16 - 19 ="16-19"
      20 - 23 ="20-23"
      24 - 27 ="24-27"
      28 - 32 ="28-32"
      33 - 36 ="33-36";
run;

/* Tabulate Subject by binned Score (Composite excluded), weighting each
   row by its student count. OUTPCT adds the row/column percentages to
   the OUT= data set. */
proc freq data=Act2012;
   where Subject^="Composite";
   label Subject=;
   format Score ACT.;
   weight Freq;
   tables Subject*Score / outpct out=OutPct;
run;

/* The published plot displays percentages, not proportions.
   Pct_Row is divided by 100 so that the PERCENT format can be applied. */
data PlotIt;
   set OutPct;
   Pct_Row = Pct_Row / 100;
   format Pct_Row percent5.;
run;

title "Distribution of Binned ACT Scores by Content Area";
footnote J=L "Graduating Class 2012";
footnote2 J=L "Data from www.act.org/newsroom/data/2012/pdf/profile/National2012.pdf";

/* create a better version of the published graph */
proc sgpanel data=PlotIt;
   panelby Subject / columns=1 onepanel novarname layout=rowlattice;
   vbar Score / response=Pct_Row datalabel=Pct_Row;
run;

/***************************************************************/
/* Alternative: Why not print the actual data without binning? */
/***************************************************************/

/* add ACT benchmarks:
   "A benchmark score is the minimum score needed on an ACT subject-area
   test to indicate a 50% chance of obtaining a B or higher or about a
   75% chance of obtaining a C or higher in the corresponding
   credit-bearing college courses, which include English Composition,
   Algebra, Social Science and Biology."
*/
/* Note: this step replaces the binned PlotIt data set created above. */
data PlotIt;
   set Act2012;
   where Subject^="Composite";
   if      Subject="English" then Benchmark=18;
   else if Subject="Math"    then Benchmark=22;
   else if Subject="Reading" then Benchmark=21;
   else if Subject="Science" then Benchmark=24;
run;

/* The footnotes defined earlier are not reset, so they remain in effect. */
title "Distribution of ACT Scores by Content Area";
title2 "Vertical Lines Indicate College Readiness Benchmark Scores";

/* One histogram panel per subject with one bin per score value; the
   reference line marks that subject's benchmark score. */
proc sgpanel data=PlotIt;
   panelby Subject / columns=1 onepanel novarname layout=rowlattice;
   histogram Score / freq=Freq binwidth=1;
   refline Benchmark / axis=x;
   rowaxis grid offsetmin=0 offsetmax=0.1;
   colaxis min=1 max=36 values=(5 to 35 by 5) valueshint;
run;