/*
   Rick Wicklin, 11 January, 2011.
   SAS program to investigate the distribution of people's initials
   (e.g., John Smith = JS). The program uses the SAS Cary phonelist
   to compute the initials. The analysis is:
     1. Read the phone list and keep the employees whose Location is "NC".
     2. Split "Lastname, Firstname" and form the first and last initials.
     3. Tabulate the frequencies of first initials, last initials, and pairs.
     4. Plot the marginal distributions as bar charts.
     5. Compute deciles and quartiles of the pair percentages.
     6. Show the nonzero pair percentages as a box plot.
     7. Draw a heat map of the pairs, color-coded by prevalence.
*/
ods listing style=listing;

/* names are in a text file (fixed columns: Name 1-20, Location 21-25, Room 26-33) */
data emp;
   /* TRUNCOVER: a record that ends before column 33 (e.g., a short or empty
      Room) must not cause INPUT to continue onto the next line, which would
      silently merge or lose records under the default FLOWOVER behavior. */
   infile "C:\Documents and Settings\frwick\My Documents\Blog\phonelist.txt" truncover;
   input Name     $ 1-20
         Location $ 21-25
         Room     $ 26-33;
run;

data Employees;
   set emp(where=(Location="NC"));   /* restrict to Cary, NC, employees */
   length FirstName $20 LastName $20 I1 $1 I2 $1 Initials $2;
   drop i;

   /* split first and last names. Names are stored as "Lastname, Firstname" */
   i = index(Name, ",");

   /* Without a comma (i=0), or with a leading comma (i=1), SUBSTR would be
      called with an invalid length and produce meaningless initials.
      Report the record in the log and skip it. */
   if i <= 1 then do;
      putlog "NOTE: Skipping record that is not in 'Lastname, Firstname' form: " Name=;
      delete;
   end;

   LastName  = substr(Name, 1, i-1);
   FirstName = strip(substr(Name, i+1));

   /* UPCASE equates "Day" and "de Sousa" */
   I1 = upcase(substr(FirstName, 1, 1));
   I2 = upcase(substr(LastName, 1, 1));
   Initials = I1 || I2;

   label I1="First Initial" I2="Second Initial";
run;

/* compute frequencies for First, Second, and Pairs of initials.
   ORDER=FREQ writes the output data sets in descending frequency order;
   SPARSE keeps the pairs that never occur (Count=0, Percent=0). */
proc freq data=Employees order=freq;
   tables I1    / out=I1Freq;
   tables I2    / out=I2Freq;
   tables I1*I2 / out=InitialFreq missing sparse noprint;
run;

/* add the two-letter pair to the crosstab output and save it permanently */
data SASUSER.InitialFreq;
   set InitialFreq;
   Initials = I1 || I2;
run;

/* print a few example frequencies */
proc print data=SASUSER.InitialFreq(where=(Initials in ("RW", "JG", "JS")));
run;

/* plot distribution of first and last initials */
ods graphics / width=500px height=300px;

proc sgplot data=I1Freq;
   title "Distribution of First Initials";
   vbar I1 / response=percent;
   xaxis discreteorder=data;          /* keep the frequency order from PROC FREQ */
   yaxis grid values=(0 to 12 by 2) valueshint;
run;

proc sgplot data=I2Freq;
   title "Distribution of Last Initials";
   vbar I2 / response=percent;
   xaxis discreteorder=data;
   yaxis grid values=(0 to 12 by 2) valueshint;
run;

/* TITLE statements are global: clear the bar-chart title so that the
   printed percentile tables below are not labeled "Distribution of Last
   Initials". */
title;

/* what are deciles of the frequencies for pairs of initials? */
proc univariate data=SASUSER.InitialFreq noprint;
   var Percent;
   output out=out pctlpts=10 to 90 by 10 pctlpre=p;
run;

proc print data=out;
run;

/* Exclude pairs with frequency=0.
   What are quantiles? Use these values to color-code the heat map */
proc univariate data=SASUSER.InitialFreq(where=(Percent>0)) noprint;
   var Percent;
   output out=out Q1=Q1 Median=Q2 Q3=Q3;
run;

proc print data=out;
run;

/* show distribution of pairs of initials as a box plot */
ods graphics / width=600px height=180px;

proc sgplot data=SASUSER.InitialFreq(where=(Percent>0));
   title  "Distribution of Pairs of Initials";
   title2 "where Percent > 0";
   hbox Percent / datalabel=Initials spread;
run;

/* Create a heat map of the pairs of initials */

/* 1. Create a format to group the percentages into categories */
/** http://www2.sas.com/proceedings/forum2008/243-2008.pdf **/
proc format;
   value heat
      5 = '0%'
      4 = '0-0.05%'
      3 = '0.05-0.15%'
      2 = '0.15-0.3%'
      1 = '> 0.3%';
run;

/* 2. Define a style to assign the colors to be used */
/* 4-color diverging color scheme from www.colorbrewer.org (plus white for 0%) */
proc template;
   define style styles.heatmap;
      parent=styles.listing;
      style graphcolors from graphcolors /
         'gcdata5'=white
         'gcdata4'=CX2B83BA
         'gcdata3'=CXABDDA4
         'gcdata2'=CXFDAE61
         'gcdata1'=CXD7191C;
   end;
run;

/* 3. Group the data by the prevalence of initials and apply the format.
      Group numbers match the HEAT. format: 5 = never occurs ... 1 = most common. */
data initials;
   set SASUSER.InitialFreq;
   format group heat.;
   label group='Prevalence';
   if      percent  = 0    then group = 5;
   else if percent <= 0.05 then group = 4;
   else if percent <= 0.15 then group = 3;
   else if percent <= 0.30 then group = 2;
   else                         group = 1;
run;

/* 4. Specify the style to be the HEATMAP style defined with PROC TEMPLATE */
ods listing style=heatmap;

/* 5. (optional) Specify the width, in pixels, for the output */
ods graphics / width=600px height=642px;

/* 6. Create the heat map */
/* NOTE(review): the gcdata1..gcdata5 colors are presumably assigned to group
   values in the order in which they first appear in the data, not by group
   number. Confirm that the legend colors match the intended categories; if
   not, sort INITIALS by GROUP first (and check that the axis order is still
   what you want). */
proc sgplot data=initials;
   /* Re-declaring TITLE1 cancels the box plot's TITLE2 ("where Percent > 0"),
      which does not apply here: the heat map also shows the 0% pairs. */
   title "Distribution of Pairs of Initials";
   /* filled squares carry the color; the second overlay draws the cell outlines */
   scatter x=I1 y=I2 / markerattrs=(size=16px symbol=squarefilled) group=group;
   scatter x=I1 y=I2 / markerattrs=(size=16px symbol=square);
run;

/* restore defaults so later output is not affected by this program */
title;
ods graphics / reset;