/*******************************************************/ /* quantile regression analysis of chess ratings */ /*******************************************************/ /* SAS program to accompany the article "Quantile regression analysis of chess ratings" by Rick Wicklin, published 13AUG2018 on The DO Loop blog: https://blogs.sas.com/content/iml/2018/08/13/quantile-regression-chess-ratings.html This program shows how to compute quantile regression for chess ratings versus age. A second analysis computes quantiles for age and gender. The data are ratings in 2014 from the World Chess Federation https://ratings.fide.com/download.phtml The data were previously visualized by Robert Allison: https://blogs.sas.com/content/sastraining/2018/07/24/using-x-ray-glasses-to-see-patterns-in-your-data/ Allison's article was inspired by the article "Age vs. Elo - Your battle against time" by LionChessLtd https://www.chess.com/blog/LionChessLtd/age-vs-elo---your-battle-against-time */ /***************************************************************/ /* 1. Input data and correct obvious data errors. Written by Robert Allison */ /***************************************************************/ data ratings; infile "C:\Users\frwick\Downloads\ChessFedRatings.txt" lrecl=200 pad firstobs=2; input whole_line \$ 1-200; id_number=substr(whole_line,1,14); name=substr(whole_line,16,59); fed=substr(whole_line,77,3); sex=substr(whole_line,81,1); tit=substr(whole_line,85,3); wtit=substr(whole_line,90,3); otit=substr(whole_line,95,18); Score=.; Score=substr(whole_line,110,4); gms=.; gms=substr(whole_line,116,1); k=.; k=substr(whole_line,120,2); birth_year=.; birth_year=substr(whole_line,123,4); flag=substr(whole_line,129,3); run; /* fix some apparent data-problems ... */ /* There are other bad/suspect dates - like a lot of 1900 from Myanmar. I decided not to mess with them, because there's really no way to know if they're valid or not. */ data ratings; set ratings; if id_number=5213525 and birth_year=0993 then birth_year=1993; if id_number=3827852 and birth_year=0974 then birth_year=1974; if id_number=1658344 and birth_year=1048 then birth_year=1948; if id_number=4704118 and birth_year=1059 then birth_year=1959; if id_number=1453475 and birth_year=1076 then birth_year=1976; if id_number=21600376 and birth_year=1194 then birth_year=1994; if id_number=9012435 and birth_year=1657 then birth_year=1957; if id_number=1466801 and birth_year=1664 then birth_year=1964; run; /* estimate age in 2014 */ data ratings; set ratings; age=put('01MAY2014'd,year4.)-birth_year; run; data Orig; set ratings (where=( birth_year not in (0 .) and Flag not in ('i' 'wi') and k<30 and age<100 )); keep Age Sex Score; run; /***************************************************************/ /* 2. Basic sanity checks and KDE estimate of density of point */ /***************************************************************/ title1 "World Chess Federation Ratings"; title2 "Data source: ratings.fide.com, May 2014 - Active Players (K<30)"; proc means data=Orig; var age Score; run; /* https://blogs.sas.com/content/iml/2011/03/04/how-to-use-transparency-to-overcome-overplotting.html */ /* density of individiuals (Age, Scores) */ proc kde data=Orig; bivar Age Score / plots=(contour); run; /***************************************************************/ /* 3. Fit a quantile regression model and plot the predicted curves for Score vs Age. Technique explained at https://blogs.sas.com/content/iml/2018/08/06/score-quantile-regression-sas.html /***************************************************************/ proc quantreg data=Orig algorithm=IPM ci=none plot(maxpoints=none)=fitplot(nodata); effect Spl = spline(Age / knotmethod=list(10 to 70 by 10) ); model Score = Spl / quantile = 0.1 0.25 0.5 0.75 0.90; output out=Out P=pred; run; /***************************************************************/ /* 4. If you want to show the raw data, use semi-transparency, as follows: */ /***************************************************************/ proc sort data=Out; by Age; run; proc sgplot data=Orig noautolegend; label Score = "Rating"; scatter x=Age y=Score / group=Sex markerattrs=(symbol=CircleFilled) transparency=0.975; legenditem type=marker name="F" / label="Female" markerattrs=GraphData2; legenditem type=marker name="M" / label="Male" markerattrs=GraphData1; keylegend "F" "M"; xaxis values=(10 to 80 by 10) valueshint grid; yaxis values=(1000 to 2750 by 250) valueshint grid; run; /***************************************************************/ /* 5. Now do an analysis that includes gender. We need to create a score data set, then plot curves for two categorical variables as shown in https://blogs.sas.com/content/iml/2018/08/08/?plot-curves-two-categorical-variables-sas.html */ /***************************************************************/ /* 5.1. Create score data set */ data Score; Score = .; do Sex = 'F', 'M'; do Age = 7 to 100; output; end; end; run; /* 5.2. Append the score data to the original data. Use a binary variable to indicate which observations are the scoring data */ data Combined; set Orig /* original data */ Score(in=_ScoreData); /* scoring data */ ScoreData = _ScoreData; /* binary indicator variable. ScoreData=1 for scoring data */ run; /* 5.3. Run the procedure on the combined (original + scoring) data */ ods select ModelInfo NObs; proc quantreg data=Combined algorithm=IPM ci=none; class Sex; effect Spl = spline(Age / knotmethod=list(10 to 70 by 10) ); model Score = Spl | Sex / quantile = 0.1 0.25 0.5 0.75 0.90; output out=QRegOut(where=(ScoreData=1)) P=Pred / columnwise; run; /* 5.4. If you want a fit plot, sort by the independent variable for each curve. Put QUANTILE and other covariates first, then the X variable. */ proc sort data=QRegOut; by Quantile Sex Age; run; /*5.5. If you want to overlay the plots, it's easiest to define separate variables for original data and scoring data */ data ScoreOut; set QRegOut; /* scoring data */ /* automatically create joint levels from original levels */ Label = catt("Quantile=", Quantile) || "; " || catt("Sex=", Sex); Label2 = catt("Quantile=", Quantile); run; /* proc freq data=ScoreOut; tables Label2/ nocum; run; */ /* 5.6. Plot the curves for two categorical variables and three quantiles */ title2 "Data source: ratings.fide.com, May 2014 - Active Players"; proc sgplot data=ScoreOut(where=(Age<75)); where Quantile in (0.1 0.5 0.9); label Pred = "Predicted Rating"; series x=Age y=Pred / group=Label lineattrs=(pattern=solid) curvelabel grouplc=Sex; xaxis values=(10 to 80 by 10) grid; yaxis values=(1000 to 2500 by 500) grid; run; /* 5.7. Panel by Sex. Do not include the data */ data All; set Orig(where=(Age<80)) ScoreOut(rename=(Age=X Pred=Y) where=(X<80)); run; ods graphics / width=800px height=480px; proc sgpanel data=All; label Score="Rating" X="Age" Y="Predicted Rating"; panelby Sex / rows=1; *scatter x=Age y=Score / markerattrs=(symbol=CircleFilled) transparency=0.975; series x=X y=Y / group=Quantile lineattrs=(thickness=2) nomissinggroup name='c'; keylegend 'c' / position=right sortorder=reverseauto title="Quantile"; colaxis values=(10 to 80 by 10) grid; rowaxis values=(1000 to 2500 by 250) grid; run;