/* Program to accompany "Robust principal component analysis in SAS," by Rick Wicklin, published 09AUG2017 http://blogs.sas.com/content/iml/2017/08/06/robust-principal-components-sas.html ? */ /*****************************************************/ /* ROBUST PRINCIPAL COMPONENT ANALYSIS */ /*****************************************************/ /* Example data from Getting Started Example in PROC PRINCOMP documentation */ title 'Crime Rates per 100,000 Population by State'; data Crime; input State $1-15 Murder Rape Robbery Assault Burglary Larceny Auto_Theft; datalines; Alabama 14.2 25.2 96.8 278.3 1135.5 1881.9 280.7 Alaska 10.8 51.6 96.8 284.0 1331.7 3369.8 753.3 Arizona 9.5 34.2 138.2 312.3 2346.1 4467.4 439.5 Arkansas 8.8 27.6 83.2 203.4 972.6 1862.1 183.4 California 11.5 49.4 287.0 358.0 2139.4 3499.8 663.5 Colorado 6.3 42.0 170.7 292.9 1935.2 3903.2 477.1 Connecticut 4.2 16.8 129.5 131.8 1346.0 2620.7 593.2 Delaware 6.0 24.9 157.0 194.2 1682.6 3678.4 467.0 Florida 10.2 39.6 187.9 449.1 1859.9 3840.5 351.4 Georgia 11.7 31.1 140.5 256.5 1351.1 2170.2 297.9 Hawaii 7.2 25.5 128.0 64.1 1911.5 3920.4 489.4 Idaho 5.5 19.4 39.6 172.5 1050.8 2599.6 237.6 Illinois 9.9 21.8 211.3 209.0 1085.0 2828.5 528.6 Indiana 7.4 26.5 123.2 153.5 1086.2 2498.7 377.4 Iowa 2.3 10.6 41.2 89.8 812.5 2685.1 219.9 Kansas 6.6 22.0 100.7 180.5 1270.4 2739.3 244.3 Kentucky 10.1 19.1 81.1 123.3 872.2 1662.1 245.4 Louisiana 15.5 30.9 142.9 335.5 1165.5 2469.9 337.7 Maine 2.4 13.5 38.7 170.0 1253.1 2350.7 246.9 Maryland 8.0 34.8 292.1 358.9 1400.0 3177.7 428.5 Massachusetts 3.1 20.8 169.1 231.6 1532.2 2311.3 1140.1 Michigan 9.3 38.9 261.9 274.6 1522.7 3159.0 545.5 Minnesota 2.7 19.5 85.9 85.8 1134.7 2559.3 343.1 Mississippi 14.3 19.6 65.7 189.1 915.6 1239.9 144.4 Missouri 9.6 28.3 189.0 233.5 1318.3 2424.2 378.4 Montana 5.4 16.7 39.2 156.8 804.9 2773.2 309.2 Nebraska 3.9 18.1 64.7 112.7 760.0 2316.1 249.1 Nevada 15.8 49.1 323.1 355.0 2453.1 4212.6 559.2 New Hampshire 3.2 10.7 23.2 76.0 1041.7 2343.9 293.4 New Jersey 5.6 21.0 180.4 185.1 1435.8 2774.5 511.5 New Mexico 8.8 39.1 109.6 343.4 1418.7 3008.6 259.5 New York 10.7 29.4 472.6 319.1 1728.0 2782.0 745.8 North Carolina 10.6 17.0 61.3 318.3 1154.1 2037.8 192.1 North Dakota 0.9 9.0 13.3 43.8 446.1 1843.0 144.7 Ohio 7.8 27.3 190.5 181.1 1216.0 2696.8 400.4 Oklahoma 8.6 29.2 73.8 205.0 1288.2 2228.1 326.8 Oregon 4.9 39.9 124.1 286.9 1636.4 3506.1 388.9 Pennsylvania 5.6 19.0 130.3 128.0 877.5 1624.1 333.2 Rhode Island 3.6 10.5 86.5 201.0 1489.5 2844.1 791.4 South Carolina 11.9 33.0 105.9 485.3 1613.6 2342.4 245.1 South Dakota 2.0 13.5 17.9 155.7 570.5 1704.4 147.5 Tennessee 10.1 29.7 145.8 203.9 1259.7 1776.5 314.0 Texas 13.3 33.8 152.4 208.2 1603.1 2988.7 397.6 Utah 3.5 20.3 68.8 147.3 1171.6 3004.6 334.5 Vermont 1.4 15.9 30.8 101.2 1348.2 2201.0 265.2 Virginia 9.0 23.3 92.1 165.7 986.2 2521.2 226.7 Washington 4.3 39.6 106.2 224.8 1605.6 3386.9 360.3 West Virginia 6.0 13.2 42.2 90.9 597.4 1341.7 163.3 Wisconsin 2.8 12.9 52.2 63.7 846.9 2614.2 220.7 Wyoming 5.4 21.9 39.7 173.9 811.6 2772.2 282.0 ; /*********************************************************/ /* PCA: Classical correlation and robust correlation */ /*********************************************************/ /*********************************************************/ /* PCA from PROC PRINCOMP */ /*********************************************************/ proc princomp data=Crime /* add COV option for covariance analysis */ outstat=PCAStats_CORR out=Components_CORR plots=score(ncomp=4); var Murder Rape Robbery Assault Burglary Larceny Auto_Theft; id State; ods select Eigenvectors EigenValues ScreePlot '2 by 1' '3 by 1'; run; proc print data=Components_CORR(obs=5); var Prin:; run; title; options linesize=120; proc iml; reset EIGEN93; /* use "9.3" algorithm, not vendor BLAS (SAS 9.4m3) */ varNames = {"Murder" "Rape" "Robbery" "Assault" "Burglary" "Larceny" "Auto_Theft"}; use Crime; read all var varNames into X; read all var "State"; close; S = cov(X); /* estimate of covariance matrix */ R = cov2corr(S); /* = corr(X) */ call eigen(D, V, R); /* D=eigenvalues; columns of V are PCs */ scale = T( sqrt(vecdiag(S)) ); /* = std(X) */ B = (X - mean(X)) / scale; /* center and scale data about the mean */ scores = B * V; /* project standardized data onto the PCs */ print V[r=varNames c=("Prin1":"Prin7") L="Eigenvectors" format=8.5]; print (scores[1:5,])[c=("Prin1":"Prin7") L="Scores" format=8.5]; /* compare to the results from PROC PRINCOMP */ /* use PCAStats; read all var "_TYPE_" into type; read all var varNames into m; close; D2 = T( m[ loc(type="EIGENVAL"), ] ); V2 = T( m[ loc(type="SCORE"), ] ); print (max(abs(D - D2)))[L="max diff eigenvalues"]; print (max(abs(V - V2)))[L="max diff eigenvectors"]; use Components; read all var ("Prin1":"Prin7") into Scores2; close; print (max(abs(scores - Scores2)))[L="max diff scores"]; create scoreCorr from scores[c=("PC1":"PC7")]; append from scores; close; */ /**********************************/ /* a robust PCA: the CORR analysis */ /**********************************/ /* get robust estimates of location and covariance */ N = nrow(X); p = ncol(X); /* number of obs and variables */ optn = j(8,1,.); /* default options for MCD */ optn[4]= floor(0.75*N); /* override default: use 75% of data for robust estimates */ call MCD(rc,est,dist,optn,x); /* compute robust estimates */ RobCov = est[3:2+p, ]; /* robust estimate of covariance */ RobLoc = est[1, ]; /* robust estimate of location */ /* use robust estimates to perform PCA */ RobCorr = cov2corr(RobCov); /* robust correlation */ call eigen(D, V, RobCorr); /* D=eigenvalues; columns of V are PCs */ RobScale = T( sqrt(vecdiag(RobCov)) ); /* robust estimates of scale */ B = (x-RobLoc) / RobScale; /* center and scale data */ scores = B * V; /* project standardized data onto the PCs */ print (D/D[+])[L="Proportion"]; /* to compare scores, reverse some eigenvectors */ /* see http://blogs.sas.com/content/iml/2017/08/07/eigenvectors-not-unique.html */ scores[,{2 4}] = -scores[,{2 4}]; create scoreRCorr from scores[c=("RobPC1":"RobPC7")]; append from scores; close; QUIT; data All; merge Components_CORR scoreRCorr; L1 = State; /* if render by template=Stat.Princomp.Graphics.ScorePlot */ label Prin1 = "Classical Component 1 (58.8%)" Prin2 = "Classical Component 2 (17.7%)" Prin3 = "Classical Component 3 (10.4%)" Prin4 = "Classical Component 4 (4.5%)" RobPC1 = "Robust Component 1 (66.7%)" RobPC2 = "Robust Component 2 (18.5%)" RobPC3 = "Robust Component 3 (7.8%)" RobPC4 = "Robust Component 4 (2.6%)"; run; /* proc sgrender data=All template=Stat.Princomp.Graphics.ScorePlot; dynamic XVAR="RobPC1" YVAR="RobPC2"; run; proc sgrender data=All template=Stat.Princomp.Graphics.ScorePlot; dynamic XVAR="RobPC1" YVAR="RobPC3"; run; */ proc means data=All; var Prin1-Prin3 RobPC1-RobPC3; run; /* first and second component scores */ title "Classical Scores"; proc sgplot data=All; format State $7.; xaxis min=-4.1 max=7.5 grid; yaxis min=-3 max=2.7 grid; scatter x=Prin1 y=Prin2 / datalabel=State; run; title "Robust Scores"; proc sgplot data=All; format State $7.; xaxis min=-4.1 max=7.5 grid; yaxis min=-3 max=2.7 grid; scatter x=RobPC1 y=RobPC2 / datalabel=State; run; /* first and third component scores */ title "Classical Scores"; proc sgplot data=All; format State $7.; xaxis min=-4.1 max=7.5 grid; yaxis min=-2.3 max=4.1 grid; scatter x=Prin1 y=Prin3 / datalabel=State; run; title "Robust Scores"; proc sgplot data=All; format State $7.; xaxis min=-4.1 max=7.5 grid; yaxis min=-2.3 max=4.1 grid; scatter x=RobPC1 y=RobPC3 / datalabel=State; run; title;