/* SAS program to accompany the article "A distance correlation" by Rick Wicklin, published 04APR2018 on The DO Loop blog: https://blogs.sas.com/content/iml/2018/04/04/distance-correlation.html This program shows how to compute the "distance correlation," as proposed by G. Szekely, M. Rizzo, and N. Bakirov, 2007, "Measuring and Testing Dependence by Correlation of Distances" The Annals of Statistics, 35(6) For a SAS macro that does not require PROC IML, see T. Billings, 2016, "Distance Correlation for Vectors: A SAS Macro" Proceedings of the 2016 WUSS Conference. */ proc iml; /* double centers matrix by subtracting row and column marginals */ start AdjustDist(A); rowMean = A[:, ]; colMean = rowMean`; /* the same, by symmetry */ grandMean = rowMean[:]; A_Adj = A - rowMean - colMean + grandMean; return (A_Adj); finish; /* distance correlation: G. Szekely, M. Rizzo, and N. Bakirov, 2007, Annals of Statistics, 35(6) */ start DistCorr(x, y); DX = distance(x); DY = distance(y); DX = AdjustDist(DX); DY = AdjustDist(DY); V2XY = (DX # DY)[:]; /* mean of product of distances */ V2X = (DX # DX)[:]; /* mean of squared (adjusted) distance */ V2Y = (DY # DY)[:]; /* mean of squared (adjusted) distance */ dCov = sqrt( V2XY ); /* distance covariance estimate */ denom = sqrt(V2X * V2Y); /* product of std deviations */ if denom > 0 then R2 = V2XY / denom; /* R^2 = (dCor)^2 */ else R2 = 0; dCor = sqrt(R2); T = nrow(DX)*V2XY; /* test statistic p. 2783. Reject indep when T>=z */ /* return List of resutls: */ L = [#"dCov"=dCov, #"dCor"=dCor, #"dVarX"=sqrt(V2X), #"dVarY"=sqrt(V2Y), #"T"=T]; /* or return a vector: L = dCov || dCor || sqrt(V2X) || sqrt(V2Y) || T; */ return L; finish; x = {1,0,-1, 0}; y = {0,1, 0,-1}; results = DistCorr(x, y); /* get itenms from results */ dCov=results\$"dCov"; dCor=results\$"dCor"; dVarX=results\$"dVarX"; dVarY=results\$"dVarY"; /* or from vector: dCov=results[1]; dCor=results[2]; dVarX=results[3]; dVarY=results[4]; */ print dCov dCor dVarX dVarY; /* a few examples */ /* helper functions */ start PearsonCorr(x,y); return( corr(x||y)[1,2] ); finish; start DCorr(x,y); results = DistCorr(X, Y); return( results\$"dCor" ); /* if DistCorr returns vector: return( results[2] ); */ finish; /*************************************************/ /* Classic example: Y is quadratic function of X */ x = do(-1, 1, 0.1)`; /* X is in [-1, 1] */ y = x##2; /* Y = X^2 */ PCor = PearsonCorr(x,y); DCor = DCorr(x,y); print PCor DCor; /*************************************************/ /* Unit vectors equally distributed on circle */ pi = constant("pi"); dt = pi/24; t = do(0, 2*pi - dt, dt)`; x = cos(t); y = sin(t); *call scatter(x,y); PCor = PearsonCorr(x,y); DCor = DCorr(x,y); print PCor DCor; /*************************************************/ /* Generate bivariate normal data from distribution with Pearson correlation rho=-0.9, -0.8, ..., 0.8, 0.9. For each sample, compute Pearson and distance correlation. Plot the sample correlations vs rho. */ N = 500; /* sample size */ mu = {0 0}; Sigma = I(2); /* parameters for bivaraite normal distrib */ rho = do(-0.9, 0.9, 0.1); /* grid of correlations */ PCor = j(1, ncol(rho), .); /* allocate vectors for results */ DCor = j(1, ncol(rho), .); call randseed(54321); do i = 1 to ncol(rho); /* for each rho, simulate bivariate normal data */ Sigma[1,2] = rho[i]; Sigma[2,1] = rho[i]; /* population covariance */ Z = RandNormal(N, mu, Sigma); /* bivariate normal sample */ PCor[i] = PearsonCorr(Z[,1], Z[,2]); /* Pearson correlation */ DCor[i] = DCorr(Z[,1], Z[,2]); /* distance correlation */ end; create MVNorm var {"rho" "PCor" "DCor"}; append; close; submit; title "Correlations of Bivariate Normal Data"; title2 "N = 500"; proc sgplot data=MVNorm; label rho="Pearson Correlation of Population" PCor="Pearson Correlation of Sample" DCor="Distance Correlation of Sample"; scatter x=rho y=PCor / name="P"; scatter x=rho y=DCor / name="D"; lineparm x=0 y=0 slope=1; yaxis grid label="Correlation"; xaxis grid; keylegend "P" "D"; run; endsubmit; /*************************************************/ /* What dort of sample variation do we observe? Repeat the previous simulation, but simulate 10 samples for each value of rho */ rho = do(-0.95, 0.95, 0.05); PCor = j(1, ncol(rho), .); DCor = j(1, ncol(rho), .); SampleID = j(1, ncol(rho), .); N = 100; create MVNorm var {"SampleID" "rho" "PCor" "DCor"}; do Rep = 1 to 10; do i = 1 to ncol(rho); Sigma[1,2] = rho[i]; Sigma[2,1] = rho[i]; Z = RandNormal(N, mu, Sigma); PCor[i] = PearsonCorr(Z[,1], Z[,2]); DCor[i] = DCorr(Z[,1], Z[,2]); SampleID[i] = Rep; end; append; end; close; submit; title "Distance Correlation of Bivariate Normal Data"; title2 "N = 100; 10 Samples for Each rho"; proc sgplot data=MVNorm noautolegend; label rho="Pearson Correlation of Population" PCor="Pearson Correlation of Sample" DCor="Distance Correlation of Sample"; scatter x=rho y=DCor / name="D"; lineparm x=0 y=0 slope=1 / clip; lineparm x=-1 y=1 slope=-1 / clip; yaxis grid min=0 max=1 offsetmin=0; xaxis grid; run; endsubmit; /*************************************************/ /* Correlation betwee a 2-D distribution and a 1-D distribution */ title "Correlation betwee a 2-D distribution and a 1-D distribution"; call randseed(12345, 1); /* reset random number seed */ N = 1000; mu = {0 0}; Sigma = { 1.0 -0.5, -0.5 1.0}; X = RandNormal(N, mu, Sigma); /* sample from 2-D distribution */ Y = randfun(N, "Normal", 0, 0.6); /* uncorrelated 1-D sample */ DCor = DCorr(X, Y); print DCor; QUIT;