/* SAS program to illustrate the "curse of dimensionality" and how to define outliers in high-dimensional multivariate normal data. Rick Wicklin, 16MAR2012 http://blogs.sas.com/content/iml/2012/03/23/the-curse-of-dimensionality */ proc iml; start Mahalanobis(x, _center, cov); center = rowvec(_center); /* x[i,] and center[i,] are row vectors */ n = nrow(x); m = nrow(center); d2 = j(n,m); /* return matrix of pairwise Mahalanobis distances */ U = root(cov); /* solve for Cholesky root once */ do k = 1 to m; /* for each row of the center matrix */ v = x - center[k,]; /* v is matrix of right-hand sides */ z = trisolv(2, U, v`); /* solve n linear systems in one call */ w = trisolv(1, U, z); /* Trick: vecdiag(A*B) = (A#B`)[,+] without computing product */ d2[,k] = (v # w`)[,+]; /* shortcut: diagonal of matrix product */ end; return (sqrt(d2)); finish; store module=Mahalanobis; quit; proc iml; /* Helper function: return correlation matrix with "compound symmetry" structure: {v+v1 v1 v1, v1 v+v1 v1, v1 v1 v+v1 }; */ start CompSym(N, v, v1); return( j(N,N,v1) + diag( j(N,1,v) ) ); finish; load module=Mahalanobis; /* or insert definition of module here */ call randseed(12345); N = 1000; /* sample size */ rho = 0.6; /* rho = corr(x_i, x_j) for i^=j */ dim = T(do(5,200,5)); /* dim=5,10,15,...,200 */ MinDist = j(nrow(dim),1); /* minimum distance to center */ PctOutliers = j(nrow(dim),1);/* pct outliers for chi-square d cutoff */ do i = 1 to nrow(dim); d = dim[i]; mu = j(d,1,0); Sigma = CompSym(d,1-rho,rho); /* get (d x d) correlation matrix */ X = randnormal(N, mu, Sigma); /* X ~ MVN(mu, Sigma) */ dist = Mahalanobis(X, mu, Sigma); MinDist[i] = min(dist); /* minimum distance to mu */ cutoff = sqrt( quantile("chisquare", 0.975, d) ); /* dist^2 ~ chi-square */ PctOutliers[i] = sum(dist>cutoff)/N; /* dist > statistical cutoff */ end; cutoff = sqrt(quantile("chisquare", 0.975, dim)); /* 97.5th pctl as function of dimension */ create CurseOfDim var {"Dim" "PctOutliers" "Cutoff" "MinDist"}; append; close; quit; proc sgplot data=CurseOfDim; title "Minimum Distance of 1,000 MVN Points to Mean"; scatter x=dim y=MinDist; xaxis label="Dimension" grid; yaxis label="Minimum Distance" grid; run; proc sgplot data=CurseOfDim; title "Outlier Criterion as a Function of Dimension"; series x=dim y=Cutoff; scatter x=dim y=MinDist; xaxis label="Dimension" grid; yaxis label="Distance" grid; run; proc sgplot data=CurseOfDim; title "Percentage of Outliers in MVN Data"; scatter x=dim y=PctOutliers; xaxis label="Dimension" grid; yaxis label="Percent Outliers" grid; refline 0.025/axis=y; run;