/* SAS program to accompany the article
   "Find the distance between observations and a target value"
   by Rick Wicklin, published 28MAR2018 on The DO Loop blog:
   https://blogs.sas.com/content/iml/2018/03/28/closest-observation-target-value.html

   This program shows how to compute the distances between
   observations and a target value. In particular, between the
   (Age, Height, Weight) values of students in the Sashelp.Class data
   and the target value (13, 62, 100). The methods are:
   1. Ordinary Euclidean and L1 distances for the raw data
   2. Euclidean and L1 distances for the standardized data
   3. Mahalanobis distances
*/

/************************************************************/
/* 1. Ordinary Euclidean and L1 distances for the raw data  */
/************************************************************/
data Closest;
   /* target (Age, Height, Weight) = (13, 62, 100) */
   set Sashelp.Class;
   EuclidDist = euclid(Age-13, Height-62, Weight-100);
   L1Dist     = sumabs(Age-13, Height-62, Weight-100);
run;

/* Sort by Euclidean distance. Print the four observations closest to the target */
proc sort data=Closest out=Euclid;
   by EuclidDist;
run;

proc print data=Euclid(obs=4);
run;

/* plot the distances for each observation */
title "Distance to Target Value";
proc sgplot data=Euclid;
   series x=Name y=EuclidDist / curvelabel="Euclidean";   /* datalabel=Weight */
   series x=Name y=L1Dist     / curvelabel="L1";          /* datalabel=Weight */
   yaxis grid label="Distance";
   xaxis grid discreteorder=data;   /* keep the sorted (by EuclidDist) order on the X axis */
run;

/************************************************************/
/* 2. Euclidean and L1 distances for the standardized data  */
/************************************************************/

/* Standardize the data; OUTSTAT= saves the location and scale used */
proc stdize data=Sashelp.Class out=StdClass outstat=StdIn method=STD;
   var Age Height Weight;
run;

proc print data=StdIn(obs=2);
run;

data Target;
   Age=13; Height=62; Weight=100;
run;

/* Standardize the target with the SAME location and scale as the data */
proc stdize data=Target out=StdTarget method=in(StdIn);
   var Age Height Weight;
run;

proc print data=StdTarget;
run;

/* rename the target values so they do not collide with the data variables */
data StdTarget2;
   set StdTarget;
   rename Age=tAge Height=tHeight Weight=tWeight;
run;

data StdClose;
   /* compute distances in standardized coordinates */
   set StdClass;
   array _x[*] Age Height Weight;
   /* read the single target observation once; variables read by SET are
      retained, so tAge/tHeight/tWeight stay available for every observation */
   if _N_=1 then set StdTarget2;
   array target[3] tAge tHeight tWeight;
   /* the target columns and the loop index are scratch variables only */
   drop tAge tHeight tWeight i;
   /* overwrite Age/Height/Weight with (standardized value - standardized target) */
   do i = 1 to dim(_x);
      _x[i] = _x[i] - target[i];
   end;
   EuclidDist = euclid(of _x[*]);
   L1Dist     = sumabs(of _x[*]);
run;

/* find the four students closest to target for each distance definition */
/* First Euclidean distance */
proc sort data=StdClose out=StdEuclid;
   by EuclidDist;
run;

proc print data=StdEuclid(obs=4);
run;

/* plot the distances for each observation */
title "Standardized Distance to Target Value";
proc sgplot data=StdEuclid;
   series x=Name y=EuclidDist / curvelabel="Euclidean";   /* datalabel=Weight */
   series x=Name y=L1Dist     / curvelabel="L1";          /* datalabel=Weight */
   yaxis grid label="Distance";
   xaxis grid discreteorder=data;
run;

/************************************************************/
/* 3. Mahalanobis distances                                 */
/************************************************************/
proc iml;
use Sashelp.Class;
   read all var _NUM_ into X[colname=VarNames];
   read all var "Name";
close;

target = {13 62 100};
cov = cov(X);

/* to find MD to arbitrary point, standardize the data by using the
   inverse Cholesky transformation. For details, see
   https://blogs.sas.com/content/iml/2012/02/15/what-is-mahalanobis-distance.html

   The ROOT function returns the UPPER triangular factor U with U`*U = cov.
   Transpose it to get the lower triangular factor L with L*L` = cov, so that
   z*z` = (x-target)*inv(L*L`)*(x-target)` = (x-target)*inv(cov)*(x-target)`.
   Without the transpose the quadratic form would use inv(U*U`), which is not
   inv(cov) unless cov is diagonal. */
L = root(cov)`;
z = (X-target)*inv(L)`;      /* target is row vector, so transpose */
MD = sqrt( z[,##] );         /* row-wise sum of squares, then square root */

call sortndx(idx, MD);       /* idx = row order that sorts MD ascending */
title "Mahalanobis Distance to Target Value";
call series(Name[idx,], MD[idx,]) label={"Name" "Mahalanobis Distance"} grid={x y};

Y = MD[idx,] || (X[idx,]-target);
Y = Y[1:5,];                 /* five closest obs */
print Y[colname=("Mahal D" || varNames) rowname=(Name[idx,])];

/*************/
/* Note: You might be tempted to use the MAHALANOBIS function to compute the MD as
      MD = Mahalanobis(X, target, cov);
      print MD X[c=VarNames r=Name];
   However, this isn't correct unless the target value is the sample or
   population mean (the center of the data). In the MAHALANOBIS function, the
   distance returned is from each observation to the center of the data, and
   'center' and 'Cov' (which is a centered matrix) should use the same center.

   NOTE(review): the manual computation above uses the same covariance matrix
   and the same target as this call would, so the two are expected to agree
   numerically -- TODO confirm, and clarify whether this caveat concerns the
   arithmetic or only the statistical interpretation.
*/