/* Program to accompany the article Wicklin, R. (2016) "Loess regression in SAS/IML", The DO Loop blog, 19OCT2016 http://blogs.sas.com/content/iml/2016/10/19/loess-regression-in-sasiml.html ? This program implements a basic 1-D loess regression algorithm in SAS/IML. */ /****************************************/ /* STEP 1: Define some helper functions */ /****************************************/ proc iml; /* http://blogs.sas.com/content/iml/2013/03/27/compute-distance.html */ /* compute Euclidean distance between points in S and points in R. S is a n x d matrix, where each row is a point in d dimensions. R is a m x d matrix. The function returns the n x m matrix of distances, D, such that D[i,j] is the distance between S[i,] and R[j,]. */ start PairwiseDist(S, R); if ncol(S)^=ncol(R) then return (.); /* different dimensions */ n = nrow(S); m = nrow(R); idx = T(repeat(1:n, m)); /* index matrix for S */ jdx = shape(repeat(1:m, n), n); /* index matrix for R */ diff = S[idx,] - R[jdx,]; return( shape( sqrt(diff[,##]), n ) ); /* sqrt(sum of squares) */ finish; /* Compute indices (row numbers) of k nearest neighbors. INPUT: S an (n x d) data matrix R an (m x d) matrix of reference points k specifies the number of nearest neighbors (k>=1) OUTPUT: idx an (n x k) matrix of row numbers. idx[,j] contains the row numbers (in R) of the j_th closest elements to S dist an (n x k) matrix. dist[,j] contains the distances between S and the j_th closest elements in R */ start PairwiseNearestNbr(idx, dist, S, R, k=1); n = nrow(S); idx = j(n, k, .); dist = j(n, k, .); D = PairwiseDist(S, R); /* n x m */ do j = 1 to k; dist[,j] = D[ ,><]; /* smallest distance in each row */ idx[,j] = D[ ,>:<]; /* column of smallest distance in each row */ if j < k then do; /* prepare for next closest neighbors */ ndx = sub2ndx(dimension(D), T(1:n)||idx[,j]); D[ndx] = .; /* set elements to missing */ end; end; finish; /* weighted polynomial regression for one regressor. Y, X, and w are col vectors */ /* http://blogs.sas.com/content/iml/2016/10/05/weighted-regression.html */ start PolyRegEst(Y, X, w, deg); Yw = sqrt(w)#Y; /* mult rows of Y by sqrt(w) */ XDesign = j(nrow(X), deg+1); do j = 0 to deg; /* design matrix for polynomial regression */ Xdesign[,j+1] = X##j; end; Xw = sqrt(w)#Xdesign; /* mult rows of X by sqrt(w) */ b = solve(Xw`*Xw, Xw`*Yw); /* solve normal equation */ return b; finish; /* Score regression fit on column vector x */ start PolyRegScore(x, coef); p = nrow(coef); y = j(nrow(x), 1, coef[p]); /* initialize to coef[p] */ do j = p-1 to 1 by -1; y = y # x + coef[j]; end; return(y); finish; store module=_ALL_; QUIT; /****************************************/ /* STEP 2: Define example data */ /****************************************/ data LoessData; input x y @@; datalines; 11.7 2.3 19.9 8.1 11.8 4.6 17.1 5.1 16.5 4.8 5.6 1.7 12.9 5.4 7.6 3.0 9.0 4.8 17.5 5.0 10.4 1.3 16.9 2.8 5.6 1.8 18.7 6.9 3.7 1.7 7.4 2.3 2.0 2.7 14.8 5.2 3.0 0.0 16.8 4.2 15.0 6.6 19.9 5.5 1.9 1.1 14.8 5.8 12.4 3.0 14.0 6.4 11.7 3.6 8.2 2.9 18.8 6.2 0.3 1.8 ; /****************************************/ /* STEP 3: Define loess function */ /****************************************/ proc iml; load module=(PairwiseNearestNbr PolyRegEst PolyRegScore); /* General 1-D loess algorithm. Input: t: points at which to fit loess (column vector) x, y: nonmissing data (column vectors) k: number of nearest neighbors used for loess fit deg: degree of local regression model Output: column vector L[i] = f(t[i]; k, deg) where f is loess model */ start LoessFit(t, x, y, k, deg=1); m = nrow(t); Fit = j(m, 1); do i = 1 to m; x0 = t[i]; run PairwiseNearestNbr(idx, dist, x0, x, k); XNbrs = X[idx]; YNbrs = Y[idx]; /* X and Y values of k nearest nbrs */ /* compute local weight function where dist[,k] is max dist */ w = 32/5 * (1 - (dist / dist[k])##3 )##3; b = PolyRegEst(YNbrs, XNbrs, w`, deg); Fit[i] = PolyRegScore(x0, b); end; return Fit; finish; /******************************************/ /* STEP 4: Call loess function on example */ /******************************************/ use LoessData; read all var {x y}; close; /* read example data */ s = 0.383; /* specify smoothing parameter */ k = floor(nrow(x)*0.383); /* num points in local neighborhood */ /* grid of points to evaluate loess curve (column vector) */ t = T( do(min(x), max(x), (max(x)-min(x))/50) ); Fit0 = LoessFit(t, x, y, k, 0); /* loess fit with degree=0 polynomials */ Fit1 = LoessFit(t, x, y, k, 1); /* degree=1 */ Fit2 = LoessFit(t, x, y, k, 2); /* degree=2 */ create Sim var {x y t Fit0 Fit1 Fit2}; append; close; QUIT; /****************************************/ /* STEP 5: Overlay loess curves on data */ /****************************************/ title "Overlay loess curves computed in SAS/IML"; proc sgplot data=Sim; label Fit0="Loess (Deg=0)" Fit1="Loess (Deg=1)" Fit2="Loess (Deg=2)"; scatter x=x y=y; series x=t y=Fit0 / curvelabel; series x=t y=Fit1 / curvelabel lineattrs=(color=red); series x=t y=Fit2 / curvelabel lineattrs=(color=ForestGreen); xaxis grid; yaxis grid; run; /****************************************************************************/ /****************************************************************************/ /* OPTIONAL: Compare SAS/IML predicted values to standard LOESS predictions */ /****************************************************************************/ /****************************************************************************/ /* STEP A: Output values used by LOESS to evaluate the loess model */ /* Verify that the loess algoithms give same values when they use the same prediction points. Output vertices of the k-d tree that PROC LOESS uses for evaluating the model when smooth = 0.383 */ ods select none; proc loess data=LoessData; model y = x / interp=linear degree=1 smooth=0.383; ods output kdtree=EvalPts; run; proc sort data=EvalPts; by splitvalue; where splitvalue > 0; run; ods select all; /**********************************************************************/ /* STEP B: Evaluate IML version of loess at the same values. */ /* The data set EvalPoints includes data values at which to evaluate the loess model. In 1-D, the min and max of the data are also used */ /**********************************************************************/ proc iml; load module=(PairwiseNearestNbr PolyRegEst PolyRegScore); start LoessFit(t, x, y, k, deg=1); m = nrow(t); Fit = j(m, 1); do i = 1 to m; x0 = t[i]; run PairwiseNearestNbr(idx, dist, x0, x, k); XNbrs = X[idx]; YNbrs = Y[idx]; /* X and Y values of k nearest nbrs */ /* compute local weight function where dist[,k] is max dist */ w = 32/5 * (1 - (dist / dist[k])##3 )##3; b = PolyRegEst(YNbrs, XNbrs, w`, deg); Fit[i] = PolyRegScore(x0, b); end; return Fit; finish; use LoessData; read all var {x y}; close; s = 0.383; /* smoothing parameter */ k = floor(nrow(x)*0.383); /* num points in local neighborhood */ use EvalPts; read all var "splitvalue" into t; close; Fit1 = LoessFit(t, x, y, k, 1); Fit2 = LoessFit(t, x, y, k, 2); create Confirm var {x y t Fit1 Fit2}; append; close; QUIT; /* STEP C: The LOESS curves from PROC SGPLOT should exactly overlay the curves computed in IML. */ title "Compare Manual Fit with LOESS Statement"; proc sgplot data=Confirm; scatter x=x y=y; loess x=x y=y / nomarkers interpolation=linear degree=1 smooth=0.383 legendlabel="Loess1" lineattrs=(color=black); loess x=x y=y / nomarkers interpolation=linear degree=2 smooth=0.383 legendlabel="Loess2" lineattrs=(color=black); /* overlay curves computed "by hand" */ series x=t y=Fit1 / curvelabel lineattrs=(color=red); series x=t y=Fit2 / curvelabel lineattrs=(color=green); run;