/* Program to accompany the article 
   Wicklin, R. (2016) "Loess regression in SAS/IML", The DO Loop blog, 19OCT2016
   http://blogs.sas.com/content/iml/2016/10/19/loess-regression-in-sasiml.html ?

   This program implements a basic 1-D loess regression algorithm in SAS/IML. 
*/
/****************************************/
/* STEP 1: Define some helper functions */
/****************************************/
proc iml;
/* http://blogs.sas.com/content/iml/2013/03/27/compute-distance.html */
/* compute Euclidean distance between points in S and points in R.
   S is a n x d matrix, where each row is a point in d dimensions.
   R is a m x d matrix.
   The function returns the n x m matrix of distances, D, such that
   D[i,j] is the distance between S[i,] and R[j,]. */
start PairwiseDist(S, R);
   if ncol(S)^=ncol(R) then return (.);       /* different dimensions */
   n = nrow(S);  m = nrow(R);
   idx = T(repeat(1:n, m));                   /* index matrix for S   */
   jdx = shape(repeat(1:m, n), n);            /* index matrix for R   */
   diff = S[idx,] - R[jdx,];
   return( shape( sqrt(diff[,##]), n ) );     /* sqrt(sum of squares) */
finish;

/* Compute indices (row numbers) of k nearest neighbors.
   INPUT:  S    an (n x d) data matrix
           R    an (m x d) matrix of reference points
           k    specifies the number of nearest neighbors (k>=1) 
   OUTPUT: idx  an (n x k) matrix of row numbers. idx[,j] contains
                the row numbers (in R) of the j_th closest elements
                to S
           dist an (n x k) matrix. dist[,j] contains the distances
                between S and the j_th closest elements in R
*/
start PairwiseNearestNbr(idx, dist, S, R, k=1);
   n = nrow(S);
   idx = j(n, k, .);
   dist = j(n, k, .);
   D = PairwiseDist(S, R);      /* n x m */
   do j = 1 to k;
      dist[,j] = D[ ,><];       /* smallest distance in each row */
      idx[,j] = D[ ,>:<];       /* column of smallest distance in each row */
      if j < k then do;         /* prepare for next closest neighbors */
         ndx = sub2ndx(dimension(D), T(1:n)||idx[,j]);
         D[ndx] = .;            /* set elements to missing */
      end;      
   end;
finish;

/* weighted polynomial regression for one regressor. Y, X, and w are col vectors */
/* http://blogs.sas.com/content/iml/2016/10/05/weighted-regression.html */
start PolyRegEst(Y, X, w, deg);
   Yw = sqrt(w)#Y;                  /* mult rows of Y by sqrt(w) */
   XDesign = j(nrow(X), deg+1);
   do j = 0 to deg;                 /* design matrix for polynomial regression */
      Xdesign[,j+1] = X##j;
   end;
   Xw = sqrt(w)#Xdesign;            /* mult rows of X by sqrt(w) */
   b = solve(Xw`*Xw, Xw`*Yw);       /* solve normal equation */
   return b;
finish;

/* Score regression fit on column vector x */
start PolyRegScore(x, coef);
   p = nrow(coef);
   y = j(nrow(x), 1, coef[p]);            /* initialize to coef[p] */
   do j = p-1 to 1 by -1; 
      y = y # x + coef[j];
   end;
   return(y);
finish;
store module=_ALL_;
QUIT;

/****************************************/
/* STEP 2: Define example data          */
/****************************************/

data LoessData;
input x y @@;
datalines;
11.7  2.3  19.9  8.1  11.8  4.6  17.1  5.1  16.5  4.8
 5.6  1.7  12.9  5.4   7.6  3.0   9.0  4.8  17.5  5.0
10.4  1.3  16.9  2.8   5.6  1.8  18.7  6.9   3.7  1.7
 7.4  2.3   2.0  2.7  14.8  5.2   3.0  0.0  16.8  4.2
15.0  6.6  19.9  5.5   1.9  1.1  14.8  5.8  12.4  3.0
14.0  6.4  11.7  3.6   8.2  2.9  18.8  6.2   0.3  1.8
;

/****************************************/
/* STEP 3: Define loess function        */
/****************************************/
proc iml;
load  module=(PairwiseNearestNbr PolyRegEst PolyRegScore);
/* General 1-D loess algorithm. 
   Input: t: points at which to fit loess (column vector)
          x, y: nonmissing data (column vectors)
          k: number of nearest neighbors used for loess fit
          deg: degree of local regression model 
   Output: column vector L[i] = f(t[i]; k, deg) where f is loess model
*/
start LoessFit(t, x, y, k, deg=1);
   m = nrow(t);
   Fit = j(m, 1);
   do i = 1 to m;
      x0 = t[i];
      run PairwiseNearestNbr(idx, dist, x0, x, k);
      XNbrs = X[idx]; YNbrs = Y[idx]; /* X and Y values of k nearest nbrs */
      /* compute local weight function where dist[,k] is max dist */
      w = 32/5 * (1 - (dist / dist[k])##3 )##3; 
      b = PolyRegEst(YNbrs, XNbrs, w`, deg);
      Fit[i] = PolyRegScore(x0, b);
   end;
   return Fit;
finish;

/******************************************/
/* STEP 4: Call loess function on example */
/******************************************/
use LoessData;  read all var {x y};  close;   /* read example data */

s = 0.383;                /* specify smoothing parameter */
k = floor(nrow(x)*0.383); /* num points in local neighborhood */

/* grid of points to evaluate loess curve (column vector) */
t    = T( do(min(x), max(x), (max(x)-min(x))/50) );
Fit0 = LoessFit(t, x, y, k, 0);    /* loess fit with degree=0 polynomials */
Fit1 = LoessFit(t, x, y, k, 1);    /* degree=1 */
Fit2 = LoessFit(t, x, y, k, 2);    /* degree=2 */

create Sim var {x y t Fit0 Fit1 Fit2};  append;  close;
QUIT;

/****************************************/
/* STEP 5: Overlay loess curves on data */
/****************************************/

title "Overlay loess curves computed in SAS/IML";
proc sgplot data=Sim;
label Fit0="Loess (Deg=0)" Fit1="Loess (Deg=1)" Fit2="Loess (Deg=2)"; 
scatter x=x y=y;
series x=t  y=Fit0 / curvelabel;
series x=t  y=Fit1 / curvelabel lineattrs=(color=red);
series x=t  y=Fit2 / curvelabel lineattrs=(color=ForestGreen);
xaxis grid; yaxis grid;
run;


/****************************************************************************/
/****************************************************************************/
/* OPTIONAL: Compare SAS/IML predicted values to standard LOESS predictions */
/****************************************************************************/
/****************************************************************************/
/* STEP A: Output values used by LOESS to evaluate the loess model */
/* Verify that the loess algoithms  give same values
   when they use the same prediction points. Output vertices 
   of the k-d tree that PROC LOESS uses for evaluating the 
  model when smooth = 0.383 */
ods select none;
proc loess data=LoessData;
model y = x / interp=linear degree=1 smooth=0.383;
ods output kdtree=EvalPts;
run;
proc sort data=EvalPts;
by splitvalue;
where splitvalue > 0;
run;
ods select all;

/**********************************************************************/
/* STEP B: Evaluate IML version of loess at the same values.          */
/* The data set EvalPoints includes data values at which to evaluate
   the loess model. In 1-D, the min and max of the data are also used */
/**********************************************************************/
proc iml;
load  module=(PairwiseNearestNbr PolyRegEst PolyRegScore);
start LoessFit(t, x, y, k, deg=1);
   m = nrow(t);
   Fit = j(m, 1);
   do i = 1 to m;
      x0 = t[i];
      run PairwiseNearestNbr(idx, dist, x0, x, k);
      XNbrs = X[idx];  YNbrs = Y[idx];  /* X and Y values of k nearest nbrs */
      /* compute local weight function where dist[,k] is max dist */
      w = 32/5 * (1 - (dist / dist[k])##3 )##3; 
      b = PolyRegEst(YNbrs, XNbrs, w`, deg);
      Fit[i] = PolyRegScore(x0, b);
   end;
   return Fit;
finish;

use LoessData;  read all var {x y};  close;
s = 0.383;       /* smoothing parameter */
k = floor(nrow(x)*0.383); /* num points in local neighborhood */

use EvalPts;  read all var "splitvalue" into t; close;
Fit1 = LoessFit(t, x, y, k, 1);
Fit2 = LoessFit(t, x, y, k, 2);
create Confirm var {x y t Fit1 Fit2}; append; close;
QUIT;

/* STEP C: The LOESS  curves from PROC SGPLOT should exactly 
   overlay the curves computed in IML. */
title "Compare Manual Fit with LOESS Statement";
proc sgplot data=Confirm;
scatter x=x y=y;
loess x=x y=y / nomarkers interpolation=linear degree=1 
                smooth=0.383 legendlabel="Loess1" lineattrs=(color=black);
loess x=x y=y / nomarkers interpolation=linear degree=2 
                smooth=0.383 legendlabel="Loess2" lineattrs=(color=black);
/* overlay curves computed "by hand" */
series x=t  y=Fit1 / curvelabel lineattrs=(color=red);
series x=t  y=Fit2 / curvelabel lineattrs=(color=green);
run;