/* SAS/IML program to accompany the article "Reorder variables in a heat map or scatter plot matrix" by Rick Wicklin, published 2MAY2018 on The DO Loop blog: https://blogs.sas.com/content/iml/2018/05/01/reorder-variable-correlation-heat-map.html This program implements a variable reordering method based on Catherine B. Hurley, 2004, "Clustering Visualization of Multidimensional Data", Journal of Computational and Graphical Statistics, 13(4), 788-806. https://www.tandfonline.com/doi/abs/10.1198/106186004X12425 */ proc iml; /* Single Link Clustering (SLC) from 'merit matrix' */ start SingleLinkCluster( Merit ); /* Implement single link clustering algorithm as outlined in Hurley (2004) to create a permutation of variables. Hurley attributes the algorithm to Gruvaeus and Wainer (1972). */ n = nrow(Merit); if n <= 2 then return ( 1:n ); /* find indices Merit(i,j) with i>j */ RowIndex = repeat( T(1:n), 1, n ); ColIndex = repeat( 1:n, n ); LowerTriangularIndex = loc( RowIndex>ColIndex ); ColIndex = ColIndex[LowerTriangularIndex]; RowIndex = RowIndex[LowerTriangularIndex]; rho = -Merit; d = rho[LowerTriangularIndex] || RowIndex || ColIndex; rank = rank( d[,1] ); /* descending rank */ *print rho, d rank; call sortndx( sortIndex, rank, 1 ); d = d[ sortIndex, ]; /* ClusIndex keeps track of which cluster each variable 1:n belongs to at the i_th step of the algorithm. If ClusIndex[k]=r, then variable k belongs to the r_th cluster. Initially, every var is in its own cluster. */ ClusIndex = 1:n; /* Emulate a list of arrays using a dense nxn matrix. The k_th member of the list is the nonmissing elements of row k. Use ListLength(m,k) to get the number of elements in the k_th array. Use SLCListGet(m,k) to get the k_th array. Use SLCListReplace(m,k,v) to replace the k_th array. */ clusters = j(n,n,.); /* allocate storage for "list" */ clusters[,1] = T(1:n); /* initially each variable is in its own cluster */ m = nrow(d); /* at step i we have formed n-i+1 clusters */ do i = 1 to m; j = ClusIndex[d[i,2]]; k = ClusIndex[d[i,3]]; if j ^= k then do; if j > k then do; /* swap since we are using indices for lower triangle */ temp = j; j = k; k = temp; end; /* Get the j_th and k_th clusters */ clusj = SLCListGet( clusters, j ); clusk = SLCListGet( clusters, k); /* Merge j_th and k_th clusters. If clusj = a1..aj and clusk = b1..bk then merge(clusj,clusk) is formed by looking at the ranking of (a1,b1), (a1,bk), (aj, b1), and (aj, bk). */ d_11 = rho[clusj[1], clusk[1]]; d_1k = rho[clusj[1], clusk[ncol(clusk)]]; d_j1 = rho[clusj[ncol(clusj)], clusk[1]]; d_jk = rho[clusj[ncol(clusj)], clusk[ncol(clusk)]]; minD = min(d_11, d_1k, d_j1, d_jk); *print "At Step=" i " Merge " j "th and " k "th clusters"; *print clusj, clusk; /* The new cluster is of one of four forms: 1) a1..aj, b1..bk */ if minD = d_j1 then ; /* 4) aj..a1, bk..b1 */ else if minD = d_1k then do; clusj = ReverseRowVec(clusj); clusk = ReverseRowVec(clusk); end; /* 3) aj..a1, b1..bk */ else if minD = d_11 then clusj = ReverseRowVec(clusj); /* 2) a1..aj, bk..b1 */ else if minD = d_jk then clusk = ReverseRowVec(clusk); /* merge the two clusters into the "list" */ run SLCListReplace( clusters, j, clusj || clusk ); /* update the cluster index */ ClusIndex[loc(ClusIndex=k)] = j; if SLCListLength( clusters, 1 ) = n then return ( SLCListGet( clusters, 1 ) ); end; end; return ( SLCListGet( clusters, 1 ) ); finish; /* ------------------- helper functions ------------------ */ start ReverseRowVec(x); return ( x[,ncol(x):1] ); finish; start SLCListGet( x, i ); /* x is a matrix that emulates a list of arrays. Extract the i_th row and strip off trailing missing values */ if i>nrow(x) then return (.); /* error */ v = x[i,]; idx = loc( v ^= . ); if type(idx)='U' then return (.); /* error */ return ( v[,idx] ); finish; start SLCListReplace( x, i, v ); /* x is a matrix that emulates a list of arrays. Overwrite the i_th row with v and pad with missing values */ if i>nrow(x) then return; /* error */ if ncol(v) < ncol(x) then x[i,] = v || j(1, ncol(x)-ncol(v), .); else x[i,] = v; finish; start SLCListLength( x, i ); /* x is a matrix that emulates a list of arrays. Return the number of nonmissing values in the i_th row */ if i>nrow(x) then return (.); /* error */ idx = loc( x[i,] ^= . ); return ( ncol(idx) ); finish; store module=_all_; quit;