/************************************************************/
/* SAS/IML utility functions for solving Cryptoquotes by    */
/* using frequency analysis                                 */
/* Rick Wicklin                                             */
/* From "How to use frequency analysis to crack the Cryptoquote puzzle"
   http://blogs.sas.com/content/iml/2014/10/29/frequency-analysis-cryptoquote/
*/
/************************************************************/

proc iml;
/* Break a message into an array. Each word is on one row;
   each letter is in a separate column.
   Usage: msg = "THE QUICK BROWN FOX";
          w = SplitByWords(msg);
*/
start SplitByWords(_msg);
   msg = upcase(_msg);              /* msg is scalar string */
   m = substr(msg,1:length(msg),1); /* break into array */
   SpaceIdx = loc(m=" ");
   FirstLetters = 1 || (1 + SpaceIdx);
   LastLetters  = (SpaceIdx - 1) || length(msg);
   numWords = ncol(SpaceIdx) + 1;
   maxLength = max(LastLetters-FirstLetters+1);
   words = j(numWords, maxLength, " ");
   do i = 1 to numWords;
      range = FirstLetters[i]:LastLetters[i];
      words[i, 1:ncol(range)] = m[,range];
   end;
   return ( words );
finish;

/* Compute frequency or proportion of letters in message. 
   Return 1x26 vector of frequencies or proportions
   Usage: f = LetterFreq(msg);
          p = LetterProp(msg);
          print p[c=("A":"Z")];
*/
start LetterFreq(_msg);
   msg = upcase(rowcat( shape(_msg,1) )); /* create one long string */
   m = substr(msg,1:nleng(msg),1);        /* break into array */
   Letters = "A":"Z";
   freq = j(1,ncol(Letters),0);
   do i = 1 to ncol(Letters);
      freq[i] = ncol(loc(m=Letters[i]));
   end;
   return(freq);
finish;

start LetterProp(msg);
   freq = LetterFreq(msg);
   return(freq / sum(freq));   /* standardize as proportion */
finish;
 
/* return a 1x26 vector of frequency counts for double-letter bigrams */
start DblLetterFreq(_msg);
   Letters = "A":"Z";
   freq = j(1, ncol(Letters), 0);
   w = SplitByWords(_msg);
   jdx = 1:(ncol(w)-1);
   d = (w[,jdx] = w[,jdx+1]) & (element(w[,jdx],Letters));
   idx = loc(d=1);
   if ncol(idx)>0 then do;
      dbls = w[,jdx][idx];    * col vec of double letters;
      do i = 1 to nrow(dbls);
         j = loc(Letters=dbls[i]);
         freq[,j] = freq[,j] + 1;
      end;
   end;
   return ( freq );
finish;

/* return a 26x26 matrix of frequency counts for bigrams */
start BigramFreq(_msg);
   msg = upcase(_msg);
   m = substr(msg,1:length(msg),1); /* break into array */
   m = colvec(m);
   first  = m[1:nrow(m)-1];
   second = m[2:nrow(m)];

   freq = j(26,26,0);
   Letters = "A":"Z";
   do i = 1 to 26;
      do j = 1 to 26;
         freq[i,j] = ncol( loc(first=Letters[i] & second=Letters[j]) );
      end;
   end;
   return( freq );
finish;

/* print reference percentages */
/* See
   http://blogs.sas.com/content/iml/2014/09/19/frequency-of-letters/
   http://blogs.sas.com/content/iml/2014/09/26/bigrams/
   http://blogs.sas.com/content/iml/2014/10/03/double-letter-bigrams/
*/
start PrintFreqRef(which=3);
   /* Create reference sheet */
   letters = {E T A O I N S R H};
   letpct = {0.1249 0.0928 0.0804 0.0764 0.0757 0.0723 0.0651 0.0628 0.0505};
   bigrams = {TH HE IN ER AN RE ON AT EN ND TI ES OR TE OF ED IS IT AL AR ST NT TO};
   bipct = {0.03556 0.03075 0.02433 0.02048 0.01985 0.01854 0.01758 0.01487 0.01454 
            0.01352 0.01343 0.01339 0.01277 0.01205 0.01175 0.01168 0.01128 0.01123 
            0.01087 0.01075 0.01053 0.01041 0.01041};
   dbls = {LL SS EE OO TT FF PP RR MM CC NN};
   dblpct = {0.00577 0.00405 0.00378 0.0021 0.00171 0.00146 
             0.00137 0.00121 0.00096 0.00083 0.00073 };
   L=lowcase(letters);
   print letpct[c=L L="Letters" F=PERCENT8.2];
   if which>1 then do;
      L=lowcase(bigrams[1:12]);
      print (bipct[,1:12])[c=L L="Bigrams" F=PERCENT8.2];
   end;
   if which>2 then do;
      L=lowcase(dbls);
      print dblpct[c=L L="Double Letters" F=PERCENT8.2];
   end;
finish;

/* Extract approximately the top k bigrams by frequency
   Input: p is a 26x26 matrix of bigram frequencies 
   Output: FREQ is a row vector. Usually FREQ has at most k elements, 
                but if m elements are tied for the highest frequency, 
                then FREQ has at least m elements
           LABEL is the row vector of bigrams for FREQ
   Usage:  run SortBigrams(freq, label, p);
           print freq[c=label F=PERCENT8.2]; 
*/
start SortBigrams(freq, label, p, k=10);
   r = ranktie(p, "Dense");  /* 26x26 matrix of ranks: 1,2,...,maxRank */ 
   maxRank = max(r);
   rank = maxRank:1;
   n = j(1,maxRank,0);       /* count how many bigrams for each freq */
   do i = maxRank to 1 by -1;
      n[i] = ncol(loc(r=rank[i]));
   end;
   cusum = cusum(n);  /* {2 7 18 64 612} ==> 2 have r=5; 5 have r=4; 11 have r=3; 46 have r=2 */
   
   /* include all of first group if number in first group is larger than k */
   if k<n[1] then cutIdx = 1;
   else cutIdx = loc(cusum<=k)[<>]; /* largest rank s.t. the cumulative total is less than k */
   
   idx = loc( r>=rank[cutIdx] );    /* idx is never empty */
   pp = p[idx];
   rr = r[idx];
   s = Ndx2Sub(dimension(r), idx); /* get subscripts into 26x26 matrix */
   Letters = "A":"Z";
   label = rowcat( Letters[s[,1]] || Letters[s[,2]] );
   call sortndx(ndx, rr, 1, 1);    /* sort in descending order */
   freq = rowvec(pp[ndx]); 
   label = rowvec(label[ndx]);   
finish;

/* Input: 1x26 vector of proportions
   Output: sorted proportions and labels, optionally
           truncated at some cutoff value. Set cutoff=0 for no truncation
   Usage:  p = PropLetters(msg);
           run SortLettersByProp(prop, label, p);
           print prop[c=label];
*/
start SortLetters(prop, label, p, cutoff=0.05);
   free prop label;
   idx = loc(p>= cutoff);
   if IsEmpty(idx) then return;
   pp = p[idx];  Letters = ("A":"Z")[idx];
   /* sort in descending order */
   call sortndx(ndx, pp, 1, 1); 
   prop = rowvec(pp[ndx]); 
   label = rowvec(Letters[ndx]);
finish;

/* Print a frequency analysis for single letters and double-letter
   bigrams.
*/
start FreqAnalysis(msg, cutoff=0.05);
   p = LetterProp(msg);
   run SortLetters(prop, label, p, cutoff);
   print prop[c=label format=percent7.2 L="Letter Proportions"];
   f = BigramFreq(msg);
   run SortBigrams(freq, label, f);
   if ^IsEmpty(freq) then
      print freq[c=label L="Frequency of Top Bigrams"];
   p = DblLetterFreq(msg);
   run SortLetters(dbl, label, p) cutoff=1;
   if ^IsEmpty(dbl) then
      print dbl[c=label L="Double Letter Frequency"]; 
finish;

/* Replace ciphertext symbols with guesses for plaintext letters.
   Usage: msg = "RHES ADLUI DEU VYP";
          w = SplitByWords(msg);            * split into array;
          w1 = ApplySubs(w,  {U E D}, {e r a}); print w1;
          w2 = ApplySubs(w1, {H I}, {o s}); print w2;
          w3 = ApplySubs(w2, {L P}, {m n}); print w3;
          w4 = ApplySubs(w3, {R S A V Y}, {w d g f u}); print w4; 
*/
start ApplySubs(wOld, _cipher, _plain);
   w = wOld;  
   cipher = colvec(upcase(_cipher)); 
   plain  = colvec(lowcase(_plain)); 
   do i = 1 to nrow(cipher);
      idx = loc(w=cipher[i]);
      if ^IsEmpty(idx) then w[idx] = plain[i];
   end;
   return( w );
finish;
store module=_all_;
quit;

proc iml;
load module=_all_;
run PrintFreqRef();    /* print reference distribution of letters and bigrams */
print "------------------------------------------------";

reset linesize=140;
msg = "KBSCS OCS KPUSH XBSW DOCSWKBTTG HSSUH WTKBPWJ " +
      "IAK LSSGPWJ KBS UTAKB KBOK IPKSH MTA. " +
      "~ DSKSC GS FCPSH";

run FreqAnalysis(msg); /* distribution of letters and bigrams for this message */

/* The most common letters are ETAOINSRHL.
   The most common double-letter bigrams are LL, SS, EE, OO, TT.
   It seems likely that S=e.  The bigram KB appears often 
   in this message, so perhaps KB=th.
*/
w = SplitByWords(msg);            /* split into array */
rownum = char(1:nrow(w));         /* row numbers, for reference */
w1 = ApplySubs(w, {K B S}, {t h e});  /* try K=t, B=h, and S=e */
print w1[r=rownum];

/* Promising. Probably C=r and second word is 'are'? Notice that 
   CS=re is a frequent bigram! */
w2 = ApplySubs(w1, {O C}, {a r});
print w2[r=rownum];

/* what about the other double letter combination (line 5).
   It can't be a double consonant secause if follows a 'th'. 
   Since 'ee' is already revealed, try 'oo'.
   Also, is the author's first name 'peter'? */
w3 = ApplySubs(w2, {T D}, {o p});
print w3[r=rownum];

/* The 5th word must be 'parenthood'. Guess H=s from frequency. */
/* The suffix 'PnJ' occurs twice. Try 'ing' */
w4 = ApplySubs(w3, {H W G P J}, {s n d i g});
print w4[r=rownum];

/* 3rd word = 'times'? 4th word 'when'?*/
/* author is 'peter de vries' */
w5 = ApplySubs(w4, {U X F}, {m w v});
print w5[r=rownum];
/* 11th word = 'mouth', so 8th word = 'but'
   and 14th word 'you'. 9th word = 'feeding' */
w6 = ApplySubs(w5, {I A M L}, {b u y f});
print w6[r=rownum];


/*****************************************/
proc iml;
load module=_all_;
/*
More to try:
LX TEFKEN KEDT LJ KE OKER UVGQAZ QAKQLVKJ TQAAJNO BEN BEMN. MADJOO KUJNJ VNJ KUNJJ EKUJN RJERDJ. ~ENOEA PJDDJO
PL GLTPWWGRWB JBANXM XT EXLEGWGPAGLH P AGHBZ GY AX PWWXC XLBYBWT AX RB MBUXOZBM. ~QXLZPM PMBLPOBZ
AYDN XFAQO LCROJD XQUC QO QSAYDN ROERORNC KQMQKRNH EYV NQTROJ NXROJD EYV JVQONCI. ~QSIYFD XFBSCH
K'Y SKFKJU DE LWT ANOEJQ YO KJIEYN GMWG XN YWO WSYEDG AN DWKQ GE AN WVWTG. ~ N. N. IPYYKJUD
*/
msg = "CG EKLVFX WLU MVXZG IGLIFG YH UKGSB IGPQ " +
      "LR GNDGFFGWDG; YVU YH UKG XSEUPWDG UKGH KPTG " +
      "UBPTGFGX RBLA UKG ILSWU CKGBG UKGH EUPBUGX. " +
      "~ KGWBH CPBX YGGDKGB";

run PrintFreqRef();
print "------------------------------------------------";
run FreqAnalysis(msg) cutoff=0.045;  /* several letters just under 5% */

w = SplitByWords(msg);
rownum = char(1:nrow(w));
print w[r=rownum];

/* From freq analysis, guess that G=e. Guess the bigrams
   KG=he and UK=th */
w1 = ApplySubs(w, {U K G}, {t h e});
print w1[r=rownum];
 
/* Is author's first name 'henry'? 
   B=r makes sense in words like 7 & 21. Also B is common in the message
   and r is common at large. */
w2 = ApplySubs(w1, {W B H}, {n r y});
print w2[r=rownum];

/* 7th word is 'their'; 21st is 'where'? */
w3 = ApplySubs(w2, {S C}, {i w});
print w3[r=rownum];

/* going back to letter frequencies: L, P, and X are unknown,
   as are a and o.  Which could be vowels?
   X is not a vowel (word 17)
   L is a vowel (word 3) so must be o.
   P might be a vowel. From word 8, try P=a.
   Is the 16th word 'have'? */
w4 = ApplySubs(w3, {L P T}, {o a v});
print w4[r=rownum];

/* word 2 looks like 'should', and that would make F=l and X=d. 
   Note that FF is one of the double-letter bigrams, and ll is common.
*/
w5 = ApplySubs(w4, {E V F X}, {s u l d});
print w5[r=rownum];

/* At this point we can unravel the cipher by guessing words:
   4th = judge
   5th = people 
   6th = by
   14th = distance
   18th = from*/
w6 = ApplySubs(w5, {M Z I Y D R A Q N}, {j g p b c f m k x});
print w6[r=rownum];


/*****************************************/
proc iml;
load module=_all_;
msg = "LX LEKUJN KEDT LJ KE OKER UVGQAZ QAKQLVKJ TQAAJNO BEN BEMN " +
      "MADJOO KUJNJ VNJ KUNJJ EKUJN RJERDJ. " +
      "~ ENOEA PJDDJO";

run PrintFreqRef();
print "------------------------------------------------";
run FreqAnalysis(msg);  

w = SplitByWords(msg);
rownum = char(1:nrow(w));
print w[r=rownum];

/* from freq analysis of letters and bigrams: 
   J = e. Try N=r. Then by inspection it looks like KU=th */
w1 = ApplySubs(w, {J N K U}, {e r t h});
print w1[r=rownum];

/* Attack the small words:
   12th = me ==> 1st = my 
   5th = to
   14th = are */
w2 = ApplySubs(w1, {L E V}, {m o a});
print w2[r=rownum];

/* A, O, and D are unknown consonants because each appears as 
   a double letter and e and o are already known.
   All are common letters, and DJ=De and QA are common bigrams.
   Guess QA=in and D and O in {l s f p m n} */
w3 = ApplySubs(w2, {Q A D O}, {i n l s});
print w3[r=rownum];

/* Some words stand out. It unravels */
w4 = ApplySubs(w3, {X T R G B M P Z}, {y d p v f u w g});
print w4[r=rownum];

w5 = ApplySubs(w4, {S F L T M}, {l v f r h});
print w5[r=rownum];