/************************************************************/ /* SAS/IML utility functions for solving Cryptoquotes by */ /* using frequency analysis */ /* Rick Wicklin */ /* From "How to use frequency analysis to crack the Cryptoquote puzzle" http://blogs.sas.com/content/iml/2014/10/29/frequency-analysis-cryptoquote/ */ /************************************************************/ proc iml; /* Break a message into an array. Each word is on one row; each letter is in a separate column. Usage: msg = "THE QUICK BROWN FOX"; w = SplitByWords(msg); */ start SplitByWords(_msg); msg = upcase(_msg); /* msg is scalar string */ m = substr(msg,1:length(msg),1); /* break into array */ SpaceIdx = loc(m=" "); FirstLetters = 1 || (1 + SpaceIdx); LastLetters = (SpaceIdx - 1) || length(msg); numWords = ncol(SpaceIdx) + 1; maxLength = max(LastLetters-FirstLetters+1); words = j(numWords, maxLength, " "); do i = 1 to numWords; range = FirstLetters[i]:LastLetters[i]; words[i, 1:ncol(range)] = m[,range]; end; return ( words ); finish; /* Compute frequency or proportion of letters in message. Return 1x26 vector of frequencies or proportions Usage: f = LetterFreq(msg); p = LetterProp(msg); print p[c=("A":"Z")]; */ start LetterFreq(_msg); msg = upcase(rowcat( shape(_msg,1) )); /* create one long string */ m = substr(msg,1:nleng(msg),1); /* break into array */ Letters = "A":"Z"; freq = j(1,ncol(Letters),0); do i = 1 to ncol(Letters); freq[i] = ncol(loc(m=Letters[i])); end; return(freq); finish; start LetterProp(msg); freq = LetterFreq(msg); return(freq / sum(freq)); /* standardize as proportion */ finish; /* return a 1x26 vector of frequency counts for double-letter bigrams */ start DblLetterFreq(_msg); Letters = "A":"Z"; freq = j(1, ncol(Letters), 0); w = SplitByWords(_msg); jdx = 1:(ncol(w)-1); d = (w[,jdx] = w[,jdx+1]) & (element(w[,jdx],Letters)); idx = loc(d=1); if ncol(idx)>0 then do; dbls = w[,jdx][idx]; * col vec of double letters; do i = 1 to nrow(dbls); j = loc(Letters=dbls[i]); freq[,j] = freq[,j] + 1; end; end; return ( freq ); finish; /* return a 26x26 matrix of frequency counts for bigrams */ start BigramFreq(_msg); msg = upcase(_msg); m = substr(msg,1:length(msg),1); /* break into array */ m = colvec(m); first = m[1:nrow(m)-1]; second = m[2:nrow(m)]; freq = j(26,26,0); Letters = "A":"Z"; do i = 1 to 26; do j = 1 to 26; freq[i,j] = ncol( loc(first=Letters[i] & second=Letters[j]) ); end; end; return( freq ); finish; /* print reference percentages */ /* See http://blogs.sas.com/content/iml/2014/09/19/frequency-of-letters/ http://blogs.sas.com/content/iml/2014/09/26/bigrams/ http://blogs.sas.com/content/iml/2014/10/03/double-letter-bigrams/ */ start PrintFreqRef(which=3); /* Create reference sheet */ letters = {E T A O I N S R H}; letpct = {0.1249 0.0928 0.0804 0.0764 0.0757 0.0723 0.0651 0.0628 0.0505}; bigrams = {TH HE IN ER AN RE ON AT EN ND TI ES OR TE OF ED IS IT AL AR ST NT TO}; bipct = {0.03556 0.03075 0.02433 0.02048 0.01985 0.01854 0.01758 0.01487 0.01454 0.01352 0.01343 0.01339 0.01277 0.01205 0.01175 0.01168 0.01128 0.01123 0.01087 0.01075 0.01053 0.01041 0.01041}; dbls = {LL SS EE OO TT FF PP RR MM CC NN}; dblpct = {0.00577 0.00405 0.00378 0.0021 0.00171 0.00146 0.00137 0.00121 0.00096 0.00083 0.00073 }; L=lowcase(letters); print letpct[c=L L="Letters" F=PERCENT8.2]; if which>1 then do; L=lowcase(bigrams[1:12]); print (bipct[,1:12])[c=L L="Bigrams" F=PERCENT8.2]; end; if which>2 then do; L=lowcase(dbls); print dblpct[c=L L="Double Letters" F=PERCENT8.2]; end; finish; /* Extract approximately the top k bigrams by frequency Input: p is a 26x26 matrix of bigram frequencies Output: FREQ is a row vector. Usually FREQ has at most k elements, but if m elements are tied for the highest frequency, then FREQ has at least m elements LABEL is the row vector of bigrams for FREQ Usage: run SortBigrams(freq, label, p); print freq[c=label F=PERCENT8.2]; */ start SortBigrams(freq, label, p, k=10); r = ranktie(p, "Dense"); /* 26x26 matrix of ranks: 1,2,...,maxRank */ maxRank = max(r); rank = maxRank:1; n = j(1,maxRank,0); /* count how many bigrams for each freq */ do i = maxRank to 1 by -1; n[i] = ncol(loc(r=rank[i])); end; cusum = cusum(n); /* {2 7 18 64 612} ==> 2 have r=5; 5 have r=4; 11 have r=3; 46 have r=2 */ /* include all of first group if number in first group is larger than k */ if k]; /* largest rank s.t. the cumulative total is less than k */ idx = loc( r>=rank[cutIdx] ); /* idx is never empty */ pp = p[idx]; rr = r[idx]; s = Ndx2Sub(dimension(r), idx); /* get subscripts into 26x26 matrix */ Letters = "A":"Z"; label = rowcat( Letters[s[,1]] || Letters[s[,2]] ); call sortndx(ndx, rr, 1, 1); /* sort in descending order */ freq = rowvec(pp[ndx]); label = rowvec(label[ndx]); finish; /* Input: 1x26 vector of proportions Output: sorted proportions and labels, optionally truncated at some cutoff value. Set cutoff=0 for no truncation Usage: p = PropLetters(msg); run SortLettersByProp(prop, label, p); print prop[c=label]; */ start SortLetters(prop, label, p, cutoff=0.05); free prop label; idx = loc(p>= cutoff); if IsEmpty(idx) then return; pp = p[idx]; Letters = ("A":"Z")[idx]; /* sort in descending order */ call sortndx(ndx, pp, 1, 1); prop = rowvec(pp[ndx]); label = rowvec(Letters[ndx]); finish; /* Print a frequency analysis for single letters and double-letter bigrams. */ start FreqAnalysis(msg, cutoff=0.05); p = LetterProp(msg); run SortLetters(prop, label, p, cutoff); print prop[c=label format=percent7.2 L="Letter Proportions"]; f = BigramFreq(msg); run SortBigrams(freq, label, f); if ^IsEmpty(freq) then print freq[c=label L="Frequency of Top Bigrams"]; p = DblLetterFreq(msg); run SortLetters(dbl, label, p) cutoff=1; if ^IsEmpty(dbl) then print dbl[c=label L="Double Letter Frequency"]; finish; /* Replace ciphertext symbols with guesses for plaintext letters. Usage: msg = "RHES ADLUI DEU VYP"; w = SplitByWords(msg); * split into array; w1 = ApplySubs(w, {U E D}, {e r a}); print w1; w2 = ApplySubs(w1, {H I}, {o s}); print w2; w3 = ApplySubs(w2, {L P}, {m n}); print w3; w4 = ApplySubs(w3, {R S A V Y}, {w d g f u}); print w4; */ start ApplySubs(wOld, _cipher, _plain); w = wOld; cipher = colvec(upcase(_cipher)); plain = colvec(lowcase(_plain)); do i = 1 to nrow(cipher); idx = loc(w=cipher[i]); if ^IsEmpty(idx) then w[idx] = plain[i]; end; return( w ); finish; store module=_all_; quit; proc iml; load module=_all_; run PrintFreqRef(); /* print reference distribution of letters and bigrams */ print "------------------------------------------------"; reset linesize=140; msg = "KBSCS OCS KPUSH XBSW DOCSWKBTTG HSSUH WTKBPWJ " + "IAK LSSGPWJ KBS UTAKB KBOK IPKSH MTA. " + "~ DSKSC GS FCPSH"; run FreqAnalysis(msg); /* distribution of letters and bigrams for this message */ /* The most common letters are ETAOINSRHL. The most common double-letter bigrams are LL, SS, EE, OO, TT. It seems likely that S=e. The bigram KB appears often in this message, so perhaps KB=th. */ w = SplitByWords(msg); /* split into array */ rownum = char(1:nrow(w)); /* row numbers, for reference */ w1 = ApplySubs(w, {K B S}, {t h e}); /* try K=t, B=h, and S=e */ print w1[r=rownum]; /* Promising. Probably C=r and second word is 'are'? Notice that CS=re is a frequent bigram! */ w2 = ApplySubs(w1, {O C}, {a r}); print w2[r=rownum]; /* what about the other double letter combination (line 5). It can't be a double consonant secause if follows a 'th'. Since 'ee' is already revealed, try 'oo'. Also, is the author's first name 'peter'? */ w3 = ApplySubs(w2, {T D}, {o p}); print w3[r=rownum]; /* The 5th word must be 'parenthood'. Guess H=s from frequency. */ /* The suffix 'PnJ' occurs twice. Try 'ing' */ w4 = ApplySubs(w3, {H W G P J}, {s n d i g}); print w4[r=rownum]; /* 3rd word = 'times'? 4th word 'when'?*/ /* author is 'peter de vries' */ w5 = ApplySubs(w4, {U X F}, {m w v}); print w5[r=rownum]; /* 11th word = 'mouth', so 8th word = 'but' and 14th word 'you'. 9th word = 'feeding' */ w6 = ApplySubs(w5, {I A M L}, {b u y f}); print w6[r=rownum]; /*****************************************/ proc iml; load module=_all_; /* More to try: LX TEFKEN KEDT LJ KE OKER UVGQAZ QAKQLVKJ TQAAJNO BEN BEMN. MADJOO KUJNJ VNJ KUNJJ EKUJN RJERDJ. ~ENOEA PJDDJO PL GLTPWWGRWB JBANXM XT EXLEGWGPAGLH P AGHBZ GY AX PWWXC XLBYBWT AX RB MBUXOZBM. ~QXLZPM PMBLPOBZ AYDN XFAQO LCROJD XQUC QO QSAYDN ROERORNC KQMQKRNH EYV NQTROJ NXROJD EYV JVQONCI. ~QSIYFD XFBSCH K'Y SKFKJU DE LWT ANOEJQ YO KJIEYN GMWG XN YWO WSYEDG AN DWKQ GE AN WVWTG. ~ N. N. IPYYKJUD */ msg = "CG EKLVFX WLU MVXZG IGLIFG YH UKGSB IGPQ " + "LR GNDGFFGWDG; YVU YH UKG XSEUPWDG UKGH KPTG " + "UBPTGFGX RBLA UKG ILSWU CKGBG UKGH EUPBUGX. " + "~ KGWBH CPBX YGGDKGB"; run PrintFreqRef(); print "------------------------------------------------"; run FreqAnalysis(msg) cutoff=0.045; /* several letters just under 5% */ w = SplitByWords(msg); rownum = char(1:nrow(w)); print w[r=rownum]; /* From freq analysis, guess that G=e. Guess the bigrams KG=he and UK=th */ w1 = ApplySubs(w, {U K G}, {t h e}); print w1[r=rownum]; /* Is author's first name 'henry'? B=r makes sense in words like 7 & 21. Also B is common in the message and r is common at large. */ w2 = ApplySubs(w1, {W B H}, {n r y}); print w2[r=rownum]; /* 7th word is 'their'; 21st is 'where'? */ w3 = ApplySubs(w2, {S C}, {i w}); print w3[r=rownum]; /* going back to letter frequencies: L, P, and X are unknown, as are a and o. Which could be vowels? X is not a vowel (word 17) L is a vowel (word 3) so must be o. P might be a vowel. From word 8, try P=a. Is the 16th word 'have'? */ w4 = ApplySubs(w3, {L P T}, {o a v}); print w4[r=rownum]; /* word 2 looks like 'should', and that would make F=l and X=d. Note that FF is one of the double-letter bigrams, and ll is common. */ w5 = ApplySubs(w4, {E V F X}, {s u l d}); print w5[r=rownum]; /* At this point we can unravel the cipher by guessing words: 4th = judge 5th = people 6th = by 14th = distance 18th = from*/ w6 = ApplySubs(w5, {M Z I Y D R A Q N}, {j g p b c f m k x}); print w6[r=rownum]; /*****************************************/ proc iml; load module=_all_; msg = "LX LEKUJN KEDT LJ KE OKER UVGQAZ QAKQLVKJ TQAAJNO BEN BEMN " + "MADJOO KUJNJ VNJ KUNJJ EKUJN RJERDJ. " + "~ ENOEA PJDDJO"; run PrintFreqRef(); print "------------------------------------------------"; run FreqAnalysis(msg); w = SplitByWords(msg); rownum = char(1:nrow(w)); print w[r=rownum]; /* from freq analysis of letters and bigrams: J = e. Try N=r. Then by inspection it looks like KU=th */ w1 = ApplySubs(w, {J N K U}, {e r t h}); print w1[r=rownum]; /* Attack the small words: 12th = me ==> 1st = my 5th = to 14th = are */ w2 = ApplySubs(w1, {L E V}, {m o a}); print w2[r=rownum]; /* A, O, and D are unknown consonants because each appears as a double letter and e and o are already known. All are common letters, and DJ=De and QA are common bigrams. Guess QA=in and D and O in {l s f p m n} */ w3 = ApplySubs(w2, {Q A D O}, {i n l s}); print w3[r=rownum]; /* Some words stand out. It unravels */ w4 = ApplySubs(w3, {X T R G B M P Z}, {y d p v f u w g}); print w4[r=rownum]; w5 = ApplySubs(w4, {S F L T M}, {l v f r h}); print w5[r=rownum];