/* SAS program to accompany the article
"The distribution of shared birthdays in the Birthday Problem"
by Rick Wicklin, published 07FEB2018 on The DO Loop blog:
https://blogs.sas.com/content/iml/2018/02/07/distribution-sha…birthday-problem.html
The classical birthday problem deals with the probability of a
binary random variable: Is there a match or not? But you can also
ask about the distributions of the number of matching birthdays.
This program shows how to simulate the multiple birthday problem:
If there are N people in a room, what is the probability that at least
k birthdays are shared by two or more people, k=0, 1, 2, ..., floor(N/2)?
*/
ods graphics / reset antialias=on;
title;
proc iml;
/* Function that simulates B rooms, each with N people, and counts the number of shared
(matching) birthdays in each room. The return value is a row vector that has B counts. */
start BirthdaySim(N, B);
bday = Sample(1:365, B||N); /* each column is a room */
match = N - countunique(bday, "col"); /* number of unique birthdays in each col */
return ( match );
finish;
call randseed(12345); /* set seed for random number stream */
NumPeople = 23;
NumRooms = 1e6;
match = BirthdaySim(NumPeople, NumRooms); /* 1e6 counts */
call tabulate(k, freq, match); /* summarize: k=number of shared birthdays, freq=counts */
prob = freq / NumRooms; /* proportion of rooms that have 0, 1, 2,... shared birthdays */
print prob[F=BEST6. C=(char(k,2)) L="Estimate of Probability (N=23)"];
/* write data and create stacked bar chart */
N = j(1, ncol(prob), NumPeople);
create Sim23 var {"N" "k" "prob"}; append; close;
submit;
ods graphics / width=500px height=400px;
title "Probability of k Shared Birthdays in a Room with N=23 People";
proc sgplot data=Sim23;
vbar k / response=Prob;
xaxis label="k: Number of Shared Birthdays";
yaxis grid label = "Probability";
run;
proc sgplot data=Sim23;
vbar N / response=Prob group=k groupdisplay=stack;
xaxis label="N: Number of People in Room"
offsetmin=0 offsetmax=0;
yaxis grid label = "Probability" values=(0 to 1 by 0.1);
run;
endsubmit;
/* for N<=60, Prob(more than 20 matches) is very small */
/*
Tabulate proportion of simulation for which there were
0, 1, 2, ...., 20 matches
For an explanation of the TabulateLEvels function, see
https://blogs.sas.com/content/iml/2015/10/07/tabulate-counts-ref.html
*/
/* output levels and frequencies for categories in x, including all
levels in the reference set */
start TabulateLevels(x, refSet);
call tabulate(levels, freq, x); /* compute the observed frequencies */
OutFreq = j(1, ncol(refSet), 0); /* set all frequencies to 0 */
idx = loc(element(refSet, levels)); /* find the observed categories */
OutFreq[idx] = freq; /* overwrite observed frequencies */
return (OutFreq);
finish;
/* create matrix of probabilities where
P(i,j) = prob of having match(j) matches in room with N[i] people
N\Match 0 1 2 ... 20
-------+------+------+-----+-------
2 P{2,0} P{2,1} ... P{2,20}
3 etc
4
5
...
60
Each row is a proportion based on a tabulation of a simulation
for many rooms that have N[i] people.
*/
/* fill each row of the matrix */
maxN = 60; /* consider rooms with 2, 3, ..., maxN people */
NumPeople = 2:maxN;
NumRooms = 1e6; /* number of simulated rooms */
refSet = 0:20;
NumMatch = j(ncol(NumPeople), ncol(refSet), 0); /* allocate vector for results */
t0 = time();
do N = 1 to ncol(NumPeople);
match = BirthdaySim(NumPeople[N], NumRooms);
f = TabulateLevels(match, refSet);
NumMatch[N, ] = f;
end;
ProbMatch = NumMatch / NumRooms;
tElapsed = time() - t0;
print tElapsed;
/*
call heatmapcont(ProbMatch`) displayoutlines=0
yvalues=refSet xvalues=NumPeople
title = "Probability of k Shared Birthdays in a Room with N People"
range={0,1} debug=1;
*/
/* Write to data set and plot as stacked band chart.
See
https://blogs.sas.com/content/iml/2018/01/31/create-stacked-band-plot-sas.html
*/
RowCol = expandgrid(NumPeople, refSet);
N = RowCol[,1];
k = RowCol[,2];
create BDay var {"N" "k" "ProbMatch"};
append;
close;
QUIT;
/* 1. add cumulative probabilities */
data BDayProb;
set BDay;
by N;
if first.N then cumProb=0; /* initialize cumulative amount to 0 for each room */
cumProb + ProbMatch;
Previous = lag(cumProb);
if first.N then Previous=0; /* initialize baseline value for each room */
run;
ods graphics / width=640px height=480px antialias=on;
title "Distribution of Shared Birthdays";
proc sgpanel data=BDayProb;
where N in (25, 40, 60) AND k<=12; /* show only three graphs and truncate X axis */
panelby N / columns=1 rowheaderpos=right;
vbar k / response=ProbMatch;
label k="Number of Shared Birthdays" ProbMatch="Probability";
rowaxis grid;
run;
title "Distribution of Shared Birthdays";
title2 "Expected proportion of k shared birthdays in a room with N people";
proc sgplot data=BDayProb;
where k < 10; /* truncate X axis */
band x=N lower=Previous upper=cumProb / group=k name="bands";
series x=N y=cumProb / group=k lineattrs=(color=gray thickness=2);
xaxis thresholdmin=0;
yaxis grid;
keylegend "bands" / title="k" position=right sortorder=reverseauto; /* SAS 9.4M5: reverse legend order */
label cumProb = "Cumulative Probability of a Shared Birthday"
N = "N: Number of People in Room";
run;
proc print data=BDayProb noobs;
where N=40 and k < 5;
run;