/* SAS program to accompany the article "The distribution of shared birthdays in the Birthday Problem" by Rick Wicklin, published 07FEB2018 on The DO Loop blog: https://blogs.sas.com/content/iml/2018/02/07/distribution-sha…birthday-problem.html The classical birthday problem deals with the probability of a binary random variable: Is there a match or not? But you can also ask about the distributions of the number of matching birthdays. This program shows how to simulate the multiple birthday problem: If there are N people in a room, what is the probability that at least k birthdays are shared by two or more people, k=0, 1, 2, ..., floor(N/2)? */ ods graphics / reset antialias=on; title; proc iml; /* Function that simulates B rooms, each with N people, and counts the number of shared (matching) birthdays in each room. The return value is a row vector that has B counts. */ start BirthdaySim(N, B); bday = Sample(1:365, B||N); /* each column is a room */ match = N - countunique(bday, "col"); /* number of unique birthdays in each col */ return ( match ); finish; call randseed(12345); /* set seed for random number stream */ NumPeople = 23; NumRooms = 1e6; match = BirthdaySim(NumPeople, NumRooms); /* 1e6 counts */ call tabulate(k, freq, match); /* summarize: k=number of shared birthdays, freq=counts */ prob = freq / NumRooms; /* proportion of rooms that have 0, 1, 2,... shared birthdays */ print prob[F=BEST6. C=(char(k,2)) L="Estimate of Probability (N=23)"]; /* write data and create stacked bar chart */ N = j(1, ncol(prob), NumPeople); create Sim23 var {"N" "k" "prob"}; append; close; submit; ods graphics / width=500px height=400px; title "Probability of k Shared Birthdays in a Room with N=23 People"; proc sgplot data=Sim23; vbar k / response=Prob; xaxis label="k: Number of Shared Birthdays"; yaxis grid label = "Probability"; run; proc sgplot data=Sim23; vbar N / response=Prob group=k groupdisplay=stack; xaxis label="N: Number of People in Room" offsetmin=0 offsetmax=0; yaxis grid label = "Probability" values=(0 to 1 by 0.1); run; endsubmit; /* for N<=60, Prob(more than 20 matches) is very small */ /* Tabulate proportion of simulation for which there were 0, 1, 2, ...., 20 matches For an explanation of the TabulateLEvels function, see https://blogs.sas.com/content/iml/2015/10/07/tabulate-counts-ref.html */ /* output levels and frequencies for categories in x, including all levels in the reference set */ start TabulateLevels(x, refSet); call tabulate(levels, freq, x); /* compute the observed frequencies */ OutFreq = j(1, ncol(refSet), 0); /* set all frequencies to 0 */ idx = loc(element(refSet, levels)); /* find the observed categories */ OutFreq[idx] = freq; /* overwrite observed frequencies */ return (OutFreq); finish; /* create matrix of probabilities where P(i,j) = prob of having match(j) matches in room with N[i] people N\Match 0 1 2 ... 20 -------+------+------+-----+------- 2 P{2,0} P{2,1} ... P{2,20} 3 etc 4 5 ... 60 Each row is a proportion based on a tabulation of a simulation for many rooms that have N[i] people. */ /* fill each row of the matrix */ maxN = 60; /* consider rooms with 2, 3, ..., maxN people */ NumPeople = 2:maxN; NumRooms = 1e6; /* number of simulated rooms */ refSet = 0:20; NumMatch = j(ncol(NumPeople), ncol(refSet), 0); /* allocate vector for results */ t0 = time(); do N = 1 to ncol(NumPeople); match = BirthdaySim(NumPeople[N], NumRooms); f = TabulateLevels(match, refSet); NumMatch[N, ] = f; end; ProbMatch = NumMatch / NumRooms; tElapsed = time() - t0; print tElapsed; /* call heatmapcont(ProbMatch`) displayoutlines=0 yvalues=refSet xvalues=NumPeople title = "Probability of k Shared Birthdays in a Room with N People" range={0,1} debug=1; */ /* Write to data set and plot as stacked band chart. See https://blogs.sas.com/content/iml/2018/01/31/create-stacked-band-plot-sas.html */ RowCol = expandgrid(NumPeople, refSet); N = RowCol[,1]; k = RowCol[,2]; create BDay var {"N" "k" "ProbMatch"}; append; close; QUIT; /* 1. add cumulative probabilities */ data BDayProb; set BDay; by N; if first.N then cumProb=0; /* initialize cumulative amount to 0 for each room */ cumProb + ProbMatch; Previous = lag(cumProb); if first.N then Previous=0; /* initialize baseline value for each room */ run; ods graphics / width=640px height=480px antialias=on; title "Distribution of Shared Birthdays"; proc sgpanel data=BDayProb; where N in (25, 40, 60) AND k<=12; /* show only three graphs and truncate X axis */ panelby N / columns=1 rowheaderpos=right; vbar k / response=ProbMatch; label k="Number of Shared Birthdays" ProbMatch="Probability"; rowaxis grid; run; title "Distribution of Shared Birthdays"; title2 "Expected proportion of k shared birthdays in a room with N people"; proc sgplot data=BDayProb; where k < 10; /* truncate X axis */ band x=N lower=Previous upper=cumProb / group=k name="bands"; series x=N y=cumProb / group=k lineattrs=(color=gray thickness=2); xaxis thresholdmin=0; yaxis grid; keylegend "bands" / title="k" position=right sortorder=reverseauto; /* SAS 9.4M5: reverse legend order */ label cumProb = "Cumulative Probability of a Shared Birthday" N = "N: Number of People in Room"; run; proc print data=BDayProb noobs; where N=40 and k < 5; run;