## Code by Georgi Z. Georgiev (2023). Developed with the help of Ronny Kohavi & Alex Deng
## Simulation with a true effects distribution mimicking Bing's data from Figure 3a of Azevedo E.M., Deng A., Olea J.L.M., Rao J., and Weyl E.G. (2019) "A/B Testing with Fat Tails", Journal of Political Economy (128), 12. doi: 10.1086/710607
## Key parameters follow the paper: 20 million users per test group in a one-sided test at 0.025 (statsig is z-score > 1.96), equal to a two-sided test at 0.05, and the base rate is 0.25. These can be adjusted at will to explore other scenarios.
## The adjusted FPR formula works since it uses the empirical p-values instead of alpha (sig_threshold). The FPR computed using equation (4) does not work, as expected, even if the target power of 0.8 is replaced with the mean power against the true effect sizes
library(distr)
## Basic simulation parameters
sim_runs = 100000;
sample_size = 20000000; #sample size per test group
base_rate = 0.25; # baseline conversion rate
sig_threshold = 0.025; # 0.025 one-sided = 0.05 two-sided.
## Define the distribution of true effects so it has heavy tails. Not necessary and can be replaced with a simple normal distribution, but produces true effects more aligned with those observed in practice.
myMix = UnivarMixingDistribution(Norm(mean=0, sd=0.0002),
Norm(mean=0, sd=0.0007),
mixCoeff=c(0.8, 0.2));
rmyMix <- r(myMix);
## Draw random true effects from the distribution
true_effects = rmyMix(sim_runs);
#true_effects <- rep(0,sim_runs) #uncomment for nil effect sims (a.k.a. A/A tests, should result in FPR = 1.0, Fr(SS|H0) = sig_threshold)
## Assign random outcomes for the test groups
obs_control = rbinom(n=sim_runs, size=sample_size, prob=base_rate);
obs_variant = rbinom(n=sim_runs, size=sample_size, prob=base_rate + true_effects);
## Calculate power = 1-beta against the true (otherwise unknown) effect size, replaces P(SS|H1) in the FPR equation
power = power.prop.test(n = sample_size, p1 = base_rate, p2 = base_rate + true_effects[true_effects > 0], sig.level = sig_threshold, alternative='one.sided')$power
## p-value for absolute difference of proportions
calcPval = function(i) {
return(prop.test(x = c(obs_variant[i], obs_control[i]), n = c(sample_size, sample_size), alternative='greater')$p.value);
}
pvals = sapply(1:sim_runs, calcPval);
rejected = pvals < sig_threshold; # statistically significant outcome rejecting H0, or not?
falsepos = sum(rejected[true_effects <= 0]); # count of false positive tests (statistically significant with true H0)
truepos = sum(rejected[true_effects > 0]); # count tests as true positive (statistically significant with false H0)
fpf = falsepos / (falsepos + truepos); # frequency of false positives among all positives
trueH0 = length(true_effects[true_effects <= 0]); # count of true H0s
FrSSH0 = falsepos / trueH0; # Fr(ss|H0) # frequency of rejected H0s
mean_power = mean(power, na.rm = T); # mean power against the true effect size determined analytically
#mean_power = truepos / (sim_runs - trueH0) # empirical mean power against the true effect size (can be used to check pre-test vs empirical, but pre-test is the required input as per the FPR formula)
## False positive risk equation calculation
true_odds = 1/1 # with a median of zero of the true effects distribution and simple superiority test the true odds are 1/1
fpr = FrSSH0 / mean_power * true_odds / (1 + FrSSH0 / mean_power * true_odds)
fpreq4 = sig_threshold / 0.8 * true_odds / (1 + sig_threshold / 0.8 * true_odds) # FPR computed following equation (4)
## Output of FPR and other statistics and sanity checks
cat(paste('False positive freq. (empirical): ', fpf, '(', round(fpf * 100, 2), '%)', '\nFalse positive risk (formula) : ', fpr, '(', round(fpr * 100, 2), '%)', '\nFalse positive risk (equation 4): ', fpreq4, '(', round(fpreq4 * 100, 2), '%)', '\nMean true effect: ', mean(true_effects), '\nMedian true effect: ', median(true_effects), '(', round(median(true_effects),6), ')', '\nTrue effect sigma: ', sd(true_effects), '\nFr(true_effect <= 0) = ', trueH0 / length(true_effects), '\nFr(ss|H0): ', FrSSH0, '(', round(FrSSH0 * 100, 2), '%)', '\nMean power vs true effects under H1 (Fr(ss|H1)): ', mean_power, '(', round(mean_power * 100, 2), '%)'), sep='\n')
## Examine the input for the FPR equation (comment out the unnecessary ones) and sanity checks
hist(true_effects, breaks=40);
hist(pvals, breaks=20);
#hist(power, breaks=20);