## Code by Georgi Z. Georgiev (2023). Developed with the help of Ronny Kohavi & Alex Deng ## Simulation with a true effects distribution mimicking Bing's data from Figure 3a of Azevedo E.M., Deng A., Olea J.L.M., Rao J., and Weyl E.G. (2019) "A/B Testing with Fat Tails", Journal of Political Economy (128), 12. doi: 10.1086/710607 ## Key parameters follow the paper: 20 million users per test group in a one-sided test at 0.025 (statsig is z-score > 1.96), equal to a two-sided test at 0.05, and the base rate is 0.25. These can be adjusted at will to explore other scenarios. ## The adjusted FPR formula works since it uses the empirical p-values instead of alpha (sig_threshold). The FPR computed using equation (4) does not work, as expected, even if the target power of 0.8 is replaced with the mean power against the true effect sizes library(distr) ## Basic simulation parameters sim_runs = 100000; sample_size = 20000000; #sample size per test group base_rate = 0.25; # baseline conversion rate sig_threshold = 0.025; # 0.025 one-sided = 0.05 two-sided. ## Define the distribution of true effects so it has heavy tails. Not necessary and can be replaced with a simple normal distribution, but produces true effects more aligned with those observed in practice. myMix = UnivarMixingDistribution(Norm(mean=0, sd=0.0002), Norm(mean=0, sd=0.0007), mixCoeff=c(0.8, 0.2)); rmyMix <- r(myMix); ## Draw random true effects from the distribution true_effects = rmyMix(sim_runs); #true_effects <- rep(0,sim_runs) #uncomment for nil effect sims (a.k.a. A/A tests, should result in FPR = 1.0, Fr(SS|H0) = sig_threshold) ## Assign random outcomes for the test groups obs_control = rbinom(n=sim_runs, size=sample_size, prob=base_rate); obs_variant = rbinom(n=sim_runs, size=sample_size, prob=base_rate + true_effects); ## Calculate power = 1-beta against the true (otherwise unknown) effect size, replaces P(SS|H1) in the FPR equation power = power.prop.test(n = sample_size, p1 = base_rate, p2 = base_rate + true_effects[true_effects > 0], sig.level = sig_threshold, alternative='one.sided')$power ## p-value for absolute difference of proportions calcPval = function(i) { return(prop.test(x = c(obs_variant[i], obs_control[i]), n = c(sample_size, sample_size), alternative='greater')$p.value); } pvals = sapply(1:sim_runs, calcPval); rejected = pvals < sig_threshold; # statistically significant outcome rejecting H0, or not? falsepos = sum(rejected[true_effects <= 0]); # count of false positive tests (statistically significant with true H0) truepos = sum(rejected[true_effects > 0]); # count tests as true positive (statistically significant with false H0) fpf = falsepos / (falsepos + truepos); # frequency of false positives among all positives trueH0 = length(true_effects[true_effects <= 0]); # count of true H0s FrSSH0 = falsepos / trueH0; # Fr(ss|H0) # frequency of rejected H0s mean_power = mean(power, na.rm = T); # mean power against the true effect size determined analytically #mean_power = truepos / (sim_runs - trueH0) # empirical mean power against the true effect size (can be used to check pre-test vs empirical, but pre-test is the required input as per the FPR formula) ## False positive risk equation calculation true_odds = 1/1 # with a median of zero of the true effects distribution and simple superiority test the true odds are 1/1 fpr = FrSSH0 / mean_power * true_odds / (1 + FrSSH0 / mean_power * true_odds) fpreq4 = sig_threshold / 0.8 * true_odds / (1 + sig_threshold / 0.8 * true_odds) # FPR computed following equation (4) ## Output of FPR and other statistics and sanity checks cat(paste('False positive freq. (empirical): ', fpf, '(', round(fpf * 100, 2), '%)', '\nFalse positive risk (formula) : ', fpr, '(', round(fpr * 100, 2), '%)', '\nFalse positive risk (equation 4): ', fpreq4, '(', round(fpreq4 * 100, 2), '%)', '\nMean true effect: ', mean(true_effects), '\nMedian true effect: ', median(true_effects), '(', round(median(true_effects),6), ')', '\nTrue effect sigma: ', sd(true_effects), '\nFr(true_effect <= 0) = ', trueH0 / length(true_effects), '\nFr(ss|H0): ', FrSSH0, '(', round(FrSSH0 * 100, 2), '%)', '\nMean power vs true effects under H1 (Fr(ss|H1)): ', mean_power, '(', round(mean_power * 100, 2), '%)'), sep='\n') ## Examine the input for the FPR equation (comment out the unnecessary ones) and sanity checks hist(true_effects, breaks=40); hist(pvals, breaks=20); #hist(power, breaks=20);