from __future__ import division
import os,sys,glob,re,itertools,json,random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# True/mythical/infinitely-sized "population"
population = [int(np.random.random()<0.7) for i in xrange(100000)]
# True value/parameter. In reality, will NOT have access to this.
np.mean(population)
# Model: the observed dataset was randomly drawn from the population.
N = 100
dataset = [random.choice(population) for i in xrange(N)];
np.mean(dataset)
# Bootstrap sample: sample size N, with replacement
boot_sample = [random.choice(dataset) for i in xrange(N)]
np.mean(boot_sample)
get_boot_sample = lambda: [random.choice(dataset) for i in xrange(N)]
np.mean(get_boot_sample())
Nsim = 10000
boot_means = [ np.mean(get_boot_sample()) for s in xrange(Nsim) ]
plt.hist(boot_means)
plt.axvline(x=np.mean(dataset),color='black')
plt.axvline(x=np.mean(population),color='red')
# Central 95% confidence interval (bootstrap percentile method)
# Percentiles alpha/2, (1-alpha/2)
np.percentile(boot_means, [2.5, 97.5])
# Normal approximation 95% CI
boot_se = np.std(boot_means)
boot_se
point_estimate = np.mean(dataset)
point_estimate
Normal approx: $[\hat{\theta} - z_{\alpha/2}\ \sigma_{boot}, \hat{\theta} + z_{1-\alpha/2}\ \sigma_{boot}]$
For 95% CI: $\hat{\theta} \pm 1.96 \sigma_{boot}$
[point_estimate - 1.96*boot_se, point_estimate + 1.96*boot_se]
pairs = [
(0,1),(0,1),(0,1),(0,1),
]
pairs += [(0,0)]*1000
pairs += [(1,1)]*1000
np.mean([x[0] for x in pairs]), np.mean([x[1] for x in pairs]),
acc1=[]
acc2=[]
diff=[]
Nsim=1000
for b in xrange(Nsim):
sample = [random.choice(pairs) for i in xrange(len(pairs))]
a1 = np.mean([x[0] for x in sample])
a2 = np.mean([x[1] for x in sample])
acc1.append(a1)
acc2.append(a2)
diff.append( a2-a1 )
plt.hist(acc1); plt.hist(acc2);None
np.percentile(acc1, [2.5,97.5])
np.percentile(acc2, [2.5,97.5])
plt.hist(diff)
np.percentile(diff, [2.5,97.5])