from __future__ import division
import os,sys,glob,re,itertools,json,random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Bootstrapping the (binomial) mean¶

# True/mythical/infinitely-sized "population"
population = [int(np.random.random()<0.7) for i in xrange(100000)]

# True value/parameter.  In reality, will NOT have access to this.
np.mean(population)

0.70148

# Model: the observed dataset was randomly drawn from the population.
N = 100
dataset = [random.choice(population) for i in xrange(N)];
np.mean(dataset)

0.7

# Bootstrap sample: sample size N, with replacement
boot_sample = [random.choice(dataset) for i in xrange(N)]
np.mean(boot_sample)

0.77

get_boot_sample = lambda: [random.choice(dataset) for i in xrange(N)]

np.mean(get_boot_sample())

0.81

Nsim = 10000
boot_means = [  np.mean(get_boot_sample()) for s in xrange(Nsim)  ]

plt.hist(boot_means)
plt.axvline(x=np.mean(dataset),color='black')
plt.axvline(x=np.mean(population),color='red')

<matplotlib.lines.Line2D at 0x1a1808e150>

# Central 95% confidence interval (bootstrap percentile method)
# Percentiles alpha/2, (1-alpha/2)
np.percentile(boot_means, [2.5, 97.5])

array([0.61, 0.79])

# Normal approximation 95% CI
boot_se = np.std(boot_means) 
boot_se

0.0458763241661753

point_estimate = np.mean(dataset)
point_estimate

0.7

Normal approx: $[\hat{\theta} - z_{\alpha/2}\ \sigma_{boot}, \hat{\theta} + z_{1-\alpha/2}\ \sigma_{boot}]$

For 95% CI: $\hat{\theta} \pm 1.96 \sigma_{boot}$

[point_estimate - 1.96*boot_se, point_estimate + 1.96*boot_se]

[0.6100824046342964, 0.7899175953657035]

Paired bootstrap¶

pairs = [
    (0,1),(0,1),(0,1),(0,1),
]
pairs += [(0,0)]*1000
pairs += [(1,1)]*1000

np.mean([x[0] for x in pairs]), np.mean([x[1] for x in pairs]),

(0.499001996007984, 0.500998003992016)

acc1=[]
acc2=[]
diff=[]
Nsim=1000
for b in xrange(Nsim):
    sample = [random.choice(pairs) for i in xrange(len(pairs))]
    a1 = np.mean([x[0] for x in sample])
    a2 = np.mean([x[1] for x in sample])
    acc1.append(a1)
    acc2.append(a2)
    diff.append( a2-a1 )

plt.hist(acc1); plt.hist(acc2);None

np.percentile(acc1, [2.5,97.5])

array([0.47704591, 0.52047156])

np.percentile(acc2, [2.5,97.5])

array([0.47904192, 0.52296657])

plt.hist(diff)

(array([ 94., 150., 195., 344.,  90.,  65.,  56.,   5.,   0.,   1.]),
 array([0.        , 0.0006487 , 0.00129741, 0.00194611, 0.00259481,
        0.00324351, 0.00389222, 0.00454092, 0.00518962, 0.00583832,
        0.00648703]),
 <a list of 10 Patch objects>)

np.percentile(diff, [2.5,97.5])

array([0.00048653, 0.00399202])