In [1]:
from __future__ import division
import os,sys,glob,re,itertools,json,random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Bootstrapping the (binomial) mean

In [2]:
# True/mythical/infinitely-sized "population"
population = [int(np.random.random()<0.7) for i in xrange(100000)]
In [3]:
# True value/parameter.  In reality, will NOT have access to this.
np.mean(population)
Out[3]:
0.70148
In [4]:
# Model: the observed dataset was randomly drawn from the population.
N = 100
dataset = [random.choice(population) for i in xrange(N)];
np.mean(dataset)
Out[4]:
0.7
In [5]:
# Bootstrap sample: sample size N, with replacement
boot_sample = [random.choice(dataset) for i in xrange(N)]
np.mean(boot_sample)
Out[5]:
0.77
In [6]:
get_boot_sample = lambda: [random.choice(dataset) for i in xrange(N)]
In [7]:
np.mean(get_boot_sample())
Out[7]:
0.81
In [8]:
Nsim = 10000
boot_means = [  np.mean(get_boot_sample()) for s in xrange(Nsim)  ]
In [9]:
plt.hist(boot_means)
plt.axvline(x=np.mean(dataset),color='black')
plt.axvline(x=np.mean(population),color='red')
Out[9]:
<matplotlib.lines.Line2D at 0x1a1808e150>
In [10]:
# Central 95% confidence interval (bootstrap percentile method)
# Percentiles alpha/2, (1-alpha/2)
np.percentile(boot_means, [2.5, 97.5])
Out[10]:
array([0.61, 0.79])
In [11]:
# Normal approximation 95% CI
boot_se = np.std(boot_means) 
boot_se
Out[11]:
0.0458763241661753
In [12]:
point_estimate = np.mean(dataset)
point_estimate
Out[12]:
0.7

Normal approx: $[\hat{\theta} - z_{\alpha/2}\ \sigma_{boot}, \hat{\theta} + z_{1-\alpha/2}\ \sigma_{boot}]$

For 95% CI: $\hat{\theta} \pm 1.96 \sigma_{boot}$

In [13]:
[point_estimate - 1.96*boot_se, point_estimate + 1.96*boot_se]
Out[13]:
[0.6100824046342964, 0.7899175953657035]
In [ ]:
 

Paired bootstrap

In [14]:
pairs = [
    (0,1),(0,1),(0,1),(0,1),
]
pairs += [(0,0)]*1000
pairs += [(1,1)]*1000
In [15]:
np.mean([x[0] for x in pairs]), np.mean([x[1] for x in pairs]), 
Out[15]:
(0.499001996007984, 0.500998003992016)
In [16]:
acc1=[]
acc2=[]
diff=[]
Nsim=1000
for b in xrange(Nsim):
    sample = [random.choice(pairs) for i in xrange(len(pairs))]
    a1 = np.mean([x[0] for x in sample])
    a2 = np.mean([x[1] for x in sample])
    acc1.append(a1)
    acc2.append(a2)
    diff.append( a2-a1 )
In [18]:
plt.hist(acc1); plt.hist(acc2);None
In [19]:
np.percentile(acc1, [2.5,97.5])
Out[19]:
array([0.47704591, 0.52047156])
In [20]:
np.percentile(acc2, [2.5,97.5])
Out[20]:
array([0.47904192, 0.52296657])
In [21]:
plt.hist(diff)
Out[21]:
(array([ 94., 150., 195., 344.,  90.,  65.,  56.,   5.,   0.,   1.]),
 array([0.        , 0.0006487 , 0.00129741, 0.00194611, 0.00259481,
        0.00324351, 0.00389222, 0.00454092, 0.00518962, 0.00583832,
        0.00648703]),
 <a list of 10 Patch objects>)
In [22]:
np.percentile(diff, [2.5,97.5])
Out[22]:
array([0.00048653, 0.00399202])