Parameter estimation, linear regression, bootstrap confidence intervals, hypothesis testing, and permutation sampling


import numpy as np
import pandas as pd
import seaborn as sb
sb.set()
import matplotlib.pyplot as plt
%matplotlib inline
# set default plot size
plt.rcParams['figure.figsize'] = (12, 6)
# Empirical cumulative distribution function
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    n = len(data)      # Number of data points
    x = np.sort(data)
    y = np.arange(1, n+1) / n

    return x, y
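A quick usage check (illustrative values):

# ecdf on a tiny array: x is sorted, y climbs in steps of 1/n
x, y = ecdf(np.array([3, 1, 2]))
print(x)  # [1 2 3]
print(y)  # [0.33333333 0.66666667 1.        ]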

1. Parameter estimation

Exponential distribution

  • Events equally likely anywhere in an interval - uniform distribution
  • Values more likely the closer they are to the mean - normal distribution
  • Probability that decays as time passes - exponential distribution (a quick sampling sketch follows this list)
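As a quick illustration, NumPy can sample from each family; the parameters below are arbitrary.

# illustrative samples from each family (arbitrary parameters)
u = np.random.uniform(0, 10, size=1000)   # uniform on [0, 10)
g = np.random.normal(5, 1, size=1000)     # normal, mean 5, sd 1
e = np.random.exponential(5, size=1000)   # exponential, mean 5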
# Intervals between no-hitters in Major League Baseball (unit: games) - exponential distribution
# tau : mean interval time
nohitter_times = np.array([843, 1613, 1101, 215, 684, 814, 278, 324, 161, 219, 545, 715, 966, 624, 29, 450, 107, 20, 91, 1325, 124, 1468, 104, 1309, 429, 62, 1878, 1104, 123, 251, 93, 188, 983, 166, 96, 702, 23, 524, 26, 299, 59, 39, 12, 2, 308, 1114, 813, 887, 645, 2088, 42, 2090, 11, 886, 1665, 1084, 2900, 2432, 750, 4021, 1070, 1765, 1322, 26, 548, 1525, 77, 2181, 2752, 127, 2147, 211, 41, 1575, 151, 479, 697, 557, 2267, 542, 392, 73, 603, 233, 255, 528, 397, 1529, 1023, 1194, 462, 583, 37, 943, 996, 480, 1497, 717, 224, 219, 1531, 498, 44, 288, 267, 600, 52, 269, 1086, 386, 176, 2199, 216, 54, 675, 1243, 463, 650, 171, 327, 110, 774, 509, 8, 197, 136, 12, 1124, 64, 380, 811, 232, 192, 731, 715, 226, 605, 539, 1491, 323, 240, 179, 702, 156, 82, 1397, 354, 778, 603, 1001, 385, 986, 203, 149, 576, 445, 180, 1403, 252, 675, 1351, 2983, 1568, 45, 899, 3260, 1025, 31, 100, 2055, 4043, 79, 238, 3931, 2351, 595, 110, 215, 0, 563, 206, 660, 242, 577, 179, 157, 192, 192, 1848, 792, 1693, 55, 388, 225, 1134, 1172, 1555, 31, 1582, 1044, 378, 1687, 2915, 280, 765, 2819, 511, 1521, 745, 2491, 580, 2072, 6450, 578, 745, 1075, 1103, 1549, 1520, 138, 1202, 296, 277, 351, 391, 950, 459, 62, 1056, 1128, 139, 420, 87, 71, 814, 603, 1349, 162, 1027, 783, 326, 101, 876, 381, 905, 156, 419, 239, 119, 129, 467])
tau = np.mean(nohitter_times)  # parameter
# Generate exponential samples using tau
inter_nohitter_time = np.random.exponential(tau, 100000)
# plot
_ = plt.hist(inter_nohitter_time, bins=50, density=True, histtype='step', edgecolor='red')
_ = plt.xlabel('Games between no-hitters')
_ = plt.ylabel('PDF')

png

# ECDF from real data
x, y = ecdf(nohitter_times)

# CDF from theoretical samples
x_theor, y_theor = ecdf(inter_nohitter_time)

# plot - the real data and the theoretical samples show a similar pattern.
plt.plot(x_theor, y_theor)
plt.plot(x, y, marker='.', linestyle='none', color='orange')
plt.margins(0.02)
plt.xlabel('Games between no-hitters')
plt.ylabel('CDF')

png

# Plotting exponential distributions with tau halved or doubled shows that tau is the optimal parameter.
plt.plot(x_theor, y_theor)
plt.plot(x, y, marker='.', linestyle='none')
plt.margins(0.02)
plt.xlabel('Games between no-hitters')
plt.ylabel('CDF')

samples_half = np.random.exponential(tau/2, 10000)
samples_double = np.random.exponential(tau*2, 10000)

x_half, y_half = ecdf(samples_half)
x_double, y_double = ecdf(samples_double)

_ = plt.plot(x_half, y_half)
_ = plt.plot(x_double, y_double)
_ = plt.legend(('theoretical', 'real', 'half', 'double'), loc='lower right')
plt.show()

png

Linear Regression

  • correlation between female illiteracy and fertility (defined as the average number of children born per woman)
  • Is the illiteracy rate higher where women have more children?
fertility = np.array([ 1.769,  2.682,  2.077,  2.132,  1.827,  3.872,  2.288,  5.173, 1.393,  1.262,  2.156,  3.026,  2.033,  1.324,  2.816,  5.211, 2.1  ,  1.781,  1.822,  5.908,  1.881,  1.852,  1.39 ,  2.281, 2.505,  1.224,  1.361,  1.468,  2.404,  5.52 ,  4.058,  2.223, 4.859,  1.267,  2.342,  1.579,  6.254,  2.334,  3.961,  6.505, 2.53 ,  2.823,  2.498,  2.248,  2.508,  3.04 ,  1.854,  4.22 , 5.1  ,  4.967,  1.325,  4.514,  3.173,  2.308,  4.62 ,  4.541, 5.637,  1.926,  1.747,  2.294,  5.841,  5.455,  7.069,  2.859, 4.018,  2.513,  5.405,  5.737,  3.363,  4.89 ,  1.385,  1.505, 6.081,  1.784,  1.378,  1.45 ,  1.841,  1.37 ,  2.612,  5.329, 5.33 ,  3.371,  1.281,  1.871,  2.153,  5.378,  4.45 ,  1.46 , 1.436,  1.612,  3.19 ,  2.752,  3.35 ,  4.01 ,  4.166,  2.642, 2.977,  3.415,  2.295,  3.019,  2.683,  5.165,  1.849,  1.836, 2.518,  2.43 ,  4.528,  1.263,  1.885,  1.943,  1.899,  1.442, 1.953,  4.697,  1.582,  2.025,  1.841,  5.011,  1.212,  1.502, 2.516,  1.367,  2.089,  4.388,  1.854,  1.748,  2.978,  2.152, 2.362,  1.988,  1.426,  3.29 ,  3.264,  1.436,  1.393,  2.822, 4.969,  5.659,  3.24 ,  1.693,  1.647,  2.36 ,  1.792,  3.45 , 1.516,  2.233,  2.563,  5.283,  3.885,  0.966,  2.373,  2.663, 1.251,  2.052,  3.371,  2.093,  2.   ,  3.883,  3.852,  3.718, 1.732,  3.928])
illiteracy = np.array([  9.5,  49.2,   1. ,  11.2,   9.8,  60. ,  50.2,  51.2,   0.6, 1. ,   8.5,   6.1,   9.8,   1. ,  42.2,  77.2,  18.7,  22.8, 8.5,  43.9,   1. ,   1. ,   1.5,  10.8,  11.9,   3.4,   0.4, 3.1,   6.6,  33.7,  40.4,   2.3,  17.2,   0.7,  36.1,   1. , 33.2,  55.9,  30.8,  87.4,  15.4,  54.6,   5.1,   1.1,  10.2, 19.8,   0. ,  40.7,  57.2,  59.9,   3.1,  55.7,  22.8,  10.9, 34.7,  32.2,  43. ,   1.3,   1. ,   0.5,  78.4,  34.2,  84.9, 29.1,  31.3,  18.3,  81.8,  39. ,  11.2,  67. ,   4.1,   0.2, 78.1,   1. ,   7.1,   1. ,  29. ,   1.1,  11.7,  73.6,  33.9, 14. ,   0.3,   1. ,   0.8,  71.9,  40.1,   1. ,   2.1,   3.8, 16.5,   4.1,   0.5,  44.4,  46.3,  18.7,   6.5,  36.8,  18.6, 11.1,  22.1,  71.1,   1. ,   0. ,   0.9,   0.7,  45.5,   8.4, 0. ,   3.8,   8.5,   2. ,   1. ,  58.9,   0.3,   1. ,  14. , 47. ,   4.1,   2.2,   7.2,   0.3,   1.5,  50.5,   1.3,   0.6, 19.1,   6.9,   9.2,   2.2,   0.2,  12.3,   4.9,   4.6,   0.3, 16.5,  65.7,  63.5,  16.8,   0.2,   1.8,   9.6,  15.2,  14.4, 3.3,  10.6,  61.3,  10.9,  32.2,   9.3,  11.6,  20.7,   6.5, 6.7,   3.5,   1. ,   1.6,  20.5,   1.5,  16.7,   2. , 0.9])
_ = plt.plot(illiteracy, fertility, marker='.', linestyle='none')
_ = plt.xlabel('illiterate (%)')
_ = plt.ylabel('fertility')
plt.margins(0.02)
plt.show()

png

np.corrcoef(illiteracy, fertility)
array([[ 1.       ,  0.8041324],
       [ 0.8041324,  1.       ]])
# np.polyfit - linear regression by least squares
_ = plt.plot(illiteracy, fertility, marker='.', linestyle='none')
_ = plt.xlabel('percent illiterate')
_ = plt.ylabel('fertility')
plt.margins(0.02)

# Perform a linear regression using np.polyfit()
a, b = np.polyfit(illiteracy, fertility, 1)

# Print the results to the screen
print('slope =', a, 'children per woman / percent illiterate')
print('intercept =', b, 'children per woman')

# Make theoretical line to plot - regression line
x = np.array([0, 100])
y = a * x + b

_ = plt.plot(x, y)
slope = 0.0497985480906 children per woman / percent illiterate
intercept = 1.88805061064 children per woman

png
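As a sanity check, the least-squares fit should minimize the residual sum of squares; a sketch using the variables above:

# sketch: the fitted (a, b) minimizes the residual sum of squares (RSS)
def rss(slope, intercept):
    return np.sum((fertility - (slope * illiteracy + intercept)) ** 2)

print(rss(a, b))        # RSS at the least-squares fit
print(rss(a * 1.1, b))  # any perturbed slope gives a larger RSS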

2. Bootstrap confidence intervals

  • Bootstrap : resample the data to perform statistical inference
df = pd.read_csv('data/election2012.csv')
df_ny = df.loc[df['state'] == 'NY']
vote_ny = df_ny['Obama']
# Generate 50 bootstrap samples from Obama's vote share in NY and compare their ECDFs
for _ in range(50):
    # Generate bootstrap sample
    bs_sample = np.random.choice(vote_ny, size=len(vote_ny))

    # Compute and plot ECDF from bootstrap sample
    x, y = ecdf(bs_sample)
    _ = plt.plot(x, y, marker='.', linestyle='none', color='gray', alpha=0.1)
    
# real data
x, y = ecdf(vote_ny)
_ = plt.plot(x, y, marker='.')

plt.margins(0.02)
_ = plt.xlabel('vote Obama (NY)')
_ = plt.ylabel('ECDF')
plt.show()

png

Bootstrap confidence intervals

def bootstrap_replicate_1d(data, func):
    return func(np.random.choice(data, size=len(data)))
def draw_bootstrap_reps(data, func, size=1):
    """Draw bootstrap replicates."""
    
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)

    return bs_replicates
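The func argument makes this generic; for example, a sketch bootstrapping the median of the same NY vote data:

# sketch: swap in a different statistic, e.g. the median
bs_medians = draw_bootstrap_reps(vote_ny, np.median, size=1000)
print(np.percentile(bs_medians, [2.5, 97.5]))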
  • Estimate the mean of Obama's NY vote share with 10,000 bootstrap replicates
bs_replicates = draw_bootstrap_reps(vote_ny, np.mean, size=10000)

print('standard error of mean :', np.std(vote_ny) / np.sqrt(len(vote_ny)))
print('standard deviation of bootstraps :', np.std(bs_replicates))

# Make a histogram of the results
_ = plt.hist(bs_replicates, bins=50, density=True, edgecolor='white')
_ = plt.xlabel('mean vote for Obama (NY)')
_ = plt.ylabel('PDF')
plt.show()
standard error of mean : 1.39909435215
standard deviation of bootstraps : 1.38923447343

png

# 95% confidence interval
np.percentile(bs_replicates, [2.5, 97.5])
array([ 49.33548387,  54.85483871])
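For comparison, a normal-approximation interval (mean ± 1.96 * SE) should come out close; a sketch:

# sketch: normal-approximation 95% CI from the standard error
m = np.mean(vote_ny)
se = np.std(vote_ny) / np.sqrt(len(vote_ny))
print(m - 1.96 * se, m + 1.96 * se)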
  • Estimate a confidence interval for the mean time between no-hitters
bs_replicates = draw_bootstrap_reps(nohitter_times, np.mean, size=10000)

conf_int = np.percentile(bs_replicates, [2.5, 97.5])
print('95% confidence interval =', conf_int, 'games')

_ = plt.hist(bs_replicates, bins=50, density=True, edgecolor='white')
_ = plt.xlabel(r'$\tau$ (games)')
_ = plt.ylabel('PDF')
95% confidence interval = [ 660.99960159  869.30717131] games

png

Nonparametric inference

Pairs bootstrap for linear regression

  • Resample data in pairs
  • Compute slope and intercept from resampled data
  • Each slope and intercept is a bootstrap replicate
  • Compute confidence intervals from percentiles of bootstrap replicates
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression."""

    # Set up array of indices to sample
    inds = np.arange(len(x))

    # Initialize replicates
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)

    return bs_slope_reps, bs_intercept_reps
# Estimation on the illiteracy vs. fertility data
# estimate of the slope
bs_slope_reps, bs_intercept_reps = draw_bs_pairs_linreg(illiteracy, fertility, size=1000)

# 95% CI for slope
print('95% confidence interval =', np.percentile(bs_slope_reps, [2.5, 97.5]))

_ = plt.hist(bs_slope_reps, bins=50, density=True, edgecolor='white')
_ = plt.xlabel('slope')
_ = plt.ylabel('PDF')
plt.show()
95% confidence interval = [ 0.04394249  0.05530549]

png

x = np.array([0, 100])

# Plot the bootstrap lines
for i in range(100):
    _ = plt.plot(x, bs_slope_reps[i] * x + bs_intercept_reps[i], linewidth=0.5, alpha=0.2, color='red')

# Plot the data
_ = plt.plot(illiteracy, fertility, marker='.', linestyle='none')

# Label axes, set the margins, and show the plot
_ = plt.xlabel('illiteracy')
_ = plt.ylabel('fertility')
plt.margins(0.02)
plt.show()

png

3. Hypothesis testing

Permutation sampling

  • Test the hypothesis that two samples come from the same distribution
  • Concatenate the two arrays, shuffle, then split back into pieces of the original sizes
def permutation_sample(data1, data2):
    """Generate a permutation sample from two data sets."""
    
    data = np.concatenate((data1, data2))
    permuted_data = np.random.permutation(data)

    # Split the permuted array into two
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]

    return perm_sample_1, perm_sample_2
# Compare January rainfall in regions A and B
rain_a = np.array([66.2,39.7,76.4,26.5,11.2,61.8,6.1,48.4,89.2,104.,34.,60.6,57.1,79.1,90.9,32.3,63.8,78.2,27.5,43.4,30.1,17.3,77.5,44.9,92.2,39.6,79.4,66.1,53.5,98.5,20.8,55.5,39.6,56.,65.1,14.8,13.2,88.1,8.4,32.1,19.6,40.4,2.2,77.5,105.4,77.2,38.,27.1,111.8,17.2,26.7,23.3,77.2,87.2,27.7,50.6,60.3,15.1,6.,29.4,39.3,56.3,80.4,85.3,68.4,72.5,13.3,28.4,14.7,37.4,49.5,57.2,85.9,82.1,31.8,126.6,30.7,41.4,33.9,13.5,99.1,70.2,91.8,61.3,13.7,54.9,62.5,24.2,69.4,83.1,44.,48.5,11.9,16.6,66.4,90.,43.2])
rain_b = np.array([83.6,30.9,62.2,37.,41.,160.2,18.2,122.4,71.3,44.2,49.1,37.6,114.5,28.8,82.5,71.9,50.7,67.7,112.,63.6,42.8,57.2,99.1,86.4,84.4,38.1,17.7,102.2,101.3,58.,82.,101.4,81.4,100.1,54.6,39.6,57.5,29.2,48.8,37.3,115.4,55.6,62.,95.,84.2,118.1,153.2,83.4,104.7,59.,46.4,50.,147.6,76.8,59.9,101.8,136.6,173.,92.5,37.,59.8,142.1,9.9,158.2,72.6,28.,112.9,119.3,199.2,50.7,44.,170.7,67.2,21.4,61.3,15.6,106.,116.2,42.3,38.5,132.5,40.8,147.5,93.9,71.4,87.3,163.7,141.4,62.6,84.9,28.8,121.1,28.6,32.4,112.,50.,126.8])
for _ in range(50):
    # Generate permutation samples
    perm_sample_1, perm_sample_2 = permutation_sample(rain_a, rain_b)

    # ECDFs of permutation sample
    x_1, y_1 = ecdf(perm_sample_1)
    x_2, y_2 = ecdf(perm_sample_2)
    _ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red', alpha=0.02)
    _ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue', alpha=0.02)

# ECDFs from original data
x_1, y_1 = ecdf(rain_a)
x_2, y_2 = ecdf(rain_b)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red')
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue')

# Label axes, set margin, and show plot
plt.margins(0.02)
_ = plt.xlabel('rain fall (mm)')
_ = plt.ylabel('ECDF')
plt.show()

# The rainfall in the two regions clearly does not follow the same distribution.

png

A/B Test

  • In group A, 153 people bought the promoted product and 91 did not; in group B, 136 bought and 35 did not.
  • Did group membership affect purchase of the product?
def draw_perm_reps(data_1, data_2, func, size=1):
    """Generate multiple permutation replicates."""

    perm_replicates = np.empty(size)

    for i in range(size):
        # Generate permutation sample
        perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)

        # Compute the test statistic
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)

    return perm_replicates
groupA = np.array([True] * 153 + [False] * 91)
groupB = np.array([True] * 136 + [False] * 35)

# fraction of group A that purchased (groupB is unused but keeps the signature draw_perm_reps expects)
def frac_yes(groupA, groupB):
    frac = np.sum(groupA) / len(groupA)
    return frac

# permutation samples
perm_replicates = draw_perm_reps(groupA, groupB, frac_yes, 10000)

frac = np.sum(groupA) / len(groupA)
p = np.sum(perm_replicates <= frac) / len(perm_replicates)
print('p-value =', p)

# Since the p-value is below 0.05, the hypothesis that group membership has no effect is rejected.
p-value = 0.0002

4. Case study

  • Beak depth and length (mm) of finches, measured in 1975 and 2012
bd_1975 = np.array([8.4, 8.8, 8.4, 8., 7.9, 8.9, 8.6, 8.5, 8.9, 9.1, 8.6, 9.8, 8.2, 9., 9.7, 8.6, 8.2, 9., 8.4, 8.6, 8.9, 9.1, 8.3, 8.7, 9.6, 8.5, 9.1, 9., 9.2, 9.9, 8.6, 9.2, 8.4, 8.9, 8.5, 10.4, 9.6, 9.1, 9.3, 9.3, 8.8, 8.3, 8.8, 9.1, 10.1, 8.9, 9.2, 8.5, 10.2, 10.1, 9.2, 9.7, 9.1, 8.5, 8.2, 9., 9.3, 8., 9.1, 8.1, 8.3, 8.7, 8.8, 8.6, 8.7, 8., 8.8, 9., 9.1, 9.74, 9.1, 9.8, 10.4, 8.3, 9.44, 9.04, 9., 9.05, 9.65, 9.45, 8.65, 9.45, 9.45, 9.05, 8.75, 9.45, 8.35])
len(bd_1975)
87
bl_1975 = np.array([13.9, 14., 12.9, 13.5, 12.9, 14.6, 13., 14.2, 14., 14.2, 13.1, 15.1, 13.5, 14.4, 14.9, 12.9, 13., 14.9, 14., 13.8, 13., 14.75, 13.7, 13.8, 14., 14.6, 15.2, 13.5, 15.1, 15., 12.8, 14.9, 15.3, 13.4, 14.2, 15.1, 15.1, 14., 13.6, 14., 14., 13.9, 14., 14.9, 15.6, 13.8, 14.4, 12.8, 14.2, 13.4, 14., 14.8, 14.2, 13.5, 13.4, 14.6, 13.5, 13.7, 13.9, 13.1, 13.4, 13.8, 13.6, 14., 13.5, 12.8, 14., 13.4, 14.9, 15.54, 14.63, 14.73, 15.73, 14.83, 15.94, 15.14, 14.23, 14.15, 14.35, 14.95, 13.95, 14.05, 14.55, 14.05, 14.45, 15.05, 13.25])
len(bl_1975)
87
bd_2012 = np.array([9.4, 8.9, 9.5, 11., 8.7, 8.4, 9.1, 8.7, 10.2, 9.6, 8.85, 8.8, 9.5, 9.2, 9., 9.8, 9.3, 9., 10.2, 7.7, 9., 9.5, 9.4, 8., 8.9, 9.4, 9.5, 8., 10., 8.95, 8.2, 8.8, 9.2, 9.4, 9.5, 8.1, 9.5, 8.4, 9.3, 9.3, 9.6, 9.2, 10., 8.9, 10.5, 8.9, 8.6, 8.8, 9.15, 9.5, 9.1, 10.2, 8.4, 10., 10.2, 9.3, 10.8, 8.3, 7.8, 9.8, 7.9, 8.9, 7.7, 8.9, 9.4, 9.4, 8.5, 8.5, 9.6, 10.2, 8.8, 9.5, 9.3, 9., 9.2, 8.7, 9., 9.1, 8.7, 9.4, 9.8, 8.6, 10.6, 9., 9.5, 8.1, 9.3, 9.6, 8.5, 8.2, 8., 9.5, 9.7, 9.9, 9.1, 9.5, 9.8, 8.4, 8.3, 9.6, 9.4, 10., 8.9, 9.1, 9.8, 9.3, 9.9, 8.9, 8.5, 10.6, 9.3, 8.9, 8.9, 9.7, 9.8, 10.5, 8.4, 10., 9., 8.7, 8.8, 8.4, 9.3, 9.8, 8.9, 9.8, 9.1])
len(bd_2012)
127
bl_2012 = np.array([14.3, 12.5, 13.7, 13.8, 12., 13., 13., 13.6, 12.8, 13.6, 12.95, 13.1, 13.4, 13.9, 12.3, 14., 12.5, 12.3, 13.9, 13.1, 12.5, 13.9, 13.7, 12., 14.4, 13.5, 13.8, 13., 14.9, 12.5, 12.3, 12.8, 13.4, 13.8, 13.5, 13.5, 13.4, 12.3, 14.35, 13.2, 13.8, 14.6, 14.3, 13.8, 13.6, 12.9, 13., 13.5, 13.2, 13.7, 13.1, 13.2, 12.6, 13., 13.9, 13.2, 15., 13.37, 11.4, 13.8, 13., 13., 13.1, 12.8, 13.3, 13.5, 12.4, 13.1, 14., 13.5, 11.8, 13.7, 13.2, 12.2, 13., 13.1, 14.7, 13.7, 13.5, 13.3, 14.1, 12.5, 13.7, 14.6, 14.1, 12.9, 13.9, 13.4, 13., 12.7, 12.1, 14., 14.9, 13.9, 12.9, 14.6, 14., 13., 12.7, 14., 14.1, 14.1, 13., 13.5, 13.4, 13.9, 13.1, 12.9, 14., 14., 14.1, 14.7, 13.4, 13.8, 13.4, 13.8, 12.4, 14.1, 12.9, 13.9, 14.3, 13.2, 14.2, 13., 14.6, 13.1, 15.2])
len(bl_2012)
127
df1 = pd.DataFrame({'beak_depth': bd_1975, 'beak_length': bl_1975, 'year': '1975'})
df2 = pd.DataFrame({'beak_depth': bd_2012, 'beak_length': bl_2012, 'year': '2012'})
df_finch = pd.concat([df1, df2])
df_finch.shape
(214, 3)
df_finch.head()
   beak_depth  beak_length  year
0         8.4         13.9  1975
1         8.8         14.0  1975
2         8.4         12.9  1975
3         8.0         13.5  1975
4         7.9         12.9  1975

bee swarm plot

_ = sb.swarmplot(data=df_finch, x='year', y='beak_depth')
_ = plt.xlabel('year')
_ = plt.ylabel('beak depth (mm)')
plt.show()

png

# The two data sets are not clearly separated.
# The 2012 data has a slightly higher mean and appears more spread out.

ECDFs of beak depths

# ECDFs
x_1975, y_1975 = ecdf(bd_1975)
x_2012, y_2012 = ecdf(bd_2012)

_ = plt.plot(x_1975, y_1975, marker='.', linestyle='none')
_ = plt.plot(x_2012, y_2012, marker='.', linestyle='none')
plt.margins(0.02)
_ = plt.xlabel('beak depth (mm)')
_ = plt.ylabel('ECDF')
_ = plt.legend(('1975', '2012'), loc='lower right')
plt.show()

png

# The difference shows up more clearly in the ECDFs.

Parameter estimate

# difference of the sample means
mean_diff = np.mean(bd_2012) - np.mean(bd_1975)

# bootstrap replicates of means
bs_replicates_1975 = draw_bootstrap_reps(bd_1975, np.mean, 10000)
bs_replicates_2012 = draw_bootstrap_reps(bd_2012, np.mean, 10000)

# difference of means
bs_diff_replicates = bs_replicates_2012 - bs_replicates_1975

# 95% confidence interval
conf_int = np.percentile(bs_diff_replicates, [2.5, 97.5])

print('difference of means =', mean_diff, 'mm')
print('95% confidence interval =', conf_int, 'mm')
difference of means = 0.226220472441 mm
95% confidence interval = [ 0.05981571  0.3871821 ] mm

Hypothesis test : did beaks get deeper by 2012?

  • To show that the deeper finch beaks are not just a chance result, perform a bootstrap hypothesis test: shift both samples to the combined mean, then count how often replicate differences are at least as large as the observed one.
# Shift the samples
combined_mean = np.mean(np.concatenate((bd_1975, bd_2012)))
bd_1975_shifted = bd_1975 - np.mean(bd_1975) + combined_mean
bd_2012_shifted = bd_2012 - np.mean(bd_2012) + combined_mean

# bootstrap replicates of shifted data sets
bs_replicates_1975 = draw_bootstrap_reps(bd_1975_shifted, np.mean, 10000)
bs_replicates_2012 = draw_bootstrap_reps(bd_2012_shifted, np.mean, 10000)
bs_diff_replicates = bs_replicates_2012 - bs_replicates_1975

# p-value
p = np.sum(bs_diff_replicates >= mean_diff) / len(bs_diff_replicates)
print('p =', p)
p = 0.0036
# The difference is statistically significant.
# However, since the difference in means is only about 0.2 mm, it is hard to call it a large difference.
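For comparison, the same one-sided hypothesis can also be tested with the permutation machinery from section 3; a sketch reusing draw_perm_reps:

# sketch: permutation test of the same hypothesis
def diff_of_means(data_1, data_2):
    return np.mean(data_1) - np.mean(data_2)

perm_reps = draw_perm_reps(bd_2012, bd_1975, diff_of_means, size=10000)
print('permutation p =', np.sum(perm_reps >= mean_diff) / len(perm_reps))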

relation of length & depth

_ = plt.plot(bl_1975, bd_1975, marker='.', linestyle='none', color='blue', alpha=0.5)
_ = plt.plot(bl_2012, bd_2012, marker='.', linestyle='none', color='red', alpha=0.5)
_ = plt.xlabel('beak length (mm)')
_ = plt.ylabel('beak depth (mm)')
_ = plt.legend(('1975', '2012'), loc='upper left')
plt.show()

png

# Looking at the red points (2012), the beaks got deeper but not longer; if anything, they look slightly shorter.

Linear regressions

# linear regressions
slope_1975, intercept_1975 = np.polyfit(bl_1975, bd_1975, 1)
slope_2012, intercept_2012 = np.polyfit(bl_2012, bd_2012, 1)

# pairs bootstrap for the linear regressions
bs_slope_reps_1975, bs_intercept_reps_1975 = draw_bs_pairs_linreg(bl_1975, bd_1975, 1000)
bs_slope_reps_2012, bs_intercept_reps_2012 = draw_bs_pairs_linreg(bl_2012, bd_2012, 1000)

# confidence intervals of slopes
slope_conf_int_1975 = np.percentile(bs_slope_reps_1975, [2.5, 97.5])
slope_conf_int_2012 = np.percentile(bs_slope_reps_2012, [2.5, 97.5])
intercept_conf_int_1975 = np.percentile(bs_intercept_reps_1975, [2.5, 97.5])
intercept_conf_int_2012 = np.percentile(bs_intercept_reps_2012, [2.5, 97.5])

# Print the results
print('1975: slope =', slope_1975, 'conf int =', slope_conf_int_1975)
print('1975: intercept =', intercept_1975, 'conf int =', intercept_conf_int_1975)
print('2012: slope =', slope_2012, 'conf int =', slope_conf_int_2012)
print('2012: intercept =', intercept_2012, 'conf int =', intercept_conf_int_2012)
1975: slope = 0.465205169161 conf int = [ 0.33720846  0.58679333]
1975: intercept = 2.39087523658 conf int = [ 0.683399    4.23168645]
2012: slope = 0.462630358835 conf int = [ 0.33744671  0.6003248 ]
2012: intercept = 2.97724749824 conf int = [ 1.09982491  4.66346537]
_ = plt.plot(bl_1975, bd_1975, marker='.', linestyle='none', color='blue', alpha=0.5)
_ = plt.plot(bl_2012, bd_2012, marker='.', linestyle='none', color='red', alpha=0.5)
_ = plt.xlabel('beak length (mm)')
_ = plt.ylabel('beak depth (mm)')
_ = plt.legend(('1975', '2012'), loc='upper left')

# Generate x-values for bootstrap lines
x = np.array([10, 17])

# Plot the bootstrap lines
for i in range(100):
    plt.plot(x, bs_slope_reps_1975[i] * x + bs_intercept_reps_1975[i], linewidth=0.5, alpha=0.2, color='blue')
    plt.plot(x, bs_slope_reps_2012[i] * x + bs_intercept_reps_2012[i], linewidth=0.5, alpha=0.2, color='red')

plt.show()

png

# The slopes are nearly the same, but the intercept (depth) increased by about 0.5 mm.
# beak length/depth ratio
ratio_1975 = bl_1975 / bd_1975
ratio_2012 = bl_2012 / bd_2012

mean_ratio_1975 = np.mean(ratio_1975)
mean_ratio_2012 = np.mean(ratio_2012)

bs_replicates_1975 = draw_bootstrap_reps(ratio_1975, np.mean, size=10000)
bs_replicates_2012 = draw_bootstrap_reps(ratio_2012, np.mean, size=10000)

# 99% confidence intervals
conf_int_1975 = np.percentile(bs_replicates_1975, [0.5, 99.5])
conf_int_2012 = np.percentile(bs_replicates_2012, [0.5, 99.5])

print('1975: mean ratio =', mean_ratio_1975, 'conf int =', conf_int_1975)
print('2012: mean ratio =', mean_ratio_2012, 'conf int =', conf_int_2012)
1975: mean ratio = 1.57888237719 conf int = [ 1.55625152  1.60200243]
2012: mean ratio = 1.46583422768 conf int = [ 1.44420506  1.48776669]
# The mean beak length/depth ratio decreased by about 0.1, or 7%. Since the 99% confidence intervals do not overlap, this is a real change.
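As a final check, a sketch reusing the ratio replicates above: the 99% interval for the difference of mean ratios should exclude zero.

# sketch: 99% CI for the decrease in the mean length/depth ratio
bs_ratio_diff = bs_replicates_1975 - bs_replicates_2012
print(np.percentile(bs_ratio_diff, [0.5, 99.5]))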