탐색적 분석, Bee swarm plot, 이항 분포, 지수 분포


import numpy as np
import pandas as pd
import seaborn as sb
# Set default Seaborn style
sb.set()
import matplotlib.pyplot as plt
%matplotlib inline
# setting plot defatult size
%pylab inline
pylab.rcParams['figure.figsize'] = (12, 6)
Populating the interactive namespace from numpy and matplotlib

1. Graphical exploratory data analysis

from sklearn.datasets import load_iris
iris = load_iris()
iris.feature_names
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
iris.target_names
array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')
virginica_petal_length = iris.data[100:150,2]
virginica_petal_length
array([ 6. ,  5.1,  5.9,  5.6,  5.8,  6.6,  4.5,  6.3,  5.8,  6.1,  5.1,
        5.3,  5.5,  5. ,  5.1,  5.3,  5.5,  6.7,  6.9,  5. ,  5.7,  4.9,
        6.7,  4.9,  5.7,  6. ,  4.8,  4.9,  5.6,  5.8,  6.1,  6.4,  5.6,
        5.1,  5.6,  6.1,  5.6,  5.5,  4.8,  5.4,  5.6,  5.1,  5.1,  5.9,
        5.7,  5.2,  5. ,  5.2,  5.4,  5.1])

Histogram

plt.hist(virginica_petal_length)
plt.ylabel('count')
plt.xlabel('petal length (cm)')
plt.title('virginica_petal_length')
plt.show()

png

# Adjusting the number of bins
n_data = len(virginica_petal_length)
n_bins = int(np.sqrt(n_data))
n_bins
7
plt.hist(virginica_petal_length, bins=n_bins, edgecolor='black', facecolor='orange')
plt.ylabel('count')
plt.xlabel('petal length (cm)')
plt.title('virginica_petal_length')
<matplotlib.text.Text at 0x10f778f28>

png

Transform iris dataset to pandas DataFrame

df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                       columns = iris['feature_names'] + ['target'])
df_iris.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0.0
1 4.9 3.0 1.4 0.2 0.0
2 4.7 3.2 1.3 0.2 0.0
3 4.6 3.1 1.5 0.2 0.0
4 5.0 3.6 1.4 0.2 0.0
# 0:setosa, 1:versicolor, 2:virginica
df_iris.columns = ['sepal_length','sepal_width','petal_length','petal_width','target']
df_iris['target'] = df_iris['target'].astype('int')
df_iris.head()
sepal_length sepal_width petal_length petal_width target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
def make_species(x):
    l_species = []
    for i in x:
        if (i == 0):
            l_species.append('setosa')
        elif (i == 1):
            l_species.append('versicolor')
        else:
            l_species.append('virginica')
            
    return l_species
df_iris['species'] = make_species(np.array(df_iris['target']))
df_iris.head()
sepal_length sepal_width petal_length petal_width target species
0 5.1 3.5 1.4 0.2 0 setosa
1 4.9 3.0 1.4 0.2 0 setosa
2 4.7 3.2 1.3 0.2 0 setosa
3 4.6 3.1 1.5 0.2 0 setosa
4 5.0 3.6 1.4 0.2 0 setosa

Bee swarm plots

_ = sb.swarmplot(x='species', y='petal_length', data=df_iris)
_ = plt.xlabel('species')
_ = plt.ylabel('petal length (cm)')
plt.show()

png

# 2012년 미국 대선 데이터
df = pd.read_csv('data/election2012.csv')
states = (df['state'] == 'CA') | (df['state'] == 'NY') | (df['state'] == 'OH') | (df['state'] == 'TX')
df2012 = df.loc[states]
df2012.head()
state county Obama Romney winner total margin turnout
157 CA Alameda 78.5 18.7 Obama 542421 59.8 2.8
158 CA Alpine 60.1 36.5 Obama 625 23.6 3.4
159 CA Amador 39.2 58.2 Romney 14904 19.0 2.6
160 CA Butte 46.4 50.3 Romney 67879 3.9 3.3
161 CA Calaveras 40.1 57.1 Romney 17941 17.0 2.8
_ = sb.swarmplot(x='state', y='Obama', data=df2012)
_ = plt.xlabel('state')
_ = plt.ylabel('vote Obama %')
_ = plt.title('distribution of counties')
plt.show()

png

Empirical cumulative distribution function (ECDF)

def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    n = len(data)      # Number of data points
    x = np.sort(data)
    y = np.arange(1, n+1) / n

    return x, y
# Compute ECDF for versicolor data
x_value, y_value = ecdf(virginica_petal_length)

# Generate plot
_ = plt.plot(x_value, y_value, marker='.', linestyle='none')
_ = plt.margins(0.02)
_ = plt.ylabel('ECDF')
_ = plt.xlabel('virginica petal length')
plt.show()

# 4.5 ~ 5.5cm : 전체의 50%
# 6.0cm 이상 : 전체의 20%

png

# Comparison of ECDFs
setosa_petal_length = iris.data[0:50,2]
versicolor_petal_length = iris.data[50:100,2]
virginica_petal_length = iris.data[100:150,2]
# Compute ECDFs
x_set, y_set = ecdf(setosa_petal_length)
x_vers, y_vers = ecdf(versicolor_petal_length)
x_virg, y_virg = ecdf(virginica_petal_length)

# Plot all ECDFs on the same plot
_ = plt.plot(x_set, y_set, marker='.', linestyle='none')
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')
_ = plt.plot(x_virg, y_virg, marker='.', linestyle='none')

plt.margins(0.02)

# Annotate the plot
_ = plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

plt.show()

png

# 2012 미국 대선
vote_ny = df2012[df2012['state'] == 'NY']['Obama']
vote_oh = df2012[df2012['state'] == 'OH']['Obama']
vote_tx = df2012[df2012['state'] == 'TX']['Obama']
# Compute ECDFs
x_set, y_set = ecdf(vote_ny)
x_vers, y_vers = ecdf(vote_oh)
x_virg, y_virg = ecdf(vote_tx)

# Plot all ECDFs on the same plot
_ = plt.plot(x_set, y_set, marker='.', linestyle='none')
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')
_ = plt.plot(x_virg, y_virg, marker='.', linestyle='none')

plt.margins(0.02)

# Annotate the plot
_ = plt.legend(('NY', 'OH', 'TX'), loc='lower right')
_ = plt.xlabel('vote Obama')
_ = plt.ylabel('ECDF')

plt.show()

png

2. Quantitative exploratory data analysis

np.mean(vote_ny)
51.967741935483886
np.median(vote_ny)
50.0

Box plot

sb.boxplot(x='species', y='petal_length', data=df_iris)
_ = plt.xlabel('species')
_ = plt.ylabel('petal length (cm)')
plt.show()

png

sb.boxplot(x='state', y='Obama', data=df2012)
_ = plt.xlabel('state')
_ = plt.ylabel('vote Obama %')
plt.show()

png

# 미투표자 비율에 대한 오바마 지지율
_ = plt.plot(df2012.turnout, df2012.Obama, marker='.', linestyle='none')
_ = plt.xlabel('% of not vote')
_ = plt.ylabel('percent of vote for Obama')

png

np.cov(df2012.turnout, df2012.Obama)
array([[   0.56269093,    4.97298927],
       [   4.97298927,  299.29556718]])
def pearson_r(x, y):
    """Compute Pearson correlation coefficient between two arrays."""
    corr_mat = np.corrcoef(x, y)
    return corr_mat[0,1]
pearson_r(df2012.turnout, df2012.Obama)
0.38320610690704882

3. Probability - Discrete variables

Bernoulli trials - 1 or 0 (성공 / 실패)

def perform_bernoulli_trials(n, p):
    """Perform n Bernoulli trials with success probability p and return number of successes."""
    
    n_success = 0

    for i in range(n):
        # Choose random number between 0 and 1
        random_number = np.random.random()

        # If less than p, it's a success  so add one to n_success
        if random_number < p:
            n_success += 1

    return n_success
# 100 명의 대출자 파산 확률 0.05. 1000 번 시뮬레이션
n_defaults = np.empty(1000)

for i in range(1000):
    n_defaults[i] = perform_bernoulli_trials(100, 0.05)

_ = plt.hist(n_defaults, normed=True, edgecolor='black')
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('probability')
plt.show()

png

# 위 상황에 대한 ECDF
x, y = ecdf(n_defaults)

_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('number of defaults out of 100')
_ = plt.ylabel('CDF')
plt.show()

# 파산자가 10명 이상이 될 경우의 수 합계
n_lose_money = np.sum(n_defaults >= 10)
print('Probability of losing money =', n_lose_money / len(n_defaults))

png

Probability of losing money = 0.016

Binomial distribution (이항 분포)

n_defaults = np.random.binomial(n=100, p=0.05, size=10000)

# CDF
x, y = ecdf(n_defaults)

_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('CDF')
plt.show()

png

Probability mass function (PMF) - The set of probabilities of discrete outcomes

bins = np.arange(0, max(n_defaults) + 1.5) - 0.5

_ = plt.hist(n_defaults, normed=True, bins=bins, edgecolor='white')
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('PMF')
plt.margins(0.02)
plt.show()

png

4. Probability - Continuous variables

# 1926년부터 경마 최고 기록
horse_race = np.array([ 148.51, 146.65, 148.52, 150.7, 150.42, 150.88, 151.57, 147.54, 149.65, 148.74, 147.86, 148.75, 147.5, 148.26, 149.71, 146.56, 151.19, 147.88, 149.16, 148.82, 148.96, 152.02, 146.82, 149.97, 146.13, 148.1, 147.2, 146., 149.8, 147., 147.2, 147.8, 148.2, 149., 149.8, 148.6, 146.8, 149.6, 149., 148.2, 149.2, 148., 150.4, 148.8, 147.2, 148.8, 149.6, 148.4, 148.4, 150.2, 148.8, 149.2, 149.2, 148.4, 150.2, 146.6, 149.8, 149., 150.8, 148.6, 150.2, 149., 148.6, 150.2, 148.2, 149.4, 150.8, 150.2, 152.2, 148.2, 149.2, 151., 149.6, 149.6, 149.4, 148.6, 150., 150.6, 149.2, 152.6, 152.8, 149.6, 151.6, 152.8, 153.2, 152.4, 152.2 ])
# 평균과 표준편차
mu = np.mean(horse_race)
sigma = np.std(horse_race)
horse_race
# 샘플링
samples = np.random.normal(mu, sigma, size=10000)

# 샘플링 데이터와 실제 데이터의 CDF
x_theor, y_theor = ecdf(samples)
x, y = ecdf(horse_race)

_ = plt.plot(x_theor, y_theor)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('Belmont winning time (sec.)')
_ = plt.ylabel('CDF')
plt.margins(0.02) 
plt.show()

png

np.min(horse_race)
146.0
# 위 기록을 깰 확률을 시뮬레이션
samples = np.random.normal(mu, sigma, size=100000)
prob = np.sum(samples <= 146) / len(samples)

print('위 기록을 깰 확률:', prob)
위 기록을 깰 확률: 0.02097

Exponential distribution

# 미국 메이저리그 야구
# no-hitter : 투수가 한 경기에서 한번도 안타를 내주지 않음. 평균 764 경기 당 1회 발생.
# cycling hit : 타자가 한 경기에서 1루타, 2루타, 3루타, 홈런을 모두 기록. 평균 715 경기 당 1회 발생.
def successive_poisson(tau1, tau2, size=1):
    # Draw samples out of first exponential distribution
    t1 = np.random.exponential(tau1, size=size)

    # Draw samples out of second exponential distribution
    t2 = np.random.exponential(tau2, size=size)

    return t1 + t2
# no-hitter와 cycling hit가 같이 나타나기 까지의 waiting game 수 시뮬레이션
waiting_times = successive_poisson(764, 715, size=100000)
waiting_times
array([ 1519.61768867,  2444.63870183,   913.80439492, ...,   726.54811866,
        1397.66102354,  1111.50886872])
_ = plt.hist(waiting_times, bins=100, histtype='step', normed=True, edgecolor='blue')
_ = plt.xlabel('total waiting time (games)')
_ = plt.ylabel('PDF')
plt.show()

png