Data indexing, slicing, filtering & transforming. Change index, reshaping & pivot. Detect outliers.


import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# setting plot default size
%pylab inline
pylab.rcParams['figure.figsize'] = (12, 6)
Populating the interactive namespace from numpy and matplotlib

1. Extracting and transforming data

df = pd.read_csv('data/election2012.csv')
df = df.loc[df.state == 'PA']
df.head()
state county Obama Romney winner total margin turnout
2957 PA Adams 35.5 63.1 Romney 41383 27.6 1.4
2958 PA Allegheny 56.6 42.2 Obama 607455 14.4 1.2
2959 PA Armstrong 30.7 67.9 Romney 27925 37.2 1.4
2960 PA Beaver 46.0 52.6 Romney 78951 6.6 1.4
2961 PA Bedford 22.1 77.0 Romney 21239 54.9 0.9

indexing & slicing

election = df.set_index('county')
election.head()
state Obama Romney winner total margin turnout
county
Adams PA 35.5 63.1 Romney 41383 27.6 1.4
Allegheny PA 56.6 42.2 Obama 607455 14.4 1.2
Armstrong PA 30.7 67.9 Romney 27925 37.2 1.4
Beaver PA 46.0 52.6 Romney 78951 6.6 1.4
Bedford PA 22.1 77.0 Romney 21239 54.9 0.9
p_counties = election.loc['Perry':'Potter']
p_counties
state Obama Romney winner total margin turnout
county
Perry PA 29.8 68.6 Romney 17941 38.8 1.6
Philadelphia PA 85.2 14.1 Obama 648864 71.1 0.7
Pike PA 43.9 54.9 Romney 22883 11.0 1.2
Potter PA 26.3 72.2 Romney 7091 45.9 1.5
p_counties = election.loc['Potter':'Perry':-1]   # reverse
p_counties
state Obama Romney winner total margin turnout
county
Potter PA 26.3 72.2 Romney 7091 45.9 1.5
Pike PA 43.9 54.9 Romney 22883 11.0 1.2
Philadelphia PA 85.2 14.1 Obama 648864 71.1 0.7
Perry PA 29.8 68.6 Romney 17941 38.8 1.6

filtering

turnout_df = election[election['turnout'] < 1]
turnout_df
state Obama Romney winner total margin turnout
county
Bedford PA 22.1 77.0 Romney 21239 54.9 0.9
Philadelphia PA 85.2 14.1 Obama 648864 71.1 0.7
low_margin = election[(election.margin < 3) & (election.turnout < 3)]   # use | instead of & for an OR filter
low_margin
state Obama Romney winner total margin turnout
county
Berks PA 48.9 49.5 Romney 160752 0.6 1.6
Bucks PA 50.0 48.8 Obama 315474 1.2 1.2
Centre PA 48.9 49.0 Romney 67374 0.1 2.1
Chester PA 49.2 49.7 Romney 245512 0.5 1.1
Mercer PA 48.0 50.6 Romney 47386 2.6 1.4
# Set winner to NULL (NaN) where the vote-share margin is under 3%.
too_close_row = election['margin'] < 3
election.loc[too_close_row, 'winner'] = np.nan
election.info()
<class 'pandas.core.frame.DataFrame'>
Index: 67 entries, Adams to York
Data columns (total 7 columns):
state      67 non-null object
Obama      67 non-null float64
Romney     67 non-null float64
winner     62 non-null object
total      67 non-null int64
margin     67 non-null float64
turnout    67 non-null float64
dtypes: float64(4), int64(1), object(2)
memory usage: 4.2+ KB
election['winner'].value_counts()
Romney    51
Obama     11
Name: winner, dtype: int64

Transforming DataFrames : apply()

# test data
sales = pd.DataFrame({'month':['Jan','Feb','Mar','Apr','May','Jun'],
                  'eggs':[47,110,226,82,132,210],
                  'salt':[12,50,89,87,np.nan,60],
                  'spam':[17,31,72,20,52,55]})
sales = sales.set_index('month')
sales
eggs salt spam
month
Jan 47 12.0 17
Feb 110 50.0 31
Mar 226 89.0 72
Apr 82 87.0 20
May 132 NaN 52
Jun 210 60.0 55
# function converting counts to dozen units
def dozens(n):
    """Convert a count (scalar or pandas Series) into whole dozens.

    Uses floor division, so any remainder below a full dozen is dropped.
    """
    whole_dozens = n // 12
    return whole_dozens
sales.apply(dozens)
eggs salt spam
month
Jan 3 1.0 1
Feb 9 4.0 2
Mar 18 7.0 6
Apr 6 7.0 1
May 11 NaN 4
Jun 17 5.0 4
sales.apply(lambda n: n//12)
eggs salt spam
month
Jan 3 1.0 1
Feb 9 4.0 2
Mar 18 7.0 6
Apr 6 7.0 1
May 11 NaN 4
Jun 17 5.0 4
sales['dozen_spam'] = sales.spam.apply(dozens)
sales['salty_spam'] = sales.salt + sales.spam
sales
eggs salt spam dozen_spam salty_spam
month
Jan 47 12.0 17 1 29.0
Feb 110 50.0 31 2 81.0
Mar 226 89.0 72 6 161.0
Apr 82 87.0 20 1 107.0
May 132 NaN 52 4 NaN
Jun 210 60.0 55 4 115.0

map() with a dictionary

red_vs_blue = {'Obama':'blue', 'Romney':'red'}
election['color'] = election.winner.map(red_vs_blue)

election.head()
state Obama Romney winner total margin turnout color
county
Adams PA 35.5 63.1 Romney 41383 27.6 1.4 red
Allegheny PA 56.6 42.2 Obama 607455 14.4 1.2 blue
Armstrong PA 30.7 67.9 Romney 27925 37.2 1.4 red
Beaver PA 46.0 52.6 Romney 78951 6.6 1.4 red
Bedford PA 22.1 77.0 Romney 21239 54.9 0.9 red
from scipy.stats import zscore
turnout_zscore = zscore(election['turnout'])    # z-score
election['turnout_zscore'] = turnout_zscore
election.tail()
state Obama Romney winner total margin turnout color turnout_zscore
county
Washington PA 42.7 56.0 Romney 88958 13.3 1.3 red -0.507754
Wayne PA 38.8 59.8 Romney 20669 21.0 1.4 red -0.133913
Westmoreland PA 37.6 61.3 Romney 166809 23.7 1.1 red -1.255436
Wyoming PA 42.9 55.2 Romney 11001 12.3 1.9 red 1.735292
York PA 38.7 59.9 Romney 183702 21.2 1.4 red -0.133913

Change Index

sales.index = [idx.upper() for idx in sales.index]
sales.head(3)
eggs salt spam dozen_spam salty_spam
JAN 47 12.0 17 1 29.0
FEB 110 50.0 31 2 81.0
MAR 226 89.0 72 6 161.0
sales.index.name = 'MONTHS'
sales.head(3)
eggs salt spam dozen_spam salty_spam
MONTHS
JAN 47 12.0 17 1 29.0
FEB 110 50.0 31 2 81.0
MAR 226 89.0 72 6 161.0

Hierarchical indexing / multi index

# make test data
sales = pd.DataFrame({'state':['CA','CA','TX','TX','NY','NY'], 'month':[1,2,1,2,1,2],
                      'eggs':[47,110,221,77,69,88],
                      'salt':[12,50,89,87,73,49], 'spam':[17,31,72,20,37,56]})
sales
eggs month salt spam state
0 47 1 12 17 CA
1 110 2 50 31 CA
2 221 1 89 72 TX
3 77 2 87 20 TX
4 69 1 73 37 NY
5 88 2 49 56 NY
sales2 = sales.set_index(['state','month'])   # multi index
sales2
eggs salt spam
state month
CA 1 47 12 17
2 110 50 31
TX 1 221 89 72
2 77 87 20
NY 1 69 73 37
2 88 49 56
sales2.index      # inspect the index metadata
MultiIndex(levels=[['CA', 'NY', 'TX'], [1, 2]],
           labels=[[0, 0, 2, 2, 1, 1], [0, 1, 0, 1, 0, 1]],
           names=['state', 'month'])
sales2.loc[('NY', 1)]
eggs    69
salt    73
spam    37
Name: (NY, 1), dtype: int64
sales2.loc['CA']
eggs salt spam
month
1 47 12 17
2 110 50 31

2. Rearranging and reshaping data

Pivot / Stack & Unstack

sales = pd.DataFrame({'weekday':['Sun','Sun','Mon','Mon'],
                  'item':['egg','spam','egg','spam'],
                  'sold':[120,250,189,187],
                  'retail':[17,31,72,20]})
sales
item retail sold weekday
0 egg 17 120 Sun
1 spam 31 250 Sun
2 egg 72 189 Mon
3 spam 20 187 Mon
pivot_sold = sales.pivot(index='weekday', columns='item', values='sold')
pivot_sold
item egg spam
weekday
Mon 189 187
Sun 120 250
pivot_sales = sales.pivot(index='weekday', columns='item')
pivot_sales
retail sold
item egg spam egg spam
weekday
Mon 72 20 189 187
Sun 17 31 120 250

stack & unstack : reference blog

sales = sales.set_index(['weekday','item'])
sales
retail sold
weekday item
Sun egg 17 120
spam 31 250
Mon egg 72 189
spam 20 187
byweekday = sales.unstack(level='weekday')
byweekday
retail sold
weekday Mon Sun Mon Sun
item
egg 72 17 189 120
spam 20 31 187 250
byweekday.stack(level='weekday')
retail sold
item weekday
egg Mon 72 189
Sun 17 120
spam Mon 20 187
Sun 31 250
byitem = sales.unstack(level='item')
byitem
retail sold
item egg spam egg spam
weekday
Mon 72 20 189 187
Sun 17 31 120 250
byitem.stack(level='item')
retail sold
weekday item
Mon egg 72 189
spam 20 187
Sun egg 17 120
spam 31 250

Melting DataFrames : reference blog

sales = pd.DataFrame({'weekday':['Sun','Sun','Mon','Mon'],
                  'item':['egg','spam','egg','spam'],
                  'sold':[120,250,189,187],
                  'retail':[17,31,72,20]})
sales
item retail sold weekday
0 egg 17 120 Sun
1 spam 31 250 Sun
2 egg 72 189 Mon
3 spam 20 187 Mon
sales2 = pd.melt(sales, id_vars=['weekday'], value_name='value')
sales2
weekday variable value
0 Sun item egg
1 Sun item spam
2 Mon item egg
3 Mon item spam
4 Sun retail 17
5 Sun retail 31
6 Mon retail 72
7 Mon retail 20
8 Sun sold 120
9 Sun sold 250
10 Mon sold 189
11 Mon sold 187
sales3 = pd.melt(sales, id_vars=['weekday','item'])
sales3
weekday item variable value
0 Sun egg retail 17
1 Sun spam retail 31
2 Mon egg retail 72
3 Mon spam retail 20
4 Sun egg sold 120
5 Sun spam sold 250
6 Mon egg sold 189
7 Mon spam sold 187
kv_pairs = pd.melt(sales, col_level=0)
kv_pairs.tail()
variable value
11 sold 187
12 weekday Sun
13 weekday Sun
14 weekday Mon
15 weekday Mon

pivot_table & aggregation

sales
item retail sold weekday
0 egg 17 120 Sun
1 spam 31 250 Sun
2 egg 72 189 Mon
3 spam 20 187 Mon
by_item_day = sales.pivot_table(index='weekday', columns='item')
by_item_day
retail sold
item egg spam egg spam
weekday
Mon 72 20 189 187
Sun 17 31 120 250
sales.pivot_table(index='weekday', aggfunc=sum)
retail sold
weekday
Mon 92 376
Sun 48 370
sales.pivot_table(index='weekday', aggfunc=sum, margins=True)   # include the 'All' totals row
retail sold
weekday
Mon 92.0 376.0
Sun 48.0 370.0
All 140.0 746.0

3. Grouping data

titanic = pd.read_csv('data/titanic.csv')
titanic.head()
id survived pclass name sex age sibsp parch ticket fare cabin embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
by_class = titanic.groupby('pclass')
by_class
<pandas.core.groupby.DataFrameGroupBy object at 0x10eefaf28>
by_class['survived'].count()
pclass
1    216
2    184
3    491
Name: survived, dtype: int64
by_class_sub = by_class[['age','fare']]
aggregated = by_class_sub.agg(['max','median','mean'])
aggregated
age fare
max median mean max median mean
pclass
1 80.0 37.0 38.233441 512.3292 60.2875 84.154687
2 70.0 29.0 29.877630 73.5000 14.2500 20.662183
3 74.0 24.0 25.140620 69.5500 8.0500 13.675550
aggregated.loc[:, ('age','max')]
pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64
titanic.groupby(['embarked','pclass'])['survived'].count()
embarked  pclass
C         1          85
          2          17
          3          66
Q         1           2
          2           3
          3          72
S         1         127
          2         164
          3         353
Name: survived, dtype: int64

Aggregating on index levels/fields

titanic_idx = pd.read_csv('data/titanic.csv', index_col=['pclass','sex']).sort_index()
titanic_idx
id survived name age sibsp parch ticket fare cabin embarked
pclass sex
1 female 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 PC 17599 71.2833 C85 C
female 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 113803 53.1000 C123 S
female 12 1 Bonnell, Miss. Elizabeth 58.0 0 0 113783 26.5500 C103 S
female 32 1 Spencer, Mrs. William Augustus (Marie Eugenie) NaN 1 0 PC 17569 146.5208 B78 C
female 53 1 Harper, Mrs. Henry Sleeper (Myna Haxtun) 49.0 1 0 PC 17572 76.7292 D33 C
female 62 1 Icard, Miss. Amelie 38.0 0 0 113572 80.0000 B28 NaN
female 89 1 Fortune, Miss. Mabel Helen 23.0 3 2 19950 263.0000 C23 C25 C27 S
female 137 1 Newsom, Miss. Helen Monypeny 19.0 0 2 11752 26.2833 D47 S
female 152 1 Pears, Mrs. Thomas (Edith Wearne) 22.0 1 0 113776 66.6000 C2 S
female 167 1 Chibnall, Mrs. (Edith Martha Bowerman) NaN 0 1 113505 55.0000 E33 S
female 178 0 Isham, Miss. Ann Elizabeth 50.0 0 0 PC 17595 28.7125 C49 C
female 195 1 Brown, Mrs. James Joseph (Margaret Tobin) 44.0 0 0 PC 17610 27.7208 B4 C
female 196 1 Lurette, Miss. Elise 58.0 0 0 PC 17569 146.5208 B80 C
female 216 1 Newell, Miss. Madeleine 31.0 1 0 35273 113.2750 D36 C
female 219 1 Bazzani, Miss. Albina 32.0 0 0 11813 76.2917 D15 C
female 231 1 Harris, Mrs. Henry Birkhardt (Irene Wallach) 35.0 1 0 36973 83.4750 C83 S
female 257 1 Thorne, Mrs. Gertrude Maybelle NaN 0 0 PC 17585 79.2000 NaN C
female 258 1 Cherry, Miss. Gladys 30.0 0 0 110152 86.5000 B77 S
female 259 1 Ward, Miss. Anna 35.0 0 0 PC 17755 512.3292 NaN C
female 269 1 Graham, Mrs. William Thompson (Edith Junkins) 58.0 0 1 PC 17582 153.4625 C125 S
female 270 1 Bissette, Miss. Amelia 35.0 0 0 PC 17760 135.6333 C99 S
female 276 1 Andrews, Miss. Kornelia Theodosia 63.0 1 0 13502 77.9583 D7 S
female 291 1 Barber, Miss. Ellen "Nellie" 26.0 0 0 19877 78.8500 NaN S
female 292 1 Bishop, Mrs. Dickinson H (Helen Walton) 19.0 1 0 11967 91.0792 B49 C
female 298 0 Allison, Miss. Helen Loraine 2.0 1 2 113781 151.5500 C22 C26 S
female 300 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) 50.0 0 1 PC 17558 247.5208 B58 B60 C
female 307 1 Fleming, Miss. Margaret NaN 0 0 17421 110.8833 NaN C
female 308 1 Penasco y Castellana, Mrs. Victor de Satode (M... 17.0 1 0 PC 17758 108.9000 C65 C
female 310 1 Francatelli, Miss. Laura Mabel 30.0 0 0 PC 17485 56.9292 E36 C
female 311 1 Hays, Miss. Margaret Bechstein 24.0 0 0 11767 83.1583 C54 C
... ... ... ... ... ... ... ... ... ... ... ...
3 male 825 0 Panula, Master. Urho Abraham 2.0 4 1 3101295 39.6875 NaN S
male 826 0 Flynn, Mr. John NaN 0 0 368323 6.9500 NaN Q
male 827 0 Lam, Mr. Len NaN 0 0 1601 56.4958 NaN S
male 829 1 McCormack, Mr. Thomas Joseph NaN 0 0 367228 7.7500 NaN Q
male 833 0 Saad, Mr. Amin NaN 0 0 2671 7.2292 NaN C
male 834 0 Augustsson, Mr. Albert 23.0 0 0 347468 7.8542 NaN S
male 835 0 Allum, Mr. Owen George 18.0 0 0 2223 8.3000 NaN S
male 837 0 Pasic, Mr. Jakob 21.0 0 0 315097 8.6625 NaN S
male 838 0 Sirota, Mr. Maurice NaN 0 0 392092 8.0500 NaN S
male 839 1 Chip, Mr. Chang 32.0 0 0 1601 56.4958 NaN S
male 841 0 Alhomaki, Mr. Ilmari Rudolf 20.0 0 0 SOTON/O2 3101287 7.9250 NaN S
male 844 0 Lemberopolous, Mr. Peter L 34.5 0 0 2683 6.4375 NaN C
male 845 0 Culumovic, Mr. Jeso 17.0 0 0 315090 8.6625 NaN S
male 846 0 Abbing, Mr. Anthony 42.0 0 0 C.A. 5547 7.5500 NaN S
male 847 0 Sage, Mr. Douglas Bullen NaN 8 2 CA. 2343 69.5500 NaN S
male 848 0 Markoff, Mr. Marin 35.0 0 0 349213 7.8958 NaN C
male 851 0 Andersson, Master. Sigvard Harald Elias 4.0 4 2 347082 31.2750 NaN S
male 852 0 Svensson, Mr. Johan 74.0 0 0 347060 7.7750 NaN S
male 860 0 Razi, Mr. Raihed NaN 0 0 2629 7.2292 NaN C
male 861 0 Hansen, Mr. Claus Peter 41.0 2 0 350026 14.1083 NaN S
male 869 0 van Melkebeke, Mr. Philemon NaN 0 0 345777 9.5000 NaN S
male 870 1 Johnson, Master. Harold Theodor 4.0 1 1 347742 11.1333 NaN S
male 871 0 Balkic, Mr. Cerin 26.0 0 0 349248 7.8958 NaN S
male 874 0 Vander Cruyssen, Mr. Victor 47.0 0 0 345765 9.0000 NaN S
male 877 0 Gustafsson, Mr. Alfred Ossian 20.0 0 0 7534 9.8458 NaN S
male 878 0 Petroff, Mr. Nedelio 19.0 0 0 349212 7.8958 NaN S
male 879 0 Laleff, Mr. Kristo NaN 0 0 349217 7.8958 NaN S
male 882 0 Markun, Mr. Johann 33.0 0 0 349257 7.8958 NaN S
male 885 0 Sutehall, Mr. Henry Jr 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S
male 891 0 Dooley, Mr. Patrick 32.0 0 0 370376 7.7500 NaN Q

891 rows × 10 columns

by_grp1 = titanic_idx.groupby(level=['pclass','sex'])
by_grp1
<pandas.core.groupby.DataFrameGroupBy object at 0x10f0b4828>
# Define the function to compute spread
def spread(series):
    """Return the range of *series*: its maximum minus its minimum."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest
# Aggregate by using the dictionary
aggregator = {'survived':'sum', 'age':'mean', 'fare':spread}
aggr = by_grp1.agg(aggregator)
aggr
fare survived age
pclass sex
1 female 486.4000 91 34.611765
male 512.3292 45 41.281386
2 female 54.5000 70 28.722973
male 73.5000 17 30.740707
3 female 62.8000 72 21.750000
male 69.5500 47 26.507589

Grouping on a function of the index

# make test data
dates = pd.date_range('20161001', periods=30)
df = pd.DataFrame(np.random.rand(30,3).round(2), index=dates, columns=list('ABC'))
df.index.name = 'Date'
df.head()
A B C
Date
2016-10-01 0.03 0.31 0.05
2016-10-02 1.00 0.04 0.10
2016-10-03 0.06 0.66 0.36
2016-10-04 0.60 0.37 0.27
2016-10-05 0.78 0.21 0.59
by_day = df.groupby(df.index.strftime('%a'))
by_day['C'].sum()
Fri    1.64
Mon    2.23
Sat    1.32
Sun    2.46
Thu    1.42
Tue    2.41
Wed    2.51
Name: C, dtype: float64

Detecting outliers with Z-Scores

from scipy.stats import zscore
import warnings
warnings.simplefilter(action = "ignore", category = RuntimeWarning)
cars = pd.read_csv('data/automobile.csv')
cars.head()
symboling normalized_losses maker fuel aspiration doors body wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.4 10.0 102 5500 24 30 13950
1 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.4 8.0 115 5500 18 22 17450
2 1 158 audi gas std four sedan fwd front 105.8 ... 136 mpfi 3.19 3.4 8.5 110 5500 19 25 17710
3 1 158 audi gas turbo four sedan fwd front 105.8 ... 131 mpfi 3.13 3.4 8.3 140 5500 17 20 23875
4 2 192 bmw gas std two sedan rwd front 101.2 ... 108 mpfi 3.50 2.8 8.8 101 5800 23 29 16430

5 rows × 26 columns

np.seterr(divide='ignore')
standardized = cars.groupby('cylinders')['city_mpg','price'].transform(zscore)
# identify outliers
outliers = (standardized['city_mpg'] < -3) | (standardized['city_mpg'] > 3)
cars.loc[outliers]
symboling normalized_losses maker fuel aspiration doors body wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
19 2 137 honda gas std two hatchback fwd front 86.6 ... 92 1bbl 2.91 3.41 9.6 58 4800 49 54 6479
60 1 128 nissan diesel std two sedan fwd front 94.5 ... 103 idi 2.99 3.47 21.9 55 4800 45 50 7099

2 rows × 26 columns

Filling missing data (imputation) by group

titanic.tail()
id survived pclass name sex age sibsp parch ticket fare cabin embarked
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.00 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.00 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.00 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q
# Write a function that imputes median
def impute_median(series):
    """Fill missing values in *series* with the series' own median."""
    fill_value = series.median()
    return series.fillna(fill_value)
# grouping by sex & pclass
by_sex_class = titanic.groupby(['sex','pclass'])
# Fill missing ages with the median of each sex & pclass group
titanic['age'] = by_sex_class['age'].transform(impute_median)
titanic.tail()
id survived pclass name sex age sibsp parch ticket fare cabin embarked
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.00 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.00 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 21.5 1 2 W./C. 6607 23.45 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.00 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q

transformations with .apply

def disparity(gr):
    """Per-group fare statistics for use with GroupBy.apply.

    Returns a DataFrame aligned with *gr* containing:
    - 'zscore': each row's fare standardized against the group's
      mean and (sample, ddof=1) standard deviation;
    - 'spread': the group's max-min fare range, broadcast to every row.
    """
    fares = gr['fare']
    fare_range = fares.max() - fares.min()
    standardized = (fares - fares.mean()) / fares.std()
    return pd.DataFrame({'zscore': standardized, 'spread': fare_range})
by_class = titanic.groupby('pclass')
by_class['fare'].mean()
pclass
1    84.154687
2    20.662183
3    13.675550
Name: fare, dtype: float64
pgroup_disp = by_class.apply(disparity)  # applied to every row within each group
pgroup_disp.head()
spread zscore
0 69.5500 -0.545549
1 512.3292 -0.164217
2 69.5500 -0.488239
3 512.3292 -0.396205
4 69.5500 -0.477626
# survival rate on the C deck
def c_deck_survival(gr):
    """Return the survival rate of passengers berthed on deck C.

    A passenger is on deck C when the 'cabin' string starts with 'C'.
    Rows with a missing cabin are treated as not on deck C via the
    `na=False` parameter — the idiomatic replacement for chaining
    `.fillna(False)`, which triggers an object-dtype downcast warning
    in modern pandas.
    """
    on_c_deck = gr['cabin'].str.startswith('C', na=False)
    return gr.loc[on_c_deck, 'survived'].mean()
titanic.groupby('sex').apply(c_deck_survival)
sex
female    0.888889
male      0.343750
dtype: float64
# survival rate split at age 10
under10 = (titanic['age'] < 10).map({True:'under 10', False:'over 10'})   # Create the Boolean Series
survived_mean_1 = titanic.groupby(under10)['survived'].mean()
survived_mean_1
age
over 10     0.366707
under 10    0.612903
Name: survived, dtype: float64
survived_mean_2 = titanic.groupby([under10, 'pclass'])['survived'].mean()
survived_mean_2
age       pclass
over 10   1         0.629108
          2         0.419162
          3         0.222717
under 10  1         0.666667
          2         1.000000
          3         0.452381
Name: survived, dtype: float64

4. Exercise

Olympic medals dataset

medals = pd.read_csv('data/olympic_medals.csv')
medals.head()
City Edition Sport Discipline Athlete NOC Gender Event Event_gender Medal
0 Athens 1896 Aquatics Swimming HAJOS, Alfred HUN Men 100m freestyle M Gold
1 Athens 1896 Aquatics Swimming HERSCHMANN, Otto AUT Men 100m freestyle M Silver
2 Athens 1896 Aquatics Swimming DRIVAS, Dimitrios GRE Men 100m freestyle for sailors M Bronze
3 Athens 1896 Aquatics Swimming MALOKINIS, Ioannis GRE Men 100m freestyle for sailors M Gold
4 Athens 1896 Aquatics Swimming CHASAPIS, Spiridon GRE Men 100m freestyle for sailors M Silver
# total medal count by country
medals['NOC'].value_counts()[0:10]
USA    4335
URS    2049
GBR    1594
FRA    1314
ITA    1228
GER    1211
AUS    1075
HUN    1053
SWE    1021
GDR     825
Name: NOC, dtype: int64
# count medals by type
counted = medals.pivot_table(index='NOC', values='Athlete', columns='Medal', aggfunc='count')
counted['totals'] = counted.sum(axis='columns')   # Create the new column
counted = counted.sort_values('totals', ascending=False)
counted.head()
Medal Bronze Gold Silver totals
NOC
USA 1052.0 2088.0 1195.0 4335.0
URS 584.0 838.0 627.0 2049.0
GBR 505.0 498.0 591.0 1594.0
FRA 475.0 378.0 461.0 1314.0
ITA 374.0 460.0 394.0 1228.0
# drop_duplicates
ev_gen = medals[['Event_gender', 'Gender']]
ev_gen_uniques = ev_gen.drop_duplicates()
ev_gen_uniques
Event_gender Gender
0 M Men
348 X Men
416 W Women
639 X Women
23675 W Men
medals_by_gender = medals.groupby(['Event_gender', 'Gender'])
medal_count_by_gender = medals_by_gender.count()
medal_count_by_gender
City Edition Sport Discipline Athlete NOC Event Medal
Event_gender Gender
M Men 20067 20067 20067 20067 20067 20067 20067 20067
W Men 1 1 1 1 1 1 1 1
Women 7277 7277 7277 7277 7277 7277 7277 7277
X Men 1653 1653 1653 1653 1653 1653 1653 1653
Women 218 218 218 218 218 218 218 218
medals[(medals.Event_gender == 'W') & (medals.Gender == 'Men')]
City Edition Sport Discipline Athlete NOC Gender Event Event_gender Medal
23675 Sydney 2000 Athletics Athletics CHEPCHUMBA, Joyce KEN Men marathon W Bronze
# distinct : nunique()
# number of distinct sports in which each country won medals
country_grouped = medals.groupby('NOC')
Nsports = country_grouped['Sport'].nunique()
Nsports = Nsports.sort_values(ascending=False)
Nsports.head()
NOC
USA    34
GBR    31
FRA    28
GER    26
CHN    24
Name: Sport, dtype: int64
# USA vs. USSR
during_cold_war = (medals.Edition>=1952) & (medals.Edition<=1988)
is_usa_urs = medals.NOC.isin(['USA', 'URS'])

cold_war_medals = medals.loc[during_cold_war & is_usa_urs]

country_grouped = cold_war_medals.groupby('NOC')
Nsports = country_grouped['Sport'].nunique().sort_values(ascending=False)
Nsports
NOC
URS    21
USA    20
Name: Sport, dtype: int64
medals_won_by_country = medals.pivot_table(index='Edition', columns='NOC', values='Athlete', aggfunc='count')
cold_war_usa_usr_medals = medals_won_by_country.loc[1952:1988, ['USA','URS']]   # Slice
most_medals = cold_war_usa_usr_medals.idxmax(axis='columns')
most_medals.value_counts()
URS    8
USA    2
dtype: int64
# USA Medal Counts by Edition
usa = medals[medals.NOC == 'USA']
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()

# Reshape
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
# Line Plot
usa_medals_by_year.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10efad0b8>

png

# Area Plot
usa_medals_by_year.plot.area()
<matplotlib.axes._subplots.AxesSubplot at 0x112b3d2b0>

png

# Area Plot with Ordered Medals
# Redefine 'Medal' as an ordered categorical
medals.Medal = pd.Categorical(values=medals.Medal, categories=['Bronze', 'Silver', 'Gold'], ordered=True)

# Create the DataFrame
usa = medals[medals.NOC == 'USA']
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')

usa_medals_by_year.plot.area()
<matplotlib.axes._subplots.AxesSubplot at 0x112b56630>

png