Merging DataFrames
여러 파일에서 데이터 읽기, 데이터 합치기, Merge with inner/left/outer join
import pandas as pd
import numpy as np
1. Reading multiple data files
filenames = ['sales_gold.csv', 'sales_silver.csv', 'sales_bronze.csv']
# 1. Loop
dataframes = []
for filename in filenames:
dataframes.append(pd.read_csv('data/'+filename))
dataframes[0].head()
item | total | |
---|---|---|
0 | apple | 500 |
1 | orange | 300 |
2 | peach | 200 |
3 | berry | 100 |
# 2. comprehension
dataframes = [pd.read_csv('data/'+f) for f in filenames]
dataframes[0].head()
item | total | |
---|---|---|
0 | apple | 500 |
1 | orange | 300 |
2 | peach | 200 |
3 | berry | 100 |
# 3. glob
from glob import glob
filenames = glob('data/sales*.csv')
dataframes = [pd.read_csv(f) for f in filenames]
dataframes[0].head()
month | total | |
---|---|---|
0 | apple | 600 |
1 | orange | 750 |
2 | peach | 570 |
3 | berry | 210 |
# combine
filenames
['data/sales_bronze.csv', 'data/sales_gold.csv', 'data/sales_silver.csv']
items = dataframes[1].copy()
items.columns = ['item','gold']
items['silver'] = dataframes[2]['total']
items['bronze'] = dataframes[0]['total']
items
item | gold | silver | bronze | |
---|---|---|---|---|
0 | apple | 500 | 700 | 600 |
1 | orange | 300 | 550 | 750 |
2 | peach | 200 | 400 | 570 |
3 | berry | 100 | 180 | 210 |
index & sort
items2 = items.set_index('item')
items2
gold | silver | bronze | |
---|---|---|---|
item | |||
apple | 500 | 700 | 600 |
orange | 300 | 550 | 750 |
peach | 200 | 400 | 570 |
berry | 100 | 180 | 210 |
items2.sort_index()
gold | silver | bronze | |
---|---|---|---|
item | |||
apple | 500 | 700 | 600 |
berry | 100 | 180 | 210 |
orange | 300 | 550 | 750 |
peach | 200 | 400 | 570 |
items2.sort_values('gold')
gold | silver | bronze | |
---|---|---|---|
item | |||
berry | 100 | 180 | 210 |
peach | 200 | 400 | 570 |
orange | 300 | 550 | 750 |
apple | 500 | 700 | 600 |
# Reindexing DataFrame from a list
itemlist = ['apple', 'banana', 'berry', 'orange', 'peach', 'mango']
items3 = items2.reindex(itemlist)
items3
gold | silver | bronze | |
---|---|---|---|
item | |||
apple | 500.0 | 700.0 | 600.0 |
banana | NaN | NaN | NaN |
berry | 100.0 | 180.0 | 210.0 |
orange | 300.0 | 550.0 | 750.0 |
peach | 200.0 | 400.0 | 570.0 |
mango | NaN | NaN | NaN |
# Reindexing using another DataFrame Index
items4 = items3.reindex(items2.index)
items4
gold | silver | bronze | |
---|---|---|---|
item | |||
apple | 500.0 | 700.0 | 600.0 |
orange | 300.0 | 550.0 | 750.0 |
peach | 200.0 | 400.0 | 570.0 |
berry | 100.0 | 180.0 | 210.0 |
Arithmetic with Series & DataFrames
df = pd.DataFrame({'Date':['2017-01-10','2017-01-11','2017-01-12'], 'TemperatureF':[32,25,34]})
temps_f = df.set_index('Date')
temps_f
TemperatureF | |
---|---|
Date | |
2017-01-10 | 32 |
2017-01-11 | 25 |
2017-01-12 | 34 |
temps_f.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 2017-01-10 to 2017-01-12
Data columns (total 1 columns):
TemperatureF 3 non-null int64
dtypes: int64(1)
memory usage: 48.0+ bytes
temps_c = ((temps_f - 32) * 5/9).round(2)
temps_c.columns = temps_c.columns.str.replace('F', 'C')
temps_c
TemperatureC | |
---|---|
Date | |
2017-01-10 | 0.00 |
2017-01-11 | -3.89 |
2017-01-12 | 1.11 |
2. Concatenating data
- result = s1.append(s2).append(s3)
- result = pd.concat([s1, s2, s3])
df1 = pd.DataFrame({'area':['CT', 'MA', 'NY'], 'stores':[320,205,845]})
df_area1 = df1.set_index('area')
df_area1
stores | |
---|---|
area | |
CT | 320 |
MA | 205 |
NY | 845 |
df2 = pd.DataFrame({'area':['FL', 'GA', 'AL'], 'stores':[90,115,25]})
df_area2 = df2.set_index('area')
df_area2
stores | |
---|---|
area | |
FL | 90 |
GA | 115 |
AL | 25 |
# append
store = df_area1.append(df_area2)
store
stores | |
---|---|
area | |
CT | 320 |
MA | 205 |
NY | 845 |
FL | 90 |
GA | 115 |
AL | 25 |
# concat
store = pd.concat([df_area1, df_area2])
store
stores | |
---|---|
area | |
CT | 320 |
MA | 205 |
NY | 845 |
FL | 90 |
GA | 115 |
AL | 25 |
# Dataframe의 컬럼이 서로 다른 경우
df_area1
stores | |
---|---|
area | |
CT | 320 |
MA | 205 |
NY | 845 |
df3 = pd.DataFrame({'area':['FL', 'NY', 'AL'], 'members':[90,115,25]})
df_area3 = df3.set_index('area')
df_area3
members | |
---|---|
area | |
FL | 90 |
NY | 115 |
AL | 25 |
store = df_area1.append(df_area3) # ---> index 동일한 것이 있으면 중복.
store
members | stores | |
---|---|---|
area | ||
CT | NaN | 320.0 |
MA | NaN | 205.0 |
NY | NaN | 845.0 |
FL | 90.0 | NaN |
NY | 115.0 | NaN |
AL | 25.0 | NaN |
store = pd.concat([df_area1, df_area3], axis=1) # index 동일한 것은 하나로 묶음.
store
stores | members | |
---|---|---|
AL | NaN | 25.0 |
CT | 320.0 | NaN |
FL | NaN | 90.0 |
MA | 205.0 | NaN |
NY | 845.0 | 115.0 |
Concat using multi-index on rows / columns
df2015 = df_area1.copy()
df2015
stores | |
---|---|
area | |
CT | 320 |
MA | 205 |
NY | 845 |
df1 = pd.DataFrame({'area':['CT', 'MA', 'NY'], 'stores':[839,560,745]})
df2016 = df1.set_index('area')
df2016
stores | |
---|---|
area | |
CT | 839 |
MA | 560 |
NY | 745 |
df_store = pd.concat([df2015, df2016], keys=[2015,2016], axis=0)
df_store
stores | ||
---|---|---|
area | ||
2015 | CT | 320 |
MA | 205 | |
NY | 845 | |
2016 | CT | 839 |
MA | 560 | |
NY | 745 |
df_store = pd.concat([df2015, df2016], keys=[2015,2016], axis='columns')
df_store
2015 | 2016 | |
---|---|---|
stores | stores | |
area | ||
CT | 320 | 839 |
MA | 205 | 560 |
NY | 845 | 745 |
concat with Dictionary
store_dic = {2015:df2015, 2016:df2016}
df_store = pd.concat(store_dic, axis='columns')
df_store
2015 | 2016 | |
---|---|---|
stores | stores | |
area | ||
CT | 320 | 839 |
MA | 205 | 560 |
NY | 845 | 745 |
inner join / outer join
df_area1
stores | |
---|---|
area | |
CT | 320 |
MA | 205 |
NY | 845 |
df_area3
members | |
---|---|
area | |
FL | 90 |
NY | 115 |
AL | 25 |
pd.concat([df_area1, df_area3], axis=1, join='inner')
stores | members | |
---|---|---|
area | ||
NY | 845 | 115 |
pd.concat([df_area1, df_area3], axis=1, join='outer')
stores | members | |
---|---|---|
AL | NaN | 25.0 |
CT | 320.0 | NaN |
FL | NaN | 90.0 |
MA | 205.0 | NaN |
NY | 845.0 | 115.0 |
3. Merging DataFrames
# pd.merge : index가 없는 DataFrame 들을 컬럼 기준으로 합친다.
컬럼명이 서로 다른 경우
df1 = pd.DataFrame({'area':['CT', 'MA', 'NY'], 'stores':[320,205,845]})
df1
area | stores | |
---|---|---|
0 | CT | 320 |
1 | MA | 205 |
2 | NY | 845 |
df2 = pd.DataFrame({'area':['FL', 'NY', 'AL'], 'members':[90,115,25]})
df2
area | members | |
---|---|---|
0 | FL | 90 |
1 | NY | 115 |
2 | AL | 25 |
pd.merge(df1, df2)
area | stores | members | |
---|---|---|---|
0 | NY | 845 | 115 |
컬럼명이 서로 같은 경우
df2015 = pd.DataFrame({'area':['CT', 'MA', 'NY'], 'stores':[320,205,845]})
df2016 = pd.DataFrame({'area':['CT', 'MA', 'NY'], 'stores':[900,115,250]})
pd.merge(df2015, df2016)
area | stores |
---|
pd.merge(df2015, df2016, on='area')
area | stores_x | stores_y | |
---|---|---|---|
0 | CT | 320 | 900 |
1 | MA | 205 | 115 |
2 | NY | 845 | 250 |
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'])
area | stores_2015 | stores_2016 | |
---|---|---|---|
0 | CT | 320 | 900 |
1 | MA | 205 | 115 |
2 | NY | 845 | 250 |
기준이 되는 컬럼명이 서로 다른 경우
df1 = pd.DataFrame({'area':['CT', 'MA', 'NY'], 'stores':[320,205,845]})
df2 = pd.DataFrame({'state':['CT', 'MA', 'NY'], 'stores':[900,115,250]})
pd.merge(df1, df2, left_on='area', right_on='state')
area | stores_x | state | stores_y | |
---|---|---|---|---|
0 | CT | 320 | CT | 900 |
1 | MA | 205 | MA | 115 |
2 | NY | 845 | NY | 250 |
Merging with inner / left / outer join
df2015 = pd.DataFrame({'area':['NY', 'MA', 'CT'], 'stores':[320,205,845]})
df2016 = pd.DataFrame({'area':['CT', 'TX', 'NY'], 'stores':[900,115,250]})
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016']) # inner
area | stores_2015 | stores_2016 | |
---|---|---|---|
0 | NY | 320 | 250 |
1 | CT | 845 | 900 |
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='left')
area | stores_2015 | stores_2016 | |
---|---|---|---|
0 | NY | 320 | 250.0 |
1 | MA | 205 | NaN |
2 | CT | 845 | 900.0 |
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='right')
area | stores_2015 | stores_2016 | |
---|---|---|---|
0 | NY | 320.0 | 250 |
1 | CT | 845.0 | 900 |
2 | TX | NaN | 115 |
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='outer')
area | stores_2015 | stores_2016 | |
---|---|---|---|
0 | NY | 320.0 | 250.0 |
1 | MA | 205.0 | NaN |
2 | CT | 845.0 | 900.0 |
3 | TX | NaN | 115.0 |
# Ordered Merge
pd.merge_ordered(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='outer')
area | stores_2015 | stores_2016 | |
---|---|---|---|
0 | CT | 845.0 | 900.0 |
1 | MA | 205.0 | NaN |
2 | NY | 320.0 | 250.0 |
3 | TX | NaN | 115.0 |