Pandas Basics 2 DataFrame
DataFrame에서 데이터 선택, 데이터 추가 및 변경, missing data 처리, Merging, Grouping
데이터 선택하기
import pandas as pd
import numpy as np
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt', 'Korea', 'China', 'England']
dr = [True, False, False, False, True, True, True, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45, 122, 397, 255]
my_dict = {
'country':names,
'drives_right':dr,
'cars_per_cap':cpc
}
cars = pd.DataFrame(my_dict)
cars
| cars_per_cap | country | drives_right | |
|---|---|---|---|
| 0 | 809 | United States | True |
| 1 | 731 | Australia | False |
| 2 | 588 | Japan | False |
| 3 | 18 | India | False |
| 4 | 200 | Russia | True |
| 5 | 70 | Morocco | True |
| 6 | 45 | Egypt | True |
| 7 | 122 | Korea | True |
| 8 | 397 | China | True |
| 9 | 255 | England | True |
# 1개 컬럼 선택, series 로 추출됨
cars['country']
0 United States
1 Australia
2 Japan
3 India
4 Russia
5 Morocco
6 Egypt
7 Korea
8 China
9 England
Name: country, dtype: object
# 1개 컬럼 선택, DataFrame 으로 추출됨
cars[['country']]
| country | |
|---|---|
| 0 | United States |
| 1 | Australia |
| 2 | Japan |
| 3 | India |
| 4 | Russia |
| 5 | Morocco |
| 6 | Egypt |
| 7 | Korea |
| 8 | China |
| 9 | England |
# row index 로 잘라내기
cars[1:5]
| cars_per_cap | country | drives_right | |
|---|---|---|---|
| 1 | 731 | Australia | False |
| 2 | 588 | Japan | False |
| 3 | 18 | India | False |
| 4 | 200 | Russia | True |
컬럼명으로 선택하기 (loc)
cars.loc[:, ['country', 'drives_right']]
| country | drives_right | |
|---|---|---|
| 0 | United States | True |
| 1 | Australia | False |
| 2 | Japan | False |
| 3 | India | False |
| 4 | Russia | True |
| 5 | Morocco | True |
| 6 | Egypt | True |
| 7 | Korea | True |
| 8 | China | True |
| 9 | England | True |
cars.loc[2:5, ['country', 'drives_right']]
| country | drives_right | |
|---|---|---|
| 2 | Japan | False |
| 3 | India | False |
| 4 | Russia | True |
| 5 | Morocco | True |
위치 인덱스로 선택하기 (iloc)
cars.iloc[3]
cars_per_cap 18
country India
drives_right False
Name: 3, dtype: object
cars.iloc[[1,2,4], [1,2]]
| country | drives_right | |
|---|---|---|
| 1 | Australia | False |
| 2 | Japan | False |
| 4 | Russia | True |
cars.iloc[2:5, :]
| cars_per_cap | country | drives_right | |
|---|---|---|---|
| 2 | 588 | Japan | False |
| 3 | 18 | India | False |
| 4 | 200 | Russia | True |
Boolean Indexing 조건절을 이용한 데이터 선택
cars[cars.drives_right == True]
| cars_per_cap | country | drives_right | |
|---|---|---|---|
| 0 | 809 | United States | True |
| 4 | 200 | Russia | True |
| 5 | 70 | Morocco | True |
| 6 | 45 | Egypt | True |
| 7 | 122 | Korea | True |
| 8 | 397 | China | True |
| 9 | 255 | England | True |
cars[cars.cars_per_cap > 500]
| cars_per_cap | country | drives_right | |
|---|---|---|---|
| 0 | 809 | United States | True |
| 1 | 731 | Australia | False |
| 2 | 588 | Japan | False |
cars[cars['country'].isin(['Korea','England'])] # 문자열 컬럼일 경우 문자열 비교 가능
| cars_per_cap | country | drives_right | |
|---|---|---|---|
| 7 | 122 | Korea | True |
| 9 | 255 | England | True |
데이터프레임에 값 추가 / 변경
svalue = pd.Series([1,2,3,np.nan,6,8,9,7,4,5])
cars['level'] = svalue # dataframe 에 Series 값으로 컬럼 추가
cars
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 0 | 809 | United States | True | 1.0 |
| 1 | 731 | Australia | False | 2.0 |
| 2 | 588 | Japan | False | 3.0 |
| 3 | 18 | India | False | NaN |
| 4 | 200 | Russia | True | 6.0 |
| 5 | 70 | Morocco | True | 8.0 |
| 6 | 45 | Egypt | True | 9.0 |
| 7 | 122 | Korea | True | 7.0 |
| 8 | 397 | China | True | 4.0 |
| 9 | 255 | England | True | 5.0 |
cars.at[0, 'country'] # at : 인덱스 번호와 컬럼명으로 데이터 선택
'United States'
cars.at[2, 'level'] = 30 # 인덱스 번호와 컬럼명에 해당하는 데이터 값 변경
cars.iloc[[2]]
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 2 | 588 | Japan | False | 30.0 |
cars.iat[5,0] = 77 # iat : 인덱스 번호와 컬럼 순서번호
cars.iloc[[5]]
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 5 | 77 | Morocco | True | 8.0 |
Mising Data 처리
cars[pd.isnull(cars.level)]
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 3 | 18 | India | False | NaN |
cars.dropna(how='any') # 한 컬럼이라도 null 값 존재하는 row 삭제.
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 0 | 809 | United States | True | 1.0 |
| 1 | 731 | Australia | False | 2.0 |
| 2 | 588 | Japan | False | 30.0 |
| 4 | 200 | Russia | True | 6.0 |
| 5 | 77 | Morocco | True | 8.0 |
| 6 | 45 | Egypt | True | 9.0 |
| 7 | 122 | Korea | True | 7.0 |
| 8 | 397 | China | True | 4.0 |
| 9 | 255 | England | True | 5.0 |
cars = cars.fillna(value=cars.level.mean()) # null 값을 다른 값으로 변경
cars
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 0 | 809 | United States | True | 1.0 |
| 1 | 731 | Australia | False | 2.0 |
| 2 | 588 | Japan | False | 30.0 |
| 3 | 18 | India | False | 8.0 |
| 4 | 200 | Russia | True | 6.0 |
| 5 | 77 | Morocco | True | 8.0 |
| 6 | 45 | Egypt | True | 9.0 |
| 7 | 122 | Korea | True | 7.0 |
| 8 | 397 | China | True | 4.0 |
| 9 | 255 | England | True | 5.0 |
데이터 합치기 (Merge)
Concat
pieces = [cars[7:], cars[3:7], cars[:3]]
pd.concat(pieces) ## 데이터를 쪼갠 순서대로 합친다.
| cars_per_cap | country | drives_right | level | |
|---|---|---|---|---|
| 7 | 122 | Korea | True | 7.0 |
| 8 | 397 | China | True | 4.0 |
| 9 | 255 | England | True | 5.0 |
| 3 | 18 | India | False | 8.0 |
| 4 | 200 | Russia | True | 6.0 |
| 5 | 77 | Morocco | True | 8.0 |
| 6 | 45 | Egypt | True | 9.0 |
| 0 | 809 | United States | True | 1.0 |
| 1 | 731 | Australia | False | 2.0 |
| 2 | 588 | Japan | False | 30.0 |
Merge (SQL style)
left = pd.DataFrame({'key': ['Kate', 'Susan'], 'leftval': [1, 2]})
right = pd.DataFrame({'key': ['Kate', 'Susan'], 'rightval': [4, 5]})
pd.merge(left, right, on='key')
| key | leftval | rightval | |
|---|---|---|---|
| 0 | Kate | 1 | 4 |
| 1 | Susan | 2 | 5 |
left = pd.DataFrame({'key': ['Kate', 'Kate'], 'leftval': [1, 2]})
right = pd.DataFrame({'key': ['Kate', 'Kate'], 'rightval': [4, 5]})
df = pd.merge(left, right, on='key') # key 값이 동일한 경우 row 가 늘어난다.
df
| key | leftval | rightval | |
|---|---|---|---|
| 0 | Kate | 1 | 4 |
| 1 | Kate | 1 | 5 |
| 2 | Kate | 2 | 4 |
| 3 | Kate | 2 | 5 |
Append
s = pd.DataFrame({'key':['Jessica'], 'leftval':[9], 'rightval':[7]})
df.append(s, ignore_index=True)
| key | leftval | rightval | |
|---|---|---|---|
| 0 | Kate | 1 | 4 |
| 1 | Kate | 1 | 5 |
| 2 | Kate | 2 | 4 |
| 3 | Kate | 2 | 5 |
| 4 | Jessica | 9 | 7 |
Grouping
df = pd.DataFrame({'col1' : ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'A'],
'col2' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
'col3' : np.random.randn(8).round(2)*100,
'col4' : np.random.randn(8).round(3)*10})
df
| col1 | col2 | col3 | col4 | |
|---|---|---|---|---|
| 0 | A | one | -42.0 | 5.12 |
| 1 | B | one | -35.0 | -0.60 |
| 2 | A | two | -60.0 | 1.71 |
| 3 | B | three | -30.0 | -0.50 |
| 4 | A | two | -85.0 | 3.57 |
| 5 | B | two | -97.0 | -2.08 |
| 6 | A | one | -56.0 | 3.88 |
| 7 | A | three | 43.0 | -10.77 |
df.groupby('col1').sum()
| col3 | col4 | |
|---|---|---|
| col1 | ||
| A | -200.0 | 3.51 |
| B | -162.0 | -3.18 |
df.groupby(['col1','col2']).sum()
| col3 | col4 | ||
|---|---|---|---|
| col1 | col2 | ||
| A | one | -98.0 | 9.00 |
| three | 43.0 | -10.77 | |
| two | -145.0 | 5.28 | |
| B | one | -35.0 | -0.60 |
| three | -30.0 | -0.50 | |
| two | -97.0 | -2.08 |
# pivot table
pd.pivot_table(df, values='col3', index=['col1'], columns=['col2'])
| col2 | one | three | two |
|---|---|---|---|
| col1 | |||
| A | -49.0 | 43.0 | -72.5 |
| B | -35.0 | -30.0 | -97.0 |