Pandas Basics 2 DataFrame
DataFrame에서 데이터 선택, 데이터 추가 및 변경, missing data 처리, Merging, Grouping
데이터 선택하기
import pandas as pd
import numpy as np
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt', 'Korea', 'China', 'England']
dr = [True, False, False, False, True, True, True, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45, 122, 397, 255]
my_dict = {
'country':names,
'drives_right':dr,
'cars_per_cap':cpc
}
cars = pd.DataFrame(my_dict)
cars
cars_per_cap | country | drives_right | |
---|---|---|---|
0 | 809 | United States | True |
1 | 731 | Australia | False |
2 | 588 | Japan | False |
3 | 18 | India | False |
4 | 200 | Russia | True |
5 | 70 | Morocco | True |
6 | 45 | Egypt | True |
7 | 122 | Korea | True |
8 | 397 | China | True |
9 | 255 | England | True |
# 1개 컬럼 선택, series 로 추출됨
cars['country']
0 United States
1 Australia
2 Japan
3 India
4 Russia
5 Morocco
6 Egypt
7 Korea
8 China
9 England
Name: country, dtype: object
# 1개 컬럼 선택, DataFrame 으로 추출됨
cars[['country']]
country | |
---|---|
0 | United States |
1 | Australia |
2 | Japan |
3 | India |
4 | Russia |
5 | Morocco |
6 | Egypt |
7 | Korea |
8 | China |
9 | England |
# row index 로 잘라내기
cars[1:5]
cars_per_cap | country | drives_right | |
---|---|---|---|
1 | 731 | Australia | False |
2 | 588 | Japan | False |
3 | 18 | India | False |
4 | 200 | Russia | True |
컬럼명으로 선택하기 (loc)
cars.loc[:, ['country', 'drives_right']]
country | drives_right | |
---|---|---|
0 | United States | True |
1 | Australia | False |
2 | Japan | False |
3 | India | False |
4 | Russia | True |
5 | Morocco | True |
6 | Egypt | True |
7 | Korea | True |
8 | China | True |
9 | England | True |
cars.loc[2:5, ['country', 'drives_right']]
country | drives_right | |
---|---|---|
2 | Japan | False |
3 | India | False |
4 | Russia | True |
5 | Morocco | True |
위치 인덱스로 선택하기 (iloc)
cars.iloc[3]
cars_per_cap 18
country India
drives_right False
Name: 3, dtype: object
cars.iloc[[1,2,4], [1,2]]
country | drives_right | |
---|---|---|
1 | Australia | False |
2 | Japan | False |
4 | Russia | True |
cars.iloc[2:5, :]
cars_per_cap | country | drives_right | |
---|---|---|---|
2 | 588 | Japan | False |
3 | 18 | India | False |
4 | 200 | Russia | True |
Boolean Indexing 조건절을 이용한 데이터 선택
cars[cars.drives_right == True]
cars_per_cap | country | drives_right | |
---|---|---|---|
0 | 809 | United States | True |
4 | 200 | Russia | True |
5 | 70 | Morocco | True |
6 | 45 | Egypt | True |
7 | 122 | Korea | True |
8 | 397 | China | True |
9 | 255 | England | True |
cars[cars.cars_per_cap > 500]
cars_per_cap | country | drives_right | |
---|---|---|---|
0 | 809 | United States | True |
1 | 731 | Australia | False |
2 | 588 | Japan | False |
cars[cars['country'].isin(['Korea','England'])] # 문자열 컬럼일 경우 문자열 비교 가능
cars_per_cap | country | drives_right | |
---|---|---|---|
7 | 122 | Korea | True |
9 | 255 | England | True |
데이터프레임에 값 추가 / 변경
svalue = pd.Series([1,2,3,np.nan,6,8,9,7,4,5])
cars['level'] = svalue # dataframe 에 Series 값으로 컬럼 추가
cars
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
0 | 809 | United States | True | 1.0 |
1 | 731 | Australia | False | 2.0 |
2 | 588 | Japan | False | 3.0 |
3 | 18 | India | False | NaN |
4 | 200 | Russia | True | 6.0 |
5 | 70 | Morocco | True | 8.0 |
6 | 45 | Egypt | True | 9.0 |
7 | 122 | Korea | True | 7.0 |
8 | 397 | China | True | 4.0 |
9 | 255 | England | True | 5.0 |
cars.at[0, 'country'] # at : 인덱스 번호와 컬럼명으로 데이터 선택
'United States'
cars.at[2, 'level'] = 30 # 인덱스 번호와 컬럼명에 해당하는 데이터 값 변경
cars.iloc[[2]]
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
2 | 588 | Japan | False | 30.0 |
cars.iat[5,0] = 77 # iat : 인덱스 번호와 컬럼 순서번호
cars.iloc[[5]]
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
5 | 77 | Morocco | True | 8.0 |
Mising Data 처리
cars[pd.isnull(cars.level)]
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
3 | 18 | India | False | NaN |
cars.dropna(how='any') # 한 컬럼이라도 null 값 존재하는 row 삭제.
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
0 | 809 | United States | True | 1.0 |
1 | 731 | Australia | False | 2.0 |
2 | 588 | Japan | False | 30.0 |
4 | 200 | Russia | True | 6.0 |
5 | 77 | Morocco | True | 8.0 |
6 | 45 | Egypt | True | 9.0 |
7 | 122 | Korea | True | 7.0 |
8 | 397 | China | True | 4.0 |
9 | 255 | England | True | 5.0 |
cars = cars.fillna(value=cars.level.mean()) # null 값을 다른 값으로 변경
cars
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
0 | 809 | United States | True | 1.0 |
1 | 731 | Australia | False | 2.0 |
2 | 588 | Japan | False | 30.0 |
3 | 18 | India | False | 8.0 |
4 | 200 | Russia | True | 6.0 |
5 | 77 | Morocco | True | 8.0 |
6 | 45 | Egypt | True | 9.0 |
7 | 122 | Korea | True | 7.0 |
8 | 397 | China | True | 4.0 |
9 | 255 | England | True | 5.0 |
데이터 합치기 (Merge)
Concat
pieces = [cars[7:], cars[3:7], cars[:3]]
pd.concat(pieces) ## 데이터를 쪼갠 순서대로 합친다.
cars_per_cap | country | drives_right | level | |
---|---|---|---|---|
7 | 122 | Korea | True | 7.0 |
8 | 397 | China | True | 4.0 |
9 | 255 | England | True | 5.0 |
3 | 18 | India | False | 8.0 |
4 | 200 | Russia | True | 6.0 |
5 | 77 | Morocco | True | 8.0 |
6 | 45 | Egypt | True | 9.0 |
0 | 809 | United States | True | 1.0 |
1 | 731 | Australia | False | 2.0 |
2 | 588 | Japan | False | 30.0 |
Merge (SQL style)
left = pd.DataFrame({'key': ['Kate', 'Susan'], 'leftval': [1, 2]})
right = pd.DataFrame({'key': ['Kate', 'Susan'], 'rightval': [4, 5]})
pd.merge(left, right, on='key')
key | leftval | rightval | |
---|---|---|---|
0 | Kate | 1 | 4 |
1 | Susan | 2 | 5 |
left = pd.DataFrame({'key': ['Kate', 'Kate'], 'leftval': [1, 2]})
right = pd.DataFrame({'key': ['Kate', 'Kate'], 'rightval': [4, 5]})
df = pd.merge(left, right, on='key') # key 값이 동일한 경우 row 가 늘어난다.
df
key | leftval | rightval | |
---|---|---|---|
0 | Kate | 1 | 4 |
1 | Kate | 1 | 5 |
2 | Kate | 2 | 4 |
3 | Kate | 2 | 5 |
Append
s = pd.DataFrame({'key':['Jessica'], 'leftval':[9], 'rightval':[7]})
df.append(s, ignore_index=True)
key | leftval | rightval | |
---|---|---|---|
0 | Kate | 1 | 4 |
1 | Kate | 1 | 5 |
2 | Kate | 2 | 4 |
3 | Kate | 2 | 5 |
4 | Jessica | 9 | 7 |
Grouping
df = pd.DataFrame({'col1' : ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'A'],
'col2' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
'col3' : np.random.randn(8).round(2)*100,
'col4' : np.random.randn(8).round(3)*10})
df
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | A | one | -42.0 | 5.12 |
1 | B | one | -35.0 | -0.60 |
2 | A | two | -60.0 | 1.71 |
3 | B | three | -30.0 | -0.50 |
4 | A | two | -85.0 | 3.57 |
5 | B | two | -97.0 | -2.08 |
6 | A | one | -56.0 | 3.88 |
7 | A | three | 43.0 | -10.77 |
df.groupby('col1').sum()
col3 | col4 | |
---|---|---|
col1 | ||
A | -200.0 | 3.51 |
B | -162.0 | -3.18 |
df.groupby(['col1','col2']).sum()
col3 | col4 | ||
---|---|---|---|
col1 | col2 | ||
A | one | -98.0 | 9.00 |
three | 43.0 | -10.77 | |
two | -145.0 | 5.28 | |
B | one | -35.0 | -0.60 |
three | -30.0 | -0.50 | |
two | -97.0 | -2.08 |
# pivot table
pd.pivot_table(df, values='col3', index=['col1'], columns=['col2'])
col2 | one | three | two |
---|---|---|---|
col1 | |||
A | -49.0 | 43.0 | -72.5 |
B | -35.0 | -30.0 | -97.0 |