pandas Series와 DataFrame 생성, 조회, 정렬, 함수 적용


pandas 데이터 구조

  • Series : 1D 동질의 데이터 타입을 갖는 배열(array)
  • DataFrame : 2D 테이블 구조. 각 컬럼은 서로 다른 데이터타입을 가질 수 있음.
  • Panel : 3D 테이블 구조.
import pandas as pd
import numpy as np

객체 생성

1. Series

s = pd.Series([1,3,5,np.nan,6,8])
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
s.index
RangeIndex(start=0, stop=6, step=1)
s2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])   # 인덱스 지정
s2
d    4
b    7
a   -5
c    3
dtype: int64
s2['a']   # 인덱스로 값 선택
-5
# Dictionary to Series
europe = {'spain': 46.77, 'france': 66.03, 'germany': 80.62, 'norway': 5.084}
s3 = pd.Series(europe)
s3
france     66.030
germany    80.620
norway      5.084
spain      46.770
dtype: float64

2. DataFrame

# row, column 데이터 지정하여 생성
dates = pd.date_range('20161001', periods=7)
dates
DatetimeIndex(['2016-10-01', '2016-10-02', '2016-10-03', '2016-10-04',
               '2016-10-05', '2016-10-06', '2016-10-07'],
              dtype='datetime64[ns]', freq='D')
# 랜덤으로 소수점 2자리 수 생성하여 각 컬럼의 값으로 사용
df = pd.DataFrame(np.random.rand(7,4).round(2), index=dates, columns=list('ABCD'))
df
A B C D
2016-10-01 0.87 0.06 0.03 0.56
2016-10-02 0.71 0.37 0.26 0.06
2016-10-03 0.98 0.53 0.88 0.30
2016-10-04 0.10 0.16 0.02 0.90
2016-10-05 0.20 0.64 0.36 0.54
2016-10-06 0.49 0.50 0.66 0.21
2016-10-07 0.54 0.05 0.33 0.63
# dictionary 를 dataframe으로 변환
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt', 'Korea', 'China', 'England']
dr =  [True, False, False, False, True, True, True, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45, 122, 397, 255]

my_dict = {
    'country':names,
    'drives_right':dr,
    'cars_per_cap':cpc
}
cars = pd.DataFrame(my_dict)
cars
cars_per_cap country drives_right
0 809 United States True
1 731 Australia False
2 588 Japan False
3 18 India False
4 200 Russia True
5 70 Morocco True
6 45 Egypt True
7 122 Korea True
8 397 China True
9 255 England True
# dataframe 구조 보기
cars.dtypes
cars_per_cap     int64
country         object
drives_right      bool
dtype: object
cars.shape
(10, 3)

데이터 조회

cars.head()
cars_per_cap country drives_right
0 809 United States True
1 731 Australia False
2 588 Japan False
3 18 India False
4 200 Russia True
cars.tail(3)
cars_per_cap country drives_right
7 122 Korea True
8 397 China True
9 255 England True
cars.index   # 각 행의 인덱스
RangeIndex(start=0, stop=10, step=1)
cars.columns   # 컬럼명
Index(['cars_per_cap', 'country', 'drives_right'], dtype='object')
cars.values   # 전체 데이터 조회
array([[809, 'United States', True],
       [731, 'Australia', False],
       [588, 'Japan', False],
       [18, 'India', False],
       [200, 'Russia', True],
       [70, 'Morocco', True],
       [45, 'Egypt', True],
       [122, 'Korea', True],
       [397, 'China', True],
       [255, 'England', True]], dtype=object)
cars.T   # transposing data
0 1 2 3 4 5 6 7 8 9
cars_per_cap 809 731 588 18 200 70 45 122 397 255
country United States Australia Japan India Russia Morocco Egypt Korea China England
drives_right True False False False True True True True True True
cars.describe()   # 요약된 통계 정보
cars_per_cap
count 10.000000
mean 323.500000
std 293.035929
min 18.000000
25% 83.000000
50% 227.500000
75% 540.250000
max 809.000000
df.describe()
A B C D
count 7.000000 7.000000 7.000000 7.000000
mean 0.555714 0.330000 0.362857 0.457143
std 0.326948 0.240416 0.315104 0.284881
min 0.100000 0.050000 0.020000 0.060000
25% 0.345000 0.110000 0.145000 0.255000
50% 0.540000 0.370000 0.330000 0.540000
75% 0.790000 0.515000 0.510000 0.595000
max 0.980000 0.640000 0.880000 0.900000
df.mean(axis='columns')   # 전체 컬럼의  평균
2016-10-01    0.3800
2016-10-02    0.3500
2016-10-03    0.6725
2016-10-04    0.2950
2016-10-05    0.4350
2016-10-06    0.4650
2016-10-07    0.3875
Freq: D, dtype: float64
df.mean(axis='rows')  # 각 컬럼별 행의 평균
A    0.555714
B    0.330000
C    0.362857
D    0.457143
dtype: float64

Reindex

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
obj.reindex(['a', 'b', 'c', 'd', 'e'])
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

함수 적용

dates = pd.date_range('20161011', periods=7)
df = pd.DataFrame(np.random.rand(7,4).round(4), index=[dates], columns=list('DABC'))
df
D A B C
2016-10-11 0.9553 0.3991 0.1396 0.6029
2016-10-12 0.6353 0.3549 0.2553 0.5542
2016-10-13 0.2645 0.9218 0.8819 0.8357
2016-10-14 0.7968 0.2667 0.4447 0.6609
2016-10-15 0.7353 0.1914 0.5258 0.5743
2016-10-16 0.7666 0.3549 0.5251 0.0739
2016-10-17 0.3224 0.2378 0.6657 0.3437
f = lambda x: x.max() - x.min()
df.apply(f)
D    0.6908
A    0.7304
B    0.7423
C    0.7618
dtype: float64
df.apply(f, axis=1)
2016-10-11    0.8157
2016-10-12    0.3800
2016-10-13    0.6573
2016-10-14    0.5301
2016-10-15    0.5439
2016-10-16    0.6927
2016-10-17    0.4279
Freq: D, dtype: float64
def f2(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
df.apply(f2)
D A B C
min 0.2645 0.1914 0.1396 0.0739
max 0.9553 0.9218 0.8819 0.8357
# dataframe의 실수값을 문자열로 일괄 변환
f_form = lambda x: '%.2f' % x
df.applymap(f_form)
D A B C
2016-10-11 0.96 0.40 0.14 0.60
2016-10-12 0.64 0.35 0.26 0.55
2016-10-13 0.26 0.92 0.88 0.84
2016-10-14 0.80 0.27 0.44 0.66
2016-10-15 0.74 0.19 0.53 0.57
2016-10-16 0.77 0.35 0.53 0.07
2016-10-17 0.32 0.24 0.67 0.34

정렬

# sort_index
df.sort_index(axis=1)
A B C D
2016-10-11 0.3991 0.1396 0.6029 0.9553
2016-10-12 0.3549 0.2553 0.5542 0.6353
2016-10-13 0.9218 0.8819 0.8357 0.2645
2016-10-14 0.2667 0.4447 0.6609 0.7968
2016-10-15 0.1914 0.5258 0.5743 0.7353
2016-10-16 0.3549 0.5251 0.0739 0.7666
2016-10-17 0.2378 0.6657 0.3437 0.3224
df.sort_index(axis=1, ascending=False)   # 컬럼 순서 뒤집기
D C B A
2016-10-11 0.9553 0.6029 0.1396 0.3991
2016-10-12 0.6353 0.5542 0.2553 0.3549
2016-10-13 0.2645 0.8357 0.8819 0.9218
2016-10-14 0.7968 0.6609 0.4447 0.2667
2016-10-15 0.7353 0.5743 0.5258 0.1914
2016-10-16 0.7666 0.0739 0.5251 0.3549
2016-10-17 0.3224 0.3437 0.6657 0.2378
# sort_values : 특정 컬럼의 값을 기준으로 정렬
df.sort_values(by='C')
D A B C
2016-10-16 0.7666 0.3549 0.5251 0.0739
2016-10-17 0.3224 0.2378 0.6657 0.3437
2016-10-12 0.6353 0.3549 0.2553 0.5542
2016-10-15 0.7353 0.1914 0.5258 0.5743
2016-10-11 0.9553 0.3991 0.1396 0.6029
2016-10-14 0.7968 0.2667 0.4447 0.6609
2016-10-13 0.2645 0.9218 0.8819 0.8357
cars.sort_values(by='country')
cars_per_cap country drives_right
1 731 Australia False
8 397 China True
6 45 Egypt True
9 255 England True
3 18 India False
2 588 Japan False
7 122 Korea True
5 70 Morocco True
4 200 Russia True
0 809 United States True
# rank
df.rank(method='first')   # 값이 같은 경우 위치에 따라 순위 부여. (etc : average, min, max)
D A B C
2016-10-11 7.0 6.0 1.0 5.0
2016-10-12 3.0 4.0 2.0 3.0
2016-10-13 1.0 7.0 7.0 7.0
2016-10-14 6.0 3.0 3.0 6.0
2016-10-15 4.0 1.0 5.0 4.0
2016-10-16 5.0 5.0 4.0 1.0
2016-10-17 2.0 2.0 6.0 2.0
df.rank(axis=1)
D A B C
2016-10-11 4.0 2.0 1.0 3.0
2016-10-12 4.0 2.0 1.0 3.0
2016-10-13 1.0 4.0 3.0 2.0
2016-10-14 4.0 1.0 2.0 3.0
2016-10-15 4.0 1.0 2.0 3.0
2016-10-16 4.0 2.0 3.0 1.0
2016-10-17 2.0 1.0 4.0 3.0