Python Basics 1

2016-11-30

Dictionary, Data Grouping, Loop

Dictionary

europe = {'spain':'madrid', 'france':'paris', 'germany':'berlin', 'norway':'oslo', 'australia':'vienna'}

print(europe.keys())
print(europe['norway'])

dict_keys(['france', 'australia', 'norway', 'spain', 'germany'])
oslo

europe['italy'] = 'rome'    # add or update
europe['poland'] = 'warsaw'
del(europe['australia'])    # Remove

print(europe)               # 순서는 고정 안됨

{'france': 'paris', 'spain': 'madrid', 'poland': 'warsaw', 'italy': 'rome', 'norway': 'oslo', 'germany': 'berlin'}

print('italy' in europe)

True

# Dictionary of dictionaries
europe = {'spain': {'capital':'madrid', 'population':46.77},
          'france': {'capital':'paris', 'population':66.03},
          'germany': {'capital':'berlin', 'population':80.62},
          'norway': {'capital':'oslo', 'population':5.084}}

print(europe['france']['capital'])

paris

# Create sub-dictionary data
data = {'capital':'rome', 'population':59.83}
europe['italy'] = data

print(europe)

{'france': {'population': 66.03, 'capital': 'paris'}, 'norway': {'population': 5.084, 'capital': 'oslo'}, 'italy': {'population': 59.83, 'capital': 'rome'}, 'spain': {'population': 46.77, 'capital': 'madrid'}, 'germany': {'population': 80.62, 'capital': 'berlin'}}

Pandas

import pandas as pd
import numpy as np

names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

my_dict = {
    'country':names,
    'drives_right':dr,
    'cars_per_cap':cpc
}

# Build a DataFrame
cars = pd.DataFrame(my_dict)
cars.index = ['US', 'AUS', 'JAP', 'IN', 'RU', 'MOR', 'EG']
cars

	cars_per_cap	country	drives_right
US	809	United States	True
AUS	731	Australia	False
JAP	588	Japan	False
IN	18	India	False
RU	200	Russia	True
MOR	70	Morocco	True
EG	45	Egypt	True

cars = pd.read_csv('data/cars.csv', index_col = 0)
cars

	country	cars_per_cap	drives_right
US	United States	809	True
AUS	Australia	731	False
JAP	Japan	588	False
IN	India	18	False
RU	Russia	200	True
MOR	Morocco	70	True
EG	Egypt	45	True

print("\n{}".format(cars[1:4]))
print("\n{}".format(cars["country"]))      # output as Pandas Series
print("\n{}".format(cars[["country"]]))    # output as Pandas DataFrame
print("\n{}".format(cars[["country", "drives_right"]]))

       country  cars_per_cap drives_right
AUS  Australia           731        False
JAP      Japan           588        False
IN       India            18        False

US     United States
AUS        Australia
JAP            Japan
IN             India
RU            Russia
MOR          Morocco
EG             Egypt
Name: country, dtype: object

           country
US   United States
AUS      Australia
JAP          Japan
IN           India
RU          Russia
MOR        Morocco
EG           Egypt

           country drives_right
US   United States         True
AUS      Australia        False
JAP          Japan        False
IN           India        False
RU          Russia         True
MOR        Morocco         True
EG           Egypt         True

# loc & iloc
print(cars.loc[['RU', 'AUS']])
print(cars.iloc[[4, 1]])

       country  cars_per_cap drives_right
RU      Russia           200         True
AUS  Australia           731        False
       country  cars_per_cap drives_right
RU      Russia           200         True
AUS  Australia           731        False

print(cars.loc['MOR', 'drives_right'])
print(cars.loc[['RU','MOR'], ['country', 'drives_right']])  # sub-DataFrame

True
     country drives_right
RU    Russia         True
MOR  Morocco         True

print(cars.loc[:, 'drives_right'])      # Series
print(cars.loc[:, ['drives_right']])    # DataFrame
print(cars.loc[:, ['cars_per_cap', 'drives_right']])

US      True
AUS    False
JAP    False
IN     False
RU      True
MOR     True
EG      True
Name: drives_right, dtype: bool
    drives_right
US          True
AUS        False
JAP        False
IN         False
RU          True
MOR         True
EG          True
     cars_per_cap drives_right
US            809         True
AUS           731        False
JAP           588        False
IN             18        False
RU            200         True
MOR            70         True
EG             45         True

# Filtering pandas dataframe
cars[cars['drives_right'] == True]

	country	cars_per_cap	drives_right
US	United States	809	True
RU	Russia	200	True
MOR	Morocco	70	True
EG	Egypt	45	True

many_cars = cars["cars_per_cap"] > 500
cars[many_cars]

	country	cars_per_cap	drives_right
US	United States	809	True
AUS	Australia	731	False
JAP	Japan	588	False

Pandas Grouping

car = pd.read_csv('data/automobile.csv')
print(car.shape)
car.head()

(159, 26)

	symboling	normalized_losses	maker	fuel	aspiration	doors	body	wheels	engine_location	wheel_base	...	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
0	2	164	audi	gas	std	four	sedan	fwd	front	99.8	...	109	mpfi	3.19	3.4	10.0	102	5500	24	30	13950
1	2	164	audi	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.4	8.0	115	5500	18	22	17450
2	1	158	audi	gas	std	four	sedan	fwd	front	105.8	...	136	mpfi	3.19	3.4	8.5	110	5500	19	25	17710
3	1	158	audi	gas	turbo	four	sedan	fwd	front	105.8	...	131	mpfi	3.13	3.4	8.3	140	5500	17	20	23875
4	2	192	bmw	gas	std	two	sedan	rwd	front	101.2	...	108	mpfi	3.50	2.8	8.8	101	5800	23	29	16430

5 rows × 26 columns

car.loc[car.wheels == '4wd']

	symboling	normalized_losses	maker	fuel	aspiration	doors	body	wheels	engine_location	wheel_base	...	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
1	2	164	audi	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.40	8.0	115	5500	18	22	17450
99	2	83	subaru	gas	std	two	hatchback	4wd	front	93.3	...	108	2bbl	3.62	2.64	8.7	73	4400	26	31	7603
103	0	102	subaru	gas	std	four	sedan	4wd	front	97.0	...	108	2bbl	3.62	2.64	9.0	82	4800	24	25	9233
104	0	102	subaru	gas	turbo	four	sedan	4wd	front	97.0	...	108	mpfi	3.62	2.64	7.7	111	4800	24	29	11259
107	0	85	subaru	gas	std	four	wagon	4wd	front	96.9	...	108	2bbl	3.62	2.64	9.0	82	4800	23	29	8013
108	0	85	subaru	gas	turbo	four	wagon	4wd	front	96.9	...	108	mpfi	3.62	2.64	7.7	111	4800	23	23	11694
113	0	81	toyota	gas	std	four	wagon	4wd	front	95.7	...	92	2bbl	3.05	3.03	9.0	62	4800	27	32	7898
114	0	91	toyota	gas	std	four	wagon	4wd	front	95.7	...	92	2bbl	3.05	3.03	9.0	62	4800	27	32	8778

8 rows × 26 columns

# symboling : 차량 안전등급 지수
car.loc[car.wheels == '4wd', 'symboling']

1      2
99     2
103    0
104    0
107    0
108    0
113    0
114    0
Name: symboling, dtype: int64

a1 = car.loc[car.wheels == '4wd', 'symboling'].mean()
a2 = car.loc[car.wheels == 'fwd', 'symboling'].mean()
print(a1);print(a2)

0.5
0.8952380952380953

grouped = car.groupby('wheels')
grouped.get_group('4wd')

	symboling	normalized_losses	maker	fuel	aspiration	doors	body	wheels	engine_location	wheel_base	...	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
1	2	164	audi	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.40	8.0	115	5500	18	22	17450
99	2	83	subaru	gas	std	two	hatchback	4wd	front	93.3	...	108	2bbl	3.62	2.64	8.7	73	4400	26	31	7603
103	0	102	subaru	gas	std	four	sedan	4wd	front	97.0	...	108	2bbl	3.62	2.64	9.0	82	4800	24	25	9233
104	0	102	subaru	gas	turbo	four	sedan	4wd	front	97.0	...	108	mpfi	3.62	2.64	7.7	111	4800	24	29	11259
107	0	85	subaru	gas	std	four	wagon	4wd	front	96.9	...	108	2bbl	3.62	2.64	9.0	82	4800	23	29	8013
108	0	85	subaru	gas	turbo	four	wagon	4wd	front	96.9	...	108	mpfi	3.62	2.64	7.7	111	4800	23	23	11694
113	0	81	toyota	gas	std	four	wagon	4wd	front	95.7	...	92	2bbl	3.05	3.03	9.0	62	4800	27	32	7898
114	0	91	toyota	gas	std	four	wagon	4wd	front	95.7	...	92	2bbl	3.05	3.03	9.0	62	4800	27	32	8778

8 rows × 26 columns

grouped['symboling'].mean()

wheels
4wd    0.500000
fwd    0.895238
rwd    0.413043
Name: symboling, dtype: float64

print(grouped['symboling'].agg([np.mean, np.sum]))
print(grouped['symboling'].agg({'평균': np.mean, '합계': np.sum}))

            mean  sum
wheels               
4wd     0.500000    4
fwd     0.895238   94
rwd     0.413043   19
              평균  합계
wheels              
4wd     0.500000   4
fwd     0.895238  94
rwd     0.413043  19

# Ordered Dictionary
from collections import OrderedDict

d = OrderedDict([('평균', np.mean), ('합계', np.sum)])
d

OrderedDict([('평균', <function numpy.core.fromnumeric.mean>),
             ('합계', <function numpy.core.fromnumeric.sum>)])

d['평균']

<function numpy.core.fromnumeric.mean>

grouped['symboling'].agg(OrderedDict([('평균', np.mean), ('합계', np.sum)]))

	평균	합계
wheels
4wd	0.500000	4
fwd	0.895238	94
rwd	0.413043	19

Loop

distance = [11.25, 18.0, 20.0, 10.75, 9.50]
for d in distance :
    print(d)

enumerate

for index, a in enumerate(distance) :
    print("room " + str(index + 1) + " : " + str(a))

room 1 : 11.25
room 2 : 18.0
room 3 : 20.0
room 4 : 10.75
room 5 : 9.5

Loop over list

distance = [["London", 11.25],
            ["Rome", 18.0],
            ["Oslo", 20.0],
            ["Paris", 10.75],
            ["Madrid", 9.50]]

for city in distance:
    print("the " + city[0] + " : " + str(city[1]) + " km")

the London : 11.25 km
the Rome : 18.0 km
the Oslo : 20.0 km
the Paris : 10.75 km
the Madrid : 9.5 km

Loop over dictionary

europe = {'spain':'madrid', 'france':'paris', 'germany':'bonn', 'norway':'oslo', 'italy':'rome', 
          'poland':'warsaw', 'australia':'vienna'}

for key, value in europe.items():
    print("the capital of " + key.upper() + " is " + value)

the capital of FRANCE is paris
the capital of AUSTRALIA is vienna
the capital of SPAIN is madrid
the capital of POLAND is warsaw
the capital of ITALY is rome
the capital of NORWAY is oslo
the capital of GERMANY is bonn

Loop over Numpy array

height = [74, 79, 72, 77, 73, 69, 67, 71, 76]
np_height = np.array(height)

for x in np_height :                # 1D array
    print(str(x) + " inches")

74 inches
79 inches
72 inches
77 inches
73 inches
69 inches
67 inches
71 inches
76 inches

people = [[74, 180], [74, 215], [72, 210], [72, 210], [73, 188], [69, 176]]
np_people = np.array(people)

for x in np_people :   # 2D array
    print(x)

[ 74 180]
[ 74 215]
[ 72 210]
[ 72 210]
[ 73 188]
[ 69 176]

for x in np.nditer(np_people) :   # 2D array
    print(x)

Loop over DataFrame

cars = pd.read_csv('data/cars.csv', index_col = 0)

for lab, row in cars.iterrows() :
    print(lab + " --- " + row['country'] + " : " + str(row['cars_per_cap']))

US --- United States : 809
AUS --- Australia : 731
JAP --- Japan : 588
IN --- India : 18
RU --- Russia : 200
MOR --- Morocco : 70
EG --- Egypt : 45

# adds COUNTRY column
for lab, row in cars.iterrows() :
    cars.loc[lab, 'COUNTRY'] = row['country'].upper()

cars

	country	cars_per_cap	drives_right	COUNTRY
US	United States	809	True	UNITED STATES
AUS	Australia	731	False	AUSTRALIA
JAP	Japan	588	False	JAPAN
IN	India	18	False	INDIA
RU	Russia	200	True	RUSSIA
MOR	Morocco	70	True	MOROCCO
EG	Egypt	45	True	EGYPT

Case Study: Hacker Statistics. Normal Distribution

np.random.seed(9999)
print(np.random.rand())             # random float
print(np.random.randint(1, 7))      # random int (1~6 범위)

0.8233890742543671
2

# Random Walk
all_walks = []
for i in range(1000) :
    random_walk = [0]

    for x in range(100) :
        # step에 마지막 숫자 설정
        step = random_walk[-1]

        # 주사위 던지기
        dice = np.random.randint(1,7)

        # 다음 step 결정.
        # 주사위 2 이하이면 -1. 3에서 5 사이이면 +1.
        if dice <= 2:
            step = max(0, step - 1)     # 음수값 되면 0 리턴
        elif dice <= 5:
            step += 1
        else:
            step += np.random.randint(1, 7)

        # append next_step to random_walk
        random_walk.append(step)
        
    # random_walk 결과를 전체 결과 array에 추가
    all_walks.append(random_walk)

np_all_walks = np.array(all_walks)
np_all_walks

array([[ 0,  6,  7, ..., 62, 61, 63],
       [ 0,  1,  2, ..., 67, 66, 65],
       [ 0,  1,  2, ..., 70, 71, 72],
       ..., 
       [ 0,  0,  1, ..., 44, 43, 44],
       [ 0,  1,  2, ..., 51, 52, 53],
       [ 0,  0,  0, ..., 79, 84, 85]])

np_aw_t = np.transpose(np_all_walks)
np_aw_t

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 6,  1,  1, ...,  0,  1,  0],
       [ 7,  2,  2, ...,  1,  2,  0],
       ..., 
       [62, 67, 70, ..., 44, 51, 79],
       [61, 66, 71, ..., 43, 52, 84],
       [63, 65, 72, ..., 44, 53, 85]])

%matplotlib inline
import matplotlib.pyplot as plt

# setting plot default size
%pylab inline
pylab.rcParams['figure.figsize'] = (12, 6)

Populating the interactive namespace from numpy and matplotlib

plt.plot(np_aw_t)
plt.show()

png

ends = np_aw_t[-1]
ends[0:50]

array([ 63,  65,  72,  51,  47,  47,  62,  42,  91,  71,  75,  82,  65,
        66,  80,  68,  64, 103,  80, 104,  81,  91,  75,  87,  75,  98,
        97, 118,  83,  81,  71,  41, 108,  66,  41,  84,  54,  76,  71,
        55,  65, 100,  69,  62,  81,  71,  57,  70, 112,  68])

plt.hist(ends)
plt.show()

png

Python Basics 1

Recent articles