Python Basics 3
Iterator, Generator, List comprehensions, generators for streaming data & large files
Iterating
import pandas as pd
flist = ['targaryen', 'stark', 'baratheon', 'tully', 'lannister', 'tyrell', 'martell']
alist = ['dragon','wolf','deer','fish','lion','rose','sun']
nlist = ['대너리스','에다드','로버트','브린덴','제이미','로라스','도란'];
for item in flist:
print(item)
targaryen
stark
baratheon
tully
lannister
tyrell
martell
# make iterator from list
superspeed = iter(flist)
print(next(superspeed))
print(next(superspeed))
print(next(superspeed))
targaryen
stark
baratheon
small_value = iter(range(3))
print(next(small_value))
print(next(small_value))
print(next(small_value))
0
1
2
googol = iter(range(10 ** 1000)) # big number - may not work
print(next(googol))
print(next(googol))
0
1
enumerate
enumlist = list(enumerate(flist)) # list of tuple
print(enumlist)
[(0, 'targaryen'), (1, 'stark'), (2, 'baratheon'), (3, 'tully'), (4, 'lannister'), (5, 'tyrell'), (6, 'martell')]
for index1, value1 in enumlist: # unpack tuple
print(index1, value1)
0 targaryen
1 stark
2 baratheon
3 tully
4 lannister
5 tyrell
6 martell
for index2, value2 in enumerate(flist, start = 1): # Change the start index
print(index2, value2)
1 targaryen
2 stark
3 baratheon
4 tully
5 lannister
6 tyrell
7 martell
zip
m_data = list(zip(nlist, flist, alist))
print(m_data)
[('대너리스', 'targaryen', 'dragon'), ('에다드', 'stark', 'wolf'), ('로버트', 'baratheon', 'deer'), ('브린덴', 'tully', 'fish'), ('제이미', 'lannister', 'lion'), ('로라스', 'tyrell', 'rose'), ('도란', 'martell', 'sun')]
m_zip = zip(nlist, flist, alist) # create zip object
print(m_zip)
for value1, value2, value3 in m_zip: # unzip
print(value1, value2, value3)
<zip object at 0x0000017E86695F48>
대너리스 targaryen dragon
에다드 stark wolf
로버트 baratheon deer
브린덴 tully fish
제이미 lannister lion
로라스 tyrell rose
도란 martell sun
# unzip (*)
m_zip = zip(nlist, flist, alist)
print(*m_zip)
('대너리스', 'targaryen', 'dragon') ('에다드', 'stark', 'wolf') ('로버트', 'baratheon', 'deer') ('브린덴', 'tully', 'fish') ('제이미', 'lannister', 'lion') ('로라스', 'tyrell', 'rose') ('도란', 'martell', 'sun')
m_zip = zip(nlist, flist, alist)
result1, result2, result3 = zip(*m_zip) # unzip to tuple
print(result2)
print(flist)
print(result2 == flist) # Check if unpacked tuples are equivalent to original tuples
('targaryen', 'stark', 'baratheon', 'tully', 'lannister', 'tyrell', 'martell')
['targaryen', 'stark', 'baratheon', 'tully', 'lannister', 'tyrell', 'martell']
False
iteration by chunk
counts_dict = {}
# Iterate over the file chunk by chunk
for chunk in pd.read_csv('data/tweets.csv', chunksize = 10):
# Iterate over the column in dataframe
for entry in chunk.lang:
if entry in counts_dict.keys():
counts_dict[entry] += 1
else:
counts_dict[entry] = 1
print(counts_dict)
{'kr': 12, 'jp': 36, 'ru': 6, 'en': 222, 'et': 24}
# count entry function with chunksize
def count_entries(csv_file, c_size, colname):
"""Return a dictionary with counts of ccurrences as value for each key."""
counts_dict = {}
for chunk in pd.read_csv(csv_file, chunksize=c_size):
for entry in chunk[colname]:
if entry in counts_dict.keys():
counts_dict[entry] += 1
else:
counts_dict[entry] = 1
return counts_dict
result_counts = count_entries('data/tweets.csv', 10, 'lang')
print(result_counts)
{'kr': 12, 'jp': 36, 'ru': 6, 'en': 222, 'et': 24}
List comprehensions
squares = [i**2 for i in range(10)]
# Create a 5 x 5 matrix using a list of lists
matrix = [[col for col in range(5)] for row in range(5)]
for row in matrix:
print(row)
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
# conditionals in comprehensions
new_fellowship = [member for member in flist if len(member) >= 7]
print(new_fellowship)
['targaryen', 'baratheon', 'lannister', 'martell']
new_fellowship = [member if len(member) >= 7 else '' for member in flist]
print(new_fellowship)
['targaryen', '', 'baratheon', '', 'lannister', '', 'martell']
# tweet_clock_time = [entry[11:19] for entry in tweet_time if entry[17:19] == '19']
# Dict comprehensions
new_fellowship = {member:len(member) for member in flist}
print(new_fellowship)
{'tully': 5, 'tyrell': 6, 'martell': 7, 'targaryen': 9, 'stark': 5, 'baratheon': 9, 'lannister': 9}
Generator : Lazy evaluation
# Create a generator object
lengths = (len(person) for person in flist)
for value in lengths:
print(value)
9
5
9
5
9
6
7
# [num for num in range(10**10000000)] -- 대용량 데이터일 경우 문제 발생
# (num for num in range(10**100000)) # generator 객체 만든 후 요청시에만 처리되기 때문에 대용량 데이터 처리시 안전.
Dictionary by zip
# 2개의 list 를 dictionary로 만드는 함수
def lists2dict(list1, list2):
"""list1 provides the keys and list2 provides the values."""
zipped_lists = zip(list1, list2)
rs_dict = dict(zipped_lists)
return rs_dict
fa_dict = lists2dict(alist, flist)
fa_dict
{'deer': 'baratheon',
'dragon': 'targaryen',
'fish': 'tully',
'lion': 'lannister',
'rose': 'tyrell',
'sun': 'martell',
'wolf': 'stark'}
list + list > dictionary > dataframe
feature_names = ['familyname','firstname','crest']
row_list = []
m_zip = zip(flist, nlist, alist)
for value1, value2, value3 in m_zip:
item = (value1, value2, value3)
row_list.append(item)
row_list
[('targaryen', '대너리스', 'dragon'),
('stark', '에다드', 'wolf'),
('baratheon', '로버트', 'deer'),
('tully', '브린덴', 'fish'),
('lannister', '제이미', 'lion'),
('tyrell', '로라스', 'rose'),
('martell', '도란', 'sun')]
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_list]
list_of_dicts
[{'crest': 'dragon', 'familyname': 'targaryen', 'firstname': '대너리스'},
{'crest': 'wolf', 'familyname': 'stark', 'firstname': '에다드'},
{'crest': 'deer', 'familyname': 'baratheon', 'firstname': '로버트'},
{'crest': 'fish', 'familyname': 'tully', 'firstname': '브린덴'},
{'crest': 'lion', 'familyname': 'lannister', 'firstname': '제이미'},
{'crest': 'rose', 'familyname': 'tyrell', 'firstname': '로라스'},
{'crest': 'sun', 'familyname': 'martell', 'firstname': '도란'}]
df_game = pd.DataFrame(list_of_dicts)
df_game
crest | familyname | firstname | |
---|---|---|---|
0 | dragon | targaryen | 대너리스 |
1 | wolf | stark | 에다드 |
2 | deer | baratheon | 로버트 |
3 | fish | tully | 브린덴 |
4 | lion | lannister | 제이미 |
5 | rose | tyrell | 로라스 |
6 | sun | martell | 도란 |
Python generators for streaming data
- Use a generator to load a file line by line.
- Works on streaming data.
- Read and process the file until all lines are exhausted.
일반적인 방법
# Open a connection to the file
with open('data/movielens_ratings.csv') as file:
file.readline() # Skip the column names
counts_dict = {}
# Process only the first 1000 rows
for j in range(1000):
# Split the current line into a list: line
line = file.readline().split(',')
# Get the value for the first column: userid
first_col = line[0]
# userId 별 평점 부여 횟수
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
else:
counts_dict[first_col] = 1
print(counts_dict)
{'3': 51, '14': 20, '9': 45, '8': 116, '10': 46, '12': 61, '1': 20, '15': 38, '4': 204, '5': 100, '2': 76, '13': 53, '7': 88, '11': 38, '6': 44}
Large file인 경우 처리방법
def read_large_file(file_object):
"""A generator function to read a large file lazily."""
# Loop indefinitely until the end of the file
while True:
data = file_object.readline()
# Break if this is the end of the file
if not data:
break
# Yield the line of data
yield data
counts_dict = {}
counts = 0
with open('data/movielens_ratings.csv') as file:
# Iterate over the generator from read_large_file()
for line in read_large_file(file):
counts += 1
if counts >= 1000:
break
row = line.split(',')
first_col = row[0]
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
else:
counts_dict[first_col] = 1
print(counts_dict)
{'3': 51, '14': 20, 'userId': 1, '9': 45, '8': 116, '10': 46, '12': 61, '1': 20, '15': 36, '4': 204, '5': 100, '2': 76, '13': 53, '7': 88, '11': 38, '6': 44}
iterator to load data in chunks
file_reader = pd.read_csv('data/movielens_ratings.csv', chunksize=1000)
# Get the first dataframe chunk
df_reader = next(file_reader)
df_reader.head()
userId | movieId | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 31 | 2.5 | 1260759144 |
1 | 1 | 1029 | 3.0 | 1260759179 |
2 | 1 | 1061 | 3.0 | 1260759182 |
3 | 1 | 1129 | 2.0 | 1260759185 |
4 | 1 | 1172 | 4.0 | 1260759205 |
iterator to load data in chunks 2
import matplotlib.pyplot as plt
%matplotlib inline
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('data/ind_pop_data.csv', chunksize=1000)
# Get the first dataframe chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)
# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'AFG']
df_pop_ceb.head()
CountryName | CountryCode | Year | TotalPop | UrbanPopRatio | |
---|---|---|---|---|---|
0 | Afghanistan | AFG | 1960 | 8990000.0 | 8.22 |
1 | Afghanistan | AFG | 1961 | 9160000.0 | 8.51 |
2 | Afghanistan | AFG | 1962 | 9340000.0 | 8.81 |
3 | Afghanistan | AFG | 1963 | 9530000.0 | 9.11 |
4 | Afghanistan | AFG | 1964 | 9730000.0 | 9.43 |
# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['TotalPop'], df_pop_ceb['UrbanPopRatio'])
# Turn zip object into list: pops_list
pops_list = list(pops)
# Use list comprehension to create new dataframe column 'Total Urban Population'
df_pop_ceb['TotalUrbanPopulation'] = [int(tup[0] * tup[1]) for tup in pops_list]
df_pop_ceb.head()
C:\Python\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from ipykernel import kernelapp as app
CountryName | CountryCode | Year | TotalPop | UrbanPopRatio | TotalUrbanPopulation | |
---|---|---|---|---|---|---|
0 | Afghanistan | AFG | 1960 | 8990000.0 | 8.22 | 73897800 |
1 | Afghanistan | AFG | 1961 | 9160000.0 | 8.51 | 77951600 |
2 | Afghanistan | AFG | 1962 | 9340000.0 | 8.81 | 82285400 |
3 | Afghanistan | AFG | 1963 | 9530000.0 | 9.11 | 86818300 |
4 | Afghanistan | AFG | 1964 | 9730000.0 | 9.43 | 91753900 |
# Plot urban population data
plt.scatter(df_pop_ceb['Year'], df_pop_ceb['TotalUrbanPopulation'])
plt.xlabel('Year')
plt.ylabel('TotalUrbanPopulation')
plt.title('Afghanistan')
<matplotlib.text.Text at 0x17e880b7a58>