MovieLens 영화 평점 분석
영화의 성별 평균 평점, 여성이 가장 좋아한 영화, 남녀간 평균 평점 차이
2. MovieLens 영화평점 dataset
- 출처 : “파이썬 라이브러리를 활용한 데이터분석 (웨스 맥키니)”
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height in inches) for plots in this notebook.
plt.rcParams['figure.figsize'] = (12, 5)
# Print NumPy floats with four digits of precision.
np.set_printoptions(precision=4)

# Load the three MovieLens tables from the local data directory.
ratings, movies, users = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/movielens_ratings.csv',
        'data/movielens_movies.csv',
        'data/movielens_users.csv',
    )
)
# Preview the last rows of the ratings table (userId, movieId, rating, timestamp).
ratings.tail()
userId | movieId | rating | timestamp | |
---|---|---|---|---|
99999 | 671 | 6268 | 2.5 | 1065579370 |
100000 | 671 | 6269 | 4.0 | 1065149201 |
100001 | 671 | 6365 | 4.0 | 1070940363 |
100002 | 671 | 6385 | 2.5 | 1070979663 |
100003 | 671 | 6565 | 3.5 | 1074784724 |
# Preview the last rows of the movies table (movieId, title, genres).
movies.tail()
movieId | title | genres | |
---|---|---|---|
9120 | 162672 | Mohenjo Daro (2016) | Adventure|Drama|Romance |
9121 | 163056 | Shin Godzilla (2016) | Action|Adventure|Fantasy|Sci-Fi |
9122 | 163949 | The Beatles: Eight Days a Week - The Touring Y... | Documentary |
9123 | 164977 | The Gay Desperado (1936) | Comedy |
9124 | 164979 | Women of '69, Unboxed | Documentary |
# Preview the last rows of the users table (userId, gender, age).
users.tail()
userId | gender | age | |
---|---|---|---|
6035 | 6036 | F | 25 |
6036 | 6037 | F | 45 |
6037 | 6038 | F | 56 |
6038 | 6039 | F | 45 |
6039 | 6040 | M | 25 |
# (row count, column count) of the ratings table.
ratings.shape
(100004, 4)
# (row count, column count) of the movies table.
movies.shape
(9125, 3)
# (row count, column count) of the users table.
users.shape
(6040, 3)
# Combine the three tables into one frame. Each merge joins on the column
# names the two frames share (no explicit key is given), using the default
# inner join: ratings + users first, then the result + movies.
data = ratings.merge(users).merge(movies)
사용자별 평점 개수 순위
# Count the ratings submitted by each user; the unnamed count Series becomes
# a one-column frame whose column label is 0.
x = data.groupby('userId').size().to_frame()
# Show the five users with the most ratings.
x.sort_values(by=0, ascending=False).head()
0 | |
---|---|
userId | |
547 | 2391 |
564 | 1868 |
624 | 1735 |
15 | 1700 |
73 | 1610 |
# Inspect the first merged rows for user 547, the user with the most ratings
# in the ranking above.
data.loc[data['userId'] == 547].head()
userId | movieId | rating | timestamp | gender | age | title | genres | |
---|---|---|---|---|---|---|---|---|
72 | 547 | 1029 | 5.0 | 1011142236 | M | 35 | Dumbo (1941) | Animation|Children|Drama|Musical |
156 | 547 | 1129 | 3.5 | 1073443756 | M | 35 | Escape from New York (1981) | Action|Adventure|Sci-Fi|Thriller |
207 | 547 | 1172 | 5.0 | 1373125067 | M | 35 | Cinema Paradiso (Nuovo cinema Paradiso) (1989) | Drama |
250 | 547 | 1263 | 5.0 | 974810170 | M | 35 | Deer Hunter, The (1978) | Drama|War |
343 | 547 | 1293 | 5.0 | 981312829 | M | 35 | Gandhi (1982) | Drama |
각 영화의 성별 평균 평점
# Mean rating per title, split into one column per gender.
# fill_value is the replacement for missing cells: a title with no ratings
# from one gender shows 0 in that column instead of NaN.
# NOTE(review): a 0 here therefore means "no ratings", not a true mean of 0.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
    fill_value=0,
)
mean_ratings.head()
gender | F | M |
---|---|---|
title | ||
"Great Performances" Cats (1998) | 0.0 | 1.75 |
$9.99 (2008) | 2.5 | 4.50 |
'Hellboy': The Seeds of Creation (2004) | 0.0 | 2.00 |
'Neath the Arizona Skies (1934) | 0.0 | 0.50 |
'Round Midnight (1986) | 0.0 | 2.25 |
# Keep only the movies that received at least 220 ratings.
# Number of ratings per title.
ratings_by_title = data.groupby('title').size()
# Titles whose rating count reaches the threshold.
active_titles = ratings_by_title[ratings_by_title.ge(220)].index
active_titles
Index(['American Beauty (1999)', 'Back to the Future (1985)',
'Braveheart (1995)', 'Fargo (1996)', 'Forrest Gump (1994)',
'Jurassic Park (1993)', 'Matrix, The (1999)', 'Pulp Fiction (1994)',
'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
'Schindler's List (1993)', 'Shawshank Redemption, The (1994)',
'Silence of the Lambs, The (1991)',
'Star Wars: Episode IV - A New Hope (1977)',
'Star Wars: Episode V - The Empire Strikes Back (1980)',
'Terminator 2: Judgment Day (1991)', 'Toy Story (1995)'],
dtype='object', name='title')
# Per-gender mean ratings restricted to the movies in active_titles
# (those with at least 220 ratings).
# .loc selects rows by label; the former .ix indexer was removed in
# pandas 1.0 and raises AttributeError on current versions.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
gender | F | M |
---|---|---|
title | ||
American Beauty (1999) | 4.263158 | 4.226994 |
Back to the Future (1985) | 4.064516 | 3.996951 |
Braveheart (1995) | 4.000000 | 3.917219 |
Fargo (1996) | 4.164384 | 4.301325 |
Forrest Gump (1994) | 4.106481 | 4.030043 |
Jurassic Park (1993) | 3.695652 | 3.711538 |
Matrix, The (1999) | 4.014286 | 4.246032 |
Pulp Fiction (1994) | 4.250000 | 4.259174 |
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | 4.105263 | 4.223926 |
Schindler's List (1993) | 4.267123 | 4.318713 |
Shawshank Redemption, The (1994) | 4.396040 | 4.530952 |
Silence of the Lambs, The (1991) | 4.071429 | 4.163636 |
Star Wars: Episode IV - A New Hope (1977) | 4.160494 | 4.245238 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.147541 | 4.263006 |
Terminator 2: Judgment Day (1991) | 4.030303 | 3.997076 |
Toy Story (1995) | 3.775641 | 3.917160 |
여성이 가장 좋아한 영화 순위
# Rank the movies by the mean rating given by female users, highest first.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
# Show the ten highest-ranked titles.
top_female_ratings.head(10)
gender | F | M |
---|---|---|
title | ||
Shawshank Redemption, The (1994) | 4.396040 | 4.530952 |
Schindler's List (1993) | 4.267123 | 4.318713 |
American Beauty (1999) | 4.263158 | 4.226994 |
Pulp Fiction (1994) | 4.250000 | 4.259174 |
Fargo (1996) | 4.164384 | 4.301325 |
Star Wars: Episode IV - A New Hope (1977) | 4.160494 | 4.245238 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.147541 | 4.263006 |
Forrest Gump (1994) | 4.106481 | 4.030043 |
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | 4.105263 | 4.223926 |
Silence of the Lambs, The (1991) | 4.071429 | 4.163636 |
남녀간의 평균 평점 차이
# Add a column with the rating gap between genders (female mean minus male
# mean): positive values mean female users rated the movie higher.
mean_ratings['diff'] = mean_ratings['F'].sub(mean_ratings['M'])
mean_ratings.head()
gender | F | M | diff |
---|---|---|---|
title | |||
American Beauty (1999) | 4.263158 | 4.226994 | 0.036164 |
Back to the Future (1985) | 4.064516 | 3.996951 | 0.067565 |
Braveheart (1995) | 4.000000 | 3.917219 | 0.082781 |
Fargo (1996) | 4.164384 | 4.301325 | -0.136941 |
Forrest Gump (1994) | 4.106481 | 4.030043 | 0.076439 |
# Order the movies from the largest female-over-male rating gap down to the
# largest male-over-female gap.
mean_ratings.sort_values('diff', ascending=False)
gender | F | M | diff |
---|---|---|---|
title | |||
Braveheart (1995) | 4.000000 | 3.917219 | 0.082781 |
Forrest Gump (1994) | 4.106481 | 4.030043 | 0.076439 |
Back to the Future (1985) | 4.064516 | 3.996951 | 0.067565 |
American Beauty (1999) | 4.263158 | 4.226994 | 0.036164 |
Terminator 2: Judgment Day (1991) | 4.030303 | 3.997076 | 0.033227 |
Pulp Fiction (1994) | 4.250000 | 4.259174 | -0.009174 |
Jurassic Park (1993) | 3.695652 | 3.711538 | -0.015886 |
Schindler's List (1993) | 4.267123 | 4.318713 | -0.051590 |
Star Wars: Episode IV - A New Hope (1977) | 4.160494 | 4.245238 | -0.084744 |
Silence of the Lambs, The (1991) | 4.071429 | 4.163636 | -0.092208 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.147541 | 4.263006 | -0.115465 |
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) | 4.105263 | 4.223926 | -0.118663 |
Shawshank Redemption, The (1994) | 4.396040 | 4.530952 | -0.134913 |
Fargo (1996) | 4.164384 | 4.301325 | -0.136941 |
Toy Story (1995) | 3.775641 | 3.917160 | -0.141519 |
Matrix, The (1999) | 4.014286 | 4.246032 | -0.231746 |
평점의 표준편차가 큰 영화
# Standard deviation of the ratings for each title.
ratings_by_sd = data.groupby('title')['rating'].std()
# Keep only the titles in active_titles. .loc selects by label; the former
# .ix indexer was removed in pandas 1.0 and raises AttributeError.
ratings_by_sd = ratings_by_sd.loc[active_titles]
# Titles whose ratings vary the most come first.
ratings_by_sd.sort_values(ascending=False)
title
Braveheart (1995) 1.023006
Toy Story (1995) 0.958981
Jurassic Park (1993) 0.917073
Star Wars: Episode IV - A New Hope (1977) 0.908682
Matrix, The (1999) 0.901202
Fargo (1996) 0.887102
Schindler's List (1993) 0.882051
Star Wars: Episode V - The Empire Strikes Back (1980) 0.880296
Terminator 2: Judgment Day (1991) 0.879957
Silence of the Lambs, The (1991) 0.874459
Forrest Gump (1994) 0.871521
Pulp Fiction (1994) 0.866897
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 0.835950
American Beauty (1999) 0.811473
Back to the Future (1985) 0.796369
Shawshank Redemption, The (1994) 0.702412
Name: rating, dtype: float64