한국 방문 해외 관광객의 로밍통화 분석
# 우리나라를 방문한 해외 관광객의 로밍통화 log 데이터(Call Data Records, Call Detail Records)
# 데이터의 한행은 통화나 SMS가 한 통화 수신 혹은 발신되었음을 의미.
# 0. load data
cdrData <- read.csv("data/trainData_CDR.csv", header = T, stringsAsFactors = F)
summary(cdrData)
## touristID dateChar timeChar
## Length:200000 Length:200000 Length:200000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## nation city
## Length:200000 Length:200000
## Class :character Class :character
## Mode :character Mode :character
head(cdrData)
## touristID dateChar timeChar nation city
## 1 tour272975 2014/1230 1713-03 Japan Inchon
## 2 tour092184 2015/0129 2028-42 China Seoul
## 3 tour084068 2015/0218 1840-44 Japan Gangwon
## 4 tour017504 2014/1201 1721-31 China Seoul
## 5 tour018445 2015/0125 2258-07 Japan Seoul
## 6 tour166649 2015/0102 1856-19 China Seoul
require(dplyr)
require(reshape2)
library(ggplot2)
# 1. 우리나라를 가장 많이 방문한 나라의 이름은? 1등과 2등 국가의 방문객수 차이는?
# touristID 를 기준으로 묶은 후, nation 별로 집계
sumByNation <- cdrData %>%
group_by(touristID, nation) %>%
summarize(N=n()) %>%
group_by(nation) %>%
summarize(Total=n()) %>%
arrange(desc(Total))
# Answer
sumByNation # 1위 China, 2위 Japan
## # A tibble: 10 × 2
## nation Total
## <chr> <int>
## 1 China 42711
## 2 Japan 33399
## 3 Canada 6486
## 4 Germany 5375
## 5 USA 5273
## 6 Philippine 3414
## 7 India 2397
## 8 Russia 2268
## 9 UK 1190
## 10 Taiwan 264
sumByNation$Total[1] - sumByNation$Total[2] # 1,2위 방문객수 차이 = 9,312
## [1] 9312
ggplot(sumByNation, aes(nation, Total)) + geom_bar(aes(fill = nation), width = 0.5, stat = "identity")
# 2.해외여행객들의 통화건수가 가장 많은 요일은?
# ***** 방법 1 *****
cdrData$weekDay <- weekdays.Date(strptime(cdrData$dateChar, format = "%Y/%m%d"))
head(cdrData)
## touristID dateChar timeChar nation city weekDay
## 1 tour272975 2014/1230 1713-03 Japan Inchon 화요일
## 2 tour092184 2015/0129 2028-42 China Seoul 목요일
## 3 tour084068 2015/0218 1840-44 Japan Gangwon 수요일
## 4 tour017504 2014/1201 1721-31 China Seoul 월요일
## 5 tour018445 2015/0125 2258-07 Japan Seoul 일요일
## 6 tour166649 2015/0102 1856-19 China Seoul 금요일
sumByWeek = cdrData %>%
group_by(weekDay) %>%
summarize(Total=n()) %>%
arrange(desc(Total))
# Answer : 금요일 32,217건
f_ratio <- round(sumByWeek$Total/sum(sumByWeek$Total)*100, 1)
f_days <- sumByWeek$weekDay
f_labels <- paste(f_days, "\n", f_ratio, "%")
pie(sumByWeek$Total, main="Call number by weekday", init.angle = 90,
col=rainbow(7), radius = 1, cex=1.0, labels=f_labels)
# ***** 방법 2 *****
data_Q2 <- cdrData
head(data_Q2)
## touristID dateChar timeChar nation city weekDay
## 1 tour272975 2014/1230 1713-03 Japan Inchon 화요일
## 2 tour092184 2015/0129 2028-42 China Seoul 목요일
## 3 tour084068 2015/0218 1840-44 Japan Gangwon 수요일
## 4 tour017504 2014/1201 1721-31 China Seoul 월요일
## 5 tour018445 2015/0125 2258-07 Japan Seoul 일요일
## 6 tour166649 2015/0102 1856-19 China Seoul 금요일
# dateChar를 date 형식으로 변경. date 형식은 dplyr 적용 안됨!!!
# 해당 날짜의 요일. 숫자 factor를 문자로 변경
data_Q2$date <- strptime(data_Q2$dateChar, format = "%Y/%m%d")
data_Q2$wday <- data_Q2$date$wday # week : 0 = Sunday
data_Q2$wday <- factor(data_Q2$wday,
levels = c(0:6), labels = c("일","월","화","수","목","금","토"),
ordered = T)
# 요일별 통화건수 집계 후 정렬
sort(table(data_Q2$wday), decreasing = T)
##
## 금 토 수 화 목 일 월
## 32217 31800 31293 31274 30923 21721 20772
# 3.해외여행객들의 통화건수가 가장 많은 시간대는?
# ***** 방법 1 *****
sumByTime = cdrData %>%
group_by(Time=substr(timeChar,1,2)) %>%
summarize(Total=n()) %>%
arrange(Time)
sumByTime
## # A tibble: 14 × 2
## Time Total
## <chr> <int>
## 1 10 14470
## 2 11 17316
## 3 12 16767
## 4 13 15661
## 5 14 15996
## 6 15 17374
## 7 16 17366
## 8 17 18248
## 9 18 16805
## 10 19 13186
## 11 20 11569
## 12 21 10593
## 13 22 8940
## 14 23 5709
# Answer : 17시 18,248건
ggplot(sumByTime, aes(Time, Total)) + geom_line() + geom_point()
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
# ***** 방법 2 *****
# timeChar 첫 두글자가 시간
data_Q2$callTime <- substr(data_Q2$timeChar, 1, 2)
# 시간대별 집계 후 정렬
sort(table(data_Q2$callTime), decreasing = T)
##
## 17 15 16 11 18 12 14 13 10 19 20 21
## 18248 17374 17366 17316 16805 16767 15996 15661 14470 13186 11569 10593
## 22 23
## 8940 5709
# 4.국내행정구역별로 해외방문객의 국가별 통화건수 비중을 데이터프레임으로 작성.
summary1 <-cdrData %>%
group_by(city, nation) %>%
summarize(Total=n())
summary1
## Source: local data frame [159 x 3]
## Groups: city [?]
##
## city nation Total
## <chr> <chr> <int>
## 1 Cheonbuk Canada 46
## 2 Cheonbuk China 836
## 3 Cheonbuk Germany 21
## 4 Cheonbuk India 35
## 5 Cheonbuk Japan 712
## 6 Cheonbuk Philippine 66
## 7 Cheonbuk Russia 20
## 8 Cheonbuk Taiwan 4
## 9 Cheonbuk UK 9
## 10 Cheonbuk USA 105
## # ... with 149 more rows
# 행은 city, 열은 nation
summary2 <- dcast(summary1, city ~ nation, fun.aggregate=sum, value.var="Total")
# 각 city별로 합계와 비중.
summary2$Total <- apply(summary2[-1], 1, sum)
totalCount <- sum(summary2$Total)
summary2$Ratio <- round(summary2$Total / totalCount * 100,2)
summary2
## city Canada China Germany India Japan Philippine Russia Taiwan UK
## 1 Cheonbuk 46 836 21 35 712 66 20 4 9
## 2 CheonNam 153 1148 23 14 328 71 7 20 48
## 3 ChungBuk 43 434 19 9 322 99 12 9 22
## 4 ChungNam 88 1127 35 37 1355 123 46 7 40
## 5 Daegu 85 1420 54 30 1165 253 53 16 38
## 6 Gangwon 522 1131 588 374 386 102 210 5 48
## 7 Inchon 1353 12830 1117 329 5657 986 436 70 419
## 8 Jeju 295 12116 114 258 1317 31 31 2 15
## 9 Kwangju 10 194 5 1 111 25 3 0 9
## 10 Kyeongbuk 82 1432 53 44 1307 129 50 6 19
## 11 Kyeonggi 1002 10011 601 375 7715 1388 721 49 243
## 12 KyeongNam 737 1207 75 127 1718 279 74 10 73
## 13 Pusan 466 2777 282 191 7077 205 116 29 73
## 14 Sejong 4 48 2 3 50 2 6 2 2
## 15 Seoul 6852 48223 6075 1966 29836 2299 1544 183 838
## 16 Ulsan 287 494 24 70 592 183 42 14 41
## USA Total Ratio
## 1 105 1854 0.93
## 2 17 1829 0.91
## 3 22 991 0.50
## 4 137 2995 1.50
## 5 161 3275 1.64
## 6 238 3604 1.80
## 7 1137 24334 12.17
## 8 153 14332 7.17
## 9 8 366 0.18
## 10 137 3259 1.63
## 11 1534 23639 11.82
## 12 129 4429 2.21
## 13 386 11602 5.80
## 14 1 120 0.06
## 15 3782 101598 50.80
## 16 26 1773 0.89
# Answer : 비중을 기준으로 정렬
df_AreaNation <- summary2[order(-summary2$Total),]
df_AreaNation
## city Canada China Germany India Japan Philippine Russia Taiwan UK
## 15 Seoul 6852 48223 6075 1966 29836 2299 1544 183 838
## 7 Inchon 1353 12830 1117 329 5657 986 436 70 419
## 11 Kyeonggi 1002 10011 601 375 7715 1388 721 49 243
## 8 Jeju 295 12116 114 258 1317 31 31 2 15
## 13 Pusan 466 2777 282 191 7077 205 116 29 73
## 12 KyeongNam 737 1207 75 127 1718 279 74 10 73
## 6 Gangwon 522 1131 588 374 386 102 210 5 48
## 5 Daegu 85 1420 54 30 1165 253 53 16 38
## 10 Kyeongbuk 82 1432 53 44 1307 129 50 6 19
## 4 ChungNam 88 1127 35 37 1355 123 46 7 40
## 1 Cheonbuk 46 836 21 35 712 66 20 4 9
## 2 CheonNam 153 1148 23 14 328 71 7 20 48
## 16 Ulsan 287 494 24 70 592 183 42 14 41
## 3 ChungBuk 43 434 19 9 322 99 12 9 22
## 9 Kwangju 10 194 5 1 111 25 3 0 9
## 14 Sejong 4 48 2 3 50 2 6 2 2
## USA Total Ratio
## 15 3782 101598 50.80
## 7 1137 24334 12.17
## 11 1534 23639 11.82
## 8 153 14332 7.17
## 13 386 11602 5.80
## 12 129 4429 2.21
## 6 238 3604 1.80
## 5 161 3275 1.64
## 10 137 3259 1.63
## 4 137 2995 1.50
## 1 105 1854 0.93
## 2 17 1829 0.91
## 16 26 1773 0.89
## 3 22 991 0.50
## 9 8 366 0.18
## 14 1 120 0.06
library(treemap)
treemap(df_AreaNation, vSize="Ratio", index="city")
# 5.국내를 방문한 해외 관광객 DB 생성. 해외 방문 고객 DB를 만드는 코드와 그 결과물 제출.
# - touristID : 해외방문객의 unique한 touristID
# - totalCall : 총통화건수
# - callDays : 총통화발생일수 (여러번의 통화가 있더라도 같은 날이면 하루로 환산)
# - visitAreas : 국내 방문지역 수 (행정구역 기준)
#
# dplyr 패키지를 적절한 group_by를 통해 각각의 필드에 해당하는 데이터를 추출한 후, join으로 DB를 구성.
# 5-1. totalCall
callCount <- cdrData %>%
group_by(touristID) %>%
summarize(totalCall=n())
head(callCount);nrow(callCount)
## # A tibble: 6 × 2
## touristID totalCall
## <chr> <int>
## 1 tour000001 2
## 2 tour000003 17
## 3 tour000005 3
## 4 tour000006 2
## 5 tour000008 3
## 6 tour000009 2
## [1] 102777
# 5-2. callDays
dayCount <- cdrData %>%
group_by(touristID, dateChar) %>%
summarize(N=n()) %>%
group_by(touristID) %>%
summarize(callDays=n())
head(dayCount);nrow(dayCount)
## # A tibble: 6 × 2
## touristID callDays
## <chr> <int>
## 1 tour000001 2
## 2 tour000003 13
## 3 tour000005 2
## 4 tour000006 1
## 5 tour000008 2
## 6 tour000009 2
## [1] 102777
# 5-3. visitAreas
areaCount <- cdrData %>%
group_by(touristID, city) %>%
summarize(N=n()) %>%
group_by(touristID) %>%
summarize(visitAreas=n())
head(areaCount);nrow(areaCount)
## # A tibble: 6 × 2
## touristID visitAreas
## <chr> <int>
## 1 tour000001 1
## 2 tour000003 4
## 3 tour000005 1
## 4 tour000006 1
## 5 tour000008 1
## 6 tour000009 2
## [1] 102777
# 5-4. 위 데이터 join
require(plyr)
touristsData <- join(callCount, dayCount)
## Joining by: touristID
touristsData <- join(touristsData, areaCount)
## Joining by: touristID
head(touristsData)
## touristID totalCall callDays visitAreas
## 1 tour000001 2 2 1
## 2 tour000003 17 13 4
## 3 tour000005 3 2 1
## 4 tour000006 2 1 1
## 5 tour000008 3 2 1
## 6 tour000009 2 2 2
nrow(touristsData)
## [1] 102777
# Answer
# write.csv(touristsData, "touristsData_callSummary.csv")