한국 방문 해외 관광객의 로밍통화 분석
# 우리나라를 방문한 해외 관광객의 로밍통화 log 데이터(Call Data Records, Call Detail Records)
# 데이터의 한행은 통화나 SMS가 한 통화 수신 혹은 발신되었음을 의미.
# 0. load data

cdrData <- read.csv("data/trainData_CDR.csv", header = T, stringsAsFactors = F)
summary(cdrData)
##   touristID           dateChar           timeChar        
##  Length:200000      Length:200000      Length:200000     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##     nation              city          
##  Length:200000      Length:200000     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character
head(cdrData)
##    touristID  dateChar timeChar nation    city
## 1 tour272975 2014/1230  1713-03  Japan  Inchon
## 2 tour092184 2015/0129  2028-42  China   Seoul
## 3 tour084068 2015/0218  1840-44  Japan Gangwon
## 4 tour017504 2014/1201  1721-31  China   Seoul
## 5 tour018445 2015/0125  2258-07  Japan   Seoul
## 6 tour166649 2015/0102  1856-19  China   Seoul
require(dplyr)
require(reshape2)
library(ggplot2)
# 1. 우리나라를 가장 많이 방문한 나라의 이름은? 1등과 2등 국가의 방문객수 차이는?

# touristID 를 기준으로 묶은 후, nation 별로 집계
sumByNation <- cdrData %>%
    group_by(touristID, nation) %>%
    summarize(N=n()) %>%
        group_by(nation) %>%
        summarize(Total=n()) %>%
        arrange(desc(Total))

# Answer
sumByNation                                     # 1위 China, 2위 Japan
## # A tibble: 10 × 2
##        nation Total
##         <chr> <int>
## 1       China 42711
## 2       Japan 33399
## 3      Canada  6486
## 4     Germany  5375
## 5         USA  5273
## 6  Philippine  3414
## 7       India  2397
## 8      Russia  2268
## 9          UK  1190
## 10     Taiwan   264
sumByNation$Total[1] - sumByNation$Total[2]     # 1,2위 방문객수 차이 = 9,312
## [1] 9312
ggplot(sumByNation, aes(nation, Total)) + geom_bar(aes(fill = nation), width = 0.5, stat = "identity")

# 2.해외여행객들의 통화건수가 가장 많은 요일은?

# ***** 방법 1 *****
cdrData$weekDay <- weekdays.Date(strptime(cdrData$dateChar, format = "%Y/%m%d"))
head(cdrData)
##    touristID  dateChar timeChar nation    city weekDay
## 1 tour272975 2014/1230  1713-03  Japan  Inchon  화요일
## 2 tour092184 2015/0129  2028-42  China   Seoul  목요일
## 3 tour084068 2015/0218  1840-44  Japan Gangwon  수요일
## 4 tour017504 2014/1201  1721-31  China   Seoul  월요일
## 5 tour018445 2015/0125  2258-07  Japan   Seoul  일요일
## 6 tour166649 2015/0102  1856-19  China   Seoul  금요일
sumByWeek = cdrData %>%
    group_by(weekDay) %>%
    summarize(Total=n())  %>%
    arrange(desc(Total))

# Answer : 금요일 32,217건

f_ratio <- round(sumByWeek$Total/sum(sumByWeek$Total)*100, 1)
f_days <- sumByWeek$weekDay
f_labels <- paste(f_days, "\n", f_ratio, "%")
pie(sumByWeek$Total, main="Call number by weekday", init.angle = 90, 
    col=rainbow(7), radius = 1, cex=1.0, labels=f_labels)

# ***** 방법 2 *****
data_Q2 <- cdrData
head(data_Q2)
##    touristID  dateChar timeChar nation    city weekDay
## 1 tour272975 2014/1230  1713-03  Japan  Inchon  화요일
## 2 tour092184 2015/0129  2028-42  China   Seoul  목요일
## 3 tour084068 2015/0218  1840-44  Japan Gangwon  수요일
## 4 tour017504 2014/1201  1721-31  China   Seoul  월요일
## 5 tour018445 2015/0125  2258-07  Japan   Seoul  일요일
## 6 tour166649 2015/0102  1856-19  China   Seoul  금요일
# dateChar를 date 형식으로 변경. date 형식은 dplyr 적용 안됨!!!
# 해당 날짜의 요일. 숫자 factor를 문자로 변경
data_Q2$date <- strptime(data_Q2$dateChar, format = "%Y/%m%d")
data_Q2$wday <- data_Q2$date$wday     # week : 0 = Sunday
data_Q2$wday <- factor(data_Q2$wday, 
                       levels = c(0:6), labels = c("일","월","화","수","목","금","토"), 
                       ordered = T)

# 요일별 통화건수 집계 후 정렬
sort(table(data_Q2$wday), decreasing = T) 
## 
##    금    토    수    화    목    일    월 
## 32217 31800 31293 31274 30923 21721 20772
# 3.해외여행객들의 통화건수가 가장 많은 시간대는? 

# ***** 방법 1 *****
sumByTime = cdrData %>%
    group_by(Time=substr(timeChar,1,2)) %>%
    summarize(Total=n())  %>%
    arrange(Time)

sumByTime
## # A tibble: 14 × 2
##     Time Total
##    <chr> <int>
## 1     10 14470
## 2     11 17316
## 3     12 16767
## 4     13 15661
## 5     14 15996
## 6     15 17374
## 7     16 17366
## 8     17 18248
## 9     18 16805
## 10    19 13186
## 11    20 11569
## 12    21 10593
## 13    22  8940
## 14    23  5709
# Answer : 17시 18,248건

ggplot(sumByTime, aes(Time, Total)) + geom_line() + geom_point()
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# ***** 방법 2 *****
# timeChar 첫 두글자가 시간
data_Q2$callTime <- substr(data_Q2$timeChar, 1, 2)

# 시간대별 집계 후 정렬
sort(table(data_Q2$callTime), decreasing = T) 
## 
##    17    15    16    11    18    12    14    13    10    19    20    21 
## 18248 17374 17366 17316 16805 16767 15996 15661 14470 13186 11569 10593 
##    22    23 
##  8940  5709
# 4.국내행정구역별로 해외방문객의 국가별 통화건수 비중을 데이터프레임으로 작성.

summary1 <-cdrData %>%
    group_by(city, nation) %>%
    summarize(Total=n())

summary1
## Source: local data frame [159 x 3]
## Groups: city [?]
## 
##        city     nation Total
##       <chr>      <chr> <int>
## 1  Cheonbuk     Canada    46
## 2  Cheonbuk      China   836
## 3  Cheonbuk    Germany    21
## 4  Cheonbuk      India    35
## 5  Cheonbuk      Japan   712
## 6  Cheonbuk Philippine    66
## 7  Cheonbuk     Russia    20
## 8  Cheonbuk     Taiwan     4
## 9  Cheonbuk         UK     9
## 10 Cheonbuk        USA   105
## # ... with 149 more rows
# 행은 city, 열은 nation
summary2 <- dcast(summary1, city ~ nation, fun.aggregate=sum, value.var="Total")

# 각 city별로 합계와 비중. 
summary2$Total <- apply(summary2[-1], 1, sum)
totalCount <- sum(summary2$Total)
summary2$Ratio <- round(summary2$Total / totalCount * 100,2)
summary2
##         city Canada China Germany India Japan Philippine Russia Taiwan  UK
## 1   Cheonbuk     46   836      21    35   712         66     20      4   9
## 2   CheonNam    153  1148      23    14   328         71      7     20  48
## 3   ChungBuk     43   434      19     9   322         99     12      9  22
## 4   ChungNam     88  1127      35    37  1355        123     46      7  40
## 5      Daegu     85  1420      54    30  1165        253     53     16  38
## 6    Gangwon    522  1131     588   374   386        102    210      5  48
## 7     Inchon   1353 12830    1117   329  5657        986    436     70 419
## 8       Jeju    295 12116     114   258  1317         31     31      2  15
## 9    Kwangju     10   194       5     1   111         25      3      0   9
## 10 Kyeongbuk     82  1432      53    44  1307        129     50      6  19
## 11  Kyeonggi   1002 10011     601   375  7715       1388    721     49 243
## 12 KyeongNam    737  1207      75   127  1718        279     74     10  73
## 13     Pusan    466  2777     282   191  7077        205    116     29  73
## 14    Sejong      4    48       2     3    50          2      6      2   2
## 15     Seoul   6852 48223    6075  1966 29836       2299   1544    183 838
## 16     Ulsan    287   494      24    70   592        183     42     14  41
##     USA  Total Ratio
## 1   105   1854  0.93
## 2    17   1829  0.91
## 3    22    991  0.50
## 4   137   2995  1.50
## 5   161   3275  1.64
## 6   238   3604  1.80
## 7  1137  24334 12.17
## 8   153  14332  7.17
## 9     8    366  0.18
## 10  137   3259  1.63
## 11 1534  23639 11.82
## 12  129   4429  2.21
## 13  386  11602  5.80
## 14    1    120  0.06
## 15 3782 101598 50.80
## 16   26   1773  0.89
# Answer : 비중을 기준으로 정렬
df_AreaNation <- summary2[order(-summary2$Total),]
df_AreaNation
##         city Canada China Germany India Japan Philippine Russia Taiwan  UK
## 15     Seoul   6852 48223    6075  1966 29836       2299   1544    183 838
## 7     Inchon   1353 12830    1117   329  5657        986    436     70 419
## 11  Kyeonggi   1002 10011     601   375  7715       1388    721     49 243
## 8       Jeju    295 12116     114   258  1317         31     31      2  15
## 13     Pusan    466  2777     282   191  7077        205    116     29  73
## 12 KyeongNam    737  1207      75   127  1718        279     74     10  73
## 6    Gangwon    522  1131     588   374   386        102    210      5  48
## 5      Daegu     85  1420      54    30  1165        253     53     16  38
## 10 Kyeongbuk     82  1432      53    44  1307        129     50      6  19
## 4   ChungNam     88  1127      35    37  1355        123     46      7  40
## 1   Cheonbuk     46   836      21    35   712         66     20      4   9
## 2   CheonNam    153  1148      23    14   328         71      7     20  48
## 16     Ulsan    287   494      24    70   592        183     42     14  41
## 3   ChungBuk     43   434      19     9   322         99     12      9  22
## 9    Kwangju     10   194       5     1   111         25      3      0   9
## 14    Sejong      4    48       2     3    50          2      6      2   2
##     USA  Total Ratio
## 15 3782 101598 50.80
## 7  1137  24334 12.17
## 11 1534  23639 11.82
## 8   153  14332  7.17
## 13  386  11602  5.80
## 12  129   4429  2.21
## 6   238   3604  1.80
## 5   161   3275  1.64
## 10  137   3259  1.63
## 4   137   2995  1.50
## 1   105   1854  0.93
## 2    17   1829  0.91
## 16   26   1773  0.89
## 3    22    991  0.50
## 9     8    366  0.18
## 14    1    120  0.06
library(treemap)
treemap(df_AreaNation, vSize="Ratio", index="city")

# 5.국내를 방문한 해외 관광객 DB 생성. 해외 방문 고객 DB를 만드는 코드와 그 결과물 제출.
# - touristID : 해외방문객의 unique한 touristID
# - totalCall : 총통화건수
# - callDays : 총통화발생일수 (여러번의 통화가 있더라도 같은 날이면 하루로 환산)
# - visitAreas : 국내 방문지역 수 (행정구역 기준)
# 
# dplyr 패키지를 적절한 group_by를 통해 각각의 필드에 해당하는 데이터를 추출한 후,  join으로 DB를 구성.

# 5-1. totalCall
callCount <- cdrData %>%
    group_by(touristID) %>%
    summarize(totalCall=n())

head(callCount);nrow(callCount)
## # A tibble: 6 × 2
##    touristID totalCall
##        <chr>     <int>
## 1 tour000001         2
## 2 tour000003        17
## 3 tour000005         3
## 4 tour000006         2
## 5 tour000008         3
## 6 tour000009         2
## [1] 102777
# 5-2. callDays
dayCount <- cdrData %>%
    group_by(touristID, dateChar) %>%
    summarize(N=n())  %>%
        group_by(touristID) %>%
        summarize(callDays=n())

head(dayCount);nrow(dayCount)
## # A tibble: 6 × 2
##    touristID callDays
##        <chr>    <int>
## 1 tour000001        2
## 2 tour000003       13
## 3 tour000005        2
## 4 tour000006        1
## 5 tour000008        2
## 6 tour000009        2
## [1] 102777
# 5-3. visitAreas
areaCount <- cdrData %>%
    group_by(touristID, city) %>%
    summarize(N=n())  %>%
    group_by(touristID) %>%
    summarize(visitAreas=n())

head(areaCount);nrow(areaCount)
## # A tibble: 6 × 2
##    touristID visitAreas
##        <chr>      <int>
## 1 tour000001          1
## 2 tour000003          4
## 3 tour000005          1
## 4 tour000006          1
## 5 tour000008          1
## 6 tour000009          2
## [1] 102777
# 5-4. 위 데이터 join
require(plyr)
touristsData <- join(callCount, dayCount)
## Joining by: touristID
touristsData <- join(touristsData, areaCount)
## Joining by: touristID
head(touristsData)
##    touristID totalCall callDays visitAreas
## 1 tour000001         2        2          1
## 2 tour000003        17       13          4
## 3 tour000005         3        2          1
## 4 tour000006         2        1          1
## 5 tour000008         3        2          1
## 6 tour000009         2        2          2
nrow(touristsData)
## [1] 102777
# Answer
# write.csv(touristsData, "touristsData_callSummary.csv")