Data Mining - Data Munging with dplyr
library(dplyr)
# 0. Data Setting
cs <- read.csv("data/dataCustomers.csv", header = T, stringsAsFactors = T)
head(cs)
## custid gender age marriage residence job
## 1 10070 여 28 미혼 Yongsan-gu 제조업
## 2 10139 여 28 미혼 Gangdong-gu 정보서비스
## 3 10208 여 28 미혼 Gwangjin-gu 제조업
## 4 10275 여 28 미혼 Eunpyeong-gu 개인사업
## 5 10350 남 28 미혼 Gangnam-gu 제조업
## 6 10425 남 28 미혼 Songpa-gu 교육기관
str(cs)
## 'data.frame': 500 obs. of 6 variables:
## $ custid : int 10070 10139 10208 10275 10350 10425 10499 10567 10632 10707 ...
## $ gender : Factor w/ 2 levels "남","여": 2 2 2 2 1 1 2 2 2 2 ...
## $ age : int 28 28 28 28 28 28 28 28 28 28 ...
## $ marriage : Factor w/ 2 levels "기혼","미혼": 2 2 2 2 2 2 2 2 2 1 ...
## $ residence: Factor w/ 25 levels "Dobong-gu","Dongdaemun-gu",..: 25 6 12 4 7 22 19 3 4 3 ...
## $ job : Factor w/ 16 levels "개인사업","건설업",..: 16 15 16 1 16 5 1 15 1 1 ...
cs$custid <- as.character(cs$custid)
tr <- read.csv("data/dataTransactions.csv", header = T, stringsAsFactors = T)
head(tr)
## datetime custid store product brand corner import
## 1 2000-05-01 10:43 18313 신촌점 4.104840e+12 샤넬 화장품 1
## 2 2000-05-01 11:00 18313 신촌점 2.700000e+12 식품 일반식품 0
## 3 2000-05-01 11:33 27222 신촌점 4.545371e+12 까사미아 가구 0
## 4 2000-05-01 11:43 27222 신촌점 4.500860e+12 대아통상 기타 0
## 5 2000-05-01 11:53 27222 신촌점 4.538130e+12 토이플러스 문화완구 0
## 6 2000-05-01 12:00 27222 신촌점 4.406010e+12 베베 유아동복 0
## amount installment
## 1 113000 3
## 2 91950 3
## 3 598000 3
## 4 20100 1
## 5 24000 1
## 6 28000 1
str(tr)
## 'data.frame': 19125 obs. of 9 variables:
## $ datetime : Factor w/ 13804 levels "2000-05-01 10:43",..: 1 2 3 4 5 6 6 7 7 8 ...
## $ custid : int 18313 18313 27222 27222 27222 27222 47084 27222 31090 27222 ...
## $ store : Factor w/ 4 levels "무역점","본점",..: 3 3 3 3 3 3 1 3 1 3 ...
## $ product : num 4.10e+12 2.70e+12 4.55e+12 4.50e+12 4.54e+12 ...
## $ brand : Factor w/ 1191 levels "012베네통","1492",..: 533 619 136 204 1058 456 785 45 911 379 ...
## $ corner : Factor w/ 26 levels "가구","가전",..: 26 16 1 3 7 15 11 15 11 15 ...
## $ import : int 1 0 0 0 0 0 0 0 0 0 ...
## $ amount : int 113000 91950 598000 20100 24000 28000 150000 43800 118000 47000 ...
## $ installment: int 3 3 3 1 1 1 3 1 3 1 ...
tr$datetime <- as.character(tr$datetime)
tr$custid <- as.character(tr$custid)
tr$import <- as.factor(tr$import)
tr$installment <- as.factor(tr$installment)
tr$product <- as.character(format(tr$product, digits = 13))
summary(tr)
## datetime custid store product
## Length:19125 Length:19125 무역점:4955 Length:19125
## Class :character Class :character 본점 :3722 Class :character
## Mode :character Mode :character 신촌점:6110 Mode :character
## 천호점:4338
##
##
##
## brand corner import amount
## 식품 : 5267 일반식품 :5267 0:17063 Min. : 650
## 지오다노 : 301 화장품 :2158 1: 2062 1st Qu.: 25000
## 랑콤 : 251 유아동복 :1345 Median : 53000
## 크리니크 : 188 유니캐주얼:1138 Mean : 109198
## 샤넬 : 181 스포츠 :1129 3rd Qu.: 118000
## 에스티로더: 177 피혁 : 931 Max. :8000000
## (Other) :12760 (Other) :7157
## installment
## 1 :12295
## 3 : 6368
## 6 : 201
## 5 : 106
## 2 : 55
## 10 : 39
## (Other): 61
# 1. Selects a subset
filter(cs, age > 60 & gender == "여")
## custid gender age marriage residence job
## 1 47842 여 61 기혼 Songpa-gu 개인사업
## 2 48044 여 62 기혼 Nowon-gu 전문직
## 3 48169 여 62 기혼 Seocho-gu 공공기관
## 4 48278 여 63 기혼 Seodaemun-gu 개인사업
## 5 48401 여 63 기혼 Gangnam-gu 교육기관
## 6 48556 여 64 기혼 Seodaemun-gu 개인사업
## 7 48686 여 63 기혼 Songpa-gu 도소매업
## 8 48997 여 66 기혼 Gangnam-gu 개인사업
## 9 49112 여 65 기혼 Seodaemun-gu 개인사업
## 10 49232 여 67 기혼 Seongdong-gu 개인사업
## 11 49351 여 68 기혼 Mapo-gu 개인사업
filter(tr, installment == 10, amount > mean(amount))
## datetime custid store product brand
## 1 2000-05-08 17:54 33751 천호점 4300471019900 헨리코튼
## 2 2000-05-14 15:03 38747 신촌점 4131690003200 세라구두
## 3 2000-05-14 15:50 38747 신촌점 4202205111020 CC클럽
## 4 2000-05-16 19:33 16805 신촌점 4130790009100 룩선글라스ck외
## 5 2000-07-13 14:00 33751 천호점 4405600026020 퀵실버
## 6 2000-07-18 16:53 33751 천호점 4139141008000 랑콤
## corner import amount installment
## 1 트래디셔널캐주얼 0 247000 10
## 2 피혁 0 276000 10
## 3 영캐주얼 0 238400 10
## 4 섬유 0 325000 10
## 5 유니캐주얼 0 133600 10
## 6 화장품 0 330000 10
x1 <- select(tr, custid:amount) # range fields
head(x1)
## custid store product brand corner import amount
## 1 18313 신촌점 4104840008000 샤넬 화장품 1 113000
## 2 18313 신촌점 2700000000000 식품 일반식품 0 91950
## 3 27222 신촌점 4545370944500 까사미아 가구 0 598000
## 4 27222 신촌점 4500860043900 대아통상 기타 0 20100
## 5 27222 신촌점 4538130048700 토이플러스 문화완구 0 24000
## 6 27222 신촌점 4406010020474 베베 유아동복 0 28000
x2 <- select(tr, -datetime, -import) # remove fields
head(x2)
## custid store product brand corner amount installment
## 1 18313 신촌점 4104840008000 샤넬 화장품 113000 3
## 2 18313 신촌점 2700000000000 식품 일반식품 91950 3
## 3 27222 신촌점 4545370944500 까사미아 가구 598000 3
## 4 27222 신촌점 4500860043900 대아통상 기타 20100 1
## 5 27222 신촌점 4538130048700 토이플러스 문화완구 24000 1
## 6 27222 신촌점 4406010020474 베베 유아동복 28000 1
x3 <- select(cs, custid, job, gender:residence) # reorder fields
head(x3)
## custid job gender age marriage residence
## 1 10070 제조업 여 28 미혼 Yongsan-gu
## 2 10139 정보서비스 여 28 미혼 Gangdong-gu
## 3 10208 제조업 여 28 미혼 Gwangjin-gu
## 4 10275 개인사업 여 28 미혼 Eunpyeong-gu
## 5 10350 제조업 남 28 미혼 Gangnam-gu
## 6 10425 교육기관 남 28 미혼 Songpa-gu
x4 <- rename(cs, juso = residence, upjong = job) # rename fields
head(x4)
## custid gender age marriage juso upjong
## 1 10070 여 28 미혼 Yongsan-gu 제조업
## 2 10139 여 28 미혼 Gangdong-gu 정보서비스
## 3 10208 여 28 미혼 Gwangjin-gu 제조업
## 4 10275 여 28 미혼 Eunpyeong-gu 개인사업
## 5 10350 남 28 미혼 Gangnam-gu 제조업
## 6 10425 남 28 미혼 Songpa-gu 교육기관
# 2. Select random sample
a <- sample_n(cs, 30) # fixed number
nrow(a)
## [1] 30
a <- sample_frac(cs, 0.1) # fixed fraction
nrow(a)
## [1] 50
a <- cs[as.logical((1:nrow(cs)%%20 == 1)), ] # 1-in-n sampling
head(a, 10)
## custid gender age marriage residence job
## 1 10070 여 28 미혼 Yongsan-gu 제조업
## 21 11489 남 29 미혼 Gangbuk-gu 개인사업
## 41 12867 여 29 기혼 Jongno-gu 금융기관
## 61 14258 여 29 기혼 Seodaemun-gu 개인사업
## 81 15641 여 30 미혼 Seongdong-gu 도소매업
## 101 17007 여 31 기혼 Mapo-gu 개인사업
## 121 18374 여 31 미혼 Mapo-gu 개인사업
## 141 19752 남 32 미혼 Seongdong-gu 전문직
## 161 21161 남 32 미혼 Gangnam-gu 정보서비스
## 181 22592 여 34 기혼 Mapo-gu 개인사업
# 3. Summarize information
cs %>% group_by(gender) %>% summarise(Total = n())
## # A tibble: 2 × 2
## gender Total
## <fctr> <int>
## 1 남 154
## 2 여 346
cs %>% group_by(gender, marriage) %>% summarise(Count = n())
## Source: local data frame [4 x 3]
## Groups: gender [?]
##
## gender marriage Count
## <fctr> <fctr> <int>
## 1 남 기혼 115
## 2 남 미혼 39
## 3 여 기혼 234
## 4 여 미혼 112
tr %>% group_by(store) %>%
summarise(Total = sum(amount), Mean = mean(amount))
## # A tibble: 4 × 3
## store Total Mean
## <fctr> <int> <dbl>
## 1 무역점 597280920 120541.05
## 2 본점 478251982 128493.28
## 3 신촌점 564742855 92429.27
## 4 천호점 448129075 103303.15
tr %>% group_by(store) %>%
summarise_each(funs(sum, min, max), amount)
## # A tibble: 4 × 4
## store sum min max
## <fctr> <int> <int> <int>
## 1 무역점 597280920 2000 3930000
## 2 본점 478251982 1400 7544000
## 3 신촌점 564742855 650 2780000
## 4 천호점 448129075 1400 8000000
# 4. Reorder records
tr %>% group_by(custid, store) %>%
summarise(Total = sum(amount), Count = n()) %>%
arrange(desc(Total), desc(Count))
## Source: local data frame [837 x 4]
## Groups: custid [500]
##
## custid store Total Count
## <chr> <fctr> <int> <int>
## 1 42800 무역점 59353579 235
## 2 15968 천호점 44014591 225
## 3 13493 무역점 35091556 178
## 4 42322 신촌점 34305296 172
## 5 48278 신촌점 30149391 216
## 6 27074 본점 29419710 92
## 7 32232 천호점 28250679 267
## 8 33829 본점 19554792 116
## 9 37340 천호점 19509317 88
## 10 41313 천호점 17824062 90
## # ... with 827 more rows
cs %>% group_by(residence) %>%
summarise(Count = n()) %>%
arrange(desc(Count))
## # A tibble: 25 × 2
## residence Count
## <fctr> <int>
## 1 Gangnam-gu 90
## 2 Seocho-gu 54
## 3 Mapo-gu 47
## 4 Seodaemun-gu 47
## 5 Gangdong-gu 45
## 6 Songpa-gu 33
## 7 Gwangjin-gu 29
## 8 Seongdong-gu 21
## 9 Yongsan-gu 19
## 10 Dongjak-gu 16
## # ... with 15 more rows
# 5. Distinct values in specified fields
filter(cs, !duplicated(custid))
## custid gender age marriage residence job
## 1 10070 여 28 미혼 Yongsan-gu 제조업
## 2 10139 여 28 미혼 Gangdong-gu 정보서비스
## 3 10208 여 28 미혼 Gwangjin-gu 제조업
## 4 10275 여 28 미혼 Eunpyeong-gu 개인사업
## 5 10350 남 28 미혼 Gangnam-gu 제조업
## 6 10425 남 28 미혼 Songpa-gu 교육기관
## 7 10499 여 28 미혼 Seodaemun-gu 개인사업
## 8 10567 여 28 미혼 Dongjak-gu 정보서비스
## 9 10632 여 28 미혼 Eunpyeong-gu 개인사업
## 10 10707 여 28 기혼 Dongjak-gu 개인사업
distinct(cs, custid)
## custid
## 1 10070
## 2 10139
## 3 10208
## 4 10275
## 5 10350
## 6 10425
## 7 10499
## 8 10567
## 9 10632
## 10 10707
# 6. Derive new fields based on existing fields
score <- read.csv("data/score.csv", header = T)
score
## ID exam1 exam2 quiz
## 1 1 3.3 2.0 3.7
## 2 2 4.0 4.0 4.0
## 3 3 2.3 0.0 3.3
## 4 4 2.2 1.0 3.3
## 5 5 3.1 1.2 3.9
score <- mutate(score, ExamSum = exam1 + exam2, ExamMean = ExamSum / 2)
score
## ID exam1 exam2 quiz ExamSum ExamMean
## 1 1 3.3 2.0 3.7 5.3 2.65
## 2 2 4.0 4.0 4.0 8.0 4.00
## 3 3 2.3 0.0 3.3 2.3 1.15
## 4 4 2.2 1.0 3.3 3.2 1.60
## 5 5 3.1 1.2 3.9 4.3 2.15
# 7. Replace with new values
score$extra <- c(1, 1, NA, NA, 2)
score
## ID exam1 exam2 quiz ExamSum ExamMean extra
## 1 1 3.3 2.0 3.7 5.3 2.65 1
## 2 2 4.0 4.0 4.0 8.0 4.00 1
## 3 3 2.3 0.0 3.3 2.3 1.15 NA
## 4 4 2.2 1.0 3.3 3.2 1.60 NA
## 5 5 3.1 1.2 3.9 4.3 2.15 2
score$extra[is.na(score$extra)] <- 0
score
## ID exam1 exam2 quiz ExamSum ExamMean extra
## 1 1 3.3 2.0 3.7 5.3 2.65 1
## 2 2 4.0 4.0 4.0 8.0 4.00 1
## 3 3 2.3 0.0 3.3 2.3 1.15 0
## 4 4 2.2 1.0 3.3 3.2 1.60 0
## 5 5 3.1 1.2 3.9 4.3 2.15 2
# 8. Merge records
score2 <- read.csv("data/score2.csv", header = T)
score2
## CID exam3 exam4 finalexam
## 1 1 3.1 2.2 3.5
## 2 2 3.9 4.5 4.0
## 3 3 2.2 3.9 3.7
## 4 4 2.1 1.1 3.3
## 5 5 3.0 2.0 3.8
merge(score, score2, by.x = "ID", by.y = "CID")
## ID exam1 exam2 quiz ExamSum ExamMean extra exam3 exam4 finalexam
## 1 1 3.3 2.0 3.7 5.3 2.65 1 3.1 2.2 3.5
## 2 2 4.0 4.0 4.0 8.0 4.00 1 3.9 4.5 4.0
## 3 3 2.3 0.0 3.3 2.3 1.15 0 2.2 3.9 3.7
## 4 4 2.2 1.0 3.3 3.2 1.60 0 2.1 1.1 3.3
## 5 5 3.1 1.2 3.9 4.3 2.15 2 3.0 2.0 3.8
# 9. Melt & Cast
library(reshape)
id <- c(1, 1, 1, 1, 2, 2, 2)
site <- c("a", "b", "c", "a", "a", "b", "b")
pageview <- c(1, 2, 3, 4, 5, 6, 7)
dwelltime <- c(7, 6, 5, 4, 3, 2, 1)
mydata <- data.frame(id, site, pageview, dwelltime)
mydata
## id site pageview dwelltime
## 1 1 a 1 7
## 2 1 b 2 6
## 3 1 c 3 5
## 4 1 a 4 4
## 5 2 a 5 3
## 6 2 b 6 2
## 7 2 b 7 1
tx <- melt(mydata, id = c("id", "site"))
tx
## id site variable value
## 1 1 a pageview 1
## 2 1 b pageview 2
## 3 1 c pageview 3
## 4 1 a pageview 4
## 5 2 a pageview 5
## 6 2 b pageview 6
## 7 2 b pageview 7
## 8 1 a dwelltime 7
## 9 1 b dwelltime 6
## 10 1 c dwelltime 5
## 11 1 a dwelltime 4
## 12 2 a dwelltime 3
## 13 2 b dwelltime 2
## 14 2 b dwelltime 1
cast(tx, id ~ site, sum)
## id a b c
## 1 1 16 8 8
## 2 2 8 16 0
cast(tx, id ~ site, sum, subset = variable == "pageview")
## id a b c
## 1 1 5 2 3
## 2 2 5 13 0
cast(tx, id + site ~ variable, sum)
## id site pageview dwelltime
## 1 1 a 5 11
## 2 1 b 2 6
## 3 1 c 3 5
## 4 2 a 5 3
## 5 2 b 13 3
cast(tx, id ~ variable, mean, subset = variable == "pageview")
## id pageview
## 1 1 2.5
## 2 2 6.0
cast(tx, id ~ variable, range)
## id pageview_X1 pageview_X2 dwelltime_X1 dwelltime_X2
## 1 1 1 4 4 7
## 2 2 5 7 1 3
# 10. Binning : Converts numeric fields into discrete pieces
cust <- tr %>% group_by(custid) %>% summarise(visits = n())
head(cust)
## # A tibble: 6 × 2
## custid visits
## <chr> <int>
## 1 10070 20
## 2 10139 16
## 3 10208 173
## 4 10275 7
## 5 10350 17
## 6 10425 3
summary(cust)
## custid visits
## Length:500 Min. : 1.00
## Class :character 1st Qu.: 10.00
## Mode :character Median : 24.00
## Mean : 38.25
## 3rd Qu.: 48.00
## Max. :268.00
cust$level <- cut(cust$visits, breaks = 10, labels = F)
head(cust);tail(cust)
## # A tibble: 6 × 3
## custid visits level
## <chr> <int> <int>
## 1 10070 20 1
## 2 10139 16 1
## 3 10208 173 7
## 4 10275 7 1
## 5 10350 17 1
## 6 10425 3 1
## # A tibble: 6 × 3
## custid visits level
## <chr> <int> <int>
## 1 48890 46 2
## 2 48997 171 7
## 3 49112 12 1
## 4 49232 15 1
## 5 49351 9 1
## 6 49470 40 2
cust$level <- cut(cust$visits, c(0, 50, 100, 150, 200, 300), labels = F)
head(cust);tail(cust)
## # A tibble: 6 × 3
## custid visits level
## <chr> <int> <int>
## 1 10070 20 1
## 2 10139 16 1
## 3 10208 173 4
## 4 10275 7 1
## 5 10350 17 1
## 6 10425 3 1
## # A tibble: 6 × 3
## custid visits level
## <chr> <int> <int>
## 1 48890 46 1
## 2 48997 171 4
## 3 49112 12 1
## 4 49232 15 1
## 5 49351 9 1
## 6 49470 40 1
# Practice
# 지점별 수입/국산품 판매건수
tr$importGubun <- ifelse(tr$import == 1, "수입", "국산")
table(tr$store, tr$importGubun)
##
## 국산 수입
## 무역점 4363 592
## 본점 3090 632
## 신촌점 5622 488
## 천호점 3988 350
# 남녀별 건당 구매액 최소, 중앙, 최대값
mg <- merge(cs, tr) # tr 수만큼 custid로 merge
mg %>% group_by(gender) %>% summarise_each(funs(min, median, max), amount)
## # A tibble: 2 × 4
## gender min median max
## <fctr> <int> <dbl> <int>
## 1 남 650 51020 8000000
## 2 여 840 54354 3930000
# 총구매액 상위 10명
tr %>% group_by(custid) %>% summarise(Total = sum(amount)) %>%
arrange(desc(Total)) %>%
head(10)
## # A tibble: 10 × 2
## custid Total
## <chr> <int>
## 1 42800 61672778
## 2 15968 44478591
## 3 13493 35480804
## 4 42322 34305296
## 5 48278 31569011
## 6 27074 29419710
## 7 32232 28308279
## 8 33829 26050612
## 9 30026 23227040
## 10 37340 20128817
library(ggplot2)
head(diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# 1
aggregate(price ~ cut, diamonds, mean)
## cut price
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
diamonds %>% group_by(cut) %>% summarise(price = mean(price))
## # A tibble: 5 × 2
## cut price
## <ord> <dbl>
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
# 2
aggregate(price ~ cut + color, diamonds, mean)
## cut color price
## 1 Fair D 4291.061
## 2 Good D 3405.382
## 3 Very Good D 3470.467
## 4 Premium D 3631.293
## 5 Ideal D 2629.095
## 6 Fair E 3682.312
## 7 Good E 3423.644
## 8 Very Good E 3214.652
## 9 Premium E 3538.914
## 10 Ideal E 2597.550
## 11 Fair F 3827.003
## 12 Good F 3495.750
## 13 Very Good F 3778.820
## 14 Premium F 4324.890
## 15 Ideal F 3374.939
## 16 Fair G 4239.255
## 17 Good G 4123.482
## 18 Very Good G 3872.754
## 19 Premium G 4500.742
## 20 Ideal G 3720.706
## 21 Fair H 5135.683
## 22 Good H 4276.255
## 23 Very Good H 4535.390
## 24 Premium H 5216.707
## 25 Ideal H 3889.335
## 26 Fair I 4685.446
## 27 Good I 5078.533
## 28 Very Good I 5255.880
## 29 Premium I 5946.181
## 30 Ideal I 4451.970
## 31 Fair J 4975.655
## 32 Good J 4574.173
## 33 Very Good J 5103.513
## 34 Premium J 6294.592
## 35 Ideal J 4918.186
diamonds %>% group_by(cut, color) %>% summarise(price = mean(price))
## Source: local data frame [35 x 3]
## Groups: cut [?]
##
## cut color price
## <ord> <ord> <dbl>
## 1 Fair D 4291.061
## 2 Fair E 3682.312
## 3 Fair F 3827.003
## 4 Fair G 4239.255
## 5 Fair H 5135.683
## 6 Fair I 4685.446
## 7 Fair J 4975.655
## 8 Good D 3405.382
## 9 Good E 3423.644
## 10 Good F 3495.750
## # ... with 25 more rows
# 3
aggregate(cbind(price, carat) ~ cut, diamonds, mean)
## cut price carat
## 1 Fair 4358.758 1.0461366
## 2 Good 3928.864 0.8491847
## 3 Very Good 3981.760 0.8063814
## 4 Premium 4584.258 0.8919549
## 5 Ideal 3457.542 0.7028370
diamonds %>% group_by(cut) %>% summarise_each(funs(mean), price, carat)
## # A tibble: 5 × 3
## cut price carat
## <ord> <dbl> <dbl>
## 1 Fair 4358.758 1.0461366
## 2 Good 3928.864 0.8491847
## 3 Very Good 3981.760 0.8063814
## 4 Premium 4584.258 0.8919549
## 5 Ideal 3457.542 0.7028370