Data Mining - Customer Signature : 파생변수 만들기
# H 백화점 2000.0501 ~ 2001.04.29 : 고객정보, 구매정보 데이터
(링크) lubridate package 사용법
library(dplyr)
library(lubridate)
library(ggplot2)
cs <- read.delim("data/HDS_Customers.tab", stringsAsFactors = F)
tr <- read.delim("data/HDS_Transactions.tab", stringsAsFactors = F)
card <- read.delim("data/HDS_Cards.tab", stringsAsFactors = F)
job <- read.delim("data/HDS_Jobs.tab", stringsAsFactors = F)
# 데이터 탐색
cs %>% group_by(sex) %>% summarise(Count = n())
## # A tibble: 3 × 2
## sex Count
## <int> <int>
## 1 0 24
## 2 1 15162
## 3 2 34809
cs %>% group_by(h_type1) %>% summarise(Count = n()) %>% arrange(desc(Count))
## # A tibble: 8 × 2
## h_type1 Count
## <chr> <int>
## 1 A 28213
## 2 N 14260
## 3 V 4289
## 4 Z 3194
## 5 X 22
## 6 B 14
## 7 H 2
## 8 $null$ 1
jobtype <- cs %>% group_by(job_stype) %>% summarise(Count = n()) %>% arrange(desc(Count))
jobtype
## # A tibble: 267 × 2
## job_stype Count
## <int> <int>
## 1 7060 16086
## 2 4200 2518
## 3 4260 1633
## 4 2810 1281
## 5 4000 1264
## 6 7050 1061
## 7 7000 993
## 8 3010 977
## 9 6030 917
## 10 4020 846
## # ... with 257 more rows
jobtype %>% left_join(job) %>%
group_by(job_nm_gr) %>% summarise(Count = sum(Count)) %>% arrange(desc(Count))
## Joining, by = "job_stype"
## # A tibble: 26 × 2
## job_nm_gr Count
## <chr> <int>
## 1 개인사업 18458
## 2 제조업 8050
## 3 그룹사 3493
## 4 교육기관 3445
## 5 금융기관 3003
## 6 의료기관 2224
## 7 일반정보서비스 1826
## 8 군인경찰 1637
## 9 국영기업 1009
## 10 전문직종사자1 977
## # ... with 16 more rows
distinct(card, card_sflg_nm)
## card_sflg_nm
## 1 회장단 및 당사중역
## 2 금강병원
## 3 임대업주
## 4 퇴직자 Gold
## 5 퇴직자 silver
## 6 모니터 (폐지)
## 7 그룹퇴직자(폐지)
## 8 파트 ·촉탁
## 9 파트
## 10 판촉
## 11 용역업체
## 12 경인계열
## 13 법인카드
## 14 경인일반
## 15 경인일반2
## 16 경인골드
## 17 기타
## 18 부산점계열사
## 19 부산점일반
## 20 부산점골드
## 21 울산계열
## 22 울산일반
## 23 울산골드
## 24 광주계열
## 25 광주일반
## 26 광주골드
tr %>% group_by(hour = as.integer(sales_time/100)) %>% summarise(Count = n())
## # A tibble: 18 × 2
## hour Count
## <int> <int>
## 1 0 9
## 2 1 5
## 3 8 3
## 4 9 633
## 5 10 28061
## 6 11 120683
## 7 12 156342
## 8 13 174073
## 9 14 196450
## 10 15 223365
## 11 16 229652
## 12 17 221947
## 13 18 218425
## 14 19 144977
## 15 20 11568
## 16 21 228
## 17 22 7
## 18 23 2
tr %>% group_by(str_nm) %>%
summarise(Count = n(), Total = sum(as.numeric(net_amt))/1000000) %>%
arrange(desc(Total))
## # A tibble: 4 × 3
## str_nm Count Total
## <chr> <int> <dbl>
## 1 본점 496521 56269.54
## 2 무역점 478214 48308.16
## 3 신촌점 413848 32223.96
## 4 천호점 337847 27754.27
# 1. 고객의 환불형태(금액, 건수) : 환불 = net_amt <0
cs.v1 <- cs %>% select(custid)
refund <- tr %>%
filter(net_amt < 0) %>%
group_by(custid) %>% summarize(rf_amt = sum(net_amt), rf_cnt = n())
cs.v1 <- left_join(cs.v1, refund) %>%
mutate(rf_amt = ifelse(is.na(rf_amt), 0, rf_amt),
rf_cnt = ifelse(is.na(rf_cnt), 0, rf_cnt))
## Joining, by = "custid"
cs.v1[order(cs.v1$rf_amt), ] %>% head(10)
## custid rf_amt rf_cnt
## 49727 49732 -85353770 46
## 32527 32532 -72902000 3
## 45604 45609 -72892750 51
## 47460 47465 -66955550 40
## 35389 35394 -62948060 59
## 46403 46408 -50683350 38
## 39865 39870 -49813000 12
## 47879 47884 -48102874 54
## 17068 17073 -46655400 12
## 33990 33995 -43274220 103
cs.v1[order(cs.v1$rf_cnt, decreasing = T), ] %>% head(10)
## custid rf_amt rf_cnt
## 34167 34172 -27071540 137
## 48273 48278 -14990134 110
## 33990 33995 -43274220 103
## 23580 23585 -26782120 94
## 45308 45313 -6987057 92
## 39936 39941 -4361052 84
## 26723 26728 -22131010 78
## 33448 33453 -16004370 78
## 40225 40230 -6441618 76
## 34947 34952 -17942000 75
# 2. 고객의 구매상품 다양성
cs.v2 <- tr %>% distinct(custid, brd_nm) %>%
group_by(custid) %>% summarize(buy_brd = n())
cs.v2[order(cs.v2$buy_brd, decreasing = T), ]
## # A tibble: 49,995 × 2
## custid buy_brd
## <int> <int>
## 1 43915 188
## 2 34717 182
## 3 43039 155
## 4 31665 147
## 5 15840 138
## 6 42935 138
## 7 44001 136
## 8 26295 134
## 9 32649 133
## 10 21192 128
## # ... with 49,985 more rows
# 3. 고객의 내점일수와 평균구매주기(API, average purchasing interval)
start_date = ymd(ymd_hms(min(tr$sales_date))) # 2000-05-03 00:00:00
end_date = ymd(ymd_hms(max(tr$sales_date)))
cs.v3 <- tr %>% distinct(custid, sales_date) %>%
group_by(custid) %>% summarize(visits = n()) %>%
mutate(API = as.integer((end_date - start_date)/visits))
max(cs.v3$visits)
## [1] 249
filter(cs.v3, visits >= 100, API < 10 )
## # A tibble: 362 × 3
## custid visits API
## <int> <int> <int>
## 1 8 114 3
## 2 504 143 2
## 3 724 102 3
## 4 1300 113 3
## 5 3376 106 3
## 6 4185 128 2
## 7 5414 117 3
## 8 6474 100 3
## 9 7864 135 2
## 10 8314 103 3
## # ... with 352 more rows
# 4. 내점당구매건수(Number of Purchases Per Visit)
tmp <- tr %>% group_by(custid) %>% summarise(PCount = n())
tmp
## # A tibble: 49,995 × 2
## custid PCount
## <int> <int>
## 1 1 77
## 2 2 28
## 3 3 68
## 4 4 6
## 5 5 4
## 6 6 29
## 7 7 11
## 8 8 243
## 9 9 28
## 10 10 41
## # ... with 49,985 more rows
cs.v4 <- inner_join(cs.v3, tmp) %>%
mutate(NPPV = round(PCount / visits, 2)) %>%
select(custid, NPPV)
## Joining, by = "custid"
cs.v4 %>% head(10)
## # A tibble: 10 × 2
## custid NPPV
## <int> <dbl>
## 1 1 1.88
## 2 2 2.55
## 3 3 2.52
## 4 4 1.50
## 5 5 1.33
## 6 6 1.93
## 7 7 1.57
## 8 8 2.13
## 9 9 3.11
## 10 10 4.10
# 5. 고객의 주중/주말 구매패턴
# wday : 1 = sunday
cs.v5 <- tr %>%
mutate(wk_amt = ifelse(wday(sales_date) %in% 2:6, net_amt, 0),
we_amt = ifelse(wday(sales_date) %in% c(1,7), net_amt, 0)) %>%
group_by(custid) %>% summarize_each(funs(sum), wk_amt, we_amt) %>%
mutate(wk_pat = ifelse(wk_amt >= we_amt * 1.5, "주중형",
ifelse(we_amt >= wk_amt * 1.5, "주말형", "유형없음")))
cs.v5 %>% head(10)
## # A tibble: 10 × 4
## custid wk_amt we_amt wk_pat
## <int> <dbl> <dbl> <chr>
## 1 1 2590892 1167489 주중형
## 2 2 1735009 326170 주중형
## 3 3 5742729 683320 주중형
## 4 4 254300 127000 주중형
## 5 5 129600 26100 주중형
## 6 6 460900 1556500 주말형
## 7 7 767000 1903000 주말형
## 8 8 6917729 3465820 주중형
## 9 9 457580 554100 유형없음
## 10 10 365055 817120 주말형
ggplot(cs.v5, aes(wk_pat)) + geom_bar(aes(fill = wk_pat))
# 6. 고객의 생일로부터 특정시점의 나이와 연령대를 계산
# age == NA 이면 평균 연령으로
cs.v6 <- cs[c("custid", "birth")] %>%
mutate(age = year('2001-05-01') - year(ymd_hms(birth))) %>%
mutate(age = ifelse(age < 10 | age > 100, NA, age)) %>%
mutate(age = ifelse(is.na(age), round(mean(age, na.rm = T)), age)) %>%
mutate(agegrp = cut(age, c(0, 19, 29, 39, 49, 59, 69, 100), labels = F) * 10) %>%
select(custid, age, agegrp)
## Warning: 7839 failed to parse.
ggplot(cs.v6, aes(agegrp)) + geom_bar(aes(fill = agegrp))
cs.v6 %>% group_by(agegrp) %>% summarise(n = n())
## # A tibble: 7 × 2
## agegrp n
## <dbl> <int>
## 1 10 9
## 2 20 13784
## 3 30 22489
## 4 40 7703
## 5 50 4457
## 6 60 1310
## 7 70 243
# 7. 기간별(최근 12개월, 6개월, 3개월) 구매금액 및 구매횟수
end_date <- ymd(ymd_hms(max(tr$sales_date)))
start_date <- ymd('20010501') - months(12) # 특정 일자 기준으로 12개월 전 날짜
cs.v7.12 <- tr %>%
filter(sales_date >= start_date & sales_date <= end_date) %>%
group_by(custid) %>% summarise(amt12 = sum(net_amt), nop12 = n())
start_date <- ymd('20010501') - months(6)
cs.v7.6 <- tr %>%
filter(sales_date >= start_date & sales_date <= end_date) %>%
group_by(custid) %>% summarise(amt6 = sum(net_amt), nop6 = n())
start_date <- ymd('20010501') - months(3)
cs.v7.3 <- tr %>%
filter(sales_date >= start_date & sales_date <= end_date) %>%
group_by(custid) %>% summarise(amt3 = sum(net_amt), nop3 = n())
cs.v7 <- left_join(cs.v7.12, cs.v7.6) %>% left_join(cs.v7.3) %>%
mutate(amt6 = ifelse(is.na(amt6), 0, amt6),
nop6 = ifelse(is.na(nop6), 0, nop6),
amt3 = ifelse(is.na(amt3), 0, amt3),
nop3 = ifelse(is.na(nop3), 0, nop3))
## Joining, by = "custid"
## Joining, by = "custid"
cs.v7 %>% head(10)
## # A tibble: 10 × 7
## custid amt12 nop12 amt6 nop6 amt3 nop3
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 3758381 77 1337108 34 511654 13
## 2 2 2061179 28 190929 7 190929 7
## 3 3 6426049 68 4021376 41 3227130 18
## 4 4 381300 6 381300 6 89700 2
## 5 5 155700 4 155700 4 155700 4
## 6 6 2017400 29 786060 14 18060 3
## 7 7 2670000 11 2670000 11 2670000 11
## 8 8 10383549 243 6040055 114 2425630 47
## 9 9 1011680 28 1011680 28 1011680 28
## 10 10 1182175 41 1182175 41 1182175 41
# Customer Signature
custsig<-cs%>%
left_join(cs.v1) %>%
left_join(cs.v2) %>%
left_join(cs.v3) %>%
left_join(cs.v4) %>%
left_join(cs.v5) %>%
left_join(cs.v6) %>%
left_join(cs.v7)
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"