R DataMining 파생변수 도출

Data Mining - Customer Signature : 파생변수 만들기

# H 백화점 2000.0501 ~ 2001.04.29 : 고객정보, 구매정보 데이터

(링크) lubridate package 사용법

library(dplyr)
library(lubridate)
library(ggplot2)

cs <- read.delim("data/HDS_Customers.tab", stringsAsFactors = F)
tr <- read.delim("data/HDS_Transactions.tab", stringsAsFactors = F)
card <- read.delim("data/HDS_Cards.tab", stringsAsFactors = F)
job <- read.delim("data/HDS_Jobs.tab", stringsAsFactors = F)

# 데이터 탐색

cs %>% group_by(sex) %>% summarise(Count = n())

## # A tibble: 3 × 2
##     sex Count
##   <int> <int>
## 1     0    24
## 2     1 15162
## 3     2 34809

cs %>% group_by(h_type1) %>% summarise(Count = n()) %>% arrange(desc(Count))

## # A tibble: 8 × 2
##   h_type1 Count
##     <chr> <int>
## 1       A 28213
## 2       N 14260
## 3       V  4289
## 4       Z  3194
## 5       X    22
## 6       B    14
## 7       H     2
## 8  $null$     1

jobtype <- cs %>% group_by(job_stype) %>% summarise(Count = n()) %>% arrange(desc(Count))
jobtype

## # A tibble: 267 × 2
##    job_stype Count
##        <int> <int>
## 1       7060 16086
## 2       4200  2518
## 3       4260  1633
## 4       2810  1281
## 5       4000  1264
## 6       7050  1061
## 7       7000   993
## 8       3010   977
## 9       6030   917
## 10      4020   846
## # ... with 257 more rows

jobtype %>% left_join(job) %>% 
    group_by(job_nm_gr) %>% summarise(Count = sum(Count)) %>% arrange(desc(Count))

## Joining, by = "job_stype"

## # A tibble: 26 × 2
##         job_nm_gr Count
##             <chr> <int>
## 1        개인사업 18458
## 2          제조업  8050
## 3          그룹사  3493
## 4        교육기관  3445
## 5        금융기관  3003
## 6        의료기관  2224
## 7  일반정보서비스  1826
## 8        군인경찰  1637
## 9        국영기업  1009
## 10  전문직종사자1   977
## # ... with 16 more rows

distinct(card, card_sflg_nm)

##          card_sflg_nm
## 1  회장단 및 당사중역
## 2            금강병원
## 3            임대업주
## 4         퇴직자 Gold
## 5       퇴직자 silver
## 6       모니터 (폐지)
## 7    그룹퇴직자(폐지)
## 8         파트 ·촉탁
## 9                파트
## 10               판촉
## 11           용역업체
## 12           경인계열
## 13           법인카드
## 14           경인일반
## 15          경인일반2
## 16           경인골드
## 17               기타
## 18       부산점계열사
## 19         부산점일반
## 20         부산점골드
## 21           울산계열
## 22           울산일반
## 23           울산골드
## 24           광주계열
## 25           광주일반
## 26           광주골드

tr %>% group_by(hour = as.integer(sales_time/100)) %>% summarise(Count = n())

## # A tibble: 18 × 2
##     hour  Count
##    <int>  <int>
## 1      0      9
## 2      1      5
## 3      8      3
## 4      9    633
## 5     10  28061
## 6     11 120683
## 7     12 156342
## 8     13 174073
## 9     14 196450
## 10    15 223365
## 11    16 229652
## 12    17 221947
## 13    18 218425
## 14    19 144977
## 15    20  11568
## 16    21    228
## 17    22      7
## 18    23      2

tr %>% group_by(str_nm) %>% 
    summarise(Count = n(), Total = sum(as.numeric(net_amt))/1000000) %>% 
    arrange(desc(Total))

## # A tibble: 4 × 3
##   str_nm  Count    Total
##    <chr>  <int>    <dbl>
## 1   본점 496521 56269.54
## 2 무역점 478214 48308.16
## 3 신촌점 413848 32223.96
## 4 천호점 337847 27754.27

# 1. 고객의 환불형태(금액, 건수) : 환불 = net_amt <0

cs.v1 <- cs %>% select(custid)

refund <- tr %>% 
    filter(net_amt < 0) %>% 
    group_by(custid) %>% summarize(rf_amt = sum(net_amt), rf_cnt = n())

cs.v1 <- left_join(cs.v1, refund) %>%
    mutate(rf_amt = ifelse(is.na(rf_amt), 0, rf_amt),
           rf_cnt = ifelse(is.na(rf_cnt), 0, rf_cnt))

## Joining, by = "custid"

cs.v1[order(cs.v1$rf_amt), ] %>% head(10)

##       custid    rf_amt rf_cnt
## 49727  49732 -85353770     46
## 32527  32532 -72902000      3
## 45604  45609 -72892750     51
## 47460  47465 -66955550     40
## 35389  35394 -62948060     59
## 46403  46408 -50683350     38
## 39865  39870 -49813000     12
## 47879  47884 -48102874     54
## 17068  17073 -46655400     12
## 33990  33995 -43274220    103

cs.v1[order(cs.v1$rf_cnt, decreasing = T), ] %>% head(10)

##       custid    rf_amt rf_cnt
## 34167  34172 -27071540    137
## 48273  48278 -14990134    110
## 33990  33995 -43274220    103
## 23580  23585 -26782120     94
## 45308  45313  -6987057     92
## 39936  39941  -4361052     84
## 26723  26728 -22131010     78
## 33448  33453 -16004370     78
## 40225  40230  -6441618     76
## 34947  34952 -17942000     75

# 2. 고객의 구매상품 다양성

cs.v2 <- tr %>% distinct(custid, brd_nm) %>% 
    group_by(custid) %>% summarize(buy_brd = n())

cs.v2[order(cs.v2$buy_brd, decreasing = T), ]

## # A tibble: 49,995 × 2
##    custid buy_brd
##     <int>   <int>
## 1   43915     188
## 2   34717     182
## 3   43039     155
## 4   31665     147
## 5   15840     138
## 6   42935     138
## 7   44001     136
## 8   26295     134
## 9   32649     133
## 10  21192     128
## # ... with 49,985 more rows

# 3. 고객의 내점일수와 평균구매주기(API, average purchasing interval)

start_date = ymd(ymd_hms(min(tr$sales_date)))   # 2000-05-03 00:00:00
end_date = ymd(ymd_hms(max(tr$sales_date)))

cs.v3 <- tr %>% distinct(custid, sales_date) %>% 
    group_by(custid) %>% summarize(visits = n()) %>% 
    mutate(API = as.integer((end_date - start_date)/visits))

max(cs.v3$visits)

## [1] 249

filter(cs.v3, visits >= 100, API < 10 )

## # A tibble: 362 × 3
##    custid visits   API
##     <int>  <int> <int>
## 1       8    114     3
## 2     504    143     2
## 3     724    102     3
## 4    1300    113     3
## 5    3376    106     3
## 6    4185    128     2
## 7    5414    117     3
## 8    6474    100     3
## 9    7864    135     2
## 10   8314    103     3
## # ... with 352 more rows

# 4. 내점당구매건수(Number of Purchases Per Visit)

tmp <- tr %>% group_by(custid) %>% summarise(PCount = n())
tmp

## # A tibble: 49,995 × 2
##    custid PCount
##     <int>  <int>
## 1       1     77
## 2       2     28
## 3       3     68
## 4       4      6
## 5       5      4
## 6       6     29
## 7       7     11
## 8       8    243
## 9       9     28
## 10     10     41
## # ... with 49,985 more rows

cs.v4 <- inner_join(cs.v3, tmp) %>%
    mutate(NPPV = round(PCount / visits, 2)) %>%
    select(custid, NPPV)

## Joining, by = "custid"

cs.v4 %>% head(10)

## # A tibble: 10 × 2
##    custid  NPPV
##     <int> <dbl>
## 1       1  1.88
## 2       2  2.55
## 3       3  2.52
## 4       4  1.50
## 5       5  1.33
## 6       6  1.93
## 7       7  1.57
## 8       8  2.13
## 9       9  3.11
## 10     10  4.10

# 5. 고객의 주중/주말 구매패턴
# wday : 1 = sunday

cs.v5 <- tr %>%
    mutate(wk_amt = ifelse(wday(sales_date) %in% 2:6, net_amt, 0),
           we_amt = ifelse(wday(sales_date) %in% c(1,7), net_amt, 0)) %>%
    group_by(custid) %>% summarize_each(funs(sum), wk_amt, we_amt) %>%
    mutate(wk_pat = ifelse(wk_amt >= we_amt * 1.5, "주중형",
                    ifelse(we_amt >= wk_amt * 1.5, "주말형", "유형없음")))

cs.v5 %>% head(10)

## # A tibble: 10 × 4
##    custid  wk_amt  we_amt   wk_pat
##     <int>   <dbl>   <dbl>    <chr>
## 1       1 2590892 1167489   주중형
## 2       2 1735009  326170   주중형
## 3       3 5742729  683320   주중형
## 4       4  254300  127000   주중형
## 5       5  129600   26100   주중형
## 6       6  460900 1556500   주말형
## 7       7  767000 1903000   주말형
## 8       8 6917729 3465820   주중형
## 9       9  457580  554100 유형없음
## 10     10  365055  817120   주말형

ggplot(cs.v5, aes(wk_pat)) + geom_bar(aes(fill = wk_pat))

# 6. 고객의 생일로부터 특정시점의 나이와 연령대를 계산
# age == NA 이면 평균 연령으로

cs.v6 <- cs[c("custid", "birth")] %>%
    mutate(age = year('2001-05-01') - year(ymd_hms(birth))) %>%
    mutate(age = ifelse(age < 10 | age > 100, NA, age)) %>%
    mutate(age = ifelse(is.na(age), round(mean(age, na.rm = T)), age)) %>%
    mutate(agegrp = cut(age, c(0, 19, 29, 39, 49, 59, 69, 100), labels = F) * 10) %>%
    select(custid, age, agegrp)

## Warning: 7839 failed to parse.

ggplot(cs.v6, aes(agegrp)) + geom_bar(aes(fill = agegrp))

cs.v6 %>% group_by(agegrp) %>% summarise(n = n())

## # A tibble: 7 × 2
##   agegrp     n
##    <dbl> <int>
## 1     10     9
## 2     20 13784
## 3     30 22489
## 4     40  7703
## 5     50  4457
## 6     60  1310
## 7     70   243

# 7. 기간별(최근 12개월, 6개월, 3개월) 구매금액 및 구매횟수

end_date <- ymd(ymd_hms(max(tr$sales_date)))

start_date <- ymd('20010501') - months(12)   # 특정 일자 기준으로 12개월 전 날짜
cs.v7.12 <- tr %>%
    filter(sales_date >= start_date & sales_date <= end_date) %>%
    group_by(custid) %>% summarise(amt12 = sum(net_amt), nop12 = n())

start_date <- ymd('20010501') - months(6)
cs.v7.6 <- tr %>%
    filter(sales_date >= start_date & sales_date <= end_date) %>%
    group_by(custid) %>% summarise(amt6 = sum(net_amt), nop6 = n())

start_date <- ymd('20010501') - months(3)
cs.v7.3 <- tr %>%
    filter(sales_date >= start_date & sales_date <= end_date) %>%
    group_by(custid) %>% summarise(amt3 = sum(net_amt), nop3 = n())

cs.v7 <- left_join(cs.v7.12, cs.v7.6) %>% left_join(cs.v7.3) %>% 
    mutate(amt6 = ifelse(is.na(amt6), 0, amt6),
           nop6 = ifelse(is.na(nop6), 0, nop6),
           amt3 = ifelse(is.na(amt3), 0, amt3),
           nop3 = ifelse(is.na(nop3), 0, nop3))

## Joining, by = "custid"
## Joining, by = "custid"

cs.v7 %>% head(10)

## # A tibble: 10 × 7
##    custid    amt12 nop12    amt6  nop6    amt3  nop3
##     <int>    <int> <int>   <dbl> <dbl>   <dbl> <dbl>
## 1       1  3758381    77 1337108    34  511654    13
## 2       2  2061179    28  190929     7  190929     7
## 3       3  6426049    68 4021376    41 3227130    18
## 4       4   381300     6  381300     6   89700     2
## 5       5   155700     4  155700     4  155700     4
## 6       6  2017400    29  786060    14   18060     3
## 7       7  2670000    11 2670000    11 2670000    11
## 8       8 10383549   243 6040055   114 2425630    47
## 9       9  1011680    28 1011680    28 1011680    28
## 10     10  1182175    41 1182175    41 1182175    41

# Customer Signature

custsig<-cs%>%
    left_join(cs.v1) %>%
    left_join(cs.v2) %>%
    left_join(cs.v3) %>%
    left_join(cs.v4) %>%
    left_join(cs.v5) %>%
    left_join(cs.v6) %>%
    left_join(cs.v7)

## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"
## Joining, by = "custid"

DataMining 파생변수 도출 : Customer Signature

woosa7