ARM (Association Rule Mining) 연관규칙탐사
하나의 거래나 사건에 포함되어 있는 항목들의 경향을 파악해서 상호연관성을 발견하는 것
이론설명 자료 링크
장점
- 전문 지식이 필요치 않으며 결과에 대한 이해가 쉬움
- 도출된 규칙 간의 상호비교, 평가가 쉬움
- Undirected Data 분석에 유용
- 다양한 크기의 데이터에 적합
- 신경망이나 유전자알고리즘에 비해 단순

단점
- 문제의 크기가 커질수록 지수적으로 증가
- 데이터 속성에 대한 제한적 지원
- 항목에 대한 올바른 수 결정의 어려움
- 희박한 항목에 대해서는 문제
- 품목의 수에 비해 거래수가 충분치 못하면 신뢰확률이 낮은 연관규칙 발견 가능성
##### Mine Associations using arules package
library(arules)
library(dplyr)
tr <- read.csv("data/dataTransactions.csv", header = T, stringsAsFactors = T)
head(tr)
##           datetime custid  store      product      brand   corner import
## 1 2000-05-01 10:43  18313 신촌점 4.104840e+12       샤넬   화장품      1
## 2 2000-05-01 11:00  18313 신촌점 2.700000e+12       식품 일반식품      0
## 3 2000-05-01 11:33  27222 신촌점 4.545371e+12   까사미아     가구      0
## 4 2000-05-01 11:43  27222 신촌점 4.500860e+12   대아통상     기타      0
## 5 2000-05-01 11:53  27222 신촌점 4.538130e+12 토이플러스 문화완구      0
## 6 2000-05-01 12:00  27222 신촌점 4.406010e+12       베베 유아동복      0
##   amount installment
## 1 113000           3
## 2  91950           3
## 3 598000           3
## 4  20100           1
## 5  24000           1
## 6  28000           1
summary(tr)
##              datetime         custid         store     
##  2000-10-15 16:50:   10   Min.   :10070   무역점:4955  
##  2000-07-05 17:10:    7   1st Qu.:21235   본점  :3722  
##  2000-09-29 18:00:    7   Median :31090   신촌점:6110  
##  2000-10-02 12:30:    7   Mean   :30119   천호점:4338  
##  2000-10-15 16:20:    7   3rd Qu.:38586                
##  2001-04-13 16:20:    7   Max.   :49470                
##  (Other)         :19080                                
##     product                 brand              corner         import      
##  Min.   :2.116e+12   식품      : 5267   일반식품  :5267   Min.   :0.0000  
##  1st Qu.:2.700e+12   지오다노  :  301   화장품    :2158   1st Qu.:0.0000  
##  Median :4.141e+12   랑콤      :  251   유아동복  :1345   Median :0.0000  
##  Mean   :3.837e+12   크리니크  :  188   유니캐주얼:1138   Mean   :0.1078  
##  3rd Qu.:4.405e+12   샤넬      :  181   스포츠    :1129   3rd Qu.:0.0000  
##  Max.   :6.555e+12   에스티로더:  177   피혁      : 931   Max.   :1.0000  
##                      (Other)   :12760   (Other)   :7157                   
##      amount         installment    
##  Min.   :    650   Min.   : 1.000  
##  1st Qu.:  25000   1st Qu.: 1.000  
##  Median :  53000   Median : 1.000  
##  Mean   : 109198   Mean   : 1.788  
##  3rd Qu.: 118000   3rd Qu.: 3.000  
##  Max.   :8000000   Max.   :12.000  
## 
# 고객 관점 : 고객이 1년동안 구매한 것을 하나의 트랜잭션으로 가정함.
# 아주 흔하게 발생하는 카테고리 제외.
tr.filter <- tr %>%
  filter(!(corner %in% c("일반식품","화장품","기타"))) %>%
  distinct(custid, corner)

head(tr.filter)
##   custid   corner
## 1  27222     가구
## 2  27222 문화완구
## 3  27222 유아동복
## 4  47084   스포츠
## 5  31090   스포츠
## 6  31090 영캐주얼
# arules의 transactions format으로 변환
trans <- as(split(tr.filter$corner, tr.filter$custid), "transactions")
trans
## transactions in sparse format with
##  487 transactions (rows) and
##  23 items (columns)
# raw data 바로 사용할 경우
# trans <- read.transactions("dataTransactions.tab", format = "single", sep="\t", cols = c(2,6), skip=1)
inspect(trans[1:2])   # transactionID = custid
##     items                                                             
## [1] {니트단품,스포츠,영캐주얼,유니캐주얼,유아동복,장신구,캐릭터캐주얼}
## [2] {문화완구,섬유,스포츠,엘레강스캐주얼,영캐주얼,유아동복,타운모피}  
##     transactionID
## [1] 10070        
## [2] 10139
transactionInfo(trans[size(trans) > 15])
##     transactionID
## 3           10208
## 43          13072
## 84          15968
## 102         17279
## 143         20243
## 172         22385
## 202         24548
## 203         24621
## 286         31090
## 298         32004
## 301         32232
## 310         32923
## 359         36999
## 420         42322
## 425         42800
## 477         48278
# 구매 내역이 적으면 지지도를 낮춰야 하므로 전체적인 분포를 파악.
image(trans[1:5])

image(sample(trans, 100, replace = FALSE), main = "matrix diagram")

# 주로 구매가 발생하는 항목
itemFrequency(trans, type="absolute")
##             가구             가전         니트단품   도자기크리스탈 
##               41              140              258              105 
##   디자이너부띠끄         문화완구         생활용품             섬유 
##               74              158                3              259 
##         수입명품           스포츠   엘레강스캐주얼         영캐주얼 
##              107              281              163              235 
##       유니캐주얼         유아동복           장신구         정장셔츠 
##              266              222              178              149 
##         조리욕실         침구수예     캐릭터캐주얼         타운모피 
##              165               74              212               33 
## 트래디셔널캐주얼             피혁           행사장 
##              189              323               13
itemFrequency(trans, type="relative")
##             가구             가전         니트단품   도자기크리스탈 
##      0.084188912      0.287474333      0.529774127      0.215605749 
##   디자이너부띠끄         문화완구         생활용품             섬유 
##      0.151950719      0.324435318      0.006160164      0.531827515 
##         수입명품           스포츠   엘레강스캐주얼         영캐주얼 
##      0.219712526      0.577002053      0.334702259      0.482546201 
##       유니캐주얼         유아동복           장신구         정장셔츠 
##      0.546201232      0.455852156      0.365503080      0.305954825 
##         조리욕실         침구수예     캐릭터캐주얼         타운모피 
##      0.338809035      0.151950719      0.435318275      0.067761807 
## 트래디셔널캐주얼             피혁           행사장 
##      0.388090349      0.663244353      0.026694045
itemFrequency(trans)[order(itemFrequency(trans), decreasing = TRUE)]
##             피혁           스포츠       유니캐주얼             섬유 
##      0.663244353      0.577002053      0.546201232      0.531827515 
##         니트단품         영캐주얼         유아동복     캐릭터캐주얼 
##      0.529774127      0.482546201      0.455852156      0.435318275 
## 트래디셔널캐주얼           장신구         조리욕실   엘레강스캐주얼 
##      0.388090349      0.365503080      0.338809035      0.334702259 
##         문화완구         정장셔츠             가전         수입명품 
##      0.324435318      0.305954825      0.287474333      0.219712526 
##   도자기크리스탈   디자이너부띠끄         침구수예             가구 
##      0.215605749      0.151950719      0.151950719      0.084188912 
##         타운모피           행사장         생활용품 
##      0.067761807      0.026694045      0.006160164
itemFrequencyPlot(trans, support=0.2, cex.names=0.8)

itemFrequencyPlot(trans, topN = 20, main = "support top 20 items")

# 연관규칙탐사
rules <- apriori(trans, parameter=list(support=0.2, confidence=0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.2      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 97 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23 item(s), 487 transaction(s)] done [0.00s].
## sorting and recoding items ... [17 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [70 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(rules)
## set of 70 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
##  1 40 29 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     3.0     3.0     3.4     4.0     4.0 
## 
## summary of quality measures:
##     support         confidence          lift      
##  Min.   :0.2012   Min.   :0.8000   Min.   :1.233  
##  1st Qu.:0.2115   1st Qu.:0.8182   1st Qu.:1.283  
##  Median :0.2259   Median :0.8413   Median :1.353  
##  Mean   :0.2341   Mean   :0.8444   Mean   :1.383  
##  3rd Qu.:0.2464   3rd Qu.:0.8624   3rd Qu.:1.463  
##  Max.   :0.3265   Max.   :0.9160   Max.   :1.696  
## 
## mining info:
##   data ntransactions support confidence
##  trans           487     0.2        0.8
inspect(rules)
##      lhs                                     rhs          support  
## [1]  {트래디셔널캐주얼}                   => {피혁}       0.3264887
## [2]  {스포츠,엘레강스캐주얼}              => {피혁}       0.2012320
## [3]  {스포츠,조리욕실}                    => {피혁}       0.2053388
## [4]  {조리욕실,피혁}                      => {스포츠}     0.2053388
## [5]  {영캐주얼,장신구}                    => {피혁}       0.2012320
## [6]  {유니캐주얼,장신구}                  => {피혁}       0.2012320
## [7]  {니트단품,장신구}                    => {섬유}       0.2032854
## [8]  {니트단품,장신구}                    => {피혁}       0.2114990
## [9]  {섬유,장신구}                        => {피혁}       0.2156057
## [10] {스포츠,장신구}                      => {피혁}       0.2114990
    .........
## [65] {니트단품,유니캐주얼,피혁}           => {섬유}       0.2258727
## [66] {니트단품,스포츠,유니캐주얼}         => {피혁}       0.2279261
## [67] {니트단품,유니캐주얼,피혁}           => {스포츠}     0.2279261
## [68] {섬유,스포츠,유니캐주얼}             => {피혁}       0.2361396
## [69] {섬유,유니캐주얼,피혁}               => {스포츠}     0.2361396
## [70] {니트단품,섬유,스포츠}               => {피혁}       0.2320329
##      confidence lift    
## [1]  0.8412698  1.268416
## [2]  0.8596491  1.296127
## [3]  0.8403361  1.267008
## [4]  0.8196721  1.420571
## [5]  0.8596491  1.296127
## [6]  0.8521739  1.284857
## [7]  0.8114754  1.525824
## [8]  0.8442623  1.272928
## [9]  0.8333333  1.256450
## [10] 0.8306452  1.252397
    .........
## [65] 0.8029197  1.509737
## [66] 0.8809524  1.328247
## [67] 0.8102190  1.404187
## [68] 0.8914729  1.344109
## [69] 0.8156028  1.413518
## [70] 0.9040000  1.362997
inspect(sort(rules, by = "lift")[1:30])
##      lhs                                   rhs          support  
## [1]  {니트단품,섬유,유니캐주얼}         => {영캐주얼}   0.2032854
## [2]  {니트단품,스포츠,유니캐주얼}       => {영캐주얼}   0.2114990
## [3]  {영캐주얼,트래디셔널캐주얼}        => {니트단품}   0.2094456
## [4]  {섬유,영캐주얼,피혁}               => {니트단품}   0.2258727
## [5]  {섬유,영캐주얼,유니캐주얼}         => {니트단품}   0.2032854
## [6]  {스포츠,영캐주얼,피혁}             => {유니캐주얼} 0.2381930
## [7]  {니트단품,스포츠,영캐주얼}         => {유니캐주얼} 0.2114990
## [8]  {섬유,스포츠,영캐주얼}             => {니트단품}   0.2012320
## [9]  {니트단품,장신구}                  => {섬유}       0.2032854
## [10] {니트단품,트래디셔널캐주얼}        => {섬유}       0.2135524
## [11] {영캐주얼,트래디셔널캐주얼}        => {유니캐주얼} 0.2053388
## [12] {니트단품,스포츠,영캐주얼}         => {섬유}       0.2012320
## [13] {섬유,영캐주얼}                    => {니트단품}   0.2546201
## [14] {니트단품,유니캐주얼,피혁}         => {섬유}       0.2258727
## [15] {스포츠,영캐주얼}                  => {유니캐주얼} 0.2710472
## [16] {섬유,스포츠,영캐주얼}             => {유니캐주얼} 0.2032854
## [17] {섬유,영캐주얼,피혁}               => {유니캐주얼} 0.2197125
## [18] {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠}     0.2094456
## [19] {영캐주얼,트래디셔널캐주얼}        => {스포츠}     0.2053388
## [20] {섬유,영캐주얼,유니캐주얼}         => {스포츠}     0.2032854
## [21] {니트단품,영캐주얼,유니캐주얼}     => {스포츠}     0.2114990
## [22] {조리욕실,피혁}                    => {스포츠}     0.2053388
## [23] {섬유,영캐주얼,피혁}               => {스포츠}     0.2197125
## [24] {섬유,유니캐주얼,피혁}             => {스포츠}     0.2361396
## [25] {니트단품,트래디셔널캐주얼}        => {스포츠}     0.2156057
## [26] {유니캐주얼,트래디셔널캐주얼}      => {스포츠}     0.2299795
## [27] {니트단품,유니캐주얼,피혁}         => {스포츠}     0.2279261
## [28] {영캐주얼,유니캐주얼,피혁}         => {스포츠}     0.2381930
## [29] {트래디셔널캐주얼,피혁}            => {스포츠}     0.2628337
## [30] {섬유,트래디셔널캐주얼}            => {스포츠}     0.2217659
##      confidence lift    
## [1]  0.8181818  1.695551
## [2]  0.8174603  1.694056
## [3]  0.8429752  1.591197
## [4]  0.8396947  1.585005
## [5]  0.8250000  1.557267
## [6]  0.8467153  1.550189
## [7]  0.8442623  1.545698
## [8]  0.8099174  1.528797
## [9]  0.8114754  1.525824
## [10] 0.8062016  1.515908
## [11] 0.8264463  1.513080
## [12] 0.8032787  1.510412
## [13] 0.8000000  1.510078
## [14] 0.8029197  1.509737
## [15] 0.8198758  1.501051
## [16] 0.8181818  1.497949
## [17] 0.8167939  1.495408
## [18] 0.8500000  1.473132
## [19] 0.8264463  1.432311
## [20] 0.8250000  1.429804
## [21] 0.8240000  1.428071
## [22] 0.8196721  1.420571
## [23] 0.8167939  1.415582
## [24] 0.8156028  1.413518
## [25] 0.8139535  1.410660
## [26] 0.8115942  1.406571
## [27] 0.8102190  1.404187
## [28] 0.8055556  1.396105
## [29] 0.8050314  1.395197
## [30] 0.8000000  1.386477
# 우측에 특정 카테고리가 있는 것만 탐색할 경우
rules_spo <- apriori(trans, parameter=list(support=0.2, confidence=0.8), appearance=list(rhs="스포츠",default="lhs"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.2      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 97 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[23 item(s), 487 transaction(s)] done [0.00s].
## sorting and recoding items ... [17 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [13 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(rules_spo)
## set of 13 rules
## 
## rule length distribution (lhs + rhs):sizes
## 3 4 
## 6 7 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   3.000   4.000   3.538   4.000   4.000 
## 
## summary of quality measures:
##     support         confidence          lift      
##  Min.   :0.2033   Min.   :0.8000   Min.   :1.386  
##  1st Qu.:0.2094   1st Qu.:0.8102   1st Qu.:1.404  
##  Median :0.2197   Median :0.8156   Median :1.414  
##  Mean   :0.2221   Mean   :0.8172   Mean   :1.416  
##  3rd Qu.:0.2300   3rd Qu.:0.8240   3rd Qu.:1.428  
##  Max.   :0.2628   Max.   :0.8500   Max.   :1.473  
## 
## mining info:
##   data ntransactions support confidence
##  trans           487     0.2        0.8
inspect(rules_spo)
##      lhs                                   rhs      support   confidence
## [1]  {조리욕실,피혁}                    => {스포츠} 0.2053388 0.8196721 
## [2]  {영캐주얼,트래디셔널캐주얼}        => {스포츠} 0.2053388 0.8264463 
## [3]  {유니캐주얼,트래디셔널캐주얼}      => {스포츠} 0.2299795 0.8115942 
## [4]  {니트단품,트래디셔널캐주얼}        => {스포츠} 0.2156057 0.8139535 
## [5]  {섬유,트래디셔널캐주얼}            => {스포츠} 0.2217659 0.8000000 
## [6]  {트래디셔널캐주얼,피혁}            => {스포츠} 0.2628337 0.8050314 
## [7]  {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456 0.8500000 
## [8]  {니트단품,영캐주얼,유니캐주얼}     => {스포츠} 0.2114990 0.8240000 
## [9]  {섬유,영캐주얼,유니캐주얼}         => {스포츠} 0.2032854 0.8250000 
## [10] {영캐주얼,유니캐주얼,피혁}         => {스포츠} 0.2381930 0.8055556 
## [11] {섬유,영캐주얼,피혁}               => {스포츠} 0.2197125 0.8167939 
## [12] {니트단품,유니캐주얼,피혁}         => {스포츠} 0.2279261 0.8102190 
## [13] {섬유,유니캐주얼,피혁}             => {스포츠} 0.2361396 0.8156028 
##      lift    
## [1]  1.420571
## [2]  1.432311
## [3]  1.406571
## [4]  1.410660
## [5]  1.386477
## [6]  1.395197
## [7]  1.473132
## [8]  1.428071
## [9]  1.429804
## [10] 1.396105
## [11] 1.415582
## [12] 1.404187
## [13] 1.413518
inspect(sort(rules_spo, by = "lift")[1:10])
##      lhs                                   rhs      support   confidence
## [1]  {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456 0.8500000 
## [2]  {영캐주얼,트래디셔널캐주얼}        => {스포츠} 0.2053388 0.8264463 
## [3]  {섬유,영캐주얼,유니캐주얼}         => {스포츠} 0.2032854 0.8250000 
## [4]  {니트단품,영캐주얼,유니캐주얼}     => {스포츠} 0.2114990 0.8240000 
## [5]  {조리욕실,피혁}                    => {스포츠} 0.2053388 0.8196721 
## [6]  {섬유,영캐주얼,피혁}               => {스포츠} 0.2197125 0.8167939 
## [7]  {섬유,유니캐주얼,피혁}             => {스포츠} 0.2361396 0.8156028 
## [8]  {니트단품,트래디셔널캐주얼}        => {스포츠} 0.2156057 0.8139535 
## [9]  {유니캐주얼,트래디셔널캐주얼}      => {스포츠} 0.2299795 0.8115942 
## [10] {니트단품,유니캐주얼,피혁}         => {스포츠} 0.2279261 0.8102190 
##      lift    
## [1]  1.473132
## [2]  1.432311
## [3]  1.429804
## [4]  1.428071
## [5]  1.420571
## [6]  1.415582
## [7]  1.413518
## [8]  1.410660
## [9]  1.406571
## [10] 1.404187
# Lift 기준으로 filtering
rules.target <- subset(rules, rhs %in% "스포츠" & lift > 1.4)
inspect(sort(rules.target, by="confidence"))
##      lhs                                   rhs      support   confidence
## [1]  {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456 0.8500000 
## [2]  {영캐주얼,트래디셔널캐주얼}        => {스포츠} 0.2053388 0.8264463 
## [3]  {섬유,영캐주얼,유니캐주얼}         => {스포츠} 0.2032854 0.8250000 
## [4]  {니트단품,영캐주얼,유니캐주얼}     => {스포츠} 0.2114990 0.8240000 
## [5]  {조리욕실,피혁}                    => {스포츠} 0.2053388 0.8196721 
## [6]  {섬유,영캐주얼,피혁}               => {스포츠} 0.2197125 0.8167939 
## [7]  {섬유,유니캐주얼,피혁}             => {스포츠} 0.2361396 0.8156028 
## [8]  {니트단품,트래디셔널캐주얼}        => {스포츠} 0.2156057 0.8139535 
## [9]  {유니캐주얼,트래디셔널캐주얼}      => {스포츠} 0.2299795 0.8115942 
## [10] {니트단품,유니캐주얼,피혁}         => {스포츠} 0.2279261 0.8102190 
##      lift    
## [1]  1.473132
## [2]  1.432311
## [3]  1.429804
## [4]  1.428071
## [5]  1.420571
## [6]  1.415582
## [7]  1.413518
## [8]  1.410660
## [9]  1.406571
## [10] 1.404187
rule.interest <- subset(rules, items %in% c("장신구", "섬유"))  # items : lhs, rhs 모두 검색
inspect(rule.interest[1:10])
##      lhs                            rhs      support   confidence lift    
## [1]  {영캐주얼,장신구}           => {피혁}   0.2012320 0.8596491  1.296127
## [2]  {유니캐주얼,장신구}         => {피혁}   0.2012320 0.8521739  1.284857
## [3]  {니트단품,장신구}           => {섬유}   0.2032854 0.8114754  1.525824
## [4]  {니트단품,장신구}           => {피혁}   0.2114990 0.8442623  1.272928
## [5]  {섬유,장신구}               => {피혁}   0.2156057 0.8333333  1.256450
## [6]  {스포츠,장신구}             => {피혁}   0.2114990 0.8306452  1.252397
## [7]  {니트단품,트래디셔널캐주얼} => {섬유}   0.2135524 0.8062016  1.515908
## [8]  {섬유,트래디셔널캐주얼}     => {스포츠} 0.2217659 0.8000000  1.386477
## [9]  {섬유,트래디셔널캐주얼}     => {피혁}   0.2340862 0.8444444  1.273203
## [10] {섬유,유아동복}             => {피혁}   0.2114990 0.8174603  1.232518
# csv 파일로 저장
# write(rules.target, file="arules.csv", sep=",", row.name=F)

# xml 파일로 저장
# library(pmml)
# write.PMML(rules.target, file = "arules.xml")
rule_df <- as(rules, "data.frame")
head(rule_df)
##                               rules   support confidence     lift
## 1      {트래디셔널캐주얼} => {피혁} 0.3264887  0.8412698 1.268416
## 2 {스포츠,엘레강스캐주얼} => {피혁} 0.2012320  0.8596491 1.296127
## 3       {스포츠,조리욕실} => {피혁} 0.2053388  0.8403361 1.267008
## 4       {조리욕실,피혁} => {스포츠} 0.2053388  0.8196721 1.420571
## 5       {영캐주얼,장신구} => {피혁} 0.2012320  0.8596491 1.296127
## 6     {유니캐주얼,장신구} => {피혁} 0.2012320  0.8521739 1.284857
##### Visualize Association Rules using arulesViz package
library(arulesViz)
plot(rules)

plot(sort(rules, by = "lift")[1:20], method = "grouped")

plot(rules, method = "graph", control = list(type="items"))

##### Exercise
# 여성쇼핑몰 C사 고객 786명의 10가지 구매품목에 대한 거래이력을 기초로 
# 반응률이 높은 교차판매전략을 기획하기 위해 연관규칙탐사 수행.
# 고객id별로 각 품목에 대한 구매여부 0 or 1로 표시

data <- read.delim("data/shoppingmall.txt", stringsAsFactors=FALSE)
head(data)
##   ID heel tee skirt knit jacket jewelry coat flat shorts blous
## 1  1    1   0     0    0      0       0    0    0      1     0
## 2  2    1   0     0    0      0       0    0    1      0     0
## 3  3    1   0     0    0      0       0    0    1      1     0
## 4  4    1   0     0    0      1       1    0    0      0     0
## 5  5    1   0     0    0      0       0    0    0      0     0
## 6  6    1   0     0    0      0       1    0    0      1     1
st <- as.matrix(data[,-1])
trans <- as(st, "transactions")
trans
## transactions in sparse format with
##  786 transactions (rows) and
##  10 items (columns)
head(inspect(trans))
##       items    
## [1]   {heel,   
##        shorts} 
## [2]   {heel,   
##        flat}   
## [3]   {heel,   
##        flat,   
##        shorts} 
## [4]   {heel,   
##        jacket, 
##        jewelry}
## [5]   {heel}   
## [6]   {heel,   
##        jewelry,
##        shorts, 
##        blous}  
image(trans[1:10])

image(sample(trans, 100, replace = FALSE), main = "matrix diagram")

itemFrequency(trans, type="absolute")
##    heel     tee   skirt    knit  jacket jewelry    coat    flat  shorts 
##     387     316     310      65     148     337      23      78     373 
##   blous 
##     358
itemFrequency(trans, type="relative")
##       heel        tee      skirt       knit     jacket    jewelry 
## 0.49236641 0.40203562 0.39440204 0.08269720 0.18829517 0.42875318 
##       coat       flat     shorts      blous 
## 0.02926209 0.09923664 0.47455471 0.45547074
itemFrequency(trans)[order(itemFrequency(trans), decreasing = TRUE)]
##       heel     shorts      blous    jewelry        tee      skirt 
## 0.49236641 0.47455471 0.45547074 0.42875318 0.40203562 0.39440204 
##     jacket       flat       knit       coat 
## 0.18829517 0.09923664 0.08269720 0.02926209
# 구매 내역이 적으면 지지도를 낮춰야 함.
itemFrequencyPlot(trans, support=0.2, cex.names=0.8)

itemFrequencyPlot(trans, topN = 10, main = "support top 20 items")

rules <- apriori(trans, parameter=list(support=0.07, confidence=0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.07      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 55 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 786 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [17 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(rules)
## set of 17 rules
## 
## rule length distribution (lhs + rhs):sizes
##  3  4  5 
##  1 11  5 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   4.000   4.000   4.235   5.000   5.000 
## 
## summary of quality measures:
##     support          confidence          lift      
##  Min.   :0.07125   Min.   :0.8000   Min.   :1.654  
##  1st Qu.:0.07252   1st Qu.:0.8143   1st Qu.:1.879  
##  Median :0.07761   Median :0.8316   Median :1.949  
##  Mean   :0.07955   Mean   :0.8327   Mean   :1.924  
##  3rd Qu.:0.07888   3rd Qu.:0.8472   3rd Qu.:2.004  
##  Max.   :0.10051   Max.   :0.8841   Max.   :2.065  
## 
## mining info:
##   data ntransactions support confidence
##  trans           786    0.07        0.8
inspect(rules)
##      lhs                             rhs       support    confidence
## [1]  {tee,jacket}                 => {jewelry} 0.09033079 0.8352941 
## [2]  {tee,skirt,jacket}           => {jewelry} 0.07251908 0.8636364 
## [3]  {skirt,jacket,jewelry}       => {tee}     0.07251908 0.8028169 
## [4]  {tee,jacket,jewelry}         => {skirt}   0.07251908 0.8028169 
## [5]  {heel,skirt,jacket}          => {jewelry} 0.07124682 0.8000000 
## [6]  {tee,jacket,blous}           => {jewelry} 0.07124682 0.8750000 
## [7]  {jacket,shorts,blous}        => {jewelry} 0.07124682 0.8358209 
## [8]  {heel,jacket,blous}          => {jewelry} 0.07888041 0.8378378 
## [9]  {heel,jacket,shorts}         => {jewelry} 0.07379135 0.8055556 
## [10] {skirt,jewelry,blous}        => {heel}    0.10050891 0.8144330 
## [11] {heel,skirt,blous}           => {jewelry} 0.10050891 0.8315789 
## [12] {tee,shorts,blous}           => {jewelry} 0.09414758 0.8222222 
## [13] {tee,skirt,jewelry,blous}    => {heel}    0.07760814 0.8243243 
## [14] {heel,tee,skirt,blous}       => {jewelry} 0.07760814 0.8591549 
## [15] {heel,tee,jewelry,shorts}    => {skirt}   0.07251908 0.8142857 
## [16] {skirt,jewelry,shorts,blous} => {heel}    0.07760814 0.8840580 
## [17] {heel,skirt,shorts,blous}    => {jewelry} 0.07760814 0.8472222 
##      lift    
## [1]  1.948193
## [2]  2.014297
## [3]  1.996880
## [4]  2.035529
## [5]  1.865875
## [6]  2.040801
## [7]  1.949422
## [8]  1.954126
## [9]  1.878833
## [10] 1.654120
## [11] 1.939528
## [12] 1.917705
## [13] 1.674209
## [14] 2.003845
## [15] 2.064608
## [16] 1.795529
## [17] 1.976014
inspect(sort(rules, by = "lift"))
##      lhs                             rhs       support    confidence
## [1]  {heel,tee,jewelry,shorts}    => {skirt}   0.07251908 0.8142857 
## [2]  {tee,jacket,blous}           => {jewelry} 0.07124682 0.8750000 
## [3]  {tee,jacket,jewelry}         => {skirt}   0.07251908 0.8028169 
## [4]  {tee,skirt,jacket}           => {jewelry} 0.07251908 0.8636364 
## [5]  {heel,tee,skirt,blous}       => {jewelry} 0.07760814 0.8591549 
## [6]  {skirt,jacket,jewelry}       => {tee}     0.07251908 0.8028169 
## [7]  {heel,skirt,shorts,blous}    => {jewelry} 0.07760814 0.8472222 
## [8]  {heel,jacket,blous}          => {jewelry} 0.07888041 0.8378378 
## [9]  {jacket,shorts,blous}        => {jewelry} 0.07124682 0.8358209 
## [10] {tee,jacket}                 => {jewelry} 0.09033079 0.8352941 
## [11] {heel,skirt,blous}           => {jewelry} 0.10050891 0.8315789 
## [12] {tee,shorts,blous}           => {jewelry} 0.09414758 0.8222222 
## [13] {heel,jacket,shorts}         => {jewelry} 0.07379135 0.8055556 
## [14] {heel,skirt,jacket}          => {jewelry} 0.07124682 0.8000000 
## [15] {skirt,jewelry,shorts,blous} => {heel}    0.07760814 0.8840580 
## [16] {tee,skirt,jewelry,blous}    => {heel}    0.07760814 0.8243243 
## [17] {skirt,jewelry,blous}        => {heel}    0.10050891 0.8144330 
##      lift    
## [1]  2.064608
## [2]  2.040801
## [3]  2.035529
## [4]  2.014297
## [5]  2.003845
## [6]  1.996880
## [7]  1.976014
## [8]  1.954126
## [9]  1.949422
## [10] 1.948193
## [11] 1.939528
## [12] 1.917705
## [13] 1.878833
## [14] 1.865875
## [15] 1.795529
## [16] 1.674209
## [17] 1.654120
plot(rules)

plot(sort(rules, by = "lift"), method = "grouped")

plot(rules, method = "graph", control = list(type="items"))

# frequent itemset
rules2 <- apriori(trans, parameter=list(support=0.2, target = "frequent itemsets"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5     0.2      1
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 157 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 786 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [20 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(rules2)
## set of 20 itemsets
## 
## most frequent items:
##    heel     tee jewelry  shorts   skirt (Other) 
##       6       6       6       6       5       5 
## 
## element (itemset/transaction) length distribution:sizes
##  1  2 
##  6 14 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     1.0     2.0     1.7     2.0     2.0 
## 
## summary of quality measures:
##     support      
##  Min.   :0.2074  
##  1st Qu.:0.2160  
##  Median :0.2290  
##  Mean   :0.2889  
##  3rd Qu.:0.3963  
##  Max.   :0.4924  
## 
## includes transaction ID lists: FALSE 
## 
## mining info:
##   data ntransactions support confidence
##  trans           786     0.2          1
inspect(sort(rules2, by = "support"))
##      items            support  
## [1]  {heel}           0.4923664
## [2]  {shorts}         0.4745547
## [3]  {blous}          0.4554707
## [4]  {jewelry}        0.4287532
## [5]  {tee}            0.4020356
## [6]  {skirt}          0.3944020
## [7]  {heel,jewelry}   0.2557252
## [8]  {heel,shorts}    0.2442748
## [9]  {jewelry,shorts} 0.2328244
## [10] {tee,skirt}      0.2302799
## [11] {jewelry,blous}  0.2277354
## [12] {shorts,blous}   0.2239186
## [13] {tee,jewelry}    0.2213740
## [14] {skirt,shorts}   0.2188295
## [15] {heel,blous}     0.2162850
## [16] {skirt,jewelry}  0.2150127
## [17] {tee,shorts}     0.2137405
## [18] {heel,skirt}     0.2124682
## [19] {heel,tee}       0.2111959
## [20] {tee,blous}      0.2073791