ARM (Association Rule Mining) 연관규칙탐사
하나의 거래나 사건에 포함되어 있는 항목들의 경향을 파악해서 상호연관성을 발견하는 것
이론설명 자료 링크
장점
- 전문 지식이 필요치 않으며 결과에 대한 이해가 쉬움
- 도출된 규칙 간의 상호비교, 평가가 쉬움
- Undirected Data 분석에 유용
- 다양한 크기의 데이터에 적합
- 신경망이나 유전자알고리즘에 비해 단순
단점
- 문제의 크기가 커질수록 지수적으로 증가
- 데이터 속성에 대한 제한적 지원
- 항목에 대한 올바른 수 결정의 어려움
- 희박한 항목에 대해서는 문제
- 품목의 수에 비해 거래수가 충분치 못하면 신뢰확률이 낮은 연관규칙 발견 가능성
##### Mine Associations using arules package
library(arules)
library(dplyr)
tr <- read.csv("data/dataTransactions.csv", header = T, stringsAsFactors = T)
head(tr)
## datetime custid store product brand corner import
## 1 2000-05-01 10:43 18313 신촌점 4.104840e+12 샤넬 화장품 1
## 2 2000-05-01 11:00 18313 신촌점 2.700000e+12 식품 일반식품 0
## 3 2000-05-01 11:33 27222 신촌점 4.545371e+12 까사미아 가구 0
## 4 2000-05-01 11:43 27222 신촌점 4.500860e+12 대아통상 기타 0
## 5 2000-05-01 11:53 27222 신촌점 4.538130e+12 토이플러스 문화완구 0
## 6 2000-05-01 12:00 27222 신촌점 4.406010e+12 베베 유아동복 0
## amount installment
## 1 113000 3
## 2 91950 3
## 3 598000 3
## 4 20100 1
## 5 24000 1
## 6 28000 1
summary(tr)
## datetime custid store
## 2000-10-15 16:50: 10 Min. :10070 무역점:4955
## 2000-07-05 17:10: 7 1st Qu.:21235 본점 :3722
## 2000-09-29 18:00: 7 Median :31090 신촌점:6110
## 2000-10-02 12:30: 7 Mean :30119 천호점:4338
## 2000-10-15 16:20: 7 3rd Qu.:38586
## 2001-04-13 16:20: 7 Max. :49470
## (Other) :19080
## product brand corner import
## Min. :2.116e+12 식품 : 5267 일반식품 :5267 Min. :0.0000
## 1st Qu.:2.700e+12 지오다노 : 301 화장품 :2158 1st Qu.:0.0000
## Median :4.141e+12 랑콤 : 251 유아동복 :1345 Median :0.0000
## Mean :3.837e+12 크리니크 : 188 유니캐주얼:1138 Mean :0.1078
## 3rd Qu.:4.405e+12 샤넬 : 181 스포츠 :1129 3rd Qu.:0.0000
## Max. :6.555e+12 에스티로더: 177 피혁 : 931 Max. :1.0000
## (Other) :12760 (Other) :7157
## amount installment
## Min. : 650 Min. : 1.000
## 1st Qu.: 25000 1st Qu.: 1.000
## Median : 53000 Median : 1.000
## Mean : 109198 Mean : 1.788
## 3rd Qu.: 118000 3rd Qu.: 3.000
## Max. :8000000 Max. :12.000
##
# 고객 관점 : 고객이 1년동안 구매한 것을 하나의 트랜잭션으로 가정함.
# 아주 흔하게 발생하는 카테고리 제외.
tr.filter <- tr %>%
filter(!(corner %in% c("일반식품","화장품","기타"))) %>%
distinct(custid, corner)
head(tr.filter)
## custid corner
## 1 27222 가구
## 2 27222 문화완구
## 3 27222 유아동복
## 4 47084 스포츠
## 5 31090 스포츠
## 6 31090 영캐주얼
# arules의 transactions format으로 변환
trans <- as(split(tr.filter$corner, tr.filter$custid), "transactions")
trans
## transactions in sparse format with
## 487 transactions (rows) and
## 23 items (columns)
# raw data 바로 사용할 경우
# trans <- read.transactions("dataTransactions.tab", format = "single", sep="\t", cols = c(2,6), skip=1)
inspect(trans[1:2]) # transactionID = custid
## items
## [1] {니트단품,스포츠,영캐주얼,유니캐주얼,유아동복,장신구,캐릭터캐주얼}
## [2] {문화완구,섬유,스포츠,엘레강스캐주얼,영캐주얼,유아동복,타운모피}
## transactionID
## [1] 10070
## [2] 10139
transactionInfo(trans[size(trans) > 15])
## transactionID
## 3 10208
## 43 13072
## 84 15968
## 102 17279
## 143 20243
## 172 22385
## 202 24548
## 203 24621
## 286 31090
## 298 32004
## 301 32232
## 310 32923
## 359 36999
## 420 42322
## 425 42800
## 477 48278
# 구매 내역이 적으면 지지도를 낮춰야 하므로 전체적인 분포를 파악.
image(trans[1:5])
image(sample(trans, 100, replace = FALSE), main = "matrix diagram")
# 주로 구매가 발생하는 항목
itemFrequency(trans, type="absolute")
## 가구 가전 니트단품 도자기크리스탈
## 41 140 258 105
## 디자이너부띠끄 문화완구 생활용품 섬유
## 74 158 3 259
## 수입명품 스포츠 엘레강스캐주얼 영캐주얼
## 107 281 163 235
## 유니캐주얼 유아동복 장신구 정장셔츠
## 266 222 178 149
## 조리욕실 침구수예 캐릭터캐주얼 타운모피
## 165 74 212 33
## 트래디셔널캐주얼 피혁 행사장
## 189 323 13
itemFrequency(trans, type="relative")
## 가구 가전 니트단품 도자기크리스탈
## 0.084188912 0.287474333 0.529774127 0.215605749
## 디자이너부띠끄 문화완구 생활용품 섬유
## 0.151950719 0.324435318 0.006160164 0.531827515
## 수입명품 스포츠 엘레강스캐주얼 영캐주얼
## 0.219712526 0.577002053 0.334702259 0.482546201
## 유니캐주얼 유아동복 장신구 정장셔츠
## 0.546201232 0.455852156 0.365503080 0.305954825
## 조리욕실 침구수예 캐릭터캐주얼 타운모피
## 0.338809035 0.151950719 0.435318275 0.067761807
## 트래디셔널캐주얼 피혁 행사장
## 0.388090349 0.663244353 0.026694045
itemFrequency(trans)[order(itemFrequency(trans), decreasing = TRUE)]
## 피혁 스포츠 유니캐주얼 섬유
## 0.663244353 0.577002053 0.546201232 0.531827515
## 니트단품 영캐주얼 유아동복 캐릭터캐주얼
## 0.529774127 0.482546201 0.455852156 0.435318275
## 트래디셔널캐주얼 장신구 조리욕실 엘레강스캐주얼
## 0.388090349 0.365503080 0.338809035 0.334702259
## 문화완구 정장셔츠 가전 수입명품
## 0.324435318 0.305954825 0.287474333 0.219712526
## 도자기크리스탈 디자이너부띠끄 침구수예 가구
## 0.215605749 0.151950719 0.151950719 0.084188912
## 타운모피 행사장 생활용품
## 0.067761807 0.026694045 0.006160164
itemFrequencyPlot(trans, support=0.2, cex.names=0.8)
itemFrequencyPlot(trans, topN = 20, main = "support top 20 items")
# 연관규칙탐사
rules <- apriori(trans, parameter=list(support=0.2, confidence=0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.2 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 97
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[23 item(s), 487 transaction(s)] done [0.00s].
## sorting and recoding items ... [17 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [70 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules)
## set of 70 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 1 40 29
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 3.0 3.0 3.4 4.0 4.0
##
## summary of quality measures:
## support confidence lift
## Min. :0.2012 Min. :0.8000 Min. :1.233
## 1st Qu.:0.2115 1st Qu.:0.8182 1st Qu.:1.283
## Median :0.2259 Median :0.8413 Median :1.353
## Mean :0.2341 Mean :0.8444 Mean :1.383
## 3rd Qu.:0.2464 3rd Qu.:0.8624 3rd Qu.:1.463
## Max. :0.3265 Max. :0.9160 Max. :1.696
##
## mining info:
## data ntransactions support confidence
## trans 487 0.2 0.8
inspect(rules)
## lhs rhs support
## [1] {트래디셔널캐주얼} => {피혁} 0.3264887
## [2] {스포츠,엘레강스캐주얼} => {피혁} 0.2012320
## [3] {스포츠,조리욕실} => {피혁} 0.2053388
## [4] {조리욕실,피혁} => {스포츠} 0.2053388
## [5] {영캐주얼,장신구} => {피혁} 0.2012320
## [6] {유니캐주얼,장신구} => {피혁} 0.2012320
## [7] {니트단품,장신구} => {섬유} 0.2032854
## [8] {니트단품,장신구} => {피혁} 0.2114990
## [9] {섬유,장신구} => {피혁} 0.2156057
## [10] {스포츠,장신구} => {피혁} 0.2114990
.........
## [65] {니트단품,유니캐주얼,피혁} => {섬유} 0.2258727
## [66] {니트단품,스포츠,유니캐주얼} => {피혁} 0.2279261
## [67] {니트단품,유니캐주얼,피혁} => {스포츠} 0.2279261
## [68] {섬유,스포츠,유니캐주얼} => {피혁} 0.2361396
## [69] {섬유,유니캐주얼,피혁} => {스포츠} 0.2361396
## [70] {니트단품,섬유,스포츠} => {피혁} 0.2320329
## confidence lift
## [1] 0.8412698 1.268416
## [2] 0.8596491 1.296127
## [3] 0.8403361 1.267008
## [4] 0.8196721 1.420571
## [5] 0.8596491 1.296127
## [6] 0.8521739 1.284857
## [7] 0.8114754 1.525824
## [8] 0.8442623 1.272928
## [9] 0.8333333 1.256450
## [10] 0.8306452 1.252397
.........
## [65] 0.8029197 1.509737
## [66] 0.8809524 1.328247
## [67] 0.8102190 1.404187
## [68] 0.8914729 1.344109
## [69] 0.8156028 1.413518
## [70] 0.9040000 1.362997
inspect(sort(rules, by = "lift")[1:30])
## lhs rhs support
## [1] {니트단품,섬유,유니캐주얼} => {영캐주얼} 0.2032854
## [2] {니트단품,스포츠,유니캐주얼} => {영캐주얼} 0.2114990
## [3] {영캐주얼,트래디셔널캐주얼} => {니트단품} 0.2094456
## [4] {섬유,영캐주얼,피혁} => {니트단품} 0.2258727
## [5] {섬유,영캐주얼,유니캐주얼} => {니트단품} 0.2032854
## [6] {스포츠,영캐주얼,피혁} => {유니캐주얼} 0.2381930
## [7] {니트단품,스포츠,영캐주얼} => {유니캐주얼} 0.2114990
## [8] {섬유,스포츠,영캐주얼} => {니트단품} 0.2012320
## [9] {니트단품,장신구} => {섬유} 0.2032854
## [10] {니트단품,트래디셔널캐주얼} => {섬유} 0.2135524
## [11] {영캐주얼,트래디셔널캐주얼} => {유니캐주얼} 0.2053388
## [12] {니트단품,스포츠,영캐주얼} => {섬유} 0.2012320
## [13] {섬유,영캐주얼} => {니트단품} 0.2546201
## [14] {니트단품,유니캐주얼,피혁} => {섬유} 0.2258727
## [15] {스포츠,영캐주얼} => {유니캐주얼} 0.2710472
## [16] {섬유,스포츠,영캐주얼} => {유니캐주얼} 0.2032854
## [17] {섬유,영캐주얼,피혁} => {유니캐주얼} 0.2197125
## [18] {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456
## [19] {영캐주얼,트래디셔널캐주얼} => {스포츠} 0.2053388
## [20] {섬유,영캐주얼,유니캐주얼} => {스포츠} 0.2032854
## [21] {니트단품,영캐주얼,유니캐주얼} => {스포츠} 0.2114990
## [22] {조리욕실,피혁} => {스포츠} 0.2053388
## [23] {섬유,영캐주얼,피혁} => {스포츠} 0.2197125
## [24] {섬유,유니캐주얼,피혁} => {스포츠} 0.2361396
## [25] {니트단품,트래디셔널캐주얼} => {스포츠} 0.2156057
## [26] {유니캐주얼,트래디셔널캐주얼} => {스포츠} 0.2299795
## [27] {니트단품,유니캐주얼,피혁} => {스포츠} 0.2279261
## [28] {영캐주얼,유니캐주얼,피혁} => {스포츠} 0.2381930
## [29] {트래디셔널캐주얼,피혁} => {스포츠} 0.2628337
## [30] {섬유,트래디셔널캐주얼} => {스포츠} 0.2217659
## confidence lift
## [1] 0.8181818 1.695551
## [2] 0.8174603 1.694056
## [3] 0.8429752 1.591197
## [4] 0.8396947 1.585005
## [5] 0.8250000 1.557267
## [6] 0.8467153 1.550189
## [7] 0.8442623 1.545698
## [8] 0.8099174 1.528797
## [9] 0.8114754 1.525824
## [10] 0.8062016 1.515908
## [11] 0.8264463 1.513080
## [12] 0.8032787 1.510412
## [13] 0.8000000 1.510078
## [14] 0.8029197 1.509737
## [15] 0.8198758 1.501051
## [16] 0.8181818 1.497949
## [17] 0.8167939 1.495408
## [18] 0.8500000 1.473132
## [19] 0.8264463 1.432311
## [20] 0.8250000 1.429804
## [21] 0.8240000 1.428071
## [22] 0.8196721 1.420571
## [23] 0.8167939 1.415582
## [24] 0.8156028 1.413518
## [25] 0.8139535 1.410660
## [26] 0.8115942 1.406571
## [27] 0.8102190 1.404187
## [28] 0.8055556 1.396105
## [29] 0.8050314 1.395197
## [30] 0.8000000 1.386477
# 우측에 특정 카테고리가 있는 것만 탐색할 경우
rules_spo <- apriori(trans, parameter=list(support=0.2, confidence=0.8), appearance=list(rhs="스포츠",default="lhs"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.2 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 97
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[23 item(s), 487 transaction(s)] done [0.00s].
## sorting and recoding items ... [17 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [13 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules_spo)
## set of 13 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4
## 6 7
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 4.000 3.538 4.000 4.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.2033 Min. :0.8000 Min. :1.386
## 1st Qu.:0.2094 1st Qu.:0.8102 1st Qu.:1.404
## Median :0.2197 Median :0.8156 Median :1.414
## Mean :0.2221 Mean :0.8172 Mean :1.416
## 3rd Qu.:0.2300 3rd Qu.:0.8240 3rd Qu.:1.428
## Max. :0.2628 Max. :0.8500 Max. :1.473
##
## mining info:
## data ntransactions support confidence
## trans 487 0.2 0.8
inspect(rules_spo)
## lhs rhs support confidence
## [1] {조리욕실,피혁} => {스포츠} 0.2053388 0.8196721
## [2] {영캐주얼,트래디셔널캐주얼} => {스포츠} 0.2053388 0.8264463
## [3] {유니캐주얼,트래디셔널캐주얼} => {스포츠} 0.2299795 0.8115942
## [4] {니트단품,트래디셔널캐주얼} => {스포츠} 0.2156057 0.8139535
## [5] {섬유,트래디셔널캐주얼} => {스포츠} 0.2217659 0.8000000
## [6] {트래디셔널캐주얼,피혁} => {스포츠} 0.2628337 0.8050314
## [7] {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456 0.8500000
## [8] {니트단품,영캐주얼,유니캐주얼} => {스포츠} 0.2114990 0.8240000
## [9] {섬유,영캐주얼,유니캐주얼} => {스포츠} 0.2032854 0.8250000
## [10] {영캐주얼,유니캐주얼,피혁} => {스포츠} 0.2381930 0.8055556
## [11] {섬유,영캐주얼,피혁} => {스포츠} 0.2197125 0.8167939
## [12] {니트단품,유니캐주얼,피혁} => {스포츠} 0.2279261 0.8102190
## [13] {섬유,유니캐주얼,피혁} => {스포츠} 0.2361396 0.8156028
## lift
## [1] 1.420571
## [2] 1.432311
## [3] 1.406571
## [4] 1.410660
## [5] 1.386477
## [6] 1.395197
## [7] 1.473132
## [8] 1.428071
## [9] 1.429804
## [10] 1.396105
## [11] 1.415582
## [12] 1.404187
## [13] 1.413518
inspect(sort(rules_spo, by = "lift")[1:10])
## lhs rhs support confidence
## [1] {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456 0.8500000
## [2] {영캐주얼,트래디셔널캐주얼} => {스포츠} 0.2053388 0.8264463
## [3] {섬유,영캐주얼,유니캐주얼} => {스포츠} 0.2032854 0.8250000
## [4] {니트단품,영캐주얼,유니캐주얼} => {스포츠} 0.2114990 0.8240000
## [5] {조리욕실,피혁} => {스포츠} 0.2053388 0.8196721
## [6] {섬유,영캐주얼,피혁} => {스포츠} 0.2197125 0.8167939
## [7] {섬유,유니캐주얼,피혁} => {스포츠} 0.2361396 0.8156028
## [8] {니트단품,트래디셔널캐주얼} => {스포츠} 0.2156057 0.8139535
## [9] {유니캐주얼,트래디셔널캐주얼} => {스포츠} 0.2299795 0.8115942
## [10] {니트단품,유니캐주얼,피혁} => {스포츠} 0.2279261 0.8102190
## lift
## [1] 1.473132
## [2] 1.432311
## [3] 1.429804
## [4] 1.428071
## [5] 1.420571
## [6] 1.415582
## [7] 1.413518
## [8] 1.410660
## [9] 1.406571
## [10] 1.404187
# Lift 기준으로 filtering
rules.target <- subset(rules, rhs %in% "스포츠" & lift > 1.4)
inspect(sort(rules.target, by="confidence"))
## lhs rhs support confidence
## [1] {유니캐주얼,트래디셔널캐주얼,피혁} => {스포츠} 0.2094456 0.8500000
## [2] {영캐주얼,트래디셔널캐주얼} => {스포츠} 0.2053388 0.8264463
## [3] {섬유,영캐주얼,유니캐주얼} => {스포츠} 0.2032854 0.8250000
## [4] {니트단품,영캐주얼,유니캐주얼} => {스포츠} 0.2114990 0.8240000
## [5] {조리욕실,피혁} => {스포츠} 0.2053388 0.8196721
## [6] {섬유,영캐주얼,피혁} => {스포츠} 0.2197125 0.8167939
## [7] {섬유,유니캐주얼,피혁} => {스포츠} 0.2361396 0.8156028
## [8] {니트단품,트래디셔널캐주얼} => {스포츠} 0.2156057 0.8139535
## [9] {유니캐주얼,트래디셔널캐주얼} => {스포츠} 0.2299795 0.8115942
## [10] {니트단품,유니캐주얼,피혁} => {스포츠} 0.2279261 0.8102190
## lift
## [1] 1.473132
## [2] 1.432311
## [3] 1.429804
## [4] 1.428071
## [5] 1.420571
## [6] 1.415582
## [7] 1.413518
## [8] 1.410660
## [9] 1.406571
## [10] 1.404187
rule.interest <- subset(rules, items %in% c("장신구", "섬유")) # items : lhs, rhs 모두 검색
inspect(rule.interest[1:10])
## lhs rhs support confidence lift
## [1] {영캐주얼,장신구} => {피혁} 0.2012320 0.8596491 1.296127
## [2] {유니캐주얼,장신구} => {피혁} 0.2012320 0.8521739 1.284857
## [3] {니트단품,장신구} => {섬유} 0.2032854 0.8114754 1.525824
## [4] {니트단품,장신구} => {피혁} 0.2114990 0.8442623 1.272928
## [5] {섬유,장신구} => {피혁} 0.2156057 0.8333333 1.256450
## [6] {스포츠,장신구} => {피혁} 0.2114990 0.8306452 1.252397
## [7] {니트단품,트래디셔널캐주얼} => {섬유} 0.2135524 0.8062016 1.515908
## [8] {섬유,트래디셔널캐주얼} => {스포츠} 0.2217659 0.8000000 1.386477
## [9] {섬유,트래디셔널캐주얼} => {피혁} 0.2340862 0.8444444 1.273203
## [10] {섬유,유아동복} => {피혁} 0.2114990 0.8174603 1.232518
# csv 파일로 저장
# write(rules.target, file="arules.csv", sep=",", row.name=F)
# xml 파일로 저장
# library(pmml)
# write.PMML(rules.target, file = "arules.xml")
rule_df <- as(rules, "data.frame")
head(rule_df)
## rules support confidence lift
## 1 {트래디셔널캐주얼} => {피혁} 0.3264887 0.8412698 1.268416
## 2 {스포츠,엘레강스캐주얼} => {피혁} 0.2012320 0.8596491 1.296127
## 3 {스포츠,조리욕실} => {피혁} 0.2053388 0.8403361 1.267008
## 4 {조리욕실,피혁} => {스포츠} 0.2053388 0.8196721 1.420571
## 5 {영캐주얼,장신구} => {피혁} 0.2012320 0.8596491 1.296127
## 6 {유니캐주얼,장신구} => {피혁} 0.2012320 0.8521739 1.284857
##### Visualize Association Rules using arulesViz package
library(arulesViz)
plot(rules)
plot(sort(rules, by = "lift")[1:20], method = "grouped")
plot(rules, method = "graph", control = list(type="items"))
##### Exercise
# 여성쇼핑몰 C사 고객 786명의 10가지 구매품목에 대한 거래이력을 기초로
# 반응률이 높은 교차판매전략을 기획하기 위해 연관규칙탐사 수행.
# 고객id별로 각 품목에 대한 구매여부 0 or 1로 표시
data <- read.delim("data/shoppingmall.txt", stringsAsFactors=FALSE)
head(data)
## ID heel tee skirt knit jacket jewelry coat flat shorts blous
## 1 1 1 0 0 0 0 0 0 0 1 0
## 2 2 1 0 0 0 0 0 0 1 0 0
## 3 3 1 0 0 0 0 0 0 1 1 0
## 4 4 1 0 0 0 1 1 0 0 0 0
## 5 5 1 0 0 0 0 0 0 0 0 0
## 6 6 1 0 0 0 0 1 0 0 1 1
st <- as.matrix(data[,-1])
trans <- as(st, "transactions")
trans
## transactions in sparse format with
## 786 transactions (rows) and
## 10 items (columns)
head(inspect(trans))
## items
## [1] {heel,
## shorts}
## [2] {heel,
## flat}
## [3] {heel,
## flat,
## shorts}
## [4] {heel,
## jacket,
## jewelry}
## [5] {heel}
## [6] {heel,
## jewelry,
## shorts,
## blous}
image(trans[1:10])
image(sample(trans, 100, replace = FALSE), main = "matrix diagram")
itemFrequency(trans, type="absolute")
## heel tee skirt knit jacket jewelry coat flat shorts
## 387 316 310 65 148 337 23 78 373
## blous
## 358
itemFrequency(trans, type="relative")
## heel tee skirt knit jacket jewelry
## 0.49236641 0.40203562 0.39440204 0.08269720 0.18829517 0.42875318
## coat flat shorts blous
## 0.02926209 0.09923664 0.47455471 0.45547074
itemFrequency(trans)[order(itemFrequency(trans), decreasing = TRUE)]
## heel shorts blous jewelry tee skirt
## 0.49236641 0.47455471 0.45547074 0.42875318 0.40203562 0.39440204
## jacket flat knit coat
## 0.18829517 0.09923664 0.08269720 0.02926209
# 구매 내역이 적으면 지지도를 낮춰야 함.
itemFrequencyPlot(trans, support=0.2, cex.names=0.8)
itemFrequencyPlot(trans, topN = 10, main = "support top 20 items")
rules <- apriori(trans, parameter=list(support=0.07, confidence=0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.07 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 55
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 786 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [17 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules)
## set of 17 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4 5
## 1 11 5
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 4.000 4.000 4.235 5.000 5.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.07125 Min. :0.8000 Min. :1.654
## 1st Qu.:0.07252 1st Qu.:0.8143 1st Qu.:1.879
## Median :0.07761 Median :0.8316 Median :1.949
## Mean :0.07955 Mean :0.8327 Mean :1.924
## 3rd Qu.:0.07888 3rd Qu.:0.8472 3rd Qu.:2.004
## Max. :0.10051 Max. :0.8841 Max. :2.065
##
## mining info:
## data ntransactions support confidence
## trans 786 0.07 0.8
inspect(rules)
## lhs rhs support confidence
## [1] {tee,jacket} => {jewelry} 0.09033079 0.8352941
## [2] {tee,skirt,jacket} => {jewelry} 0.07251908 0.8636364
## [3] {skirt,jacket,jewelry} => {tee} 0.07251908 0.8028169
## [4] {tee,jacket,jewelry} => {skirt} 0.07251908 0.8028169
## [5] {heel,skirt,jacket} => {jewelry} 0.07124682 0.8000000
## [6] {tee,jacket,blous} => {jewelry} 0.07124682 0.8750000
## [7] {jacket,shorts,blous} => {jewelry} 0.07124682 0.8358209
## [8] {heel,jacket,blous} => {jewelry} 0.07888041 0.8378378
## [9] {heel,jacket,shorts} => {jewelry} 0.07379135 0.8055556
## [10] {skirt,jewelry,blous} => {heel} 0.10050891 0.8144330
## [11] {heel,skirt,blous} => {jewelry} 0.10050891 0.8315789
## [12] {tee,shorts,blous} => {jewelry} 0.09414758 0.8222222
## [13] {tee,skirt,jewelry,blous} => {heel} 0.07760814 0.8243243
## [14] {heel,tee,skirt,blous} => {jewelry} 0.07760814 0.8591549
## [15] {heel,tee,jewelry,shorts} => {skirt} 0.07251908 0.8142857
## [16] {skirt,jewelry,shorts,blous} => {heel} 0.07760814 0.8840580
## [17] {heel,skirt,shorts,blous} => {jewelry} 0.07760814 0.8472222
## lift
## [1] 1.948193
## [2] 2.014297
## [3] 1.996880
## [4] 2.035529
## [5] 1.865875
## [6] 2.040801
## [7] 1.949422
## [8] 1.954126
## [9] 1.878833
## [10] 1.654120
## [11] 1.939528
## [12] 1.917705
## [13] 1.674209
## [14] 2.003845
## [15] 2.064608
## [16] 1.795529
## [17] 1.976014
inspect(sort(rules, by = "lift"))
## lhs rhs support confidence
## [1] {heel,tee,jewelry,shorts} => {skirt} 0.07251908 0.8142857
## [2] {tee,jacket,blous} => {jewelry} 0.07124682 0.8750000
## [3] {tee,jacket,jewelry} => {skirt} 0.07251908 0.8028169
## [4] {tee,skirt,jacket} => {jewelry} 0.07251908 0.8636364
## [5] {heel,tee,skirt,blous} => {jewelry} 0.07760814 0.8591549
## [6] {skirt,jacket,jewelry} => {tee} 0.07251908 0.8028169
## [7] {heel,skirt,shorts,blous} => {jewelry} 0.07760814 0.8472222
## [8] {heel,jacket,blous} => {jewelry} 0.07888041 0.8378378
## [9] {jacket,shorts,blous} => {jewelry} 0.07124682 0.8358209
## [10] {tee,jacket} => {jewelry} 0.09033079 0.8352941
## [11] {heel,skirt,blous} => {jewelry} 0.10050891 0.8315789
## [12] {tee,shorts,blous} => {jewelry} 0.09414758 0.8222222
## [13] {heel,jacket,shorts} => {jewelry} 0.07379135 0.8055556
## [14] {heel,skirt,jacket} => {jewelry} 0.07124682 0.8000000
## [15] {skirt,jewelry,shorts,blous} => {heel} 0.07760814 0.8840580
## [16] {tee,skirt,jewelry,blous} => {heel} 0.07760814 0.8243243
## [17] {skirt,jewelry,blous} => {heel} 0.10050891 0.8144330
## lift
## [1] 2.064608
## [2] 2.040801
## [3] 2.035529
## [4] 2.014297
## [5] 2.003845
## [6] 1.996880
## [7] 1.976014
## [8] 1.954126
## [9] 1.949422
## [10] 1.948193
## [11] 1.939528
## [12] 1.917705
## [13] 1.878833
## [14] 1.865875
## [15] 1.795529
## [16] 1.674209
## [17] 1.654120
plot(rules)
plot(sort(rules, by = "lift"), method = "grouped")
plot(rules, method = "graph", control = list(type="items"))
# frequent itemset
rules2 <- apriori(trans, parameter=list(support=0.2, target = "frequent itemsets"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.2 1
## maxlen target ext
## 10 frequent itemsets FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 157
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 786 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [20 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules2)
## set of 20 itemsets
##
## most frequent items:
## heel tee jewelry shorts skirt (Other)
## 6 6 6 6 5 5
##
## element (itemset/transaction) length distribution:sizes
## 1 2
## 6 14
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 2.0 1.7 2.0 2.0
##
## summary of quality measures:
## support
## Min. :0.2074
## 1st Qu.:0.2160
## Median :0.2290
## Mean :0.2889
## 3rd Qu.:0.3963
## Max. :0.4924
##
## includes transaction ID lists: FALSE
##
## mining info:
## data ntransactions support confidence
## trans 786 0.2 1
inspect(sort(rules2, by = "support"))
## items support
## [1] {heel} 0.4923664
## [2] {shorts} 0.4745547
## [3] {blous} 0.4554707
## [4] {jewelry} 0.4287532
## [5] {tee} 0.4020356
## [6] {skirt} 0.3944020
## [7] {heel,jewelry} 0.2557252
## [8] {heel,shorts} 0.2442748
## [9] {jewelry,shorts} 0.2328244
## [10] {tee,skirt} 0.2302799
## [11] {jewelry,blous} 0.2277354
## [12] {shorts,blous} 0.2239186
## [13] {tee,jewelry} 0.2213740
## [14] {skirt,shorts} 0.2188295
## [15] {heel,blous} 0.2162850
## [16] {skirt,jewelry} 0.2150127
## [17] {tee,shorts} 0.2137405
## [18] {heel,skirt} 0.2124682
## [19] {heel,tee} 0.2111959
## [20] {tee,blous} 0.2073791