# Data Mining Classification Process Demo
# PEP: Personal Equity Plan, 연금보험 구매 여부
library(caret)
library(ROCR)
library(C50)
library(e1071)
# 1. Data Preparation
# Train Set
train <- read.csv("data/pepTrainSet.csv", stringsAsFactors = T)
head(train)
## id age sex region income married children car save_act
## 1 ID12101 48 FEMALE INNER_CITY 17546.0 NO 1 NO NO
## 2 ID12102 40 MALE TOWN 30085.1 YES 3 YES NO
## 3 ID12103 51 FEMALE INNER_CITY 16575.4 YES 0 YES YES
## 4 ID12104 23 FEMALE TOWN 20375.4 YES 3 NO NO
## 5 ID12105 57 FEMALE RURAL 50576.3 YES 0 NO YES
## 6 ID12106 57 FEMALE TOWN 37869.6 YES 2 NO YES
## current_act mortgage pep
## 1 NO NO YES
## 2 YES YES NO
## 3 YES NO NO
## 4 YES NO NO
## 5 NO NO NO
## 6 YES NO YES
summary(train)
## id age sex region
## ID12101: 1 Min. :18.00 FEMALE:145 INNER_CITY:146
## ID12102: 1 1st Qu.:29.00 MALE :155 RURAL : 46
## ID12103: 1 Median :41.50 SUBURBAN : 23
## ID12104: 1 Mean :42.09 TOWN : 85
## ID12105: 1 3rd Qu.:54.00
## ID12106: 1 Max. :67.00
## (Other):294
## income married children car save_act current_act
## Min. : 5014 NO : 99 Min. :0.000 NO :157 NO : 97 NO : 63
## 1st Qu.:17099 YES:201 1st Qu.:0.000 YES:143 YES:203 YES:237
## Median :24718 Median :1.000
## Mean :27250 Mean :1.057
## 3rd Qu.:35350 3rd Qu.:2.000
## Max. :63130 Max. :3.000
##
## mortgage pep
## NO :203 NO :165
## YES: 97 YES:135
##
# Test Set
test <- read.csv("data/pepTestSet.csv", stringsAsFactors = T)
head(test)
## id age sex region income married children car save_act
## 1 ID12401 19 FEMALE INNER_CITY 8162.42 YES 1 YES YES
## 2 ID12402 37 FEMALE TOWN 15349.60 YES 0 NO YES
## 3 ID12403 45 FEMALE TOWN 29231.40 YES 0 NO YES
## 4 ID12404 49 MALE RURAL 41462.30 YES 3 NO YES
## 5 ID12405 67 FEMALE RURAL 57398.10 NO 3 NO YES
## 6 ID12406 35 FEMALE RURAL 11520.80 YES 0 NO NO
## current_act mortgage pep
## 1 YES YES NO
## 2 NO NO NO
## 3 NO NO NO
## 4 YES YES NO
## 5 YES NO YES
## 6 YES NO NO
summary(test)
## id age sex region
## ID12401: 1 Min. :18.0 FEMALE:155 INNER_CITY:123
## ID12402: 1 1st Qu.:31.0 MALE :145 RURAL : 50
## ID12403: 1 Median :43.0 SUBURBAN : 39
## ID12404: 1 Mean :42.7 TOWN : 88
## ID12405: 1 3rd Qu.:56.0
## ID12406: 1 Max. :67.0
## (Other):294
## income married children car save_act
## Min. : 6294 NO :105 Min. :0.0000 NO :147 NO : 89
## 1st Qu.:17739 YES:195 1st Qu.:0.0000 YES:153 YES:211
## Median :25691 Median :1.0000
## Mean :27798 Mean :0.9667
## 3rd Qu.:36611 3rd Qu.:2.0000
## Max. :61555 Max. :3.0000
##
## current_act mortgage pep
## NO : 82 NO :188 NO :161
## YES:218 YES:112 YES:139
##
train <- subset(train, select=-c(id)) # id 제거
test <- subset(test, select=-c(id))
# 예측에 사용한 New Data
newdata <- read.csv("data/pepNewCustomers.csv", stringsAsFactors = T)
head(newdata)
## id age sex region income married children car save_act
## 1 ID12701 23 MALE INNER_CITY 18766.90 YES 0 YES YES
## 2 ID12702 30 MALE RURAL 9915.67 NO 1 NO YES
## 3 ID12703 45 FEMALE RURAL 21881.60 NO 0 YES YES
## 4 ID12704 50 MALE TOWN 46794.40 YES 2 NO YES
## 5 ID12705 41 FEMALE INNER_CITY 20721.10 YES 0 YES YES
## 6 ID12706 20 MALE INNER_CITY 16688.50 NO 1 NO YES
## current_act mortgage
## 1 NO YES
## 2 NO YES
## 3 YES NO
## 4 NO YES
## 5 YES NO
## 6 YES YES
summary(newdata)
## id age sex region
## ID12701: 1 Min. :18.00 FEMALE:106 INNER_CITY:85
## ID12702: 1 1st Qu.:29.00 MALE : 94 RURAL :22
## ID12703: 1 Median :41.50 SUBURBAN :33
## ID12704: 1 Mean :41.77 TOWN :60
## ID12705: 1 3rd Qu.:54.00
## ID12706: 1 Max. :67.00
## (Other):194
## income married children car save_act current_act
## Min. : 5960 NO : 82 Min. :0.00 NO : 91 NO : 66 NO : 53
## 1st Qu.:17207 YES:118 1st Qu.:0.00 YES:109 YES:134 YES:147
## Median :23908 Median :1.00
## Mean :26767 Mean :1.11
## 3rd Qu.:35976 3rd Qu.:2.00
## Max. :61477 Max. :3.00
##
## mortgage
## NO :122
## YES: 78
##
newdata$id <- as.character(newdata$id) # id 변수 factor --> character로 변환
# 2. Modeling
# 1st candidate model : decision tree
c5_options <- C5.0Control(winnow = FALSE, noGlobalPruning = FALSE)
c5_model <- C5.0(pep ~ ., data = train, control = c5_options, rules = FALSE)
plot(c5_model)
pred_train <- predict(c5_model, train, type = "class")
confusionMatrix(pred_train, train$pep)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 159 13
## YES 6 122
##
## Accuracy : 0.9367
## 95% CI : (0.9029, 0.9614)
## No Information Rate : 0.55
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8714
## Mcnemar's Test P-Value : 0.1687
##
## Sensitivity : 0.9636
## Specificity : 0.9037
## Pos Pred Value : 0.9244
## Neg Pred Value : 0.9531
## Prevalence : 0.5500
## Detection Rate : 0.5300
## Detection Prevalence : 0.5733
## Balanced Accuracy : 0.9337
##
## 'Positive' Class : NO
##
# 2nd candidate model : logistic regression
lm_model <- glm(pep ~ ., data = train, family = binomial)
summary(lm_model)
##
## Call:
## glm(formula = pep ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1618 -0.9717 -0.6610 1.0941 2.0354
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.568e+00 5.666e-01 -2.767 0.00565 **
## age 1.593e-02 1.299e-02 1.227 0.21998
## sexMALE 6.887e-01 2.583e-01 2.666 0.00768 **
## regionRURAL 2.696e-01 3.702e-01 0.728 0.46641
## regionSUBURBAN -4.920e-02 4.905e-01 -0.100 0.92011
## regionTOWN -1.790e-01 2.960e-01 -0.605 0.54541
## income 3.863e-05 1.518e-05 2.545 0.01093 *
## marriedYES -6.382e-01 2.679e-01 -2.382 0.01720 *
## children -1.800e-01 1.195e-01 -1.506 0.13200
## carYES -2.357e-01 2.535e-01 -0.930 0.35241
## save_actYES -3.225e-01 2.786e-01 -1.158 0.24696
## current_actYES 3.680e-01 3.159e-01 1.165 0.24401
## mortgageYES -1.942e-01 2.696e-01 -0.720 0.47128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 412.88 on 299 degrees of freedom
## Residual deviance: 370.64 on 287 degrees of freedom
## AIC: 396.64
##
## Number of Fisher Scoring iterations: 4
# 3. Model Evaluation with Test Data by Confusion Matrix
# (1) C5 decision tree
test$c5_pred <- predict(c5_model, test, type = "class") # 예측결과
test$c5_pred_prob <- predict(c5_model, test, type = "prob") # 확률
confusionMatrix(test$c5_pred, test$pep)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 146 18
## YES 15 121
##
## Accuracy : 0.89
## 95% CI : (0.849, 0.9231)
## No Information Rate : 0.5367
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7785
## Mcnemar's Test P-Value : 0.7277
##
## Sensitivity : 0.9068
## Specificity : 0.8705
## Pos Pred Value : 0.8902
## Neg Pred Value : 0.8897
## Prevalence : 0.5367
## Detection Rate : 0.4867
## Detection Prevalence : 0.5467
## Balanced Accuracy : 0.8887
##
## 'Positive' Class : NO
##
head(test)
## age sex region income married children car save_act current_act
## 1 19 FEMALE INNER_CITY 8162.42 YES 1 YES YES YES
## 2 37 FEMALE TOWN 15349.60 YES 0 NO YES NO
## 3 45 FEMALE TOWN 29231.40 YES 0 NO YES NO
## 4 49 MALE RURAL 41462.30 YES 3 NO YES YES
## 5 67 FEMALE RURAL 57398.10 NO 3 NO YES YES
## 6 35 FEMALE RURAL 11520.80 YES 0 NO NO YES
## mortgage pep c5_pred c5_pred_prob.NO c5_pred_prob.YES
## 1 YES NO NO 0.92500000 0.07500000
## 2 NO NO NO 0.89250000 0.10750000
## 3 NO NO NO 0.89250000 0.10750000
## 4 YES NO NO 0.92096774 0.07903226
## 5 NO YES YES 0.09166667 0.90833333
## 6 NO NO NO 0.89250000 0.10750000
# (2) logistic regression
test$lm_pred <- ifelse(predict(lm_model, test, type = "response") > 0.5, "YES", "NO")
test$lm_pred_prob <- predict(lm_model, test, type = "response")
confusionMatrix(test$lm_pred, test$pep)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 108 71
## YES 53 68
##
## Accuracy : 0.5867
## 95% CI : (0.5286, 0.643)
## No Information Rate : 0.5367
## P-Value [Acc > NIR] : 0.04626
##
## Kappa : 0.1614
## Mcnemar's Test P-Value : 0.12685
##
## Sensitivity : 0.6708
## Specificity : 0.4892
## Pos Pred Value : 0.6034
## Neg Pred Value : 0.5620
## Prevalence : 0.5367
## Detection Rate : 0.3600
## Detection Prevalence : 0.5967
## Balanced Accuracy : 0.5800
##
## 'Positive' Class : NO
##
# 4. Model Evaluation by ROC chart
c5_pred <- prediction(test$c5_pred_prob[, "YES"], test$pep)
c5_model.perf <- performance(c5_pred, "tpr", "fpr") # True positive rate, False positive rate
lm_pred <- prediction(test$lm_pred_prob, test$pep)
lm_model.perf <- performance(lm_pred, "tpr", "fpr")
plot(c5_model.perf, col = "red")
plot(lm_model.perf, col = "blue", add=T)
legend(0.7, 0.7, c("C5 ","LM "), cex = 0.9, col = c("red", "blue"), lty = 1)
# 5. Deployment - 신규 데이터에 모델 적용
newdata$c5_pred <- predict(c5_model, newdata, type = "class")
newdata$c5_pred_prob <- predict(c5_model, newdata, type = "prob")
# 연금보험 가입확률이 0.8 이상인 사람만 추출
target <- subset(newdata, c5_pred == "YES" & c5_pred_prob[ ,"YES"] > 0.8)
head(target)
## id age sex region income married children car save_act
## 3 ID12703 45 FEMALE RURAL 21881.6 NO 0 YES YES
## 4 ID12704 50 MALE TOWN 46794.4 YES 2 NO YES
## 6 ID12706 20 MALE INNER_CITY 16688.5 NO 1 NO YES
## 8 ID12708 50 FEMALE INNER_CITY 27740.8 YES 1 YES YES
## 10 ID12710 57 FEMALE TOWN 19621.3 YES 1 YES NO
## 12 ID12712 26 FEMALE INNER_CITY 22378.5 NO 0 YES NO
## current_act mortgage c5_pred c5_pred_prob.NO c5_pred_prob.YES
## 3 YES NO YES 0.08500000 0.91500000
## 4 NO YES YES 0.05204082 0.94795918
## 6 YES YES YES 0.05344828 0.94655172
## 8 NO YES YES 0.07857143 0.92142857
## 10 YES NO YES 0.05344828 0.94655172
## 12 YES YES YES 0.13750000 0.86250000
selectedTarget <- target[order(target$c5_pred_prob[,"YES"], decreasing = T), ]
head(selectedTarget)
## id age sex region income married children car save_act
## 4 ID12704 50 MALE TOWN 46794.4 YES 2 NO YES
## 16 ID12716 44 MALE TOWN 34961.7 YES 1 NO NO
## 21 ID12721 40 MALE TOWN 37227.8 NO 1 YES YES
## 23 ID12723 54 MALE RURAL 49986.7 YES 1 YES YES
## 29 ID12729 58 MALE RURAL 41114.2 YES 2 YES YES
## 43 ID12743 66 MALE INNER_CITY 43720.4 NO 1 NO YES
## current_act mortgage c5_pred c5_pred_prob.NO c5_pred_prob.YES
## 4 NO YES YES 0.05204082 0.94795918
## 16 NO YES YES 0.05204082 0.94795918
## 21 YES NO YES 0.05204082 0.94795918
## 23 YES NO YES 0.05204082 0.94795918
## 29 YES YES YES 0.05204082 0.94795918
## 43 NO YES YES 0.05204082 0.94795918
write.csv(target[order(target$c5_pred_prob[,"YES"], decreasing=T), ], "dm_target.csv", row.names=FALSE)