R 데이터마이닝 예측분석

# Data Mining Classification Process Demo
# PEP: Personal Equity Plan, 연금보험 구매 여부

library(caret)

library(ROCR)

library(C50)
library(e1071)

# 1. Data Preparation

# Train Set
train <- read.csv("data/pepTrainSet.csv", stringsAsFactors = T)
head(train)

##        id age    sex     region  income married children car save_act
## 1 ID12101  48 FEMALE INNER_CITY 17546.0      NO        1  NO       NO
## 2 ID12102  40   MALE       TOWN 30085.1     YES        3 YES       NO
## 3 ID12103  51 FEMALE INNER_CITY 16575.4     YES        0 YES      YES
## 4 ID12104  23 FEMALE       TOWN 20375.4     YES        3  NO       NO
## 5 ID12105  57 FEMALE      RURAL 50576.3     YES        0  NO      YES
## 6 ID12106  57 FEMALE       TOWN 37869.6     YES        2  NO      YES
##   current_act mortgage pep
## 1          NO       NO YES
## 2         YES      YES  NO
## 3         YES       NO  NO
## 4         YES       NO  NO
## 5          NO       NO  NO
## 6         YES       NO YES

summary(train)

##        id           age            sex             region   
##  ID12101:  1   Min.   :18.00   FEMALE:145   INNER_CITY:146  
##  ID12102:  1   1st Qu.:29.00   MALE  :155   RURAL     : 46  
##  ID12103:  1   Median :41.50                SUBURBAN  : 23  
##  ID12104:  1   Mean   :42.09                TOWN      : 85  
##  ID12105:  1   3rd Qu.:54.00                                
##  ID12106:  1   Max.   :67.00                                
##  (Other):294                                                
##      income      married      children      car      save_act  current_act
##  Min.   : 5014   NO : 99   Min.   :0.000   NO :157   NO : 97   NO : 63    
##  1st Qu.:17099   YES:201   1st Qu.:0.000   YES:143   YES:203   YES:237    
##  Median :24718             Median :1.000                                  
##  Mean   :27250             Mean   :1.057                                  
##  3rd Qu.:35350             3rd Qu.:2.000                                  
##  Max.   :63130             Max.   :3.000                                  
##                                                                           
##  mortgage   pep     
##  NO :203   NO :165  
##  YES: 97   YES:135  
##

# Test Set
test <- read.csv("data/pepTestSet.csv", stringsAsFactors = T)
head(test)

##        id age    sex     region   income married children car save_act
## 1 ID12401  19 FEMALE INNER_CITY  8162.42     YES        1 YES      YES
## 2 ID12402  37 FEMALE       TOWN 15349.60     YES        0  NO      YES
## 3 ID12403  45 FEMALE       TOWN 29231.40     YES        0  NO      YES
## 4 ID12404  49   MALE      RURAL 41462.30     YES        3  NO      YES
## 5 ID12405  67 FEMALE      RURAL 57398.10      NO        3  NO      YES
## 6 ID12406  35 FEMALE      RURAL 11520.80     YES        0  NO       NO
##   current_act mortgage pep
## 1         YES      YES  NO
## 2          NO       NO  NO
## 3          NO       NO  NO
## 4         YES      YES  NO
## 5         YES       NO YES
## 6         YES       NO  NO

summary(test)

##        id           age           sex             region   
##  ID12401:  1   Min.   :18.0   FEMALE:155   INNER_CITY:123  
##  ID12402:  1   1st Qu.:31.0   MALE  :145   RURAL     : 50  
##  ID12403:  1   Median :43.0                SUBURBAN  : 39  
##  ID12404:  1   Mean   :42.7                TOWN      : 88  
##  ID12405:  1   3rd Qu.:56.0                                
##  ID12406:  1   Max.   :67.0                                
##  (Other):294                                               
##      income      married      children       car      save_act 
##  Min.   : 6294   NO :105   Min.   :0.0000   NO :147   NO : 89  
##  1st Qu.:17739   YES:195   1st Qu.:0.0000   YES:153   YES:211  
##  Median :25691             Median :1.0000                      
##  Mean   :27798             Mean   :0.9667                      
##  3rd Qu.:36611             3rd Qu.:2.0000                      
##  Max.   :61555             Max.   :3.0000                      
##                                                                
##  current_act mortgage   pep     
##  NO : 82     NO :188   NO :161  
##  YES:218     YES:112   YES:139  
##

train <- subset(train, select=-c(id))   # id 제거
test <- subset(test, select=-c(id))

# 예측에 사용한 New Data
newdata <- read.csv("data/pepNewCustomers.csv", stringsAsFactors = T)
head(newdata)

##        id age    sex     region   income married children car save_act
## 1 ID12701  23   MALE INNER_CITY 18766.90     YES        0 YES      YES
## 2 ID12702  30   MALE      RURAL  9915.67      NO        1  NO      YES
## 3 ID12703  45 FEMALE      RURAL 21881.60      NO        0 YES      YES
## 4 ID12704  50   MALE       TOWN 46794.40     YES        2  NO      YES
## 5 ID12705  41 FEMALE INNER_CITY 20721.10     YES        0 YES      YES
## 6 ID12706  20   MALE INNER_CITY 16688.50      NO        1  NO      YES
##   current_act mortgage
## 1          NO      YES
## 2          NO      YES
## 3         YES       NO
## 4          NO      YES
## 5         YES       NO
## 6         YES      YES

summary(newdata)

##        id           age            sex             region  
##  ID12701:  1   Min.   :18.00   FEMALE:106   INNER_CITY:85  
##  ID12702:  1   1st Qu.:29.00   MALE  : 94   RURAL     :22  
##  ID12703:  1   Median :41.50                SUBURBAN  :33  
##  ID12704:  1   Mean   :41.77                TOWN      :60  
##  ID12705:  1   3rd Qu.:54.00                               
##  ID12706:  1   Max.   :67.00                               
##  (Other):194                                               
##      income      married      children     car      save_act  current_act
##  Min.   : 5960   NO : 82   Min.   :0.00   NO : 91   NO : 66   NO : 53    
##  1st Qu.:17207   YES:118   1st Qu.:0.00   YES:109   YES:134   YES:147    
##  Median :23908             Median :1.00                                  
##  Mean   :26767             Mean   :1.11                                  
##  3rd Qu.:35976             3rd Qu.:2.00                                  
##  Max.   :61477             Max.   :3.00                                  
##                                                                          
##  mortgage 
##  NO :122  
##  YES: 78  
##

newdata$id <- as.character(newdata$id)   # id 변수 factor --> character로 변환

# 2. Modeling

# 1st candidate model : decision tree
c5_options <- C5.0Control(winnow = FALSE, noGlobalPruning = FALSE)
c5_model <- C5.0(pep ~ ., data = train, control = c5_options, rules = FALSE)
plot(c5_model)

pred_train <- predict(c5_model, train, type = "class")
confusionMatrix(pred_train, train$pep)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  159  13
##        YES   6 122
##                                           
##                Accuracy : 0.9367          
##                  95% CI : (0.9029, 0.9614)
##     No Information Rate : 0.55            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8714          
##  Mcnemar's Test P-Value : 0.1687          
##                                           
##             Sensitivity : 0.9636          
##             Specificity : 0.9037          
##          Pos Pred Value : 0.9244          
##          Neg Pred Value : 0.9531          
##              Prevalence : 0.5500          
##          Detection Rate : 0.5300          
##    Detection Prevalence : 0.5733          
##       Balanced Accuracy : 0.9337          
##                                           
##        'Positive' Class : NO              
##

# 2nd candidate model : logistic regression
lm_model <- glm(pep ~ ., data = train, family = binomial)
summary(lm_model)

## 
## Call:
## glm(formula = pep ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1618  -0.9717  -0.6610   1.0941   2.0354  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -1.568e+00  5.666e-01  -2.767  0.00565 **
## age             1.593e-02  1.299e-02   1.227  0.21998   
## sexMALE         6.887e-01  2.583e-01   2.666  0.00768 **
## regionRURAL     2.696e-01  3.702e-01   0.728  0.46641   
## regionSUBURBAN -4.920e-02  4.905e-01  -0.100  0.92011   
## regionTOWN     -1.790e-01  2.960e-01  -0.605  0.54541   
## income          3.863e-05  1.518e-05   2.545  0.01093 * 
## marriedYES     -6.382e-01  2.679e-01  -2.382  0.01720 * 
## children       -1.800e-01  1.195e-01  -1.506  0.13200   
## carYES         -2.357e-01  2.535e-01  -0.930  0.35241   
## save_actYES    -3.225e-01  2.786e-01  -1.158  0.24696   
## current_actYES  3.680e-01  3.159e-01   1.165  0.24401   
## mortgageYES    -1.942e-01  2.696e-01  -0.720  0.47128   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 412.88  on 299  degrees of freedom
## Residual deviance: 370.64  on 287  degrees of freedom
## AIC: 396.64
## 
## Number of Fisher Scoring iterations: 4

# 3. Model Evaluation with Test Data by Confusion Matrix 

# (1) C5 decision tree
test$c5_pred <- predict(c5_model, test, type = "class")      # 예측결과
test$c5_pred_prob <- predict(c5_model, test, type = "prob")  # 확률
confusionMatrix(test$c5_pred, test$pep)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  146  18
##        YES  15 121
##                                          
##                Accuracy : 0.89           
##                  95% CI : (0.849, 0.9231)
##     No Information Rate : 0.5367         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7785         
##  Mcnemar's Test P-Value : 0.7277         
##                                          
##             Sensitivity : 0.9068         
##             Specificity : 0.8705         
##          Pos Pred Value : 0.8902         
##          Neg Pred Value : 0.8897         
##              Prevalence : 0.5367         
##          Detection Rate : 0.4867         
##    Detection Prevalence : 0.5467         
##       Balanced Accuracy : 0.8887         
##                                          
##        'Positive' Class : NO             
##

head(test)

##   age    sex     region   income married children car save_act current_act
## 1  19 FEMALE INNER_CITY  8162.42     YES        1 YES      YES         YES
## 2  37 FEMALE       TOWN 15349.60     YES        0  NO      YES          NO
## 3  45 FEMALE       TOWN 29231.40     YES        0  NO      YES          NO
## 4  49   MALE      RURAL 41462.30     YES        3  NO      YES         YES
## 5  67 FEMALE      RURAL 57398.10      NO        3  NO      YES         YES
## 6  35 FEMALE      RURAL 11520.80     YES        0  NO       NO         YES
##   mortgage pep c5_pred c5_pred_prob.NO c5_pred_prob.YES
## 1      YES  NO      NO      0.92500000       0.07500000
## 2       NO  NO      NO      0.89250000       0.10750000
## 3       NO  NO      NO      0.89250000       0.10750000
## 4      YES  NO      NO      0.92096774       0.07903226
## 5       NO YES     YES      0.09166667       0.90833333
## 6       NO  NO      NO      0.89250000       0.10750000

# (2) logistic regression
test$lm_pred <- ifelse(predict(lm_model, test, type = "response") > 0.5, "YES", "NO")
test$lm_pred_prob <- predict(lm_model, test, type = "response")
confusionMatrix(test$lm_pred, test$pep)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  108  71
##        YES  53  68
##                                          
##                Accuracy : 0.5867         
##                  95% CI : (0.5286, 0.643)
##     No Information Rate : 0.5367         
##     P-Value [Acc > NIR] : 0.04626        
##                                          
##                   Kappa : 0.1614         
##  Mcnemar's Test P-Value : 0.12685        
##                                          
##             Sensitivity : 0.6708         
##             Specificity : 0.4892         
##          Pos Pred Value : 0.6034         
##          Neg Pred Value : 0.5620         
##              Prevalence : 0.5367         
##          Detection Rate : 0.3600         
##    Detection Prevalence : 0.5967         
##       Balanced Accuracy : 0.5800         
##                                          
##        'Positive' Class : NO             
##

# 4. Model Evaluation by ROC chart

c5_pred <- prediction(test$c5_pred_prob[, "YES"], test$pep)
c5_model.perf <- performance(c5_pred, "tpr", "fpr")   # True positive rate, False positive rate

lm_pred <- prediction(test$lm_pred_prob, test$pep)
lm_model.perf <- performance(lm_pred, "tpr", "fpr")

plot(c5_model.perf, col = "red")
plot(lm_model.perf, col = "blue", add=T)
legend(0.7, 0.7, c("C5 ","LM "), cex = 0.9, col = c("red", "blue"), lty = 1)

# 5. Deployment - 신규 데이터에 모델 적용

newdata$c5_pred <- predict(c5_model, newdata, type = "class")
newdata$c5_pred_prob <- predict(c5_model, newdata, type = "prob")

# 연금보험 가입확률이 0.8 이상인 사람만 추출
target <- subset(newdata, c5_pred == "YES" & c5_pred_prob[ ,"YES"] > 0.8)
head(target)

##          id age    sex     region  income married children car save_act
## 3   ID12703  45 FEMALE      RURAL 21881.6      NO        0 YES      YES
## 4   ID12704  50   MALE       TOWN 46794.4     YES        2  NO      YES
## 6   ID12706  20   MALE INNER_CITY 16688.5      NO        1  NO      YES
## 8   ID12708  50 FEMALE INNER_CITY 27740.8     YES        1 YES      YES
## 10  ID12710  57 FEMALE       TOWN 19621.3     YES        1 YES       NO
## 12  ID12712  26 FEMALE INNER_CITY 22378.5      NO        0 YES       NO
##     current_act mortgage c5_pred c5_pred_prob.NO c5_pred_prob.YES
## 3           YES       NO     YES      0.08500000       0.91500000
## 4            NO      YES     YES      0.05204082       0.94795918
## 6           YES      YES     YES      0.05344828       0.94655172
## 8            NO      YES     YES      0.07857143       0.92142857
## 10          YES       NO     YES      0.05344828       0.94655172
## 12          YES      YES     YES      0.13750000       0.86250000

selectedTarget <- target[order(target$c5_pred_prob[,"YES"], decreasing = T), ]
head(selectedTarget)

##         id age  sex     region  income married children car save_act
## 4  ID12704  50 MALE       TOWN 46794.4     YES        2  NO      YES
## 16 ID12716  44 MALE       TOWN 34961.7     YES        1  NO       NO
## 21 ID12721  40 MALE       TOWN 37227.8      NO        1 YES      YES
## 23 ID12723  54 MALE      RURAL 49986.7     YES        1 YES      YES
## 29 ID12729  58 MALE      RURAL 41114.2     YES        2 YES      YES
## 43 ID12743  66 MALE INNER_CITY 43720.4      NO        1  NO      YES
##    current_act mortgage c5_pred c5_pred_prob.NO c5_pred_prob.YES
## 4           NO      YES     YES      0.05204082       0.94795918
## 16          NO      YES     YES      0.05204082       0.94795918
## 21         YES       NO     YES      0.05204082       0.94795918
## 23         YES       NO     YES      0.05204082       0.94795918
## 29         YES      YES     YES      0.05204082       0.94795918
## 43          NO      YES     YES      0.05204082       0.94795918

write.csv(target[order(target$c5_pred_prob[,"YES"], decreasing=T), ], "dm_target.csv", row.names=FALSE)

R 데이터마이닝 예측분석

woosa7