R 변수의 중요도, 구간화, 결측치 처리

###############################################################
# 데이터 가공 & 관리
###############################################################

1. 변수의 중요도

library(klaR)

# Load the B3 data set (West German Business Cycles 1955-1994) and preview
# its first six rows.
data(B3)
head(B3, n = 6L)

##        PHASEN BSP91JW CP91JW DEFRATE EWAJW EXIMRATE GM1JW IAU91JW IB91JW
## 1955,4      2   10.53   9.31    0.05   5.7     3.08 11.15   23.56  14.69
## 1956,1      2   10.60  12.66    0.06   5.2     1.96 11.03   12.72  24.95
## 1956,2      3    9.21   6.55    0.05   4.8     2.82 10.04   11.52  14.90
## 1956,3      3    5.17   7.87    0.05   3.3     3.74  8.33    0.85   7.55
## 1956,4      3    4.93   8.60    0.04   2.1     4.16  7.69   -2.08   3.23
## 1957,1      3    8.39   5.62    0.04   3.2     2.90  6.62   -3.76  14.58
##        LSTKJW PBSPJW PCPJW ZINSK ZINSLR
## 1955,4   3.00   2.89  1.91  6.27   3.21
## 1956,1   2.36   2.59  2.20  4.60   3.54
## 1956,2   3.39   3.01  3.09  6.19   3.22
## 1956,3   5.30   3.03  2.08  6.71   3.37
## 1956,4   6.91   3.46  1.48  7.10   3.14
## 1957,1   1.03   1.95  1.65  4.96   4.95

# Inspect the structure of B3: the printout below shows 157 observations of
# 14 variables, with PHASEN as a 4-level factor and the rest numeric.
str(object = B3)

## 'data.frame':    157 obs. of  14 variables:
##  $ PHASEN  : Factor w/ 4 levels "1","2","3","4": 2 2 3 3 3 3 3 3 3 3 ...
##  $ BSP91JW : num  10.53 10.6 9.21 5.17 4.93 ...
##  $ CP91JW  : num  9.31 12.66 6.55 7.87 8.6 ...
##  $ DEFRATE : num  0.05 0.06 0.05 0.05 0.04 0.04 0.04 0.03 0.03 0 ...
##  $ EWAJW   : num  5.7 5.2 4.8 3.3 2.1 3.2 2.5 2.7 3 0.3 ...
##  $ EXIMRATE: num  3.08 1.96 2.82 3.74 4.16 2.9 3.65 4.57 4.37 2.89 ...
##  $ GM1JW   : num  11.15 11.03 10.04 8.33 7.69 ...
##  $ IAU91JW : num  23.56 12.72 11.52 0.85 -2.08 ...
##  $ IB91JW  : num  14.69 24.95 14.9 7.55 3.23 ...
##  $ LSTKJW  : num  3 2.36 3.39 5.3 6.91 1.03 3.73 6.2 4.12 7.94 ...
##  $ PBSPJW  : num  2.89 2.59 3.01 3.03 3.46 1.95 3.18 3.98 3.29 5.63 ...
##  $ PCPJW   : num  1.91 2.2 3.09 2.08 1.48 1.65 1.47 3.29 3.59 4.19 ...
##  $ ZINSK   : num  6.27 4.6 6.19 6.71 7.1 4.96 5.21 4.83 4.5 3.83 ...
##  $ ZINSLR  : num  3.21 3.54 3.22 3.37 3.14 4.95 3.82 3.09 3.91 1.47 ...

# AIC-based stepwise selection cannot be used here because the response
# PHASEN is a categorical variable.
# step(model, direction = "both")

# Wilks' lambda = within-group variance / total variance.
# Variables are ranked by their influence on the response
# (a smaller lambda indicates a better fit).
# `niveau` is the level used for the F-test decision at each step.
greedy.wilks(
  PHASEN ~ .,
  data = B3,
  niveau = 0.1
)

## Formula containing included variables: 
## 
## PHASEN ~ EWAJW + LSTKJW + ZINSK + CP91JW + IAU91JW + PBSPJW + 
##     ZINSLR + PCPJW
## <environment: 0x00000000142a8998>
## 
## 
## Values calculated in each step of the selection procedure: 
## 
##      vars Wilks.lambda F.statistics.overall p.value.overall
## 1   EWAJW    0.6058201             33.18341    1.405358e-16
## 2  LSTKJW    0.4271561             26.85606    1.218146e-25
## 3   ZINSK    0.3614525             21.20584    7.607587e-29
## 4  CP91JW    0.3002868             19.05337    1.153881e-32
## 5 IAU91JW    0.2624925             17.11094    6.597858e-35
## 6  PBSPJW    0.2451025             14.99388    3.695840e-35
## 7  ZINSLR    0.2205325             13.94619    1.442943e-36
## 8   PCPJW    0.1999847             13.10739    9.454573e-38
##   F.statistics.diff p.value.diff
## 1         33.183411 1.405358e-16
## 2         21.192038 1.554268e-11
## 3          9.149422 1.326989e-05
## 4         10.184539 3.783582e-06
## 5          7.151127 1.604993e-04
## 6          3.500196 1.708972e-02
## 7          5.459204 1.379166e-03
## 8          5.000333 2.486333e-03

# 13개 변수 중에 8개 선택됨
# PHASEN ~ EWAJW + LSTKJW + ZINSK + CP91JW + IAU91JW + PBSPJW + ZINSLR + PCPJW

2. (연속형) 변수의 구간화

# Binning : 각각 동일한 개수의 데이터를 50개 이하의 구간에 할당한 후 구간을 병합하면서 구간을 줄여나가는 방식
# 의사결정나무 : 연속형 데이터의 구간을 나누는 분기점을 찾을 수 있다.

# Load the built-in iris data set and preview its first six rows.
data("iris")
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Bin analysis with respect to a single variable: keep only Sepal.Length,
# Petal.Length and the Species label (columns 1, 3 and 5 of iris).
iris2 <- iris[, c("Sepal.Length", "Petal.Length", "Species")]
head(iris2, n = 6L)

##   Sepal.Length Petal.Length Species
## 1          5.1          1.4  setosa
## 2          4.9          1.4  setosa
## 3          4.7          1.3  setosa
## 4          4.6          1.5  setosa
## 5          5.0          1.4  setosa
## 6          5.4          1.7  setosa

# Fit an LDA classifier on iris2 and plot its result against Petal.Width
# (the fourth column of iris), which is passed separately as `x`.
# NOTE(review): the value printed below is presumably the apparent error
# rate returned by plineplot() - confirm against the klaR documentation.
plineplot(
  Species ~ .,
  data = iris2,
  method = "lda",
  x = iris$Petal.Width,
  xlab = "Petal.Width"
)

## [1] 0.03333333

# 0.6 / 1.7 지점에서 구간을 나누는 것이 좋다.

# Bin analysis using every variable: fit a naive Bayes model on all of iris
# and plot the fitted object.
m <- NaiveBayes(
  Species ~ .,
  data = iris
)
plot(x = m)

# Bin analysis through a decision tree: the split points of the fitted tree
# indicate where the continuous variables can be cut.
library(party)

# `m` is deliberately reused; from here on it holds the tree, not the
# naive Bayes model.
m <- ctree(
  formula = Species ~ .,
  data = iris
)
m

## 
##   Conditional inference tree with 4 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  150 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 140.264
##   2)*  weights = 50 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 67.894
##     4) Petal.Length <= 4.8; criterion = 0.999, statistic = 13.865
##       5)*  weights = 46 
##     4) Petal.Length > 4.8
##       6)*  weights = 8 
##   3) Petal.Width > 1.7
##     7)*  weights = 46

# Draw the conditional inference tree printed above.
plot(x = m)

3. 결측값 처리

(1) 단순 대치법 (Single Imputation)

- complete analysis : 결측값이 존재하는 레코드 삭제
- mean imputation : 관측 데이터의 평균으로 대치(비조건부) 또는 회귀분석을 활용한 대치(조건부)

(2) 다중 대치법 (Multiple Imputation)

- 단순 대치법을 m 번 수행. imputation - analysis - combination step.
- Amelia : time series cross sectional dataset을 활용

(3) imputation in R

- rfImpute() : randomForest 패키지. Random Forest 모델은 결측값 존재 시 바로 에러 발생. 이 함수를 이용하여 결측값 대치 후 알고리즘 적용.
- complete.cases() : 행(레코드)에 결측값이 있으면 FALSE, 없으면 TRUE 반환
- is.na() : 값이 결측값(NA)인지 체크
- centralImputation() : DMwR 패키지. 중위수 또는 최빈값(factor)으로 대치
- knnImputation() : DMwR 패키지. k-최근접 이웃(kNN) 알고리즘을 사용하여 대치
- amelia() : Amelia 패키지. 다중 대치법 수행

# Impute
library(Hmisc)

# Small example table: six ages (one missing) with alternating genders.
# The female/male pattern is repeated explicitly instead of relying on
# data.frame() recycling the two-element vector.
df <- data.frame(
  age = c(11, 23, NA, 40, 35, 15),
  gender = rep(c("female", "male"), times = 3)
)
df

##   age gender
## 1  11 female
## 2  23   male
## 3  NA female
## 4  40   male
## 5  35 female
## 6  15   male

# Fill the missing age in two ways: with the mean of the observed ages, and
# with the "random" option of impute().
# The "random" result is not seeded here, so it can differ between runs.
df[["imputed_age"]] <- impute(df$age, mean)
df[["imputedR_age"]] <- impute(df$age, "random")
df

##   age gender imputed_age imputedR_age
## 1  11 female        11.0           11
## 2  23   male        23.0           23
## 3  NA female        24.8           40
## 4  40   male        40.0           40
## 5  35 female        35.0           35
## 6  15   male        15.0           15

# Amelia : Multiple Imputation of Incomplete Multivariate Data
library(Amelia)

# Load the freetrade data set (Trade Policy and Democracy in 9 Asian States)
# and preview its first six rows.
data("freetrade")
head(freetrade, n = 6L)

##   year  country tariff polity      pop   gdp.pc intresmi signed fiveop
## 1 1981 SriLanka     NA      6 14988000 461.0236 1.937347      0   12.4
## 2 1982 SriLanka     NA      5 15189000 473.7634 1.964430      0   12.5
## 3 1983 SriLanka   41.3      5 15417000 489.2266 1.663936      1   12.3
## 4 1984 SriLanka     NA      5 15599000 508.1739 2.797462      0   12.3
## 5 1985 SriLanka   31.0      5 15837000 525.5609 2.259116      0   12.3
## 6 1986 SriLanka     NA      5 16117000 538.9237 1.832549      0   12.5
##       usheg
## 1 0.2593112
## 2 0.2558008
## 3 0.2655022
## 4 0.2988009
## 5 0.2952431
## 6 0.2886563

# Per-column summary; the tariff column reports 58 missing values (NA's).
summary(object = freetrade)

##       year        country              tariff           polity      
##  Min.   :1981   Length:171         Min.   :  7.10   Min.   :-8.000  
##  1st Qu.:1985   Class :character   1st Qu.: 16.30   1st Qu.:-2.000  
##  Median :1990   Mode  :character   Median : 25.20   Median : 5.000  
##  Mean   :1990                      Mean   : 31.65   Mean   : 2.905  
##  3rd Qu.:1995                      3rd Qu.: 40.80   3rd Qu.: 8.000  
##  Max.   :1999                      Max.   :100.00   Max.   : 9.000  
##                                    NA's   :58       NA's   :2       
##       pop                gdp.pc           intresmi          signed      
##  Min.   : 14105080   Min.   :  149.5   Min.   :0.9036   Min.   :0.0000  
##  1st Qu.: 19676715   1st Qu.:  420.1   1st Qu.:2.2231   1st Qu.:0.0000  
##  Median : 52799040   Median :  814.3   Median :3.1815   Median :0.0000  
##  Mean   :149904501   Mean   : 1867.3   Mean   :3.3752   Mean   :0.1548  
##  3rd Qu.:120888400   3rd Qu.: 2462.9   3rd Qu.:4.4063   3rd Qu.:0.0000  
##  Max.   :997515200   Max.   :12086.2   Max.   :7.9346   Max.   :1.0000  
##                                        NA's   :13       NA's   :3       
##      fiveop          usheg       
##  Min.   :12.30   Min.   :0.2558  
##  1st Qu.:12.50   1st Qu.:0.2623  
##  Median :12.60   Median :0.2756  
##  Mean   :12.74   Mean   :0.2764  
##  3rd Qu.:13.20   3rd Qu.:0.2887  
##  Max.   :13.20   Max.   :0.3083  
##  NA's   :18

# Visualise where the missing values are located in the data.
missmap(obj = freetrade)

# Generate the imputed data sets. `m = 5` is the default and is spelled out
# to match the five imputations reported in the log below.
# NOTE(review): the original note stated "starting value = min(tariff)";
# nothing in this call sets that - confirm against the Amelia documentation.
am_data <- amelia(
  x = freetrade,
  m = 5,
  ts = "year",
  cs = "country"
)

## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14

# Arguments of amelia():
#   m  - number of imputed data sets to create
#   ts - name of the time-series (time index) column
#   cs - name of the cross-section (unit) column

# Report the result of the imputation run.
summary(object = am_data)

## 
## Amelia output with 5 imputed datasets.
## Return code:  1 
## Message:  Normal EM convergence. 
## 
## Chain Lengths:
## --------------
## Imputation 1:  17
## Imputation 2:  21
## Imputation 3:  13
## Imputation 4:  15
## Imputation 5:  14
## 
## Rows after Listwise Deletion:  96 
## Rows after Imputation:  171 
## Patterns of missingness in the data:  8 
## 
## Fraction Missing for original variables: 
## -----------------------------------------
## 
##          Fraction Missing
## year           0.00000000
## country        0.00000000
## tariff         0.33918129
## polity         0.01169591
## pop            0.00000000
## gdp.pc         0.00000000
## intresmi       0.07602339
## signed         0.01754386
## fiveop         0.10526316
## usheg          0.00000000

# Compare the tariff distribution across the first three imputed data sets.
# NOTE(review): every minimum printed below is negative, which presumably is
# not a valid tariff value; amelia() has a `bounds` argument that may be used
# to constrain imputed values - confirm before relying on these numbers.
summary(am_data[["imputations"]][[1]][["tariff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -14.76   17.40   27.80   33.60   46.49  100.00

summary(am_data[["imputations"]][[2]][["tariff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.813  16.950  27.000  31.080  41.550 100.000

summary(am_data[["imputations"]][[3]][["tariff"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -18.53   16.95   27.00   32.99   47.61  100.00

# Use the tariff column of the first imputed data set.
impute1 <- am_data[["imputations"]][[1]][["tariff"]]
hist(x = impute1, border = "black", col = "grey")

# Replace the tariff column (including its missing values) with the imputed
# one, then redraw the missingness map to check the result.
# Note: this overwrites the tariff column of the loaded freetrade copy.
freetrade[["tariff"]] <- impute1
missmap(obj = freetrade)

# Plot the amelia result, then restore a single-panel plotting layout.
plot(x = am_data)
par(mfrow = c(1, 1))

R 변수의 중요도 구간화 결측치 처리

woosa7