###############################################################
# 데이터 가공 & 관리
###############################################################
# 1. 변수의 중요도
library(klaR)
# B3: West German business cycles, 1955-1994.
# Load the data set by name and preview its first six rows.
data(list = "B3")
head(B3, n = 6L)
## PHASEN BSP91JW CP91JW DEFRATE EWAJW EXIMRATE GM1JW IAU91JW IB91JW
## 1955,4 2 10.53 9.31 0.05 5.7 3.08 11.15 23.56 14.69
## 1956,1 2 10.60 12.66 0.06 5.2 1.96 11.03 12.72 24.95
## 1956,2 3 9.21 6.55 0.05 4.8 2.82 10.04 11.52 14.90
## 1956,3 3 5.17 7.87 0.05 3.3 3.74 8.33 0.85 7.55
## 1956,4 3 4.93 8.60 0.04 2.1 4.16 7.69 -2.08 3.23
## 1957,1 3 8.39 5.62 0.04 3.2 2.90 6.62 -3.76 14.58
## LSTKJW PBSPJW PCPJW ZINSK ZINSLR
## 1955,4 3.00 2.89 1.91 6.27 3.21
## 1956,1 2.36 2.59 2.20 4.60 3.54
## 1956,2 3.39 3.01 3.09 6.19 3.22
## 1956,3 5.30 3.03 2.08 6.71 3.37
## 1956,4 6.91 3.46 1.48 7.10 3.14
## 1957,1 1.03 1.95 1.65 4.96 4.95
# Compact overview of B3: column types and leading values.
str(object = B3)
## 'data.frame': 157 obs. of 14 variables:
## $ PHASEN : Factor w/ 4 levels "1","2","3","4": 2 2 3 3 3 3 3 3 3 3 ...
## $ BSP91JW : num 10.53 10.6 9.21 5.17 4.93 ...
## $ CP91JW : num 9.31 12.66 6.55 7.87 8.6 ...
## $ DEFRATE : num 0.05 0.06 0.05 0.05 0.04 0.04 0.04 0.03 0.03 0 ...
## $ EWAJW : num 5.7 5.2 4.8 3.3 2.1 3.2 2.5 2.7 3 0.3 ...
## $ EXIMRATE: num 3.08 1.96 2.82 3.74 4.16 2.9 3.65 4.57 4.37 2.89 ...
## $ GM1JW : num 11.15 11.03 10.04 8.33 7.69 ...
## $ IAU91JW : num 23.56 12.72 11.52 0.85 -2.08 ...
## $ IB91JW : num 14.69 24.95 14.9 7.55 3.23 ...
## $ LSTKJW : num 3 2.36 3.39 5.3 6.91 1.03 3.73 6.2 4.12 7.94 ...
## $ PBSPJW : num 2.89 2.59 3.01 3.03 3.46 1.95 3.18 3.98 3.29 5.63 ...
## $ PCPJW : num 1.91 2.2 3.09 2.08 1.48 1.65 1.47 3.29 3.59 4.19 ...
## $ ZINSK : num 6.27 4.6 6.19 6.71 7.1 4.96 5.21 4.83 4.5 3.83 ...
## $ ZINSLR : num 3.21 3.54 3.22 3.37 3.14 4.95 3.82 3.09 3.91 1.47 ...
# AIC-based stepwise selection, i.e. step(model, direction = "both"), cannot
# be used here because the response PHASEN is categorical.
# Wilks' lambda = within-group variance / total variance.
# greedy.wilks() ranks the predictors by their influence on the response
# (a smaller lambda indicates a better fit); niveau = 0.1 is the level passed
# to the stepwise procedure.
greedy.wilks(
  PHASEN ~ .,
  data = B3,
  niveau = 0.1
)
## Formula containing included variables:
##
## PHASEN ~ EWAJW + LSTKJW + ZINSK + CP91JW + IAU91JW + PBSPJW +
## ZINSLR + PCPJW
## <environment: 0x00000000142a8998>
##
##
## Values calculated in each step of the selection procedure:
##
## vars Wilks.lambda F.statistics.overall p.value.overall
## 1 EWAJW 0.6058201 33.18341 1.405358e-16
## 2 LSTKJW 0.4271561 26.85606 1.218146e-25
## 3 ZINSK 0.3614525 21.20584 7.607587e-29
## 4 CP91JW 0.3002868 19.05337 1.153881e-32
## 5 IAU91JW 0.2624925 17.11094 6.597858e-35
## 6 PBSPJW 0.2451025 14.99388 3.695840e-35
## 7 ZINSLR 0.2205325 13.94619 1.442943e-36
## 8 PCPJW 0.1999847 13.10739 9.454573e-38
## F.statistics.diff p.value.diff
## 1 33.183411 1.405358e-16
## 2 21.192038 1.554268e-11
## 3 9.149422 1.326989e-05
## 4 10.184539 3.783582e-06
## 5 7.151127 1.604993e-04
## 6 3.500196 1.708972e-02
## 7 5.459204 1.379166e-03
## 8 5.000333 2.486333e-03
# 13개 변수 중에 8개 선택됨
# PHASEN ~ EWAJW + LSTKJW + ZINSK + CP91JW + IAU91JW + PBSPJW + ZINSLR + PCPJW
# 2. (연속형) 변수의 구간화
# Binning : 각각 동일한 개수의 데이터를 50개 이하의 구간에 할당한 후 구간을 병합하면서 구간을 줄여나가는 방식
# 의사결정나무 : 연속형 데이터의 구간을 나누는 분기점을 찾을 수 있다.
# Make the iris data set available and preview its first six rows.
data(list = "iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Interval (binning) analysis with respect to a single variable.
# Keep Sepal.Length, Petal.Length and Species (columns 1, 3 and 5 of iris),
# selected by name rather than by position.
iris2 <- iris[, c("Sepal.Length", "Petal.Length", "Species")]
head(iris2, n = 6L)
## Sepal.Length Petal.Length Species
## 1 5.1 1.4 setosa
## 2 4.9 1.4 setosa
## 3 4.7 1.3 setosa
## 4 4.6 1.5 setosa
## 5 5.0 1.4 setosa
## 6 5.4 1.7 setosa
# Draw klaR's plineplot for an LDA classifier fitted on iris2, using
# Petal.Width (column 4 of iris, which is not part of iris2) as the x variable.
plineplot(
  Species ~ .,
  data = iris2,
  method = "lda",
  x = iris[["Petal.Width"]],
  xlab = "Petal.Width"
)
## [1] 0.03333333
# 0.6 / 1.7 지점에서 구간을 나누는 것이 좋다.
# Interval analysis across all variables: fit a naive Bayes classifier of
# Species on every other iris column, then plot the fitted model.
m <- NaiveBayes(
  Species ~ .,
  data = iris
)
plot(x = m)
# 의사결정트리를 통해 구간 분석
library(party)
# Fit a conditional inference tree of Species on all other iris columns and
# print it; the split points it reports serve as candidate interval boundaries.
# Note: this reuses (overwrites) the variable m.
m <- ctree(formula = Species ~ ., data = iris)
m
##
## Conditional inference tree with 4 terminal nodes
##
## Response: Species
## Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
## Number of observations: 150
##
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 140.264
## 2)* weights = 50
## 1) Petal.Length > 1.9
## 3) Petal.Width <= 1.7; criterion = 1, statistic = 67.894
## 4) Petal.Length <= 4.8; criterion = 0.999, statistic = 13.865
## 5)* weights = 46
## 4) Petal.Length > 4.8
## 6)* weights = 8
## 3) Petal.Width > 1.7
## 7)* weights = 46
# Plot the model currently stored in m.
plot(x = m)
# 3. 결측값 처리
# (1) 단순 대치법 (Single Imputation)
# - complete analysis : 결측값 존재하는 레코드 삭제
# - mean imputation : 관측 데이터의 평균으로 대치(비조건부) 또는 회귀분석을 활용한 대치(조건부)
# (2) 다중 대치법 (Multiple Imputation)
# - 단순 대치법을 m 번 수행. imputation - analysis - combination step.
# - Amelia : time series cross sectional dataset을 활용
# (3) imputation in R
# - rfImpute() : Random Forest 모델은 결측값 존재시 바로 에러 발생. 이 함수 이용하여 결측값 대치 후 알고리즘 적용.
# - complete.cases() : 레코드(행)에 결측값이 있으면 FALSE
# - is.na() : 값이 결측값(NA)인지 체크
# - centralImputation() : DMwR 패키지. 중위수 또는 최빈값(factor)으로 대치
# - knnImputation() : DMwR 패키지. knn 분류 알고리즘 사용
# - amelia()
# Impute
library(Hmisc)
# Toy data: six ages (the third one missing) with alternating gender labels.
# The gender pattern is spelled out with rep() instead of relying on
# data.frame() recycling a length-2 vector.
df <- data.frame(
  age = c(11, 23, NA, 40, 35, 15),
  gender = rep(c("female", "male"), times = 3)
)
df
## age gender
## 1 11 female
## 2 23 male
## 3 NA female
## 4 40 male
## 5 35 female
## 6 15 male
# Fill the missing age in two ways with impute():
#   imputed_age  - uses `mean` as the fill function
#   imputedR_age - uses the "random" method
# NOTE(review): the "random" variant is not seeded, so its result presumably
# differs between runs - confirm whether a set.seed() call is wanted.
df$imputed_age <- impute(df$age, mean)
df$imputedR_age <- impute(df$age, "random")
df
## age gender imputed_age imputedR_age
## 1 11 female 11.0 11
## 2 23 male 23.0 23
## 3 NA female 24.8 40
## 4 40 male 40.0 40
## 5 35 female 35.0 35
## 6 15 male 15.0 15
# Amelia : Multiple Imputation of Incomplete Multivariate Data
library(Amelia)
# freetrade: trade policy and democracy in 9 Asian states.
# Load the data set by name and preview its first six rows.
data(list = "freetrade")
head(freetrade, n = 6L)
## year country tariff polity pop gdp.pc intresmi signed fiveop
## 1 1981 SriLanka NA 6 14988000 461.0236 1.937347 0 12.4
## 2 1982 SriLanka NA 5 15189000 473.7634 1.964430 0 12.5
## 3 1983 SriLanka 41.3 5 15417000 489.2266 1.663936 1 12.3
## 4 1984 SriLanka NA 5 15599000 508.1739 2.797462 0 12.3
## 5 1985 SriLanka 31.0 5 15837000 525.5609 2.259116 0 12.3
## 6 1986 SriLanka NA 5 16117000 538.9237 1.832549 0 12.5
## usheg
## 1 0.2593112
## 2 0.2558008
## 3 0.2655022
## 4 0.2988009
## 5 0.2952431
## 6 0.2886563
# Column-wise summary; tariff (customs duty) has 58 missing values.
summary(object = freetrade)
## year country tariff polity
## Min. :1981 Length:171 Min. : 7.10 Min. :-8.000
## 1st Qu.:1985 Class :character 1st Qu.: 16.30 1st Qu.:-2.000
## Median :1990 Mode :character Median : 25.20 Median : 5.000
## Mean :1990 Mean : 31.65 Mean : 2.905
## 3rd Qu.:1995 3rd Qu.: 40.80 3rd Qu.: 8.000
## Max. :1999 Max. :100.00 Max. : 9.000
## NA's :58 NA's :2
## pop gdp.pc intresmi signed
## Min. : 14105080 Min. : 149.5 Min. :0.9036 Min. :0.0000
## 1st Qu.: 19676715 1st Qu.: 420.1 1st Qu.:2.2231 1st Qu.:0.0000
## Median : 52799040 Median : 814.3 Median :3.1815 Median :0.0000
## Mean :149904501 Mean : 1867.3 Mean :3.3752 Mean :0.1548
## 3rd Qu.:120888400 3rd Qu.: 2462.9 3rd Qu.:4.4063 3rd Qu.:0.0000
## Max. :997515200 Max. :12086.2 Max. :7.9346 Max. :1.0000
## NA's :13 NA's :3
## fiveop usheg
## Min. :12.30 Min. :0.2558
## 1st Qu.:12.50 1st Qu.:0.2623
## Median :12.60 Median :0.2756
## Mean :12.74 Mean :0.2764
## 3rd Qu.:13.20 3rd Qu.:0.2887
## Max. :13.20 Max. :0.3083
## NA's :18
# Visualise how the missing values are distributed.
missmap(freetrade)
# Create the imputed datasets. m = 5 spells out the number of imputations,
# ts names the time-series column and cs the cross-section column.
# NOTE(review): the run is not seeded, so imputed values presumably change
# between runs - confirm whether a set.seed() call is wanted.
am_data <- amelia(
  freetrade,
  m = 5,
  ts = "year",
  cs = "country"
)
## -- Imputation 1 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
##
## -- Imputation 2 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 21
##
## -- Imputation 3 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## -- Imputation 4 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## -- Imputation 5 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
# amelia() arguments:
#   m  - number of imputed datasets to create
#   ts - time-series column
#   cs - cross-section variable
# Print a summary of the imputation result.
summary(object = am_data)
##
## Amelia output with 5 imputed datasets.
## Return code: 1
## Message: Normal EM convergence.
##
## Chain Lengths:
## --------------
## Imputation 1: 17
## Imputation 2: 21
## Imputation 3: 13
## Imputation 4: 15
## Imputation 5: 14
##
## Rows after Listwise Deletion: 96
## Rows after Imputation: 171
## Patterns of missingness in the data: 8
##
## Fraction Missing for original variables:
## -----------------------------------------
##
## Fraction Missing
## year 0.00000000
## country 0.00000000
## tariff 0.33918129
## polity 0.01169591
## pop 0.00000000
## gdp.pc 0.00000000
## intresmi 0.07602339
## signed 0.01754386
## fiveop 0.10526316
## usheg 0.00000000
# Distribution of tariff in the first imputed dataset.
summary(am_data[["imputations"]][[1L]][["tariff"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.76 17.40 27.80 33.60 46.49 100.00
# Distribution of tariff in the second imputed dataset.
summary(am_data[["imputations"]][[2L]][["tariff"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.813 16.950 27.000 31.080 41.550 100.000
# Distribution of tariff in the third imputed dataset.
summary(am_data[["imputations"]][[3L]][["tariff"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -18.53 16.95 27.00 32.99 47.61 100.00
# Apply the first imputed dataset: take its tariff column, look at its
# histogram, and overwrite freetrade$tariff with it (filling the missing
# values). Only tariff is replaced; the other columns are left untouched.
impute1 <- am_data[["imputations"]][[1L]][["tariff"]]
hist(x = impute1, col = "grey", border = "black")
freetrade[["tariff"]] <- impute1
# Re-draw the missingness map after the replacement, then plot the amelia
# result itself.
missmap(freetrade)
plot(am_data)
# Reset the plotting layout to a single panel (presumably because the plot
# above changes it - confirm).
par(mfrow = c(1, 1))