# 이상치(outlier) 찾기 및 처리
outlier 식별
- ESD (Extreme Studentized Deviate) : 평균에서 3 표준편차 이상 떨어진 값
- 사분위수 이용. boxplot 의 inner fence (Q1 - 1.5*IQR, Q3 + 1.5*IQR) 를 벗어난 값
outlier 처리방법
- 절단(trimming) : outlier 포함된 레코드 삭제
- 조정(winsorizing) : outlier를 상한 또는 하한 값으로 조정
# Case 1. iris
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# (1) Quartile-based rule: outliers of a single variable
# Plot only the four numeric measurements; Species is a factor and would
# otherwise be drawn as a box of its integer codes.
boxplot(iris[, 1:4])
boxplot(iris$Sepal.Width)
a <- iris$Sepal.Width
# fivenum: minimum, lower-hinge, median, upper-hinge, maximum
hinges <- fivenum(a)
# Measure the spread between the hinges themselves rather than with IQR():
# IQR() is based on quantile(), which can differ from the hinges, whereas
# boxplot() derives its whiskers from the hinge spread.
fence_width <- 1.5 * diff(hinges[c(2, 4)])
lower_fence <- hinges[2] - fence_width
upper_fence <- hinges[4] + fence_width
which(a < lower_fence)
## [1] 61
which(a > upper_fence)
## [1] 16 33 34
# (2) lofactor (local outlier factor algorithm): outliers across all variables
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
# k: number of neighbours used when computing the outlier score
outlier.score <- lofactor(iris[ , 1:4], k = 5)
# The observations scoring about 2.0 and 2.5 are the outliers
plot(density(outlier.score), main = "outlier score of iris")
# The 3 observations with score > 1.9 are taken as outliers
sort(outlier.score, decreasing = TRUE)[1:10]
## [1] 2.479960 2.029263 1.959143 1.602584 1.548281 1.528656 1.480122
## [8] 1.463896 1.455104 1.451772
outliers <- order(outlier.score, decreasing = TRUE)[1:3]
outliers
## [1] 42 107 23
# Show the row number only for the outliers in the plot; every other
# observation is drawn as "."
labels <- seq_len(nrow(iris))
labels[-outliers] <- "."
# Principal component analysis
biplot(prcomp(iris[ , 1:4]), cex = 0.8, xlabs = labels)
# Scatterplot matrix with the outliers drawn as red "@"
pch <- rep(".", nrow(iris))
pch[outliers] <- "@"
col <- rep("black", nrow(iris))
col[outliers] <- "red"
pairs(iris[ , 1:4], pch = pch, col = col)
# Case 2.
library(psych)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
df <- USairpollution
head(df)
## SO2 temp manu popul wind precip predays
## Albany 46 47.6 44 116 8.8 33.36 135
## Albuquerque 11 56.8 46 244 8.9 7.77 58
## Atlanta 24 61.5 368 497 9.1 48.34 115
## Baltimore 47 55.0 625 905 9.6 41.31 111
## Buffalo 11 47.1 391 463 12.4 36.11 166
## Charleston 31 55.2 35 71 6.5 40.75 148
# NOTE(review): the LOF scores are computed on the raw columns, whose ranges
# differ widely (see head(df) above) - confirm whether lofactor(scale(df))
# is what is intended here.
outlier.score <- lofactor(df, k = 5)
plot(density(outlier.score), main = "outlier score of USairpollution")
# The 2 observations with score > 3 are taken as outliers
sort(outlier.score, decreasing = TRUE)
## [1] 6.7308161 3.6374196 2.4217025 2.1712077 1.8761657 1.8285197 1.4999312
## [8] 1.3550799 1.2746045 1.2568747 1.2199493 1.1737022 1.1598072 1.1545934
## [15] 1.1054748 1.1030826 1.1018627 1.0744590 1.0705834 1.0631965 1.0591652
## [22] 1.0560957 1.0524985 1.0515998 1.0511851 1.0315732 1.0211214 1.0185135
## [29] 1.0161363 1.0158912 1.0110542 1.0025476 0.9947553 0.9904630 0.9828882
## [36] 0.9796012 0.9784709 0.9767661 0.9708327 0.9684696 0.9450678
outliers <- order(outlier.score, decreasing = TRUE)[1:2]
outliers
## [1] 7 30
df[outliers, ]
## SO2 temp manu popul wind precip predays
## Chicago 110 50.6 3344 3369 10.4 34.44 122
## Philadelphia 69 54.6 1692 1950 9.6 39.93 115
# Show the row number only for the outliers in the plot; every other
# observation is drawn as "."
labels <- seq_len(nrow(df))
labels[-outliers] <- "."
# Principal component analysis
# Standardise the variables before the PCA: with the raw columns biplot()
# warned "zero-length arrow is of indeterminate angle and so skipped",
# i.e. some variable arrows were not drawn at all.
biplot(prcomp(df, scale. = TRUE), cex = 0.8, xlabs = labels)
# plot 1
# Scatterplot matrix: outliers as red circles (pch 1), the rest as black
# crosses (pch 4)
pch <- rep(4, nrow(df))
pch[outliers] <- 1
col <- rep("black", nrow(df))
col[outliers] <- "red"
pairs(df, pch = pch, col = col)
# plot 2
# Tag every row as "normal" or "outlier" in a factor column named col;
# the factor levels sort as "normal", "outlier".
df$col <- factor(
  ifelse(seq_len(nrow(df)) %in% outliers, "outlier", "normal")
)
head(df, 10)
## SO2 temp manu popul wind precip predays col
## Albany 46 47.6 44 116 8.8 33.36 135 normal
## Albuquerque 11 56.8 46 244 8.9 7.77 58 normal
## Atlanta 24 61.5 368 497 9.1 48.34 115 normal
## Baltimore 47 55.0 625 905 9.6 41.31 111 normal
## Buffalo 11 47.1 391 463 12.4 36.11 166 normal
## Charleston 31 55.2 35 71 6.5 40.75 148 normal
## Chicago 110 50.6 3344 3369 10.4 34.44 122 outlier
## Cincinnati 23 54.0 462 453 7.1 39.04 132 normal
## Cleveland 65 49.7 1007 751 10.9 34.99 155 normal
## Columbus 26 51.5 266 540 8.6 37.01 134 normal
# Panels use the 7 measured columns only (the col tag is left out);
# the point fill is indexed by the factor: "normal" -> black,
# "outlier" -> yellow.
pairs.panels(
  df[, 1:7],
  bg = c("black", "yellow")[df$col],
  pch = 21
)