# 이상치(outlier) 찾기 및 처리
outlier 식별
- ESD (Extreme Studentized Deviate) : 평균에서 3 표준편차 이상 떨어진 값
- 사분위수 이용. boxplot inner fence (Q1 - 1.5*IQR, Q3 + 1.5*IQR) 벗어난 값

outlier 처리방법
- 절단(trimming) : outlier 포함된 레코드 삭제
- 조정(winsorizing) : outlier를 상한 또는 하한 값으로 조정
# Case 1. iris

# Preview the first six rows of the built-in iris data set.
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# (1) 사분위수 이용 : 1개 변수에 대한 이상치

# Box plots of the four numeric measurements. Species is a factor and is left
# out: boxplot() would otherwise draw its integer codes as a fifth box.
boxplot(iris[, 1:4])

# Box plot of Sepal.Width alone, the variable screened for outliers below.
boxplot(iris$Sepal.Width)

a <- iris$Sepal.Width

# fivenum : minimum, lower-hinge, median, upper-hinge, maximum
# Build both fences from the hinges and the hinge spread so the flagged rows
# are exactly the points boxplot() draws beyond the whiskers (coef = 1.5).
# IQR() is quantile-based and can differ from the hinge spread on other data.
hinges <- fivenum(a)
spread <- hinges[4] - hinges[2]

# Rows below the lower fence
which(a < hinges[2] - 1.5 * spread)
## [1] 61
# Rows above the upper fence
which(a > hinges[4] + 1.5 * spread)
## [1] 16 33 34
# (2) lofactor 함수 (local outlier factor algorithm) : 모든 변수 고려한 이상치

# lofactor() comes from the DMwR package.
library(DMwR)
## Loading required package: lattice
## Loading required package: grid

# Local outlier factor score for every row, computed on the four numeric
# columns. k is the number of neighbours used to compute each score.
outlier.score <- lofactor(iris[, 1:4], k = 5)
# Density of the scores: the observations scoring around 2.0 and 2.5 are the
# outliers.
plot(density(outlier.score), main = "outlier score of iris")

# Ten largest scores; the three rows with score > 1.9 are taken as outliers.
head(sort(outlier.score, decreasing = TRUE), 10)
##  [1] 2.479960 2.029263 1.959143 1.602584 1.548281 1.528656 1.480122
##  [8] 1.463896 1.455104 1.451772
# Row indices of the three highest-scoring observations, highest first.
outliers <- head(order(outlier.score, decreasing = TRUE), 3)
outliers
## [1]  42 107  23
# Show the row number only for the outliers; every other row is drawn as ".".
# Starting from "." and filling in the outliers stays correct when `outliers`
# is empty, whereas a negative index on an empty vector would blank nothing.
labels <- rep(".", nrow(iris))
labels[outliers] <- outliers

# Principal component analysis, shown as a biplot
biplot(prcomp(iris[, 1:4]), cex = 0.8, xlabs = labels)

# Plotting symbol per row: "@" for the outliers, "." for everything else.
pch <- replace(rep(".", nrow(iris)), outliers, "@")

# Colour per row: red for the outliers, black otherwise.
col <- replace(rep("black", nrow(iris)), outliers, "red")

# Scatterplot matrix of the four numeric columns with the outliers highlighted.
pairs(iris[, 1:4], pch = pch, col = col)

# Case 2. USairpollution

# Packages for Case 2. pairs.panels() (used for plot 2) comes from psych;
# loading MVA also attaches HSAUR2, as the startup messages below show.
# NOTE(review): USairpollution is presumably supplied by HSAUR2 - confirm.
library(psych)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
# Working copy of the air-pollution data, followed by a six-row preview.
df <- USairpollution
head(df, n = 6L)
##             SO2 temp manu popul wind precip predays
## Albany       46 47.6   44   116  8.8  33.36     135
## Albuquerque  11 56.8   46   244  8.9   7.77      58
## Atlanta      24 61.5  368   497  9.1  48.34     115
## Baltimore    47 55.0  625   905  9.6  41.31     111
## Buffalo      11 47.1  391   463 12.4  36.11     166
## Charleston   31 55.2   35    71  6.5  40.75     148
# Local outlier factor score for every city, using all seven columns and
# k = 5 neighbours.
# NOTE(review): the columns are on very different scales (manu/popul reach the
# thousands, wind is around 10), so the large columns presumably dominate the
# neighbour distances - confirm whether scale(df) was intended.
outlier.score <- lofactor(df, k = 5)
plot(density(outlier.score), main = "outlier score of USairpollution")

# All scores, largest first; two cities have a score > 3.
sort(outlier.score, decreasing = TRUE)
##  [1] 6.7308161 3.6374196 2.4217025 2.1712077 1.8761657 1.8285197 1.4999312
##  [8] 1.3550799 1.2746045 1.2568747 1.2199493 1.1737022 1.1598072 1.1545934
## [15] 1.1054748 1.1030826 1.1018627 1.0744590 1.0705834 1.0631965 1.0591652
## [22] 1.0560957 1.0524985 1.0515998 1.0511851 1.0315732 1.0211214 1.0185135
## [29] 1.0161363 1.0158912 1.0110542 1.0025476 0.9947553 0.9904630 0.9828882
## [36] 0.9796012 0.9784709 0.9767661 0.9708327 0.9684696 0.9450678
# Row indices of the two highest-scoring cities, highest first.
outliers <- head(order(outlier.score, decreasing = TRUE), 2)
outliers
## [1]  7 30
# The flagged rows themselves.
df[outliers, ]
##              SO2 temp manu popul wind precip predays
## Chicago      110 50.6 3344  3369 10.4  34.44     122
## Philadelphia  69 54.6 1692  1950  9.6  39.93     115
# Show the row number only for the outliers; every other row is drawn as ".".
# Starting from "." and filling in the outliers stays correct when `outliers`
# is empty, whereas a negative index on an empty vector would blank nothing.
labels <- rep(".", nrow(df))
labels[outliers] <- outliers

# Principal component analysis, shown as a biplot
biplot(prcomp(df), cex = 0.8, xlabs = labels)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

# plot 1
# Plotting symbol per row: 1 for the outliers, 4 for everything else.
pch <- replace(rep(4, nrow(df)), outliers, 1)

# Colour per row: red for the outliers, black otherwise.
col <- replace(rep("black", nrow(df)), outliers, "red")

# Scatterplot matrix of all columns with the outliers highlighted.
pairs(df, pch = pch, col = col)

# plot 2
# Tag every row as "normal" or "outlier". The levels are given explicitly so
# the colour lookup below (level 1 -> "black", level 2 -> "yellow") does not
# depend on alphabetical level order or on at least one outlier being present.
df$col <- factor(
  ifelse(seq_len(nrow(df)) %in% outliers, "outlier", "normal"),
  levels = c("normal", "outlier")
)
head(df, 10)
##             SO2 temp manu popul wind precip predays     col
## Albany       46 47.6   44   116  8.8  33.36     135  normal
## Albuquerque  11 56.8   46   244  8.9   7.77      58  normal
## Atlanta      24 61.5  368   497  9.1  48.34     115  normal
## Baltimore    47 55.0  625   905  9.6  41.31     111  normal
## Buffalo      11 47.1  391   463 12.4  36.11     166  normal
## Charleston   31 55.2   35    71  6.5  40.75     148  normal
## Chicago     110 50.6 3344  3369 10.4  34.44     122 outlier
## Cincinnati   23 54.0  462   453  7.1  39.04     132  normal
## Cleveland    65 49.7 1007   751 10.9  34.99     155  normal
## Columbus     26 51.5  266   540  8.6  37.01     134  normal
# Panel plot of the seven original columns; the point fill colour is picked by
# the factor code of df$col.
pairs.panels(df[1:7], bg = c("black", "yellow")[df$col], pch = 21)