R을 활용한 통계분석 - 6.분산분석(ANOVA)

# 한 그룹의 평균과 특정한 수 비교   : One Sample T-test
# 두 그룹의 평균 비교               : Two Sample T-test 
# 세 그룹 이상의 평균 비교
#   --- 설명변수 범주형             : ANOVA (분산분석)
#   --- 설명변수 범주형 + 범주형    : Two-way ANOVA
#   --- 설명변수 범주형 + 연속형    : ANCOVA (공분산분석)

# 분산분석 (ANOVA) = 설명변수가 "범주형"인 회귀분석

# 분산분석 in R
# --- 회귀분석과 마찬가지로 lm 명령어를 사용
# --- 설명(독립)변수가 그룹을 의미하는 범주형 변수 (Factor함수 사용하여 정의)

# y = b0 + b1*x + e   :   x가 0 또는 1을 가지는 범주형 변수라면?
# x = 0 이면 y = b0 + e
# x = 1 이면 y = b0 + b1 + e
# b1 = 0 이면 x의 두 그룹은 평균이 같다.
# 즉, H0 : mu1 = mu2  <-->  H0 : b1 = 0

# 그룹이 3개 이상이라면? x가 3개 그룹을 정의하는 질적변수라면? (서울, 부산, 제주)
# 더미 변수 (k-1) 개를 만든다.
# y = b0 + b1*x1 + b2*x2 + e

#           x1      x2  
# 서울      1       0
# 부산      0       1
# 제주      0       0

# 영화 등급이 총관객수에 영향을 미치는가?

movie <- read.csv("data/movie.csv", header = T)
summary(movie$rating)

## 12세이상관람가 15세이상관람가     전체관람가 청소년관람불가 
##             43             94             48             42

# 독립변수 = factor
levels(movie$rating)

## [1] "12세이상관람가" "15세이상관람가" "전체관람가"     "청소년관람불가"

levels(movie$rating) <- c("12years", "15years", "All", "AdultOnly")

library(psych)
describeBy(movie$total_seen, group = movie$rating, mat = T)

##     item    group1 vars  n      mean        sd  median   trimmed       mad
## X11    1   12years    1 43 1774489.7 2069031.5  893027 1390505.8 1082776.9
## X12    2   15years    1 94 2095732.6 2824207.5 1045561 1477272.3 1139430.7
## X13    3       All    1 48  638541.3  532817.5  343360  571803.5  269161.6
## X14    4 AdultOnly    1 42 1015156.6 1133648.8  493634  803595.4  492685.0
##        min      max    range     skew    kurtosis        se
## X11 101351  9001312  8899961 1.663481  2.32459613 315524.35
## X12 101425 12983330 12881905 2.211749  4.75169040 291294.76
## X13 106432  2080445  1974013 1.014964 -0.04105884  76905.58
## X14 113848  4720050  4606202 1.735021  2.78575581 174925.82

# 종속변수
hist(movie$total_seen)      # 데이터가 한쪽으로 몰려있으므로 log 변환을 해준다.

hist(log(movie$total_seen))

with(movie, tapply(log(total_seen), rating, mean))

##   12years   15years       All AdultOnly 
##  13.70416  13.77233  13.02122  13.27292

boxplot(log(total_seen) ~ rating, col = "red", data = movie)

# 회귀분석
par(mfcol=c(2,2))

out0 <- lm(total_seen ~ rating, movie)
plot(out0)

out <- lm(log(total_seen) ~ rating, movie)
plot(out)                   # 종속변수 변환 전보다 후에 잔차도가 안정됨

summary(out)

## 
## Call:
## lm(formula = log(total_seen) ~ rating, data = movie)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.24526 -0.86293 -0.01617  0.87955  2.60684 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.70416    0.17979  76.221  < 2e-16 ***
## rating15years    0.06818    0.21706   0.314  0.75375    
## ratingAll       -0.68294    0.24756  -2.759  0.00628 ** 
## ratingAdultOnly -0.43123    0.25578  -1.686  0.09320 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.179 on 223 degrees of freedom
## Multiple R-squared:  0.06604,    Adjusted R-squared:  0.05347 
## F-statistic: 5.256 on 3 and 223 DF,  p-value: 0.001601

par(mfcol=c(1,1))

# Result

# F-검정 (H0 : mu1 = mu2 = mu3 = mu4, 모든 그룹의 평균이 같다)
# p-value: 0.001601 < 0.05 : 등급별 총관객수는 서로 유의미한 '차이'가 있다.

# (Intercept)                   --- (b0) 12years 평균
# rating15years     0.75375     --- (b1) 15years 평균 - 12years 평균 : 차이 없음
# ratingAll         0.00628 **  --- (b2) All 평균 - 12years 평균 : 전체관람가와 12세관람가는 유의한 차이가 있다.
# ratingAdultOnly   0.09320     --- (b3) Adult - 12 years

# But, 어떤 등급끼리 차이가 있는지 알 수 없다.
# 각 변수 95% 신뢰구간 ---> 0.95 * 0.95 * 0.95 : 신뢰구간이 작아진다.
# 실제로 유의하지 않은데 유의하게 결론이 나올 수 있다.
# 그래서 t-test 대신에 Dunnett 또는 Tukey 방법론 사용 !!!

# 다중비교

# Dunnett Method : reference level(기준)과 각 범주의 평균 차이 검정 
# Tukey Method : 가능한 모든 범주 쌍의 평균 차이 검정

library(multcomp)

out <- lm(log(total_seen) ~ rating, movie)

# Dunnett Method
dunnet <- glht(out, linfct = mcp(rating = "Dunnett"))
summary(dunnet)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Dunnett Contrasts
## 
## 
## Fit: lm(formula = log(total_seen) ~ rating, data = movie)
## 
## Linear Hypotheses:
##                          Estimate Std. Error t value Pr(>|t|)  
## 15years - 12years == 0    0.06818    0.21706   0.314    0.977  
## All - 12years == 0       -0.68294    0.24756  -2.759    0.017 *
## AdultOnly - 12years == 0 -0.43123    0.25578  -1.686    0.213  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

plot(dunnet)

    # All vs 12years : 유의한 차이가 있다.
    # 나머지는 p-value > 0.05 이고, 95% 신뢰구간이 0을 포함하므로 유의한 차이가 없다.

# Tukey Method
tukey <- glht(out, linfct = mcp(rating = "Tukey"))
summary(tukey)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lm(formula = log(total_seen) ~ rating, data = movie)
## 
## Linear Hypotheses:
##                          Estimate Std. Error t value Pr(>|t|)   
## 15years - 12years == 0    0.06818    0.21706   0.314  0.98911   
## All - 12years == 0       -0.68294    0.24756  -2.759  0.03139 * 
## AdultOnly - 12years == 0 -0.43123    0.25578  -1.686  0.33068   
## All - 15years == 0       -0.75111    0.20916  -3.591  0.00228 **
## AdultOnly - 15years == 0 -0.49941    0.21882  -2.282  0.10409   
## AdultOnly - All == 0      0.25170    0.24911   1.010  0.74125   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

plot(tukey)

    # All vs 15years 사이에도 유의한 차이가 있음을 알 수 있다.

# 추정된 회귀식
# y = 13.70 + 0.068*x1 - 0.682*x2 - 0.431*x3
# 13.70 : 12세관람가 영화의 평균 log(총관객수)
# 13.70 + 0.068 : 15세 관람가 영화의 평균 log(총관객수)

# 특정 범주를 하나로 합쳐서 분석하고자 할 경우

movie$rating2 <- movie$rating
levels(movie$rating2)

## [1] "12years"   "15years"   "All"       "AdultOnly"

levels(movie$rating2) <- c(2,2,1,3)     # 12세 + 15세
summary(movie$rating2)

##   2   1   3 
## 137  48  42

# 가장 앞에 있는 level "2"가 reference level로 자동 설정
out2 <- lm(log(total_seen) ~ rating2, movie)
summary(out2)

## 
## Call:
## lm(formula = log(total_seen) ~ rating2, data = movie)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.22459 -0.88038 -0.04003  0.86960  2.62824 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.7509     0.1005 136.791  < 2e-16 ***
## rating21     -0.7297     0.1974  -3.698 0.000274 ***
## rating23     -0.4780     0.2075  -2.303 0.022176 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.177 on 224 degrees of freedom
## Multiple R-squared:  0.06562,    Adjusted R-squared:  0.05728 
## F-statistic: 7.866 on 2 and 224 DF,  p-value: 0.0004994

# reference level을 1 (All)로 설정
movie$rating2 <- relevel(movie$rating2, ref = "1")
out2 <- lm(log(total_seen) ~ rating2, movie)
summary(out2)

## 
## Call:
## lm(formula = log(total_seen) ~ rating2, data = movie)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.22459 -0.88038 -0.04003  0.86960  2.62824 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.0212     0.1698  76.672  < 2e-16 ***
## rating22      0.7297     0.1974   3.698 0.000274 ***
## rating23      0.2517     0.2486   1.012 0.312411    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.177 on 224 degrees of freedom
## Multiple R-squared:  0.06562,    Adjusted R-squared:  0.05728 
## F-statistic: 7.866 on 2 and 224 DF,  p-value: 0.0004994

# 1 (All)과 2 (12세+15세)는 평균 총관객수에 유의한 차이가 있다.
# 1 (All)과 3 (AdultOnly)은 차이가 없다.

# 공분산분석 (ANCOVA)

# 공분산분석 = 분산분석 + 회귀분석
# 종속변수의 변동을 설명하는데 그룹변수 이외의 다른 변인이 있을때 그 효과를 통제
# 설명(독립)변수가 질적변수와 양적변수가 함께 있는 경우

# Case.
# 거식증 치료제 : CBT, FT, Control 3가지 치료방법
# 종속변수 : 치료 전/후 몸무게 차이 (postwt - prewt)
# 설명변수 : 치료 전 몸무게 (공변량, 통제할 변수), 치료방법 (주요 관심 설명변수)

# 분산분석 : 치료전후 몸무게 변화가 치료방법 간에 차이가 있는가 검증
# 공분산분석 : 치료 전 몸무게가 무거울수록 치료 후 몸무게 변화가 크지 않을까? 
#              이것이 치료방법 간 차이를 보는데 방해가 될 수 있는지 검증

df <- read.csv("data/anorexia.csv")
str(df)

## 'data.frame':    72 obs. of  3 variables:
##  $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Prewt : num  80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
##  $ Postwt: num  80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...

summary(df)

##   Treat        Prewt           Postwt      
##  CBT :29   Min.   :70.00   Min.   : 71.30  
##  Cont:26   1st Qu.:79.60   1st Qu.: 79.33  
##  FT  :17   Median :82.30   Median : 84.05  
##            Mean   :82.41   Mean   : 85.17  
##            3rd Qu.:86.00   3rd Qu.: 91.55  
##            Max.   :94.90   Max.   :103.60

# dummy 변수 생성시 Control 그룹을 레퍼런스로 지정하기 위해 relevel
df$Treat <- relevel(df$Treat, ref = "Cont")
summary(df)

##   Treat        Prewt           Postwt      
##  Cont:26   Min.   :70.00   Min.   : 71.30  
##  CBT :29   1st Qu.:79.60   1st Qu.: 79.33  
##  FT  :17   Median :82.30   Median : 84.05  
##            Mean   :82.41   Mean   : 85.17  
##            3rd Qu.:86.00   3rd Qu.: 91.55  
##            Max.   :94.90   Max.   :103.60

boxplot(Prewt ~ Treat, df)

boxplot(Postwt - Prewt ~ Treat, df)

# 분산분석 : 이전 몸무게 관심 없음
out <- lm(Postwt - Prewt ~ Treat, df)
summary(out)

## 
## Call:
## lm(formula = Postwt - Prewt ~ Treat, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.565  -4.543  -1.007   3.846  17.893 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   -0.450      1.476  -0.305   0.7614   
## TreatCBT       3.457      2.033   1.700   0.0936 . 
## TreatFT        7.715      2.348   3.285   0.0016 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.528 on 69 degrees of freedom
## Multiple R-squared:  0.1358, Adjusted R-squared:  0.1108 
## F-statistic: 5.422 on 2 and 69 DF,  p-value: 0.006499

# 공분산 분석 : 이전 몸무게가 동일한 사람들을 기준으로 분석
# 설명변수 = 범주형(Treat) + 연속형(Prewt)
out <- lm(Postwt - Prewt ~ Prewt + Treat, df)
anova(out)

## Analysis of Variance Table
## 
## Response: Postwt - Prewt
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Prewt      1  447.9  447.85  9.1970 0.0034297 ** 
## Treat      2  766.3  383.14  7.8681 0.0008438 ***
## Residuals 68 3311.3   48.70                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

    # Prewt 통제시 Treat 변수가 설명해 주는 y의 변동성에 대한 F-test. 
    # p-value < 0.05 --> Treat 3개 그룹간 평균의 차이가 유의하다.
summary(out)

## 
## Call:
## lm(formula = Postwt - Prewt ~ Prewt + Treat, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1083  -4.2773  -0.5484   5.4838  15.2922 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.6740    13.2167   3.456 0.000950 ***
## Prewt        -0.5655     0.1612  -3.509 0.000803 ***
## TreatCBT      4.0971     1.8935   2.164 0.033999 *  
## TreatFT       8.6601     2.1931   3.949 0.000189 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.978 on 68 degrees of freedom
## Multiple R-squared:  0.2683, Adjusted R-squared:  0.236 
## F-statistic: 8.311 on 3 and 68 DF,  p-value: 8.725e-05

    # Prewt가 클수록 Postwt-Prewt 변화율이 적다 (음의 상관관계)
    # TreatCBT & TreatFT 모두 유의한 차이를 가짐
    # 각 범주간 평균의 차이 검증 --> Dunnett Method
plot(out)

dunnet <- glht(out, linfct = mcp(Treat = "Dunnett"))
summary(dunnet)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Dunnett Contrasts
## 
## 
## Fit: lm(formula = Postwt - Prewt ~ Prewt + Treat, data = df)
## 
## Linear Hypotheses:
##                 Estimate Std. Error t value Pr(>|t|)    
## CBT - Cont == 0    4.097      1.893   2.164 0.062939 .  
## FT - Cont == 0     8.660      2.193   3.949 0.000373 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

    # CBT보다는 FT가 더 유의미한 차이를 보여준다.


# Result

# y = 45.674 - 0.565 * prewt + 4.097 * CBT + 8.660 * FT
# --- control : y = 45.674 - 0.565 * prewt
# --- CBT     : y = 45.674 + 4.097 - 0.565 * prewt
#               Prewt가 평균이었던 사람에 대해 CBT는 control 그룹보다 4.097만큼 더 몸무게 변화를 주었다.
# --- FT      : y = 45.674 + 8.660 - 0.565 * prewt
#               Prewt가 평균이었던 사람에 대해 FT는 control 그룹보다 8.66만큼 더 몸무게 변화를 주었다.

gubun <- as.numeric(df$Treat)
plot(Postwt - Prewt ~ Prewt, df, col = gubun, pch = gubun)
legend("topright", levels(df$Treat), col = 1:3, pch = 1:3, lty = 1)
abline(45.674, - 0.565, col = 1)
abline(45.674 + 4.097, - 0.565, col = 2)
abline(45.674 + 8.660, - 0.565, col = 3)

# 더미변수 포함한 회귀분석

# Case.
# 수축기혈압(Systolic blood pressure; SBP)과 연령 (age)을 남자 40명, 여자 29명으로부터 기록
# 연령이 높을수록 수축기혈압이 높은 경향. 연령과 혈압 사이의 관계가 남녀간에 차이가 있는가?

# y = b0 + b1*x + b2*z + b3*x*z + e : x = age, z = sex (1 female, 0 male)

sbp <- read.csv("data/SBP.csv")
head(sbp)

##   SBP AGE SEX
## 1 144  39   1
## 2 138  45   1
## 3 145  47   1
## 4 162  65   1
## 5 142  46   1
## 6 170  67   1

summary(sbp)

##       SBP             AGE             SEX        
##  Min.   :110.0   Min.   :17.00   Min.   :0.0000  
##  1st Qu.:135.0   1st Qu.:36.00   1st Qu.:0.0000  
##  Median :149.0   Median :47.00   Median :0.0000  
##  Mean   :148.7   Mean   :46.14   Mean   :0.4203  
##  3rd Qu.:162.0   3rd Qu.:59.00   3rd Qu.:1.0000  
##  Max.   :185.0   Max.   :70.00   Max.   :1.0000

model <- lm(SBP ~ AGE + SEX, sbp)
summary(model)

## 
## Call:
## lm(formula = SBP ~ AGE + SEX, data = sbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.705  -3.299   1.248   4.325  21.160 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.28698    3.63824  30.313  < 2e-16 ***
## AGE           0.95606    0.07153  13.366  < 2e-16 ***
## SEX         -13.51345    2.16932  -6.229  3.7e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.878 on 66 degrees of freedom
## Multiple R-squared:  0.7759, Adjusted R-squared:  0.7691 
## F-statistic: 114.2 on 2 and 66 DF,  p-value: < 2.2e-16

# 두 회귀선은 평행한가? --> H0 : b3 = 0
model1 <- lm(SBP ~ AGE + SEX + AGE*SEX, sbp)
summary(model1)

## 
## Call:
## lm(formula = SBP ~ AGE + SEX + AGE * SEX, data = sbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.647  -3.410   1.254   4.314  21.153 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.03853    4.73610  23.234  < 2e-16 ***
## AGE           0.96135    0.09632   9.980 9.63e-15 ***
## SEX         -12.96144    7.01172  -1.849   0.0691 .  
## AGE:SEX      -0.01203    0.14519  -0.083   0.9342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.946 on 65 degrees of freedom
## Multiple R-squared:  0.7759, Adjusted R-squared:  0.7656 
## F-statistic: 75.02 on 3 and 65 DF,  p-value: < 2.2e-16

anova(model1)

## Analysis of Variance Table
## 
## Response: SBP
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## AGE        1 14951.3 14951.3 186.8390 < 2.2e-16 ***
## SEX        1  3058.5  3058.5  38.2210 4.692e-08 ***
## AGE:SEX    1     0.5     0.5   0.0069    0.9342    
## Residuals 65  5201.4    80.0                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

    # AGE:SEX의 p-value 0.9342 > 0.05 이므로 서로 interaction이 없다.

# 두 회귀선이 동일한가? --> H0 : b2 = b3 = 0
model2 <- lm(SBP ~ AGE, sbp)
summary(model2)

## 
## Call:
## lm(formula = SBP ~ AGE, data = sbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.782  -7.632   1.968   8.201  22.651 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 103.34905    4.33190   23.86   <2e-16 ***
## AGE           0.98333    0.08929   11.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.1 on 67 degrees of freedom
## Multiple R-squared:  0.6441, Adjusted R-squared:  0.6388 
## F-statistic: 121.3 on 1 and 67 DF,  p-value: < 2.2e-16

anova(model2)

## Analysis of Variance Table
## 
## Response: SBP
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## AGE        1 14951.3 14951.3  121.27 < 2.2e-16 ***
## Residuals 67  8260.5   123.3                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(model2, model1)   # model1과 model2 비교. 귀무가설 기각 = 동일하지 않다.

## Analysis of Variance Table
## 
## Model 1: SBP ~ AGE
## Model 2: SBP ~ AGE + SEX + AGE * SEX
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)    
## 1     67 8260.5                                 
## 2     65 5201.4  2    3059.1 19.114 2.96e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Two-way ANOVA = 범주형 설명변수가 2개인 회귀분석

# 두 범주형 변수가 서로 interaction을 하는 경우

attach(warpbreaks)  
summary(warpbreaks)

##      breaks      wool   tension
##  Min.   :10.00   A:27   L:18   
##  1st Qu.:18.25   B:27   M:18   
##  Median :26.00          H:18   
##  Mean   :28.15                 
##  3rd Qu.:34.00                 
##  Max.   :70.00

# breaks : 베틀에서 실이 끊어지는 횟수
# wool : 울의 타입
# tension : 실의 장력 수준

boxplot(breaks ~ wool + tension, col = "red", data = warpbreaks, xlab = "wool+tension")

interaction.plot(tension, wool, breaks, col = c("red", "blue"))

    # 두 변수가 interaction이 없으면 비슷한 거리를 유지하면서 평행하다
    # 두 변수가 interaction이 있으면 서로 만난다

model <- lm(breaks ~ wool + tension + wool*tension, warpbreaks)
anova(model)

## Analysis of Variance Table
## 
## Response: breaks
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## wool          1  450.7  450.67  3.7653 0.0582130 .  
## tension       2 2034.3 1017.13  8.4980 0.0006926 ***
## wool:tension  2 1002.8  501.39  4.1891 0.0210442 *  
## Residuals    48 5745.1  119.69                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

    # wool:tension의 p-value < 0.05 로 서로 interaction이 있다.
    # interaction이 있으면 두 변수이 효과를 각각 분리해서 해석할 수 없다.

summary(model)

## 
## Call:
## lm(formula = breaks ~ wool + tension + wool * tension, data = warpbreaks)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.5556  -6.8889  -0.6667   7.1944  25.4444 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      44.556      3.647  12.218 2.43e-16 ***
## woolB           -16.333      5.157  -3.167 0.002677 ** 
## tensionM        -20.556      5.157  -3.986 0.000228 ***
## tensionH        -20.000      5.157  -3.878 0.000320 ***
## woolB:tensionM   21.111      7.294   2.895 0.005698 ** 
## woolB:tensionH   10.556      7.294   1.447 0.154327    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.94 on 48 degrees of freedom
## Multiple R-squared:  0.3778, Adjusted R-squared:  0.3129 
## F-statistic: 5.828 on 5 and 48 DF,  p-value: 0.0002772

plot(model)

# 공분산분석 (ANCOVA)  vs.  더미변수 포함한 회귀분석

# ANCOVA
# --- 주요 관심사는 집단간 종속변수의 평균 차이
# --- 공변량의 효과를 통제한 후 집단간 차이를 파악하는 것이 목적
# --- 공변량이 전체 평균인 수준에서 종속변수의 평균치를 비교
# --- 각 집단의 회귀식이 평행하지 않으면 의미 없음
# --- 회귀선들이 평행한지 검정 후 귀무가설(회귀선이 평행하다)이 기각되지 않으면 ANCOVA 실시

# 더미변수를 포함한 회귀분석
# --- 범주형 변수 뿐만 아니라 공변량도 관심대상
# --- 만일 회귀선이 평행하지 않다면 해당 설명변수들 간에 interaction effect 존재
# --- 예) 나이가 어릴때는 여자의 혈압이 더 높지만 나이가 들면 남자의 혈압이 높다.

# practice 6

df <- read.csv("data/Forbes500.csv")
str(df)

## 'data.frame':    79 obs. of  8 variables:
##  $ Company     : Factor w/ 79 levels "AH Robins","Air Products",..: 2 3 4 5 6 7 8 9 10 11 ...
##  $ Assets      : int  2687 13271 13621 3614 6425 1022 1093 1529 2788 19788 ...
##  $ Sales       : int  1870 9115 4848 367 6131 1754 1679 1295 271 9084 ...
##  $ Market_Value: int  1890 8190 4572 90 2448 1370 1070 444 304 10636 ...
##  $ Profits     : num  145.7 -279 485 14.1 345.8 ...
##  $ Cash_Flow   : num  352.2 83 898.9 24.6 682.5 ...
##  $ Employees   : num  18.2 143.8 23.4 1.1 49.5 ...
##  $ sector      : Factor w/ 9 levels "Communication",..: 7 7 2 3 9 4 5 7 3 1 ...

summary(df)

##                       Company       Assets          Sales        
##  AH Robins                : 1   Min.   :  223   Min.   :  176.0  
##  Air Products             : 1   1st Qu.: 1122   1st Qu.:  815.5  
##  Allied Signal            : 1   Median : 2788   Median : 1754.0  
##  American Electric Power  : 1   Mean   : 5941   Mean   : 4178.3  
##  American Savings Bank FSB: 1   3rd Qu.: 5802   3rd Qu.: 4563.5  
##  AMR                      : 1   Max.   :52634   Max.   :50056.0  
##  (Other)                  :73                                    
##   Market_Value        Profits         Cash_Flow         Employees     
##  Min.   :   53.0   Min.   :-771.5   Min.   :-651.90   Min.   :  0.60  
##  1st Qu.:  512.5   1st Qu.:  39.0   1st Qu.:  75.15   1st Qu.:  3.95  
##  Median :  944.0   Median :  70.5   Median : 133.30   Median : 15.40  
##  Mean   : 3269.8   Mean   : 209.8   Mean   : 400.93   Mean   : 37.60  
##  3rd Qu.: 1961.5   3rd Qu.: 188.1   3rd Qu.: 328.85   3rd Qu.: 48.50  
##  Max.   :95697.0   Max.   :6555.0   Max.   :9874.00   Max.   :400.20  
##                                                                       
##            sector  
##  Finance      :17  
##  Energy       :15  
##  Manufacturing:10  
##  Retail       :10  
##  HiTech       : 8  
##  Other        : 7  
##  (Other)      :12

summary(df$sector)

##  Communication         Energy        Finance         HiTech  Manufacturing 
##              2             15             17              8             10 
##        Medical          Other         Retail Transportation 
##              4              7             10              6

df$sector <- relevel(df$sector, ref = "HiTech")

# Case 1. Sales ~ sector : 분산분석

# boxplot을 통해 sector간 비교.
boxplot(Sales ~ sector, df)

boxplot(log(Sales) ~ sector, df)   # log변환한 데이터가 분석에 더 적합

# Sales가 sector 간에 서로 다른지 판단하기 위한 분산분석
# 구체적으로 어느 sector간 통계적으로 유의한 차이가 있는지 Dunnett test
model <- lm(log(Sales) ~ sector, df)
summary(model)

## 
## Call:
## lm(formula = log(Sales) ~ sector, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0049 -0.6900 -0.1449  0.3527  3.0534 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8.51516    0.35469  24.007  < 2e-16 ***
## sectorCommunication  -0.03473    0.79311  -0.044  0.96520    
## sectorEnergy         -1.21955    0.43921  -2.777  0.00704 ** 
## sectorFinance        -1.87598    0.43012  -4.361 4.35e-05 ***
## sectorManufacturing  -0.26482    0.47587  -0.557  0.57964    
## sectorMedical        -2.02592    0.61434  -3.298  0.00153 ** 
## sectorOther          -1.05866    0.51921  -2.039  0.04523 *  
## sectorRetail         -0.05092    0.47587  -0.107  0.91510    
## sectorTransportation -0.63658    0.54180  -1.175  0.24400    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.003 on 70 degrees of freedom
## Multiple R-squared:  0.3767, Adjusted R-squared:  0.3054 
## F-statistic: 5.288 on 8 and 70 DF,  p-value: 3.387e-05

    # F-statistic: 5.288 on 8 and 70 DF, p-value: 3.387e-05 < 0.05 : sector간 차이가 있다.

dunnet <- glht(model, linfct = mcp(sector = "Dunnett"))
summary(dunnet)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Dunnett Contrasts
## 
## 
## Fit: lm(formula = log(Sales) ~ sector, data = df)
## 
## Linear Hypotheses:
##                              Estimate Std. Error t value Pr(>|t|)    
## Communication - HiTech == 0  -0.03473    0.79311  -0.044   1.0000    
## Energy - HiTech == 0         -1.21955    0.43921  -2.777   0.0436 *  
## Finance - HiTech == 0        -1.87598    0.43012  -4.361   <0.001 ***
## Manufacturing - HiTech == 0  -0.26482    0.47587  -0.557   0.9957    
## Medical - HiTech == 0        -2.02592    0.61434  -3.298   0.0105 *  
## Other - HiTech == 0          -1.05866    0.51921  -2.039   0.2307    
## Retail - HiTech == 0         -0.05092    0.47587  -0.107   1.0000    
## Transportation - HiTech == 0 -0.63658    0.54180  -1.175   0.7864    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

    # Hi-tech를 기준으로, Energy, Finance, Medical가 유의한 차이를 보인다.

# Case 2. Sales ~ Assets + sector : 공분산분석

# 산점도
plot(df$Assets, log(df$Sales))

plot(log(df$Assets), log(df$Sales))   # Sales와 Assets 모두 log 변환 후 분석

# 공분산분석
model <- lm(log(Sales) ~ log(Assets) + sector, df)
summary(model)

## 
## Call:
## lm(formula = log(Sales) ~ log(Assets) + sector, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87909 -0.31926 -0.03989  0.19548  1.81473 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.18402    0.45917   4.756 1.04e-05 ***
## log(Assets)           0.76745    0.05151  14.899  < 2e-16 ***
## sectorCommunication  -0.77359    0.39215  -1.973   0.0525 .  
## sectorEnergy         -0.99258    0.21596  -4.596 1.89e-05 ***
## sectorFinance        -2.14698    0.21175 -10.139 2.63e-15 ***
## sectorManufacturing  -0.06805    0.23377  -0.291   0.7719    
## sectorMedical        -0.77141    0.31286  -2.466   0.0162 *  
## sectorOther          -0.35301    0.25903  -1.363   0.1774    
## sectorRetail          0.50120    0.23632   2.121   0.0375 *  
## sectorTransportation -0.11525    0.26803  -0.430   0.6686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4921 on 69 degrees of freedom
## Multiple R-squared:  0.8522, Adjusted R-squared:  0.8329 
## F-statistic:  44.2 on 9 and 69 DF,  p-value: < 2.2e-16

dunnet <- glht(model, linfct = mcp(sector = "Dunnett"))
summary(dunnet)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Dunnett Contrasts
## 
## 
## Fit: lm(formula = log(Sales) ~ log(Assets) + sector, data = df)
## 
## Linear Hypotheses:
##                              Estimate Std. Error t value Pr(>|t|)    
## Communication - HiTech == 0  -0.77359    0.39215  -1.973   0.2628    
## Energy - HiTech == 0         -0.99258    0.21596  -4.596   <0.001 ***
## Finance - HiTech == 0        -2.14698    0.21175 -10.139   <0.001 ***
## Manufacturing - HiTech == 0  -0.06805    0.23377  -0.291   1.0000    
## Medical - HiTech == 0        -0.77141    0.31286  -2.466   0.0942 .  
## Other - HiTech == 0          -0.35301    0.25903  -1.363   0.6563    
## Retail - HiTech == 0          0.50120    0.23632   2.121   0.1978    
## Transportation - HiTech == 0 -0.11525    0.26803  -0.430   0.9993    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

# --> log(Assets) 이 평균수준으로 동일할 때 HiTech와 Energy, HiTech와 Finance sector 간에는 log(Sales)에 유의한 차이가 있다.


# Asset이 3000 million이고 finance sector에 있는 회사의 sales를 예측.
# log(Sales) = 2.184 - 2.146 + 0.767 * log(Assets)

y = exp(2.184 - 2.146 + 0.767 * log(3000))
y

## [1] 482.4532

R을 활용한 통계분석 - 6.분산분석(ANOVA)

woosa7