R을 활용한 통계분석 - 2.평균에 대한 추론

# 추정(estimation) : 표본을 통해 모집단 특성이 어떠한가 추측
# 가설검정(testing hypothesis) : 모집단 실제 값이 얼마나 되는가 하는 주장과 관련해 표본이 가지고 있는 정보를 이용해 가설이 올바른지 판정

# 구간 추정

# 평균에 대한 추정 : 관심이 되는 변수가 '양적변수'인 경우

library(reshape)
library(ggplot2)
attach(tips)
summary(tips)

##    total_bill         tip             sex      smoker      day    
##  Min.   : 3.07   Min.   : 1.000   Female: 87   No :151   Fri :19  
##  1st Qu.:13.35   1st Qu.: 2.000   Male  :157   Yes: 93   Sat :87  
##  Median :17.80   Median : 2.900                          Sun :76  
##  Mean   :19.79   Mean   : 2.998                          Thur:62  
##  3rd Qu.:24.13   3rd Qu.: 3.562                                   
##  Max.   :50.81   Max.   :10.000                                   
##      time          size     
##  Dinner:176   Min.   :1.00  
##  Lunch : 68   1st Qu.:2.00  
##               Median :2.00  
##               Mean   :2.57  
##               3rd Qu.:3.00  
##               Max.   :6.00

str(tips)

## 'data.frame':    244 obs. of  7 variables:
##  $ total_bill: num  17 10.3 21 23.7 24.6 ...
##  $ tip       : num  1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
##  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
##  $ size      : int  2 3 3 2 4 4 2 4 2 2 ...

# --- 한 레스토랑에 오는 손님들의 평균 결제금액은 얼마인가? = 19.79
# --- 팁은 평균적으로 얼마나 주나? = 2.998
# --- 평균적으로 몇 명이 한 팀으로 식사를 하는가? = 2.57

# 레스토랑의 손님 244개 팀의 평균 결제금액은 $19.79 (점추정)
# --- 전체 손님의 평균도 19.79일까? 다른 표본이 추출되면?

# 구간추정 = 점추정치를 기준으로 일정구간을 만들어 그 구간안에 모수가 포함될 가능성을 높이는 것.
# --- 모분산이 알려져 있는 경우 : 표준정규분포를 따르는 Z 통계량 이용
# --- 모분산을 모르는 경우 : 자유도가 n-1인 t-분포를 따르는 T 통계량 이용

t.test(tips$total_bill)

## 
##  One Sample t-test
## 
## data:  tips$total_bill
## t = 34.717, df = 243, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  18.66333 20.90855
## sample estimates:
## mean of x 
##  19.78594

# 95 percent confidence interval:
#     18.66333 20.90855             --> 95% 신뢰구간 = 18.66 ~ 20.91
# sample estimates: mean of x 
#     19.78594                      --> 점추정값
# t = 34.717, df = 243, p-value < 2.2e-16 --> p-value < 0.05 이기 때문에 추정값이 유의미하다


#데이터 개수가 많아지면 conf.lev이 작아져도 나쁘지 않음
t.test(tips$total_bill, conf.level = 0.99)   # 99% 신뢰구간

## 
##  One Sample t-test
## 
## data:  tips$total_bill
## t = 34.717, df = 243, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
##  18.30631 21.26557
## sample estimates:
## mean of x 
##  19.78594

# 한 그룹의 평균에 대한 T-검정 (One sample T-test)

# 한 모집단의 평균이 어떤 특정한 값과 같은지 검증

# (1) 귀무가설, 대립가설 설정
# (2) 가정 체크
# (3) 검정통계량과 p-value 계산 (t-검정 / t-test)
# (4) 결론

# 2006년 한국인 1인 1일 평균 알콜 섭취량 : 8.1g
# 2008년 무작위 조사. 1일 평균 알콜 섭취량이 늘어났는가?

# Case 1.
x <- c(15.5, 11.21, 12.67, 8.87, 12.15, 9.88, 2.06, 14.5, 0, 4.97)
summary(x)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.945  10.550   9.181  12.540  15.500

# Case 2.
xx <- c(15.5, 11.21, 12.67, 8.87, 12.15, 9.88, 2.06, 14.5, 0, 4.97, 10.55, 20.1, 9.99, 17.97, 8.05, 13.13, 15.78)
summary(xx)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.87   11.21   11.02   14.50   20.10

# (1) 귀무가설 : mu = 8.1 (늘지 않았다)
# (2) 가정 체크
#     --- 자료가 정규분포를 따르는지.
#     --- 심하게 편중되거나 극단치가 있는 경우 표본수가 50개 (또는 30개) 이상인지.

boxplot(x)   # 극단치 없음

hist(x)

qqnorm(x)
qqline(x)    # 어느 정도 정규분포를 따른다.

boxplot(xx)

hist(x)

qqnorm(xx)
qqline(xx)

# Shapiro-Wilk normality test (정규성 검정)
# P-value > 0.05 : 귀무가설(정규분포를 따른다) 기각 못함 --> t-test 진행
shapiro.test(x)

## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.92341, p-value = 0.3863

shapiro.test(xx)

## 
##  Shapiro-Wilk normality test
## 
## data:  xx
## W = 0.97313, p-value = 0.8706

# (3) T-검정

t.test(x, mu = 8.1)

## 
##  One Sample t-test
## 
## data:  x
## t = 0.653, df = 9, p-value = 0.5301
## alternative hypothesis: true mean is not equal to 8.1
## 95 percent confidence interval:
##   5.436132 12.925868
## sample estimates:
## mean of x 
##     9.181

    # t = 0.653, df = 9, p-value = 0.5301
    # mean of x : 9.181 

t.test(xx, mu = 8.1)

## 
##  One Sample t-test
## 
## data:  xx
## t = 2.276, df = 16, p-value = 0.03695
## alternative hypothesis: true mean is not equal to 8.1
## 95 percent confidence interval:
##   8.300426 13.744280
## sample estimates:
## mean of x 
##  11.02235

    # t = 2.276, df = 16, p-value = 0.03695
    # mean of x : 11.02235 


# (4) 결론

# Case 1.
# 모평균이 8.1일 때, 표본평균이 9.18이 나올 가능성은 0.53
# p-value > 0.05 이므로 귀무가설 기각할 수 없음.
# 95% 신뢰구간 : 5.436 ~ 12.926

# Case 2.
# p-value < 0.05 이므로 귀무가설 기각 --> 2008년 평균 알콜 섭취량이 증가했다

t.test(xx, mu = 8.1)                             # Ha : mu <> 8.1

## 
##  One Sample t-test
## 
## data:  xx
## t = 2.276, df = 16, p-value = 0.03695
## alternative hypothesis: true mean is not equal to 8.1
## 95 percent confidence interval:
##   8.300426 13.744280
## sample estimates:
## mean of x 
##  11.02235

t.test(xx, mu = 8.1, alternative = "greater")    # Ha : mu > 8.1

## 
##  One Sample t-test
## 
## data:  xx
## t = 2.276, df = 16, p-value = 0.01847
## alternative hypothesis: true mean is greater than 8.1
## 95 percent confidence interval:
##  8.780664      Inf
## sample estimates:
## mean of x 
##  11.02235

t.test(xx, mu = 8.1, alternative = "less")       # Ha : mu < 8.1

## 
##  One Sample t-test
## 
## data:  xx
## t = 2.276, df = 16, p-value = 0.9815
## alternative hypothesis: true mean is less than 8.1
## 95 percent confidence interval:
##      -Inf 13.26404
## sample estimates:
## mean of x 
##  11.02235

# t-test 결과물

out <- t.test(x, mu = 8.1)
names(out)

## [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
## [6] "null.value"  "alternative" "method"      "data.name"

out$statistic

##         t 
## 0.6529981

out$p.value

## [1] 0.5300829

# p-value

# 귀무가설이 옳다면, 이런 데이터 패턴이 관찰될 가능성
# p-value가 작으면 귀무가설이 틀렸을 거라고 합리적으로 의심.

# 위약을 투약한 환자의 49%, 신약을 투약한 환자의 91%가 호전
#   : 신약이 심장병에 효과가 없는데(귀무가설) 위의 결과를 얻을 확률은?
# 자폐증 어린이 59명의 뇌가 그렇지 않은 어린이 38명에 비해 최대 10% 이상 큼
#   : 두 집단의 뇌크기에 실제로 차이가 없는데 표본집단에서 뇌크기의 차이가 관찰될 확률은?

# 유의수준(a)

# 귀무가설이 옳은데 기각하는 오류(1종 오류)를 범할 확률
# 귀무가설을 기각하기 위한 p-value의 임계치로 사용

# 1종 오류와 2종 오류의 상충관계
# (귀무가설 = 효과가 없다 일 경우)
# a가 너무 크면 효과가 없는 약을 효과가 있다고 잘못 판단 (1종 오류)
# a가 너무 작으면 효과가 있는 약을 승인하지 않을 수 있음 (2종 오류)
# 상황에 따라 어느 오류를 줄이는 것이 중요한가 (유의수준 조정) 판단 필요

# Tips Practice

# tip 평균이 2.5인지
t.test(tips$tip, alternative = "two.sided", mu = 2.5)   # --> 기각

## 
##  One Sample t-test
## 
## data:  tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
##  2.823799 3.172758
## sample estimates:
## mean of x 
##  2.998279

# tip 평균이 2.5보다 큰지
t.test(tips$tip, alternative = "greater", mu = 2.5)   # --> 기각

## 
##  One Sample t-test
## 
## data:  tips$tip
## t = 5.6253, df = 243, p-value = 2.54e-08
## alternative hypothesis: true mean is greater than 2.5
## 95 percent confidence interval:
##  2.852023      Inf
## sample estimates:
## mean of x 
##  2.998279

# t분포
randT = rt(30000, df = NROW(tips) - 1)   # t분포 생성
tipsTest = t.test(tips$tip, alternative = "two.sided", mu = 2.5)

ggplot(data.frame(x = randT)) + 
    geom_density(aes(x = x), fill = "grey", color = "grey") +
    geom_vline(xintercept = tipsTest$statistic) +                            # 실선 : tips T-통계량
    geom_vline(xintercept = mean(randT) + c(-3,3)*sd(randT), linetype = 2)   # 점선 : 3 표준편차

    # tips T-통계량이 해당 분포의 너무 바깥쪽에 있기 때문에 귀무가설 기각

# 두 그룹의 평균에 대한 독립표본 T-검정 (Two sample T-test)

# 줄기세포 이용한 배양치아 제작
# 다른 두 조건(control, test)에서 배양된 뼈세포수(resp) 측정 후 비교
# 범주형 변수와 양적 변수

# (1) 귀무가설 H0 : mu1 = mu2

# (2) 가정 체크
# --- 두 집단 모두 정규분포를 따른다.
# --- 정규분포를 따르지 않아도 관측치가 충분히 많다면(n1 + n2 > 30) 독립표본 T-검정 사용 가능.

dental <- read.csv("data/dental.csv")
dental

##    treatment resp
## 1       test  148
## 2       test  190
## 3       test   68
## 4       test   79
## 5       test   70
## 6    control   40
## 7    control   80
## 8    control   64
## 9    control   52
## 10   control   45

library(psych)

describeBy(dental, group = dental$treatment)

## $control
##            vars n mean    sd median trimmed   mad min max range skew
## treatment*    1 5  1.0  0.00      1     1.0  0.00   1   1     0  NaN
## resp          2 5 56.2 16.07     52    56.2 17.79  40  80    40  0.4
##            kurtosis   se
## treatment*      NaN 0.00
## resp          -1.77 7.19
## 
## $test
##            vars n mean    sd median trimmed   mad min max range skew
## treatment*    1 5    2  0.00      2       2  0.00   2   2     0  NaN
## resp          2 5  111 55.15     79     111 16.31  68 190   122 0.43
##            kurtosis    se
## treatment*      NaN  0.00
## resp          -1.96 24.66
## 
## attr(,"call")
## by.data.frame(data = x, INDICES = group, FUN = describe, type = type)

# test군의 분산이 control군의 분산보다 크고, 두 그룹 모두 편향되어(skew) 있다.
# log 변환을 통해 분산 차이를 좁히고, 편향도를 낮춘다.
# 많은 경우 실제 데이터는 편향된 분포를 가진다.
# --> 여러 통계기법들은 정규분포를 가정하므로 분석 전 변수 변환이 필요한지 체크한다.

par(mfcol=c(1,2))
boxplot(resp ~ treatment, data = dental, col = "red", ylab = "resp")
boxplot(log(resp) ~ treatment, data = dental, col = "blue", ylab = "log(resp)")

par(mfcol=c(1,1))

# T-test
# One Sample T-test : T = 자료의 평균 x_bar가 모집단 평균 mu로부터 떨어진 상대적인 거리.
# Two Sample T-test : T = (x1_bar - x2_bar) / root(Var(x1_bar - x2_bar)). 두 자료의 평균이 서로 떨어진 상대적인 거리.

# 등분산검정 : var.test
# 두 집단의 분산이 같으면 t.test( , var.equal = T) --> 귀무가설
# 두 집단의 분산이 다르면 t.test()

var.test(log(resp) ~ treatment, data = dental)   # p-value > 0.05 : 귀무가설 인정. 분산이 같다.

## 
##  F test to compare two variances
## 
## data:  log(resp) by treatment
## F = 0.34321, num df = 4, denom df = 4, p-value = 0.325
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.03573413 3.29636586
## sample estimates:
## ratio of variances 
##          0.3432095

var.test(resp ~ treatment, data = dental)        # p-value < 0.05 : 귀무가설 기각.

## 
##  F test to compare two variances
## 
## data:  resp by treatment
## F = 0.084906, num df = 4, denom df = 4, p-value = 0.03483
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.008840233 0.815484912
## sample estimates:
## ratio of variances 
##         0.08490628

# (3) T-검정

t.test(log(resp) ~ treatment, var.equal = T, data = dental)

## 
##  Two Sample t-test
## 
## data:  log(resp) by treatment
## t = -2.5217, df = 8, p-value = 0.03571
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.18465764 -0.05293907
## sample estimates:
## mean in group control    mean in group test 
##              3.997539              4.616337

# Case 1
# p-value = 0.03571 < 0.05 : 귀무가설 기각.
# 두 그룹의 평균은 유의수준 5% 하에서 차이가 있다.

t.test(resp ~ treatment, data = dental)

## 
##  Welch Two Sample t-test
## 
## data:  resp by treatment
## t = -2.1333, df = 4.6744, p-value = 0.08988
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -122.23919   12.63919
## sample estimates:
## mean in group control    mean in group test 
##                  56.2                 111.0

# Case 2
# log 변환 전 자료로 가설검정시
# p-value = 0.08988 > 0.05 : 귀무가설 기각할 수 없다.
# 두 그룹의 평균은 유의수준 5% 하에서 차이가 없다.
# 95% 신뢰구간에 0 이 포함됨 (-122.239 ~ 12.639)

# 두 그룹의 평균에 대한 쌍체표본 T-검정 (Paired T-test)

# 쌍을 이룬 두 변수 (matched sample)의 차이를 보는 검정
# 한 집단을 대상으로 약의 복용 전후, 치료 전후, 교육방법 도입 등
# 두 집단이더라도 쌍둥이 또는 부부처럼 변수들 간의 상관관계가 존재할 때
# http://math7.tistory.com/107

# 15명의 개인, 케이블 TV 시청시간과 라디오 청취시간에 대한 자료를 비교.
# 과거 3개월간 DM 발송 유무에 따른 평균 구매금액 차이 (A/B Test)
# 디자인 변경 전/후 상품 구매자 수 증가 유무
# ---> 쌍체표본 T-검정


# Case
# 거식증 치료제 FT 복용 전후 체중변화 --> FT가 "체중 증가"에 영향이 있는지 검증
# 귀무가설 : postwt - prewt = 0
# 대립가설 : postwt - prewt > 0

ft <- read.csv("data/FT.csv")
ft

##    Treat Prewt Postwt
## 1     FT  83.8   95.2
## 2     FT  83.3   94.3
## 3     FT  86.0   91.5
## 4     FT  82.5   91.9
## 5     FT  86.7  100.3
## 6     FT  79.6   76.7
## 7     FT  76.9   76.8
## 8     FT  94.2  101.6
## 9     FT  73.4   94.9
## 10    FT  80.5   75.2
## 11    FT  81.6   77.8
## 12    FT  82.1   95.5
## 13    FT  77.6   90.7
## 14    FT  83.5   92.5
## 15    FT  89.9   93.8
## 16    FT  86.0   91.7
## 17    FT  87.3   98.0

changed <- ft$Postwt - ft$Prewt

boxplot(changed)

hist(changed)

qqnorm(changed)
qqline(changed)

shapiro.test(changed)  # p-value = 0.5156 > 0.05 : 정규분포를 따른다

## 
##  Shapiro-Wilk normality test
## 
## data:  changed
## W = 0.95358, p-value = 0.5156

t.test(changed, alternative = "greater")

## 
##  One Sample t-test
## 
## data:  changed
## t = 4.1849, df = 16, p-value = 0.0003501
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
##  4.233975      Inf
## sample estimates:
## mean of x 
##  7.264706

t.test(ft$Postwt, ft$Prewt, alternative = "greater", paired = TRUE)

## 
##  Paired t-test
## 
## data:  ft$Postwt and ft$Prewt
## t = 4.1849, df = 16, p-value = 0.0003501
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  4.233975      Inf
## sample estimates:
## mean of the differences 
##                7.264706

    # 위 두 t.test의 결과는 동일하다.
    # p-value = 0.0003501 < 0.05 : 귀무가설 기각. 유의한 체중 중가가 있다.

# practice 1

# 2012 - 2013 년 국내 개봉 영화
kmovie <- read.csv("data/movie.csv")
View(kmovie)
str(kmovie)

## 'data.frame':    227 obs. of  11 variables:
##  $ title       : Factor w/ 227 levels "007 스카이폴",..: 48 160 98 176 196 156 108 212 38 49 ...
##  $ release_date: Factor w/ 110 levels "2012-01-05","2012-01-11",..: 1 1 2 3 3 4 4 4 4 4 ...
##  $ week1_sales : num  1.17e+09 3.99e+09 2.20e+09 7.75e+09 7.37e+08 ...
##  $ week1_seen  : int  142322 541314 281785 883384 104258 183724 916902 335960 209516 1239057 ...
##  $ nation      : Factor w/ 17 levels "남아프리카공화국",..: 5 16 5 5 16 5 16 16 16 16 ...
##  $ production  : Factor w/ 89 levels "","(유)에이디사공육,(주)다세포클럽",..: 1 32 1 1 81 1 27 11 72 44 ...
##  $ distributor : Factor w/ 36 levels "","(주)나이너스엔터테인먼트",..: 32 24 36 28 21 32 3 8 18 28 ...
##  $ rating      : Factor w/ 4 levels "12세이상관람가",..: 1 2 4 3 3 3 2 1 2 1 ...
##  $ genre       : Factor w/ 9 levels "가족","기타",..: 8 3 6 7 7 1 3 3 9 9 ...
##  $ total_seen  : int  162704 986287 443855 2080445 206344 276334 3459864 467697 283449 4058225 ...
##  $ total_sales : num  1.32e+09 7.26e+09 3.50e+09 1.76e+10 1.44e+09 ...

unique(kmovie$rating)

## [1] 12세이상관람가 15세이상관람가 청소년관람불가 전체관람가    
## Levels: 12세이상관람가 15세이상관람가 전체관람가 청소년관람불가

# (1) 15세이상 관람가 영화의 평균 관객수를 95% 신뢰구간을 통해 추정
df <- kmovie[kmovie$rating == "15세이상관람가", ]
summary(df$total_seen)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   101400   385200  1046000  2096000  2421000 12980000

t.test(df$total_seen)   # 95% 신뢰구간 : 1,517,279 ~ 2,674,186

## 
##  One Sample t-test
## 
## data:  df$total_seen
## t = 7.1945, df = 93, p-value = 1.571e-10
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  1517279 2674186
## sample estimates:
## mean of x 
##   2095733

# (2) 한 영화사에서 15세이상 관람가 영화의 평균 관객수가 1,500,000 보다 크다고 주장
# ---> One sample T-test

# 귀무가설 : mu = 1,500,000
# 대립가설 : mu > 1,500,000
boxplot(df$total_seen)

qqnorm(df$total_seen)
qqline(df$total_seen)

shapiro.test(df$total_seen)   # p-value = 8.782e-13 : 정규분포 따르지 않는다. but 데이터가 충분히 크므로 t-test 진행.

## 
##  Shapiro-Wilk normality test
## 
## data:  df$total_seen
## W = 0.69027, p-value = 8.782e-13

t.test(df$total_seen, mu = 1500000, alternative = "greater")   # p-value = 0.02183 < 0.05 : 귀무가설 기각

## 
##  One Sample t-test
## 
## data:  df$total_seen
## t = 2.0451, df = 93, p-value = 0.02183
## alternative hypothesis: true mean is greater than 1500000
## 95 percent confidence interval:
##  1611774     Inf
## sample estimates:
## mean of x 
##   2095733

# (3) 한 영화사에서 15세이상 관람가 영화의 평균 관객수가 12세이상 관람가 영화의 평균관객수보다 많다고 주장
# ---> Two sample T-test

# 귀무가설 : 12세 - 15세 = 0
# 대립가설 : 12세 - 15세 < 0
df2 <- kmovie[kmovie$rating == "15세이상관람가" | kmovie$rating == "12세이상관람가", c("rating", "total_seen")]
df2$rating <- factor(df2$rating, labels = c("12세", "15세"))
head(df2)

##     rating total_seen
## 1     12세     162704
## 2     15세     986287
## 7     15세    3459864
## 8     12세     467697
## 9     15세     283449
## 10    12세    4058225

boxplot(total_seen ~ rating, df2)       # 이상치 많고, 편향되어 있으므로 변수 변환

boxplot(log(total_seen) ~ rating, df2)

var.test(log(total_seen) ~ rating, df2) # 등분산검정 : p-value > 0.05 분산이 같다.

## 
##  F test to compare two variances
## 
## data:  log(total_seen) by rating
## F = 0.92169, num df = 42, denom df = 93, p-value = 0.784
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5617817 1.5924249
## sample estimates:
## ratio of variances 
##          0.9216909

t.test(log(total_seen) ~ rating, df2, var.equal=T, alternative = "less") # 12세 - 15세

## 
##  Two Sample t-test
## 
## data:  log(total_seen) by rating
## t = -0.2848, df = 135, p-value = 0.3881
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf 0.3282848
## sample estimates:
## mean in group 12세 mean in group 15세 
##           13.70416           13.77233

    # p-value = 0.3881 > 0.05 : 귀무가설을 기각할 수 없다.
    # 15세이상 관람가 영화의 평균 관객수가 12세 관람가 영화보다 많다고 할 수 없다.

# practice 2

# 대기업의 2002년 주당이익 자료.
# 2002년 이전에 애널리스트들이 이들 대기업에 대한 주당이익을 예측

earning <- read.csv("data/earnings.csv")
earning

##             Company Actual Predicted
## 1              AT&T   1.29      0.38
## 2      Amer Express   2.01      2.31
## 3         Citigroup   2.59      3.43
## 4         Coca Cola   1.60      1.78
## 5            DuPont   1.84      2.18
## 6       Exxon-Mobil   2.72      2.19
## 7  General Electric   1.51      1.71
## 8   Johnson/Johnson   2.28      2.18
## 9         McDonalds   0.77      1.55
## 10         Wal-Mart   1.81      1.74

attach(earning)

# (1) 실제 평균 주당이익과 추정 평균 주당이익에 대한 기술 통계
library(psych)
describe(earning[ , c(2,3)])

##           vars  n mean   sd median trimmed  mad  min  max range  skew
## Actual       1 10 1.84 0.59   1.83    1.87 0.57 0.77 2.72  1.95 -0.13
## Predicted    2 10 1.94 0.76   1.98    1.96 0.38 0.38 3.43  3.05 -0.13
##           kurtosis   se
## Actual       -1.09 0.19
## Predicted     0.22 0.24

summary(earning)

##          Company      Actual        Predicted    
##  Amer Express:1   Min.   :0.770   Min.   :0.380  
##  AT&T        :1   1st Qu.:1.532   1st Qu.:1.718  
##  Citigroup   :1   Median :1.825   Median :1.980  
##  Coca Cola   :1   Mean   :1.842   Mean   :1.945  
##  DuPont      :1   3rd Qu.:2.212   3rd Qu.:2.188  
##  Exxon-Mobil :1   Max.   :2.720   Max.   :3.430  
##  (Other)     :4

# (2) 실제 모집단 평균 주당이익과 추정 모집단 평균 주당이익 간의 차이에 대하여 가설검정을 a = 0.05에서 수행
# --> Paired T-test

# 귀무가설 : Actual - Predicted = 0

boxplot(Actual - Predicted)

qqnorm(Actual - Predicted)
qqline(Actual - Predicted)

shapiro.test(Actual - Predicted)  # p-value = 0.6816

## 
##  Shapiro-Wilk normality test
## 
## data:  Actual - Predicted
## W = 0.9511, p-value = 0.6816

t.test(Actual - Predicted)   # p-value = 0.5602 : 귀무가설 기각할 수 없음. 실제와 추정의 차이가 크지 않다.

## 
##  One Sample t-test
## 
## data:  Actual - Predicted
## t = -0.60486, df = 9, p-value = 0.5602
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.4882175  0.2822175
## sample estimates:
## mean of x 
##    -0.103

# 두 평균의 차이에 대한 점 추정치 : mean of x = -0.103 
# 해당 기업의 주당 이익에 대해 약간 과대평가.

# Wilcoxon Signed-Rank Test

# One sample / Paired T-test에 해당하는 비모수적 방법

# 정규분포는 평균과 표준편차로 모양이 결정된다. (평균 = 0)
# 그러나 데이터의 분포를 모르는 경우 정규분포를 가정하여 t-test를 하면 잘못된 결과를 얻을 수 있다.
# 극단치가 있을 경우 t-test는 평균에 영향이 주어져 결과가 달라질 수 있지만, 비모수적 방법은 극단치에 영향을 받지 않는다.

# Wilcoxon Signed-Rank Test는 자료의 순서를 사용한다.
# 자료의 절대값 기준으로 순위를 부여한 후, 다시 부호를 붙이고 양과 음의 순위합을 구하여 중위수(median)가 0 인지 검정한다.
# http://dermabae.tistory.com/159

df <- read.csv("data/anorexia.csv")
df <- df[df$Treat == "CBT", ]
head(df)

##    Treat Prewt Postwt
## 27   CBT  80.5   82.2
## 28   CBT  84.9   85.6
## 29   CBT  81.5   81.4
## 30   CBT  82.6   81.9
## 31   CBT  79.9   76.4
## 32   CBT  88.7  103.6

str(df)

## 'data.frame':    29 obs. of  3 variables:
##  $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Prewt : num  80.5 84.9 81.5 82.6 79.9 88.7 94.9 76.3 81 80.5 ...
##  $ Postwt: num  82.2 85.6 81.4 81.9 76.4 ...

changed <- df$Postwt - df$Prewt

boxplot(changed)

qqnorm(changed)
qqline(changed)

shapiro.test(changed)  # p-value = 0.007945 < 0.05 : 정규분포를 따르지 않는다.

## 
##  Shapiro-Wilk normality test
## 
## data:  changed
## W = 0.89618, p-value = 0.007945

# H0 : CBT 복용 전후 몸무게 차이의 median은 0 이다.

wilcox.test(changed)   # warning : tie가 있어 정확한 p값을 계산할 수 없다 (절대값이 같은 것들이 존재)

## Warning in wilcox.test.default(changed): cannot compute exact p-value with
## ties

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  changed
## V = 303.5, p-value = 0.06447
## alternative hypothesis: true location is not equal to 0

wilcox.test(changed, exact = F)   # p-value = 0.06447 > 0.05 : 귀무가설 기각 못함.

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  changed
## V = 303.5, p-value = 0.06447
## alternative hypothesis: true location is not equal to 0

# Wilcoxon Rank-Sum Test

# Two sample T-test에 해당하는 비모수적 방법

dental <- read.csv("data/dental.csv")
dental

##    treatment resp
## 1       test  148
## 2       test  190
## 3       test   68
## 4       test   79
## 5       test   70
## 6    control   40
## 7    control   80
## 8    control   64
## 9    control   52
## 10   control   45

# control, test 각 그룹의 자료가 5개 밖에 없으므로 정규분포를 따른다고 하기 힘들다.
# 비모수적 분석방법 적용

wilcox.test(resp ~ treatment, data = dental)

## 
##  Wilcoxon rank sum test
## 
## data:  resp by treatment
## W = 3, p-value = 0.05556
## alternative hypothesis: true location shift is not equal to 0

    # p-value = 0.05556 > 0.05 : 귀무가설 기각할 수 없다. 유의한 차이가 없다.

R을 활용한 통계분석 - 2.평균에 대한 추론

woosa7