Statistical Inference
# Behavioral Risk Factor Surveillance System (BRFSS)
load(url("http://assets.datacamp.com/course/dasi/cdc.Rdata"))
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
summary(cdc)
## genhlth exerany hlthplan smoke100
## excellent:4657 Min. :0.0000 Min. :0.0000 Min. :0.0000
## very good:6972 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## good :5675 Median :1.0000 Median :1.0000 Median :0.0000
## fair :2019 Mean :0.7457 Mean :0.8738 Mean :0.4721
## poor : 677 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## height weight wtdesire age gender
## Min. :48.00 Min. : 68.0 Min. : 68.0 Min. :18.00 m: 9569
## 1st Qu.:64.00 1st Qu.:140.0 1st Qu.:130.0 1st Qu.:31.00 f:10431
## Median :67.00 Median :165.0 Median :150.0 Median :43.00
## Mean :67.18 Mean :169.7 Mean :155.1 Mean :45.07
## 3rd Qu.:70.00 3rd Qu.:190.0 3rd Qu.:175.0 3rd Qu.:57.00
## Max. :93.00 Max. :500.0 Max. :680.0 Max. :99.00
# relative frequency
table(cdc$genhlth) / nrow(cdc)
##
## excellent very good good fair poor
## 0.23285 0.34860 0.28375 0.10095 0.03385
gender_smokers = table(cdc$gender, cdc$smoke100)
mosaicplot(gender_smokers)
mosaicplot(table(cdc$genhlth, cdc$smoke100))
bmi = cdc$weight / (cdc$height ^ 2) * 703
boxplot(bmi~cdc$genhlth)
hist(bmi, breaks=50)
plot(cdc$weight, cdc$wtdesire)
Hot Hand Phenomenon
# 이전에 던진 슛이 성공한 경우 다음 슛 역시 성공할 것이라고 믿는 현상.
load(url("http://assets.datacamp.com/course/dasi/kobe.RData"))
head(kobe)
## vs game quarter time
## 1 ORL 1 1 9:47
## 2 ORL 1 1 9:07
## 3 ORL 1 1 8:11
## 4 ORL 1 1 7:41
## 5 ORL 1 1 7:03
## 6 ORL 1 1 6:01
## description basket
## 1 Kobe Bryant makes 4-foot two point shot H
## 2 Kobe Bryant misses jumper M
## 3 Kobe Bryant misses 7-foot jumper M
## 4 Kobe Bryant makes 16-foot jumper (Derek Fisher assists) H
## 5 Kobe Bryant makes driving layup H
## 6 Kobe Bryant misses jumper M
table(kobe$basket)
##
## H M
## 58 75
58/nrow(kobe)
## [1] 0.4360902
# 슈팅 성공 연속 횟수
kobe_streak <- calc_streak(kobe$basket)
kobe_streak
## [1] 1 0 2 0 0 0 3 2 0 3 0 1 3 0 0 0 0 0 1 1 0 4 1 0 1 0 1 0 1 2 0 1 2 1 0
## [36] 0 1 0 0 0 1 1 0 1 0 2 0 0 0 3 0 1 0 1 2 1 0 1 0 0 1 3 3 1 1 0 0 0 0 0
## [71] 1 1 0 0 0 1
table(kobe_streak)
## kobe_streak
## 0 1 2 3 4
## 39 24 6 6 1
# 이전 슛 성공여부는 다음 슛 성공여부와 상관없다 : 독립적
# 이전 슛 성공은 다음 슛 성공 확률을 높인다 : 비독립적
# 슛 성공 확률
# P(shot1=H) = 0.45
# hot hand일 경우 첫 슛이 성공했을 때 두번째 슛 성공 확률
# P(shot2=H | shot1=H) > 0.45
# simulation - independent shooter
outcomes <- c("H", "M")
sim_basket <- sample(outcomes, size = 133, replace = TRUE, prob = c(0.45, 0.55))
sim_streak <- calc_streak(sim_basket)
barplot(table(kobe_streak))
barplot(table(sim_streak))
# 두 테이블의 분산이 비슷하므로 Kobe는 hot hands라고 할 수 없다.
Sampling distributions - point estimate
# 아이오와 주 에임스 시 부동산 데이터
load(url("http://assets.datacamp.com/course/dasi/ames.RData"))
head(ames)
## Order PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1 1 526301100 20 RL 141 31770 Pave <NA>
## 2 2 526350040 20 RH 80 11622 Pave <NA>
## 3 3 526351010 20 RL 81 14267 Pave <NA>
## 4 4 526353030 20 RL 93 11160 Pave <NA>
## 5 5 527105010 60 RL 74 13830 Pave <NA>
## 6 6 527105030 60 RL 78 9978 Pave <NA>
## Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1 IR1 Lvl AllPub Corner Gtl NAmes
## 2 Reg Lvl AllPub Inside Gtl NAmes
## 3 IR1 Lvl AllPub Corner Gtl NAmes
## 4 Reg Lvl AllPub Corner Gtl NAmes
## 5 IR1 Lvl AllPub Inside Gtl Gilbert
## 6 IR1 Lvl AllPub Inside Gtl Gilbert
## Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1 Norm Norm 1Fam 1Story 6 5
## 2 Feedr Norm 1Fam 1Story 5 6
## 3 Norm Norm 1Fam 1Story 6 6
## 4 Norm Norm 1Fam 1Story 7 5
## 5 Norm Norm 1Fam 2Story 5 5
## 6 Norm Norm 1Fam 2Story 6 6
## Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1 1960 1960 Hip CompShg BrkFace Plywood
## 2 1961 1961 Gable CompShg VinylSd VinylSd
## 3 1958 1958 Hip CompShg Wd Sdng Wd Sdng
## 4 1968 1968 Hip CompShg BrkFace BrkFace
## 5 1997 1998 Gable CompShg VinylSd VinylSd
## 6 1998 1998 Gable CompShg VinylSd VinylSd
## Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1 Stone 112 TA TA CBlock TA
## 2 None 0 TA TA CBlock TA
## 3 BrkFace 108 TA TA CBlock TA
## 4 None 0 Gd TA CBlock TA
## 5 None 0 TA TA PConc Gd
## 6 BrkFace 20 TA TA PConc TA
## Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1 Gd Gd BLQ 639 Unf
## 2 TA No Rec 468 LwQ
## 3 TA No ALQ 923 Unf
## 4 TA No ALQ 1065 Unf
## 5 TA No GLQ 791 Unf
## 6 TA No GLQ 602 Unf
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1 0 441 1080 GasA Fa Y
## 2 144 270 882 GasA TA Y
## 3 0 406 1329 GasA TA Y
## 4 0 1045 2110 GasA Ex Y
## 5 0 137 928 GasA Gd Y
## 6 0 324 926 GasA Ex Y
## Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area
## 1 SBrkr 1656 0 0 1656
## 2 SBrkr 896 0 0 896
## 3 SBrkr 1329 0 0 1329
## 4 SBrkr 2110 0 0 2110
## 5 SBrkr 928 701 0 1629
## 6 SBrkr 926 678 0 1604
## Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr
## 1 1 0 1 0 3
## 2 0 0 1 0 2
## 3 0 0 1 1 3
## 4 1 0 2 1 3
## 5 0 0 2 1 3
## 6 0 0 2 1 3
## Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional Fireplaces
## 1 1 TA 7 Typ 2
## 2 1 TA 5 Typ 0
## 3 1 Gd 6 Typ 0
## 4 1 Ex 8 Typ 2
## 5 1 TA 6 Typ 1
## 6 1 Gd 7 Typ 1
## Fireplace.Qu Garage.Type Garage.Yr.Blt Garage.Finish Garage.Cars
## 1 Gd Attchd 1960 Fin 2
## 2 <NA> Attchd 1961 Unf 1
## 3 <NA> Attchd 1958 Unf 1
## 4 TA Attchd 1968 Fin 2
## 5 TA Attchd 1997 Fin 2
## 6 Gd Attchd 1998 Fin 2
## Garage.Area Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF
## 1 528 TA TA P 210
## 2 730 TA TA Y 140
## 3 312 TA TA Y 393
## 4 522 TA TA Y 0
## 5 482 TA TA Y 212
## 6 470 TA TA Y 360
## Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Pool.QC
## 1 62 0 0 0 0 <NA>
## 2 0 0 0 120 0 <NA>
## 3 36 0 0 0 0 <NA>
## 4 0 0 0 0 0 <NA>
## 5 34 0 0 0 0 <NA>
## 6 36 0 0 0 0 <NA>
## Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1 <NA> <NA> 0 5 2010 WD Normal
## 2 MnPrv <NA> 0 6 2010 WD Normal
## 3 <NA> Gar2 12500 6 2010 WD Normal
## 4 <NA> <NA> 0 4 2010 WD Normal
## 5 MnPrv <NA> 0 3 2010 WD Normal
## 6 <NA> <NA> 0 6 2010 WD Normal
## SalePrice
## 1 215000
## 2 105000
## 3 172000
## 4 244000
## 5 189900
## 6 195500
dim(ames)
## [1] 2930 82
names(ames)
## [1] "Order" "PID" "MS.SubClass"
## [4] "MS.Zoning" "Lot.Frontage" "Lot.Area"
## [7] "Street" "Alley" "Lot.Shape"
## [10] "Land.Contour" "Utilities" "Lot.Config"
## [13] "Land.Slope" "Neighborhood" "Condition.1"
## [16] "Condition.2" "Bldg.Type" "House.Style"
## [19] "Overall.Qual" "Overall.Cond" "Year.Built"
## [22] "Year.Remod.Add" "Roof.Style" "Roof.Matl"
## [25] "Exterior.1st" "Exterior.2nd" "Mas.Vnr.Type"
## [28] "Mas.Vnr.Area" "Exter.Qual" "Exter.Cond"
## [31] "Foundation" "Bsmt.Qual" "Bsmt.Cond"
## [34] "Bsmt.Exposure" "BsmtFin.Type.1" "BsmtFin.SF.1"
## [37] "BsmtFin.Type.2" "BsmtFin.SF.2" "Bsmt.Unf.SF"
## [40] "Total.Bsmt.SF" "Heating" "Heating.QC"
## [43] "Central.Air" "Electrical" "X1st.Flr.SF"
## [46] "X2nd.Flr.SF" "Low.Qual.Fin.SF" "Gr.Liv.Area"
## [49] "Bsmt.Full.Bath" "Bsmt.Half.Bath" "Full.Bath"
## [52] "Half.Bath" "Bedroom.AbvGr" "Kitchen.AbvGr"
## [55] "Kitchen.Qual" "TotRms.AbvGrd" "Functional"
## [58] "Fireplaces" "Fireplace.Qu" "Garage.Type"
## [61] "Garage.Yr.Blt" "Garage.Finish" "Garage.Cars"
## [64] "Garage.Area" "Garage.Qual" "Garage.Cond"
## [67] "Paved.Drive" "Wood.Deck.SF" "Open.Porch.SF"
## [70] "Enclosed.Porch" "X3Ssn.Porch" "Screen.Porch"
## [73] "Pool.Area" "Pool.QC" "Fence"
## [76] "Misc.Feature" "Misc.Val" "Mo.Sold"
## [79] "Yr.Sold" "Sale.Type" "Sale.Condition"
## [82] "SalePrice"
area <- ames$Gr.Liv.Area # ground living area
summary(area)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1126 1442 1500 1743 5642
hist(area)
# sample size가 증가할수록 그 분포의 중심이 실제 모집단 평균에 대해 신뢰성이 높아진다.
# 샘플 분포의 가변성이 줄어듬.
sample_means10 <- rep(NA, 5000)
sample_means50 <- rep(NA, 5000)
sample_means100 <- rep(NA, 5000)
for (i in 1:5000) {
samp <- sample(area, 10)
sample_means10[i] <- mean(samp)
samp <- sample(area, 50)
sample_means50[i] <- mean(samp)
samp <- sample(area, 100)
sample_means100[i] <- mean(samp)
}
xlimits <- range(sample_means10)
par(mfrow = c(3, 1))
hist(sample_means10, breaks = 20, xlim = xlimits)
hist(sample_means50, breaks = 20, xlim = xlimits)
hist(sample_means100, breaks = 20, xlim = xlimits)
# confidence intervals
samp_mean <- rep(NA, 50)
samp_sd <- rep(NA, 50)
n <- 60
for (i in 1:50) {
samp <- sample(area, n)
samp_mean[i] <- mean(samp)
samp_sd[i] <- sd(samp)
}
# 95% confidence intervals
se <- samp_sd/sqrt(n)
lower <- samp_mean - 1.96 * se
upper <- samp_mean + 1.96 * se
library(ggplot2)
confint.dat = data.frame(samp_mean, lower, upper, i = 1:length(samp_mean))
confint.dat$excl0 = ifelse(confint.dat$upper < mean(area) |
mean(area) < confint.dat$lower, 1, 0)
confint.dat$excl0 = factor(confint.dat$excl0)
ggplot(data = confint.dat, aes(x = i, y = samp_mean, ymin = lower, ymax = upper, colour = excl0)) +
geom_pointrange() +
scale_x_continuous(breaks = NULL) + geom_hline(yintercept = mean(area)) +
scale_colour_manual(values = c("black", "red")) + guides(colour = FALSE) +
ylab("Sample mean") + xlab("") + coord_flip()