Statistical Inference

# Behavioral Risk Factor Surveillance System (BRFSS)

load(url("http://assets.datacamp.com/course/dasi/cdc.Rdata"))

str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
summary(cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00
# relative frequency
table(cdc$genhlth) / nrow(cdc)
## 
## excellent very good      good      fair      poor 
##   0.23285   0.34860   0.28375   0.10095   0.03385
gender_smokers = table(cdc$gender, cdc$smoke100)
mosaicplot(gender_smokers)

mosaicplot(table(cdc$genhlth, cdc$smoke100))

bmi = cdc$weight / (cdc$height ^ 2) * 703
boxplot(bmi~cdc$genhlth)

hist(bmi, breaks=50)

plot(cdc$weight, cdc$wtdesire)

Hot Hand Phenomenon

# 이전에 던진 슛이 성공한 경우 다음 슛 역시 성공할 것이라고 믿는 현상.
load(url("http://assets.datacamp.com/course/dasi/kobe.RData"))
head(kobe)
##    vs game quarter time
## 1 ORL    1       1 9:47
## 2 ORL    1       1 9:07
## 3 ORL    1       1 8:11
## 4 ORL    1       1 7:41
## 5 ORL    1       1 7:03
## 6 ORL    1       1 6:01
##                                               description basket
## 1                 Kobe Bryant makes 4-foot two point shot      H
## 2                               Kobe Bryant misses jumper      M
## 3                        Kobe Bryant misses 7-foot jumper      M
## 4 Kobe Bryant makes 16-foot jumper (Derek Fisher assists)      H
## 5                         Kobe Bryant makes driving layup      H
## 6                               Kobe Bryant misses jumper      M
table(kobe$basket)
## 
##  H  M 
## 58 75
58/nrow(kobe)
## [1] 0.4360902
# 슈팅 성공 연속 횟수
kobe_streak <- calc_streak(kobe$basket)
kobe_streak
##  [1] 1 0 2 0 0 0 3 2 0 3 0 1 3 0 0 0 0 0 1 1 0 4 1 0 1 0 1 0 1 2 0 1 2 1 0
## [36] 0 1 0 0 0 1 1 0 1 0 2 0 0 0 3 0 1 0 1 2 1 0 1 0 0 1 3 3 1 1 0 0 0 0 0
## [71] 1 1 0 0 0 1
table(kobe_streak)
## kobe_streak
##  0  1  2  3  4 
## 39 24  6  6  1
# 이전 슛 성공여부는 다음 슛 성공여부와 상관없다 : 독립적
# 이전 슛 성공은 다음 슛 성공 확률을 높인다 : 비독립적

# 슛 성공 확률
# P(shot1=H) = 0.45
# hot hand일 경우 첫 슛이 성공했을 때 두번째 슛 성공 확률
# P(shot2=H | shot1=H) > 0.45

# simulation - independent shooter
outcomes <- c("H", "M")
sim_basket <- sample(outcomes, size = 133, replace = TRUE, prob = c(0.45, 0.55))
sim_streak <- calc_streak(sim_basket)

barplot(table(kobe_streak))

barplot(table(sim_streak))

# 두 테이블의 분산이 비슷하므로 Kobe는 hot hands라고 할 수 없다.

Sampling distributions - point estimate

# 아이오와 주 에임스 시 부동산 데이터
load(url("http://assets.datacamp.com/course/dasi/ames.RData"))
head(ames)
##   Order       PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1     1 526301100          20        RL          141    31770   Pave  <NA>
## 2     2 526350040          20        RH           80    11622   Pave  <NA>
## 3     3 526351010          20        RL           81    14267   Pave  <NA>
## 4     4 526353030          20        RL           93    11160   Pave  <NA>
## 5     5 527105010          60        RL           74    13830   Pave  <NA>
## 6     6 527105030          60        RL           78     9978   Pave  <NA>
##   Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1       IR1          Lvl    AllPub     Corner        Gtl        NAmes
## 2       Reg          Lvl    AllPub     Inside        Gtl        NAmes
## 3       IR1          Lvl    AllPub     Corner        Gtl        NAmes
## 4       Reg          Lvl    AllPub     Corner        Gtl        NAmes
## 5       IR1          Lvl    AllPub     Inside        Gtl      Gilbert
## 6       IR1          Lvl    AllPub     Inside        Gtl      Gilbert
##   Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1        Norm        Norm      1Fam      1Story            6            5
## 2       Feedr        Norm      1Fam      1Story            5            6
## 3        Norm        Norm      1Fam      1Story            6            6
## 4        Norm        Norm      1Fam      1Story            7            5
## 5        Norm        Norm      1Fam      2Story            5            5
## 6        Norm        Norm      1Fam      2Story            6            6
##   Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1       1960           1960        Hip   CompShg      BrkFace      Plywood
## 2       1961           1961      Gable   CompShg      VinylSd      VinylSd
## 3       1958           1958        Hip   CompShg      Wd Sdng      Wd Sdng
## 4       1968           1968        Hip   CompShg      BrkFace      BrkFace
## 5       1997           1998      Gable   CompShg      VinylSd      VinylSd
## 6       1998           1998      Gable   CompShg      VinylSd      VinylSd
##   Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1        Stone          112         TA         TA     CBlock        TA
## 2         None            0         TA         TA     CBlock        TA
## 3      BrkFace          108         TA         TA     CBlock        TA
## 4         None            0         Gd         TA     CBlock        TA
## 5         None            0         TA         TA      PConc        Gd
## 6      BrkFace           20         TA         TA      PConc        TA
##   Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1        Gd            Gd            BLQ          639            Unf
## 2        TA            No            Rec          468            LwQ
## 3        TA            No            ALQ          923            Unf
## 4        TA            No            ALQ         1065            Unf
## 5        TA            No            GLQ          791            Unf
## 6        TA            No            GLQ          602            Unf
##   BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1            0         441          1080    GasA         Fa           Y
## 2          144         270           882    GasA         TA           Y
## 3            0         406          1329    GasA         TA           Y
## 4            0        1045          2110    GasA         Ex           Y
## 5            0         137           928    GasA         Gd           Y
## 6            0         324           926    GasA         Ex           Y
##   Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area
## 1      SBrkr        1656           0               0        1656
## 2      SBrkr         896           0               0         896
## 3      SBrkr        1329           0               0        1329
## 4      SBrkr        2110           0               0        2110
## 5      SBrkr         928         701               0        1629
## 6      SBrkr         926         678               0        1604
##   Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr
## 1              1              0         1         0             3
## 2              0              0         1         0             2
## 3              0              0         1         1             3
## 4              1              0         2         1             3
## 5              0              0         2         1             3
## 6              0              0         2         1             3
##   Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional Fireplaces
## 1             1           TA             7        Typ          2
## 2             1           TA             5        Typ          0
## 3             1           Gd             6        Typ          0
## 4             1           Ex             8        Typ          2
## 5             1           TA             6        Typ          1
## 6             1           Gd             7        Typ          1
##   Fireplace.Qu Garage.Type Garage.Yr.Blt Garage.Finish Garage.Cars
## 1           Gd      Attchd          1960           Fin           2
## 2         <NA>      Attchd          1961           Unf           1
## 3         <NA>      Attchd          1958           Unf           1
## 4           TA      Attchd          1968           Fin           2
## 5           TA      Attchd          1997           Fin           2
## 6           Gd      Attchd          1998           Fin           2
##   Garage.Area Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF
## 1         528          TA          TA           P          210
## 2         730          TA          TA           Y          140
## 3         312          TA          TA           Y          393
## 4         522          TA          TA           Y            0
## 5         482          TA          TA           Y          212
## 6         470          TA          TA           Y          360
##   Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Pool.QC
## 1            62              0           0            0         0    <NA>
## 2             0              0           0          120         0    <NA>
## 3            36              0           0            0         0    <NA>
## 4             0              0           0            0         0    <NA>
## 5            34              0           0            0         0    <NA>
## 6            36              0           0            0         0    <NA>
##   Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1  <NA>         <NA>        0       5    2010       WD          Normal
## 2 MnPrv         <NA>        0       6    2010       WD          Normal
## 3  <NA>         Gar2    12500       6    2010       WD          Normal
## 4  <NA>         <NA>        0       4    2010       WD          Normal
## 5 MnPrv         <NA>        0       3    2010       WD          Normal
## 6  <NA>         <NA>        0       6    2010       WD          Normal
##   SalePrice
## 1    215000
## 2    105000
## 3    172000
## 4    244000
## 5    189900
## 6    195500
dim(ames)
## [1] 2930   82
names(ames)
##  [1] "Order"           "PID"             "MS.SubClass"    
##  [4] "MS.Zoning"       "Lot.Frontage"    "Lot.Area"       
##  [7] "Street"          "Alley"           "Lot.Shape"      
## [10] "Land.Contour"    "Utilities"       "Lot.Config"     
## [13] "Land.Slope"      "Neighborhood"    "Condition.1"    
## [16] "Condition.2"     "Bldg.Type"       "House.Style"    
## [19] "Overall.Qual"    "Overall.Cond"    "Year.Built"     
## [22] "Year.Remod.Add"  "Roof.Style"      "Roof.Matl"      
## [25] "Exterior.1st"    "Exterior.2nd"    "Mas.Vnr.Type"   
## [28] "Mas.Vnr.Area"    "Exter.Qual"      "Exter.Cond"     
## [31] "Foundation"      "Bsmt.Qual"       "Bsmt.Cond"      
## [34] "Bsmt.Exposure"   "BsmtFin.Type.1"  "BsmtFin.SF.1"   
## [37] "BsmtFin.Type.2"  "BsmtFin.SF.2"    "Bsmt.Unf.SF"    
## [40] "Total.Bsmt.SF"   "Heating"         "Heating.QC"     
## [43] "Central.Air"     "Electrical"      "X1st.Flr.SF"    
## [46] "X2nd.Flr.SF"     "Low.Qual.Fin.SF" "Gr.Liv.Area"    
## [49] "Bsmt.Full.Bath"  "Bsmt.Half.Bath"  "Full.Bath"      
## [52] "Half.Bath"       "Bedroom.AbvGr"   "Kitchen.AbvGr"  
## [55] "Kitchen.Qual"    "TotRms.AbvGrd"   "Functional"     
## [58] "Fireplaces"      "Fireplace.Qu"    "Garage.Type"    
## [61] "Garage.Yr.Blt"   "Garage.Finish"   "Garage.Cars"    
## [64] "Garage.Area"     "Garage.Qual"     "Garage.Cond"    
## [67] "Paved.Drive"     "Wood.Deck.SF"    "Open.Porch.SF"  
## [70] "Enclosed.Porch"  "X3Ssn.Porch"     "Screen.Porch"   
## [73] "Pool.Area"       "Pool.QC"         "Fence"          
## [76] "Misc.Feature"    "Misc.Val"        "Mo.Sold"        
## [79] "Yr.Sold"         "Sale.Type"       "Sale.Condition" 
## [82] "SalePrice"
area <- ames$Gr.Liv.Area   # ground living area

summary(area)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1126    1442    1500    1743    5642
hist(area)

# sample size가 증가할수록 그 분포의 중심이 실제 모집단 평균에 대해 신뢰성이 높아진다.
# 샘플 분포의 가변성이 줄어듬.

sample_means10 <- rep(NA, 5000)
sample_means50 <- rep(NA, 5000)
sample_means100 <- rep(NA, 5000)

for (i in 1:5000) {
    samp <- sample(area, 10)
    sample_means10[i] <- mean(samp)
    samp <- sample(area, 50)
    sample_means50[i] <- mean(samp)
    samp <- sample(area, 100)
    sample_means100[i] <- mean(samp)
}

xlimits <- range(sample_means10)

par(mfrow = c(3, 1))
hist(sample_means10, breaks = 20, xlim = xlimits)
hist(sample_means50, breaks = 20, xlim = xlimits)
hist(sample_means100, breaks = 20, xlim = xlimits)

# confidence intervals
samp_mean <- rep(NA, 50)
samp_sd <- rep(NA, 50)
n <- 60

for (i in 1:50) {
    samp <- sample(area, n) 
    samp_mean[i] <- mean(samp)
    samp_sd[i] <- sd(samp)
}

# 95% confidence intervals
se <- samp_sd/sqrt(n)
lower <- samp_mean - 1.96 * se
upper <- samp_mean + 1.96 * se

library(ggplot2)

confint.dat = data.frame(samp_mean, lower, upper, i = 1:length(samp_mean))
confint.dat$excl0 = ifelse(confint.dat$upper < mean(area) | 
                               mean(area) < confint.dat$lower, 1, 0)
confint.dat$excl0 = factor(confint.dat$excl0)

ggplot(data = confint.dat, aes(x = i, y = samp_mean, ymin = lower, ymax = upper, colour = excl0)) + 
    geom_pointrange() + 
    scale_x_continuous(breaks = NULL) + geom_hline(yintercept = mean(area)) + 
    scale_colour_manual(values = c("black", "red")) + guides(colour = FALSE) + 
    ylab("Sample mean") + xlab("") + coord_flip()