다중회귀분석 (범주형 설명변수)
# 보험업계의 혁신에 대한 연구
# • 혁신을 받아들이는데 걸리는 시간이 회사 규모와 유형에 따라 달라지는가?
# • Y: 혁신을 받아들이는데까지 걸리는기간을 월 단위로측정
# • X1: 회사의자산규모
# • X2: 회사 유형 (stock, mutual)
insu = read.csv("data/insurance.csv", header = T)
head(insu)
## time size type
## 1 17 151 Mutual
## 2 26 92 Mutual
## 3 21 175 Mutual
## 4 30 31 Mutual
## 5 22 104 Mutual
## 6 0 277 Mutual
summary(insu)
## time size type
## Min. : 0.00 Min. : 31.0 Mutual:10
## 1st Qu.:13.75 1st Qu.:116.0 Stock :10
## Median :19.50 Median :170.5
## Mean :19.40 Mean :181.8
## 3rd Qu.:26.50 3rd Qu.:252.5
## Max. :38.00 Max. :305.0
library(ggplot2)
ggplot(insu, aes(y=time, x=size, color=type)) + geom_point(size=3)
model.matrix(time ~ size + type, insu)
## (Intercept) size typeStock
## 1 1 151 0
## 2 1 92 0
## 3 1 175 0
## 4 1 31 0
## 5 1 104 0
## 6 1 277 0
## 7 1 210 0
## 8 1 120 0
## 9 1 290 0
## 10 1 238 0
## 11 1 164 1
## 12 1 272 1
## 13 1 295 1
## 14 1 68 1
## 15 1 85 1
## 16 1 224 1
## 17 1 166 1
## 18 1 305 1
## 19 1 124 1
## 20 1 246 1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
model1 = lm(time ~ size + type, insu)
summary(model1)
##
## Call:
## lm(formula = time ~ size + type, data = insu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6915 -1.7036 -0.4385 1.9210 6.3406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.874069 1.813858 18.675 9.15e-13 ***
## size -0.101742 0.008891 -11.443 2.07e-09 ***
## typeStock 8.055469 1.459106 5.521 3.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.221 on 17 degrees of freedom
## Multiple R-squared: 0.8951, Adjusted R-squared: 0.8827
## F-statistic: 72.5 on 2 and 17 DF, p-value: 4.765e-09
# 회귀식 : y = b0 + b1*X1 + b2*X2 + e
# X1 = size
# X2 = type : 1 if stock, 0 if mutual
# Mutual : y = b0 + b1*X1
# Stock : y = (b0 + b2) + b1*X1
insu$fit = model1$fitted.values
ggplot(insu, aes(y=fit, x=size, group=type, color=type)) +
geom_line() +
geom_point(aes(y=time, x=size, color=type))
# 기준이 되는 집단(reference level)을 변경
insu$type = relevel(insu$type, ref = "Stock")
model2 = lm(time ~ size+type, insu)
summary(model2)
##
## Call:
## lm(formula = time ~ size + type, data = insu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6915 -1.7036 -0.4385 1.9210 6.3406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.929538 2.010101 20.859 1.50e-13 ***
## size -0.101742 0.008891 -11.443 2.07e-09 ***
## typeMutual -8.055469 1.459106 -5.521 3.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.221 on 17 degrees of freedom
## Multiple R-squared: 0.8951, Adjusted R-squared: 0.8827
## F-statistic: 72.5 on 2 and 17 DF, p-value: 4.765e-09
model.matrix(time ~ size+type, insu)
## (Intercept) size typeMutual
## 1 1 151 1
## 2 1 92 1
## 3 1 175 1
## 4 1 31 1
## 5 1 104 1
## 6 1 277 1
## 7 1 210 1
## 8 1 120 1
## 9 1 290 1
## 10 1 238 1
## 11 1 164 0
## 12 1 272 0
## 13 1 295 0
## 14 1 68 0
## 15 1 85 0
## 16 1 224 0
## 17 1 166 0
## 18 1 305 0
## 19 1 124 0
## 20 1 246 0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
# Effect Codeing
# 특정 집단을 기준으로 하는 것이 아니라 전체의 평균을 기준으로 비교
model3 = lm(time ~ size+type, insu, contrasts = list(type=contr.sum))
summary(model3)
##
## Call:
## lm(formula = time ~ size + type, data = insu, contrasts = list(type = contr.sum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6915 -1.7036 -0.4385 1.9210 6.3406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.901804 1.770041 21.413 9.78e-14 ***
## size -0.101742 0.008891 -11.443 2.07e-09 ***
## type1 4.027735 0.729553 5.521 3.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.221 on 17 degrees of freedom
## Multiple R-squared: 0.8951, Adjusted R-squared: 0.8827
## F-statistic: 72.5 on 2 and 17 DF, p-value: 4.765e-09
model.matrix(time ~ size+type, insu, contrasts = list(type=contr.sum))
## (Intercept) size type1
## 1 1 151 -1
## 2 1 92 -1
## 3 1 175 -1
## 4 1 31 -1
## 5 1 104 -1
## 6 1 277 -1
## 7 1 210 -1
## 8 1 120 -1
## 9 1 290 -1
## 10 1 238 -1
## 11 1 164 1
## 12 1 272 1
## 13 1 295 1
## 14 1 68 1
## 15 1 85 1
## 16 1 224 1
## 17 1 166 1
## 18 1 305 1
## 19 1 124 1
## 20 1 246 1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [,1]
## Stock 1
## Mutual -1
# 두 개 이상의 level을 가진 범주형 변수
# County demographic information (CDI)
# • 미국의 가장인구가 많은 440개 county의자료
# • Crime: 범죄 발생수(y)
# • Pop: 인구 (X1)
# • Region: 지역(1=NE, 2=NC, 3=S, 4=W)
# • 범죄발생 건수가 인구, 실업률과 지역에 따라 어떻게 달라지는가?
cdata = read.csv("data/CDI.csv", header = T)
head(cdata)
## id county state area pop pop_18_34 pop_65 physician hospital
## 1 1 Los_Angeles CA 4060 8863164 32.1 9.7 23677 27700
## 2 2 Cook IL 946 5105067 29.2 12.4 15153 21550
## 3 3 Harris TX 1729 2818199 31.3 7.1 7553 12449
## 4 4 San_Diego CA 4205 2498016 33.5 10.9 5905 6179
## 5 5 Orange CA 790 2410556 32.6 9.2 6062 6369
## 6 6 Kings NY 71 2300664 28.3 12.4 4861 8942
## crime highschool bachelor poverty unemployment income total_income
## 1 688936 70.0 22.3 11.6 8.0 20786 184230
## 2 436936 73.4 22.8 11.1 7.2 21729 110928
## 3 253526 74.9 25.4 12.5 5.7 19517 55003
## 4 173821 81.9 25.3 8.1 6.1 19588 48931
## 5 144524 81.2 27.8 5.2 4.8 24400 58818
## 6 680966 63.7 16.6 19.5 9.5 16803 38658
## region
## 1 4
## 2 2
## 3 3
## 4 4
## 5 4
## 6 1
summary(cdata)
## id county state area
## Min. : 1.0 Jefferson : 7 CA : 34 Min. : 15.0
## 1st Qu.:110.8 Montgomery: 6 FL : 29 1st Qu.: 451.2
## Median :220.5 Washington: 5 PA : 29 Median : 656.5
## Mean :220.5 Cumberland: 4 TX : 28 Mean : 1041.4
## 3rd Qu.:330.2 Jackson : 4 OH : 24 3rd Qu.: 946.8
## Max. :440.0 Lake : 4 NY : 22 Max. :20062.0
## (Other) :410 (Other):274
## pop pop_18_34 pop_65 physician
## Min. : 100043 Min. :16.40 Min. : 3.000 Min. : 39.0
## 1st Qu.: 139027 1st Qu.:26.20 1st Qu.: 9.875 1st Qu.: 182.8
## Median : 217280 Median :28.10 Median :11.750 Median : 401.0
## Mean : 393011 Mean :28.57 Mean :12.170 Mean : 988.0
## 3rd Qu.: 436064 3rd Qu.:30.02 3rd Qu.:13.625 3rd Qu.: 1036.0
## Max. :8863164 Max. :49.70 Max. :33.800 Max. :23677.0
##
## hospital crime highschool bachelor
## Min. : 92.0 Min. : 563 Min. :46.60 Min. : 8.10
## 1st Qu.: 390.8 1st Qu.: 6220 1st Qu.:73.88 1st Qu.:15.28
## Median : 755.0 Median : 11820 Median :77.70 Median :19.70
## Mean : 1458.6 Mean : 27112 Mean :77.56 Mean :21.08
## 3rd Qu.: 1575.8 3rd Qu.: 26280 3rd Qu.:82.40 3rd Qu.:25.32
## Max. :27700.0 Max. :688936 Max. :92.90 Max. :52.30
##
## poverty unemployment income total_income
## Min. : 1.400 Min. : 2.200 Min. : 8899 Min. : 1141
## 1st Qu.: 5.300 1st Qu.: 5.100 1st Qu.:16118 1st Qu.: 2311
## Median : 7.900 Median : 6.200 Median :17759 Median : 3857
## Mean : 8.721 Mean : 6.597 Mean :18561 Mean : 7869
## 3rd Qu.:10.900 3rd Qu.: 7.500 3rd Qu.:20270 3rd Qu.: 8654
## Max. :36.300 Max. :21.300 Max. :37541 Max. :184230
##
## region
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.461
## 3rd Qu.:3.000
## Max. :4.000
##
cdata$region = factor(cdata$region)
model1 = lm(crime ~ pop + region, cdata)
summary(model1)
##
## Call:
## lm(formula = crime ~ pop + region, data = cdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73819 -4087 202 3971 493435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.109e+04 2.773e+03 -3.999 7.48e-05 ***
## pop 8.633e-02 2.148e-03 40.195 < 2e-16 ***
## region2 2.982e+03 3.690e+03 0.808 0.4195
## region3 9.674e+03 3.421e+03 2.828 0.0049 **
## region4 1.116e+03 4.055e+03 0.275 0.7833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26780 on 435 degrees of freedom
## Multiple R-squared: 0.7904, Adjusted R-squared: 0.7885
## F-statistic: 410.2 on 4 and 435 DF, p-value: < 2.2e-16
model.matrix(crime ~ pop + region, cdata)
## (Intercept) pop region2 region3 region4
## 1 1 8863164 0 0 1
## 2 1 5105067 1 0 0
## 3 1 2818199 0 1 0
## 4 1 2498016 0 0 1
## 5 1 2410556 0 0 1
## 6 1 2300664 0 0 0
## 7 1 2122101 0 0 1
## 8 1 2111687 1 0 0
## 9 1 1937094 0 1 0
## 432 1 102525 0 0 0
## 433 1 101469 0 0 1
## 434 1 101461 1 0 0
## 435 1 101154 0 1 0
## 436 1 101115 0 1 0
## 437 1 100900 0 1 0
## 438 1 100498 0 1 0
## 439 1 100374 0 0 1
## 440 1 100043 0 1 0
## attr(,"assign")
## [1] 0 1 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$region
## [1] "contr.treatment"
plot(model1) # 6 : outlier
# outlier 제거한 후 다시 모델링
cdata = cdata[-6, ]
model1 = lm(crime ~ pop + region, cdata)
summary(model1)
##
## Call:
## lm(formula = crime ~ pop + region, data = cdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62597 -4366 230 4326 88959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.357e+04 1.221e+03 -11.111 < 2e-16 ***
## pop 8.008e-02 9.561e-04 83.754 < 2e-16 ***
## region2 7.626e+03 1.627e+03 4.687 3.71e-06 ***
## region3 1.421e+04 1.509e+03 9.420 < 2e-16 ***
## region4 7.230e+03 1.789e+03 4.040 6.31e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11780 on 434 degrees of freedom
## Multiple R-squared: 0.9432, Adjusted R-squared: 0.9427
## F-statistic: 1801 on 4 and 434 DF, p-value: < 2.2e-16
anova(model1) # H0 : b1 = b2 = b3 = 0 ---> 가설 기각
## Analysis of Variance Table
##
## Response: crime
## Df Sum Sq Mean Sq F value Pr(>F)
## pop 1 9.8775e+11 9.8775e+11 7116.192 < 2.2e-16 ***
## region 3 1.2430e+10 4.1433e+09 29.851 < 2.2e-16 ***
## Residuals 434 6.0240e+10 1.3880e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cdata$fitted = model1$fitted.values
ggplot(cdata, aes(y=fitted, x=pop, color=region)) +
geom_line() +
geom_point(aes(y=crime, x=pop, shape=region))
# contrasts
model2 = lm(crime ~ pop + region, cdata, contrasts = list(region=contr.sum))
summary(model2)
##
## Call:
## lm(formula = crime ~ pop + region, data = cdata, contrasts = list(region = contr.sum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -62597 -4366 230 4326 88959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.300e+03 6.983e+02 -9.022 < 2e-16 ***
## pop 8.008e-02 9.561e-04 83.754 < 2e-16 ***
## region1 -7.267e+03 1.008e+03 -7.208 2.54e-12 ***
## region2 3.594e+02 9.906e+02 0.363 0.717
## region3 6.944e+03 8.930e+02 7.776 5.48e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11780 on 434 degrees of freedom
## Multiple R-squared: 0.9432, Adjusted R-squared: 0.9427
## F-statistic: 1801 on 4 and 434 DF, p-value: < 2.2e-16
model.matrix(crime ~ pop + region, cdata, contrasts = list(region=contr.sum))
## (Intercept) pop region1 region2 region3
## 1 1 8863164 -1 -1 -1
## 2 1 5105067 0 1 0
## 3 1 2818199 0 0 1
## 4 1 2498016 -1 -1 -1
## 5 1 2410556 -1 -1 -1
## 7 1 2122101 -1 -1 -1
## 8 1 2111687 0 1 0
## 9 1 1937094 0 0 1
## 430 1 102637 0 0 1
## 431 1 102583 0 1 0
## 432 1 102525 1 0 0
## 433 1 101469 -1 -1 -1
## 434 1 101461 0 1 0
## 435 1 101154 0 0 1
## 436 1 101115 0 0 1
## 437 1 100900 0 0 1
## 438 1 100498 0 0 1
## 439 1 100374 -1 -1 -1
## 440 1 100043 0 0 1
## attr(,"assign")
## [1] 0 1 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$region
## [,1] [,2] [,3]
## 1 1 0 0
## 2 0 1 0
## 3 0 0 1
## 4 -1 -1 -1
# Interaction 교호작용
# 지역에 따라 인구 규모가 범죄건수에 미치는 영향의 정도가 다를까?
model3 = lm(crime ~ pop + region + pop*region, cdata)
summary(model3)
##
## Call:
## lm(formula = crime ~ pop + region + pop * region, data = cdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47267 -2797 455 3245 51931
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.781e+03 1.489e+03 -1.869 0.0624 .
## pop 5.148e-02 3.022e-03 17.035 < 2e-16 ***
## region2 -4.378e+03 1.850e+03 -2.367 0.0184 *
## region3 -4.150e+03 1.830e+03 -2.267 0.0239 *
## region4 -1.778e+03 1.945e+03 -0.914 0.3612
## pop:region2 3.212e-02 3.459e-03 9.285 < 2e-16 ***
## pop:region3 5.162e-02 3.733e-03 13.830 < 2e-16 ***
## pop:region4 2.554e-02 3.191e-03 8.003 1.14e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9669 on 431 degrees of freedom
## Multiple R-squared: 0.962, Adjusted R-squared: 0.9614
## F-statistic: 1559 on 7 and 431 DF, p-value: < 2.2e-16
# pop:region2 - pop의 기울기가 NE에 비해 NC 지역에서 얼마나 높은가
# pop:region3 - pop의 기울기가 NE에 비해 S 지역에서 얼마나 높은가
# pop:region4 - pop의 기울기가 NE에 비해 W 지역에서 얼마나 높은가
# 지역별로 기울기의 차이가 있는가?
anova(model3)
## Analysis of Variance Table
##
## Response: crime
## Df Sum Sq Mean Sq F value Pr(>F)
## pop 1 9.8775e+11 9.8775e+11 10564.546 < 2.2e-16 ***
## region 3 1.2430e+10 4.1433e+09 44.316 < 2.2e-16 ***
## pop:region 3 1.9943e+10 6.6478e+09 71.102 < 2.2e-16 ***
## Residuals 431 4.0297e+10 9.3496e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cdata$fitted = model3$fitted.values
ggplot(cdata, aes(y=fitted, x=pop, color=region)) +
geom_line() +
geom_point(aes(y=crime, x=pop, shape=region))
insumodel = lm(time ~ size + type + size*type, insu)
summary(insumodel)
##
## Call:
## lm(formula = time ~ size + type + size * type, data = insu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7144 -1.7064 -0.4557 1.9311 6.3259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.9696196 2.7194341 15.433 4.98e-11 ***
## size -0.1019478 0.0128711 -7.921 6.31e-07 ***
## typeMutual -8.1312501 3.6540517 -2.225 0.0408 *
## size:typeMutual 0.0004171 0.0183312 0.023 0.9821
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.32 on 16 degrees of freedom
## Multiple R-squared: 0.8951, Adjusted R-squared: 0.8754
## F-statistic: 45.49 on 3 and 16 DF, p-value: 4.675e-08
anova(insumodel)
## Analysis of Variance Table
##
## Response: time
## Df Sum Sq Mean Sq F value Pr(>F)
## size 1 1188.17 1188.17 107.7819 1.627e-08 ***
## type 1 316.25 316.25 28.6875 6.430e-05 ***
## size:type 1 0.01 0.01 0.0005 0.9821
## Residuals 16 176.38 11.02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 두 개의 연속형 변수의 교호작용
model4 = lm(crime ~ pop + unemployment + pop*unemployment, cdata)
summary(model4)
##
## Call:
## lm(formula = crime ~ pop + unemployment + pop * unemployment,
## data = cdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70804 -3685 -53 3683 88690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.708e+03 2.556e+03 -0.668 0.504429
## pop 6.085e-02 5.530e-03 11.002 < 2e-16 ***
## unemployment -4.492e+02 3.508e+02 -1.281 0.201040
## pop:unemployment 2.598e-03 7.482e-04 3.472 0.000567 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12720 on 435 degrees of freedom
## Multiple R-squared: 0.9336, Adjusted R-squared: 0.9332
## F-statistic: 2039 on 3 and 435 DF, p-value: < 2.2e-16
# unemployment 의 p-value가 유의하지 않다고 나왔지만, pop:unemployment는 유의함.
# 주효과가 유의하지 않더라도 교호작용이 유의하면 제거하지 않음.
plot(model4)
summary(cdata$unemployment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.20 5.10 6.20 6.59 7.45 21.30
intc = -1.708e+03
ipop = 6.085e-02
iunemp = -4.492e+02
ipopunemp = 2.598e-03
ggplot(cdata, aes(x=pop, y=crime, colour="data")) +
geom_point() +
geom_abline(intercept = intc + iunemp*2.2, slope = ipop + ipopunemp*2.2, aes(colour='min')) +
geom_abline(intercept = intc + iunemp*6.59, slope = ipop + ipopunemp*6.59, aes(colour='mean')) +
geom_abline(intercept = intc + iunemp*21.3, slope = ipop + ipopunemp*21.3, aes(colour='max')) +
scale_color_discrete(name="Unemployment")
# 동일한 실업률 수준이 유지될 때 인구가 증가할수록 범죄 건수가 증가함
# 기울기가 실업률이 높을 수록 가파름