다중회귀분석 (범주형 설명변수)

# 보험업계의 혁신에 대한 연구
# • 혁신을 받아들이는데 걸리는 시간이 회사 규모와 유형에 따라 달라지는가?
# • Y: 혁신을 받아들이는데까지 걸리는기간을 월 단위로측정
# • X1: 회사의자산규모
# • X2: 회사 유형 (stock, mutual)
insu = read.csv("data/insurance.csv", header = T)
head(insu)
##   time size   type
## 1   17  151 Mutual
## 2   26   92 Mutual
## 3   21  175 Mutual
## 4   30   31 Mutual
## 5   22  104 Mutual
## 6    0  277 Mutual
summary(insu)
##       time            size           type   
##  Min.   : 0.00   Min.   : 31.0   Mutual:10  
##  1st Qu.:13.75   1st Qu.:116.0   Stock :10  
##  Median :19.50   Median :170.5              
##  Mean   :19.40   Mean   :181.8              
##  3rd Qu.:26.50   3rd Qu.:252.5              
##  Max.   :38.00   Max.   :305.0
library(ggplot2)
ggplot(insu, aes(y=time, x=size, color=type)) + geom_point(size=3)

model.matrix(time ~ size + type, insu)
##    (Intercept) size typeStock
## 1            1  151         0
## 2            1   92         0
## 3            1  175         0
## 4            1   31         0
## 5            1  104         0
## 6            1  277         0
## 7            1  210         0
## 8            1  120         0
## 9            1  290         0
## 10           1  238         0
## 11           1  164         1
## 12           1  272         1
## 13           1  295         1
## 14           1   68         1
## 15           1   85         1
## 16           1  224         1
## 17           1  166         1
## 18           1  305         1
## 19           1  124         1
## 20           1  246         1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
model1 = lm(time ~ size + type, insu)
summary(model1)
## 
## Call:
## lm(formula = time ~ size + type, data = insu)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6915 -1.7036 -0.4385  1.9210  6.3406 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.874069   1.813858  18.675 9.15e-13 ***
## size        -0.101742   0.008891 -11.443 2.07e-09 ***
## typeStock    8.055469   1.459106   5.521 3.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.221 on 17 degrees of freedom
## Multiple R-squared:  0.8951, Adjusted R-squared:  0.8827 
## F-statistic:  72.5 on 2 and 17 DF,  p-value: 4.765e-09
# 회귀식 : y = b0 + b1*X1 + b2*X2 + e
# X1 = size
# X2 = type : 1 if stock, 0 if mutual
# Mutual    : y = b0 + b1*X1
# Stock     : y = (b0 + b2) + b1*X1

insu$fit = model1$fitted.values

ggplot(insu, aes(y=fit, x=size, group=type, color=type)) +
    geom_line() + 
    geom_point(aes(y=time, x=size, color=type))

# 기준이 되는 집단(reference level)을 변경

insu$type = relevel(insu$type, ref = "Stock")

model2 = lm(time ~ size+type, insu)
summary(model2)
## 
## Call:
## lm(formula = time ~ size + type, data = insu)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6915 -1.7036 -0.4385  1.9210  6.3406 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.929538   2.010101  20.859 1.50e-13 ***
## size        -0.101742   0.008891 -11.443 2.07e-09 ***
## typeMutual  -8.055469   1.459106  -5.521 3.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.221 on 17 degrees of freedom
## Multiple R-squared:  0.8951, Adjusted R-squared:  0.8827 
## F-statistic:  72.5 on 2 and 17 DF,  p-value: 4.765e-09
model.matrix(time ~ size+type, insu)
##    (Intercept) size typeMutual
## 1            1  151          1
## 2            1   92          1
## 3            1  175          1
## 4            1   31          1
## 5            1  104          1
## 6            1  277          1
## 7            1  210          1
## 8            1  120          1
## 9            1  290          1
## 10           1  238          1
## 11           1  164          0
## 12           1  272          0
## 13           1  295          0
## 14           1   68          0
## 15           1   85          0
## 16           1  224          0
## 17           1  166          0
## 18           1  305          0
## 19           1  124          0
## 20           1  246          0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
# Effect Codeing
# 특정 집단을 기준으로 하는 것이 아니라 전체의 평균을 기준으로 비교

model3 = lm(time ~ size+type, insu, contrasts = list(type=contr.sum))
summary(model3)
## 
## Call:
## lm(formula = time ~ size + type, data = insu, contrasts = list(type = contr.sum))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6915 -1.7036 -0.4385  1.9210  6.3406 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.901804   1.770041  21.413 9.78e-14 ***
## size        -0.101742   0.008891 -11.443 2.07e-09 ***
## type1        4.027735   0.729553   5.521 3.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.221 on 17 degrees of freedom
## Multiple R-squared:  0.8951, Adjusted R-squared:  0.8827 
## F-statistic:  72.5 on 2 and 17 DF,  p-value: 4.765e-09
model.matrix(time ~ size+type, insu, contrasts = list(type=contr.sum))
##    (Intercept) size type1
## 1            1  151    -1
## 2            1   92    -1
## 3            1  175    -1
## 4            1   31    -1
## 5            1  104    -1
## 6            1  277    -1
## 7            1  210    -1
## 8            1  120    -1
## 9            1  290    -1
## 10           1  238    -1
## 11           1  164     1
## 12           1  272     1
## 13           1  295     1
## 14           1   68     1
## 15           1   85     1
## 16           1  224     1
## 17           1  166     1
## 18           1  305     1
## 19           1  124     1
## 20           1  246     1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
##        [,1]
## Stock     1
## Mutual   -1
# 두 개 이상의 level을 가진 범주형 변수
# County demographic information (CDI)
# • 미국의 가장인구가 많은 440개 county의자료
# • Crime: 범죄 발생수(y)
# • Pop: 인구 (X1)
# • Region: 지역(1=NE, 2=NC, 3=S, 4=W)
# • 범죄발생 건수가 인구, 실업률과 지역에 따라 어떻게 달라지는가?

cdata = read.csv("data/CDI.csv", header = T)
head(cdata)
##   id      county state area     pop pop_18_34 pop_65 physician hospital
## 1  1 Los_Angeles    CA 4060 8863164      32.1    9.7     23677    27700
## 2  2        Cook    IL  946 5105067      29.2   12.4     15153    21550
## 3  3      Harris    TX 1729 2818199      31.3    7.1      7553    12449
## 4  4   San_Diego    CA 4205 2498016      33.5   10.9      5905     6179
## 5  5      Orange    CA  790 2410556      32.6    9.2      6062     6369
## 6  6       Kings    NY   71 2300664      28.3   12.4      4861     8942
##    crime highschool bachelor poverty unemployment income total_income
## 1 688936       70.0     22.3    11.6          8.0  20786       184230
## 2 436936       73.4     22.8    11.1          7.2  21729       110928
## 3 253526       74.9     25.4    12.5          5.7  19517        55003
## 4 173821       81.9     25.3     8.1          6.1  19588        48931
## 5 144524       81.2     27.8     5.2          4.8  24400        58818
## 6 680966       63.7     16.6    19.5          9.5  16803        38658
##   region
## 1      4
## 2      2
## 3      3
## 4      4
## 5      4
## 6      1
summary(cdata)
##        id               county        state          area        
##  Min.   :  1.0   Jefferson :  7   CA     : 34   Min.   :   15.0  
##  1st Qu.:110.8   Montgomery:  6   FL     : 29   1st Qu.:  451.2  
##  Median :220.5   Washington:  5   PA     : 29   Median :  656.5  
##  Mean   :220.5   Cumberland:  4   TX     : 28   Mean   : 1041.4  
##  3rd Qu.:330.2   Jackson   :  4   OH     : 24   3rd Qu.:  946.8  
##  Max.   :440.0   Lake      :  4   NY     : 22   Max.   :20062.0  
##                  (Other)   :410   (Other):274                    
##       pop            pop_18_34         pop_65         physician      
##  Min.   : 100043   Min.   :16.40   Min.   : 3.000   Min.   :   39.0  
##  1st Qu.: 139027   1st Qu.:26.20   1st Qu.: 9.875   1st Qu.:  182.8  
##  Median : 217280   Median :28.10   Median :11.750   Median :  401.0  
##  Mean   : 393011   Mean   :28.57   Mean   :12.170   Mean   :  988.0  
##  3rd Qu.: 436064   3rd Qu.:30.02   3rd Qu.:13.625   3rd Qu.: 1036.0  
##  Max.   :8863164   Max.   :49.70   Max.   :33.800   Max.   :23677.0  
##                                                                      
##     hospital           crime          highschool       bachelor    
##  Min.   :   92.0   Min.   :   563   Min.   :46.60   Min.   : 8.10  
##  1st Qu.:  390.8   1st Qu.:  6220   1st Qu.:73.88   1st Qu.:15.28  
##  Median :  755.0   Median : 11820   Median :77.70   Median :19.70  
##  Mean   : 1458.6   Mean   : 27112   Mean   :77.56   Mean   :21.08  
##  3rd Qu.: 1575.8   3rd Qu.: 26280   3rd Qu.:82.40   3rd Qu.:25.32  
##  Max.   :27700.0   Max.   :688936   Max.   :92.90   Max.   :52.30  
##                                                                    
##     poverty        unemployment        income       total_income   
##  Min.   : 1.400   Min.   : 2.200   Min.   : 8899   Min.   :  1141  
##  1st Qu.: 5.300   1st Qu.: 5.100   1st Qu.:16118   1st Qu.:  2311  
##  Median : 7.900   Median : 6.200   Median :17759   Median :  3857  
##  Mean   : 8.721   Mean   : 6.597   Mean   :18561   Mean   :  7869  
##  3rd Qu.:10.900   3rd Qu.: 7.500   3rd Qu.:20270   3rd Qu.:  8654  
##  Max.   :36.300   Max.   :21.300   Max.   :37541   Max.   :184230  
##                                                                    
##      region     
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :3.000  
##  Mean   :2.461  
##  3rd Qu.:3.000  
##  Max.   :4.000  
## 
cdata$region = factor(cdata$region)

model1 = lm(crime ~ pop + region, cdata)
summary(model1)
## 
## Call:
## lm(formula = crime ~ pop + region, data = cdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -73819  -4087    202   3971 493435 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.109e+04  2.773e+03  -3.999 7.48e-05 ***
## pop          8.633e-02  2.148e-03  40.195  < 2e-16 ***
## region2      2.982e+03  3.690e+03   0.808   0.4195    
## region3      9.674e+03  3.421e+03   2.828   0.0049 ** 
## region4      1.116e+03  4.055e+03   0.275   0.7833    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26780 on 435 degrees of freedom
## Multiple R-squared:  0.7904, Adjusted R-squared:  0.7885 
## F-statistic: 410.2 on 4 and 435 DF,  p-value: < 2.2e-16
model.matrix(crime ~ pop + region, cdata)
##     (Intercept)     pop region2 region3 region4
## 1             1 8863164       0       0       1
## 2             1 5105067       1       0       0
## 3             1 2818199       0       1       0
## 4             1 2498016       0       0       1
## 5             1 2410556       0       0       1
## 6             1 2300664       0       0       0
## 7             1 2122101       0       0       1
## 8             1 2111687       1       0       0
## 9             1 1937094       0       1       0
## 432           1  102525       0       0       0
## 433           1  101469       0       0       1
## 434           1  101461       1       0       0
## 435           1  101154       0       1       0
## 436           1  101115       0       1       0
## 437           1  100900       0       1       0
## 438           1  100498       0       1       0
## 439           1  100374       0       0       1
## 440           1  100043       0       1       0
## attr(,"assign")
## [1] 0 1 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$region
## [1] "contr.treatment"
plot(model1) # 6 : outlier

# outlier 제거한 후 다시 모델링
cdata = cdata[-6, ]
model1 = lm(crime ~ pop + region, cdata)
summary(model1)
## 
## Call:
## lm(formula = crime ~ pop + region, data = cdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62597  -4366    230   4326  88959 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.357e+04  1.221e+03 -11.111  < 2e-16 ***
## pop          8.008e-02  9.561e-04  83.754  < 2e-16 ***
## region2      7.626e+03  1.627e+03   4.687 3.71e-06 ***
## region3      1.421e+04  1.509e+03   9.420  < 2e-16 ***
## region4      7.230e+03  1.789e+03   4.040 6.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11780 on 434 degrees of freedom
## Multiple R-squared:  0.9432, Adjusted R-squared:  0.9427 
## F-statistic:  1801 on 4 and 434 DF,  p-value: < 2.2e-16
anova(model1) # H0 : b1 = b2 = b3 = 0 ---> 가설 기각
## Analysis of Variance Table
## 
## Response: crime
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## pop         1 9.8775e+11 9.8775e+11 7116.192 < 2.2e-16 ***
## region      3 1.2430e+10 4.1433e+09   29.851 < 2.2e-16 ***
## Residuals 434 6.0240e+10 1.3880e+08                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cdata$fitted = model1$fitted.values
ggplot(cdata, aes(y=fitted, x=pop, color=region)) +
    geom_line() + 
    geom_point(aes(y=crime, x=pop, shape=region))

# contrasts
model2 = lm(crime ~ pop + region, cdata, contrasts = list(region=contr.sum))
summary(model2)
## 
## Call:
## lm(formula = crime ~ pop + region, data = cdata, contrasts = list(region = contr.sum))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62597  -4366    230   4326  88959 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.300e+03  6.983e+02  -9.022  < 2e-16 ***
## pop          8.008e-02  9.561e-04  83.754  < 2e-16 ***
## region1     -7.267e+03  1.008e+03  -7.208 2.54e-12 ***
## region2      3.594e+02  9.906e+02   0.363    0.717    
## region3      6.944e+03  8.930e+02   7.776 5.48e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11780 on 434 degrees of freedom
## Multiple R-squared:  0.9432, Adjusted R-squared:  0.9427 
## F-statistic:  1801 on 4 and 434 DF,  p-value: < 2.2e-16
model.matrix(crime ~ pop + region, cdata, contrasts = list(region=contr.sum))
##     (Intercept)     pop region1 region2 region3
## 1             1 8863164      -1      -1      -1
## 2             1 5105067       0       1       0
## 3             1 2818199       0       0       1
## 4             1 2498016      -1      -1      -1
## 5             1 2410556      -1      -1      -1
## 7             1 2122101      -1      -1      -1
## 8             1 2111687       0       1       0
## 9             1 1937094       0       0       1
## 430           1  102637       0       0       1
## 431           1  102583       0       1       0
## 432           1  102525       1       0       0
## 433           1  101469      -1      -1      -1
## 434           1  101461       0       1       0
## 435           1  101154       0       0       1
## 436           1  101115       0       0       1
## 437           1  100900       0       0       1
## 438           1  100498       0       0       1
## 439           1  100374      -1      -1      -1
## 440           1  100043       0       0       1
## attr(,"assign")
## [1] 0 1 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$region
##   [,1] [,2] [,3]
## 1    1    0    0
## 2    0    1    0
## 3    0    0    1
## 4   -1   -1   -1
# Interaction 교호작용
# 지역에 따라 인구 규모가 범죄건수에 미치는 영향의 정도가 다를까?
model3 = lm(crime ~ pop + region + pop*region, cdata)
summary(model3)
## 
## Call:
## lm(formula = crime ~ pop + region + pop * region, data = cdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47267  -2797    455   3245  51931 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.781e+03  1.489e+03  -1.869   0.0624 .  
## pop          5.148e-02  3.022e-03  17.035  < 2e-16 ***
## region2     -4.378e+03  1.850e+03  -2.367   0.0184 *  
## region3     -4.150e+03  1.830e+03  -2.267   0.0239 *  
## region4     -1.778e+03  1.945e+03  -0.914   0.3612    
## pop:region2  3.212e-02  3.459e-03   9.285  < 2e-16 ***
## pop:region3  5.162e-02  3.733e-03  13.830  < 2e-16 ***
## pop:region4  2.554e-02  3.191e-03   8.003 1.14e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9669 on 431 degrees of freedom
## Multiple R-squared:  0.962,  Adjusted R-squared:  0.9614 
## F-statistic:  1559 on 7 and 431 DF,  p-value: < 2.2e-16
# pop:region2 - pop의 기울기가 NE에 비해 NC 지역에서 얼마나 높은가
# pop:region3 - pop의 기울기가 NE에 비해 S 지역에서 얼마나 높은가
# pop:region4 - pop의 기울기가 NE에 비해 W 지역에서 얼마나 높은가

# 지역별로 기울기의 차이가 있는가?
anova(model3)
## Analysis of Variance Table
## 
## Response: crime
##             Df     Sum Sq    Mean Sq   F value    Pr(>F)    
## pop          1 9.8775e+11 9.8775e+11 10564.546 < 2.2e-16 ***
## region       3 1.2430e+10 4.1433e+09    44.316 < 2.2e-16 ***
## pop:region   3 1.9943e+10 6.6478e+09    71.102 < 2.2e-16 ***
## Residuals  431 4.0297e+10 9.3496e+07                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cdata$fitted = model3$fitted.values
ggplot(cdata, aes(y=fitted, x=pop, color=region)) +
    geom_line() + 
    geom_point(aes(y=crime, x=pop, shape=region))

insumodel = lm(time ~ size + type + size*type, insu)
summary(insumodel)
## 
## Call:
## lm(formula = time ~ size + type + size * type, data = insu)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7144 -1.7064 -0.4557  1.9311  6.3259 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     41.9696196  2.7194341  15.433 4.98e-11 ***
## size            -0.1019478  0.0128711  -7.921 6.31e-07 ***
## typeMutual      -8.1312501  3.6540517  -2.225   0.0408 *  
## size:typeMutual  0.0004171  0.0183312   0.023   0.9821    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.32 on 16 degrees of freedom
## Multiple R-squared:  0.8951, Adjusted R-squared:  0.8754 
## F-statistic: 45.49 on 3 and 16 DF,  p-value: 4.675e-08
anova(insumodel)
## Analysis of Variance Table
## 
## Response: time
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## size       1 1188.17 1188.17 107.7819 1.627e-08 ***
## type       1  316.25  316.25  28.6875 6.430e-05 ***
## size:type  1    0.01    0.01   0.0005    0.9821    
## Residuals 16  176.38   11.02                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 두 개의 연속형 변수의 교호작용
model4 = lm(crime ~ pop + unemployment + pop*unemployment, cdata)
summary(model4)
## 
## Call:
## lm(formula = crime ~ pop + unemployment + pop * unemployment, 
##     data = cdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70804  -3685    -53   3683  88690 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.708e+03  2.556e+03  -0.668 0.504429    
## pop               6.085e-02  5.530e-03  11.002  < 2e-16 ***
## unemployment     -4.492e+02  3.508e+02  -1.281 0.201040    
## pop:unemployment  2.598e-03  7.482e-04   3.472 0.000567 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12720 on 435 degrees of freedom
## Multiple R-squared:  0.9336, Adjusted R-squared:  0.9332 
## F-statistic:  2039 on 3 and 435 DF,  p-value: < 2.2e-16
# unemployment 의 p-value가 유의하지 않다고 나왔지만, pop:unemployment는 유의함.
# 주효과가 유의하지 않더라도 교호작용이 유의하면 제거하지 않음.

plot(model4)

summary(cdata$unemployment)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.20    5.10    6.20    6.59    7.45   21.30
intc = -1.708e+03
ipop = 6.085e-02
iunemp = -4.492e+02
ipopunemp = 2.598e-03

ggplot(cdata, aes(x=pop, y=crime, colour="data")) +
    geom_point() +
    geom_abline(intercept = intc + iunemp*2.2, slope = ipop + ipopunemp*2.2, aes(colour='min')) +
    geom_abline(intercept = intc + iunemp*6.59, slope = ipop + ipopunemp*6.59, aes(colour='mean')) +
    geom_abline(intercept = intc + iunemp*21.3, slope = ipop + ipopunemp*21.3, aes(colour='max')) +
    scale_color_discrete(name="Unemployment")

# 동일한 실업률 수준이 유지될 때 인구가 증가할수록 범죄 건수가 증가함
# 기울기가 실업률이 높을 수록 가파름