R을 활용한 통계분석 - 5.다중회귀분석

# 다중회귀분석 (Multiple Regression Model)

# programmer 20명
# salary가 experience(경력년수), score (직무적성검사성적)과 연관성을 갖는지 검증.
df <- read.csv("data/salary.csv")
head(df)

##   experience score salary
## 1          4    78   24.0
## 2          7   100   43.0
## 3          1    86   23.7
## 4          5    82   34.3
## 5          8    86   35.8
## 6         10    84   38.0

summary(df)

##    experience        score            salary     
##  Min.   : 0.00   Min.   : 70.00   Min.   :22.20  
##  1st Qu.: 3.00   1st Qu.: 77.25   1st Qu.:27.80  
##  Median : 5.50   Median : 82.50   Median :30.85  
##  Mean   : 5.20   Mean   : 82.75   Mean   :31.23  
##  3rd Qu.: 7.25   3rd Qu.: 87.25   3rd Qu.:34.67  
##  Max.   :10.00   Max.   :100.00   Max.   :43.00

library(psych)
pairs.panels(df)    # salary ~ experience 상관계수 0.86

# 단순회귀 : 경력 증가시 연봉 증가 상관관계
model <- lm(salary ~ experience, data = df)
summary(model)

## 
## Call:
## lm(formula = salary ~ experience, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.291 -1.441  0.249  0.719  8.849 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.8111     1.3761  16.576 2.39e-12 ***
## experience    1.6200     0.2313   7.004 1.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.991 on 18 degrees of freedom
## Multiple R-squared:  0.7316, Adjusted R-squared:  0.7167 
## F-statistic: 49.06 on 1 and 18 DF,  p-value: 1.541e-06

# 다중회귀 : 경력 증가시 적성검사 점수 증가로 인한 연봉 증가까지 포함된 관계
# experience ~ score 의 cor() = 0.34
# model <- lm(salary ~ ., data = df) 
model <- lm(salary ~ experience + score, data = df)
summary(model)

## 
## Call:
## lm(formula = salary ~ experience + score, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3586 -1.4581 -0.0341  1.1862  4.9102 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.17394    6.15607   0.516  0.61279    
## experience   1.40390    0.19857   7.070 1.88e-06 ***
## score        0.25089    0.07735   3.243  0.00478 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.419 on 17 degrees of freedom
## Multiple R-squared:  0.8342, Adjusted R-squared:  0.8147 
## F-statistic: 42.76 on 2 and 17 DF,  p-value: 2.328e-07

# 추정된 회귀식
# salary = 3.174 + 1.404 * experience + 0.251 * score

# b1 : b2(score)가 일정하다고 할 때, experience가 1년 증가하면 salary가 $1,404 증가할 것으로 기대된다.
# b2 : b1(experience)가 일정하다고 할 때, score가 1점 증가하면 salary가 $251 증가할 것으로 기대된다.

# 다중회귀분석 결과 해석

# (1) Adjusted R-squared
# R-squared: 0.83 --> experience와 score가 salary 변동량의 83%를 설명한다.
# But, 설명변수 갯수가 증가하면 결정계수도 증가
# --> 설명변수 갯수에 대한 패널티 적용한 결정계수 = Adjusted R-squared

# (2) F-test
# H0 : b1 = b2 = ...... = bk = 0
# 종속변수와 모든 독립(설명)변수 집합간에 유의한 관계가 존재하는지 검정
# b0 는 큰 의미가 없다.

# (3) T-test
# H0 : bi = 0
# 각 개별 독립변수의 유의성 검정

# (4) 잔차분석 --> Residuals plot / Normal Q-Q plot / Leverage plot

# 영향점이 있는 경우

plot(model)   # --> Leverage plot에서 2번째 자료가 이상치 & 영향점

dcolor <- rep(1, length(df$salary))
dcolor[2] = 2
pairs(df, col = dcolor, pch = dcolor)     # 2번 자료만 다르게 표시

# 영향점 제거는 주관적으로 판단하는 수밖에 없다.

df2 <- df[-2, ]         # 영향점 제거할 경우
pairs.panels(df2)       # salary ~ experience 상관계수 높아짐(0.91). 다른 상관계수는 낮아짐.

model2 <- lm(salary ~ experience + score, data = df2)
summary(model2)         # score 회귀계수가 유의하지 않다.

## 
## Call:
## lm(formula = salary ~ experience + score, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5221 -1.4259  0.1133  1.3351  3.8131 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.31238    5.93883   2.073   0.0547 .  
## experience   1.42607    0.16426   8.682 1.89e-07 ***
## score        0.13469    0.07486   1.799   0.0909 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.999 on 16 degrees of freedom
## Multiple R-squared:  0.8592, Adjusted R-squared:  0.8416 
## F-statistic: 48.83 on 2 and 16 DF,  p-value: 1.542e-07

# 추정과 예측

# 경력 5년, 적성검사성적 80점인 사람과 경력 10년, 성적 70점인 사람의 연봉 예측

# 평균 연봉의 95% 신뢰구간
predict(model, data.frame("experience" = c(5,10), "score" = c(80,70)), 
        interval = "confidence")

##        fit      lwr      upr
## 1 30.26428 29.04555 31.48302
## 2 34.77494 31.24174 38.30815

# 새로운 한 명에 대한 95% 예측구간
predict(model, data.frame("experience" = c(5,10), "score" = c(80,70)), 
        interval = "prediction")

##        fit      lwr      upr
## 1 30.26428 25.01763 35.51094
## 2 34.77494 28.56804 40.98184

# 다중공선성 (Multicollinearity)

# 독립변수들이 서로 높은 상관관계를 가지면 회귀계수의 정확한 추정이 어렵다.
# ---> 모형 선택 방법론을 적용하여 가장 적절한 변수를 선택할 수 있다.

# 30개 부서에서 부서당 35명의 직원 설문조사
# 데이터 숫자는 해당 질문에 긍정한 직원의 비율
attitude

##    rating complaints privileges learning raises critical advance
## 1      43         51         30       39     61       92      45
## 2      63         64         51       54     63       73      47
## 3      71         70         68       69     76       86      48
## 4      61         63         45       47     54       84      35
## 5      81         78         56       66     71       83      47
## 6      43         55         49       44     54       49      34
## 7      58         67         42       56     66       68      35
## 8      71         75         50       55     70       66      41
## 9      72         82         72       67     71       83      31
## 10     67         61         45       47     62       80      41
## 11     64         53         53       58     58       67      34
## 12     67         60         47       39     59       74      41
## 13     69         62         57       42     55       63      25
## 14     68         83         83       45     59       77      35
## 15     77         77         54       72     79       77      46
## 16     81         90         50       72     60       54      36
## 17     74         85         64       69     79       79      63
## 18     65         60         65       75     55       80      60
## 19     65         70         46       57     75       85      46
## 20     50         58         68       54     64       78      52
## 21     50         40         33       34     43       64      33
## 22     64         61         52       62     66       80      41
## 23     53         66         52       50     63       80      37
## 24     40         37         42       58     50       57      49
## 25     63         54         42       48     66       75      33
## 26     66         77         66       63     88       76      72
## 27     78         75         58       74     80       78      49
## 28     48         57         44       45     51       83      38
## 29     85         85         71       71     77       74      55
## 30     82         82         39       59     64       78      39

round(cor(attitude),3)

##            rating complaints privileges learning raises critical advance
## rating      1.000      0.825      0.426    0.624  0.590    0.156   0.155
## complaints  0.825      1.000      0.558    0.597  0.669    0.188   0.225
## privileges  0.426      0.558      1.000    0.493  0.445    0.147   0.343
## learning    0.624      0.597      0.493    1.000  0.640    0.116   0.532
## raises      0.590      0.669      0.445    0.640  1.000    0.377   0.574
## critical    0.156      0.188      0.147    0.116  0.377    1.000   0.283
## advance     0.155      0.225      0.343    0.532  0.574    0.283   1.000

pairs.panels(attitude)

# cor : complaints + learning = 0.597
# cor : complaints + raises = 0.669

plot(attitude[ , c("rating", "complaints", "learning")])

a <- lm(rating ~ complaints + learning, data = attitude)
summary(a)

## 
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5568  -5.7331   0.6701   6.5341  10.3610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.8709     7.0612   1.398    0.174    
## complaints    0.6435     0.1185   5.432 9.57e-06 ***
## learning      0.2112     0.1344   1.571    0.128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.817 on 27 degrees of freedom
## Multiple R-squared:  0.708,  Adjusted R-squared:  0.6864 
## F-statistic: 32.74 on 2 and 27 DF,  p-value: 6.058e-08

# learning의 t-test p-value 값을 보면 유의하지 않다. 
# 하지만 rating과 상관관계가 없는 것이 아니다. 
# complaints 와의 상관관계도 있기 때문에 rating 변수에 대한 역할이 작아보일 뿐이다.

# 모형 선택법 (Model Selection) = 설명변수 선택

# *** 해당 업무분야에서 반드시 들어가야 하는 변수는 고정 !!!
# (1) Forward selection
# --- 가장 유의한 변수부터 하나씩 추가 (R-sq 기준)
# --- 변수값의 작은 변동에도 결과가 크게 달라져 안정성 부족

# (2) Backward selection
# --- 모든 변수를 넣고 가장 기여도가 낮은 것부터 하나씩 제거
# --- 전체 변수 정보를 이용하는 장점
# --- 변수의 갯수가 많은 경우 사용 어려움. 안정성 부족.

# (3) Stepwise selection
# --- Forward selection과 backward selection을 조합
# --- 새로운 변수 추가 후에 기존 변수의 중요도가 약화되면 그 변수 제거

# (4) All Subsets Regression
# --- 모든 가능한 모형을 비교하여 최적의 모형선택
# --- 여러 모형 중 최소 AIC, BIC, Mallow’s Cp 또는 최대 adjusted R-sq를 갖는 모형을 선택
# --- 모형의 복잡도에 벌점을 주는 방법. AIC (Akaike information criterion), BIC (Bayesian ...)

# Backward selection

out <- lm(rating ~ ., attitude)
summary(out)

## 
## Call:
## lm(formula = rating ~ ., data = attitude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9418  -4.3555   0.3158   5.5425  11.5990 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.78708   11.58926   0.931 0.361634    
## complaints   0.61319    0.16098   3.809 0.000903 ***
## privileges  -0.07305    0.13572  -0.538 0.595594    
## learning     0.32033    0.16852   1.901 0.069925 .  
## raises       0.08173    0.22148   0.369 0.715480    
## critical     0.03838    0.14700   0.261 0.796334    
## advance     -0.21706    0.17821  -1.218 0.235577    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.068 on 23 degrees of freedom
## Multiple R-squared:  0.7326, Adjusted R-squared:  0.6628 
## F-statistic:  10.5 on 6 and 23 DF,  p-value: 1.24e-05

anova(out)      # 각 회귀계수 t검정 p-value 기준 선별. critical 제거.

## Analysis of Variance Table
## 
## Response: rating
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## complaints  1 2927.58 2927.58 58.6026 9.056e-08 ***
## privileges  1    7.52    7.52  0.1505    0.7016    
## learning    1  137.25  137.25  2.7473    0.1110    
## raises      1    0.94    0.94  0.0189    0.8920    
## critical    1    0.56    0.56  0.0113    0.9163    
## advance     1   74.11   74.11  1.4835    0.2356    
## Residuals  23 1149.00   49.96                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

out2 <- lm(rating ~ . - critical, data = attitude)
summary(out2)

## 
## Call:
## lm(formula = rating ~ . - critical, data = attitude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.8088  -4.8353   0.4199   5.5775  11.5276 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.79791    8.49061   1.507 0.144785    
## complaints   0.61315    0.15783   3.885 0.000704 ***
## privileges  -0.07224    0.13303  -0.543 0.592122    
## learning     0.31172    0.16202   1.924 0.066300 .  
## raises       0.09795    0.20842   0.470 0.642621    
## advance     -0.21111    0.17328  -1.218 0.234956    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.929 on 24 degrees of freedom
## Multiple R-squared:  0.7318, Adjusted R-squared:  0.6759 
## F-statistic:  13.1 on 5 and 24 DF,  p-value: 3.278e-06

anova(out2)     # raises 제거

## Analysis of Variance Table
## 
## Response: rating
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## complaints  1 2927.58 2927.58 60.9698 4.835e-08 ***
## privileges  1    7.52    7.52  0.1566    0.6958    
## learning    1  137.25  137.25  2.8583    0.1039    
## raises      1    0.94    0.94  0.0196    0.8898    
## advance     1   71.27   71.27  1.4842    0.2350    
## Residuals  24 1152.41   48.02                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Backward selection 자동화
backward <- step(out, direction = "backward", trace = T)

## Start:  AIC=123.36
## rating ~ complaints + privileges + learning + raises + critical + 
##     advance
## 
##              Df Sum of Sq    RSS    AIC
## - critical    1      3.41 1152.4 121.45
## - raises      1      6.80 1155.8 121.54
## - privileges  1     14.47 1163.5 121.74
## - advance     1     74.11 1223.1 123.24
## <none>                    1149.0 123.36
## - learning    1    180.50 1329.5 125.74
## - complaints  1    724.80 1873.8 136.04
## 
## Step:  AIC=121.45
## rating ~ complaints + privileges + learning + raises + advance
## 
##              Df Sum of Sq    RSS    AIC
## - raises      1     10.61 1163.0 119.73
## - privileges  1     14.16 1166.6 119.82
## - advance     1     71.27 1223.7 121.25
## <none>                    1152.4 121.45
## - learning    1    177.74 1330.1 123.75
## - complaints  1    724.70 1877.1 134.09
## 
## Step:  AIC=119.73
## rating ~ complaints + privileges + learning + advance
## 
##              Df Sum of Sq    RSS    AIC
## - privileges  1     16.10 1179.1 118.14
## - advance     1     61.60 1224.6 119.28
## <none>                    1163.0 119.73
## - learning    1    197.03 1360.0 122.42
## - complaints  1   1165.94 2328.9 138.56
## 
## Step:  AIC=118.14
## rating ~ complaints + learning + advance
## 
##              Df Sum of Sq    RSS    AIC
## - advance     1     75.54 1254.7 118.00
## <none>                    1179.1 118.14
## - learning    1    186.12 1365.2 120.54
## - complaints  1   1259.91 2439.0 137.94
## 
## Step:  AIC=118
## rating ~ complaints + learning
## 
##              Df Sum of Sq    RSS    AIC
## <none>                    1254.7 118.00
## - learning    1    114.73 1369.4 118.63
## - complaints  1   1370.91 2625.6 138.16

backward <- step(out, direction = "backward", trace = F)
backward        # 최종 선택된 회귀모형 : rating ~ complaints + learning

## 
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
## 
## Coefficients:
## (Intercept)   complaints     learning  
##      9.8709       0.6435       0.2112

backward$anova  # critical, raises, privileges, advance 순으로 제거됨

##           Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1              NA        NA        23   1149.000 123.3635
## 2   - critical  1  3.405864        24   1152.406 121.4523
## 3     - raises  1 10.605443        25   1163.012 119.7271
## 4 - privileges  1 16.097508        26   1179.109 118.1395
## 5    - advance  1 75.539831        27   1254.649 118.0024

# Stepwise selection

both <- step(out, direction = "both", trace = F)
both

## 
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
## 
## Coefficients:
## (Intercept)   complaints     learning  
##      9.8709       0.6435       0.2112

both$anova

##           Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1              NA        NA        23   1149.000 123.3635
## 2   - critical  1  3.405864        24   1152.406 121.4523
## 3     - raises  1 10.605443        25   1163.012 119.7271
## 4 - privileges  1 16.097508        26   1179.109 118.1395
## 5    - advance  1 75.539831        27   1254.649 118.0024

# All Subsets Regression

library(leaps)

leap <- regsubsets(rating ~ ., attitude, nbest = 5)   # size당 5개의 최적 모형 저장
summary(leap)

## Subset selection object
## Call: regsubsets.formula(rating ~ ., attitude, nbest = 5)
## 6 Variables  (and intercept)
##            Forced in Forced out
## complaints     FALSE      FALSE
## privileges     FALSE      FALSE
## learning       FALSE      FALSE
## raises         FALSE      FALSE
## critical       FALSE      FALSE
## advance        FALSE      FALSE
## 5 subsets of each size up to 6
## Selection Algorithm: exhaustive
##          complaints privileges learning raises critical advance
## 1  ( 1 ) "*"        " "        " "      " "    " "      " "    
## 1  ( 2 ) " "        " "        "*"      " "    " "      " "    
## 1  ( 3 ) " "        " "        " "      "*"    " "      " "    
## 1  ( 4 ) " "        "*"        " "      " "    " "      " "    
## 1  ( 5 ) " "        " "        " "      " "    "*"      " "    
## 2  ( 1 ) "*"        " "        "*"      " "    " "      " "    
## 2  ( 2 ) "*"        " "        " "      "*"    " "      " "    
## 2  ( 3 ) "*"        "*"        " "      " "    " "      " "    
## 2  ( 4 ) "*"        " "        " "      " "    " "      "*"    
## 2  ( 5 ) "*"        " "        " "      " "    "*"      " "    
## 3  ( 1 ) "*"        " "        "*"      " "    " "      "*"    
## 3  ( 2 ) "*"        "*"        "*"      " "    " "      " "    
## 3  ( 3 ) "*"        " "        "*"      "*"    " "      " "    
## 3  ( 4 ) "*"        " "        "*"      " "    "*"      " "    
## 3  ( 5 ) "*"        " "        " "      "*"    " "      "*"    
## 4  ( 1 ) "*"        "*"        "*"      " "    " "      "*"    
## 4  ( 2 ) "*"        " "        "*"      "*"    " "      "*"    
## 4  ( 3 ) "*"        " "        "*"      " "    "*"      "*"    
## 4  ( 4 ) "*"        "*"        "*"      "*"    " "      " "    
## 4  ( 5 ) "*"        "*"        "*"      " "    "*"      " "    
## 5  ( 1 ) "*"        "*"        "*"      "*"    " "      "*"    
## 5  ( 2 ) "*"        "*"        "*"      " "    "*"      "*"    
## 5  ( 3 ) "*"        " "        "*"      "*"    "*"      "*"    
## 5  ( 4 ) "*"        "*"        "*"      "*"    "*"      " "    
## 5  ( 5 ) "*"        "*"        " "      "*"    "*"      "*"    
## 6  ( 1 ) "*"        "*"        "*"      "*"    "*"      "*"

plot(leap)

plot(leap, scale = "adjr2") # adjusted r-squred 기준

# practice 5

# hotel margin prediction

data <- read.csv("data/laquinta.csv")
summary(data)

##      Margin          Number        Nearest       Office.Space  
##  Min.   :27.30   Min.   :1613   Min.   :0.100   Min.   :140.0  
##  1st Qu.:40.15   1st Qu.:2729   1st Qu.:1.675   1st Qu.:391.5  
##  Median :46.00   Median :2934   Median :2.250   Median :486.5  
##  Mean   :45.74   Mean   :2985   Mean   :2.310   Mean   :492.2  
##  3rd Qu.:51.62   3rd Qu.:3269   3rd Qu.:2.925   3rd Qu.:588.0  
##  Max.   :62.80   Max.   :4214   Max.   :4.200   Max.   :875.0  
##    Enrollment        Income         Distance     
##  Min.   : 6.00   Min.   :28.00   Min.   : 0.200  
##  1st Qu.:13.38   1st Qu.:33.00   1st Qu.: 4.550  
##  Median :16.00   Median :36.00   Median : 7.350  
##  Mean   :16.07   Mean   :36.22   Mean   : 6.918  
##  3rd Qu.:19.50   3rd Qu.:39.00   3rd Qu.: 9.025  
##  Max.   :26.50   Max.   :46.00   Max.   :14.400

str(data)

## 'data.frame':    100 obs. of  7 variables:
##  $ Margin      : num  55.5 33.8 49 31.9 57.4 49 46 50.2 46 45.5 ...
##  $ Number      : int  3203 2810 2890 3422 2687 3759 2341 3021 2655 2691 ...
##  $ Nearest     : num  4.2 2.8 2.4 3.3 0.9 2.9 2.3 1.7 1.1 3.2 ...
##  $ Office.Space: int  549 496 254 434 678 635 580 572 666 519 ...
##  $ Enrollment  : num  8 17.5 20 15.5 15.5 19 23 8.5 22 13.5 ...
##  $ Income      : int  37 35 35 38 42 33 29 41 34 46 ...
##  $ Distance    : num  2.7 14.4 2.6 12.1 6.9 10.8 7.4 5.5 8.1 5.7 ...

# 자료의 산점도 확인
round( cor(data), 3)

##              Margin Number Nearest Office.Space Enrollment Income Distance
## Margin        1.000 -0.470   0.160        0.501      0.123  0.248   -0.092
## Number       -0.470  1.000   0.082       -0.093     -0.064  0.037    0.073
## Nearest       0.160  0.082   1.000        0.043      0.071 -0.045    0.091
## Office.Space  0.501 -0.093   0.043        1.000     -0.001  0.153    0.033
## Enrollment    0.123 -0.064   0.071       -0.001      1.000 -0.113    0.097
## Income        0.248  0.037  -0.045        0.153     -0.113  1.000   -0.052
## Distance     -0.092  0.073   0.091        0.033      0.097 -0.052    1.000

pairs.panels(data)   # 설명변수간의 correlation도 낮다. 종속변수와도 낮다.

# 회귀모형
model <- lm(Margin ~ ., data)
summary(model)      # F-test 유의함. R-squared: 0.525. Distance, Enrollment 제외한 회귀계수 유의함.

## 
## Call:
## lm(formula = Margin ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.267  -3.022  -0.086   4.234  13.596 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.138575   6.992948   5.454 4.04e-07 ***
## Number       -0.007618   0.001255  -6.069 2.77e-08 ***
## Nearest       1.646237   0.632837   2.601   0.0108 *  
## Office.Space  0.019766   0.003410   5.796 9.24e-08 ***
## Enrollment    0.211783   0.133428   1.587   0.1159    
## Income        0.413122   0.139552   2.960   0.0039 ** 
## Distance     -0.225258   0.178709  -1.260   0.2107    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.512 on 93 degrees of freedom
## Multiple R-squared:  0.5251, Adjusted R-squared:  0.4944 
## F-statistic: 17.14 on 6 and 93 DF,  p-value: 3.034e-13

plot(model)         # 잔차도 이상 없음

backward <- step(model, direction = "backward", trace = F)
backward

## 
## Call:
## lm(formula = Margin ~ Number + Nearest + Office.Space + Enrollment + 
##     Income, data = data)
## 
## Coefficients:
##  (Intercept)        Number       Nearest  Office.Space    Enrollment  
##    37.128891     -0.007742      1.586923      0.019576      0.196385  
##       Income  
##     0.421411

both <- step(model, direction = "both", trace = F)
both

## 
## Call:
## lm(formula = Margin ~ Number + Nearest + Office.Space + Enrollment + 
##     Income, data = data)
## 
## Coefficients:
##  (Intercept)        Number       Nearest  Office.Space    Enrollment  
##    37.128891     -0.007742      1.586923      0.019576      0.196385  
##       Income  
##     0.421411

# 최종 회귀모형 : Margin ~ Number + Nearest + Office.Space + Enrollment + Income
# Coefficients:
#     (Intercept)        Number       Nearest  Office.Space    Enrollment        Income  
#       37.128891     -0.007742      1.586923      0.019576      0.196385      0.421411 


# 다음 조건을 가진 한 지역의 Margin을 95% 신뢰구간으로 예측
new <- data.frame("Number" = 3815, "Nearest" = 0.9, "Office.Space" = 476, 
                  "Enrollment" = 24.5, "Income" = 35, "Distance" = 11.2)
new

##   Number Nearest Office.Space Enrollment Income Distance
## 1   3815     0.9          476       24.5     35     11.2

predict(model, new, interval = "prediction")

##        fit      lwr      upr
## 1 37.09149 25.39525 48.78772

# BIC 값을 최소로 하는 설명변수의 조합을 찾아 회귀식을 추정
regsub <- regsubsets(Margin ~ ., data, nbest = 5)
plot(regsub)   # 최종 회귀모형 : Margin ~ Number + Nearest + Office.Space + Income

plot(regsub, scale="adjr2")

R을 활용한 통계분석 - 5.다중회귀분석

woosa7