# 다중회귀분석 (Multiple Regression Model)
# programmer 20명
# salary가 experience(경력년수), score (직무적성검사성적)과 연관성을 갖는지 검증.
# Load the programmer salary data (experience, score, salary) and preview
# the first rows.
df <- read.csv(file = "data/salary.csv")
head(df, n = 6L)
## experience score salary
## 1 4 78 24.0
## 2 7 100 43.0
## 3 1 86 23.7
## 4 5 82 34.3
## 5 8 86 35.8
## 6 10 84 38.0
# Min / quartiles / median / mean / max for every column.
summary(object = df)
## experience score salary
## Min. : 0.00 Min. : 70.00 Min. :22.20
## 1st Qu.: 3.00 1st Qu.: 77.25 1st Qu.:27.80
## Median : 5.50 Median : 82.50 Median :30.85
## Mean : 5.20 Mean : 82.75 Mean :31.23
## 3rd Qu.: 7.25 3rd Qu.: 87.25 3rd Qu.:34.67
## Max. :10.00 Max. :100.00 Max. :43.00
library(psych)
# Scatterplot matrix with correlations; salary ~ experience correlates at 0.86.
pairs.panels(df)
# Simple regression: salary tends to rise with years of experience.
model <- lm(formula = salary ~ experience, data = df)
summary(model)
##
## Call:
## lm(formula = salary ~ experience, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.291 -1.441 0.249 0.719 8.849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.8111 1.3761 16.576 2.39e-12 ***
## experience 1.6200 0.2313 7.004 1.54e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.991 on 18 degrees of freedom
## Multiple R-squared: 0.7316, Adjusted R-squared: 0.7167
## F-statistic: 49.06 on 1 and 18 DF, p-value: 1.541e-06
# Multiple regression: also accounts for the aptitude score, which is itself
# correlated with experience (cor(experience, score) = 0.34).
# Equivalent shorthand: model <- lm(salary ~ ., data = df)
model <- lm(formula = salary ~ experience + score, data = df)
summary(model)
##
## Call:
## lm(formula = salary ~ experience + score, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3586 -1.4581 -0.0341 1.1862 4.9102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.17394 6.15607 0.516 0.61279
## experience 1.40390 0.19857 7.070 1.88e-06 ***
## score 0.25089 0.07735 3.243 0.00478 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.419 on 17 degrees of freedom
## Multiple R-squared: 0.8342, Adjusted R-squared: 0.8147
## F-statistic: 42.76 on 2 and 17 DF, p-value: 2.328e-07
# 추정된 회귀식
# salary = 3.174 + 1.404 * experience + 0.251 * score
# b1 : score가 일정하다고 할 때, experience가 1년 증가하면 salary가 $1,404 증가할 것으로 기대된다.
# b2 : experience가 일정하다고 할 때, score가 1점 증가하면 salary가 $251 증가할 것으로 기대된다.
# 다중회귀분석 결과 해석
# (1) Adjusted R-squared
# R-squared: 0.83 --> experience와 score가 salary 변동량의 83%를 설명한다.
# But, 설명변수 갯수가 증가하면 결정계수도 증가
# --> 설명변수 갯수에 대한 패널티 적용한 결정계수 = Adjusted R-squared
# (2) F-test
# H0 : b1 = b2 = ...... = bk = 0
# 종속변수와 모든 독립(설명)변수 집합간에 유의한 관계가 존재하는지 검정
# b0 는 큰 의미가 없다.
# (3) T-test
# H0 : bi = 0
# 각 개별 독립변수의 유의성 검정
# (4) 잔차분석 --> Residuals plot / Normal Q-Q plot / Leverage plot
# 영향점이 있는 경우
# Diagnostic plots for the two-predictor model; per the original analysis the
# leverage plot flags observation 2 as both an outlier and an influential point.
plot(model)

# Row index of the flagged observation, named once instead of repeating a
# literal 2 in both the highlighting and the removal steps.
influential_row <- 2L

# Colour/symbol code per row: 1 for ordinary rows, 2 for the flagged row.
dcolor <- rep(1, nrow(df))
dcolor[influential_row] <- 2
pairs(df, col = dcolor, pch = dcolor) # only the flagged row is drawn differently

# Whether to drop an influential point is ultimately a judgement call.
df2 <- df[-influential_row, ] # data with the influential point removed
pairs.panels(df2) # salary ~ experience correlation rises (0.91); the others fall
model2 <- lm(salary ~ experience + score, data = df2)
summary(model2) # the score coefficient is no longer significant at the 5% level
##
## Call:
## lm(formula = salary ~ experience + score, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5221 -1.4259 0.1133 1.3351 3.8131
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.31238 5.93883 2.073 0.0547 .
## experience 1.42607 0.16426 8.682 1.89e-07 ***
## score 0.13469 0.07486 1.799 0.0909 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.999 on 16 degrees of freedom
## Multiple R-squared: 0.8592, Adjusted R-squared: 0.8416
## F-statistic: 48.83 on 2 and 16 DF, p-value: 1.542e-07
# 추정과 예측
# Two profiles: 5 years of experience with score 80, and 10 years with score 70.
# 95% confidence interval for the mean salary of each profile.
predict(
  model,
  newdata = data.frame(experience = c(5, 10), score = c(80, 70)),
  interval = "confidence"
)
## fit lwr upr
## 1 30.26428 29.04555 31.48302
## 2 34.77494 31.24174 38.30815
# 95% prediction interval for one new individual with each profile.
predict(
  model,
  newdata = data.frame(experience = c(5, 10), score = c(80, 70)),
  interval = "prediction"
)
## fit lwr upr
## 1 30.26428 25.01763 35.51094
## 2 34.77494 28.56804 40.98184
# 다중공선성 (Multicollinearity)
# 독립변수들이 서로 높은 상관관계를 가지면 회귀계수의 정확한 추정이 어렵다.
# ---> 모형 선택 방법론을 적용하여 가장 적절한 변수를 선택할 수 있다.
# Survey of 30 departments with 35 employees each (per the original note).
# Each value is the percentage of employees who answered that question
# favourably.
attitude
## rating complaints privileges learning raises critical advance
## 1 43 51 30 39 61 92 45
## 2 63 64 51 54 63 73 47
## 3 71 70 68 69 76 86 48
## 4 61 63 45 47 54 84 35
## 5 81 78 56 66 71 83 47
## 6 43 55 49 44 54 49 34
## 7 58 67 42 56 66 68 35
## 8 71 75 50 55 70 66 41
## 9 72 82 72 67 71 83 31
## 10 67 61 45 47 62 80 41
## 11 64 53 53 58 58 67 34
## 12 67 60 47 39 59 74 41
## 13 69 62 57 42 55 63 25
## 14 68 83 83 45 59 77 35
## 15 77 77 54 72 79 77 46
## 16 81 90 50 72 60 54 36
## 17 74 85 64 69 79 79 63
## 18 65 60 65 75 55 80 60
## 19 65 70 46 57 75 85 46
## 20 50 58 68 54 64 78 52
## 21 50 40 33 34 43 64 33
## 22 64 61 52 62 66 80 41
## 23 53 66 52 50 63 80 37
## 24 40 37 42 58 50 57 49
## 25 63 54 42 48 66 75 33
## 26 66 77 66 63 88 76 72
## 27 78 75 58 74 80 78 49
## 28 48 57 44 45 51 83 38
## 29 85 85 71 71 77 74 55
## 30 82 82 39 59 64 78 39
# Pairwise correlation matrix of the survey variables, to three decimals.
round(cor(attitude), digits = 3)
## rating complaints privileges learning raises critical advance
## rating 1.000 0.825 0.426 0.624 0.590 0.156 0.155
## complaints 0.825 1.000 0.558 0.597 0.669 0.188 0.225
## privileges 0.426 0.558 1.000 0.493 0.445 0.147 0.343
## learning 0.624 0.597 0.493 1.000 0.640 0.116 0.532
## raises 0.590 0.669 0.445 0.640 1.000 0.377 0.574
## critical 0.156 0.188 0.147 0.116 0.377 1.000 0.283
## advance 0.155 0.225 0.343 0.532 0.574 0.283 1.000
pairs.panels(attitude)
# cor(complaints, learning) = 0.597
# cor(complaints, raises) = 0.669
# Scatterplot matrix of the response and the two predictors under study.
plot(attitude[c("rating", "complaints", "learning")])
# Regress the overall rating on complaints and learning together.
a <- lm(formula = rating ~ complaints + learning, data = attitude)
summary(a)
##
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5568 -5.7331 0.6701 6.5341 10.3610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.8709 7.0612 1.398 0.174
## complaints 0.6435 0.1185 5.432 9.57e-06 ***
## learning 0.2112 0.1344 1.571 0.128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.817 on 27 degrees of freedom
## Multiple R-squared: 0.708, Adjusted R-squared: 0.6864
## F-statistic: 32.74 on 2 and 27 DF, p-value: 6.058e-08
# learning의 t-test p-value 값을 보면 유의하지 않다.
# 하지만 rating과 상관관계가 없는 것이 아니다.
# complaints 와의 상관관계도 있기 때문에 rating 변수에 대한 역할이 작아보일 뿐이다.
# 모형 선택법 (Model Selection) = 설명변수 선택
# *** 해당 업무분야에서 반드시 들어가야 하는 변수는 고정 !!!
# (1) Forward selection
# --- 가장 유의한 변수부터 하나씩 추가 (R-sq 기준)
# --- 변수값의 작은 변동에도 결과가 크게 달라져 안정성 부족
# (2) Backward selection
# --- 모든 변수를 넣고 가장 기여도가 낮은 것부터 하나씩 제거
# --- 전체 변수 정보를 이용하는 장점
# --- 변수의 갯수가 많은 경우 사용 어려움. 안정성 부족.
# (3) Stepwise selection
# --- Forward selection과 backward selection을 조합
# --- 새로운 변수 추가 후에 기존 변수의 중요도가 약화되면 그 변수 제거
# (4) All Subsets Regression
# --- 모든 가능한 모형을 비교하여 최적의 모형선택
# --- 여러 모형 중 최소 AIC, BIC, Mallows' Cp 또는 최대 adjusted R-sq를 갖는 모형을 선택
# --- 모형의 복잡도에 벌점을 주는 방법. AIC (Akaike information criterion), BIC (Bayesian ...)
# Backward selection
# Full model: rating regressed on every other column of attitude.
out <- lm(formula = rating ~ ., data = attitude)
summary(out)
##
## Call:
## lm(formula = rating ~ ., data = attitude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9418 -4.3555 0.3158 5.5425 11.5990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.78708 11.58926 0.931 0.361634
## complaints 0.61319 0.16098 3.809 0.000903 ***
## privileges -0.07305 0.13572 -0.538 0.595594
## learning 0.32033 0.16852 1.901 0.069925 .
## raises 0.08173 0.22148 0.369 0.715480
## critical 0.03838 0.14700 0.261 0.796334
## advance -0.21706 0.17821 -1.218 0.235577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.068 on 23 degrees of freedom
## Multiple R-squared: 0.7326, Adjusted R-squared: 0.6628
## F-statistic: 10.5 on 6 and 23 DF, p-value: 1.24e-05
# Sequential ANOVA: each term is tested after the terms listed above it, so
# these F-tests depend on term order and are not the marginal t-tests shown by
# summary(out). Both tables give critical the largest p-value, so it is the
# first term to drop.
anova(object = out)
## Analysis of Variance Table
##
## Response: rating
## Df Sum Sq Mean Sq F value Pr(>F)
## complaints 1 2927.58 2927.58 58.6026 9.056e-08 ***
## privileges 1 7.52 7.52 0.1505 0.7016
## learning 1 137.25 137.25 2.7473 0.1110
## raises 1 0.94 0.94 0.0189 0.8920
## critical 1 0.56 0.56 0.0113 0.9163
## advance 1 74.11 74.11 1.4835 0.2356
## Residuals 23 1149.00 49.96
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Refit using every predictor except critical.
out2 <- lm(formula = rating ~ . - critical, data = attitude)
summary(out2)
##
## Call:
## lm(formula = rating ~ . - critical, data = attitude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.8088 -4.8353 0.4199 5.5775 11.5276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.79791 8.49061 1.507 0.144785
## complaints 0.61315 0.15783 3.885 0.000704 ***
## privileges -0.07224 0.13303 -0.543 0.592122
## learning 0.31172 0.16202 1.924 0.066300 .
## raises 0.09795 0.20842 0.470 0.642621
## advance -0.21111 0.17328 -1.218 0.234956
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.929 on 24 degrees of freedom
## Multiple R-squared: 0.7318, Adjusted R-squared: 0.6759
## F-statistic: 13.1 on 5 and 24 DF, p-value: 3.278e-06
# Sequential ANOVA for the reduced model; raises is the next term to drop.
anova(object = out2)
## Analysis of Variance Table
##
## Response: rating
## Df Sum Sq Mean Sq F value Pr(>F)
## complaints 1 2927.58 2927.58 60.9698 4.835e-08 ***
## privileges 1 7.52 7.52 0.1566 0.6958
## learning 1 137.25 137.25 2.8583 0.1039
## raises 1 0.94 0.94 0.0196 0.8898
## advance 1 71.27 71.27 1.4842 0.2350
## Residuals 24 1152.41 48.02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Automated backward elimination from the full model `out`, guided by AIC.
# trace = TRUE prints every elimination step. The reserved word TRUE is used
# instead of T, which is an ordinary variable that can be reassigned.
backward <- step(out, direction = "backward", trace = TRUE)
## Start: AIC=123.36
## rating ~ complaints + privileges + learning + raises + critical +
## advance
##
## Df Sum of Sq RSS AIC
## - critical 1 3.41 1152.4 121.45
## - raises 1 6.80 1155.8 121.54
## - privileges 1 14.47 1163.5 121.74
## - advance 1 74.11 1223.1 123.24
## <none> 1149.0 123.36
## - learning 1 180.50 1329.5 125.74
## - complaints 1 724.80 1873.8 136.04
##
## Step: AIC=121.45
## rating ~ complaints + privileges + learning + raises + advance
##
## Df Sum of Sq RSS AIC
## - raises 1 10.61 1163.0 119.73
## - privileges 1 14.16 1166.6 119.82
## - advance 1 71.27 1223.7 121.25
## <none> 1152.4 121.45
## - learning 1 177.74 1330.1 123.75
## - complaints 1 724.70 1877.1 134.09
##
## Step: AIC=119.73
## rating ~ complaints + privileges + learning + advance
##
## Df Sum of Sq RSS AIC
## - privileges 1 16.10 1179.1 118.14
## - advance 1 61.60 1224.6 119.28
## <none> 1163.0 119.73
## - learning 1 197.03 1360.0 122.42
## - complaints 1 1165.94 2328.9 138.56
##
## Step: AIC=118.14
## rating ~ complaints + learning + advance
##
## Df Sum of Sq RSS AIC
## - advance 1 75.54 1254.7 118.00
## <none> 1179.1 118.14
## - learning 1 186.12 1365.2 120.54
## - complaints 1 1259.91 2439.0 137.94
##
## Step: AIC=118
## rating ~ complaints + learning
##
## Df Sum of Sq RSS AIC
## <none> 1254.7 118.00
## - learning 1 114.73 1369.4 118.63
## - complaints 1 1370.91 2625.6 138.16
# Re-run silently to obtain just the selected model. FALSE is spelled out
# because F is a reassignable variable, not a reserved word.
backward <- step(out, direction = "backward", trace = FALSE)
backward # final model chosen: rating ~ complaints + learning
##
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
##
## Coefficients:
## (Intercept) complaints learning
## 9.8709 0.6435 0.2112
# Elimination path: critical, raises, privileges, advance were removed in
# that order.
backward[["anova"]]
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 23 1149.000 123.3635
## 2 - critical 1 3.405864 24 1152.406 121.4523
## 3 - raises 1 10.605443 25 1163.012 119.7271
## 4 - privileges 1 16.097508 26 1179.109 118.1395
## 5 - advance 1 75.539831 27 1254.649 118.0024
# Stepwise selection (direction = "both"), run silently. FALSE is spelled out
# because F is a reassignable variable, not a reserved word.
both <- step(out, direction = "both", trace = FALSE)
both
##
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
##
## Coefficients:
## (Intercept) complaints learning
## 9.8709 0.6435 0.2112
# Step-by-step record of the stepwise search.
both[["anova"]]
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 23 1149.000 123.3635
## 2 - critical 1 3.405864 24 1152.406 121.4523
## 3 - raises 1 10.605443 25 1163.012 119.7271
## 4 - privileges 1 16.097508 26 1179.109 118.1395
## 5 - advance 1 75.539831 27 1254.649 118.0024
# All Subsets Regression
library(leaps)
# Exhaustive subset search keeping the 5 best models of each size.
# The call is left exactly as typed because the summary echoes it verbatim.
leap <- regsubsets(rating ~ ., attitude, nbest = 5)
summary(object = leap)
## Subset selection object
## Call: regsubsets.formula(rating ~ ., attitude, nbest = 5)
## 6 Variables (and intercept)
## Forced in Forced out
## complaints FALSE FALSE
## privileges FALSE FALSE
## learning FALSE FALSE
## raises FALSE FALSE
## critical FALSE FALSE
## advance FALSE FALSE
## 5 subsets of each size up to 6
## Selection Algorithm: exhaustive
## complaints privileges learning raises critical advance
## 1 ( 1 ) "*" " " " " " " " " " "
## 1 ( 2 ) " " " " "*" " " " " " "
## 1 ( 3 ) " " " " " " "*" " " " "
## 1 ( 4 ) " " "*" " " " " " " " "
## 1 ( 5 ) " " " " " " " " "*" " "
## 2 ( 1 ) "*" " " "*" " " " " " "
## 2 ( 2 ) "*" " " " " "*" " " " "
## 2 ( 3 ) "*" "*" " " " " " " " "
## 2 ( 4 ) "*" " " " " " " " " "*"
## 2 ( 5 ) "*" " " " " " " "*" " "
## 3 ( 1 ) "*" " " "*" " " " " "*"
## 3 ( 2 ) "*" "*" "*" " " " " " "
## 3 ( 3 ) "*" " " "*" "*" " " " "
## 3 ( 4 ) "*" " " "*" " " "*" " "
## 3 ( 5 ) "*" " " " " "*" " " "*"
## 4 ( 1 ) "*" "*" "*" " " " " "*"
## 4 ( 2 ) "*" " " "*" "*" " " "*"
## 4 ( 3 ) "*" " " "*" " " "*" "*"
## 4 ( 4 ) "*" "*" "*" "*" " " " "
## 4 ( 5 ) "*" "*" "*" " " "*" " "
## 5 ( 1 ) "*" "*" "*" "*" " " "*"
## 5 ( 2 ) "*" "*" "*" " " "*" "*"
## 5 ( 3 ) "*" " " "*" "*" "*" "*"
## 5 ( 4 ) "*" "*" "*" "*" "*" " "
## 5 ( 5 ) "*" "*" " " "*" "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" "*" "*"
# Model-comparison charts for the subset search: BIC (the default scale,
# written out here) first, then adjusted R-squared.
plot(leap, scale = "bic")
plot(leap, scale = "adjr2")
# Practice 5: hotel margin prediction.
data <- read.csv(file = "data/laquinta.csv")
summary(object = data)
## Margin Number Nearest Office.Space
## Min. :27.30 Min. :1613 Min. :0.100 Min. :140.0
## 1st Qu.:40.15 1st Qu.:2729 1st Qu.:1.675 1st Qu.:391.5
## Median :46.00 Median :2934 Median :2.250 Median :486.5
## Mean :45.74 Mean :2985 Mean :2.310 Mean :492.2
## 3rd Qu.:51.62 3rd Qu.:3269 3rd Qu.:2.925 3rd Qu.:588.0
## Max. :62.80 Max. :4214 Max. :4.200 Max. :875.0
## Enrollment Income Distance
## Min. : 6.00 Min. :28.00 Min. : 0.200
## 1st Qu.:13.38 1st Qu.:33.00 1st Qu.: 4.550
## Median :16.00 Median :36.00 Median : 7.350
## Mean :16.07 Mean :36.22 Mean : 6.918
## 3rd Qu.:19.50 3rd Qu.:39.00 3rd Qu.: 9.025
## Max. :26.50 Max. :46.00 Max. :14.400
# Column types and a preview of the first values.
str(object = data)
## 'data.frame': 100 obs. of 7 variables:
## $ Margin : num 55.5 33.8 49 31.9 57.4 49 46 50.2 46 45.5 ...
## $ Number : int 3203 2810 2890 3422 2687 3759 2341 3021 2655 2691 ...
## $ Nearest : num 4.2 2.8 2.4 3.3 0.9 2.9 2.3 1.7 1.1 3.2 ...
## $ Office.Space: int 549 496 254 434 678 635 580 572 666 519 ...
## $ Enrollment : num 8 17.5 20 15.5 15.5 19 23 8.5 22 13.5 ...
## $ Income : int 37 35 35 38 42 33 29 41 34 46 ...
## $ Distance : num 2.7 14.4 2.6 12.1 6.9 10.8 7.4 5.5 8.1 5.7 ...
# Pairwise correlations between all variables, to three decimals.
round(cor(data), digits = 3)
## Margin Number Nearest Office.Space Enrollment Income Distance
## Margin 1.000 -0.470 0.160 0.501 0.123 0.248 -0.092
## Number -0.470 1.000 0.082 -0.093 -0.064 0.037 0.073
## Nearest 0.160 0.082 1.000 0.043 0.071 -0.045 0.091
## Office.Space 0.501 -0.093 0.043 1.000 -0.001 0.153 0.033
## Enrollment 0.123 -0.064 0.071 -0.001 1.000 -0.113 0.097
## Income 0.248 0.037 -0.045 0.153 -0.113 1.000 -0.052
## Distance -0.092 0.073 0.091 0.033 0.097 -0.052 1.000
# Correlations are low both among the predictors and with the response.
pairs.panels(data)
# Regression of Margin on all predictors.
# The overall F-test is significant, R-squared is 0.525, and every coefficient
# except Distance and Enrollment is significant.
model <- lm(formula = Margin ~ ., data = data)
summary(model)
##
## Call:
## lm(formula = Margin ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.267 -3.022 -0.086 4.234 13.596
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.138575 6.992948 5.454 4.04e-07 ***
## Number -0.007618 0.001255 -6.069 2.77e-08 ***
## Nearest 1.646237 0.632837 2.601 0.0108 *
## Office.Space 0.019766 0.003410 5.796 9.24e-08 ***
## Enrollment 0.211783 0.133428 1.587 0.1159
## Income 0.413122 0.139552 2.960 0.0039 **
## Distance -0.225258 0.178709 -1.260 0.2107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.512 on 93 degrees of freedom
## Multiple R-squared: 0.5251, Adjusted R-squared: 0.4944
## F-statistic: 17.14 on 6 and 93 DF, p-value: 3.034e-13
plot(model) # residual diagnostics show nothing unusual
# Silent backward elimination from the full Margin model. FALSE is spelled out
# because F is a reassignable variable, not a reserved word.
backward <- step(model, direction = "backward", trace = FALSE)
backward
##
## Call:
## lm(formula = Margin ~ Number + Nearest + Office.Space + Enrollment +
## Income, data = data)
##
## Coefficients:
## (Intercept) Number Nearest Office.Space Enrollment
## 37.128891 -0.007742 1.586923 0.019576 0.196385
## Income
## 0.421411
# Silent stepwise selection from the full Margin model. FALSE is spelled out
# because F is a reassignable variable, not a reserved word.
both <- step(model, direction = "both", trace = FALSE)
both
##
## Call:
## lm(formula = Margin ~ Number + Nearest + Office.Space + Enrollment +
## Income, data = data)
##
## Coefficients:
## (Intercept) Number Nearest Office.Space Enrollment
## 37.128891 -0.007742 1.586923 0.019576 0.196385
## Income
## 0.421411
# 최종 회귀모형 : Margin ~ Number + Nearest + Office.Space + Enrollment + Income
# Coefficients:
# (Intercept) Number Nearest Office.Space Enrollment Income
# 37.128891 -0.007742 1.586923 0.019576 0.196385 0.421411
# Characteristics of the site whose Margin is to be predicted with a 95%
# interval.
new <- data.frame(
  Number = 3815, Nearest = 0.9, Office.Space = 476,
  Enrollment = 24.5, Income = 35, Distance = 11.2
)
new
## Number Nearest Office.Space Enrollment Income Distance
## 1 3815 0.9 476 24.5 35 11.2
# Prediction interval for a single site (not a confidence interval for the
# mean), computed from the full model `model`, which still includes Distance.
predict(model, newdata = new, interval = "prediction")
## fit lwr upr
## 1 37.09149 25.39525 48.78772
# All-subsets search for the predictor combination that minimises BIC.
regsub <- regsubsets(Margin ~ ., data, nbest = 5)
# BIC is the default scale, written out here.
# Selected model: Margin ~ Number + Nearest + Office.Space + Income
plot(regsub, scale = "bic")
plot(regsub, scale = "adjr2")