All posible subsets

Using the brdige construction example, we consider all possible subsets.

Q1. How many subsets do we have if we the intercept is always included?

A1. The number of predictors is \(p - 1\), therefore, the total number of models is \(2^{p - 1}\).

bridge <- read.table("bridge.txt", header=TRUE)
attach(bridge)

# figure of adjusted R^2 
m1 <- lm(log(Time)~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans))
logDArea <- log(DArea)
logCCost <- log(CCost)
logDwgs <- log(Dwgs)
logLength <- log(Length)
logSpans <- log(Spans)
X <- cbind(logDArea,logCCost,logDwgs,logLength,logSpans)
library(leaps)
b <- regsubsets(as.matrix(X),log(Time))
rs <- summary(b)
par(mfrow=c(1,2))
plot(1:5,rs$adjr2,xlab="Subset Size",ylab="Adjusted R-squared")
library(car)
subsets(b,statistic=c("adjr2"),legend="center")

# Generate Table of adjusted R-squared/AIC/AICc/BIC
# Calculate adjusted R-squared
rs$adjr2
## [1] 0.7022401 0.7530191 0.7582178 0.7534273 0.7475037
om1 <- lm(log(Time)~log(Dwgs))
om2 <- lm(log(Time)~log(Dwgs)+log(Spans))
om3 <- lm(log(Time)~log(Dwgs)+log(Spans)+log(CCost))
om4 <- lm(log(Time)~log(Dwgs)+log(Spans)+log(CCost)+log(DArea))
om5 <- m1
#Subset size=1
n <- length(om1$residuals)
npar <- length(om1$coefficients) +1
#Calculate AIC
extractAIC(om1,k=2)
## [1]   2.00000 -94.89754
#Calculate AICc
extractAIC(om1,k=2)+2*npar*(npar+1)/(n-npar-1)
## [1]   2.585366 -94.312171
#Calculate BIC
extractAIC(om1,k=log(n))
## [1]   2.00000 -91.28421
#Subset size=2
npar <- length(om2$coefficients) +1
#Calculate AIC
extractAIC(om2,k=2)
## [1]    3.0000 -102.3703
#Calculate AICc
extractAIC(om2,k=2)+2*npar*(npar+1)/(n-npar-1)
## [1]    4.0000 -101.3703
#Calculate BIC
extractAIC(om2,k=log(n))
## [1]   3.00000 -96.95036
#Subset size=3
npar <- length(om3$coefficients) +1
#Calculate AIC
extractAIC(om3,k=2)
## [1]    4.0000 -102.4121
#Calculate AICc
extractAIC(om3,k=2)+2*npar*(npar+1)/(n-npar-1)
## [1]    5.538462 -100.873608
#Calculate BIC
extractAIC(om3,k=log(n))
## [1]   4.00000 -95.18542
#Subset size=4
npar <- length(om4$coefficients) +1
#Calculate AIC
extractAIC(om4,k=2)
## [1]    5.0000 -100.6403
#Calculate AICc
extractAIC(om4,k=2)+2*npar*(npar+1)/(n-npar-1)
## [1]   7.210526 -98.429816
#Calculate BIC
extractAIC(om4,k=log(n))
## [1]   5.00000 -91.60703
#Subset size=5
npar <- length(om5$coefficients) +1
#Calculate AIC
extractAIC(om5,k=2)
## [1]   6.00000 -98.71136
#Calculate AICc
extractAIC(om5,k=2)+2*npar*(npar+1)/(n-npar-1)
## [1]   9.027027 -95.684330
#Calculate BIC
extractAIC(om5,k=log(n))
## [1]   6.00000 -87.87138

Best model with 2 variables:

summary(om2)
## 
## Call:
## lm(formula = log(Time) ~ log(Dwgs) + log(Spans))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68649 -0.24728 -0.05988  0.26050  0.63759 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.66173    0.26871   9.905 1.49e-12 ***
## log(Dwgs)    1.04163    0.15420   6.755 3.26e-08 ***
## log(Spans)   0.28530    0.09095   3.137  0.00312 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3105 on 42 degrees of freedom
## Multiple R-squared:  0.7642, Adjusted R-squared:  0.753 
## F-statistic: 68.08 on 2 and 42 DF,  p-value: 6.632e-14

Best model with 3 variables:

summary(om3)
## 
## Call:
## lm(formula = log(Time) ~ log(Dwgs) + log(Spans) + log(CCost))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69415 -0.17456 -0.03566  0.22739  0.64945 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.3317     0.3577   6.519  7.9e-08 ***
## log(Dwgs)     0.8356     0.2135   3.914 0.000336 ***
## log(Spans)    0.1963     0.1107   1.773 0.083710 .  
## log(CCost)    0.1483     0.1075   1.380 0.175212    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3072 on 41 degrees of freedom
## Multiple R-squared:  0.7747, Adjusted R-squared:  0.7582 
## F-statistic: 46.99 on 3 and 41 DF,  p-value: 2.484e-13

Notice that both predictor variables are judged to be statistically significant in the two-variable model, while just one variable is judged to be statistically significant in the three-variable model. In view of this, it seems that the three-variable model over-fits the data and as such the two-variable model is to be preferred.

Stepwise subsets

Foward seletion:

Backward elimination:

Q2. How many models does Forward selection or Backward elimination consider (again, the intercept is included in all models)?

A2. We at most consider \((p - 1) + (p - 2) + \cdots + 1 = p(p - 1)/2\) models.

# Backward elimination based on AIC and BIC
backAIC <- step(m1,direction="backward", data=bridge)
## Start:  AIC=-98.71
## log(Time) ~ log(DArea) + log(CCost) + log(Dwgs) + log(Length) + 
##     log(Spans)
## 
##               Df Sum of Sq    RSS      AIC
## - log(Length)  1   0.00607 3.8497 -100.640
## - log(DArea)   1   0.01278 3.8564 -100.562
## <none>                     3.8436  -98.711
## - log(CCost)   1   0.18162 4.0252  -98.634
## - log(Spans)   1   0.26616 4.1098  -97.698
## - log(Dwgs)    1   1.45358 5.2972  -86.277
## 
## Step:  AIC=-100.64
## log(Time) ~ log(DArea) + log(CCost) + log(Dwgs) + log(Spans)
## 
##              Df Sum of Sq    RSS      AIC
## - log(DArea)  1   0.01958 3.8693 -102.412
## <none>                    3.8497 -100.640
## - log(CCost)  1   0.18064 4.0303 -100.577
## - log(Spans)  1   0.31501 4.1647  -99.101
## - log(Dwgs)   1   1.44946 5.2991  -88.260
## 
## Step:  AIC=-102.41
## log(Time) ~ log(CCost) + log(Dwgs) + log(Spans)
## 
##              Df Sum of Sq    RSS      AIC
## <none>                    3.8693 -102.412
## - log(CCost)  1   0.17960 4.0488 -102.370
## - log(Spans)  1   0.29656 4.1658 -101.089
## - log(Dwgs)   1   1.44544 5.3147  -90.128
backBIC <- step(m1,direction="backward", data=bridge, k=log(n))
## Start:  AIC=-87.87
## log(Time) ~ log(DArea) + log(CCost) + log(Dwgs) + log(Length) + 
##     log(Spans)
## 
##               Df Sum of Sq    RSS     AIC
## - log(Length)  1   0.00607 3.8497 -91.607
## - log(DArea)   1   0.01278 3.8564 -91.529
## - log(CCost)   1   0.18162 4.0252 -89.600
## - log(Spans)   1   0.26616 4.1098 -88.665
## <none>                     3.8436 -87.871
## - log(Dwgs)    1   1.45358 5.2972 -77.244
## 
## Step:  AIC=-91.61
## log(Time) ~ log(DArea) + log(CCost) + log(Dwgs) + log(Spans)
## 
##              Df Sum of Sq    RSS     AIC
## - log(DArea)  1   0.01958 3.8693 -95.185
## - log(CCost)  1   0.18064 4.0303 -93.350
## - log(Spans)  1   0.31501 4.1647 -91.874
## <none>                    3.8497 -91.607
## - log(Dwgs)   1   1.44946 5.2991 -81.034
## 
## Step:  AIC=-95.19
## log(Time) ~ log(CCost) + log(Dwgs) + log(Spans)
## 
##              Df Sum of Sq    RSS     AIC
## - log(CCost)  1   0.17960 4.0488 -96.950
## - log(Spans)  1   0.29656 4.1658 -95.669
## <none>                    3.8693 -95.185
## - log(Dwgs)   1   1.44544 5.3147 -84.708
## 
## Step:  AIC=-96.95
## log(Time) ~ log(Dwgs) + log(Spans)
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    4.0488 -96.950
## - log(Spans)  1    0.9487 4.9975 -91.284
## - log(Dwgs)   1    4.3989 8.4478 -67.661

Thus, backward elimination based on AIC chooses the model with the three predictors log(CCost), log(Dwgs) and log(Spans). It can be shown that backward elimination based on BIC chooses the model with the two predictors log(Dwgs) and log(Spans).

# Forward selection based on AIC and BIC 
mint <- lm(log(Time)~1,data=bridge)
forwardAIC <- step(mint,scope=list(lower=~1, 
upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)),
direction="forward", data=bridge)
## Start:  AIC=-41.35
## log(Time) ~ 1
## 
##               Df Sum of Sq     RSS     AIC
## + log(Dwgs)    1   12.1765  4.9975 -94.898
## + log(CCost)   1   11.6147  5.5593 -90.104
## + log(DArea)   1   10.2943  6.8797 -80.514
## + log(Length)  1   10.0120  7.1620 -78.704
## + log(Spans)   1    8.7262  8.4478 -71.274
## <none>                     17.1740 -41.347
## 
## Step:  AIC=-94.9
## log(Time) ~ log(Dwgs)
## 
##               Df Sum of Sq    RSS      AIC
## + log(Spans)   1   0.94866 4.0488 -102.370
## + log(CCost)   1   0.83170 4.1658 -101.089
## + log(Length)  1   0.66914 4.3284  -99.366
## + log(DArea)   1   0.47568 4.5218  -97.399
## <none>                     4.9975  -94.898
## 
## Step:  AIC=-102.37
## log(Time) ~ log(Dwgs) + log(Spans)
## 
##               Df Sum of Sq    RSS     AIC
## + log(CCost)   1  0.179598 3.8693 -102.41
## <none>                     4.0488 -102.37
## + log(DArea)   1  0.018535 4.0303 -100.58
## + log(Length)  1  0.016924 4.0319 -100.56
## 
## Step:  AIC=-102.41
## log(Time) ~ log(Dwgs) + log(Spans) + log(CCost)
## 
##               Df Sum of Sq    RSS     AIC
## <none>                     3.8693 -102.41
## + log(DArea)   1  0.019578 3.8497 -100.64
## + log(Length)  1  0.012868 3.8564 -100.56
forwardBIC <- step(mint,scope=list(lower=~1, 
upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)),
direction="forward", data=bridge,k=log(n))
## Start:  AIC=-39.54
## log(Time) ~ 1
## 
##               Df Sum of Sq     RSS     AIC
## + log(Dwgs)    1   12.1765  4.9975 -91.284
## + log(CCost)   1   11.6147  5.5593 -86.490
## + log(DArea)   1   10.2943  6.8797 -76.901
## + log(Length)  1   10.0120  7.1620 -75.091
## + log(Spans)   1    8.7262  8.4478 -67.661
## <none>                     17.1740 -39.540
## 
## Step:  AIC=-91.28
## log(Time) ~ log(Dwgs)
## 
##               Df Sum of Sq    RSS     AIC
## + log(Spans)   1   0.94866 4.0488 -96.950
## + log(CCost)   1   0.83170 4.1658 -95.669
## + log(Length)  1   0.66914 4.3284 -93.946
## + log(DArea)   1   0.47568 4.5218 -91.979
## <none>                     4.9975 -91.284
## 
## Step:  AIC=-96.95
## log(Time) ~ log(Dwgs) + log(Spans)
## 
##               Df Sum of Sq    RSS     AIC
## <none>                     4.0488 -96.950
## + log(CCost)   1  0.179598 3.8693 -95.185
## + log(DArea)   1  0.018535 4.0303 -93.350
## + log(Length)  1  0.016924 4.0319 -93.332
detach(bridge)

Forward selection based on AIC (shown below) arrives at the same model as backward elimination based on AIC. It can be shown that forward selection based on BIC arrives at the same model as backward elimination based on BIC. We are again faced with a choice between the two-predictor and three-predictor models discussed earlier.

There is no guarantee that backward elimination and forward selection will produce the same final model. However, in practice they produce the same model in many different situations as illustrated by the bridge construction example above.