Chapter 13 Machine Learning Workflow

13.1 House votes data

# Load the 1984 House votes data (compressed CSV, no header row):
# V1 is the party label, V2..V17 are the recorded votes (y/n/?).
hv <- read.csv("../data/house-votes-84.csv.bz2", header = FALSE)
# Peek at the first few records
head(hv, 3)
##           V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 republican  n  y  n  y  y  y  n  n   n   y   ?   y   y   y   n   y
## 2 republican  n  y  n  y  y  y  n  n   n   n   n   y   y   y   n   ?
## 3   democrat  ?  y  y  ?  y  y  n  n   n   n   y   n   y   y   n   n

Convert the columns into logical variables, with TRUE for yes and FALSE for non-yes:

# Recode everything as logical: party is TRUE for republican,
# each vote column is TRUE for a "y" vote (n and ? both become FALSE).
hv <- hv %>%
   rename(party = V1) %>%
   mutate(party = party == "republican") %>%
   mutate(across(V2:V17, function(v) v == "y"))
head(hv, 3)
##   party    V2   V3    V4    V5   V6   V7    V8    V9   V10   V11   V12   V13
## 1  TRUE FALSE TRUE FALSE  TRUE TRUE TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE
## 2  TRUE FALSE TRUE FALSE  TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
## 3 FALSE FALSE TRUE  TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
##    V14  V15   V16   V17
## 1 TRUE TRUE FALSE  TRUE
## 2 TRUE TRUE FALSE FALSE
## 3 TRUE TRUE FALSE FALSE

Fit a logit model:

# Logistic regression: predict party from all 16 vote columns
m <- glm(party ~ ., data = hv, family = "binomial")
# Classify as republican where the predicted probability exceeds 0.5
yhat <- predict(m, type = "response") > 0.5

Confusion matrix and accuracy:

# Confusion matrix: rows are the actual party, columns the prediction
table(hv$party, yhat)
##        yhat
##         FALSE TRUE
##   FALSE   261    6
##   TRUE      4  164
# In-sample accuracy: share of correctly classified cases
mean(hv$party == yhat)
## [1] 0.9770115

Add random variable to data:

# One column of pure noise, same length as the data
r <- sample(c(TRUE, FALSE), size = nrow(hv), replace = TRUE)
hv1 <- cbind(hv, r)
# Refit with the noise column included and re-check in-sample accuracy
m <- glm(party ~ ., data = hv1, family = "binomial")
yhat <- predict(m, type = "response") > 0.5
mean(hv1$party == yhat)
## [1] 0.9770115

Add more random variables:

# Ten noise columns at once: draw all values, then shape into a matrix
r10 <- matrix(sample(c(TRUE, FALSE), 10 * nrow(hv), replace = TRUE),
              ncol = 10)
hv10 <- cbind(hv, r10)
m <- glm(party ~ ., data = hv10, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
yhat <- predict(m, type = "response") > 0.5
mean(hv10$party == yhat)
## [1] 0.9632184

Add even more random variables:

# A hundred noise columns: enough for the model to memorize the data
r100 <- matrix(sample(c(TRUE, FALSE), 100 * nrow(hv), replace = TRUE),
               ncol = 100)
hv100 <- cbind(hv, r100)
m <- glm(party ~ ., data = hv100, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
yhat <- predict(m, type = "response") > 0.5
mean(hv100$party == yhat)
## [1] 1

We see that the accuracy is now 1 — adding random variables to the data apparently improves the results. (We will see below that this is an artifact of evaluating on the training data.)

13.2 Training-validation split

We should compute model performance on validation data, not training data:

# Pick 80% of the row indices at random for training
iTrain <- sample(nrow(hv), 0.8*nrow(hv))
hvTrain <- hv[iTrain, ]   # training set
hvValid <- hv[-iTrain, ]  # remaining 20%: validation set

For a check, let’s see the dimensions:

# Training set: 80% of the 435 rows
dim(hvTrain)
## [1] 348  17
# Validation set: the remaining 20%
dim(hvValid)
## [1] 87 17

The original data is split between 348 training and 87 validation cases, roughly in 80/20 ratio.

# Fit on the training data only, evaluate on the held-out validation data
m <- glm(party ~ ., data = hvTrain, family = "binomial")
yhat <- predict(m, newdata = hvValid, type = "response") > 0.5
mean(hvValid$party == yhat)
## [1] 0.954023

The performance on the validation data, for the model in its original form, is similar to the performance on the complete data.

Now let's see how the performance looks if we add random columns:

# Repeat the split-and-fit, now with the 100 noise columns included
hvTrain <- hv100[iTrain, ]
hvValid <- hv100[-iTrain, ]
m <- glm(party ~ ., data = hvTrain, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
yhat <- predict(m, newdata = hvValid, type = "response") > 0.5
mean(hvValid$party == yhat)
## [1] 0.8965517

Now the performance is noticeably lower — only about 90%.

We can see how the model's performance changes as we keep adding random columns: loop over the number of columns to add, fitting and evaluating a model at each step:

# Sweep the number of random columns from 1 to 346 (step 5) and record
# training and validation accuracy for each model.
rcolumns <- seq(1, 350, by=5)
# Preallocate result vectors, one slot per column count
aTrain <- aValid <- numeric(length(rcolumns))
iTrain <- sample(nrow(hv), 0.8*nrow(hv))
# seq_along() replaces the original manual counter (i <- 1; i <- i + 1),
# which was easy to get out of sync with the loop variable.
for(i in seq_along(rcolumns)) {
   rcols <- rcolumns[i]
   # rcols fresh noise columns attached to the original data
   r <- matrix(sample(c(TRUE, FALSE), rcols*nrow(hv), replace=TRUE),
               ncol=rcols)
   X <- cbind(hv, r)
   XTrain <- X[iTrain,]
   XValid <- X[-iTrain,]
   m <- glm(party ~ ., family="binomial", data=XTrain)
   # Accuracy on training data (fitted values on XTrain)
   yhat <- predict(m, type="response") > 0.5
   aTrain[i] <- mean(XTrain$party == yhat)
   # Accuracy on held-out validation data
   yhat <- predict(m, type="response", newdata=XValid) > 0.5
   aValid[i] <- mean(XValid$party == yhat)
}

Finally, let’s plot these results:

# Validation accuracy is drawn first (black), training accuracy added in red.
plot(rcolumns, aValid, type="l",
     xlab="# of random columns", ylab="Accuracy",
     ylim=c(0,1))
lines(rcolumns, aTrain, col="red")
# Fix: the original legend mapped Training -> black and Validation -> red,
# the opposite of the curves actually drawn. Colors now match the plot.
# The legend is also drawn last so the curves cannot overplot it.
legend("bottomleft",
       legend=c("Training", "Validation"),
       lty=1, col=c("red", "black"))

plot of chunk unnamed-chunk-12