Chapter 13 Machine Learning Workflow
13.1 House votes data
hv <- read.csv("../data/house-votes-84.csv.bz2",
               header=FALSE)
head(hv, 3)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 republican n y n y y y n n n y ? y y y n y
## 2 republican n y n y y y n n n n n y y y n ?
## 3 democrat ? y y ? y y n n n n y n y y n n
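The votes are coded as "y", "n", and "?" (position unknown). As an optional check, not in the original text, one can count the codes in a single vote column:
table(hv$V5)  # counts of "y", "n" and "?" answers in one vote column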
Make the votes into logical variables with TRUE for yes and FALSE for non-yes (both "n" and the unknown "?" become FALSE), and turn the party into a logical indicator for republican:
hv <- hv %>%
   mutate(across(V2:V17, function(x) x == "y")) %>%
   rename(party = V1) %>%
   mutate(party = party == "republican")
head(hv, 3)
## party V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 1 TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
## 2 TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## 3 FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## V14 V15 V16 V17
## 1 TRUE TRUE FALSE TRUE
## 2 TRUE TRUE FALSE FALSE
## 3 TRUE TRUE FALSE FALSE
Fit a logit model:
m <- glm(party ~ ., family="binomial", data=hv)
yhat <- predict(m, type="response") > 0.5
Confusion matrix and accuracy:
table(hv$party, yhat)
## yhat
## FALSE TRUE
## FALSE 261 6
## TRUE 4 164
mean(hv$party == yhat)
## [1] 0.9770115
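The same accuracy can also be read off the confusion matrix; a minimal sketch (the name cm is ours, not from the text):
cm <- table(hv$party, yhat)  # confusion matrix as above
sum(diag(cm)) / sum(cm)      # proportion of correct predictions, same 0.977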
Add a random variable to the data:
r <- sample(c(TRUE, FALSE), nrow(hv), replace=TRUE)
hv1 <- cbind(hv, r)
m <- glm(party ~ ., family="binomial", data=hv1)
yhat <- predict(m, type="response") > 0.5
mean(hv1$party == yhat)
## [1] 0.9770115
Add more random variables:
r10 <- sample(c(TRUE, FALSE), 10*nrow(hv), replace=TRUE) %>%
   matrix(ncol=10)
hv10 <- cbind(hv, r10)
m <- glm(party ~ ., family="binomial", data=hv10)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
yhat <- predict(m, type="response") > 0.5
mean(hv10$party == yhat)
## [1] 0.9632184
Add even more random variables:
r100 <- sample(c(TRUE, FALSE), 100*nrow(hv), replace=TRUE) %>%
   matrix(ncol=100)
hv100 <- cbind(hv, r100)
m <- glm(party ~ ., family="binomial", data=hv100)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
yhat <- predict(m, type="response") > 0.5
mean(hv100$party == yhat)
## [1] 1
We see that the accuracy is now 1: adding random variables to the data apparently improves the results. The glm warnings already hint at what is really going on: with this many columns the model can fit the data perfectly, i.e. it overfits.
13.2 Training-validation split
We should compute model performance on validation data, not training data:
iTrain <- sample(nrow(hv), 0.8*nrow(hv))
hvTrain <- hv[iTrain,]
hvValid <- hv[-iTrain,]
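Note that sample() gives a random split, so the particular rows chosen differ between runs. If a reproducible split is desired, one can fix the random seed first; the seed value below is arbitrary and not from the text:
set.seed(1)                               # arbitrary seed, only for reproducibility
iTrain <- sample(nrow(hv), 0.8*nrow(hv))  # same 80/20 split as above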
As a check, let's look at the dimensions:
dim(hvTrain)
## [1] 348 17
dim(hvValid)
## [1] 87 17
The original data is split between 348 training and 87 validation cases, roughly in 80/20 ratio.
m <- glm(party ~ ., family="binomial", data=hvTrain)
yhat <- predict(m, type="response", newdata=hvValid) > 0.5
mean(hvValid$party == yhat)
## [1] 0.954023
For the data in its original form, the validation performance is similar to what we obtained earlier on the complete data.
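As before, we can also inspect the confusion matrix, now computed on the validation data (output not shown here, as the exact counts depend on the random split):
table(hvValid$party, yhat)  # validation-set confusion matrix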
Now let's see how the performance looks if we add random columns:
hvTrain <- hv100[iTrain,]
hvValid <- hv100[-iTrain,]
m <- glm(party ~ ., family="binomial", data=hvTrain)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
yhat <- predict(m, type="response", newdata=hvValid) > 0.5
mean(hvValid$party == yhat)
## [1] 0.8965517
Now the performance is noticeably lower: only about 90%.
We can see how the model's performance changes as we keep adding random columns: we simply loop over the number of columns, adding that many random columns each time:
rcolumns <- seq(1, 350, by=5)                  # numbers of random columns to try
aTrain <- aValid <- numeric(length(rcolumns))  # storage for accuracies
iTrain <- sample(nrow(hv), 0.8*nrow(hv))       # one fixed 80/20 split
i <- 1
for(rcols in rcolumns) {
   # add 'rcols' random columns to the data
   r <- sample(c(TRUE, FALSE), rcols*nrow(hv), replace=TRUE) %>%
      matrix(ncol=rcols)
   X <- cbind(hv, r)
   XTrain <- X[iTrain,]
   XValid <- X[-iTrain,]
   m <- glm(party ~ ., family="binomial", data=XTrain)
   # training accuracy
   yhat <- predict(m, type="response") > 0.5
   aTrain[i] <- mean(XTrain$party == yhat)
   # validation accuracy
   yhat <- predict(m, type="response", newdata=XValid) > 0.5
   aValid[i] <- mean(XValid$party == yhat)
   i <- i + 1
}
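As a side note, the same experiment can be written without the manual counter i by wrapping the loop body in a function and letting sapply() collect the results. This is only a sketch; the name acc is ours, not from the text:
acc <- sapply(rcolumns, function(rcols) {
   r <- sample(c(TRUE, FALSE), rcols*nrow(hv), replace=TRUE) %>%
      matrix(ncol=rcols)
   X <- cbind(hv, r)
   m <- glm(party ~ ., family="binomial", data=X[iTrain,])
   c(train = mean(X$party[iTrain] ==
                     (predict(m, type="response") > 0.5)),
     valid = mean(X$party[-iTrain] ==
                     (predict(m, type="response", newdata=X[-iTrain,]) > 0.5)))
})
aTrain <- acc["train",]  # each column of acc holds one (train, valid) pair
aValid <- acc["valid",]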
Finally, let’s plot these results:
plot(rcolumns, aValid, type="l",
     xlab="# of random columns", ylab="Accuracy",
     ylim=c(0,1))
lines(rcolumns, aTrain, col="red")
legend("bottomleft",
       legend=c("Training", "Validation"),
       lty=1, col=c("red", "black"))
As more random columns are added, the training accuracy climbs toward 1 while the validation accuracy deteriorates: a clear picture of overfitting.