## Example code for binary logit MLE: Model Fitting
## Voting behavior example
##
## Christopher Adolph   faculty.washington.edu/cadolph
## 23 October 2016
##
## Estimation by ML using optim() or by glm() on reduced dataset:
##   Model 1: Age, Age^2, HS, College
##   Model 2: Age, Age^2, HS, College, Married
##
## Goodness of fit tests shown here:
##
##   Likelihood ratio test
##   Akaike Information Criterion
##   Bayesian Information Criterion
##   Deviance
##   Percent Correctly Predicted
##   Separation plots
##   Actual vs Predicted Plots
##   Error vs Predicted Plots
##   ROC plots
##   Residual vs Leverage Plots
##   Cross-validation

# Clear memory
rm(list=ls())

# Load libraries
library(MASS)
library(nlme)
library(boot)            # For cv.glm()
library(separationplot)  # For separation plot
library(pscl)            # Alternative PCP code
library(verification)    # For ROC area
library(tile)            # For some graphics; used by plot.binPredict()
library(RColorBrewer)    # For nice colors
library(simcf)           # For extractdata()
source("binaryGOF.R")    # Percent correctly predicted and concordance indexes
source("binPredict.R")   # Code for making predicted vs actual plots

# Get nice colors
col <- brewer.pal(5, "Set1")
blue <- col[2]
orange <- col[5]

# Models in R formula format
m1 <- vote00 ~ age + I(age^2) + hsdeg + coldeg
m2 <- vote00 ~ age + I(age^2) + hsdeg + coldeg + marriedo
# Note: the variable marriedo is current marrieds,
#       the variable married is ever-marrieds

# Load data
file <- "nes00a.csv"
fulldata <- read.csv(file, header=TRUE)

# Keep only cases observed for all models
data <- extractdata(m2, fulldata, na.rm = TRUE)
attach(data)

# Construct variables and model objects
y <- vote00
x1 <- cbind(age, age^2, hsdeg, coldeg)
x2 <- cbind(age, age^2, hsdeg, coldeg, marriedo)

# Likelihood function for logit
llk.logit <- function(param, y, x) {
  os <- rep(1, length(x[,1]))
  x <- cbind(os, x)
  b <- param[1:ncol(x)]
  xb <- x %*% b
  sum(y*log(1 + exp(-xb)) + (1 - y)*log(1 + exp(xb)))
  # optim is a minimizer, so min -ln L(param|y)
}

# Fit logit model using optim
ls.result <- lm(y ~ x1)              # use ls estimates as starting values
stval <- ls.result$coefficients      # initial guesses
logit.m1 <- optim(stval, llk.logit, method="BFGS", hessian=TRUE,
                  y=y, x=x1)         # call minimizer procedure
pe.m1 <- logit.m1$par                # point estimates
vc.m1 <- solve(logit.m1$hessian)     # var-cov matrix
se.m1 <- sqrt(diag(vc.m1))           # standard errors
ll.m1 <- -logit.m1$value             # likelihood at maximum

# Alternative estimation technique: GLM
glm.m1 <- glm(m1, data=data, family="binomial")
print(summary.glm(glm.m1))

# Fit logit model with added covariate: married
ls.result <- lm(y ~ x2)              # use ls estimates as starting values
stval <- ls.result$coefficients      # initial guesses
logit.m2 <- optim(stval, llk.logit, method="BFGS", hessian=TRUE,
                  y=y, x=x2)         # call minimizer procedure
pe.m2 <- logit.m2$par                # point estimates
vc.m2 <- solve(logit.m2$hessian)     # var-cov matrix
se.m2 <- sqrt(diag(vc.m2))           # standard errors
ll.m2 <- -logit.m2$value             # likelihood at maximum

# GLM estimation of model with married
glm.m2 <- glm(m2, data=data, family="binomial")
print(summary.glm(glm.m2))

## Goodness of fit of model 1 and model 2

# Check number of parameters in each model
k.m1 <- length(pe.m1)
k.m2 <- length(pe.m2)

# Likelihood ratio (LR) test
lr.test <- 2*(ll.m2 - ll.m1)
lr.test.p <- pchisq(lr.test, df=(k.m2 - k.m1), lower.tail=FALSE)

# Bayesian Information Criterion (BIC)
bic.m1 <- log(nrow(x1))*k.m1 - 2*ll.m1
bic.m2 <- log(nrow(x2))*k.m2 - 2*ll.m2
bic.test <- bic.m2 - bic.m1

# Akaike Information Criterion (AIC)
aic.m1 <- 2*k.m1 - 2*ll.m1
aic.m2 <- 2*k.m2 - 2*ll.m2
aic.test <- aic.m2 - aic.m1
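## Optional sanity check (a sketch, not part of the original analysis):
## the optim() fit and the glm() fit of model 1 maximize the same likelihood,
## so their coefficients and log-likelihoods should agree up to optimization
## tolerance, and the hand-computed AIC/BIC should match R's extractors
print(cbind(optim=pe.m1, glm=coef(glm.m1)))             # point estimates
print(c(optim=ll.m1, glm=as.numeric(logLik(glm.m1))))   # log-likelihoods
print(c(byhand=aic.m1, builtin=AIC(glm.m1)))            # AIC
print(c(byhand=bic.m1, builtin=BIC(glm.m1)))            # BIC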
# Deviance
# (the "-0" terms refer to the log-likelihood of the saturated model,
#  which is zero for categorical outcomes)
deviance.m1 <- -2*(ll.m1 - 0)
deviance.m2 <- -2*(ll.m2 - 0)

# Percent correctly predicted (using glm result and my source code)
pcp.null <- pcp.glm(glm.m1, vote00, type="null")
pcp.m1 <- pcp.glm(glm.m1, vote00, type="model")
pcp.m2 <- pcp.glm(glm.m2, vote00, type="model")
pcpi.m1 <- pcp.glm(glm.m1, vote00, type="improve")
pcpi.m2 <- pcp.glm(glm.m2, vote00, type="improve")

## Another way to compute PCP with the pscl package
#library(pscl)
#hitmiss(glm.m1)
#hitmiss(glm.m1, k=.3)   # change the threshold

## Still another way with the DAMisc package
#pre(glm.m1)

# Separation plots
separationplot(pred=glm.m1$fitted.values, actual=glm.m1$y, file="sepplot_m1.pdf")
separationplot(pred=glm.m2$fitted.values, actual=glm.m2$y, file="sepplot_m2.pdf")

# binPredict for Actual vs Predicted plots, Error vs Predicted plots, and ROC plots
# (from binPredict.R source code)
#
# We use a helper function binPredict() to compute bins and ROC curves for us.
# Then we can plot one or more models using the plot function.
#
# Other options for binPredict():
#   bins = scalar, number of bins (default is 20)
#   quantiles = logical, force bins to same # of observations (default is FALSE)
#   sims = scalar, if sims=0 use point estimates to compute predictions;
#          if sims>0 use (this many) simulations from the predictive distribution
#          to compute predictions (accounts for model uncertainty);
#          default is 100 simulations; note: ROC curves always use point estimates only
binnedM1 <- binPredict(glm.m1, col=blue, label="M1: Age, Edu", quantiles=TRUE)
binnedM2 <- binPredict(glm.m2, col=orange, label="M2: Age, Edu, Married", quantiles=TRUE)

## To make bins of equal probability width instead of equal # obs:
#binnedM1b <- binPredict(glm.m1, col=blue, label="M1: Age, Edu", quantiles=FALSE)
#binnedM2b <- binPredict(glm.m2, col=orange, label="M2: Age, Edu, Married", quantiles=FALSE)

## Some options for plot.binPredict (more in source code)
##   together = logical, plot models overlapping on same plot (default is TRUE)
##   display = character, avp: plot actual vs predicted probs
##                        evp: plot error vs predicted probs
##                        roc: plot receiver operating characteristic curves
##             default is c("avp", "evp", "roc") for all three
##   thresholds = numeric, show these thresholds on ROC plot (default is NULL)
##   hide = logical, do not show number of observations in each bin (default is TRUE)
##   ignore = scalar, do not show bins with fewer observations than this (default = 5)
##   totalarea = scalar, total area of all circles for a model relative to plot (default = 0.1)
##   cex = scalar, size of numeric labels
##   showbins = logical, show bin boundaries
##   file = character, save result to a pdf with this file name

# Show actual vs predicted of M1 on screen
plot(binnedM1, display="avp", hide=TRUE, labx=0.35)

# Show actual vs predicted of M1 and M2 to file
plot(binnedM1, binnedM2, display="avp", hide=TRUE, labx=0.35, file="avpM1M2")

# Send error vs predicted of M1 and M2 to file
plot(binnedM1, binnedM2, display="evp", hide=TRUE, labx=0.35, file="evpM1M2")

# Send ROC plots for M1 and M2 to file
plot(binnedM1, binnedM2, display="roc", thresholds=c(0.9, 0.8, 0.7, 0.6),
     labx=0.35, file="rocM1M2")

# Send actual vs predicted, error rate vs predicted, and ROC to file
plot(binnedM1, binnedM2, thresholds=c(0.9, 0.8, 0.7, 0.6), hide=TRUE,
     labx=0.35, file="avpErrorRocM1M2")

# Also see the ROCR package for ROC curves and many other prediction metrics,
# and the verification package for a rudimentary ROC plot function, roc.plot()
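## Optional cross-check (a sketch, not part of the original analysis):
## roc.area() from the verification package loaded above computes the area
## under the ROC curve (AUC) from observed outcomes and fitted probabilities;
## for a binary outcome the AUC equals the concordance index computed below
auc.m1 <- roc.area(glm.m1$y, glm.m1$fitted.values)$A
auc.m2 <- roc.area(glm.m2$y, glm.m2$fitted.values)$A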
# Concordance Indexes / AUC (using glm result and my source code)
concord.null <- concord.glm(glm.m1, vote00, type="null")
concord.m1 <- concord.glm(glm.m1, vote00, type="model")
concord.m2 <- concord.glm(glm.m2, vote00, type="model")
concordi.m1 <- concord.glm(glm.m1, vote00, type="improve")
concordi.m2 <- concord.glm(glm.m2, vote00, type="improve")

### Residuals using glm version
hatscore.m1 <- hatvalues(glm.m1)/mean(hatvalues(glm.m1))
rstu.m1 <- rstudent(glm.m1)
hatscore.m2 <- hatvalues(glm.m2)/mean(hatvalues(glm.m2))
rstu.m2 <- rstudent(glm.m2)

usr <- c(0, 10, -3, 3)

pdf("nes_resid1.pdf", pointsize=14, width=5.5, height=5)
plot.new()
par(usr=usr, tcl=-0.1, mgp=c(2,0.35,0))
axis(2, las=1)
par(usr=usr, tcl=-0.1, mgp=c(2,0.15,0))
axis(1, at=c(0,1,2,3,4,5,6,7,8,9,10))
title(xlab="Standardized hat-values", ylab="Studentized residuals")
points(hatscore.m1, rstu.m1, col=blue)
lines(c(usr[1], usr[2]), c(-2,-2), lty="dashed")
lines(c(usr[1], usr[2]), c(2,2), lty="dashed")
lines(c(3,3), c(usr[3], usr[4]), lty="dashed")
dev.off()

pdf("nes_resid2.pdf", pointsize=14, width=5.5, height=5)
plot.new()
par(usr=usr, tcl=-0.1, mgp=c(2,0.35,0))
axis(2, las=1)
par(usr=usr, tcl=-0.1, mgp=c(2,0.15,0))
axis(1, at=c(0,1,2,3,4,5,6,7,8,9,10))
title(xlab="Standardized hat-values", ylab="Studentized residuals")
points(hatscore.m2, rstu.m2, col=orange)
lines(c(usr[1], usr[2]), c(-2,-2), lty="dashed")
lines(c(usr[1], usr[2]), c(2,2), lty="dashed")
lines(c(3,3), c(usr[3], usr[4]), lty="dashed")
dev.off()

### Cross-validation (takes a few minutes to run)

## A percent-correctly-predicted-style cost function
## r is actual y, pi is expected y
## Rate of inaccuracy: mean(vote00!=round(yp))
costpcp <- function(r, pi=0) mean(r != round(pi))

## An alternative cost function for binary data
#cost <- function(r, pi=0) mean(abs(r-pi)>0.5)

cv.m1 <- cv.glm(data, glm.m1, costpcp)
cvPCP.m1 <- 1 - cv.m1$delta[2]

cv.m2 <- cv.glm(data, glm.m2, costpcp)
cvPCP.m2 <- 1 - cv.m2$delta[2]

#### More cross-validation

## A simple leave-one-out cross-validation function for logit glm; returns predicted probs
loocv <- function(obj) {
  data <- obj$data
  m <- dim(data)[1]
  form <- formula(obj)
  fam <- obj$family$family
  loo <- rep(NA, m)
  for (i in 1:m) {
    i.glm <- glm(form, data = data[-i, ], family = fam)
    loo[i] <- predict(i.glm, newdata = data[i, ], type = "response")
  }
  loo
}

# LOOCV for models 1 and 2
predCVm1 <- loocv(glm.m1)
predCVm2 <- loocv(glm.m2)

# Make cross-validated AVP and ROC plots; note use of newpred input in binPredict
binnedM1cv <- binPredict(glm.m1, newpred=predCVm1, col=blue,
                         label="M1: LOO-CV", quantiles=TRUE)
plot(binnedM1cv, display=c("avp","roc"), hide=TRUE,
     thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.25, file="avpRocM1cv")

binnedM2cv <- binPredict(glm.m2, newpred=predCVm2, col=orange,
                         label="M2: LOO-CV", quantiles=TRUE)
plot(binnedM2cv, display=c("avp","roc"), hide=TRUE,
     thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.25, file="avpRocM2cv")

plot(binnedM1cv, binnedM2cv, display=c("avp","roc"), hide=TRUE,
     thresholds=c(0.9, 0.8, 0.7, 0.6), labx=0.25, file="avpRocM1M2cv")
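## Optional extension (a sketch, not part of the original analysis):
## if LOOCV is too slow, cv.glm() also supports K-fold cross-validation via
## its K argument; K = 10 is an arbitrary but common choice, and the folds
## are assigned at random, so set a seed for reproducibility
set.seed(1)                                   # arbitrary seed
cv10.m1 <- cv.glm(data, glm.m1, costpcp, K=10)
cv10.m2 <- cv.glm(data, glm.m2, costpcp, K=10)
cvPCP10.m1 <- 1 - cv10.m1$delta[2]            # adjusted 10-fold estimate of PCP
cvPCP10.m2 <- 1 - cv10.m2$delta[2]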