# POLS/CSSS 503
# Lab Session 7: May 11, 2012
# Daniel Berlner 

#Agenda:
#1. Logit Models
#2. Plotting Predicted Probabilities
#3. Assessing Model Fit with ROC Plots

library(MASS)
library(ROCR)

options(scipen=10)
setwd("C:/Documents and Settings/Dan B/Desktop/POLS 503 TA 2012/Lab 7")
#### Use your own file path instead of the above code ####


### 1. Logit Models

### Examples of S-curves
x1<-seq(from=-5, to=5, by=0.1)
y1<- 1/(1 + exp(-x1))
plot(x1,y1)


par(mfrow=c(1,2))
plot(x1, dnorm(x1), ylim=c(0,1))
plot(x1, pnorm(x1), ylim=c(0,1))

dev.off()
###

data<-read.csv("nes00a.csv", header=T)
summary(data)

# Run a logit model:
model1<-glm(vote00 ~ age + hsdeg + coldeg, family=binomial(link="logit"), data=data)
summary(model1)


# What do the coefficients mean? Use predict() to get predicted probabilities.
# Technically we are now using a different function, predict.glm() instead of predict.lm(). 
# R simply knows which one to use based on what type of model we apply it to.
predict(model1,newdata=
	list(age=mean(data$age),
		hsdeg=0,
		coldeg=0),
type="response")
# This is the predicted probability that someone of mean age (47.2) with no high school or college degree voted in the 2000 election. 

# Now get three predictions: No degree, high school only, and both high school and college.
predict(model1,newdata=
	list(age=rep(mean(data$age),3),
		hsdeg=c(0,1,1),
		coldeg=c(0,0,1)),
type="response")

# But what about confidence intervals? Predict.glm() cannot do that. This motivates simulation approaches. 



### 2. Plotting Predicted Probabilities

pe<-model1$coefficients
vc<-vcov(model1)
sims <- 10000
simbetas <- mvrnorm(sims,pe,vc)

valueseq<-seq(18,98,5)

### Varying age for No Degree
EVs.nodeg<-rep(NA,length(valueseq))
lowerCIs.nodeg<-rep(NA,length(valueseq))
upperCIs.nodeg<-rep(NA,length(valueseq))

for (i in 1:length(valueseq)){
    xhyp<-c(1, valueseq[i], 0, 0)
    linearpredictor.logit<- simbetas %*% xhyp
    phat.logit<-1/(1+exp(-linearpredictor.logit)) 
    EVs.nodeg[i]<-sort(phat.logit)[5000]
    lowerCIs.nodeg[i]<-sort(phat.logit)[250]
    upperCIs.nodeg[i]<-sort(phat.logit)[9750]
    } # close i loop

	
### Varying age for HS Degree
EVs.hsdeg<-rep(NA,length(valueseq))
lowerCIs.hsdeg<-rep(NA,length(valueseq))
upperCIs.hsdeg<-rep(NA,length(valueseq))

for (i in 1:length(valueseq)){
    xhyp<-c(1, valueseq[i], 1, 0)
    linearpredictor.logit<- simbetas %*% xhyp
    phat.logit<-1/(1+exp(-linearpredictor.logit)) #could also just use plogis(linearpredictor.logit)
    EVs.hsdeg[i]<-sort(phat.logit)[5000]
    lowerCIs.hsdeg[i]<-sort(phat.logit)[250]
    upperCIs.hsdeg[i]<-sort(phat.logit)[9750]
    } # close i loop
	
### Varying age for college Degree
EVs.coldeg<-rep(NA,length(valueseq))
lowerCIs.coldeg<-rep(NA,length(valueseq))
upperCIs.coldeg<-rep(NA,length(valueseq))

for (i in 1:length(valueseq)){
    xhyp<-c(1, valueseq[i], 1, 1)
    linearpredictor.logit<- simbetas %*% xhyp
    phat.logit<-1/(1+exp(-linearpredictor.logit))
    EVs.coldeg[i]<-sort(phat.logit)[5000]
    lowerCIs.coldeg[i]<-sort(phat.logit)[250]
    upperCIs.coldeg[i]<-sort(phat.logit)[9750]
    } # close i loop
	

	
# Create plot
plot(x=valueseq, y=EVs.nodeg, 
	type="n", ylim=c(0,1), xlab="Age", ylab="Predicted Probability of Voting", main="Age, Education, and Voting in the 2000 Election")

lines(valueseq,EVs.nodeg,lwd=3, col="blue")
lines(valueseq,lowerCIs.nodeg, lwd=1, lty=2, col="blue")
lines(valueseq,upperCIs.nodeg, lwd=1, lty=2, col="blue")

lines(valueseq,EVs.hsdeg,lwd=3, col="purple")
lines(valueseq,lowerCIs.hsdeg, lwd=1, lty=2, col="purple")
lines(valueseq,upperCIs.hsdeg, lwd=1, lty=2, col="purple")

lines(valueseq,EVs.coldeg,lwd=3, col="red")
lines(valueseq,lowerCIs.coldeg, lwd=1, lty=2, col="red")
lines(valueseq,upperCIs.coldeg, lwd=1, lty=2, col="red")

text(x=c(82, 45, 43), y=c(.4, .5, .9), labels=c("Less than High School", "High School", "College"), 
col=c("blue", "purple", "red"), cex=.8)


### 3. ROC Plots

# One way to assess model performance is to pick a threshold with which to assign predicted probabilities to either 0 or 1, and then compare then with the actual values of either 0 or 1. 

threshold<-0.5
table(model1$fitted.values>threshold, model1$y)
# For any particular threshold:
# Upper left corner is true negatives; lower right corner is true positives.
# Lower left corner is FALSE positives; upper right corner is FALSE negatives.
# Higher thresholds minimize false positives but lead to more false negatives; vice versa for lower thresholds.


# Here's a simple function (created by Brian Greenhill) that gives you the same information in a more user-friendly way.  It also lets you see how the FPR and TPR are calculated:
contingencytable<-function(modelobject,threshold) {

    phat<-modelobject$fitted.values
    y<-modelobject$y
    
    tp<-(phat>=threshold & y==1)
    fp<-(phat>=threshold & y==0)
    tn<-(phat<threshold & y==0)
    fn<-(phat<threshold & y==1)
    
    results<-matrix(c(sum(tn),sum(fp),sum(fn),sum(tp)),2,2)
    rownames(results)<-c("Predicted Non-Events","Predicted Events")
    colnames(results)<-c("Actual Non-Events","Actual Events")
    
    fpr<-sum(fp)/(sum(tn)+sum(fp))
    tpr<-sum(tp)/(sum(tp)+sum(fn))
    
    print(results)
    cat(paste("\nFalse positive rate is", round(fpr,2), "\nTrue Positive Rate is", round(tpr,2)))
    
    invisible(list(fpr=fpr, tpr=tpr))
    } # close function.

# example:
contingencytable(model1, threshold=0.9)

# You can imagine running this over a range of different thresholds.  The next tool, the ROC plot (Receiver Operating Characteristic... don't remember that), does exactly this. Each pair of FPR and TPR values represents one point on the ROC curve.


pred.object<-prediction(model1$fitted.values,model1$y)
perf.object<-performance(pred.object,"tpr","fpr")
plot(perf.object)


# Let's run two alternative model specifications, and use the ROC plot to compare them:

model2<-glm(vote00 ~ age + I(age^2) + hsdeg + coldeg, family=binomial(link="logit"), data=data)
summary(model2)

model3<-glm(vote00 ~ age + I(age^2) + hsdeg + coldeg + married, family=binomial(link="logit"), data=data)
summary(model3)


pred.object2<-prediction(model2$fitted.values,model2$y)
perf.object2<-performance(pred.object2,"tpr","fpr")

pred.object3<-prediction(model3$fitted.values,model3$y)
perf.object3<-performance(pred.object3,"tpr","fpr")

plot(perf.object, col="black")
plot(perf.object2, col="blue", add=T)
plot(perf.object3, col="red", add=T)
abline(a=0, b=1, lty=3)
legend(0.6,0.2,c("Age Linear","Age Squared", "Married Control"), col=c("black","blue","red"), lwd=c(1,1,1))


# In this case, the lines are all close together. But what we want to know is which has the highest Area Under the Curve:
performance(pred.object,"auc")@y.values
performance(pred.object2,"auc")@y.values
performance(pred.object3,"auc")@y.values

auc.res<-round(c(
as.numeric(performance(pred.object,"auc")@y.values),
as.numeric(performance(pred.object2,"auc")@y.values),
as.numeric(performance(pred.object3,"auc")@y.values)),4)
label.res<-paste( c("Age Linear","Age Squared", "Married Control"), auc.res, sep=": AUC = ")


plot(perf.object, col="black")
plot(perf.object2, col="blue", add=T)
plot(perf.object3, col="red", add=T)
abline(a=0, b=1, lty=3)
legend(0.4,0.2,label.res, col=c("black","blue","red"), lwd=c(1,1,1))

