# POLS/CSSS 503
# Lab Session 8: May 18, 2012
# Daniel Berlner 

#Agenda:
#1. simcf for linear models
#2. simcf for logit models
#3. Practice


library(MASS)
library(tile)
library(simcf)
#need to download tile and simcf from course site, install from zip in R
#also install packages RColorBrewer and WhatIf

#What is ColorBrewer?
library(RColorBrewer)
colors1<-brewer.pal(4, "Set2")



options(scipen=10)
setwd("C:/Documents and Settings/Dan B/Desktop/POLS 503 TA 2012/Lab 8")
#### Use your own file path instead of the above code ####

#1. simcf for linear models ("Simulate Counter-Factuals")

#Useful commands in simcf
help(package="simcf")
?cfMake #make counterfactual scenario... our equivalent of the "newdata" option in predict()
?linearsimev #simulate expected values for a linear model
?linearsimfd #simulate first differences for a linear model
?extractdata #extract data used in a formula from a larger dataframe
?influencePlot #influence plot for outliers
?lagpanel #lag panel data by one or multiple years
?makeFEdummies #make dummy variables out of a factor for fixed effects models

?logBound
?logitBound



data<-read.csv("rossoildata.csv", na.strings="")
data<-data[data$year==1990,c("cty_name","year","regime1","oil","GDPcap","oecd")]
data<-na.omit(data)

model1<-lm(regime1 ~ GDPcap + oil + oecd, data=data)
summary(model1)


#extract the coefficients and vcov matrix, draw 1000 of each parameter

pe <- model1$coefficients
vc <- vcov(model1)  #note if you wanted to use heteroskedastic standard errors use: vc <- hccm(model1)
sims <- 10000
simbetas <- mvrnorm(sims,pe,vc)

#make formula object:
model.form1 <- regime1 ~ GDPcap + oil + oecd

#EXPECTED VALUES:

#steps:
    #1. create object of hypothetical x's with # number of scenarios - default is mean values for all
    #2. change value for variable of interest in each scenario
    #3. simulate expected values of y for each scenario of hypothetical x's
    #4. plot using tile

#GDP per capita first:
valueseq.gdp<-seq(0,25000,2500)
#choice of value range is informed by examining the range of the variable with summary()

xhyp1 <- cfMake(model.form1, data, nscen=length(valueseq.gdp), f=mean)

xhyp1
#note that we use xhyp1$x only, if we're doing expected values (expected y for each x). 
#xhyp1$pre is ignored unless we're doing first differences (expected change in y resulting from change in x).

#now change hypothetical value for GDP per capita in each scen, but for a non-oecd country:
for(i in 1:length(valueseq.gdp)){
    xhyp1 <- cfChange(xhyp1, "GDPcap", valueseq.gdp[i], scen=i)
    xhyp1 <- cfChange(xhyp1, "oecd", 0, scen=i)  
    }
#look at what changed:
xhyp1

#now make expected y's (this is what took a lengthy loop when we did it "by hand"):
yhyp1 <- linearsimev(xhyp1, simbetas)

yhyp1

#now at this point, we could plot the way we have before:
plot(valueseq.gdp, yhyp1$pe, ylim=c(0,15), type="l", lwd=2, xlab="GDP per Capita", ylab="Expected Value of Democracy")
lines(valueseq.gdp, yhyp1$lower, lty="dashed")
lines(valueseq.gdp, yhyp1$upper, lty="dashed")


#or better yet, use the tile package: first make a series of traces, and then plot them

trace1 <- lineplot(x = valueseq.gdp,
	y = yhyp1$pe,
	lower = yhyp1$lower,
	upper = yhyp1$upper,
	ci = list(mark="shaded"),
	col = "blue",
	plot = 1
)

tc <- tile(trace1,
	RxC = c(1,1), #increase these if you're using multiple plots at once
	limits=c(0,25000,0,15),   #xlim first, then ylim
	xaxistitle=list(labels="GDP per Capita"), 
	yaxistitle=list(labels="Expected Value of Democracy"), 
	plottitle = list(labels="Effect of Democracy from Model 1"),
	gridlines = list(type="xy"),
	frame=TRUE,
	output=list(width=6, outfile="model1_results1", type="pdf")
)


#Let's add a plot for another covariate, and omit counterfactuals outside the convex hull of the data

valueseq.oil<-seq(0,75,5)
xhyp2 <- cfMake(model.form1, data, nscen=length(valueseq.oil), f=mean)
xhyp2

for(i in 1:length(valueseq.oil)){
    xhyp2 <- cfChange(xhyp2, "oil", valueseq.oil[i], scen=i)
    xhyp2 <- cfChange(xhyp2, "oecd", 0, scen=i)  
    }

xhyp2

yhyp2 <- linearsimev(xhyp2, simbetas)

yhyp2


trace2 <- lineplot(x = valueseq.oil,
	y = yhyp2$pe,
	lower = yhyp2$lower,
	upper = yhyp2$upper,
	ci = list(mark="shaded"),
	extrapolate=list(data=extractdata(model.form1, data), cfact=xhyp2$x, omit.extrapolated=FALSE),
	col = "red",
	plot = 2
)


trace1 <- lineplot(x = valueseq.gdp,
	y = yhyp1$pe,
	lower = yhyp1$lower,
	upper = yhyp1$upper,
	ci = list(mark="shaded"),
	extrapolate=list(data=extractdata(model.form1, data), cfact=xhyp1$x, omit.extrapolated=FALSE),
	col = "blue",
	plot = 1
)


tc <- tile(trace1,trace2,
    RxC = c(1,2), #increase these if you're using multiple plots at once
    limits=matrix(c(0,25000,0,10,0,75,0,10),nrow=2,ncol=4,byrow=T),   #xlim first, then ylim
    xaxistitle=list(labels=c("GDP per Capita","Oil Production")), 
    yaxistitle=list(labels="Expected Value of Democracy"), 
	maintitle = list(labels="Results from Model 1"),
    gridlines = list(type="xy"),
    frame=TRUE,
    output=list(width=10, outfile="model1_results2", type="pdf")
)

#### See documentation for more detail: ?tile



# What about first differences?
xhyp3 <- cfMake(model.form1, data, f=mean)
xhyp3 <- cfChange(xhyp3, "oecd", xpre=0, x=1)
xhyp3

yfd.oecd<-linearsimfd(xhyp3, simbetas) #note linearsimFD instead of EV
yfd.oecd

xhyp4 <- cfMake(model.form1, data, f=mean)
xhyp4 <- cfChange(xhyp4, "oil", xpre=mean(data$oil) - sd(data$oil), x=mean(data$oil) + sd(data$oil))
xhyp4

yfd.oil<-linearsimfd(xhyp4, simbetas) #note linearsimFD instead of EV
yfd.oil

##### Log example: now we need to manually create new variables which are log transformations, rather than relying on predict

data$logGDPcap<-log(data$GDPcap)

formula.log<-regime1 ~ logGDPcap + oil + oecd

model.log<-lm(formula.log, data=data)
summary(model.log)

pe.log <- model.log$coefficients
vc.log <- vcov(model.log)
sims <- 10000
simbetas.log <- mvrnorm(sims,pe.log,vc.log)

valueseq.gdp.log<-log(seq(100,25000,100)) #this is a very long sequence, since I wanted more detail on the curved line

xhyp.log <- cfMake(formula.log, data, nscen=length(valueseq.gdp.log), f=mean)

for(i in 1:length(valueseq.gdp.log)){
    xhyp.log <- cfChange(xhyp.log, "logGDPcap", valueseq.gdp.log[i], scen=i)
    xhyp.log <- cfChange(xhyp.log, "oecd", 0, scen=i)  
    }

xhyp.log

yhyp.log <- linearsimev(xhyp.log, simbetas.log)

yhyp.log

#Now we can either plot with a log scale on the x-axis...
plot(valueseq.gdp.log, yhyp.log$pe, ylim=c(-5,15), type="l", lwd=2, xlab="Log GDP per Capita", ylab="Expected Value of Democracy")
lines(valueseq.gdp.log, yhyp.log$lower, lty="dashed")
lines(valueseq.gdp.log, yhyp.log$upper, lty="dashed")

#... or exp() the x-axis to plot on the original scale
plot(exp(valueseq.gdp.log), yhyp.log$pe, ylim=c(-5,15), type="l", xlab="GDP per Capita", ylab="Expected Value of Democracy")
lines(exp(valueseq.gdp.log), yhyp.log$lower, lty="dashed")
lines(exp(valueseq.gdp.log), yhyp.log$upper, lty="dashed")



### 2. Visualizing Logit Results

# Using simcf and tile to explore an estimated logistic regression
# Voting example using 2000 NES data after King, Tomz, and Wittenberg
# Chris Adolph

# Load data
nesdata <- read.csv("nes00a.csv", header=TRUE)

# Set up model formula and model specific data frame
model <- vote00 ~ age + I(age^2) + hsdeg + coldeg
mdata <- extractdata(model, nesdata, na.rm=TRUE)

# Run logit & extract results
logit.result <- glm(model, family=binomial(link=logit), data=mdata)
pe <- logit.result$coefficients  # point estimates
vc <- vcov(logit.result)         # var-cov matrix

# Simulate parameter distributions
sims <- 10000
simbetas <- mvrnorm(sims, pe, vc)

# Set up counterfactuals:  all ages, each of three educations
xhyp <- seq(18,97,1)
nscen <- length(xhyp)
nohsScen <- hsScen <- collScen <- cfMake(model, mdata, nscen)
for (i in 1:nscen) {
  # No High school scenarios (loop over each age)
  nohsScen <- cfChange(nohsScen, "age", x = xhyp[i], scen = i)
  nohsScen <- cfChange(nohsScen, "hsdeg", x = 0, scen = i)
  nohsScen <- cfChange(nohsScen, "coldeg", x = 0, scen = i)

  # HS grad scenarios (loop over each age)
  hsScen <- cfChange(hsScen, "age", x = xhyp[i], scen = i)
  hsScen <- cfChange(hsScen, "hsdeg", x = 1, scen = i)
  hsScen <- cfChange(hsScen, "coldeg", x = 0, scen = i)

  # College grad scenarios (loop over each age)
  collScen <- cfChange(collScen, "age", x = xhyp[i], scen = i)
  collScen <- cfChange(collScen, "hsdeg", x = 1, scen = i)
  collScen <- cfChange(collScen, "coldeg", x = 1, scen = i)
}

# Simulate expected probabilities for all scenarios
nohsSims <- logitsimev(nohsScen, simbetas, ci=0.95)
hsSims <- logitsimev(hsScen, simbetas, ci=0.95)
collSims <- logitsimev(collScen, simbetas, ci=0.95)

# Get 3 nice colors for traces
col <- brewer.pal(3,"Dark2")

# Set up lineplot traces of expected probabilities
nohsTrace <- lineplot(x=xhyp,
                      y=nohsSims$pe,
                      lower=nohsSims$lower,
                      upper=nohsSims$upper,
                      col=col[1],
                      extrapolate=list(data=mdata,
                                       cfact=nohsScen$x,
                                       omit.extrapolated=TRUE),
                      plot=1)

hsTrace <- lineplot(x=xhyp,
                    y=hsSims$pe,
                    lower=hsSims$lower,
                    upper=hsSims$upper,
                    col=col[2],
                    extrapolate=list(data=mdata,
                                       cfact=hsScen$x,
                                       omit.extrapolated=TRUE),
                    plot=1)

collTrace <- lineplot(x=xhyp,
                      y=collSims$pe,
                      lower=collSims$lower,
                      upper=collSims$upper,
                      col=col[3],
                      extrapolate=list(data=mdata,
                                       cfact=collScen$x,
                                       omit.extrapolated=TRUE),
                      plot=1)

# Set up traces with labels and legend
labelTrace <- textTile(labels=c("Less than HS", "High School", "College"),
                       x=c( 55,    49,     30),
                       y=c( 0.26,  0.56,   0.87),
                       col=col,
                       plot=1)

legendTrace <- textTile(labels=c("Logit estimates:", "95% confidence", "interval is shaded"),
                        x=c(82, 82, 82),
                        y=c(0.2, 0.16, 0.12),
                        plot=1)

# Plot traces using tile
tile(nohsTrace,
     hsTrace,
     collTrace,
     labelTrace,
     legendTrace,
     width=list(null=5),
     limits=c(18,94,0,1),
     xaxis=list(at=c(20,30,40,50,60,70,80,90)),
     xaxistitle=list(labels="Age of Respondent"),
     yaxistitle=list(labels="Probability of Voting"),
     frame=TRUE
)


 

### 3. Practice:

# Using either your own data or one of the datasets used in this lab, run an interesting model and use simcf to present the results (you can use either tile or the base graphics to plot).