## Using simcf and tile to explore an estimated logistic regression ## Voting example using 2000 NES data after King, Tomz, and Wittenberg ## Chrisopher Adolph faculty.washington.edu/cadolph ## 28 October 2021 ## Clear memory rm(list=ls()) ## Load libraries library(MASS) library(simcf) library(tile) library(RColorBrewer) ## Load data file <- "nes00a.csv" data <- read.csv(file, header=TRUE) ## Estimate logit model using optim() ## Construct variables and model objects y <- data$vote00 x <- cbind(data$age,data$age^2,data$hsdeg,data$coldeg) ## Likelihood function for logit llk.logit <- function(param,y,x) { os <- rep(1,length(x[,1])) x <- cbind(os,x) b <- param[ 1 : ncol(x) ] xb <- x%*%b sum( y*log(1+exp(-xb)) + (1-y)*log(1+exp(xb))); # optim is a minimizer, so min -ln L(param|y) } ## Fit logit model using optim ls.result <- lm(y~x) # use ls estimates as starting values stval <- ls.result$coefficients # initial guesses logit.result.opt <- optim(stval, llk.logit, method="Nelder-Mead", hessian=TRUE, y=y, x=x) # call minimizer procedure pe.opt <- logit.result.opt$par # point estimates vc.opt <- solve(logit.result.opt$hessian) # var-cov matrix se.opt <- sqrt(diag(vc.opt)) # standard errors ll.opt <- -logit.result.opt$value # likelihood at maximum # Estimate logit model using glm() # Set up model formula and model specific data frame model <- vote00 ~ age + I(age^2) + hsdeg + coldeg mdata <- extractdata(model, data, na.rm=TRUE) # Run logit & extract results logit.result <- glm(model, family=binomial, data=mdata) pe.glm <- logit.result$coefficients # point estimates vc.glm <- vcov(logit.result) # var-cov matrix ## Simulate quantities of interest using simcf ## ## We could do this from the optim or glm results; ## here, we do from glm # Simulate parameter distributions sims <- 10000 simbetas <- mvrnorm(sims, pe.glm, vc.glm) # Set up counterfactuals: all ages, each of three educations xhyp <- seq(18,97,1) nscen <- length(xhyp) nohsScen <- hsScen <- collScen <- cfMake(model, mdata, nscen) for (i in 1:nscen) { # No High school scenarios (loop over each age) nohsScen <- cfChange(nohsScen, "age", x = xhyp[i], scen = i) nohsScen <- cfChange(nohsScen, "hsdeg", x = 0, scen = i) nohsScen <- cfChange(nohsScen, "coldeg", x = 0, scen = i) # HS grad scenarios (loop over each age) hsScen <- cfChange(hsScen, "age", x = xhyp[i], scen = i) hsScen <- cfChange(hsScen, "hsdeg", x = 1, scen = i) hsScen <- cfChange(hsScen, "coldeg", x = 0, scen = i) # College grad scenarios (loop over each age) collScen <- cfChange(collScen, "age", x = xhyp[i], scen = i) collScen <- cfChange(collScen, "hsdeg", x = 1, scen = i) collScen <- cfChange(collScen, "coldeg", x = 1, scen = i) } # Simulate expected probabilities for all scenarios nohsSims <- logitsimev(nohsScen, simbetas, ci=0.95) hsSims <- logitsimev(hsScen, simbetas, ci=0.95) collSims <- logitsimev(collScen, simbetas, ci=0.95) # Get 3 nice colors for traces col <- brewer.pal(3,"Dark2") # Set up lineplot traces of expected probabilities # # When recycling this code, omit the extrapolate input # if you are unsure how to use it correctly nohsTrace <- lineplot(x=xhyp, y=nohsSims$pe, lower=nohsSims$lower, upper=nohsSims$upper, col=col[1], extrapolate=list(data=mdata[,2:ncol(mdata)], cfact=nohsScen$x[,2:ncol(hsScen$x)], omit.extrapolated=TRUE), plot=1) hsTrace <- lineplot(x=xhyp, y=hsSims$pe, lower=hsSims$lower, upper=hsSims$upper, col=col[2], extrapolate=list(data=mdata[,2:ncol(mdata)], cfact=hsScen$x[,2:ncol(hsScen$x)], omit.extrapolated=TRUE), plot=1) collTrace <- lineplot(x=xhyp, y=collSims$pe, lower=collSims$lower, upper=collSims$upper, col=col[3], extrapolate=list(data=mdata[,2:ncol(mdata)], cfact=collScen$x[,2:ncol(hsScen$x)], omit.extrapolated=TRUE), plot=1) # Set up traces with labels and legend labelTrace <- textTile(labels=c("Less than HS", "High School", "College"), x=c( 55, 49, 30), y=c( 0.26, 0.56, 0.87), col=col, plot=1) legendTrace <- textTile(labels=c("Logit estimates:", "95% confidence", "interval is shaded"), x=c(82, 82, 82), y=c(0.2, 0.15, 0.10), cex=0.9, plot=1) # Plot traces using tile tile(nohsTrace, hsTrace, collTrace, labelTrace, legendTrace, limits=c(18,94,0,1), xaxis=list(at=c(20,30,40,50,60,70,80,90)), yaxis=list(label.loc=-0.5, major=FALSE), xaxistitle=list(labels="Age of Respondent"), yaxistitle=list(labels="Probability of Voting"), width=list(null=5,yaxistitle=4,yaxis.labelspace=-0.5), output=list(file="educationEV",width=5.5) ) ################################################################ # # Now consider a new specification adding the variable # "ever married", or marriedo # # We will estimate this new model with glm(), then # simulate new scenarios for marrieds and non-marrieds # # We could also rerun the age x education scenarios if we wanted # Estimate logit model using glm() # Set up model formula and model specific data frame model2 <- vote00 ~ age + I(age^2) + hsdeg + coldeg + marriedo mdata2 <- extractdata(model2, data, na.rm=TRUE) # Run logit & extract results logit.m2 <- glm(model2, family=binomial, data=mdata2) pe.m2 <- logit.m2$coefficients # point estimates vc.m2 <- vcov(logit.m2) # var-cov matrix # Simulate parameter distributions sims <- 10000 simbetas.m2 <- mvrnorm(sims, pe.m2, vc.m2) # Set up counterfactuals: all ages, each of three educations xhyp <- seq(18,97,1) nscen <- length(xhyp) marriedScen <- notmarrScen <- cfMake(model2, mdata2, nscen) for (i in 1:nscen) { # Married (loop over each age) # Note below the careful use of before scenarios (xpre) and after scenarios (x) # - we will use the marriedScen counterfactuals in FDs and RRs as well as EVs marriedScen <- cfChange(marriedScen, "age", x = xhyp[i], xpre= xhyp[i], scen = i) marriedScen <- cfChange(marriedScen, "marriedo", x = 1, xpre= 0, scen = i) # Married (loop over each age) notmarrScen <- cfChange(notmarrScen, "age", x = xhyp[i], scen = i) notmarrScen <- cfChange(notmarrScen, "marriedo", x = 0, scen = i) } # Simulate expected probabilities for all scenarios marriedSims <- logitsimev(marriedScen, simbetas.m2, ci=0.95) notmarrSims <- logitsimev(notmarrScen, simbetas.m2, ci=0.95) # Simulate first difference of voting wrt marriage marriedFD <- logitsimfd(marriedScen, simbetas.m2, ci=0.95) # Simulate relative risk of voting wrt marriage marriedRR <- logitsimrr(marriedScen, simbetas.m2, ci=0.95) ## Make plots using tile # Get 3 nice colors for traces col <- brewer.pal(3,"Dark2") # Set up lineplot traces of expected probabilities marriedTrace <- lineplot(x=xhyp, y=marriedSims$pe, lower=marriedSims$lower, upper=marriedSims$upper, col=col[1], extrapolate=list(data=mdata2[,2:ncol(mdata2)], cfact=marriedScen$x[,2:ncol(marriedScen$x)], omit.extrapolated=TRUE), plot=1) notmarrTrace <- lineplot(x=xhyp, y=notmarrSims$pe, lower=notmarrSims$lower, upper=notmarrSims$upper, col=col[2], ci = list(mark="dashed"), extrapolate=list(data=mdata2[,2:ncol(mdata2)], cfact=notmarrScen$x[,2:ncol(notmarrScen$x)], omit.extrapolated=TRUE), plot=1) # Set up traces with labels and legend labelTrace <- textTile(labels=c("Currently Married", "Not Married"), x=c( 35, 53), y=c( 0.8, 0.56), col=col, plot=1) legendTrace <- textTile(labels=c("Logit estimates:", "95% confidence", "interval is shaded"), x=c(80, 80, 80), y=c(0.2, 0.15, 0.10), cex=0.9, plot=1) # Plot traces using tile tile(marriedTrace, notmarrTrace, labelTrace, legendTrace, limits=c(18,94,0,1), xaxis=list(at=c(20,30,40,50,60,70,80,90)), yaxis=list(label.loc=-0.5, major=FALSE), xaxistitle=list(labels="Age of Respondent"), yaxistitle=list(labels="Probability of Voting"), width=list(null=5,yaxistitle=4,yaxis.labelspace=-0.5), output=list(file="marriedEV",width=5.5) ) # Plot First Difference # Set up lineplot trace of relative risk marriedFDTrace <- lineplot(x=xhyp, y=marriedFD$pe, lower=marriedFD$lower, upper=marriedFD$upper, col=col[1], extrapolate=list(data=mdata2[,2:ncol(mdata2)], cfact=marriedScen$x[,2:ncol(marriedScen$x)], omit.extrapolated=TRUE), plot=1) # Set up baseline: for first difference, this is 0 baseline <- linesTile(x=c(18,94), y=c(0,0), plot=1) # Set up traces with labels and legend labelFDTrace <- textTile(labels=c("Married compared \n to Not Married"), x=c( 40), y=c( 0.20), col=col[1], plot=1) legendFDTrace <- textTile(labels=c("Logit estimates:", "95% confidence", "interval is shaded"), x=c(80, 80, 80), y=c(-0.02, -0.05, -0.08), cex=0.9, plot=1) # Plot traces using tile tile(marriedFDTrace, labelFDTrace, legendFDTrace, baseline, limits=c(18,94,-0.1,0.5), xaxis=list(at=c(20,30,40,50,60,70,80,90)), yaxis=list(label.loc=-0.5, major=FALSE), xaxistitle=list(labels="Age of Respondent"), yaxistitle=list(labels="Difference in Probability of Voting"), width=list(null=5,yaxistitle=4,yaxis.labelspace=-0.5), output=list(file="marriedFD",width=5.5) ) # Plot Relative Risk # Set up lineplot trace of relative risk marriedRRTrace <- lineplot(x=xhyp, y=marriedRR$pe, lower=marriedRR$lower, upper=marriedRR$upper, col=col[1], extrapolate=list(data=mdata2[,2:ncol(mdata2)], cfact=marriedScen$x[,2:ncol(marriedScen$x)], omit.extrapolated=TRUE), plot=1) # Set up baseline: for relative risk, this is 1 baseline <- linesTile(x=c(18,94), y=c(1,1), plot=1) # Set up traces with labels and legend labelRRTrace <- textTile(labels=c("Married compared \n to Not Married"), x=c( 55), y=c( 1.25), col=col[1], plot=1) legendRRTrace <- textTile(labels=c("Logit estimates:", "95% confidence", "interval is shaded"), x=c(80, 80, 80), y=c(0.98, 0.95, 0.92), cex=0.9, plot=1) # Plot traces using tile tile(marriedRRTrace, labelRRTrace, legendRRTrace, baseline, limits=c(18,94,0.9,1.5), xaxis=list(at=c(20,30,40,50,60,70,80,90)), yaxis=list(label.loc=-0.5, major=FALSE), xaxistitle=list(labels="Age of Respondent"), yaxistitle=list(labels="Relative Risk of Voting"), width=list(null=5,yaxistitle=4,yaxis.labelspace=-0.5), output=list(file="marriedRR",width=5.5) )