# POLS/CSSS 503 # Lab Session 5: April 27, 2012 # Daniel Berlner #Agenda: # 1. More on prediction with interaction terms and transformations # 2. LaTeX workshop ###1. More on prediction with interaction terms and transformations options(scipen=10) setwd("C:/Documents and Settings/Dan B/Desktop/POLS 503 TA 2012/Lab 1") data<-read.csv("rossoildata.csv", na.strings="") data<-data[,c("cty_name","year","regime1","oil","GDPcap","oecd")] data<-na.omit(data) #basic model model1<-lm(regime1~GDPcap+oil+oecd, data=data) summary(model1) oilhyp<-seq(from=0,to=100,by=10) #notice that, instead of first creating a list object containing the hypothetical data, we are now creating that list within the predict() command. pred1<-predict(model1, newdata=list( GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil=oilhyp, oecd=rep(0,length(oilhyp)) ), interval="confidence") pred1 #plot: plot(oilhyp, pred1[,1], type="n", xlab="Oil Exports per GDP", ylab="Democracy", main="Effect of Oil Dependence on Democracy", ylim=c(-5,15)) lines(oilhyp, pred1[,1], lwd=3, col="red") lines(oilhyp, pred1[,2], lwd=1, lty=2, col="red") lines(oilhyp, pred1[,3], lwd=1, lty=2, col="red") #interaction term model2<-lm(regime1~GDPcap+oil*oecd, data=data) summary(model2) pred2_oecd0<-predict(model2, newdata=list( GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil=oilhyp, oecd=rep(0,length(oilhyp)) ), interval="confidence") pred2_oecd1<-predict(model2, newdata=list( GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil=oilhyp, oecd=rep(1,length(oilhyp)) ), interval="confidence") pred2_oecd0 pred2_oecd1 #plot: plot(oilhyp, pred2_oecd0[,1], type="n", xlab="Oil Exports per GDP", ylab="Democracy", main="Effect of Oil Dependence on Democracy", ylim=c(-5,15)) lines(oilhyp, pred2_oecd0[,1], lwd=3, col="red") lines(oilhyp, pred2_oecd0[,2], lwd=1, lty=2, col="red") lines(oilhyp, pred2_oecd0[,3], lwd=1, lty=2, col="red") lines(oilhyp, pred2_oecd1[,1], lwd=3, col="blue") lines(oilhyp, pred2_oecd1[,2], lwd=1, lty=2, col="blue") lines(oilhyp, pred2_oecd1[,3], lwd=1, lty=2, col="blue") ### What if you are interacting two continuous variables? #new model: let the effect of oil vary by year model3<-lm(regime1~GDPcap+oecd+oil*year, data=data) summary(model3) #in order to interpret the interaction term graphically, we need to pick one variable to treat as continuous - the x-axis - and another for which we will choose a "low" value and a "high" value. #this time, we will vary the year year_seq<-seq(from=min(data$year), to=max(data$year), by=1) #this is the same as simply saying 1966:1997, has we looked up the min and max first #often we choose the mean and one SD above the mean. oil_lo<-mean(data$oil) oil_hi<-mean(data$oil) + sd(data$oil) pred3_oil_lo<-predict(model3, newdata=list( year=year_seq, GDPcap=rep(mean(data$GDPcap),length(year_seq)), oil=rep(oil_lo, length(year_seq)), oecd=rep(0, length(year_seq)) ), interval="confidence") pred3_oil_hi<-predict(model3, newdata=list( year=year_seq, GDPcap=rep(mean(data$GDPcap),length(year_seq)), oil=rep(oil_hi, length(year_seq)), oecd=rep(0, length(year_seq)) ), interval="confidence") pred3_oil_lo pred3_oil_hi #plot: plot(year_seq, pred3_oil_lo[,1], type="n", xlab="Year", ylab="Democracy", main="Oil and Democracy over Time", ylim=c(-5,15)) lines(year_seq, pred3_oil_lo[,1], lwd=3, col="lightblue") lines(year_seq, pred3_oil_lo[,2], lwd=1, lty=2, col="lightblue") lines(year_seq, pred3_oil_lo[,3], lwd=1, lty=2, col="lightblue") lines(year_seq, pred3_oil_hi[,1], lwd=3, col="orange") lines(year_seq, pred3_oil_hi[,2], lwd=1, lty=2, col="orange") lines(year_seq, pred3_oil_hi[,3], lwd=1, lty=2, col="orange") text(x=1970, y=5, labels="Low Oil: Mean", col="lightblue", cex=.8) text(x=1970, y=2, labels="High Oil: Mean + 1 SD", col="orange", cex=.8) ####Alternately, we could have flipped these: oilhyp<-seq(from=0,to=100,by=10) year_lo<-1970 year_hi<-1995 pred3_year_lo<-predict(model3, newdata=list( year=rep(year_lo,length(oilhyp)), GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil=oilhyp, oecd=rep(0, length(oilhyp)) ), interval="confidence") pred3_year_hi<-predict(model3, newdata=list( year=rep(year_hi,length(oilhyp)), GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil=oilhyp, oecd=rep(0, length(oilhyp)) ), interval="confidence") pred3_year_lo pred3_year_hi #plot: plot(oilhyp, pred3_year_lo[,1], type="n", xlab="Oil Exports per GDP", ylab="Democracy", main="Oil and Democracy over Time Version 2", ylim=c(-5,15)) lines(oilhyp, pred3_year_lo[,1], lwd=3, col="darkgoldenrod1") lines(oilhyp, pred3_year_lo[,2], lwd=1, lty=2, col="darkgoldenrod1") lines(oilhyp, pred3_year_lo[,3], lwd=1, lty=2, col="darkgoldenrod1") lines(oilhyp, pred3_year_hi[,1], lwd=3, col="magenta2") lines(oilhyp, pred3_year_hi[,2], lwd=1, lty=2, col="magenta2") lines(oilhyp, pred3_year_hi[,3], lwd=1, lty=2, col="magenta2") text(x=10, y=2, labels="1970", col="darkgoldenrod1", cex=.8) text(x=10, y=7, labels="1995", col="magenta2", cex=.8) ######## #First Difference: What if you want to plot the marginal effect of a change in one variable, as this quantity changes as another variable varies? #That is, what if you want to visualize the interaction term with a single line, rather than two? #go back to the first way we predicted results of Model 3, with the sequence of years: #This vector shows the CHANGE in expected democracy resulting from a CHANGE in oil from its mean to 1 SD above the mean, as that quantity varies over different years. #We call this a first difference. first_diff<-pred3_oil_hi[,1] - pred3_oil_lo[,1] #plot: #Notice how we changed the y-axis. It is no longer Democracy, it is the Change in Democracy plot(year_seq, first_diff, type="l", lwd=2, xlab="Year", ylab="Change in Democracy", main="Oil and Democracy over Time Version 3:\nEffect of Change in Oil Dependence from Mean to Mean + 1 SD", ylim=c(-3,1)) abline(h=0, col="red") #Where are the confidence intervals? #These are much more complicated to get. The CIs that predict gives us for expected values of y do not easily translate into CIs for the CHANGE in expected values of y. To get confidence intervals for a first difference, we will need to use a simulation method. Next week! #Extra note: How can we look at these plots side by side? par(mfrow=c(1,3)) #plot 1: plot(year_seq, pred3_oil_lo[,1], type="n", xlab="Year", ylab="Democracy", main="Oil and Democracy over Time", ylim=c(-5,15)) lines(year_seq, pred3_oil_lo[,1], lwd=3, col="lightblue") lines(year_seq, pred3_oil_lo[,2], lwd=1, lty=2, col="lightblue") lines(year_seq, pred3_oil_lo[,3], lwd=1, lty=2, col="lightblue") lines(year_seq, pred3_oil_hi[,1], lwd=3, col="orange") lines(year_seq, pred3_oil_hi[,2], lwd=1, lty=2, col="orange") lines(year_seq, pred3_oil_hi[,3], lwd=1, lty=2, col="orange") text(x=1970, y=5, labels="Low Oil: Mean", col="lightblue", cex=.8) text(x=1970, y=2, labels="High Oil: Mean + 1 SD", col="orange", cex=.8) #plot 2: plot(oilhyp, pred3_year_lo[,1], type="n", xlab="Oil Exports per GDP", ylab="Democracy", main="Oil and Democracy over Time Version 2", ylim=c(-5,15)) lines(oilhyp, pred3_year_lo[,1], lwd=3, col="darkgoldenrod1") lines(oilhyp, pred3_year_lo[,2], lwd=1, lty=2, col="darkgoldenrod1") lines(oilhyp, pred3_year_lo[,3], lwd=1, lty=2, col="darkgoldenrod1") lines(oilhyp, pred3_year_hi[,1], lwd=3, col="magenta2") lines(oilhyp, pred3_year_hi[,2], lwd=1, lty=2, col="magenta2") lines(oilhyp, pred3_year_hi[,3], lwd=1, lty=2, col="magenta2") text(x=10, y=2, labels="1970", col="darkgoldenrod1", cex=.8) text(x=10, y=7, labels="1995", col="magenta2", cex=.8) #plot 3: plot(year_seq, first_diff, type="l", lwd=2, xlab="Year", ylab="Change in Democracy", main="Oil and Democracy over Time Version 3:\nEffect of Change in Oil Dependence from Mean to Mean + 1 SD", ylim=c(-3,1)) abline(h=0, col="red") ####### # Data Transformations # What if we suspect the effect of oil is non-linear? model4<-lm(regime1~GDPcap+oil+I(oil^2)+oecd, data=data) summary(model4) oilhyp<-seq(from=0,to=100,by=10) pred4<-predict(model4, newdata=list( GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil=oilhyp, oecd=rep(0,length(oilhyp)) ), interval="confidence") pred4 #plot: plot(oilhyp, pred4[,1], type="n", xlab="Oil Exports per GDP", ylab="Democracy", main="Effect of Oil Dependence on Democracy", ylim=c(-5,15)) lines(oilhyp, pred4[,1], lwd=3, col="red") lines(oilhyp, pred4[,2], lwd=1, lty=2, col="red") lines(oilhyp, pred4[,3], lwd=1, lty=2, col="red") # What about a log transformation? # Oil contains zeros, which cannot be logged. #There are better ways to deal with this, but for now... data$oil_mod<-(data$oil + 0.01) model5<-lm(regime1~GDPcap+log(oil_mod)+oecd, data=data) summary(model5) oilhyp<-seq(from=0.1,to=100,by=10) pred5<-predict(model5, newdata=list( GDPcap=rep(mean(data$GDPcap),length(oilhyp)), oil_mod=oilhyp, oecd=rep(0,length(oilhyp)) ), interval="confidence") pred5 #plot: plot(oilhyp, pred5[,1], type="n", xlab="Oil Exports per GDP", ylab="Democracy", main="Effect of Oil Dependence on Democracy", ylim=c(-5,15)) lines(oilhyp, pred5[,1], lwd=3, col="red") lines(oilhyp, pred5[,2], lwd=1, lty=2, col="red") lines(oilhyp, pred5[,3], lwd=1, lty=2, col="red") # Perhaps we should change the oilhyp sequence to increment by 1 instead of 10, to see the relationship at a more fine-grained level