# POLS/CSSS 503
# Lab Session 6: May 4, 2011
# Daniel Berlner 

#Agenda:
# 1. Miscellaneous
# 2. Heteroskedasticity
# 3. Outliers
# 4. Prediction using simulation
# 5. First differences plots
# 6. Exercise (if time)

options(scipen=10)
setwd("C:/Documents and Settings/Dan B/Desktop/POLS 503 TA 2012/Lab 1")

#You may need to install these packages first
library(xtable)
library(apsrtable)
library(car)
library(lmtest)


### 1. Miscellaneous: Making tables for LaTeX; exporting data; merging data

## Using xtable and apsrtable
data<-read.csv("rossoildata.csv", na.strings="")
m1<-lm(regime1~GDPcap+oil+oecd, data=data)
xtable(m1)

#xtable also works with ANY matrix:
xtable(cbind(coef(m1), sqrt(diag(vcov(m1)))))


m2<-lm(regime1~oil, data=data)
m3<-lm(regime1~log(GDPcap)+oil+oecd, data=data)
m4<-lm(regime1~GDPcap+oil+oecd+islam, data=data)
apsrtable(m1,m2,m3,m4, stars="default")


## Exporting data back to a file
data2<-data[data$year>=1980 & data$year<=1990,c("cty_name","year","regime1","oil","GDPcap","oecd")]
write.csv(data2, file="data_select.csv", row.names=FALSE)
# Note: Never overwrite your original data file!


## Merging data:

#If you have two dataframes, and each has the same set of countries (or states, or individuals) with identical country codes (where the country code variable is names "CODE" in each), then you would just run:

new_data<-merge(datasource_1, datasource_2, by="CODE")

#If you then want to merge in further datasets, you would just keep going...

new_data2<-merge(new_data, datasource_3, by="CODE")

#If some dataframes contain countries that aren't in others, then you can specify whether or not to keep the ones that don't match:

new_data<-merge(datasource_1, datasource_2, by="CODE", all.x=T, all.y=F)

#This would say "keep every observation in data1, even if it has no data in data2 and has to be assigned NA values on those variables; BUT delete observations from data2 if they have no match in data1". Change the "T" and "F" for each to make different choices. 

#And it's always a good idea to manually spot-check a few observations to make sure nothing went wrong.

#Finally, if your data is organized by countries AND years:
new_data<-merge(datasource_1, datasource_2, by=c("CODE","YEAR"), all.x=T, all.y=F)




### 2. Heteroskedasticity

data<-read.csv("rossoildata.csv", na.strings="")
data<-data[data$year==1990,c("cty_name","year","regime1","oil","GDPcap","oecd")]
data<-na.omit(data)
# Note that we're using only a cross-section of countries in the year 1990 here

#basic model
model1<-lm(regime1~GDPcap+oil+oecd, data=data)
summary(model1)


# Understanding residuals
model1$residuals
which.max(model1$residuals)
#why 4299? didn't we na.omit the data?

data[rownames(data)==4299,]


#examine the residuals
par(mfrow=c(2,2))
plot(model1$fitted,model1$residuals)
plot(data$GDPcap,model1$residuals)
plot(data$oil,model1$residuals)
plot(data$oecd,model1$residuals)

#model with logged GDP per capita
model2<-lm(regime1~log(GDPcap)+oil+oecd, data=data)
summary(model2)

plot(model1$fitted,model2$residuals)
plot(log(data$GDPcap),model2$residuals)



# If you absolutely must show someone a test for heteroskedasticity:
ncvTest(model1) 
# Breusch-Pagan Test. "ncv" stands for non-constant variance
# Here the null hypothesis is no heteroskedasticity, so this can only be rejected when p is sufficiently low.

# Estimating standard errors that are robust to heteroskedasticy
hccm(model1) 

# From the "car" library, stands for "heteroskedasticity-corrected covariance matrix"
# The output of this is a "corrected" variance-covariance matrix, which you can use exactly as you would the vcov(model1) matrix

# See the new "corrected" or "robust" standard errors:
sqrt(diag(hccm(model1)))

# Or, to see a table like in the model summary() do a significance test using the robust var-covar matrix, from the lmtest library:

coeftest(model1, hccm) 
# note that the second argument here is actually a function.  See ?coeftest for details.
# Compare with original results. Are there substantial differences?


### 3. Outliers

# get the hat scores
hatscore <- hatvalues(model1)/mean(hatvalues(model1))

# get the studentized residuals
rstu <- rstudent(model1)

#make the plot
plot(hatscore,rstu, xlab="Standardized hat-values", ylab="Studentized Residuals", main="Influence Plot")
abline(h=c(-2,2), lty=2)
abline(v=c(2,3), lty=c(2,3))
identify(hatscore,rstu, data$cty_name)
#press "escape" to go back to normal

#or use row numbers
plot(hatscore,rstu, xlab="Standardized hat-values", ylab="Studentized Residuals", main="Influence Plot")
abline(h=c(-2,2), lty=2)
abline(v=c(2,3), lty=c(2,3))
identify(hatscore,rstu, rownames(data))

#why doesn't this work?
data[430,]

#but these do:
data[rownames(data)==430,]
data[rownames(data)%in%c(430,48,1294,3170,3475),]



### 4. Prediction using simulation
# We're going to load the dataset on occupational prestige used by Fox in Chapter 5.
jobs<-read.table("http://socserv.socsci.mcmaster.ca/jfox/Books/Applied-Regression-2E/datasets/Prestige.txt")
summary(jobs)
# note: you can view the codebook here: 
# http://socserv.socsci.mcmaster.ca/jfox/Books/Applied-Regression-2E/datasets/Prestige.pdf

foxmodel<-lm(prestige~education+income+women, data=jobs)
summary(foxmodel)

# now let's extract the coefficients vector and the variance-covariance matrix:
pe<-coef(foxmodel)
vc<-vcov(foxmodel)

# Suppose we want to calculate the predicted value of prestige for a job that requires 10 years of education, pays #$10,000 (in 1971 dollars), and is 50% female.

# The long-winded way of doing it is:
predictedvalue<-pe[1] + 10*pe[2] + 10000*pe[3] + 50*pe[4]
# Remember: This is just a way of manually doing what predict() does

# A faster way of doing it uses matrix multiplication:
predictedvalue<-c(1, 10, 10000, 50) %*% pe

# Let's now try to model the uncertainty around this estimate.  We can do this by taking random draws from the #distribution of possible beta values that our model has estimated.
library(MASS)
simbetas<-mvrnorm(1000, pe, vc) # this gives us 1000 estimates of the coefficient estimates
head(simbetas) # look at it

#examine the uncertainty around each coefficient
plot(density(simbetas[,2]), col="blue", lwd=2, xlim=c(-1,6))
lines(density(simbetas[,3]), col="red", lwd=2)
lines(density(simbetas[,4]), col="green", lwd=2)

#examine coefficient for income more closely
plot(density(simbetas[,3]), col="red")

# Now let's calculate the predicted values of prestige for each of the 1000 possible coefficient estimates.  We can do #this most efficiently using matrix algebra:
xhyp<-c(1,10,10000,50) # define our scenario: vector of hypothetical x values
predictions<- simbetas %*% xhyp
head(predictions) # look at it

# Now let's look at the distribution of predicted values:
hist(predictions)
plot(density(predictions), main="Predicted Prestige", lwd=3, col="blue", bty="n", xlab="Prestige", ylab="Relative Frequency")

# We can calculate the mean of this distribution, and the lower and upper 95% confidence intervals like this:
EV<-mean(predictions)
lowerCI<-sort(predictions)[25]
upperCI<-sort(predictions)[975]

# Compare the EV to the predictedvalue above:
EV
predictedvalue # you'll see that it's roughly the same, but not exactly.  Why not?

# You can also compare these estimates to those obtained using the function predict().
predict.lm(foxmodel, newdata=list(education=10, income=10000, women=50), interval="confidence")
EV
lowerCI
upperCI

# Let's add lines to the graph indicating the 95% CIs:
plot(density(predictions), main="Predicted Prestige", lwd=3, col="blue", bty="n", xlab="Prestige", ylab="Relative Frequency")
abline(v=EV, col="red")
abline(v=c(lowerCI,upperCI), lty=2)

# Now let's estimate the distribution of prestige values for a job that is the same in every respect but pays $12,000 #instead of $10,000:
xhyp2<-c(1,10,12000,50) # define our scenario vector
predictions2<-simbetas %*% xhyp2 # multiply by the simulated coefficients
plot(density(predictions), main="Predicted Prestige", lwd=3, col="blue", bty="n", xlab="Prestige", ylab="Relative Frequency", xlim=c(40,60))
lines(density(predictions2), lwd=3, col="red")
legend(53, 0.20, c("$10k","$12k"), lwd=3, col=c("blue","red"))
#why are we using density plots instead of a histogram?

# Question: Why does the $12k distribution have greater variation than the $10k one?  
# Hint: look at the mean of the income variable.



### Plotting Expected Values

# Suppose we want to examine the effect of changes in income on prestige over a continuous range of values - i.e., not just the $10k and $12k scenarios we looked at above.
income_range<-seq(from=min(jobs$income), to=max(jobs$income), length.out=10)
#note we're using "length.out" instead of "by"

# create placeholders for the results:
EVs<-rep(NA,10)
lowerCIs<-rep(NA,10)
upperCIs<-rep(NA,10)

# get the coefficients and vcmatrix from the model:
pe<-coef(foxmodel)
vc<-vcov(foxmodel)

# Take 1000 draws from the multivariate normal distribution:
simbetas<-mvrnorm(1000, pe, vc)

# Set up a loop that goes through all 10 values in the income.range vector:
for (i in 1:10){
    
    # define the covariate values for the current scenario
    xhyp <-c(1, 10, income_range[i], 50) # make sure you understand where these numbers come from
    
    # calculate the 1000 predictions:
    yhyp <- simbetas %*% xhyp
    
    # Now slot the mean, lowerCI and upperCI values into the appropriate positions in the results vectors:
    EVs[i]<-mean(yhyp)
    lowerCIs[i]<-sort(yhyp)[25]
    upperCIs[i]<-sort(yhyp)[975]
    
    } # close i loop

# So, now that we've got the results, we can look at them in a table:
cbind(income_range, EVs, lowerCIs, upperCIs)

# But of course it's much better to plot them:
plot(income_range, EVs, type="n", bty="n", xlab="Income", ylab="Prestige", main="50% Women; 10 years of education", ylim=c(30, 80), las=1)
lines(income_range, EVs, lwd=2)
lines(income_range, lowerCIs, lty=2)
lines(income_range, upperCIs, lty=2)


# Bonus: Here's another way of presenting the same data with a shaded area representing the confidence interval.  It uses the polygon() command, whose syntax is a little more complicated.
plot(income.range, EVs, type="n", bty="n", xlab="Income", ylab="Prestige", main="50% Women; 10 years of education", ylim=c(30, 80), las=1)
polygon(c(income.range, rev(income.range)),c(lowerCIs, rev(upperCIs)), border=NA, col="grey80")
lines(income.range, EVs, lwd=3)



### 5. Illustrating interaction terms with a first differences plot:
	# Note: sometimes called "marginal effects" plot instead of "first differences"
	
foxmodel2<-lm(prestige~education+income*women, data=jobs)
summary(foxmodel2)
#interpret these results

# get the coefficients and vcmatrix from the model:
pe2<-coef(foxmodel2)
vc2<-vcov(foxmodel2)

# Take 1000 draws from the multivariate normal distribution:
simbetas2<-mvrnorm(1000, pe2, vc2)

#set up range of women employed in that occupation
women_range<-seq(from=0, to=100, by=10)

income_hi<-mean(jobs$income)+sd(jobs$income)
income_lo<-mean(jobs$income)-sd(jobs$income)

# create placeholders for the results
firstdiff_mean<-rep(NA, length(women_range)) 
firstdiff_lowerCI<-rep(NA, length(women_range)) 
firstdiff_upperCI<-rep(NA, length(women_range)) 

# Set up a loop that goes through all 11 values in the women.range vector:
for (i in 1:length(women_range)){
    
    # define the covariate values for the current scenario
    x.income_hi<-c(1, 10, income_hi, women_range[i], income_hi*women_range[i]) 
			#make sure you understand where these numbers are coming from
    x.income_lo<-c(1, 10, income_lo, women_range[i], income_lo*women_range[i])
  
	firstdifferences<-(simbetas2 %*% x.income_hi) - (simbetas2 %*% x.income_lo)
	firstdiff_mean[i]<-mean(firstdifferences)
	firstdiff_lowerCI[i]<-sort(firstdifferences)[25]
	firstdiff_upperCI[i]<-sort(firstdifferences)[975]
    
    } # close i loop

# Examine the results in a table:
cbind(women_range, firstdiff_mean, firstdiff_lowerCI, firstdiff_upperCI)


# But of course it's much better to plot them:
plot(women_range, firstdiff_mean, type="n", bty="n", xlab="Percentage of Individuals in an Occupation who are Women", ylab="Expected Increase in Prestige", main="Effect of Income on Occupational Prestige\nDepending on Women Employed in that Occupation", ylim=c(0, 60), las=1)
lines(women_range, firstdiff_mean, lwd=2)
lines(women_range, firstdiff_lowerCI, lty=2)
lines(women_range, firstdiff_upperCI, lty=2)



### Using a wireframe: 

# let's generate a 11x11 matrix of expected values of undercount at varying levels of povery and minority, holding all else constant at their median values:
resultsmatrix<-matrix(NA,11,11)
income_range<-seq(0,30000,length.out=11)
women_range<-seq(from=0, to=100, by=10) 
rownames(resultsmatrix)<-income_range
colnames(resultsmatrix)<-women_range

for (i in 1:11){ # pick a level of income
	for (j in 1:11){ # pick a level of women
		
		x.current<-c(1, 10, income_range[i], women_range[j], income_range[i]*women_range[j]) 
		resultsmatrix[i,j]<- x.current %*% coef(foxmodel2) 
			#note we're not simulating, since here we just want predicted values without uncertainty)
		
		} # close j loop
	} # close i loop

# look at the resultsmatrix first:
resultsmatrix

library(lattice)
wireframe(resultsmatrix, drape=T, xlab="Income", ylab="% Women", zlab="Prestige")

#or, a slight difference in presentation
wireframe(resultsmatrix, drape=T, xlab="Income", ylab="% Women", zlab="Prestige", scales=list(arrows=F, cex=.7), colorkey=F)

#This presentation has the advantage of showing the full range of interactions, where the marginal effects plot showed only one slice of this. The main disadvantage, of course, is in not showing the uncertainty around each expected value of prestige. 




#### 6. Exercise (if time)

#Return to the first model we ran on the occupational prestige data.

foxmodel<-lm(prestige~education+income+women, data=jobs)
summary(foxmodel)

# For each independent variable in the model, calculate the expected change in prestige that results from a change in that independent variable from one SD below its mean to one SD above its mean, along with 95 percent confidence intervals around this quantity. 

# Store your results in a matrix, and use xtable() to export that matrix into LaTeX code.