# Non-stationary ARIMA & ECM estimation, fiting, and interpretation
# Chris Adolph
#
# Revised 29 July 2014
rm(list=ls())

# Load libraries
library(Zelig)             # For Zelig simulation procedures & approval data 
library(tseries)           # For unit root tests
library(lmtest)            # For Breusch-Godfrey LM test of serial correlation
library(urca)              # For estimating cointegration models
library(simcf)             # For counterfactual simulation via ldvsimev()
library(MASS)              # For mvrnorm()
source("TSplotHelper.R")   # Helper function for counterfactual time series plots

# Load data
# US Presidential approval data (Bush, Monthly, 2/2001--6/2006)
# Includes average oil price data ($/barrel?)
#
# Variable names:  month year approve disapprove unsure
#                  sept.oct.2001 iraq.war avg.price

data(approval)
attach(approval)

# Create a random walk
set.seed(1)
phony <- rnorm(length(approve))
for (i in 2:length(phony))
    phony[i] <- phony[i-1] + rnorm(1) 

# Look at the time series
pdf("tsapproval.pdf",width=6,height=3.25)
plot(approve,type="l",ylab="Percent Approving",xlab="Time",
     main = "US Presidential Approval")
lines(x=c(8,8),y=c(-1000,1000),col="red")
lines(x=c(26,26),y=c(-1000,1000),col="blue")
text("9/11",x = 10, y = 40, col="red",cex=0.7)
text("Iraq \n War",x = 28, y = 40, col="blue",cex=0.7) 
dev.off()

pdf("tsprice.pdf",width=6,height=3.25)
plot(avg.price,type="l",ylab="$ per Barrel",xlab="Time",
     main = "Average Price of Oil")
lines(x=c(8,8),y=c(-1000,1000),col="red")
lines(x=c(26,26),y=c(-1000,1000),col="blue")
text("9/11",x = 10, y = 175, col="red",cex=0.7)
text("Iraq \n War",x = 28, y = 175, col="blue",cex=0.7) 
dev.off()

# Look at the ACF
pdf("acfapproval.pdf",width=6,height=3.25)
acf(approve)
dev.off()

pdf("acfprice.pdf",width=6,height=3.25)
acf(avg.price)
dev.off()

# Look at the PACF
pdf("pacfapproval.pdf",width=6,height=3.25)
pacf(approve)
dev.off()

pdf("pacfprice.pdf",width=6,height=3.25)
pacf(avg.price)
dev.off()

# Check for a unit root
PP.test(approve)
adf.test(approve)

PP.test(avg.price)
adf.test(avg.price)

# Consider the first difference of each variable
approveLag <- c(NA, approve[1:(length(approve)-1)])
approveDiff <- approve - approveLag

avg.priceLag <- c(NA, avg.price[1:(length(avg.price)-1)])
avg.priceDiff <- avg.price - avg.priceLag

# Look at the DIFFERENCED time series
pdf("tsapprovalDiff.pdf",width=6,height=3.25)
plot(approveDiff,type="l",ylab="Change in Percent Approving",xlab="Time",
     main = "US Presidential Approval")
lines(x=c(8,8),y=c(-1000,1000),col="red")
lines(x=c(26,26),y=c(-1000,1000),col="blue")
text("9/11",x = 10, y = 15, col="red",cex=0.7)
text("Iraq \n War",x = 28, y = 15, col="blue",cex=0.7) 
dev.off()

pdf("tspriceDiff.pdf",width=6,height=3.25)
plot(avg.priceDiff,type="l",ylab="Change in $ per Barrel",xlab="Time",
     main = "Average Price of Oil")
lines(x=c(8,8),y=c(-1000,1000),col="red")
lines(x=c(26,26),y=c(-1000,1000),col="blue")
text("9/11",x = 10, y = -30, col="red",cex=0.7)
text("Iraq \n War",x = 28, y = -30, col="blue",cex=0.7) 
dev.off()

# Look at the new ACF
pdf("acfapprovalDiff.pdf",width=6,height=3.25)
acf(approveDiff, na.action=na.pass)
dev.off()

pdf("acfpriceDiff.pdf",width=6,height=3.25)
acf(avg.priceDiff, na.action=na.pass)
dev.off()

# Look at the new PACF
pdf("pacfapprovalDiff.pdf",width=6,height=3.25)
pacf(approveDiff, na.action=na.pass)
dev.off()

pdf("pacfpriceDiff.pdf",width=6,height=3.25)
pacf(avg.priceDiff, na.action=na.pass)
dev.off()

# Check for a unit root in differenced time series

PP.test(as.vector(na.omit(approveDiff)))
adf.test(na.omit(approveDiff))

PP.test(as.vector(na.omit(avg.priceDiff)))
adf.test(na.omit(avg.priceDiff))

#################################################################
## Model 1a:  ARIMA(0,1,0) model of approve
##

## Estimate an ARIMA(0,1,0) using arima
xcovariates <- cbind(sept.oct.2001, iraq.war, avg.price)
arima.res1a <- arima(approve, order = c(0,1,0),
                     xreg = xcovariates, include.mean = TRUE
                     )
print(arima.res1a)

# Extract estimation results from arima.res1a
pe.1a <- arima.res1a$coef                    # parameter estimates (betas)
se.1a <- sqrt(diag(arima.res1a$var.coef))    # standard errors
ll.1a <- arima.res1a$loglik                  # log likelihood at its maximum
sigma2hat.1a <- arima.res1a$sigma2           # standard error of the regression
aic.1a <- arima.res1a$aic                    # Akaike Information Criterion
resid.1a <- arima.res1a$resid                # residuals


#################################################################
## Model 1c:  ARIMA(1,1,0) model of approve
##

## Estimate an ARIMA(1,1,0) using arima
xcovariates <- cbind(sept.oct.2001, iraq.war, avg.price)
arima.res1b <- arima(approve, order = c(1,1,0),
                     xreg = xcovariates, include.mean = TRUE
                     )
print(arima.res1b)

# Extract estimation results from arima.res1a
pe.1b <- arima.res1b$coef                    # parameter estimates (betas)
se.1b <- sqrt(diag(arima.res1b$var.coef))    # standard errors
ll.1b <- arima.res1b$loglik                  # log likelihood at its maximum
sigma2hat.1b <- arima.res1b$sigma2           # standard error of the regression
aic.1b <- arima.res1b$aic                    # Akaike Information Criterion
resid.1b <- arima.res1b$resid                # residuals



#################################################################
## Model 1b:  ARIMA(2,1,2) model of approve
##

## Estimate an ARIMA(2,1,2) using arima
xcovariates <- cbind(sept.oct.2001, iraq.war, avg.price)
arima.res1c <- arima(approve, order = c(2,1,2),
                     xreg = xcovariates, include.mean = TRUE
                     )
print(arima.res1c)

# Extract estimation results from arima.res1a
pe.1c <- arima.res1c$coef                    # parameter estimates (betas)
se.1c <- sqrt(diag(arima.res1c$var.coef))    # standard errors
ll.1c <- arima.res1c$loglik                  # log likelihood at its maximum
sigma2hat.1c <- arima.res1c$sigma2           # standard error of the regression
aic.1c <- arima.res1c$aic                    # Akaike Information Criterion
resid.1c <- arima.res1c$resid                # residuals



#################################################################
## Model 1d:  ARIMA(0,1,0) model of approve including a spurrious regressor
##

## Estimate an ARIMA(0,1,0) using arima
xcovariates <- cbind(sept.oct.2001, iraq.war, avg.price, phony)
arima.res1d <- arima(approve, order = c(0,1,0),
                     xreg = xcovariates, include.mean = TRUE
                     )
print(arima.res1d)

# Extract estimation results from arima.res1a
pe.1d <- arima.res1d$coef                    # parameter estimates (betas)
se.1d <- sqrt(diag(arima.res1d$var.coef))    # standard errors
ll.1d <- arima.res1d$loglik                  # log likelihood at its maximum
sigma2hat.1d <- arima.res1d$sigma2           # standard error of the regression
aic.1d <- arima.res1d$aic                    # Akaike Information Criterion
resid.1d <- arima.res1d$resid                # residuals


# Based on ACF, PACF, and AIC, let's select Model 1a to be Model 1
arima.res1 <- arima.res1a


## What would happen if we used linear regression on a single lag of approval?
lm.res1e <- lm(approve ~ approveLag + sept.oct.2001 + iraq.war + avg.price)
print(summary(lm.res1e))

# linear regression with a spurious regressor?
lm.res1f <- lm(approve ~ approveLag + sept.oct.2001 + iraq.war + avg.price + phony)
print(summary(lm.res1f))

# Check LS result for serial correlation in the first or second order
bgtest(lm.res1e,1)
bgtest(lm.res1f,2)



#### Because Zelig's arima simulator isn't currently available,
#### Let's use ldvsimev().

# First we need the simulated parameters
sims <- 10000
pe <- arima.res1$coef
vc <- arima.res1$var.coef
simparams <- mvrnorm(sims, pe, vc)
simbetas <- simparams[, (1+sum(arima.res1$arma[1:2])):ncol(simparams)]
#simphi <- simparams[, 1:arima.res1$arma[1]]  # if AR(1) or greater

# Choose counterfactual periods
periodsToSim <- seq(from=26, to=65, by=1)
nPeriodsToSim <- length(periodsToSim)

## Create hypothetical covariates over these periods
# Start with factual values
# For ARIMA, need to enter these covariates in same order as model
model <- approve ~ sept.oct.2001 + iraq.war + avg.price
selectdata <- extractdata(model, approval, na.rm = TRUE)
perioddata <- selectdata[periodsToSim,]

# Treat factual data as the baseline counterfactual
xhyp0 <- subset(perioddata, select = -approve )

# Suppose no Iraq War occurred
xhyp <- subset(perioddata, select = -approve )
xhyp$iraq.war <- rep(0,nPeriodsToSim)

# Leave out phi because this specification has no AR component
# phi <- simphi

# Construction of prior lags depends on order of ARIMA
# Only need initialY for I(1)
# Only need lagY for AR(1) or higher
initialY <- selectdata$approve[periodsToSim[1] - 1]
            # if ARIMA(1,1,0), instead: selectdata$approve[periodsToSim[1] - 2]
lagY <- selectdata$approve[periodsToSim[1] - 1]
            # if ARIMA(1,1,0), append: - selectdata$approve[periodsToSim[1] - 2]

# Simulate expected values of Y out to periods
# given hypothetical values of X, and an initial level of Y
# No Iraq scenario
noIraq.ev1 <- ldvsimev(xhyp,               # The matrix of hypothetical x's
                       simbetas,           # The matrix of simulated betas
                       ci=0.95,            # Desired confidence interval
                       constant=NA,        # Column containing the constant;
                                           #  no constant because model is differenced
                       #phi=phi,           # estimated AR parameters; length must match lagY 
                       #lagY=lagY,         # lags of y, most recent last
                       transform="diff",   # "log" to undo log transformation,
                                           # "diff" to under first differencing
                                           # "difflog" to do both
                       initialY=initialY   # for differenced models, lag of level of y
                       )

# Iraq scenario (factual)
Iraq.ev1 <- ldvsimev(xhyp0,              # The matrix of hypothetical x's
                     simbetas,           # The matrix of simulated betas
                     ci=0.95,            # Desired confidence interval
                     constant=NA,        # Column containing the constant;
                                         #  no constant because model is differenced
                     #phi=phi,           # estimated AR parameters; length must match lagY 
                     #lagY=lagY,         # lags of y, most recent last
                     transform="diff",   # "log" to undo log transformation,
                                         # "diff" to under first differencing
                                         # "difflog" to do both
                     initialY=initialY   # for differenced models, lag of level of y
                     )


at.xaxis <- seq(from = -1, to = length(approve[1:25]) + nPeriodsToSim, by = 12)
lab.xaxis <- seq(from = 2001, by = 1, length.out = length(at.xaxis))

pdf("noIraqARIMA.pdf",width=6,height=3.25)
ctrfactTS(observed = approve[1:25],
          predicted = noIraq.ev1$pe,
          lower = noIraq.ev1$lower,
          upper = noIraq.ev1$upper,
          #se = NULL,
          predicted0 = Iraq.ev1$pe,
          lower0 = Iraq.ev1$lower,
          upper0 = Iraq.ev1$upper,
          #se0 = NULL,
          factual = approve[26:65],
          at.xaxis = at.xaxis,
          lab.xaxis = lab.xaxis,
          #ylim = c(0, 100),
          main = "Ctrfactual Effect of No Iraq War from ARIMA(0,1,0)",
          xlab = "Year",
          ylab = "Presidential Approval (%)",
          col = "red",
          col0 = "blue")
dev.off()         





# What if we used an ARMA model in this case (ignored possible
# unit root)?  Would we get better long run estimates?

#################################################################
## Model 2a:  ARIMA(1,0,0) model of approve
##

## Estimate an ARIMA(1,0,0) using arima
xcovariates <- cbind(sept.oct.2001, iraq.war, avg.price)
arima.res2a <- arima(approve, order = c(1,0,0),
                     xreg = xcovariates, include.mean = TRUE
                     )
print(arima.res2a)

# Extract estimation results from arima.res1a
pe.2a <- arima.res2a$coef                    # parameter estimates (betas)
se.2a <- sqrt(diag(arima.res2a$var.coef))    # standard errors
ll.2a <- arima.res2a$loglik                  # log likelihood at its maximum
sigma2hat.2a <- arima.res2a$sigma2           # standard error of the regression
aic.2a <- arima.res2a$aic                    # Akaike Information Criterion
resid.2a <- arima.res2a$resid                # residuals

# First we need the simulated parameters
sims <- 10000
pe <- arima.res2a$coef
vc <- arima.res2a$var.coef
simparams <- mvrnorm(sims, pe, vc)
simbetas <- simparams[, (1+sum(arima.res2a$arma[1:2])):ncol(simparams)]
simphi <- simparams[, 1:arima.res2a$arma[1]]  # if AR(1) or greater

# Choose counterfactual periods
periodsToSim <- seq(from=26, to=65, by=1)
nPeriodsToSim <- length(periodsToSim)

## Create hypothetical covariates over these periods
# Start with factual values
# For ARIMA, need to enter these covariates in same order as model
model <- approve ~ sept.oct.2001 + iraq.war + avg.price
selectdata <- extractdata(model, approval, na.rm = TRUE)
perioddata <- selectdata[periodsToSim,]

# Treat factual data as the baseline counterfactual
xhyp0 <- subset(perioddata, select = -approve )

# Suppose no Iraq War occurred
xhyp <- subset(perioddata, select = -approve )
xhyp$iraq.war <- rep(0,nPeriodsToSim)

# Construction of prior lags depends on order of ARIMA
# Only need initialY for I(1)
# Only need lagY for AR(1) or higher
lagY <- selectdata$approve[periodsToSim[1] - 1]
            
# Simulate expected values of Y out to periods
# given hypothetical values of X, and an initial level of Y
# No Iraq scenario
noIraq.ev1 <- ldvsimev(xhyp,               # The matrix of hypothetical x's
                       simbetas,           # The matrix of simulated betas
                       ci=0.95,            # Desired confidence interval
                       constant=1,         # Column containing the constant;
                       phi=mean(simphi),   # estimated AR parameters; length must match lagY 
                       lagY=lagY           # lags of y, most recent last
                       )

# Iraq scenario (factual)
Iraq.ev1 <- ldvsimev(xhyp0,              # The matrix of hypothetical x's
                     simbetas,           # The matrix of simulated betas
                     ci=0.95,            # Desired confidence interval
                     constant=1,         # Column containing the constant;
                     phi=mean(simphi),   # estimated AR parameters; length must match lagY 
                     lagY=lagY           # lags of y, most recent last
                     )


at.xaxis <- seq(from = -1, to = length(approve[1:25]) + nPeriodsToSim, by = 12)
lab.xaxis <- seq(from = 2001, by = 1, length.out = length(at.xaxis))

pdf("noIraqARMA.pdf",width=6,height=3.25)
ctrfactTS(observed = approve[1:25],
          predicted = noIraq.ev1$pe,
          lower = noIraq.ev1$lower,
          upper = noIraq.ev1$upper,
          #se = NULL,
          predicted0 = Iraq.ev1$pe,
          lower0 = Iraq.ev1$lower,
          upper0 = Iraq.ev1$upper,
          #se0 = NULL,
          factual = approve[26:65],
          at.xaxis = at.xaxis,
          lab.xaxis = lab.xaxis,
          #ylim = c(0, 100),
          main = "Ctrfactual Effect of No Iraq War (Red) from AR(1)",
          xlab = "Year",
          ylab = "Presidential Approval (%)",
          col = "red",
          col0 = "blue")
dev.off()         





#############
# What if we used the linear regression model with a lagged DV?

lm.res3 <- lm(approve ~ approveLag + sept.oct.2001 + iraq.war + avg.price)
print(summary(lm.res3))

# Extract estimation results from arima.res1a
pe.3 <- coef(lm.res3)                # parameter estimates
vc.3 <- vcov(lm.res3)                # variance-covariance (of params)
se.3 <- sqrt(diag(vc.3))             # standard errors

# First we need the simulated parameters
sims <- 10000
simparams <- mvrnorm(sims, pe.3, vc.3)
# Pluck out the lag parameter
simphi <- simparams[,2]
# Get the rest of the betas
simbetas <- simparams[,c(1,3:ncol(simparams))]

# Choose counterfactual periods
periodsToSim <- seq(from=26, to=65, by=1)
nPeriodsToSim <- length(periodsToSim)

## Create hypothetical covariates over these periods
# Start with factual values (note I leave out the lag;
# ldvsimev() will construct it)
model <- approve ~ sept.oct.2001 + iraq.war + avg.price
selectdata <- extractdata(model, approval, na.rm = TRUE)
perioddata <- selectdata[periodsToSim,]

# Treat factual data as the baseline counterfactual
xhyp0 <- subset(perioddata, select = -approve )

# Suppose no Iraq War occurred
xhyp <- subset(perioddata, select = -approve )
xhyp$iraq.war <- rep(0,nPeriodsToSim)

# Construction of prior lags depends on order of ARIMA
# Only need initialY for I(1)
# Only need lagY for AR(1) or higher
lagY <- selectdata$approve[periodsToSim[1] - 1]
            
# Simulate expected values of Y out to periods
# given hypothetical values of X, and an initial level of Y
# No Iraq scenario
noIraq.ev1 <- ldvsimev(xhyp,               # The matrix of hypothetical x's
                       simbetas,           # The matrix of simulated betas
                       ci=0.95,            # Desired confidence interval
                       constant=1,         # Column containing the constant;
                       phi=mean(simphi),   # estimated AR parameters; length must match lagY 
                       lagY=lagY           # lags of y, most recent last
                       )

# Iraq scenario (factual)
Iraq.ev1 <- ldvsimev(xhyp0,              # The matrix of hypothetical x's
                     simbetas,           # The matrix of simulated betas
                     ci=0.95,            # Desired confidence interval
                     constant=1,         # Column containing the constant;
                     phi=mean(simphi),   # estimated AR parameters; length must match lagY 
                     lagY=lagY           # lags of y, most recent last
                     )


at.xaxis <- seq(from = -1, to = length(approve[1:25]) + nPeriodsToSim, by = 12)
lab.xaxis <- seq(from = 2001, by = 1, length.out = length(at.xaxis))

pdf("noIraqLS.pdf",width=6,height=3.25)
ctrfactTS(observed = approve[1:25],
          predicted = noIraq.ev1$pe,
          lower = noIraq.ev1$lower,
          upper = noIraq.ev1$upper,
          #se = NULL,
          predicted0 = Iraq.ev1$pe,
          lower0 = Iraq.ev1$lower,
          upper0 = Iraq.ev1$upper,
          #se0 = NULL,
          factual = approve[26:65],
          at.xaxis = at.xaxis,
          lab.xaxis = lab.xaxis,
          #ylim = c(0, 100),
          main = "Ctrfactual Effect of No Iraq War (Red) from LS with Lagged DV",
          xlab = "Year",
          ylab = "Presidential Approval (%)",
          col = "red",
          col0 = "blue")
dev.off()         








################################################
# Cointegration analysis
cointvars <- cbind(approve,avg.price)

ecm.test1 <- ca.jo(cointvars,
                   ecdet = "const",
                   type="eigen",
                   K=2,
                   spec="longrun",
                   dumvar=cbind(sept.oct.2001,iraq.war)
                   )

ecm.res1 <- cajorls(ecm.test1,
                    r = 1,
                    reg.number = 1)

summary(ecm.res1$rlm)



########################################################
# Cointegration analysis with a spurious regressor
cointvars <- cbind(approve,avg.price,phony)

ecm.test1 <- ca.jo(cointvars,
                   ecdet = "const",
                   type="eigen",
                   K=2,
                   spec="longrun",
                   dumvar=cbind(sept.oct.2001,iraq.war)
                   )

ecm.res1 <- cajorls(ecm.test1,
                    r = 1,
                    reg.number = 1)

summary(ecm.res1$rlm)