## Count model examples using HOAdata.org data: ## Poisson, Negative Binomial, Zero-Inflated Negative Binomial ## ## Christopher Adolph faculty.washington.edu ## 18 November 2016 ## Clear memory rm(list=ls()) ## Load libraries library(MASS) # For mvrnorm() library(tile) # For graphics library(simcf) # For post-estimation simulation library(pscl) # For zero inflation count models library(VGAM) # For GAMs to explore specifications library(RColorBrewer) # For nice colors ## Set some parameters sims <- 10000 doCV <- FALSE ## Get nice colors brewer <- brewer.pal(9, "Set1") purple <- brewer[4] # Poisson on full data red <- brewer[1] # Poisson on non-zeros blue <- brewer[2] # Negative Binomial on non-zeros orange <- brewer[5] # Zero-inflated Negative Binomial green <- brewer[3] # Zero-inflated Poisson brown <- brewer[7] # Quasipoisson nicegray <- "gray45" ## Load data file <- "hoaCounts.csv" data <- read.csv(file) ## Restrict to places with >=100 homes data <- data[data$nhomes>=100,] ## Construct exposure variable and add to dataset nyears <- length(1995:2001) data$exposure <- data$nhomes*nyears ## Explore relationship between ever-filers and age of neighborhood data$everfile <- as.numeric(data$file9501>0) gam.res <- vgam(everfile~s(mdyear) + mdvaluation + exposure, data=data, family=binomialff) ## GAM suggests constructing new neighborhood (after 1975) covariate data$post1975construction <- as.numeric(data$mdyear>1975) ## Create alternate "no zeros" dataset dataNZ <- data[data$file9501>0,] ## Process data for model m1 <- file9501 ~ log(mdvaluation) + post1975construction m1data <- extractdata(m1, data, extra=~exposure, na.rm = TRUE) m1dataNZ <- extractdata(m1, dataNZ, extra=~exposure, na.rm = TRUE) ## Estimate models ## Poisson using GLM with zeros included glm.poisWZ <- glm(m1, m1data, family=poisson, offset=log(m1data$exposure)) pe.poisWZ <- coef(glm.poisWZ) # point estimates vc.poisWZ <- vcov(glm.poisWZ) # var-cov matrix se.poisWZ <- sqrt(diag(vc.poisWZ)) # standard errors ll.poisWZ <- logLik(glm.poisWZ) # likelihood at maximum aic.poisWZ <- AIC(glm.poisWZ) ## Set up counterfactuals (range over mdvaluation, all else equal) valueseq <- seq(35000, 400000, 1000) xhypWZ <- cfMake(m1, m1data, nscen = length(valueseq)) for (i in 1:length(valueseq)) xhypWZ <- cfChange(xhypWZ, "mdvaluation", x=valueseq[i], scen=i) ## Simulate results simbetas.poisWZ <- mvrnorm(sims, pe.poisWZ, vc.poisWZ) # draw parameters yhyp.poisWZ <- loglinsimev(xhypWZ, simbetas.poisWZ) # Transform expected counts to expected rates yhyp.poisWZ$pe <- yhyp.poisWZ$pe*1000 yhyp.poisWZ$lower <- yhyp.poisWZ$lower*1000 yhyp.poisWZ$upper <- yhyp.poisWZ$upper*1000 # Set up lineplot trace of result for plotting using tile trace1poisWZ <- lineplot(x = valueseq/1000, y = yhyp.poisWZ$pe, lower = yhyp.poisWZ$lower, upper = yhyp.poisWZ$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1data), cfact=model.frame(m1, xhypWZ$x), omit.extrapolated=TRUE), col = purple, plot = 1 ) poisLabWZ <- textTile(labels="Poisson including excess zeros", x=77, y=3, col=purple, cex=1, plot=1) rug1 <- rugTile(x=m1data$mdvaluation[(m1data$mdvaluation>35000) &(m1data$mdvaluation<400000)]/1000, type="jitter", col=nicegray, alpha=0.4, plot=1) xat <- c(50, 100, 200) xlabs <- c("$50k", "$100k", "$200k") ## Plot traces using tile file <- "PoissonWZ" tile(trace1poisWZ, rug1, poisLabWZ, RxC = c(1,1), limits = c(35,400,0,28.5), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Expected HOA foreclosure filings per 1000 homes per year", x=0.39, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) # Poisson using GLM with zeros removed glm.poisNZ <- glm(m1, m1dataNZ, family=poisson, offset=log(m1dataNZ$exposure)) pe.poisNZ <- coef(glm.poisNZ) # point estimates vc.poisNZ <- vcov(glm.poisNZ) # var-cov matrix se.poisNZ <- sqrt(diag(vc.poisNZ)) # standard errors ll.poisNZ <- logLik(glm.poisNZ) # likelihood at maximum aic.poisNZ <- AIC(glm.poisNZ) # Set up counterfactuals (range over mdvaluation, all else equal) valueseq <- seq(35000, 400000, 1000) xhypNZ <- cfMake(m1, m1dataNZ, nscen = length(valueseq)) for (i in 1:length(valueseq)) xhypNZ <- cfChange(xhypNZ, "mdvaluation", x=valueseq[i], scen=i) # Simulate results simbetas.poisNZ <- mvrnorm(sims, pe.poisNZ, vc.poisNZ) # draw parameters yhyp.poisNZ <- loglinsimev(xhypNZ, simbetas.poisNZ) # Transform expected counts to expected rates yhyp.poisNZ$pe <- yhyp.poisNZ$pe*1000 yhyp.poisNZ$lower <- yhyp.poisNZ$lower*1000 yhyp.poisNZ$upper <- yhyp.poisNZ$upper*1000 # Set up lineplot trace of result for plotting using tile trace1poisNZ <- lineplot(x = valueseq/1000, y = yhyp.poisNZ$pe, lower = yhyp.poisNZ$lower, upper = yhyp.poisNZ$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1dataNZ), cfact=model.frame(m1, xhypNZ$x), omit.extrapolated=TRUE), col = red, plot = 1 ) poisLabNZ <- textTile(labels="Poisson with all zeros deleted", x=81, y=12.5, col=red, cex=1, plot=1) rug1 <- rugTile(x=m1dataNZ$mdvaluation[(m1dataNZ$mdvaluation>35000) &(m1dataNZ$mdvaluation<400000)]/1000, type="jitter", col=nicegray, alpha=0.4, plot=1) xat <- c(50, 100, 200) xlabs <- c("$50k", "$100k", "$200k") ## Plot traces using tile file <- "PoissonNZ" tile(trace1poisNZ, rug1, poisLabNZ, RxC = c(1,1), limits = c(35,400,0,28.5), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Expected HOA foreclosure filings per 1000 homes per year", x=0.39, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) ## Re-analyze using negative binomial regression, zeros excluded ## Note: glm.nb requires offset be included in the model formula thus m1nb <- file9501 ~ log(mdvaluation) + post1975construction + offset(log(exposure)) nb.result <- glm.nb(m1nb, data=m1dataNZ) pe.nb <- coef(nb.result) vc.nb <- vcov(nb.result) se.nb <- sqrt(diag(vc.nb)) ll.nb <- logLik(nb.result) aic.nb <- AIC(nb.result) theta.nb <- nb.result$theta setheta.nb <- nb.result$SE.theta # Simulate results (we re-use xhyp from above) simbetas.nb <- mvrnorm(sims, pe.nb, vc.nb) # draw parameters yhyp.nb <- loglinsimev(xhypNZ, simbetas.nb) # Transform expected counts to expected rates yhyp.nb$pe <- yhyp.nb$pe*1000 yhyp.nb$lower <- yhyp.nb$lower*1000 yhyp.nb$upper <- yhyp.nb$upper*1000 # Set up lineplot trace of result for plotting using tile trace1nb <- lineplot(x = valueseq/1000, y = yhyp.nb$pe, lower = yhyp.nb$lower, upper = yhyp.nb$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1dataNZ), cfact=model.frame(m1, xhypNZ$x), omit.extrapolated=TRUE), col = blue, plot = 1 ) poisLabNB <- textTile(labels="Negative Binomial with all zeros deleted", x=98, y=12.5, col=blue, cex=1, plot=1) xat <- c(50, 100, 200) xlabs <- c("$50k", "$100k", "$200k") ## Plot traces using tile file <- "NegativeBinomialNZ" tile(trace1nb, rug1, poisLabNB, limits = c(35,400,0,28.5), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Expected HOA foreclosure filings per 1000 homes per year", x=0.39, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) ## Quasipoisson using GLM with zeros removed glm.qpoisNZ <- glm(m1, m1dataNZ, family=quasipoisson, offset=log(m1dataNZ$exposure)) pe.qpoisNZ <- coef(glm.qpoisNZ) # point estimates vc.qpoisNZ <- vcov(glm.qpoisNZ) # var-cov matrix se.qpoisNZ <- sqrt(diag(vc.qpoisNZ)) # standard errors ll.qpoisNZ <- logLik(glm.qpoisNZ) # likelihood at maximum aic.qpoisNZ <- AIC(glm.qpoisNZ) ## Simulate results simbetas.qpoisNZ <- mvrnorm(sims, pe.qpoisNZ, vc.qpoisNZ) # draw parameters yhyp.qpoisNZ <- loglinsimev(xhypNZ, simbetas.qpoisNZ) ## Transform expected counts to expected rates yhyp.qpoisNZ$pe <- yhyp.qpoisNZ$pe*1000 yhyp.qpoisNZ$lower <- yhyp.qpoisNZ$lower*1000 yhyp.qpoisNZ$upper <- yhyp.qpoisNZ$upper*1000 ## Set up lineplot trace of result for plotting using tile trace1qpoisNZ <- lineplot(x = valueseq/1000, y = yhyp.qpoisNZ$pe, lower = yhyp.qpoisNZ$lower, upper = yhyp.qpoisNZ$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1dataNZ), cfact=model.frame(m1, xhypNZ$x), omit.extrapolated=TRUE), col = brown, plot = 1 ) qpoisLabNZ <- textTile(labels="Quasipoisson with all zeros deleted", x=93, y=12.5, col=brown, cex=1, plot=1) # Switch negative binomial CIs to dashed lines trace1nb$ci$mark <- "dashed" # Move negative binomial labels poisLabNB$y <- 7.5 poisLabNB$x <- 140 ## Plot traces using tile file <- "QuasipoissonNZ" tile(trace1qpoisNZ, trace1nb, rug1, qpoisLabNZ, poisLabNB, limits = c(35,400,0,28.5), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Expected HOA foreclosure filings per 1000 homes per year", x=0.39, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) ## Switch poisson CIs to dashed lines trace1poisNZ$ci$mark <- "dashed" ## Move poisson labels poisLabNZ$y <- 7.5 poisLabNZ$x <- 125 ## Plot traces using tile file <- "QuasipoissonNZ0" tile(trace1qpoisNZ, rug1, qpoisLabNZ, poisLabNZ, trace1poisNZ, limits = c(35,400,0,28.5), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Expected HOA foreclosure filings per 1000 homes per year", x=0.39, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) ## Re-analyze using Zero-Inflated Negative Binomial ## Estimate ZINB using zeroinfl in the pscl library ## Notice the particular form of the formula -- count first, then zero-inflation m1zinb <- file9501 ~ log(mdvaluation) + post1975construction | log(mdvaluation) + post1975construction + log(exposure) zinb.result <- zeroinfl(m1zinb, m1data, dist="negbin", offset=log(m1data$exposure)) ## Extract results, taking care to keep track of both equations pe.zinb.count <- zinb.result$coefficients$count pe.zinb.zeros <- zinb.result$coefficients$zero pe.zinb <- c(pe.zinb.count,pe.zinb.zeros) vc.zinb <- vcov(zinb.result) se.zinb <- sqrt(diag(vc.zinb)) se.zinb.count <- se.zinb[1:length(pe.zinb.count)] se.zinb.zeros <- se.zinb[ (length(pe.zinb.count)+1) : length(se.zinb) ] ll.zinb <- logLik(zinb.result) aic.zinb <- AIC(zinb.result) theta.zinb <- zinb.result$theta setheta.zinb <- zinb.result$SE.theta # Simulate results (we re-use xhyp from above) # Option 1: E(count | not a structural zero) # { other options include unconditional E(count) and Pr(Structural Zero) } simparam.zinb <- mvrnorm(sims,pe.zinb,vc.zinb) # draw parameters simbetas.zinb <- simparam.zinb[,1:length(pe.zinb.count)] simgammas.zinb <- simparam.zinb[,(length(pe.zinb.count)+1) : length(se.zinb) ] yhyp.zinbC <- loglinsimev(xhypWZ, simbetas.zinb) ## Transform expected counts to expected rates yhyp.zinbC$pe <- yhyp.zinbC$pe*1000 yhyp.zinbC$lower <- yhyp.zinbC$lower*1000 yhyp.zinbC$upper <- yhyp.zinbC$upper*1000 ## Set up lineplot trace of result for plotting using tile trace1zinbC <- lineplot(x = valueseq/1000, y = yhyp.zinbC$pe, lower = yhyp.zinbC$lower, upper = yhyp.zinbC$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1data), cfact=model.frame(m1, xhypWZ$x), omit.extrapolated=TRUE), col = orange, plot = 1 ) poisLabZINB2 <- textTile(labels="Zero-Inflated Negative Binomial", x=80, y=17.5, col=orange, cex=1, plot=1) xat <- c(50, 100, 200) xlabs <- c("$50k", "$100k", "$200k") ## Plot traces using tile file <- "ZINBcond" tile(trace1zinbC, rug1, poisLabZINB2, RxC = c(1,1), limits = c(35,400,0,28.5), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Expected HOA foreclosure filings per 1000 homes per year", x=0.39, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) ## Option 3: Pr(Structural Zero) ## { other options include E(count|not a structural zero) and unconditional count } xhypWZ0 <- xhypWZ xhypWZ0$model <- file9501 ~ log(mdvaluation) + post1975construction + log(exposure) xhypWZ0$x$exposure <- xhypWZ$xpre$exposure <- mean(m1data$exposure) yhyp.zinbZ <- logitsimev(xhypWZ0, simgammas.zinb) ## Set up lineplot trace of result for plotting using tile trace1zinbZ <- lineplot(x = valueseq/1000, y = 1-yhyp.zinbZ$pe, lower = 1-yhyp.zinbZ$lower, upper = 1-yhyp.zinbZ$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1data), cfact=model.frame(m1, xhypWZ0$x), omit.extrapolated=TRUE), col = orange, plot = 1 ) poisLabZINB1 <- textTile(labels="Zero-Inflated Negative Binomial", x=189, y=0.3, col=orange, cex=1, plot=1) xat <- c(50, 100, 200) xlabs <- c("$50k", "$100k", "$200k") ## Plot traces using tile file <- "ZINBzeros" tile(trace1zinbZ, rug1, poisLabZINB1, RxC = c(1,1), limits = c(35,400,-0.01,1.01), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=6.5, outfile=file), xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels="Probability neighborhood has a foreclosure-filing HOA", x=0.341, y=0.75), height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), gridlines = list(type="y") ) ## Combined plot: revise some details trace1nb$plot <- 2 trace1nb$ci <- list(mark="dashed") trace1zinbC$plot <- 2 rug1$plot <- c(1,2) poisLabZINB2$plot <- 2 poisLabNB2 <- textTile(labels="Negative Binomial", x=115, y=7.5, col=blue, cex=1, plot=2) poisLabZINB1$x <- 205 poisLabZINB2$x <- 94 ## Plot traces using tile file <- "ZINBzc" tile(trace1zinbZ, trace1zinbC, rug1, poisLabZINB1, poisLabZINB2, RxC = c(1,2), limits = rbind(c(35,400,0,1.0), c(35,400,0,28.5)), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=10, outfile=file), #12 xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels1="Probability neighborhood has a filing HOA", labels2="Expected HOA foreclosure filings per 1000 homes", x1=0.32625, y1=0.75, x2=0.42025, y2=0.75), # 355 405 height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), width = list(spacer=3), gridlines = list(type="y") ) ## Plot traces using tile file <- "ZINBzc0" tile(trace1zinbZ, trace1zinbC, trace1nb, rug1, poisLabZINB1, poisLabZINB2, poisLabNB2, RxC = c(1,2), limits = rbind(c(35,400,0,1.0), c(35,400,0,28.5)), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=10, outfile=file, family="GillSans"), #12 xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels1="Probability neighborhood has a filing HOA", labels2="Expected HOA foreclosure filings per 1000 homes", x1=0.32625, y1=0.75, x2=0.42025, y2=0.75), # 355 405 height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), width = list(spacer=3), gridlines = list(type="y") ) embedGS(paste(file,".pdf",sep="")) ## Re-analyze using Zero-Inflated Poisson ## Estimate ZIP using zeroinfl in the pscl library zip.result <- zeroinfl(m1zinb, m1data, dist="poisson", offset=log(m1data$exposure)) ## Extract results, taking care to keep track of both equations pe.zip.count <- zip.result$coefficients$count pe.zip.zeros <- zip.result$coefficients$zero pe.zip <- c(pe.zip.count, pe.zip.zeros) vc.zip <- vcov(zip.result) se.zip <- sqrt(diag(vc.zip)) se.zip.count <- se.zip[1:length(pe.zip.count)] se.zip.zeros <- se.zip[ (length(pe.zip.count)+1) : length(se.zip) ] ll.zip <- logLik(zip.result) aic.zip <- AIC(zip.result) ## Simulate results (we re-use xhyp from above) ## Option 1: E(count | not a structural zero) ## { other options include unconditional E(count) and Pr(Structural Zero) } simparam.zip <- mvrnorm(sims,pe.zip,vc.zip) # draw parameters simbetas.zip <- simparam.zip[,1:length(pe.zip.count)] simgammas.zip <- simparam.zip[,(length(pe.zip.count)+1) : length(se.zip) ] yhyp.zipC <- loglinsimev(xhypWZ, simbetas.zip) ## Transform expected counts to expected rates yhyp.zipC$pe <- yhyp.zipC$pe*1000 yhyp.zipC$lower <- yhyp.zipC$lower*1000 yhyp.zipC$upper <- yhyp.zipC$upper*1000 ## Set up lineplot trace of result for plotting using tile trace1zipC <- lineplot(x = valueseq/1000, y = yhyp.zipC$pe, lower = yhyp.zipC$lower, upper = yhyp.zipC$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1data), cfact=model.frame(m1, xhypWZ$x), omit.extrapolated=TRUE), col = green, plot = 1 ) poisLabZIP2 <- textTile(labels="Zero-Inflated Poisson", x=120, y=7.5, col=green, cex=1, plot=1) ## Option 3: Pr(Structural Zero) ## { other options include E(count|not a structural zero) and unconditional count } yhyp.zipZ <- logitsimev(xhypWZ0, simgammas.zip) ## Set up lineplot trace of result for plotting using tile trace1zipZ <- lineplot(x = valueseq/1000, y = 1-yhyp.zipZ$pe, lower = 1-yhyp.zipZ$lower, upper = 1-yhyp.zipZ$upper, ci = list(mark="shaded"), lex=1.5, extrapolate = list(data=model.frame(m1, m1data), cfact=model.frame(m1, xhypWZ0$x), omit.extrapolated=TRUE), col = green, plot = 1 ) poisLabZIP1 <- textTile(labels="Zero-Inflated Poisson", x=205, y=0.133, col=green, cex=1, plot=1) ## Combined plot: revise some details trace1zipC$plot <- 2 poisLabZIP2$plot <- 2 trace1zinbZ$ci$mark <- "dashed" trace1zinbC$ci$mark <- "dashed" poisLabZINB1$x <- 75 poisLabZINB1$y <- 0.5 ## Plot traces using tile file <- "ZIPzc" tile(trace1zipZ, trace1zipC, rug1, poisLabZIP1, poisLabZIP2, RxC = c(1,2), limits = rbind(c(35,400,0,1.0), c(35,400,0,28.5)), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=10, outfile=file), #12 xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels1="Probability neighborhood has a filing HOA", labels2="Expected HOA foreclosure filings per 1000 homes", x1=0.32625, y1=0.75, x2=0.42025, y2=0.75), # 355 405 height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), width = list(spacer=3), gridlines = list(type="y") ) ## Plot traces using tile file <- "ZIPzc0" tile(trace1zipZ, trace1zipC, rug1, trace1zinbZ, trace1zinbC, poisLabZIP1, poisLabZIP2, poisLabZINB1, poisLabZINB2, RxC = c(1,2), limits = rbind(c(35,400,0,1.0), c(35,400,0,28.5)), yaxis=list(major=FALSE), xaxis=list(log=TRUE, at=xat, labels=xlabs), output = list(wide=10, outfile=file), #12 xaxistitle = list(labels="Median home price of neighborhood"), plottitle = list(labels1="Probability neighborhood has a filing HOA", labels2="Expected HOA foreclosure filings per 1000 homes", x1=0.32625, y1=0.75, x2=0.42025, y2=0.75), # 355 405 height = list(plot="golden", xaxistitle=3.5, plottitle=3.5), width = list(spacer=3), gridlines = list(type="y") )