# Load the trial data set.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
# header = TRUE is spelled out because T is an ordinary variable that can be reassigned.
dat <- read.csv("/home/pgilbert/BetzSummerInstitute/Institute2011/Data/Vax004data_Seattle_SummerInstitute_2011.csv", header = TRUE)

# Phase 2 selection indicator: 1 when both the neutralization level (neut) and
# the CD4 blocking response (CD4block) are non-missing for the subject, 0 otherwise.
dat$phase2select <- ifelse(!is.na(dat$neut) & !is.na(dat$CD4block), 1, 0)

# Note: Only vaccinees have neut and CD4blocking immune responses measured-- so all of the
# analyses assess vaccinees only

##################################################
# First analyze the data using standard weights:

# The phase 2 sample was selected stratifying on sex and white/non-white
# Define the stratum phase-2 sampling probabilities.
#
# We will use Borgan et al.'s (2000, Lifetime Data Analysis) Estimator II,
# a D estimator for which only controls (uninfected subjects) constitute the subcohort
# (the infected subjects are handled separately).  Estimator II is appropriate for
# retrospective outcome-dependent sampling.

# Because Estimator II is used, the probabilities are defined for controls only
#
# Estimate the Phase 2 selection probabilities for the controls,
# with stratification by gender x white/nonwhite
#
# By an odd choice of hard-coding in cch, these sampling fractions are defined for all subjects
# (pooling over cases and controls), and the cch function 'subtracts out' the cases
#
# Important Note: cch assumes complete phase-2/immune biomarker data in cases, it does not allow
# sub-sampling of cases

# Stratum-specific phase 2 sampling fractions among vaccinees (trt == 1):
#   (number in the stratum with neut measured) / (number in the stratum),
# pooled over infected and uninfected subjects, for each sex x white stratum.
#
# All four strata must use the same definition.  The women / non-white numerator
# used to carry an extra dat$infect==0 filter that neither its own denominator
# nor the other three strata had; it is removed so the fraction is pooled over
# cases and controls like the rest.

# sex == 0, white == 1
alphamenwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==0 & dat$white==1])/
              length(dat$neut[dat$trt==1 & dat$sex==0 & dat$white==1])

# sex == 1, white == 1
alphawomenwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==1 & dat$white==1])/
                length(dat$neut[dat$trt==1 & dat$sex==1 & dat$white==1])

# sex == 0, white == 0
alphamennonwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==0 & dat$white==0])/
                 length(dat$neut[dat$trt==1 & dat$sex==0 & dat$white==0])

# sex == 1, white == 0
alphawomennonwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==1 & dat$white==0])/
                   length(dat$neut[dat$trt==1 & dat$sex==1 & dat$white==0])

# Subset on vaccinated subjects selected for Phase 2
kp <- dat$phase2select==1

# Subcohort membership within the Phase 2 subjects: uninfected subjects (infect == 0)
in.subcohort <- dat$infect[kp]==0
insubcohortinds <- in.subcohort

# Sampling stratum of every Phase 2 subject:
#   0 = sex 0 & white 1,  1 = sex 1 & white 1,  2 = sex 0 & white 0,  3 = everything else.
# The column is addressed by its full name dat$white: the abbreviation dat$wh only
# works through partial matching of `$` and silently yields NULL as soon as a
# second column name starts with "wh".
stratuminds <- ifelse(dat$sex[kp]==0 & dat$white[kp]==1,0,ifelse(dat$sex[kp]==1 & dat$white[kp]==1,1,
               ifelse(dat$sex[kp]==0 & dat$white[kp]==0,2,3)))

# Cohort size per stratum: number of subcohort members in the stratum divided by
# the stratum's sampling fraction, rounded down.
cohortstratasizes <- floor(table(stratuminds[insubcohortinds])*c(1/alphamenwh,1/alphawomenwh,1/alphamennonwh,1/alphawomennonwh))

# Number of subjects selected for Phase 2
phase2size <- length(dat$phase2select[dat$phase2select==1])

# We will analyze centered & standardized versions of the putative CoRs:

# Center each marker at its observed mean and divide by its observed standard
# deviation; missing values are ignored in both statistics and stay missing.
# sd(x, na.rm = TRUE) is the same quantity as sqrt(var(x[!is.na(x)])), and TRUE is
# spelled out because T can be reassigned.
dat$neut <- (dat$neut - mean(dat$neut, na.rm = TRUE)) / sd(dat$neut, na.rm = TRUE)
dat$CD4block <- (dat$CD4block - mean(dat$CD4block, na.rm = TRUE)) / sd(dat$CD4block, na.rm = TRUE)

# Sampling probabilities depend on sex and white/non-white; so important to control for both of these.
# Also control for riskscore because it predicts infection.

library(survival)
 
# Case-cohort Cox fit (method "II.Borgan") of time to infection on neut, adjusting
# for sex, white and riskscore.  Every term is restricted to the rows flagged by kp;
# stratuminds and in.subcohort are already restricted to those rows, and the ids are
# simply 1..phase2size in that row order.
fit1 <- cch(Surv(dat$daysinfect[kp],dat$infect[kp]) ~ dat$neut[kp] + dat$sex[kp] 
+ dat$white[kp] + dat$riskscore[kp],
stratum=stratuminds,subcoh=in.subcohort,
id=c(1:phase2size),cohort.size=cohortstratasizes,method="II.Borgan")

# Same model with CD4block in place of neut.
fit2 <- cch(Surv(dat$daysinfect[kp],dat$infect[kp]) ~ dat$CD4block[kp] + dat$sex[kp]
+ dat$white[kp] + dat$riskscore[kp],
stratum=stratuminds,subcoh=in.subcohort,
id=c(1:phase2size),cohort.size=cohortstratasizes,method="II.Borgan")

#############################################################################
# Second, we analyze the data using calibrated weights (Breslow et al. 2009 method)

########################################################################################
# Function to implement Breslow et al. (2009) weighted 2-phase Cox model analysis:

# dat is a data.frame
# imputation.model is a formula that gives a model to impute missing data
# interest.model is a formula that gives the model we are interested to fit
# strata.formula is a formula that gives how the two-phase sampling is done
# subset is a vector of logicals that give which observations are selected for phase 2
#
# See the documentation of the survey package in R for details.

calibrated.weights.coxph = function (dat, imputation.model, interest.model, strata.formula, subset) {
    # Two-phase Cox regression with calibrated weights, in three steps:
    #   1. predict the phase-2 covariate for every row of dat from the imputation model,
    #   2. fit the model of interest to the data with predicted covariate values and
    #      take its dfbeta residuals as auxiliary variables,
    #   3. calibrate the two-phase design to those auxiliaries and fit svycoxph.
    #
    # Arguments:
    #   dat               data.frame handed to twophase() as data
    #   imputation.model  formula; its left-hand side names the covariate to predict,
    #                     either a bare column name or logit(<column name>)
    #   interest.model    formula of the Cox model to fit
    #   strata.formula    formula used as the phase-2 strata in twophase()
    #   subset            passed unchanged as the subset argument of twophase()
    #
    # Value: the svycoxph fit.  The last statement is an assignment, so the value is
    # returned invisibly.
    #
    # NOTE(review): contain() and expit() are not defined in this part of the file;
    # presumably they come from the youtil.R helpers sourced further down -- confirm.

    #Step 1: Predict missing covariates for all subjects (not just for those with missing covariates)
    dstrat<-twophase(id=list(~1,~1),strata=list(NULL,strata.formula),subset=subset,data=dat) 
    fit.step1 = svyglm(imputation.model, design=dstrat)
    # NOTE(review): se=F presumably abbreviates se.fit=FALSE and relies on F still
    # being FALSE -- confirm against the predict method for svyglm.
    predicted = predict(fit.step1,type="response",newdata=dat,se=F)
    #The left hand side in the imputation model may be a variable name, e.g., s, or a transformation, e.g. logit(s)
    lhs=as.character(imputation.model)[2]
    # Work on a copy so that dat itself keeps the observed covariate values
    dat.step1 = dat 
    if (contain(lhs, "(")) {
        # Split "transf(name)" into the transformation name and the column name
        tmp=strsplit(lhs,"[\\()]")[[1]]
        transf = tmp[1]
        lhs = tmp[2]
        # Only logit is handled; its predictions are mapped back through expit
        if (transf=="logit") transf.f=expit else stop ("transformation not supported")
        dat.step1[,lhs] <- transf.f(predicted)
    } else {
        # Untransformed left-hand side: overwrite the column with the predictions
        dat.step1[,lhs] <- predicted
    }

# Step 2: Fit an augmented dataset with risk model to get the auxiliary variable: dfbeta
    # Rebuild the formula from its deparsed left- and right-hand sides
    tmp = as.character(interest.model); interest.model.str = paste(tmp[2],"~",tmp[3])
    calmodel<-coxph(as.formula(interest.model.str), data=dat.step1 )
    # dfbeta residuals of the Cox fit, shifted by 1
    db = resid(calmodel,"dfbeta", data=dat.step1)+1 
    
    # Name the auxiliary columns db1, db2, ... and attach them to the original data
    colnames(db)<-paste("db",1:ncol(db),sep="")
    datDB = cbind(dat, db)
    dstrt<-twophase(id=list(~1,~1),strata=list(NULL,strata.formula),subset=subset,data=datDB)
   
# Step 3: IPW fitting using the calibrated weights:
    # Calibration totals: number of rows of dat for the intercept, and the column
    # sums of db taken over all rows
    dcal<-calibrate(dstrt,formula=make.formula(colnames(db)),pop=c(`(Intercept)`=nrow(dat),colSums(db)),calfun="raking",eps=0.0001)
# Alternative with Robins, Rotnitzky, Zhao (1994) weights:
#    dcal<-calibrate(dstrt,formula=make.formula(colnames(db)),pop=c(`(Intercept)`=nrow(dat),colSums(db)),calfun="rrz",eps=0.0001)
    cal<-svycoxph(as.formula(interest.model.str), design=dcal)    
}


##########################################################################
# Now apply the Breslow et al. method to the data

# For this method, make all cases (HIV infections) a separate stratum
# So now the stratum variable has 5 levels:
# Stratum 0 is all infected subjects; strata 1-4 are for uninfected subjects
# cross-classified by gender and white/nonwhite

# Sampling stratum used by the calibrated-weights analysis.
# Every subject starts in stratum 0; subjects with infect == 0 are then moved to
#   1 = sex 0 & white 1,  2 = sex 1 & white 1,  3 = sex 0 & white 0,  4 = all others.
dat$strata <- 0
kp <- dat$infect == 0
dat$strata[kp] <- with(
  dat[kp, ],
  ifelse(
    sex == 0 & white == 1, 1,
    ifelse(
      sex == 1 & white == 1, 2,
      ifelse(sex == 0 & white == 0, 3, 4)
    )
  )
)

# Source some helper functions needed to implement the calibrated weights Cox model:
source ("http://youtil.googlecode.com/files/youtil.R")
library(survey)

# --- neut -------------------------------------------------------------------
# Predict neut from infectassay, then fit the Cox model of interest with
# calibrated weights; print the model summary and the variance matrix.
imputation.model <- neut ~ infectassay
interest.model <- Surv(daysinfect, infect) ~ neut + sex + white + riskscore
strata.formula <- ~strata
subset <- dat$phase2select

fit3 <- calibrated.weights.coxph(dat, imputation.model, interest.model, strata.formula, subset)
summary(fit3)
fit3$var


# --- CD4block ---------------------------------------------------------------
# Same analysis with CD4block as the marker.
imputation.model <- CD4block ~ infectassay
interest.model <- Surv(daysinfect, infect) ~ CD4block + sex + white + riskscore
strata.formula <- ~strata
subset <- dat$phase2select

fit4 <- calibrated.weights.coxph(dat, imputation.model, interest.model, strata.formula, subset)
summary(fit4)
fit4$var


###########################################################################
# Third, evaluate neut and CD4block as specific SoPs via the method
# Gilbert and Hudgens (2008, Biometrics)

#############################################################################################
# PROGRAM:      GilbertHudgens060709R3PrincipalSurrogateCode
# AUTHOR:       Michael Hudgens         
# EMAIL:        mhudgens@bios.unc.edu
# WEB:          http://www.bios.unc.edu/~mhudgens/
# DATE:         2 Nov 2007
#
# DESCRIPTION:  R code to accompany Gilbert and Hudgens Biometrics paper (2008)
#		"Evaluating Candidate Principal Surrogate Endpoints".
#		Point estimates, CIs, and p-values associated with
#		CEP, PAE, and AS are computed under A4-NP as described
#	 	in the paper. The Breslow-Day type test is also computed.
#
# DISCLAIMER:
# 
#       THIS INFORMATION IS PROVIDED "AS IS".  THERE ARE NO
#       WARRANTIES, EXPRESSED OR IMPLIED, AS TO MERCHANTABILITY OR
#       FITNESS FOR A PARTICULAR PURPOSE REGARDING THE ACCURACY OF
#       THE MATERIALS OR CODE CONTAINED HEREIN.
#
# TO EXECUTE:
#	1. create a dataframe called "mydata" with variables 
#			mydata$s		observed surrogate in treatment arm 1, i.e., S=S(1)
#			mydata$w		baseline predictor 
#			mydata$y		binary outcome: 0 or 1
#			mydata$delta		indicator whether w sampled
#			mydata$rx		treatment arm: 0 or 1 for placebo / vaccine
#	
#	mydata$s and mydata$w should be discrete variables having four levels (numeric variables taking values
#       1, 2, 3, 4 to indicate the four levels). 
#	It is assumed there is no missing data, other than for s and w.  
#       Missing values should equal NA.
#       Note for treatment arm 1 subjects, delta=1 implies both w and s are measured;
#       for treatment arm 0 subjects, delta=1 implies only w is measured. For both
#       treatment arms delta = 0 implies neither w nor s are measured.
#
#	2. specify number of bootstrap replicates (boots) and link function below
#
#	3. execute all of the code below (or save it in a file and source it)
#
#############################################################################################

# Assess the neut variable as a specific SoP

# Create quartilized versions of neut and of the BIP, the infectivity assay

# By convention, Gilbert and Hudgens assumes the s=4 level is the constant biomarker level,
# i.e., a negative response.  Therefore, the quartilized neut variable takes 1 to be the highest neut quartile,
# and 4 to be the lowest neut quartile (hence the '-' in the next bit of code).

# Quartile level of neut, reversed so that 1 is the highest neut quartile and 4 the
# lowest: -neut is cut at its own 25%, 50% and 75% quantiles.
# findInterval() counts how many break points are <= the value, so adding 1 gives
# the level 1..4 (values below the first break point get 1); missing values stay NA.
breakpts <- quantile(-dat$neut, probs = c(.25, .5, .75), na.rm = TRUE)
dat$neutquart <- findInterval(-dat$neut, breakpts) + 1

# Quartile level of the infectivity assay, in its natural order (1 = lowest quartile)
breakpts <- quantile(dat$infectassay, probs = c(.25, .5, .75), na.rm = TRUE)
dat$infectassayquart <- findInterval(dat$infectassay, breakpts) + 1

# Assemble the data frame in the layout expected by the code below
s <- dat$neutquart
w <- dat$infectassayquart
#w[dat$trt==1 & dat$phase2select==0] <- NA
y <- dat$infect
delta <- dat$phase2select
rx <- dat$trt

mydata <- data.frame(s, w, y, delta, rx)

# Specify number of bootstrap iterations
boots <- 200

# link function (h(x,y) in paper)
link <- "log(x/y)"		# options "1-x/y", "log(x/y)", "x-y", "probit"
                                # The probit link is h(x,y) = Phi^{-1}(x) - Phi^{-1}(y),
                                # where Phi is the standard normal cdf

print("")
print("R program being executed")
print(paste("Bootstraps:", boots))
print(paste("Link:", link))
print("")

#############################################################################################
# helper functions, variables, etc.
#############################################################################################


mycep <- function(mles,cep="log(x/y)"){

# cep = causal effect predictiveness
#
# Arguments:
#   mles  numeric vector with at least 2*quants entries; entry k (k = 1..quants) is
#         contrasted with entry quants+k.  In this file the vector comes from myopt(),
#         whose first quants entries start from the placebo infection rate and whose
#         next quants entries start from the vaccine infection rate.
#   cep   link h(x,y) applied with x = mles[quants+k] and y = mles[k]; one of
#         "1-x/y", "log(x/y)", "x-y", "probit"
#
# Reads the global `quants` (number of surrogate levels).
# Returns a numeric vector of length quants.
# Stops with an explicit message for an unsupported link (previously this failed
# with "object 'ans' not found").

        links <- c("1-x/y", "log(x/y)", "x-y", "probit")
        if (!is.character(cep) || length(cep) != 1 || !(cep %in% links))
                stop("unsupported cep link; use one of: ", paste(links, collapse=", "))

        eps <- 1e-06
        mles <- mles + eps      # avoid division by zero

        y <- mles[1:quants]
        x <- mles[(quants+1):(quants*2)]

        switch(cep,
               "1-x/y"    = 1 - x/y,
               "log(x/y)" = log(x/y),
               "x-y"      = x - y,
               "probit"   = qnorm(x) - qnorm(y))
        }

mypae <- function(mles,marg,cep="log(x/y)",wts="none"){

# pae = proportion associative effect
#
# Arguments:
#   mles  numeric vector with at least 2*quants entries; entry k is contrasted with
#         entry quants+k through the link h(x,y), x = mles[quants+k], y = mles[k]
#   marg  marginal distribution over the quants surrogate levels (entries 2..quants
#         are used)
#   cep   link: "1-x/y", "log(x/y)", "x-y" or "probit"
#   wts   weights over levels 2..quants: "none" (all 1), "linear" (2, 3, ..., quants)
#         or "extreme" (only the last level counts)
#
# Reads the global `quants`.
# Returns a list with
#   eae  weighted average of the link-scale contrast over levels 2..quants
#   ede  link-scale contrast at level 1
#   pae  |eae| / (|ede| + |eae|)
# Stops with an explicit message for an unsupported `wts` or `cep` (previously these
# surfaced as "object ... not found").

        if (!is.character(wts) || length(wts) != 1)
                stop("unsupported wts; use \"none\", \"linear\" or \"extreme\"")
        weights <- switch(wts,
                "none"    = rep(1,quants-1),
                "linear"  = (2:quants),
                "extreme" = c(rep(0,quants-2),1),
                stop("unsupported wts '", wts, "'; use \"none\", \"linear\" or \"extreme\""))

        if (!is.character(cep) || length(cep) != 1)
                stop("unsupported cep link; use \"1-x/y\", \"log(x/y)\", \"x-y\" or \"probit\"")
        h <- switch(cep,
                "1-x/y"    = function(x, y) 1 - x/y,
                "log(x/y)" = function(x, y) log(x/y),
                "x-y"      = function(x, y) x - y,
                "probit"   = function(x, y) qnorm(x) - qnorm(y),
                stop("unsupported cep link '", cep, "'; use \"1-x/y\", \"log(x/y)\", \"x-y\" or \"probit\""))

        # normalizing constant of the weighted marginal over levels 2..quants
        p.s1 <- sum(marg[2:quants]*weights)

        eps <- 1e-06
        mles <- mles + eps      # avoid division by zero

        eae <- sum( h(mles[(quants+2):(quants*2)], mles[2:quants]) * marg[2:quants] * weights)/p.s1
        ede <- h(mles[(quants+1)], mles[1])

        pae <- abs(eae) / ( abs(ede) + abs(eae) )
        list(eae=eae,ede=ede,pae=pae)
        }

bd <- function(mles){
        # Trend-type statistic over the surrogate levels.
        # mles holds quants values followed by quants paired values (global `quants`).
        # For each level k = 2..quants the first-group value is compared with the
        # pooled value of the pair scaled by the first group's share of the two group
        # means; the differences are summed with weights k-1.
        first.mean  <- mean(mles[1:quants])
        second.mean <- mean(mles[1:quants+quants])
        first.share <- first.mean/(first.mean+second.mean)

        stat <- 0
        for (level in 2:quants){
                pooled <- mles[level]+mles[level+quants]
                stat <- stat + (level-1) * (mles[level] - pooled * first.share)
                }
        stat
        }


#############################################################################################
# wrapper for constrained optimizer
#############################################################################################
	
	
myopt <- function(){

	# Fits the model by constrained maximum likelihood and returns its estimates.
	#
	# Takes no arguments: reads the globals `mydata` (columns s, w, y, delta, rx)
	# and `quants`.
	#
	# Value (returned invisibly, because the last statement is an assignment):
	# a list with
	#   mle    first 8 entries of the optimum (the "beta" part of theta)
	#   marg   1 x quants matrix, estimated marginal distribution of s
	#   gamma  entries 9:11 of the optimum (the free "gamma" part of theta)
	#
	# NOTE(review): several loops and the parameter layout below hard-code 4 levels
	# (and 8 / 11 / 43 derived from it) while other lines use `quants`; the two
	# agree only when quants == 4.

	# Observed levels of s and of w, sorted, with missing values dropped
        index <- is.na(mydata$s)
        coarse.s <- sort(unique(mydata$s[!index]))

        index <- is.na(mydata$w)
        coarse.w <- sort(unique(mydata$w[!index]))

	# calculate infections rates to be used below

	nvac <- sum(mydata$rx==1)
	npla <- sum(mydata$rx==0)

	infvac <- sum(mydata$rx==1 & mydata$y==1)
	infpla <- sum(mydata$rx==0 & mydata$y==1)

	# consistent estimates of joint distn of (s,w), marginal of s
	# Each cell mixes the (s,w) cell proportion among y==0 rows and among y==1 rows
	# (rows with s observed only), weighted by 1 - infvac/nvac and infvac/nvac.
	
	joint <- matrix(0,quants,quants)
	den <- sum(!is.na(mydata$s) & mydata$y==0)
	den1 <- sum(!is.na(mydata$s) & mydata$y==1)
	for (ss in 1:quants)
	 	for (ww in 1:quants){
			num <- sum(!is.na(mydata$s) & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww] & mydata$y==0)
			num1 <- sum(!is.na(mydata$s) & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww] & mydata$y==1)
			joint[ss,ww] <- num/den*(1-infvac/nvac) + num1/den1*infvac/nvac
			}

	# Marginal of s: row sums of joint, normalized by the grand total
	marg.s <- matrix(0,1,quants)
	for (ss in 1:quants)
		marg.s[ss] <- sum(joint[ss,])/sum(joint)

	# choose initial values
	# beta0: rx==0 infection rate repeated quants times, then the rx==1 rate
	# repeated quants times

	beta0 <- c( rep(infpla/npla,quants), rep(infvac/nvac,quants) )
	gamma0 <- matrix(0.1,1,quants-1)	# effect of w in nonparametric model

	# all possible observed data combinations

	# delta0[z+1]: number of rows with delta==0 in arm rx==z
	delta0 <- matrix(0,2,1)
	for (zz in 0:1)
			delta0[zz+1,1] <- sum(mydata$delta==0 & mydata$rx==zz)			

	# delta1z0[y+1,w]: rows with delta==1 in arm rx==0, by outcome y and level of w
	delta1z0 <- matrix(0,2,4)
	for (yy in 0:1)
		for (ww in 1:4)
			delta1z0[yy+1,ww] <- sum(mydata$delta==1 & mydata$rx==0 & mydata$y==yy & mydata$w==coarse.w[ww])			

	# delta1z1[y+1,s,w]: rows with delta==1 in arm rx==1, by outcome y and levels of s and w
	delta1z1 <- array(0,c(2,4,4))
	for (yy in 0:1)
		for (ss in 1:4)
			for (ww in 1:4)
	delta1z1[yy+1,ss,ww] <- sum(mydata$delta==1 & mydata$rx==1 & mydata$y==yy & mydata$s==coarse.s[ss] & mydata$w==coarse.w[ww]) 

	# likelihood
	# theta has 11 entries: beta = theta[1:8] (entries 1:4 are used with arm z=0,
	# entries 5:8 with arm z=1) and gamma = theta[9:11] with a fixed 0 appended for
	# the 4th level of w.  The value for cell (s,w) in an arm is beta[s] + gamma[w].
	# Each contribution is written as log(p^n): this evaluates to 0 when the count n
	# is 0, even if p itself is 0.

	like <- function(theta){
		beta <- theta[1:8]; gamma <- c(theta[9:11],0)
		ans <- 0

		# delta=0
		# z=0,y=1
		# NOTE(review): the term below uses 1-s0 for every delta==0 row in arm 0,
		# although the label above says y=1 -- confirm the intended labelling.
			obg0 <- outer(beta[1:4],gamma,"+")*joint
			s0 <- sum(obg0)
			ans <- ans + log((1-s0)^delta0[1,1])

		# z=1,y=1
			obg1 <- outer(beta[1:4+quants],gamma,"+")*joint
			s1 <- sum(obg1)
			ans <- ans + log((1-s1)^delta0[2,1])

		# delta=1, z=0
		# arg1 sums obg0 over s for each w (paired with the y==1 counts);
		# arg0 is the corresponding column sum of joint minus arg1 (y==0 counts)
			m1 <- matrix(1,1,4)
			arg1 <- m1 %*% obg0
			ans <- ans + m1 %*% t(log(arg1^delta1z0[2,]))

			sj <- m1 %*% joint
			arg0 <- sj-arg1
			ans <- ans + m1 %*% t(log(arg0^delta1z0[1,]))


		# delta=1, z=1
		# cell-wise terms: joint-obg1 with the y==0 counts, obg1 with the y==1 counts
			ans <- ans + sum(log((joint-obg1)^delta1z1[1,,]))
			ans <- ans + sum(log(obg1^delta1z1[2,,]))

		-ans		# constrOptim is minimizing
		}
			
	# constraints
	# Linear constraints in the form ui %*% theta - ci >= 0:
	#   rows 1-11 : every parameter >= 1e-8
	#   rows 12-43: beta[ii] + gamma[jj] <= 1 - 1e-8 for each of the 8 betas and the
	#               4 levels of w (jj = 4 has no gamma term, its gamma being fixed at 0)
	ui <- matrix(0,43,11)
	ui[1:11,1:11] <- diag(11)
	counter <- 12
	for (ii in 1:8)
		for (jj in 1:4){
			ui[counter,ii] <- -1
			if (jj<4) ui[counter,8+jj] <- -1
			counter <- counter + 1
			}
	ci <- c(rep(1e-8,11),rep(1e-8-1,32))

	# Starting point: the 8 initial betas followed by the 3 initial gammas
	itheta <- c(beta0,gamma0)

	# Minimize the negative log-likelihood under the constraints; no gradient is supplied
	beta1 <- constrOptim(itheta,like,NULL,ui=ui,ci=ci,control=list(maxit=100),outer.iterations=100)$par

	ans <- list(mle=beta1[1:8],marg=marg.s,gamma=beta1[9:11])

	}


#############################################################################################
# call optimizer, bootstrap, compute results, etc.
#############################################################################################

	# Number of surrogate levels, and number of risk parameters (one per level and arm)
	quants <- 4
	parms <- quants*2

	# Fit the model on the observed data
	opt.out <- myopt()
	print("Optimization complete")

	# average risk estimate: add the gamma terms averaged over the 4 levels of w
	# (the 4th gamma is fixed at 0 inside myopt)
	opt.out$mle <- opt.out$mle + sum(opt.out$gamma)/4

	# Causal effect predictiveness per surrogate level
	cep <- mycep(opt.out$mle,cep=link)

	# pae is proportion associative effect, computed under each of the three
	# weighting schemes; after the loop `ext` holds the "extreme" result
	pae <- matrix(0,nrow=3,ncol=1)
	pae.wts <- c("none","linear","extreme")
	for (kk in seq_along(pae.wts)){
		ext <- mypae(opt.out$mle,opt.out$marg,cep=link,wts=pae.wts[kk])
		pae[kk] <- ext$pae
		}

	# as is associative span, taken from the "extreme" weighting
	as <- abs(ext$eae)-abs(ext$ede)

	# bd is Breslow-Day trend-type test
	bd.mle <- bd(opt.out$mle)

	# bootstrap
	print(paste("Begin",boots,"boostrap iterations"))
	n <- nrow(mydata)

	# One row per bootstrap replicate
	bd.boot  <- matrix(0,nrow=boots,ncol=1)
	cep.boot <- matrix(0,nrow=boots,ncol=quants)
	pae.boot <- matrix(0,nrow=boots,ncol=3)
	as.boot  <- matrix(0,nrow=boots,ncol=1)

	# Placeholders for the bootstrap summaries filled in after the loop
	cep.ci.boot <- matrix(0,nrow=quants,ncol=2)
	cep.sd.boot <- matrix(0,nrow=quants,ncol=1)
	cep.mn.boot <- matrix(0,nrow=quants,ncol=1)

	bd.sd.boot <- 0
	bd.mn.boot <- 0

	as.ci.boot <- 0
	as.sd.boot <- 0
	as.mn.boot <- 0

	pae.ci.boot <- matrix(0,nrow=3,ncol=2)
	pae.sd.boot <- matrix(0,nrow=3,ncol=1)
	pae.mn.boot <- matrix(0,nrow=3,ncol=1)

	# myopt() reads the global `mydata`, so each replicate temporarily replaces it
	# with a resample (rows drawn with replacement); the original is kept in o.data
	o.data <- mydata
	pae.wts <- c("none","linear","extreme")
	for (ii in 1:boots){

		samp <- sample(c(1:n),size=n,replace=TRUE)
		mydata <- o.data[samp,]

		opt.out <- myopt()
		# average risk estimate
		opt.out$mle <- opt.out$mle + sum(opt.out$gamma)/4

		cep.boot[ii,] <- mycep(opt.out$mle,cep=link)

		# PAE under each weighting scheme; `ext` ends up holding the "extreme" result
		for (kk in seq_along(pae.wts)){
			ext <- mypae(opt.out$mle,opt.out$marg,cep=link,wts=pae.wts[kk])
			pae.boot[ii,kk] <- ext$pae
			}
		as.boot[ii] <- abs(ext$eae)-abs(ext$ede)

		bd.boot[ii] <- bd(opt.out$mle)
		}
	# Put the observed data back
	mydata <- o.data
	print(paste("End",boots,"boostrap iterations"))
	print("")
	print("")


        # Column-wise bootstrap standard deviation, mean and 2.5% / 97.5% percentiles
        # of the CEP replicates.  Assigning through [] keeps the preallocated shapes
        # (quants x 1 and quants x 2).
        cep.sd.boot[] <- apply(cep.boot,2,sd)
        cep.mn.boot[] <- apply(cep.boot,2,mean)
        cep.ci.boot[] <- t(apply(cep.boot,2,quantile,c(.025,.975)))

        # Breslow-Day type statistic
        bd.sd.boot <- sd(bd.boot)
        bd.mn.boot <- mean(bd.boot)

        # PAE under the three weighting schemes (one column each)
        pae.sd.boot[] <- apply(pae.boot,2,sd)
        pae.mn.boot[] <- apply(pae.boot,2,mean)
        pae.ci.boot[] <- t(apply(pae.boot,2,quantile,c(.025,.975)))

        # Associative span
        as.ci.boot <- quantile(as.boot,c(.025,.975))
        as.sd.boot <- sd(as.boot)
        as.mn.boot <- mean(as.boot)

	# finish up, printout results, etc


	# CEP per surrogate level: estimate, bootstrap percentile interval, and a
	# two-sided normal p-value using the bootstrap standard deviation
	print("CEP estimates, 95% confidence intervals, and two-sided p-values")
	for (ii in 1:quants){
		p <- 2*pnorm(-abs(cep[ii]/cep.sd.boot[ii]))
		print(paste0("CEP(",ii,",1)=",round(cep[ii],3),
			" (",round(cep.ci.boot[ii,1],3),", ",round(cep.ci.boot[ii,2],3),"), p=",round(p,5)))
		}

	# PAE per weighting scheme: one-sided p-value against the value 0.5
	print("")
	print("PAE estimates, 95% confidence intervals, and one-sided p-values")
	for (ii in 1:3){
		p <- 1-pnorm((pae[ii]-.5)/pae.sd.boot[ii])
		print(paste0("PAE(w",ii,")=",round(pae[ii],3),
			" (",round(pae.ci.boot[ii,1],3),", ",round(pae.ci.boot[ii,2],3),"), p=",round(p,5)))
		}

	# Associative span: one-sided p-value against 0
	print("")
	print("AS estimate, 95% confidence interval, and one-sided p-value")
	p <- 1-pnorm(as/as.sd.boot)
	print(paste0("AS=",round(as,3),
		" (",round(as.ci.boot[1],3),", ",round(as.ci.boot[2],3),"), p=",round(p,5)))

	# Breslow-Day type statistic with its bootstrap standard error
	print("")
	print("BD test statistic, estimated standard error, and one-sided p-value")
	bd.p <- 1-pnorm(bd.mle/bd.sd.boot)
	print(paste0("BD=",round(bd.mle,3),", SE=",round(bd.sd.boot,3),", p=",round(bd.p,5)))