dat <- read.csv("/home/pgilbert/BetzSummerInstitute/Institute2011/Data/Vax004data_Seattle_SummerInstitute_2011.csv",header=T) # Define a variable denoting whether a subject is selected for phase 2 (i.e., has neutralization # levels and CD4 immune responses measured) # This variable is the phase 2 selection indicators dat$phase2select <- ifelse(!is.na(dat$neut) & !is.na(dat$CD4block),1,0) # Note: Only vaccinees have neut and CD4blocking immune responses measured-- so all of the # analyses assess vaccinees only ################################################## # First analyze the data using standard weights: # The phase 2 sample was selected stratifying on sex and white/non-white # Define the stratum phase-2 sampling probabilities. # # We will use Borgan et al.'s (2000, Lifetime Data Analysis) Estimator II, # a D estimator for which only controls (uninfected subjects) constitute the subcohort # (the infected subjects are handled separately). Estimator II is appropriate for # retrospectice outcome-dependent sampling. # Because Estimator II is used, the probabilities are defined for controls only # # Estimate the Phase 2 selection probabilities for the controls, # with stratification by gender x white/nonwhite # # By an odd choice of hard-coding in cch, these sampling fractions are defined for all subjects # (pooling over cases and controls), and the cch function 'subtracts out' the cases # # Important Note: cch assumes complete phase-2/immune biomarker data in cases, it does not allow # sub-sampling of cases alphamenwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==0 & dat$white==1])/ length(dat$neut[dat$trt==1 & dat$sex==0 & dat$white==1]) alphawomenwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==1 & dat$white==1])/ length(dat$neut[dat$trt==1 & dat$sex==1 & dat$white==1]) alphamennonwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$sex==0 & dat$white==0])/ length(dat$neut[dat$trt==1 & dat$sex==0 & dat$white==0]) alphawomennonwh <- length(dat$neut[dat$trt==1 & !is.na(dat$neut) & dat$infect==0 & dat$sex==1 & dat$white==0])/ length(dat$neut[dat$trt==1 & dat$sex==1 & dat$white==0]) # Subset on vaccinated subjects selected for Phase 2 kp <- dat$phase2select==1 in.subcohort <- dat$infect[kp]==0 insubcohortinds <- in.subcohort stratuminds <- ifelse(dat$sex[kp]==0 & dat$wh[kp]==1,0,ifelse(dat$sex[kp]==1 & dat$wh[kp]==1,1, ifelse(dat$sex[kp]==0 & dat$wh[kp]==0,2,3))) cohortstratasizes <- floor(table(stratuminds[insubcohortinds])*c(1/alphamenwh,1/alphawomenwh,1/alphamennonwh,1/alphawomennonwh)) phase2size <- length(dat$phase2select[dat$phase2select==1]) # We will analyze centered & standardized versions of the putative CoRs: dat$neut <- (dat$neut - mean(dat$neut,na.rm=T))/sqrt(var(dat$neut[!is.na(dat$neut)])) dat$CD4block <- (dat$CD4block - mean(dat$CD4block,na.rm=T))/sqrt(var(dat$CD4block[!is.na(dat$CD4block)])) # Sampling probabilities depend on sex and white/non-white; so important to control for both of these. # Also control for riskscore because it predicts infection. library(survival) fit1 <- cch(Surv(dat$daysinfect[kp],dat$infect[kp]) ~ dat$neut[kp] + dat$sex[kp] + dat$white[kp] + dat$riskscore[kp], stratum=stratuminds,subcoh=in.subcohort, id=c(1:phase2size),cohort.size=cohortstratasizes,method="II.Borgan") fit2 <- cch(Surv(dat$daysinfect[kp],dat$infect[kp]) ~ dat$CD4block[kp] + dat$sex[kp] + dat$white[kp] + dat$riskscore[kp], stratum=stratuminds,subcoh=in.subcohort, id=c(1:phase2size),cohort.size=cohortstratasizes,method="II.Borgan") ############################################################################# # Second, we analyze the data using calibrated weights (Breslow et al. 2009 method) ######################################################################################## # Function to implement Breslow et al. (2009) weighted 2-phase Cox model analysis: # dat is a data.frame # imputation.model is a formula that gives a model to impute missing data # interest.model is a formula that gives the model we are interested to fit # strata.formula is a formula that gives how the two-phase sampling is done # subset is a vector of logicals that give which observations are selected for phase 2 # # See the documentation of the survey package in R for details. calibrated.weights.coxph = function (dat, imputation.model, interest.model, strata.formula, subset) { #Step 1: Predict missing covariates for all subjects (not just for those with missing covariates) dstrat<-twophase(id=list(~1,~1),strata=list(NULL,strata.formula),subset=subset,data=dat) fit.step1 = svyglm(imputation.model, design=dstrat) predicted = predict(fit.step1,type="response",newdata=dat,se=F) #The left hand side in the imputation model may be a variable name, e.g., s, or a transformation, e.g. logit(s) lhs=as.character(imputation.model)[2] dat.step1 = dat if (contain(lhs, "(")) { tmp=strsplit(lhs,"[\\()]")[[1]] transf = tmp[1] lhs = tmp[2] if (transf=="logit") transf.f=expit else stop ("transformation not supported") dat.step1[,lhs] <- transf.f(predicted) } else { dat.step1[,lhs] <- predicted } # Step 2: Fit an augmented dataset with risk model to get the auxiliary variable: dfbeta tmp = as.character(interest.model); interest.model.str = paste(tmp[2],"~",tmp[3]) calmodel<-coxph(as.formula(interest.model.str), data=dat.step1 ) db = resid(calmodel,"dfbeta", data=dat.step1)+1 colnames(db)<-paste("db",1:ncol(db),sep="") datDB = cbind(dat, db) dstrt<-twophase(id=list(~1,~1),strata=list(NULL,strata.formula),subset=subset,data=datDB) # Step 3: IPW fitting using the calibrated weights: dcal<-calibrate(dstrt,formula=make.formula(colnames(db)),pop=c(`(Intercept)`=nrow(dat),colSums(db)),calfun="raking",eps=0.0001) # Alternative with Robins, Rotnitzky, Zhao (1994) weights: # dcal<-calibrate(dstrt,formula=make.formula(colnames(db)),pop=c(`(Intercept)`=nrow(dat),colSums(db)),calfun="rrz",eps=0.0001) cal<-svycoxph(as.formula(interest.model.str), design=dcal) } ########################################################################## # Now apply the Breslow et al. method to the data # For this method, make all cases (HIV infections) a separate stratum # So now the statum variable has 5 levels: # Statum 1 is all infected subjects; Stata 2-5 are for uninfected subjects # cross-classified by gender and white/nonwhite dat$strata <- 0 kp <- dat$infect==0 dat$strata[kp] <- ifelse(dat$sex[kp]==0 & dat$white[kp]==1,1, ifelse(dat$sex[kp]==1 & dat$white[kp]==1,2, ifelse(dat$sex[kp]==0 & dat$white[kp]==0,3,4))) # Source some helper functions needed to implement the calibrated weights Cox model: source ("http://youtil.googlecode.com/files/youtil.R") library(survey) imputation.model=neut ~ infectassay interest.model=Surv(daysinfect,infect) ~ neut + sex + white + riskscore strata.formula=~strata subset=dat$phase2select fit3 <- calibrated.weights.coxph (dat, imputation.model, interest.model, strata.formula, subset) summary(fit3) fit3$var imputation.model=CD4block ~ infectassay interest.model=Surv(daysinfect,infect) ~ CD4block + sex + white + riskscore strata.formula=~strata subset=dat$phase2select fit4 <- calibrated.weights.coxph (dat, imputation.model, interest.model, strata.formula, subset) summary(fit4) fit4$var ########################################################################### # Third, evaluate neut and CD4block as specific SoPs via the method # Gilbert and Hudgens (2008, Biometrics) ############################################################################################# # PROGRAM: GilbertHudgens060709R3PrincipalSurrogateCode # AUTHOR: Michael Hudgens # EMAIL: mhudgens@bios.unc.edu # WEB: http://www.bios.unc.edu/~mhudgens/ # DATE: 2 Nov 2007 # # DESCRIPTION: R code to accompany Gilbert and Hudgens Biometrics paper (2008) # "Evaluating Candidate Principal Surrogate Endpoints". # Point estimates, CIs, and p-values associated with # CEP, PAE, and AS are computed under A4-NP as described # in the paper. The Breslow-Day type test is also computed. # # DISCLAIMER: # # THIS INFORMATION IS PROVIDED PROVIDED "AS IS". THERE ARE NO # WARRANTIES, EXPRESSED OR IMPLIED, AS TO MERCHANTABILITY OR # FITNESS FOR A PARTICULAR PURPOSE REGARDING THE ACCURACY OF # THE MATERIALS OR CODE CONTAINED HEREIN. # # TO EXECUTE: # 1. create a dataframe called "mydata" with variables # mydata$s observed surrogate in treatment arm 1, i.e., S=S(1) # mydata$w baseline predictor # mydata$y binary outcome: 0 or 1 # mydata$delta indicator whether w sampled # mydata$rx treatment arm: 0 or 1 for placebo / vaccine # # mydata$s and mydata$w should be discrete variables having four levels (numeric variables taking values # 1, 2, 3, 4 to indicate the four levels). # It is assumed there is no missing data, other than for s and w. # Missing values should equal NA. # Note for treatment arm 1 subjects, delta=1 implies both w and s are measured; # for treatment arm 0 subjects, delta=1 implies only w is measured. For both # treatment arms delta = 0 implies neither w nor s are measured. # # 2. specify number of bootstrap replicates (boots) and link function below # # 3. execute all of the code below (or save it in a file and source it) # ############################################################################################# # Assess the neut variable as a specific SoP # Create quartilized versions of neut and of the BIP, the infectivity assay # By convention, Gilbert and Hudgens assumes the s=4 level is the contant biomarkers level, # i.e., a negative response. Therefore, the quartilized neut variable takes 1 to be the highest neut quartile, # and 4 to be the lowest neut quartile (hence the '-' in the next bit of code). breakpts <- quantile(-dat$neut,probs=c(.25,.5,.75),na.rm=T) dat$neutquart <- ifelse(-dat$neut