######################################################################### ## 8/13/2010 ## R code to implement the Results 1-4 in Gilbert P, Yu X, Frahm N, ## Rotnitzky A (2011) ## "Optimal Auxiliary-Covariate Based Sampling Design for ## Semiparametric Efficient Estimation of a Mean or Mean Difference, with ## Application to Clinical Trials" ## ## Authors: Xuesong Yu and Peter Gilbert ## ## The program implements Results 1, 2, 3, 4 for the one-sample problem ## of estimating the mean of an expensive endpoint Y, given that a ## univariate auxiliary W is measured in all subjects (at phase 1) and ## Y is measured in a random sample of subjects (at phase 2). ## ## The functions result1 and result 2 output ## - the optimal \lambda* vector for subjects with W=j (j may be ## unique for all subjects) ## - the optimal sample size n=n* ## - the optimal \lambdabar for simple random sampling ## - the optimal n=n*bar under simple random sampling ## - the relative efficiency (RE) of the optimal sampling ## design versus the optimal simple random sampling design ## ## The functions result3 and result4 output ## - the optimal \lambda* vector for subjects with W=j ## - the optimal \lambdabar for simple random sampling ## - the relative efficiency (RE) of the optimal sampling ## design versus the optimal simple random sampling design ## ######################################################################### ######################################################################################## #### Notation #### # If not otherwise noted, notation and equation numbers are as in # Gilbert et al. # # BMINC0 is B-C0 in the paper # .v = in vector format # beta.v = a vector of betaj for all j, betaj=E(Y|W=j), specified # from pilot data # beta=E(Y)= the parameter of interest # p.v = vector of all pj, pj=P(W=j) # v.v = vector of vj, vj=Var(Y|W=j) # lambda.v = vector of lambdaj, lambdaj = lambda for W=j # c1 = per-person cost for measuring the univariate auxiliary W # c2j = per-person cost for measuring Y conditional on W=j # c2.v = vector of c2j # Vprime=Var(E[Y|W]) = sum((betaj-beta)^2*pj) # var.beta=Vprime + sum(vj*pj/lambdaj) as in equation (3) # ######################################################################################### ######################################################################################### ### Result 1: minimize variance given expected total cost B-C0 ######################################################################################### ## This function updates lambda* derived from the closed-form ## solution in equation (4), as described in Appendix A update.lambdas1 = function(lambda.closed, beta, beta.v, p.v, v.v, c2.v, c1, BMINC0, n0=1) { # optimal sample size n.star equation (6) n.star = BMINC0/(c1 + sum(c2.v*lambda.closed*p.v)) Vprime = sum((beta.v-beta)^2 *p.v) Estar = sum(sqrt(c2.v*v.v)*p.v) # case 1: all lambda.closed <=1 but n*=n0 but some lambda.closed >1 # case 3: n*1 # First deal with cases 2 and 3 all.mat = cbind(ind=1:length(v.v), c2.v, p.v, lambda.closed, v.v, lambda.orig=lambda.closed) all.mat = data.frame(all.mat) m = 1 while (any(all.mat$lambda.closed>1)) { all.mat.1= subset(all.mat, lambda.closed>=1) all.mat.2= subset(all.mat, lambda.closed<1) all.mat.1= all.mat.1[order(all.mat.1$lambda.closed), ] if (nrow(all.mat.1)>m){ all.mat.1$lambda.closed[1:m] = 1 all.mat.2 = rbind(all.mat.2, all.mat.1[-c(1:m),]) all.mat.1 = all.mat.1[1:m,] } else { all.mat.1$lambda.closed = 1 } ## Appendix A equation (21) cp.sum = sum(all.mat.1$c2.v*all.mat.1$p.v) vp.sum = sum(all.mat.1$v.v*all.mat.1$p.v) cvp.sum= sum(sqrt(all.mat.1$c2.v*all.mat.1$v.v)*all.mat.1$p.v) lambda.m = sqrt(all.mat.2$v.v/all.mat.2$c2.v)*sqrt((c1 + cp.sum)/(Vprime+vp.sum)) all.mat.2$lambda.closed = lambda.m all.mat=rbind(all.mat.2, all.mat.1) all.mat$lambda.closed[is.na(all.mat$lambda.closed)| is.infinite(all.mat$lambda.closed)] = 1 ## Appendix A equation (22) n.star = (BMINC0/sqrt(c1+cp.sum))/(sqrt(c1+cp.sum) +(Estar-cvp.sum)/ sqrt(Vprime+vp.sum)) m=m+1 } # case 1: all lambda.closed <=1 but n*1) { cat("Warning: sum(p.v) = ", sum(p.v), "\n") } beta=sum(beta.v*p.v) ## 1. Closed-form solution for optimal lambda as in equation (4) Vprime = sum((beta.v-beta)^2 *p.v) lambda.closed = ((v.v/Vprime)*(c1/c2.v))^(1/2) ## Section 3.6 deal with "perfect" auxiliary lambda.closed[is.na(lambda.closed)| is.infinite(lambda.closed)] = 1 lambda.closed[lambda.closed == 0] = 1 ## 2. fix the closed-form solution all.up = update.lambdas1(lambda.closed, beta, beta.v, p.v, v.v, c2.v, c1, BMINC0, n0) lambda.up = all.up$all.mat$lambda.closed var.beta = Vprime+sum(v.v*p.v/lambda.up) ## equation (3) # optimal sample size n.star equation (6) n.star = all.up$n.star # simple random sampling approach where lambdabar = sqrt(sum(v.v*p.v)/sum(c2.v*p.v)) * sqrt(c1/Vprime) ## equation (15) if (lambdabar > 1) lambdabar = 1 # Optimal n under simple random sampling nbar.star = BMINC0/(c1+lambdabar * sum(c2.v*p.v)) var.betabar = Vprime+sum(v.v*p.v/lambdabar) ## 3. calculate RE RE= round((var.beta/n.star)/(var.betabar/nbar.star),4) all.res=matrix(c(lambda.up, lambdabar, RE, n.star, nbar.star, var.beta, var.betabar), nrow=1) len=length(beta.v) all.res=round(all.res, 4) cat(paste(“RE optimal vs optimal SRS = “,RE),”\n”) colnames(all.res)=c(paste("lambda", seq(0, length=len), sep=""), "lambdabar", "RE", "n*", "n*bar", "var(beta)","var(beta)lambdabar") return(all.res) } ######################################################################################### ### Result 2: Dual Problem -- Minimize expected total cost given ### fixed variance ######################################################################################### ## This function updates lambda derived from the closed-form solution ## for Result 2, as described in Appendix A update.lambdas2 = function(V, varY, lambda.closed, p.v, v.v, c2.v, c1, n0=1) { # optimal sample size n.star equation (7) Vprime = sum((beta.v-beta)^2 *p.v) n.star=(sqrt(Vprime/c1) * sum(sqrt(c2.v*v.v)*p.v) + Vprime)/V Estar = sum(sqrt(c2.v*v.v)*p.v) # case 1: all lambda.closed <=1 but n*=n0 but some lambda.closed >1 # case 3: n*1 # First deal with cases 2 and 3 all.mat = cbind(ind=1:length(v.v), c2.v, p.v, lambda.closed, v.v, lambda.orig=lambda.closed) all.mat = data.frame(all.mat) m = 1 while (any(all.mat$lambda.closed>1)) { all.mat.1= subset(all.mat, lambda.closed>=1) all.mat.2= subset(all.mat, lambda.closed<1) all.mat.1= all.mat.1[order(all.mat.1$lambda.closed), ] if (nrow(all.mat.1)>m){ all.mat.1$lambda.closed[1:m] = 1 all.mat.2 = rbind(all.mat.2, all.mat.1[-c(1:m),]) all.mat.1 = all.mat.1[1:m,] } else { all.mat.1$lambda.closed = 1 } cp.sum = sum(all.mat.1$c2.v*all.mat.1$p.v) vp.sum = sum(all.mat.1$v.v*all.mat.1$p.v) cvp.sum= sum(sqrt(all.mat.1$c2.v*all.mat.1$v.v)*all.mat.1$p.v) ## Appendix A equation (40) lambda.m = sqrt(all.mat.2$v.v/all.mat.2$c2.v)*sqrt((c1 + cp.sum)/(Vprime+vp.sum)) all.mat.2$lambda.closed = lambda.m all.mat=rbind(all.mat.2, all.mat.1) all.mat$lambda.closed[is.na(all.mat$lambda.closed)| is.infinite(all.mat$lambda.closed)] = 1 ## Appendix A equation (41) n.star = (Vprime + vp.sum + sqrt((Vprime+vp.sum)/(c1 + cp.sum))*(Estar-cvp.sum))/V m=m+1 } # Now deal with case 1 # check if n* >= n0 # case 1: all lambda.closed <=1 but n* 1) lambdabar = 1 nstar.bar=(sum(v.v/lambdabar*p.v) + Vprime)/V Bbar = BMINC0 + nstar.bar*(c1 + lambdabar*sum(c2.v*p.v)) RE=round(B/Bbar,4) all.res=matrix(c(lambda.up, lambdabar, RE, n.star, nstar.bar), nrow=1) len=length(beta.v) all.res=round(all.res, 4) cat(paste(“RE optimal vs optimal SRS = “,RE),”\n”) colnames(all.res)=c(paste("lambda", seq(0, length=len), sep=""), "lambdabar", "RE", "n*", "n*bar") return(all.res) } ######################################################################################### ### Result 3: Minimize variance given fixed expected phase-2 budget ### and fixed study sample size n ######################################################################################### update.lambdas3 = function(lambda.closed, p.v, v.v, c2.v, Bprime,n) { Estar = sum(sqrt(c2.v*v.v)*p.v) # some lambda.closed >1 all.mat = cbind(ind=1:length(v.v), c2.v, p.v, lambda.closed, v.v, lambda.orig=lambda.closed) all.mat = data.frame(all.mat) m = 1 while (any(all.mat$lambda.closed>1)) { all.mat.1= subset(all.mat, lambda.closed>=1) all.mat.2= subset(all.mat, lambda.closed<1) all.mat.1= all.mat.1[order(all.mat.1$lambda.closed), ] if (nrow(all.mat.1)>m){ all.mat.1$lambda.closed[1:m] = 1 all.mat.2 = rbind(all.mat.2, all.mat.1[-c(1:m),]) all.mat.1 = all.mat.1[1:m,] } else { all.mat.1$lambda.closed = 1 } cp.sum = sum(all.mat.1$c2.v*all.mat.1$p.v) vp.sum = sum(all.mat.1$v.v*all.mat.1$p.v) cvp.sum= sum(sqrt(all.mat.1$c2.v*all.mat.1$v.v)*all.mat.1$p.v) ## Appendix A equation (50) lambda.m = sqrt(all.mat.2$v.v/all.mat.2$c2.v)*(Bprime-n*cp.sum)/(n*(Estar-cvp.sum)) all.mat.2$lambda.closed = lambda.m all.mat=rbind(all.mat.2, all.mat.1) all.mat$lambda.closed[is.na(all.mat$lambda.closed)| is.infinite(all.mat$lambda.closed)] = 1 m=m+1 } all.mat=all.mat[order(all.mat$ind), ] list(all.mat=all.mat, m=m) } result3=function(Bprime, p.v, beta.v, v.v, c2.v, n) { beta=sum(beta.v*p.v) budget = n*sum(c2.v*p.v)-Bprime Estar = sum(sqrt(c2.v*v.v)*p.v) if (budget <=0) { lambda.up = rep(1, length(v.v)) } else { lambda.closed=sqrt(v.v/c2.v)*Bprime/(n*Estar) ## equation (8) ## Deal with a "perfect" auxiliary if necessary lambda.closed[is.na(lambda.closed)| is.infinite(lambda.closed)] = 1 lambda.closed[lambda.closed == 0] = 1 ## Update the closed-form solution if necessary all.up = update.lambdas3(lambda.closed, p.v, v.v, c2.v, Bprime, n) lambda.up = all.up$all.mat$lambda.closed } ## simple random sampling lambdabar = Bprime /(n*sum(c2.v*p.v)) if (lambdabar > 1) lambdabar = 1 Vprime = sum((beta.v-beta)^2 *p.v) var.beta = Vprime+sum(v.v*p.v/lambda.up) var.betabar = Vprime+sum(v.v*p.v/lambdabar) RE= round((var.beta/n)/(var.betabar/n),4) all.res=matrix(c(lambda.up, lambdabar, RE, var.beta, var.betabar), nrow=1) len=length(beta.v) all.res=round(all.res, 4) cat(paste(“RE optimal vs optimal SRS = “,RE),”\n”) colnames(all.res)=c(paste("lambda", seq(0, length=len), sep=""), "lambdabar", "RE", "var(beta)","var(beta)lambdabar") return(all.res) } ######################################################################################## ### Result 4: Minimize expected phase-two budget given fixed variance ### and fixed study sample size n ######################################################################################################## update.lambdas4 = function(V, Vprime, lambda.closed, p.v, v.v, c2.v, n, varY) { Estar = sum(sqrt(c2.v*v.v)*p.v) # some lambda.closed >1 all.mat = cbind(ind=1:length(v.v), c2.v, p.v, lambda.closed, v.v, lambda.orig=lambda.closed) all.mat = data.frame(all.mat) m = 1 while (any(all.mat$lambda.closed>1)) { if (all(all.mat$lambda.closed>=1)) { all.mat$lambda.closed = 1 } else { all.mat.1= subset(all.mat, lambda.closed>=1) all.mat.2= subset(all.mat, lambda.closed<1) all.mat.1= all.mat.1[order(all.mat.1$lambda.closed), ] if (nrow(all.mat.1)>m){ all.mat.1$lambda.closed[1:m] = 1 all.mat.2 = rbind(all.mat.2, all.mat.1[-c(1:m),]) all.mat.1 = all.mat.1[1:m,] } else { all.mat.1$lambda.closed = 1 } cp.sum = sum(all.mat.1$c2.v*all.mat.1$p.v) vp.sum = sum(all.mat.1$v.v*all.mat.1$p.v) cvp.sum= sum(sqrt(all.mat.1$c2.v*all.mat.1$v.v)*all.mat.1$p.v) ## Appendix A equation (61) lambda.m = sqrt(all.mat.2$v.v/all.mat.2$c2.v)*(Estar-cvp.sum)/(n*V-Vprime-vp.sum) all.mat.2$lambda.closed = lambda.m all.mat=rbind(all.mat.2, all.mat.1) all.mat$lambda.closed[is.na(all.mat$lambda.closed)| is.infinite(all.mat$lambda.closed)] = 1 } m=m+1 } if(all(all.mat$lambda.closed==1 | all.mat$lambda.closed<0)) { all.mat$lambda.closed = 1 } all.mat=all.mat[order(all.mat$ind), ] list(all.mat=all.mat, m=m) } result4=function(V, beta.v, v.v, c2.v, p.v, n, varY) { beta=sum(beta.v*p.v) Vprime = sum((beta.v-beta)^2 *p.v) lambda.closed= sqrt(v.v/c2.v)* sum(sqrt(c2.v*v.v)*p.v)/(n*V-Vprime) ## "perfect" auxiliary lambda.closed[is.na(lambda.closed)| is.infinite(lambda.closed)] = 1 lambda.closed[lambda.closed == 0] = 1 ## fix the closed-form solution all.up = update.lambdas4(V, Vprime, lambda.closed, p.v, v.v, c2.v, n, varY) lambda.up = all.up$all.mat$lambda.closed ## phase 2 budget B.prime = n*sum(c2.v*lambda.up*p.v) ## simple random sampling where lambdaj= fixed constant for all j lambdabar = sum(v.v*p.v)/(n*V - Vprime) if (lambdabar > 1) lambdabar = 1 B2.prime = n*lambdabar*sum(c2.v*p.v) RE= round(B.prime/B2.prime,4) all.res=matrix(c(lambda.up, lambdabar, RE), nrow=1) len=length(beta.v) all.res=round(all.res, 2) cat(paste(“RE optimal vs optimal SRS = “,RE),”\n”) colnames(all.res)=c(paste("lambda", seq(0, length=len), sep=""), "fixedlambda", "RE") return(all.res) } ######################################################################### ####################### ## Now code is given to estimate the mean with a 95% confidence interval ## and p-value, using either the naïve complete-case approach or the ## Rotnitzky and Robins (1995, Biometrika) estimator ####################### ## Estimation under Approach 1: Analyze the data by only including the ## complete-cases (those with Y observed), ignores auxiliaries in the ## analysis ## ## Input: ## my.data = input data with column names W, Y, R, lambda ## and each row for subject i=1 to n ## ## alpha = 0.05, alpha level for CI ## ## Output: ## point estimate of E(Y), standard error, (1-alpha)x100% confidence ## interval, and a 2-sided p-value ## Completecaseestimate.beta=function(my.data, alpha=0.05) { obs.Y = subset(my.data, R==1) beta.hat = mean(obs.Y$Y) var.beta= var(obs.Y$Y) /dim(obs.Y)[1] ## CI and p value ci = c(beta.hat - sqrt(var.beta)* qnorm(1-alpha/2), beta.hat + sqrt(var.beta)* qnorm(1-alpha/2)) pval = 2*(1-pnorm(abs(beta.hat)/sqrt(var.beta))) c(beta.hat, sqrt(var.beta), ci, pval) } ######################################################################## ## RR estimation: This function estimates beta using the RR method ## (Rotnitzky and Robins, 1995, Biometrika) ## The model E[Y|W] is fit using simple linear regression, via ## the robust method of Yohai (1987, Biometrika) "ROB" or via ## ordinary least squares "OLS" ## ## Input: ## my.data = input data with column names W, Y, R, lambda ## and each row for subject i=1 to n ## ## alpha = 0.05, alpha level for CI ## ## alpha0, alpha1 and alpha2 = parameters for true mean model E(Y|W) ## only needed when trueG ==1 ## ## fitmethod = "ROB" for robust method ## "OLS" for ordinary least squares ## ## trueG = 1 using true E(Y|W) for simulation purpose ## 2 using estimated E(Y|W) (OLS or ROB) (always set ## trueG for an actual data analysis) ## ## Output: ## RR estimate, standard error, (1-alpha)x100% confidence interval, and ## a 2-sided p-value ## RRestimate.beta=function(my.data, alpha=0.05, alhpa0=1, alpha1=1, alpha2=0, fitmethod="ROB", trueG=1){ n=dim(my.data)[1] if (trueG!=1) { ## using estimated E(Y|W) (OLS or ROB) if (fitmethod!="ROB") { ## Fitted values for study data by ordinary least squares (OLS method) fit = lm(Y~W, data = my.data, subset=R==1) } if (fitmethod=="ROB") { ## Fitted values for study data by the robust lm (ROB method) fit = lmrob(Y~W, data = my.data, subset=R==1, control = lmrob.control(max.it = 100)) } ## Obtain fitted values for E[Y|W] for each subject my.data$g.hat=predict(fit, newdata=my.data) } if(trueG==1) { ## using true E(Y|W) my.data$g.hat=alpha0 + alpha1*my.data$W + alpha2*my.data$W^2 } Ug1=my.data$R*my.data$Y/my.data$lambda - (my.data$R-my.data$lambda)*my.data$g.hat/my.data$lambda Ug2=my.data$R/my.data$lambda - (my.data$R-my.data$lambda)/my.data$lambda beta.hat=sum(Ug1) /sum(Ug2) ## Sandwich variance estimator An= -1 Ug= my.data$R*(my.data$Y-beta.hat)/my.data$lambda - (my.data$R - my.data$lambda) *(my.data$g.hat-beta.hat)/my.data$lambda Bn= sum(Ug^2) /n var.beta= 1/An * Bn * (1/An)/n ## CI and p value ci = c(beta.hat - sqrt(var.beta)* qnorm(1-alpha/2), beta.hat + sqrt(var.beta)* qnorm(1-alpha/2)) pval = 2*(1-pnorm(abs(beta.hat)/sqrt(var.beta))) c(beta.hat, sqrt(var.beta), ci, pval) } ##################################################################### ## Design and Analysis under Result 3 (as illustrated in the article) ## ## The two functions analysisresult3_design and analysisresult3_analysis ## apply for a continuous univariate auxiliary W ## ## It specifies the optimal lambda using Result 3, specifying ## lambda* based on a pilot data-set of (W,Y) pairs and either fitting ## the model Var(Y|W) = exp(nu00 + nu0*W + nu1*W^2) by maximum likelihood ## (Approach 3a) or by using sample variances of Y within quantiles of W ## (Approach 3c). It also computes the optimal lambda* for simple random ## sampling under both of these approaches, and outputs relative ## efficiencies (REs), which can be used to help guide what sampling ## design to select for the study. ## ## After specifying lambda*, analysisresult3_analysis carries out the RR ## analysis, with E[Y|W] modeled using a simple linear regression ## model(OLS) OR robust method (ROB) ## ## The analysis function produces the following output: ## ## Estimated beta, s.e., (1-alpha)x100% Wald-based confidence interval ## for beta, and 2-sided Wald-based p-value for testing H0: beta=0, ## using the following methods (described in the simulation ## study section of Gilbert et al., 2011): ## ## 1. Complete-case method (Approach 1) ## ## AND either methods (2,3), (4,5), or (6,7) below, depending on which ## approach was used to determine the selection indicators R and hence ## the data on Y ## ## 2. RR method using simple random sampling and ## OLS fitting for E[Y|W] from study data (Approach 2) ## 3. RR method using simple random sampling and ## ROB fitting for E[Y|W] from study data (Approach 2) ## 4. RR method using optimal stratified sampling ## and OLS fitting for E[Y|W] from study data(Approach 3c) ## 5. RR method using optimal stratified sampling ## and ROB fitting for E[Y|W] from study data (Approach 3c) ## 6. RR method using optimal sampling ## and OLS fitting for E[Y|W] from study data (Approach 3a) ## 7. RR method using optimal quartilized stratified sampling ## and ROB fitting for E[Y|W] from study data(Approach 3a) ## ## For 5 and 7, robust fitting of a simple linear regression ## model E[Y|W] for the pilot data is used to specify the ## E[Y|W=j] vector for j = 1 to n. For 4-5 this is done ## with W a factor variable with M bins/quintiles; and for 6-7 ## this is done with W a continuous variable. ## ## For 2-7 the confidence intervals and p-values are computed ## using the sandwich variance estimator with the RR estimating equation. ## ## For Approach 1 the confidence intervals and p-values are computed ## using the sample variance of Y ## ###################################################################### analysisresult3_design <- function(Ypilot,Wpilot,Bprime,c2.v,c2.vstrat,Wstudy,M=4, meanFun=1) { ###################################################################### # Input variables: # #(Wpilot, Ypilot) = (W,Y) values from the pilot study where both W and Y # were measured # -Assumes complete data (no missing values) # # Bprime, c2.v = defined above # # c2.vstrat = a vector of c2 for stratified W # # Wstudy = W values from the study # -Assumes complete data (no missing values) # # M = Number of bins/quintiles for the optimal stratified analysis # # meanFun = 1 E(Y|W) = alpha0 +alpha1*W # = 2 E(Y|W) = alpha0 +alpha1*W + alpha2*W^2 # # Output: # optimal lambdas for optimal sampling, stratified sampling, simple # random sampling,2nd-phase selection indicators R for: # -optimal sampling # -optimal stratified sampling # -optimal simple random sampling, # RE for optimal sampling vs optimal simple random sampling, # RE for optimal sampling vs optimal stratified random sampling, # RE for optimal stratified sampling vs optimal simple random sampling # ####################################################################### my.data = data.frame(cbind(Ypilot,Wpilot)) my.study = data.frame(Wstudy) lambdaopt=NA lambdaoptstrat.n=NA lambdabar=NA Ropt=NA Roptstrat=NA Rsimple=NA RE3avs2=NA RE3cvs2=NA ################################################################ ## Analyze the pilot data to determine the optimal lambda* and ## optimal lambdabar (for optimal simple random sampling) ## ## Estimate the conditional variance function Var(Y|W) ## under the model ## Var(Y|W) = exp(nu00 + nu0*W + nu1*W^2) if (meanFun==1) { X=cbind(1, Wpilot) } else { X=cbind(1, Wpilot, Wpilot^2) } Z=cbind(1, Wpilot, Wpilot^2) fit1=myremlscore(Ypilot, X, Z, maxit=200) if (!inherits(fit1, "try-error") & fit1$conv==1) { gamma= fit1$gamma nu00.hat=gamma[1,1] nu0.hat=gamma[2,1] nu1.hat=gamma[3,1] #Set Var(Y|W) values for all of the study subjects based on their #auxiliaries W v.vopt= exp(nu00.hat + nu0.hat*Wstudy + nu1.hat*Wstudy^2) ############################################################# # Next, estimate the conditional variance function Var(Y|W) # under the model Var(Y|W) = variance for the quintile within # which W lies cuts=quantile(Wpilot, probs=seq(0,1,len=M+1)) ## form of ( ] ## make "cuts" from pilot data fit the range of study data cuts[1]=min(Wstudy) cuts[M+1]=max(Wstudy) my.data$Wpilot.j= cut(Wpilot, cuts, include.lowest =TRUE, labels=FALSE) varY.j = aggregate(Ypilot, by=list(my.data$Wpilot.j), FUN="var") v.voptstrat = varY.j$x ##################################### # Use the empirical distribution to estimate p.v = P(W=j) # For W continuous: n <- length(Wstudy) p.v <- rep(1/n,n) ##################################### ## Specify beta.v = vector of E[Y|W=j] based on the pilot data ## For W continuous: ## Obtain fitted values by robust lm (ROB method) fit = lmrob(Ypilot~Wpilot, data = my.data, control = lmrob.control(max.it = 100)) beta0hat <- coef(fit)[1] beta1hat <- coef(fit)[2] beta.v=beta0hat + beta1hat*Wstudy # For W in quintiles with m levels: fit = lmrob(Ypilot~factor(my.data$Wpilot.j), data = my.data, control = lmrob.control(max.it = 100)) beta0hat <- coef(fit)[1] beta1hatvect <- coef(fit)[2:M] my.study$Wstudy.j= cut(Wstudy, cuts, include.lowest =TRUE, labels=FALSE) # For W in quintiles with M levels: n.v = table(my.study$Wstudy.j) p.vstrat=n.v/n beta.vstrat = rep(NA, M) beta.vstrat[1] <- beta0hat for (j in 2:M) { beta.vstrat[j] <- beta0hat + beta1hatvect[j-1] } # Obtain lambdaopt and lambdabar needed for 6-7 and 2-3, respectively: callopt3 <- result3(Bprime,p.v,beta.v,v.vopt,c2.v,n) lambdaopt <- callopt3[c(1:n)] lambdabar <- callopt3[n+1] # Relative efficiency approach 3a vs optimal simple random sampling: RE3avs2 <- round(callopt3[n+2],4) # Obtain lambdaoptstrat for optimal stratified sampling for 4-5: call3optstrat <- result3(Bprime,p.vstrat,beta.vstrat,v.voptstrat,c2.vstrat,n) lambdaoptstrat <- call3optstrat[c(1:M)] lambdaoptstrat.n= rep(NA, n) for(j in 1:M){ lambdaoptstrat.n [my.study$Wstudy.j==j] = lambdaoptstrat [j] } # Relative efficiency approach 3c vs optimal simple random sampling: RE3cvs2 <- round(call3optstrat[M+2],4) ################################################ # Compute the 2nd-phase selection indicators R (under Bernoulli sampling) # for each of the three sampling approaches Rsimple <- rbinom(n,1,lambdabar) Ropt <- rbinom(n,1,lambdaopt) Roptstrat <- rbinom(n,1,lambdaoptstrat.n) } ans <- list(lambdaopt,lambdaoptstrat.n,lambdabar,Ropt,Roptstrat,Rsimple,RE3avs2,RE3cvs2) cat(paste("Relative efficiency of optimal sampling vs optimal simple random sampling = ",RE3avs2),"\n") cat("\n") cat(paste("Relative efficiency of stratified optimal sampling vs optimal simple random sampling = ",RE3cvs2),"\n") cat("\n") cat(paste("Relative efficiency of optimal vs optimal stratified sampling = ",round(RE3avs2/RE3cvs2,4)),"\n") cat("\n") ans } analysisresult3_analysis <- function(Ystudy,Wstudy,Rstudy,lambda,alpha=0.05,method="fully optimal", alhpa0=1, alpha1=1, alpha2=1, simulation=0) { ####################################################################################### # Input variables: # # Ystudy: Y values from the study, which are assumed to have been # selected based on one of the three indicator sets outputted from # analysisresult3_design. Missing values denoted by 0. # # Wstudy: Auxiliary W values from the study. Assumes complete data. # (no missing values) # # Rstudy: Selection indicators (outputted from analysisresult3_design) # # lambda: The vector of lambda(Wi)s i = 1 to n, # assumed to be outputted from analysisresult3_design as one of # the three sets # # alpha: Nominal level of the 2-sided test of H0: beta=0, and # determines the confidence level of the (1-alpha)x100% # confidence interval for beta # # method="fully optimal","stratified optimal","simple random sampling # optimal" is Approach 3a, 3c, 2, respectively # # simulation=1: for simulation, the true E(Y|W) is known, i.e., alpha0, # alpha1, alpha2 are known # # alhpa0=1, alpha1=1, alpha2=1 : parameters for the true E(Y|W), only # needed when simulation=1 # # Output: # # A matrix with columns of Estimated beta, S.E., lower level of CI, upper # level of CI, P-value and rows for complete case, RR using OLS, RR using # ROB, RR with known true E(Y|W) if simulation # ######################################################################################## my.data <- data.frame(cbind(Y=Ystudy,W=Wstudy,R=Rstudy,lambda)) ans1 <- round(Completecaseestimate.beta(my.data,alpha),4) ans2 <- round(RRestimate.beta(my.data,alpha, alhpa0, alpha1, alpha2, "OLS", 2),4) ans3 <- round(RRestimate.beta(my.data,alpha, alhpa0, alpha1, alpha2, "ROB", 2),4) resultsmat <- rbind(ans1,ans2,ans3) ## true E(Y|W) if(simulation==1) { ans4 <- round(RRestimate.beta(my.data,alpha, alhpa0, alpha1, alpha2, "OLS", 1),4) resultsmat <- rbind(resultsmat,ans4) } # Report the results colnames(resultsmat) <- c("Est. beta","S.E.","LL CI","UL CI","P-value") if (simulation==1) { rownames(resultsmat) <- c("CC","RR OLS","RR ROB", "RR(True G)") } else { rownames(resultsmat) <- c("CC","RR OLS","RR ROB") } #cat(paste("RR method using optimal 2-phase sampling: ",method),"\n") #print(resultsmat) resultsmat } ##################################################################################### ## For simulation 3(b) and 3(d) ## ## This function specifies the optimal lambda using Result 3, specifying ## lambda* based on a pilot data-set of (W,Y) pairs and either fitting ## the wrong model Var(Y|W) = exp(nu00 + nu0*W) by maximum likelihood ## (Approach 3b) or by using the true model Var(Y|W) (Approach 3d). ## It also computes the optimal lambda* for simple random sampling ## and outputs relative efficiencies (REs). analysisresult3bd_design<- function(Ypilot,Wpilot,Bprime,c2.v,Wstudy, c0, nu0, nu1,meanFun=1) { ########################################################################### # Input variables: # #(Wpilot, Ypilot) = (W,Y) values from the pilot study where both W and Y # were measured # -Assumes complete data (no missing values) # # Bprime, c2.v = defined above # # Wstudy = W values from the study. Assumes complete data (no missing # values) # # c0, nu0, nu1 = parameters for the true model Var(Y|W) # # meanFun = 1 E(Y|W) = alpha0 +alpha1*W # = 2 E(Y|W) = alpha0 +alpha1*W + alpha2*W^2 # # Output: # optimal lambdas for optimal sampling with wrong Var(Y|W), optimal # sampling with true Var(Y|W), simple random sampling, # 2nd-phase selection indicators R for: # -optimal sampling # -optimal stratified sampling # -optimal simple random sampling, # RE for optimal sampling vs optimal simple random sampling, # RE for optimal sampling vs optimal stratified random sampling, # RE for optimal stratified sampling vs optimal simple random sampling # ############################################################################ my.data = data.frame(cbind(Ypilot,Wpilot)) my.study = data.frame(Wstudy) lambdaoptW=NA lambdaoptT=NA lambdabar=NA RoptW=NA RoptT=NA Rsimple=NA RE3bvs2=NA RE3dvs2=NA ################################################################ ## Analyze the pilot data to determine the optimal lambda* ## ## Estimate the conditional variance function Var(Y|W) ## under the wrong model ## Var(Y|W) = exp(nu00 + nu0*W) if (meanFun==1) { X=cbind(1, Wpilot) } else { X=cbind(1, Wpilot, Wpilot^2) } Zwrong = cbind(1, Wpilot) ## for 3(b) fit2=myremlscore(Ypilot, X, Zwrong, maxit=200) if (!inherits(fit2, "try-error") & fit2$conv==1) { gamma= fit2$gamma nu00.hat=gamma[1,1] nu0.hat=gamma[2,1] #Set Var(Y|W) values using misspecified var function v.voptW= exp(nu00.hat + nu0.hat*Wstudy) #Set Var(Y|W) values using the true var function v.voptT= exp(c0 + nu0*Wstudy + nu1*Wstudy^2) ##################################### # Use the empirical distribution to estimate p.v = P(W=j) # For W continuous: n <- length(Wstudy) p.v <- rep(1/n,n) ################################ ## Specify beta.v = vector of E[Y|W=j] based on the pilot data ## Obtain fitted values by robust lm (ROB method) fit = lmrob(Ypilot~Wpilot, data = my.data, control = lmrob.control(max.it = 100)) beta0hat <- coef(fit)[1] beta1hat <- coef(fit)[2] beta.v=beta0hat + beta1hat*Wstudy # Obtain lambdaopt and lambdabar callopt3W <- result3(Bprime,p.v,beta.v,v.voptW,c2.v,n) callopt3T <- result3(Bprime,p.v,beta.v,v.voptT,c2.v,n) lambdaoptW <- callopt3W[c(1:n)] lambdaoptT <- callopt3T[c(1:n)] lambdabar <- callopt3W[n+1] # Relative efficiency approach 3a vs simple random sampling: RE3bvs2 <- round(callopt3W[n+2],4) RE3dvs2 <- round(callopt3T[n+2],4) ################################### # Compute the selection indicators R (under Bernoulli sampling) for each # of the three sampling approaches Rsimple <- rbinom(n,1,lambdabar) RoptW <- rbinom(n,1,lambdaoptW) RoptT <- rbinom(n,1,lambdaoptT) } ans <- list(lambdaoptW,lambdaoptT,lambdabar,RoptW,RoptT,Rsimple,RE3bvs2,RE3dvs2) ans } ######################################################################################## ## This function calculates estimates of the mean beta (beta.hats) for ## different approaches, for simulated pilot and study ## data-sets. This code is used to produce the simulation study ## figures of the article. my.simulation = function(nstudy, npilot, mn, sigma, alpha0, alpha1, alpha2, c0, nu0, nu1, c2.v, c2.vstrat, Bprime, n.simu=100, alpha=0.05, M=4, meanFun=1) { ######################################################################################## # Input variables: # # nstudy = sample size for study data # # npilot = sample size for pilot data # # mn = mean value for W # sigma = standard deviation of W, assuming W ~ N(mn, sigma^2) # # alhpa0, alpha1, alpha2 = parameters for E(Y|W)= alpha0 +alpha1*W + alpha2*W^2 # # c0, nu0, nu1 = parameters for Var(Y|W) = exp(c0 + nu0*W + nu1*W^2) # # c2.v, c2.vstrat, Bprime, alpha, M = defined above # # n.simu = number of simulations # # meanFun = 1 E(Y|W) = alpha0 +alpha1*W # = 2 E(Y|W) = alpha0 +alpha1*W + alpha2*W^2 # # Output: # RE = RE for Approaches 2-3(d) vs 1 # all.beta = estimated betas for all Approaches 1-3(d) ######################################################################################## all.beta=NULL for (i in 1:n.simu) { Wpilot=rnorm(npilot, mean=mn, sd=sigma) Ypilot=rnorm(npilot, mean=alpha0 + alpha1*Wpilot + alpha2*Wpilot^2, sd=sqrt(exp(c0+nu0*Wpilot+nu1*Wpilot^2))) Wstudy=rnorm(nstudy, mean=mn, sd=sigma) YstudyC=rnorm(nstudy, mean=alpha0 + alpha1*Wstudy + alpha2*Wstudy^2, sd=sqrt(exp(c0+nu0*Wstudy+nu1*Wstudy^2))) ans <- analysisresult3_design(Ypilot,Wpilot,Bprime,c2.v,c2.vstrat,Wstudy,M, meanFun) ans2 <- analysisresult3bd_design(Ypilot,Wpilot,Bprime, c2.v,Wstudy, c0, nu0, nu1, meanFun) if (!is.na(ans[[1]][1]) & !is.na(ans2[[1]][1])) { # Choose the phase-2 Y sample based on the fully optimal sampling design lambda <- ans[[1]] Rstudy <- ans[[4]] Ystudy = YstudyC Ystudy[Rstudy==0] <- 0 # Carry out the data analysis: ansOpt <- analysisresult3_analysis(Ystudy,Wstudy,Rstudy,lambda,alpha,"fully optimal", alpha0, alpha1, alpha2,simulation=1) ###################### # Choose the phase-2 Y sample based on the optimal stratified sampling # design lambda <- ans[[2]] Rstudy <- ans[[5]] Ystudy = YstudyC Ystudy[Rstudy==0] <- 0 ansStrat <- analysisresult3_analysis(Ystudy,Wstudy,Rstudy,lambda,alpha,"stratified optimal", alpha0, alpha1, alpha2,simulation=1) ####################### # Choose the phase-2 Y sample based on the optimal simple random sampling # design lambda <- rep(ans[[3]], length(Wstudy)) Rstudy <- ans[[6]] Ystudy = YstudyC Ystudy[Rstudy==0] <- 0 ansSimple <- analysisresult3_analysis(Ystudy,Wstudy,Rstudy,lambda,alpha,"simple random sampling optimal", alpha0, alpha1, alpha2,simulation=1) # Choose the phase-2 Y sample based on the fully optimal sampling design lambdaW <- ans2[[1]] RstudyW <- ans2[[4]] Ystudy = YstudyC Ystudy[RstudyW==0] <- 0 # Carry out the data analysis: ansOptW <- analysisresult3_analysis(Ystudy,Wstudy,RstudyW,lambdaW,alpha,"fully optimal", alpha0, alpha1, alpha2,simulation=1) lambdaT <- ans2[[2]] RstudyT <- ans2[[5]] Ystudy = YstudyC Ystudy[RstudyT==0] <- 0 # Carry out the data analysis: ansOptT <- analysisresult3_analysis(Ystudy,Wstudy,RstudyT,lambdaT,alpha,"fully optimal", alpha0, alpha1, alpha2,simulation=1) all.beta = rbind(all.beta, c(ansSimple[1:4,1], ansOpt[2:4,1], ansOptW[2:4,1], ansStrat[2:4,1], ansOptT[2:4,1])) } } varSim=apply(all.beta, 2, var, na.rm=TRUE) RE= varSim/varSim[1] list(RE=RE, all.beta=all.beta) } ####################################################################### ## The function myremlscore is used to estimate the parameters in the ## model Var(Y|W) = exp(nu00 + nu0*W + nu1*W^2) ## ## The function uses the remlscore function from statmod library. ## We add an indicator "conv" for convergence myremlscore = function (y, X, Z, trace = FALSE, tol = 1e-05, maxit = 40) { conv=1 n <- length(y) p <- dim(X)[2] q <- dim(Z)[2] const <- n * log(2 * pi) fitm <- lm.fit(X, y) if (fitm$qr$rank < p) stop("X is of not of full column rank") Q <- qr.Q(fitm$qr) h <- as.vector(Q^2 %*% array(1, c(p, 1))) d <- fitm$residuals^2 wd <- 1 - h zd <- log(d/(1 - h)) + 1.27 fitd <- lm.wfit(Z, zd, wd) gam <- ifelse(is.na(fitd$coef), 0, fitd$coef) g <- fitd$fitted.values phi <- exp(g) wm <- 1/phi fitm <- lm.wfit(X, y, wm) d <- fitm$residuals^2 dev <- sum(d/phi) + sum(log(phi)) + const + 2 * log(prod(abs(diag(fitm$qr$qr)))) iter <- 0 if (trace) cat("Iter =", iter, ", Dev =", dev, " Gamma", gam, "\n") Q2 <- array(0, c(n, p * (p + 1)/2)) repeat { iter <- iter + 1 Q <- qr.qy(fitm$qr, diag(1, nrow = n, ncol = p)) j0 <- 0 for (k in 0:(p - 1)) { Q2[, (j0 + 1):(j0 + p - k)] <- Q[, 1:(p - k)] * Q[, (k + 1):p] j0 <- j0 + p - k } if (p > 1) Q2[, (p + 1):(p * (p + 1)/2)] <- sqrt(2) * Q2[, (p + 1):(p * (p + 1)/2)] h <- drop(Q2[, 1:p] %*% array(1, c(p, 1))) Q2Z <- t(Q2) %*% Z ZVZ <- (t(Z) %*% vecmat(1 - 2 * h, Z) + t(Q2Z) %*% Q2Z)/2 maxinfo <- max(diag(ZVZ)) if (iter == 1) { lambda <- abs(mean(diag(ZVZ)))/q I <- diag(q) } zd <- (d - (1 - h) * phi)/phi dl <- crossprod(Z, zd)/2 gamold <- gam devold <- dev lev <- 0 repeat { lev <- lev + 1 R <- chol(ZVZ + lambda * I) dgam <- backsolve(R, backsolve(R, dl, transpose = TRUE)) gam <- gamold + dgam phi <- as.vector(exp(Z %*% gam)) wm <- 1/phi fitm <- lm.wfit(X, y, wm) d <- fitm$residuals^2 dev <- sum(d/phi) + sum(log(phi)) + const + 2 * log(prod(abs(diag(fitm$qr$qr)))) if (dev < devold - 1e-15) break if (lambda/maxinfo > 1e+15) { gam <- gamold warning("Too much damping - convergence tolerance not achievable") break } lambda <- 2 * lambda if (trace) cat("Damping increased to", lambda, "\n") } if (trace) cat("Iter =", iter, ", Dev =", dev, " Gamma", gam, "\n") if (lambda/maxinfo > 1e+15) break if (lev == 1) lambda <- lambda/10 if (crossprod(dl, dgam) < tol) break if (iter > maxit) { conv=0 ## not convergent warning("reml: Max iterations exceeded") break } } cov.gam <- chol2inv(chol(ZVZ)) se.gam <- sqrt(diag(cov.gam)) cov.beta <- chol2inv(qr.R(fitm$qr)) se.beta <- sqrt(diag(cov.beta)) list(beta = fitm$coef, se.beta = se.beta, gamma = gam, se.gam = se.gam, mu = fitm$fitted, phi = phi, deviance = dev, h = h, cov.beta = cov.beta, cov.gam = cov.gam, conv=conv) } ################################## The end of functions ################################################### ####################################################### # Generate a practice pilot data-set and a study data-set, # for illustrating how to perform the design and analysis # for Result 3 library(statmod) library(robustbase) npilot <- 100 nstudy <- 200 Wpilot <- rnorm(npilot) Ypilot <- Wpilot + (Wpilot^2)*rnorm(npilot) Bprime <- 10000 c2.v <- rep(500,nstudy) M=4 c2.vstrat = rep(500, M) Wstudy <- rnorm(nstudy) ans <- analysisresult3_design(Ypilot,Wpilot,Bprime,c2.v,c2.vstrat,Wstudy,M, meanFun=1) # Choose the phase-2 Y sample based on the fully optimal sampling design lambda <- ans[[1]] Rstudy <- ans[[4]] Ystudy <- rep(0,nstudy) Ystudy[Rstudy==1] <- Wstudy[Rstudy==1] + (Wstudy[Rstudy==1]^2)*rnorm(length(Wstudy[Rstudy==1])) alpha <- 0.05 # Carry out the data analysis: ansOpt <- analysisresult3_analysis(Ystudy,Wstudy,Rstudy,lambda,alpha,"fully optimal") ###################### # Choose the phase-2 Y sample based on the optimal stratified sampling design lambda <- ans[[2]] Rstudy <- ans[[5]] Ystudy <- rep(0,nstudy) Ystudy[Rstudy==1] <- Wstudy[Rstudy==1] + (Wstudy[Rstudy==1]^2)*rnorm(length(Wstudy[Rstudy==1])) ansStrat <- analysisresult3_analysis(Ystudy,Wstudy,Rstudy,lambda,alpha,"stratified optimal") ###################### # Choose the phase-2 Y sample based on the optimal simple random sampling design lambda <- rep(ans[[3]], length(Wstudy)) Rstudy <- ans[[6]] Ystudy <- rep(0,nstudy) Ystudy[Rstudy==1] <- Wstudy[Rstudy==1] + (Wstudy[Rstudy==1]^2)*rnorm(length(Wstudy[Rstudy==1])) ansSimple <- analysisresult3_analysis(Ystudy,Wstudy,Rstudy,lambda,alpha,"simple random sampling optimal") ###################################### Simulation #################################### mn=3.3 sigma=sqrt(0.5) alpha0=0.1 alpha1=3 alpha2=0 var.m=rbind(c(1, 0.2, 2.890, 0, 0), c(1,0.5, 1.504, 0, 0), c(1,0.8, 0.118, 0, 0), c(0,0.2, -1.026, -0.2, 0.3), c(0, 0.5, -2.413, -0.2, 0.3), c(0,0.8, -3.799,-0.2, 0.3)) var.m=data.frame(var.m) names(var.m) = c("indep", "R2", "c0", "nu0", "nu1") n.s= 10000 npilot <- 1000 # m nstudy <- 200 # n Bprime=nstudy*4000*0.10 c2.v <- rep(4000,nstudy) c2.vstrat = rep(4000, 4) RE = NULL for (ind in 0:1) { for (r2 in c(0.2, 0.5, 0.8)) { parm.v= as.numeric(subset(var.m, R2==r2 & indep==ind)) c0=parm.v[3] nu0=parm.v[4] nu1=parm.v[5] tt= my.simulation (nstudy, npilot, mn, sigma, alpha0, alpha1, alpha2, c0, nu0, nu1, c2.v, c2.vstrat, Bprime, n.simu=n.s, alpha=0.05) res = cbind(npilot, nstudy, c0, nu0, nu1, r2, re=tt$RE) RE = rbind(RE, res) } } RE = data.frame(RE) names(RE) = c("m", "n", "c0", "nu0", "nu1", "R2", "RE") RE$Approach = rep(c(1, rep(2:6, each=3)), 6) RE$lm = rep(c("none", rep(c("OLS", "ROB", "true"),5)), 6) RE$estimate = rep(c("none", rep(c("est", "est", "true"), 5)), 6) write.csv(RE, paste("allRE", format(Sys.Date(),"%Y%m%d"),".csv", sep=""), row.names=FALSE, col.names=FALSE)