#================================================================= # # Name: roc.s # # Purpose: ROC curves for binary and oridnal data with C.I. # # Date: 03/15/2005 # #================================================================== options(contrasts=c("contr.treatment", "contr.poly")) mpinv<-function(A, eps=1e-13){ # Moore-Penrose generalized inverse of a matrix s<-svd(A) e<-s$d e[e>eps]<-1/e[e>eps] return(s$v %*% diag(e) %*% t(s$u)) } vecnorm <- function(x, p=2) sum(x^p, na.rm=T)^(1/p) #===== Binary-scale data===== Page 101 ==== roc.binary<-function(s1, s0, r1, r0, alpha){ result<-list(estimate=NULL, variance=NULL, asympCI=NULL, scoreCI=NULL) n1<-s1+s0 n0<-r1+r0 #-----estimate sensitivity and specificity--------- se<-s1/(s1+s0) sp<-r0/(r1+r0) #-----estimate variance for sensitivity and specificity---- se.var<-s1*s0/n1^3 sp.var<-r1*r0/n0^3 z<-qnorm(1-alpha/2) #-----asymptotic C.I. for sensitivity and specifity--------- se.asympCI<-c(se-z*sqrt(se.var), se+z*sqrt(se.var)) sp.asympCI<-c(sp-z*sqrt(sp.var), sp+z*sqrt(sp.var)) #-----score C.I. for sensitivity and specificity------------ se.scoreCI<-c((se+z^2/2/n1 - z*sqrt((se*(1-se)+z^2/4/n1)/n1))/(1+z^2/n1), (se+z^2/2/n1 +z*sqrt((se*(1-se)+z^2/4/n1)/n1))/(1+z^2/n1)) sp.scoreCI<-c((sp+z^2/2/n1 - z*sqrt((sp*(1-sp)+z^2/4/n0)/n0))/(1+z^2/n0), (sp+z^2/2/n0 +z*sqrt((sp*(1-sp)+z^2/4/n0)/n0))/(1+z^2/n0)) result$estimate<-rbind(se, sp) result$variance<-rbind(se.var, sp.var) result$asympCI<-rbind(se.asympCI, sp.asympCI) result$scoreCI<-rbind(se.scoreCI, sp.scoreCI) print(result) } #-- Mammogram Results of 30 pts with and 30 pts without breast cancer--Pg.17- roc.binary(29, 1, 19, 11, 0.05) #==== ordinal-scale data ========= Pg. 110 =============== roc.ordinal.emp<-function(data, alpha){ result<-list(area.empROC=NULL, var.empROC.HM=NULL, var.empROC.DDC=NULL, empROC.CI.HM=NULL, empROC.CI.DDC=NULL) z<-qnorm(1-alpha/2) if(dim(data)[1]!=2){ print("The dimension for disease status should be 2") } K<-dim(data)[2] n1<-sum(data[1,]) n0<-sum(data[2,]) #----draw empirical ROC curve---------- se<-rep(NA, length = K) FPR<-rep(NA, length = K) for(i in 1:K){ se[i]= sum(data[1,i:K])/n1 FPR[i] = sum(data[2, i:K])/n0 } plot(FPR,se, xlim=c(0,1), ylim=c(0,1), type="b", xlab="FPR", ylab="Specificity", main=paste("Empirical ROC curve for Oridinal Data with ", K, "Categories")) #--- area under empirical ROC curve Pg. 126-131 ---------- tmpM<-matrix(NA, nrow=K, ncol=K) for(i in 1:K){ for(j in 1:K){ if(j > i){ tmpM[i,j]<-0 } if(i == j){ tmpM[i,j]<-0.5 } if(j < i){ tmpM[i,j]<-1 } } } tmpM2<-matrix(NA, ncol=K, nrow=K) for(i in 1:K){ for(j in 1:K){ tmpM2[i,j]<-data[1,i]*data[2,j]*tmpM[i,j] } } area.empROC<-sum(tmpM2)/n0/n1 #----variance of area under empirical ROC Hanley & McNeil--------- Q1<-area.empROC/(2-area.empROC) Q2<-2*area.empROC^2/(1+area.empROC) var.empROC.HM<-(area.empROC*(1-area.empROC) + (n1-1)*(Q1-area.empROC^2) + (n0-1)*(Q2-area.empROC^2))/n0/n1 #---variance of area under empirical ROC by Delong, Delong & Clarke-Pearson-- V1<-rep(NA, length=K) V0<-rep(NA, length=K) tmpM.V1<-matrix(NA, ncol=K, nrow=K) tmpM.V0<-matrix(NA, ncol=K, nrow=K) for(j in 1:K){ tmpM.V1[,j]<-data[2,j]*tmpM[,j] } V1<-apply(tmpM.V1, 1, sum)/n0 for(i in 1:K){ tmpM.V0[i,]<-data[1,i]*tmpM[i,] } V0<-apply(tmpM.V0, 2, sum)/n1 SS.T1<-sum(data[1,]*(V1-area.empROC)^2)/(n1-1) SS.T0<-sum(data[2,]*(V0-area.empROC)^2)/(n0-1) var.empROC.DDC<-SS.T1/n1 + SS.T0/n0 #----- calcualte CI for area under empirical ROC --------------- empROC.CI.HM <-c(area.empROC-z*sqrt(var.empROC.HM), area.empROC+z*sqrt(var.empROC.HM)) empROC.CI.DDC<-c(area.empROC-z*sqrt(var.empROC.DDC), area.empROC+z*sqrt(var.empROC.DDC)) result<-list(area.empROC=NULL, var.empROC.HM=NULL, var.empROC.DDC=NULL, empROC.CI.HM=NULL, empROC.CI.DDC=NULL) result$area.empROC<-area.empROC result$var.empROC.HM<-var.empROC.HM result$var.empROC.DDC<-var.empROC.DDC result$empROC.CI.HM<-empROC.CI.HM result$empROC.CI.DDC<-empROC.CI.HM return(result) } #----- Fitting a smooth curve for oridnal data Pg 112-116--- roc.ordinal.smooth<-function(data, alpha, FPR){ result<-list(est.ini=NULL, est.improved=NULL, var.est=NULL, area.ROC.smooth=NULL, auc.var=NULL, auc.CI=NULL, sen=NULL, sen.CI=NULL) K<-dim(data)[2] n1<-sum(data[1,]) n0<-sum(data[2,]) c<-rep(NA, length=K) for(i in 1:(K-1)){ c[i]<-qnorm(sum(data[2,1:i])/n0) } c[K]<-Inf aVec<-rep(NA, length=K) bVec<-rep(NA, length=K) for(i in 1:K-1){ x<-qnorm(sum(data[1,1:i])/n1) y<-qnorm(sum(data[1,1:(i+1)])/n1) bVec[i]<-(y-x)/(c[i+1]-c[i]) aVec[i]<- (bVec[i]-x) } a<-mean(aVec, na.rm=T) b<-mean(bVec, na.rm=T) s0<-c(a, b, c[1:K-1]) #--- calculate first partial ----- r<-rep(NA, length=K+1) x<-c(0,pnorm(b*c[1:(K-1)]-a),1) t1<-x[2:K]-x[1:(K-1)] t2<-x[3:(K+1)]-x[2:K] t1[is.na(t1)]<-0 t2[is.na(t2)]<-0 tt1<-data[1,1:(K-1)]/n1 tt2<-data[1,2:K]/n1 x<-c(0, pnorm(c[1:(K-1)]), 1) v1<-x[2:K]-x[1:(K-1)] v2<-x[3:(K+1)] -x[2:K] v1[is.na(v1)]<-0 v2[is.na(v2)]<-0 vv1<-data[2,1:(K-1)]/n0 vv2<-data[2,2:K]/n0 r[1]<- -n1*sum(dnorm(b*c[1:(K-1)]-a)*(tt1/t1-tt2/t2), na.rm=T) r[2]<-n1*sum(dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)]*(tt1/t1-tt2/t2), na.rm=T) tmpVec<-n1*dnorm(b*c[1:(K-1)]-a)*b*(tt1/t1-tt2/t2) + n0*dnorm(c[1:(K-1)])*(vv1/v1-vv2/v2) tmpVec[is.na(tmpVec)]<-0 r[3:(K+1)]<-tmpVec r<-as.numeric(r) #--calculate second partial ----- A<-matrix(0, nrow=(K+1), ncol=(K+1)) x<-c(0,dnorm(b*c[1:(K-1)]-a),0) tt1<-x[2:K]-x[1:(K-1)] tt2<-x[3:(K+1)]-x[2:K] x<-c(0,pnorm(b*c[1:(K-1)]-a),1) t1<-x[2:K]-x[1:(K-1)] t2<-x[3:(K+1)]-x[2:K] A[1,1]<--n1*sum(dnorm(b*c[1:(K-1)]-a)*(tt1/t1-tt2/t2), na.rm=T) A[1,2]<-n1*sum(dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)]*(tt1/t1-tt2/t2), na.rm=T) tmpVec<-n1*dnorm(b*c[1:(K-1)]-a)*b*(tt1/t1-tt2/t2) tmpVec[is.na(tmpVec)]<-0 A[1, 3:(K+1)]<-tmpVec A[2, 3:(K+1)]<- -tmpVec x<-c(0,dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)],0) x[is.na(x)]<-0 tt1<-x[2:K]-x[1:(K-1)] tt2<-x[3:(K+1)]-x[2:K] A[2,2]<- -n1*sum(dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)]*(tt1/t1-tt2/t2), na.rm=T) x<-c(0, pnorm(c[1:(K-1)]), 1) v1<-x[2:K]-x[1:(K-1)] v2<-x[3:(K+1)] -x[2:K] x<-dnorm(b*c[1:(K-1)]-a) y<-dnorm(c[1:(K-1)]) diag(A)[3:(K+1)]<- -n1*x*b^2*(x/t1+x/t2)-n0*y*(y/v1+y/v2) diag(A)[is.na(diag(A))]<-0 for(i in 2:(K+1)){ for(j in 1:i){ A[i,j]<-A[j,i] } } s1<-s0 + mpinv(-A)%*%r result$est.ini<-s0 result$est.improved<-s1 a<-s0[1] b<-s0[2] c[1:(K-1)]<-s0[3:(K+1)] A<-matrix(0, nrow=(K+1), ncol=(K+1)) x<-c(0,dnorm(b*c[1:(K-1)]-a),0) tt1<-x[2:K]-x[1:(K-1)] tt2<-x[3:(K+1)]-x[2:K] x<-c(0,pnorm(b*c[1:(K-1)]-a),1) t1<-x[2:K]-x[1:(K-1)] t2<-x[3:(K+1)]-x[2:K] A[1,1]<--n1*sum(dnorm(b*c[1:(K-1)]-a)*(tt1/t1-tt2/t2), na.rm=T) A[1,2]<-n1*sum(dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)]*(tt1/t1-tt2/t2), na.rm=T) tmpVec<-n1*dnorm(b*c[1:(K-1)]-a)*b*(tt1/t1-tt2/t2) tmpVec[is.na(tmpVec)]<-0 A[1, 3:(K+1)]<-tmpVec A[2, 3:(K+1)]<- -tmpVec x<-c(0,dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)],0) x[is.na(x)]<-0 tt1<-x[2:K]-x[1:(K-1)] tt2<-x[3:(K+1)]-x[2:K] A[2,2]<- -n1*sum(dnorm(b*c[1:(K-1)]-a)*c[1:(K-1)]*(tt1/t1-tt2/t2), na.rm=T) x<-c(0, pnorm(c[1:(K-1)]), 1) v1<-x[2:K]-x[1:(K-1)] v2<-x[3:(K+1)] -x[2:K] x<-dnorm(b*c[1:(K-1)]-a) y<-dnorm(c[1:(K-1)]) diag(A)[3:(K+1)]<- -n1*x*b^2*(x/t1+x/t2)-n0*y*(y/v1+y/v2) diag(A)[is.na(diag(A))]<-0 for(i in 2:(K+1)){ for(j in 1:i){ A[i,j]<-A[j,i] } } result$var.est<-mpinv(-A) #---- plot smooth ROC curve ---------- x<-seq(0,1 ,length=500) y<-1-pnorm(b*qnorm(1-x)-a) plot(x, y, type="l", xlab="FPR", ylab="Sensitivity", main=paste("Smooth ROC curve for ordinal data with ",K, " categories")) #--- AUC estimate ------ varMatrix<-mpinv(-A) result$area.ROC.smooth<- pnorm(a/sqrt(1+b^2)) f<-exp(-a^2/2/(1+b^2))/sqrt(2*pi*(1+b^2)) g<--a*b*exp(-a/2/(1+b^2))/sqrt(2*pi*(1+b^2)^3) result$auc.var<-f^2*varMatrix[1,1]+g^2*varMatrix[2,2]+2*f*g*varMatrix[1,2] z<-qnorm(1-alpha/2) result$auc.CI<-c(result$area.ROC.smooth-z*sqrt(result$auc.var), result$area.ROC.smooth+z*sqrt(result$auc.var)) #--- calculate sensitivity at a particular FPR --- z.FPR<- qnorm(FPR) z.TPR<-b*z.FPR-a sen<-1-pnorm(z.TPR) var.z.TPR<-z.FPR^2*varMatrix[2,2] + varMatrix[1,1] -2*varMatrix[1,2] result$sen<-sen ll<-z.TPR-z*sqrt(var.z.TPR) ul<-z.TPR+z*sqrt(var.z.TPR) result$sen.CI<-cbind(1-pnorm(ll), 1-pnorm(ul)) return(result) } data<-matrix(NA, ncol=5, nrow=2) data[1,]<-c(1, 0, 1, 11, 0) data[2,]<-c(22, 8, 7, 8, 0) data<-as.data.frame(data) roc.ordinal.emp(data, 0.05) roc.ordinal.smooth(data,0.05, 0.2) #===== ROC for continuous =================== roc.continous<-function(data, alpha, FPR){ if(length(table(data[,1]))!=2){ print("disease status has more than 2 categories!") return(NULL) } n1<-sum(data[,1]) n0<-dim(data)[1]-n1 c<-sort(unique(data[,2])) K<-length(c) #-------- empirical ROC curve ------------------------- F0<-rep(NA, length=K) F1<-rep(NA, length=K) for(i in 1:K){ F0[i]<- sum(data[data[,1]==0,][,2]>c[i])/n0 F1[i]<- sum(data[data[,1]==1,][,2]>c[i])/n1 } F0<-c(1, F0, 0) F1<-c(1, F1, 0) plot(F0, F1, xlab="FPR", ylab="Sensitivity", main="ROC curve for continuous data", type="s") # ------- smooth ROC curve, parametric method Pg 139 ---------- # if data is binormal test<-data$T disease<-data$D mu0<-mean(test[disease==0]) mu1<-mean(test[disease==1]) sigma0<-sqrt(var(test[disease==0])) sigma1<-sqrt(var(test[disease==1])) b<-sigma0/sigma1 a<-(mu1-mu0)/sigma1 result$est.binormal<-c(a, b) result$var.binormal<-matrix(c((n1*(a^2+2)+2*n0*b^2)/2/n0/n1, a*b/2/n0, a*b/2/n0, (n1+n0)*b^2/2/n1/n0), ncol=2, byrow=T) x<-seq(0,1 ,length=500) y<-1-pnorm(b*qnorm(1-x)-a) lines(x, y, lty=2) result$auc.binormal<-pnorm(a/sqrt(1+b^2)) f<-exp(-a^2/2/(1+b^2))/sqrt(2*pi*(1+b^2)) g<--a*b*exp(-a/2/(1+b^2))/sqrt(2*pi*(1+b^2)^3) result$auc.var.binormal<-f^2*result$var.binormal[1,1]+g^2*result$var.binormal[2,2]+2*f*g*result$var.binormal[1,2] z<-qnorm(1-alpha/2) result$auc.CI.binormal<-c(result$auc.binormal-z*sqrt(result$auc.var.binormal), result$auc.binormal+z*sqrt(result$auc.var.binormal)) #--- direct Binormal estiamte(log-transformed)---------------- test<-log(data$T) disease<-data$D mu0<-mean(test[disease==0]) mu1<-mean(test[disease==1]) sigma0<-sqrt(var(test[disease==0])) sigma1<-sqrt(var(test[disease==1])) b<-sigma0/sigma1 a<-(mu1-mu0)/sigma1 result$est.logBinormal<-c(a, b) result$var.logBinormal<-matrix(c((n1*(a^2+2)+2*n0*b^2)/2/n0/n1, a*b/2/n0, a*b/2/n0, (n1+n0)*b^2/2/n1/n0), ncol=2, byrow=T) x<-seq(0,1 ,length=500) y<-1-pnorm(b*qnorm(1-x)-a) lines(x, y, lty=3) legend(0.2, 0.5, c("Empirical", "Direct Binormal estimate(untransformed)", "Direct Binormal estimate(untransformed)"), lty = c(1, 2,3)) result$auc.logBinormal<-pnorm(a/sqrt(1+b^2)) f<-exp(-a^2/2/(1+b^2))/sqrt(2*pi*(1+b^2)) g<--a*b*exp(-a/2/(1+b^2))/sqrt(2*pi*(1+b^2)^3) result$auc.var.logBinormal<-f^2*result$var.logBinormal[1,1]+g^2*result$var.logBinormal[2,2]+2*f*g*result$var.logBinormal[1,2] result$auc.CI.logBinormal<-c(result$auc.logBinormal-z*sqrt(result$auc.var.logBinormal), result$auc.logBinormal+z*sqrt(result$auc.var.logBinormal)) #--- Fixed FPR -- The sensitivity and decision threshold --- Pg 148-- Schafer-------- return(result) } x1<-rep(1, length=41) y1<-c(140, 1087, 230, 183, 1256, 700, 16, 800, 253, 740, 126, 153, 283, 90, 303, 193, 76, 1370, 543, 913, 230, 463, 60, 509, 576, 671, 80, 490, 156, 356, 350, 323, 1560, 120, 216, 443, 523, 76, 303, 353, 206) x0<-rep(0, length=19) y0<-c(136, 286, 281, 23, 200, 146, 220, 96, 100, 60, 17, 27, 126, 100, 253, 70, 40, 6, 46) data<-as.data.frame(cbind(c(x1, x0), c(y1, y0))) names(data)<-c("D", "T") roc.continous(data, 0.05) #================ Comparing the Accuracy of Two Diagnostic Tests ====== Pg 165==================== twoTests.binary.unpaired<-function(s11, s10, s21, s20, alpha){ result<-list(se=NULL, diff.se=NULL, wald.var=NULL, wald.CI=NULL, NH.CI=NULL, AC.var=NULL, AC.CI=NULL, EE.CI=NULL, TT.CI=NULL) n11<-s11+s10 n21<-s21+s20 z<-qnorm(1-alpha/2) se<-c(s11/n11, s21/n21) diff.se<-se[2]-se[1] #------ Wald method -------------------- wald.var<-se[1]*(1-se[1])/n11 + se[2]*(1-se[2])/n21 wald.CI<-c(diff.se-z*sqrt(wald.var), diff.se+z*sqrt(wald.var)) #---- Newcombe's hybrid(NH) score method ------- l1<-(2*se[1]*n11+z^2 - sqrt(4*n11*se[1]*z^2+z^4-4*n11*se[1]^2*z^2))/2/(n11+z^2) u1<-(2*se[1]*n11+z^2 + sqrt(4*n11*se[1]*z^2+z^4-4*n11*se[1]^2*z^2))/2/(n11+z^2) l2<-(2*se[2]*n21+z^2 - sqrt(4*n21*se[2]*z^2+z^4-4*n21*se[2]^2*z^2))/2/(n21+z^2) u2<-(2*se[2]*n21+z^2 + sqrt(4*n21*se[2]*z^2+z^4-4*n21*se[2]^2*z^2))/2/(n21+z^2) NH.CI<-c(diff.se-((se[2]-l2)^2+(u1-se[1])^2)^0.5, diff.se+((u2-se[2])^2 +(se[1]-l1)^2)^0.5) # ---- Agresti and Caffo(AC) method ----------------- p1<-(s11+1)/(n11+2) p2<-(s21+1)/(n21+2) AC.var<-p2*(1-p2)/n21 + p1*(1-p1)/n11 AC.CI <- c(diff.se-z*sqrt(AC.var), diff.se+z*sqrt(AC.var)) #---- Edgeworth expansion direct (EE) method -------- n<-n11+n21 delta<-(n/n21)^2*se[2]*(1-se[2])*(1-2*se[2]) -(n/n11)^2*se[1]*(1-se[1])*(1-2*se[1]) sigma<-(n/n21*se[2]*(1-se[2])+n/n11*se[1]*(1-se[1]))^0.5 a<-delta/6/sigma^2 b<-n*(1-2*se[2])/2/n21 - delta/6/sigma^2 Q-(a+b*z^2)/sigma EE.CI<-c(diff.se -(se[1]*(1-se[1])/n11 + se[2]*(1-se[2])/n21)^0.5*(z-n^0.5*Q), diff.se -(se[1]*(1-se[1])/n11 + se[2]*(1-se[2])/n21)^0.5*(-z-n^0.5*Q)) #-----Edgeworth expansion monotone transformation (TT) method ------ gl<-n^0.5*/b/sigma*((1+3*b*sigma*(z/n^0.5 -a*sigma/n))^(1/3) - 1) gu<- n^0.5*/b/sigma*((1+3*b*sigma*(-z/n^0.5 -a*sigma/n))^(1/3) - 1) TT.CI<-c(diff.se-(se[1]*(1-se[1])/n11 + se[2]*(1-se[2])/n21)^0.5*gl, diff.se-(se[1]*(1-se[1])/n11 + se[2]*(1-se[2])/n21)^0.5*gu) result$wald.var<-wald.var result$wald.CI<-wald.CI result$NH.CI<-NH.CI result$AC.var<-AC.var result$AC.CI<-AC.CI result$EE.CI<-EE.CI result$TT.CI<-TT.CI return(result) } twoTest.binary.paired<-function(y11, y10, y01, y00, alpha){ n<-y11+y10+y01+y00 y1<-y11+y10 y2<-y11+y01 se<-c(y1/n, y2/n) diff.se<-se[2]-se[1] z<-qnorm(1-alpha/2) #------ Wald method ------ wald.var<-(se[1]*(1-se[1])+se[2]*(1-se[2])+2*(se[1]*se[2] - y11/n))/n wald.CI<-c(diff.se-z*sqrt(wald.var), diff.se+z*sqrt(wald.var)) #------ Newcombe's hybrid score method (NH) -------- tmp1<-(y00+y01)/n tmp2<-(y00+y10)/n l1<-(2*tmp1*n+z^2 - sqrt(4*n*tmp1*z^2+z^4-4*n*tmp1^2*z^2))/2/(n+z^2) u1<-(2*tmp1*n+z^2 + sqrt(4*n*tmp1*z^2+z^4-4*n*tmp1^2*z^2))/2/(n+z^2) l2<-(2*tmp2*n+z^2 - sqrt(4*n*tmp2*z^2+z^4-4*n*tmp2^2*z^2))/2/(n+z^2) u2<-(2*tmp2*n+z^2 + sqrt(4*n*tmp2*z^2+z^4-4*n*tmp2^2*z^2))/2/(n+z^2) delta1<-(y00+y01)/n-l1 delta2<-(y00+y10)/n-l2 epsilon1<-u1-(y00+y01)/n epsilon2<-u2-(y00+y10)/n D<-(y00+y10)*(y01+y11)*(y00+y01)*(y10+y11) x<-y00*y11-y10*y01 if(D==0) phi<-0 if(D>0 & x<=0) phi<-x/D if(D>0 & x>0) phi<-max(x/D-n/2, 0) NH.CI<-c(diff.se-(delta1^2-2*phi*delta1*epsilon2+epsilon2^2)^0.5, diff.se+(epsilon1^2 - 2*phi*epsilon1*delta2+epsilon2^2)^0.5) }