# # cd4_eda.q # # ------------------------------------------------------------ # # PURPOSE: Exploratory data analysis for CD4 data # # AUTHOR: P. Heagerty # # DATE: 00/01/31 # # ------------------------------------------------------------ # data <- read.table( "cd4.data", header=F ) cd4 <- data.frame( year = data[,1], cd4 = data[,2], age = data[,3], packs = data[,4], drugs = data[,5], partners = data[,6], cesd = data[,7], id = data[,8] ) # ##### Data summary # summary( cd4 ) # ##### Subjects and obs/subject # cat( paste("Number of subjects =", length(table(cd4$id)),"\n\n") ) cat( paste("Number of observations =", length(cd4$id),"\n\n") ) cat( "Number of subjects with a given observations-per-subject:\n\n" ) print( table( table( cd4$id ) ) ) cat("\n\n") # ##### ##### mean plots ##### # go.plot <- T if( go.plot ){ postscript( file="cd4_eda_plot1.ps", horiz=F ) par( oma=c(15,0,0,0) ) plot( cd4$year, cd4$cd4, pch=".", xlab="Years since seroconversion", ylab="CD4+ cells" ) lines( smooth.spline( cd4$year, cd4$cd4, df=7 ) ) title("MACS CD4 Data") graphics.off() # postscript( file="cd4_eda_plot2.ps", horiz=F ) par( oma=c(15,0,0,0) ) plot( cd4$year, cd4$cd4, pch=".", xlab="Years since seroconversion", type="n", ylab="CD4+ cells" ) # ##### sample 25 subjects # uid <- unique( cd4$id ) subset <- sample( uid, 25 ) for( j in 1:25 ){ lines( cd4$year[ cd4$id==subset[j] ], cd4$cd4[ cd4$id==subset[j] ] ) } title("MACS CD4 Data -- 25 subjects") # postscript( file="cd4_eda_plot3.ps", horiz=F ) par( oma=c(15,0,0,0) ) plot( cd4$year, cd4$cd4, pch=".", xlab="Years since seroconversion", ylab="CD4+ cells" ) # ##### plot percentiles # tvec <- c(-25:45)/10 m <- length( tvec ) Q1vec <- rep( NA, m ) Mvec <- rep( NA, m ) Q3vec <- rep( NA, m ) for( j in 1:m ){ keep <- ( cd4$year >= tvec[j]-0.5 )&( cd4$year <= tvec[j]+0.5 ) Q1vec[j] <- quantile( cd4$cd4[keep], 0.25 ) Mvec[j] <- quantile( cd4$cd4[keep], 0.50 ) Q3vec[j] <- quantile( cd4$cd4[keep], 0.75 ) } lines( tvec, Q1vec, lwd=2 ) lines( tvec, Mvec, lwd=2 ) lines( tvec, Q3vec, lwd=2 ) title("MACS CD4 Data -- quartiles") graphics.off() } # ##### ##### covariance summaries ##### # fit <- lm( cd4 ~ ns( year, knots=c(-2,0,2,4) ), data=cd4 ) resids <- cd4$cd4 - fitted( fit ) # nobs <- length( cd4$cd4 ) nsubjects <- length( table( cd4$id ) ) rmat <- matrix( NA, nsubjects, 7 ) ycat <- c( -2, -1, 0, 1, 2, 3, 4, ) nj <- unlist( lapply( split( cd4$id, cd4$id ), length ) ) for( j in 1:7 ){ legal <- ( cd4$year >= ycat[j]-0.5 )&( cd4$year < ycat[j]+0.5 ) jtime <- cd4$year + 0.01*rnorm(nobs) t0 <- unlist( lapply( split( abs(jtime - ycat[j]) , cd4$id ), min ) ) tj <- rep( t0, nj ) keep <- ( abs( jtime - ycat[j] )==tj )&( legal ) yj <- rep( NA, nobs ) yj[keep] <- resids[keep] yj <- unlist( lapply( split( yj, cd4$id ), min, na.rm=T ) ) rmat[ , j ] <- yj } # postscript( file="cd4_eda_plot4.ps", horiz=F ) dimnames( rmat ) <- list( NULL, paste("year",c(-2:4)) ) pairs( rmat ) graphics.off() # ##### covariance matrix # cmat <- matrix( 0, 7, 7 ) nmat <- matrix( 0, 7, 7 ) # for( j in 1:7 ){ for( k in j:7 ){ njk <- sum( !is.na( rmat[,j]*rmat[,k] ) ) sjk <- sum( rmat[,j]*rmat[,k], na.rm=T )/njk cmat[j,k] <- sjk nmat[j,k] <- njk } } print( round( cmat, 2 ) ) vvec <- diag(cmat) cormat <- cmat/( outer( sqrt(vvec), sqrt(vvec) ) ) print( round( cormat, 2 ) ) print( nmat ) # ##### Variogram estimation # source("variogram.q") # out <- lda.variogram( id=cd4$id, y=resids, x=cd4$year ) dr <- out$delta.y dt <- out$delta.x # var.est <- var( resids ) # postscript( file="cd4_eda_variogram.ps", horiz=F ) par( oma=c(15,0,0,0) ) plot( dt, dr, pch=".", ylim=c(0, 1.2*var.est) ) lines( smooth.spline( dt, dr, df=5 ), lwd=3 ) abline( h=var.est, lty=2, lwd=2 ) title("CD4 residual variogram") graphics.off() # # # end-of-file...