## ## General rules for programming simulation studies; ## 1. Try one dataset, understand how the analysis works ## 2. Replicate a few times, using the same steps (cut and paste!) ## and check this works ## 3. Run for a larger number of replications ## NPEOPLE <- 1000 NSNPS <- 50 # we will increase this later set.seed(7) environ <- rpois(NPEOPLE, 10) # Poisson data, mean 10 (e.g. #drinks per week) hist(environ) outcome <- environ*(environ+rnorm(NPEOPLE)) hist(outcome) plot(outcome~environ) abline(lm(outcome~environ)) cor(outcome, environ)^2 # data is a bit "curvy"! one.gene <- rbinom(NPEOPLE, 2, 0.3) table(one.gene) plot(outcome~factor(one.gene), varwidth=TRUE) #"interaction" analysis tests whether environ effect differs by genotype plot(outcome~jitter(environ, 1), col=one.gene + 1) abline(lm(outcome~environ, subset=one.gene==0), col=1) abline(lm(outcome~environ, subset=one.gene==1), col=2) abline(lm(outcome~environ, subset=one.gene==2), col=3) lm(outcome~one.gene*environ) summary(lm(outcome~one.gene*environ))$coeff summary(lm(outcome~one.gene*environ))$coeff[4,4] ## ## Replicate for many SNPs ## many.genes <- matrix(rbinom(NPEOPLE*NSNPS,2,0.3), ncol=NSNPS) # "main effects" analysis, i.e. no interactions system.time({ betas <- numeric(NSNPS) # creates empty vectors - rep(NA, NSNPS) ses <- numeric(NSNPS) pvalues <- numeric(NSNPS) for(i in 1:NSNPS){ model <- lm(outcome~many.genes[,i]) output <- coef(summary(model)) betas[i] <- output[2,1] ses[i] <- output[2,2] pvalues[i] <- output[2,4] } }) qqplot(-log10(ppoints(NSNPS)), -log10(pvalues)) abline(0,1, lty=2) # same data, for interaction analysis system.time({ betas <- numeric(NSNPS) ses <- numeric(NSNPS) pvalues <- numeric(NSNPS) for(i in 1:NSNPS){ model <- lm(outcome~environ*many.genes[,i]) output <- coef(summary(model)) betas[i] <- output[4,1] ses[i] <- output[4,2] pvalues[i] <- output[4,4] } }) qqplot(-log10(ppoints(NSNPS)), -log10(pvalues)) abline(0,1, lty=2) median( (betas/ses)^2)/0.4549 # genomic control "lambda" # 0.4549=qchisq(0.5, df=1) # are we just being unlucky? Try the interaction analysis x10; int.lambdas <- replicate(10, { environ <- rpois(NPEOPLE, 10) outcome <- environ*(environ+rnorm(NPEOPLE)) many.genes <- matrix(rbinom(NPEOPLE*NSNPS,2,0.3), ncol=NSNPS) betas <- numeric(NSNPS) ses <- numeric(NSNPS) for(i in 1:NSNPS){ model <- lm(outcome~environ*many.genes[,i]) output <- coef(summary(model)) betas[i] <- output[4,1] ses[i] <- output[4,2] } median( (betas/ses)^2)/0.4549 } ) alarm() int.lambdas hist(int.lambdas) stripchart(int.lambdas, pch=1, main="replicate lambdas") ## ## use robust std errs to fix it; see the sandwich package ## library("sandwich") # NSNPS <- 500 # around 2 mins, on my machine system.time({ inter.lambdas.robust <-replicate(10,{ environ <- rpois(NPEOPLE, 10) outcome <- environ*(environ+rnorm(NPEOPLE)) many.genes <- matrix(rbinom(NPEOPLE*NSNPS,2,0.3), ncol=NSNPS) betas <- numeric(NSNPS) ses <- numeric(NSNPS) for(i in 1:NSNPS){ model <- lm(outcome~environ*many.genes[,i]) output <- coef(summary(model)) betas[i] <- output[4,1] ses[i] <- sqrt(vcovHC(model)[4,4]) } median( (betas/ses)^2)/0.4549 }) }) alarm() stripchart(inter.lambdas.robust , pch=1, main="replicate lambdas\n interaction effect, robust") main.lambdas.robust <-replicate(10,{ environ <- rpois(NPEOPLE, 10) outcome <- environ*(environ+rnorm(NPEOPLE)) many.genes <- matrix(rbinom(NPEOPLE*NSNPS,2,0.3), ncol=NSNPS) betas <- numeric(NSNPS) ses <- numeric(NSNPS) for(i in 1:NSNPS){ model <- lm(outcome~many.genes[,i]) output <- coef(summary(model)) betas[i] <- output[2,1] ses[i] <- sqrt(vcovHC(model)[2,2]) } median( (betas/ses)^2)/0.4549 }) stripchart(main.lambdas.robust , pch=1, main="replicate lambdas\nmain effects, robust") stripchart(list(inter.plain=int.lambdas, inter.robust=inter.lambdas.robust, main.robust=main.lambdas.robust), pch=21)