##
## Session 5
##

##
## Q1: first, the original version from the slides
##

many.medians <- rep(NA, 10000)
set.seed(4)
for(i in 1:10000){
  mysample <- rexp(n=51, rate=1)           # take a sample, size 51
  many.medians[i] <- median(mysample)      # calculate & store its median
}
mean(many.medians)
var(many.medians)
hist(many.medians)

# version with n=501
many.medians501 <- rep(NA, 10000)
set.seed(1314) # Bannockburn!
for(i in 1:10000){
  mysample <- rexp(n=501, rate=1)          # take a sample, size 501
  many.medians501[i] <- median(mysample)   # calculate & store its median
}
mean(many.medians501)
var(many.medians501)
hist(many.medians501)

# histograms comparing the two situations
par(mfrow=c(1,2))
hist(many.medians, xlim=c(0.3, 1.3))
hist(many.medians501, xlim=c(0.3, 1.3))
# same mean (plus or minus Monte Carlo error)
# variance about 10x smaller

##
## Q2
##

View(mtcars)
plot(mpg~wt, data=mtcars)
orig.cor <- with(mtcars, cor(mpg, wt))
orig.cor
with(mtcars, plot(mpg~wt))

many.cor <- rep(NA, 100000) # set up place to put output
set.seed(1513) # Flodden Field
for(i in 1:100000){
  mpg.shuffle <- sample(mtcars$mpg)
  many.cor[i] <- cor( mpg.shuffle, mtcars$wt )
}
hist(many.cor, xlim=c(-1,1), n=31)
abline(v=c(-1,1)*orig.cor, lwd=2, col=2)
table( abs(many.cor) > abs(orig.cor) )
# so p < 1E-5

# how small should p be? (roughly)
cor.test(mtcars$mpg, mtcars$wt)

# comparison of timing
# the recommended version first: pre-allocate the output vector
system.time({
  for(i in 1:100000){
    mpg.shuffle <- sample(mtcars$mpg)
    many.cor[i] <- cor( mpg.shuffle, mtcars$wt )
  }
})

# and a version "growing" the output - several times slower
many.cor2 <- NULL
system.time({
  for(i in 1:100000){
    mpg.shuffle <- sample(mtcars$mpg)
    many.cor2 <- c(many.cor2, cor( mpg.shuffle, mtcars$wt ) )
  }
})
# this is slow because R has to make many copies of the output
# ... which takes much more time than the actual calculations
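
##
## Added sketches (not from the original slides) -- hedged end-of-script extras
##

# Q1: a quick check of the "variance about 10x smaller" comment above,
# using the two simulation vectors already created.
var(many.medians) / var(many.medians501)

# Q2: an explicit permutation p-value estimate to go with the table() above.
# A common convention adds 1 to both numerator and denominator so the Monte
# Carlo estimate is never exactly zero; treat this as a rough estimate only.
(sum( abs(many.cor) >= abs(orig.cor) ) + 1) / (length(many.cor) + 1)

# Q2 timing: the same permutation loop written with replicate(), which hides
# the explicit loop and handles storage of the output for you. many.cor3 is a
# new name used only for this sketch; expect timing broadly comparable to the
# pre-allocated for-loop above (exact times will vary by machine).
set.seed(1513)
system.time({
  many.cor3 <- replicate(100000, cor( sample(mtcars$mpg), mtcars$wt ))
})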