##
## Session 5
##

##
## Q1: first, the original version from the slides
##

many.medians <- rep(NA, 10000)
set.seed(4)
for(i in 1:10000){
  mysample <- rexp(n=51, rate=1)           # take a sample, size 51
  many.medians[i] <- median(mysample)      # calculate & store its median
}
mean(many.medians)
var(many.medians)
hist(many.medians)

# version with n=501
many.medians501 <- rep(NA, 10000)
set.seed(1314) # Bannockburn!
for(i in 1:10000){
  mysample <- rexp(n=501, rate=1)          # take a sample, size 501
  many.medians501[i] <- median(mysample)   # calculate & store its median
}
mean(many.medians501)
var(many.medians501)
hist(many.medians501)

# histograms comparing the two situations
par(mfrow=c(1,2))
hist(many.medians, xlim=c(0.3, 1.3))
hist(many.medians501, xlim=c(0.3, 1.3))
# same mean (plus or minus Monte Carlo error)
# variance about 10x smaller

##
## Q2
##

View(mtcars)
plot(mpg~wt, data=mtcars)
orig.cor <- with(mtcars, cor(mpg, wt))
orig.cor
with(mtcars, plot(mpg~wt))

many.cor <- rep(NA, 100000) # set up place to put output
set.seed(1513) # Flodden Field
for(i in 1:100000){
  mpg.shuffle <- sample(mtcars$mpg)
  many.cor[i] <- cor( mpg.shuffle, mtcars$wt )
}
hist(many.cor, xlim=c(-1,1), n=31)
abline(v=c(-1,1)*orig.cor, lwd=2, col=2)
table( abs(many.cor) > abs(orig.cor) )
# so p < 1E-5

# how small should p be? (roughly)
cor.test(mtcars$mpg, mtcars$wt)

# comparison of timing
# the recommended version first: pre-allocate the output vector
system.time({
  for(i in 1:100000){
    mpg.shuffle <- sample(mtcars$mpg)
    many.cor[i] <- cor( mpg.shuffle, mtcars$wt )
  }
})

# and a version "growing" the output - several times slower
many.cor2 <- NULL
system.time({
  for(i in 1:100000){
    mpg.shuffle <- sample(mtcars$mpg)
    many.cor2 <- c(many.cor2, cor( mpg.shuffle, mtcars$wt ) )
  }
})
# this is slow because R has to make many copies of the output
# ... which takes much more time than the actual calculations
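
##
## Added sketches (not from the original slides) -- hedged end-of-script extras
##

# Q1: a quick check of the "variance about 10x smaller" comment above,
# using the two simulation vectors already created.
var(many.medians) / var(many.medians501)

# Q2: an explicit permutation p-value estimate to go with the table() above.
# A common convention adds 1 to both numerator and denominator so the Monte
# Carlo estimate is never exactly zero; treat this as a rough estimate only.
(sum( abs(many.cor) >= abs(orig.cor) ) + 1) / (length(many.cor) + 1)

# Q2 timing: the same permutation loop written with replicate(), which hides
# the explicit loop and handles storage of the output for you. many.cor3 is a
# new name used only for this sketch; expect timing broadly comparable to the
# pre-allocated for-loop above (exact times will vary by machine).
set.seed(1513)
system.time({
  many.cor3 <- replicate(100000, cor( sample(mtcars$mpg), mtcars$wt ))
})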