# Session 6

# reading in data from the web (or could give a file location on disk)
lawschool <- read.table("http://faculty.washington.edu/kenrice/rintro/lawschool.txt", header=TRUE)

# q1

plot(LSAT~GPA, data=lawschool)

# q2 

orig.cor <- with(lawschool, cor(LSAT, GPA))
orig.cor

# q3 (first with a for() loop)

set.seed(1650) # Battle of Dunbar

# do everything for one resample;
n <- dim(lawschool)[1] # i.e. 15
myrows <- sample(1:n, replace=TRUE) # not a shuffle
lawschool.boot <- lawschool[myrows,]
lawschool.boot 
with(lawschool.boot, cor(LSAT, GPA))

many.cors <- rep(NA, 50)
for(i in 1:50){
   myrows <- sample(1:n, replace=TRUE) # not a shuffle
   lawschool.boot <- lawschool[myrows,]
	many.cors[i] <- with(lawschool.boot, cor(LSAT, GPA))
}
hist(many.cors)
summary(many.cors)

# now with a repeat() loop

set.seed(1650) # Battle of Dunbar

many.cors <- rep(NA, 50)

count <- 1
repeat({
   myrows <- sample(1:n, replace=TRUE) 
   lawschool.boot <- lawschool[myrows,]
   many.cors[count] <- with(lawschool.boot, cor(LSAT, GPA))
   count <- count + 1
   if( count == 50 ) break 
})
hist(many.cors)
summary(many.cors)

# !beware - uses the value of count at the start of the repeat() iteration

# now with a while() loop

set.seed(1650) # Battle of Dunbar
many.cors <- rep(NA, 50)
count <- 1
while(count<=50)({
   myrows <- sample(1:n, replace=TRUE) 
   lawschool.boot <- lawschool[myrows,]
   many.cors[count] <- with(lawschool.boot, cor(LSAT, GPA))
   count <- count + 1
})
hist(many.cors)
summary(many.cors)

# q4 - using a for() loop
# wrap all the q3 code inside a loop of numbers of samples - 50/200/800/3200;
# (and add some prettified output)
par(mfrow=c(2,2))
set.seed(1651) # Battle of Worcester

for(bignumber in c(50, 200, 800, 3200)){
	many.cors <- rep(NA, bignumber)
	for(i in 1:bignumber){
      myrows <- sample(1:n, replace=TRUE) # not a shuffle
      lawschool.boot <- lawschool[myrows,]
	   many.cors[i] <- with(lawschool.boot, cor(LSAT, GPA))
	}
	hist(many.cors, xlim=c(0,1), main="")
	print(summary(many.cors))
   # and a fancy title that includes numeric summaries
	title( paste("Mean =",round(mean(many.cors),2),"SD =",round(sd(many.cors),2) ))

	if(bignumber==3200){ # pretty output in concatanated form this time
		cat("\nFinal estimate:\n") # \n is carriage return
		cat(paste(
			round(orig.cor,2), "(",
			round(quantile(many.cors, 0.025),2), ",",
			round(quantile(many.cors, 0.975),2), ")\n"))
	}
}


# bootstrapping a function of the original estimate is straightforward
# - in this case, atanh() of the correlation - Fisher's Z transform
# (doing this using other statistical approaches is lots of work)

length(many.cors)
quantile(many.cors, c(0.025, 0.975) )
abline(v= quantile(many.cors, c(0.025, 0.975) ), col="thistle", lwd=2)

par(mfrow=c(1,1))
hist(atanh(many.cors))
abline(v= quantile(atanh(many.cors), c(0.025, 0.975) ), col="thistle", lwd=2)

summary( atanh(many.cors) )