# R considers any line that begins with a "#" sign as a comment.
#
#================= variables and assignments ========================
#
# In the R command window (the one that opens automatically when you
# start R), type the following, followed by a carriage return. (From
# here on, assume every line is followed by a carriage return.)

x <- 5 # this assigns "5" to a variable called "x"

print(x) # This prints the contents of variable "x"

# Let's play with this a bit, for example, if you do

x <- 3

# x now is 3, instead of 5 (i.e., you overwrote what was in x),
# so that if you do:

print(x)

# it says 3.

# Let's assign 7 to a variable called "y":

y <- 7
print(y)

# Now, what would be the value of a new variable z after the following?
# (Note: older versions of this handout used "_" as the assignment
# operator; that shortcut was removed from R long ago, so we use "<-".)

z <- x + y

# any guess? Let's find out:

print(z)

#====== assigning a "vector" (i.e., string of numbers) to a variable ======

x <- c(1,3,4,7,8)

# the function "c" combines values into a vector, so that when you do:

print(x) # you'll see it has five values, 1,3,4,7, and 8.

#================= Introducing a random number generator, rnorm() =========

x <- rnorm(10) # This samples 10 values from a normal distribution with a
# mean of 0 and sd of 1
# (rnorm() is random -- every run gives a different set of values)

print(x)

# Let's do this a couple of times:
x <- rnorm(10)
print(x)

x <- rnorm(10)
print(x)

# Of course, with only 10 values, it's difficult to see if the values are
# indeed being sampled from a normal distribution.
#
# Let's sample 100 values:

x <- rnorm(100)
print(x)

# Now we have more of a "distribution". Does this look like a normal
# distribution?
#
# By the way, when you did "print(x)", you must have noticed that each line
# of the output was preceded by a number in square brackets. They indicate
# the position of the number just to the right in the vector. So [96] means
# that the number to the right of it is the 96th value of the vector called
# x. The one to the right of that is the 97th, and then the 98th next, etc.
#
# Now you may be realizing that the human ability to glean the shape of a
# distribution is not all that good. What to do? Let's plot a histogram. R
# has a function called "hist()" which does a histogram.

hist(x)

# That looks pretty good, no? How about if you sample 10,000 values, rather
# than 100?

x <- rnorm(10000)
hist(x)

# This should look even better, looking more normally distributed.
# How about if you sample more, for example 100,000 values?
# Warning: don't do the following on a slow computer. Skip the next two
# lines if your computer is slow

x <- rnorm(100000)
hist(x)

# By now, I hope you're reasonably convinced that rnorm(N) samples N
# values from a normal distribution with a mean of 0 and sd of 1.

#==== computing a sample statistic - mean(), var(), median(), max() etc.====
#
# mean(x) is the mean of vector x, var(x) is the variance of x, and
# median(x) is the median of x, max(x) is the maximum of x, min(x) is
# the minimum of x
# (note: var() computes the SAMPLE variance, i.e., it divides by n-1)

x <- c(1,2,2,4,5) # let's first assign a few numbers here (otherwise x would
# still contain 100,000 numbers you assigned above).
# You don't want to do print(x) when x has 100,000 things
# in it (press ESC to stop the R, if that happens).
# Also, with x <- c(1,2,2,4,5), you can verify the result
# of doing the following calculations by hand.

print(x)
print(mean(x))
print(var(x))
print(median(x))
print(max(x))
print(min(x))

# by the way, I've been writing "print(x)" but if you just type the name of
# the variable, in this case, x, and hit return, it's the same as typing
# "print(x)". See below.

x <- rnorm(10)
print(x)
x

print(mean(x))
mean(x)

# A historical note: very old versions of R (and S-Plus) allowed the
# assignment operator "<-" to be abbreviated with an underscore ("_"),
# but that shortcut was removed from R long ago -- nowadays "_" is just
# an ordinary character allowed inside variable names. You CAN, however,
# use "=" for assignment at the top level, as below, which saves one
# keystroke!

x <- 5 # this assigns 5 to x
x
x = 7 # this also assigns 7 to x
x

# =========================== "Homework" =============================

# One other thing: The rnorm() function can take a couple of optional
# parameters, "mean", and "sd":

x <- rnorm(10, mean = 10, sd = 2)

# the above samples 10 values from a normal distribution with a mean of 10
# and sd of 2.

# Using the functions introduced so far, do the following hypothetical
# study:

# (1) Sample 10 students and look at the mean of their GRE scores. Assume
# that in the population, GRE scores have a mean of 500 and sd of 100.

# (1b) Replicate the above study a few times and see how your results change
# from one replication to another

# (2) Suppose that women's GRE scores were on average 20 points higher than
# men's (suppose in the population, women's mean is 510 with an sd of 100,
# and men's mean is 490 with an sd of 100). Do a thought experiment in which
# you take a sample of 10 men and 10 women and compare their mean scores. In
# this hypothetical study, do women turn out to have higher mean than men?

# (2b) Replicate the above study 10 times or so and see how your results change
# from one replication to another

# (2c) What if you used max instead of mean to compare men and women?
#

#
# =========================== "Preview" =============================
# Below is a preview of a systematic way to answer the question above.
# I'll go over this in class, but you're welcome to try it ahead of time.
# Copy the whole thing below, paste in R, run it and see what happens.
# (If nothing happens, hit return -- sometimes copying/pasting
# doesn't include the carriage return at the end of the last line.)

# Preallocate the two result vectors up front: growing a vector with
# c() inside a loop works but is slow, because R copies the whole
# vector on every append. numeric(1000) makes a vector of 1000 zeros.
WomenMinusMen.mean <- numeric(1000)
WomenMinusMen.max  <- numeric(1000)

for (i in seq_len(1000)) {
  # one simulated "study": 10 women (mean 510) and 10 men (mean 490)
  women <- rnorm(10, 510, 100)
  men   <- rnorm(10, 490, 100)

  WomenMinusMen.mean[i] <- mean(women) - mean(men)
  WomenMinusMen.max[i]  <- max(women)  - max(men)
}

# sum(logical vector) counts how many elements are TRUE
cat("\n\nmean(women) < mean(men) in",
     sum(WomenMinusMen.mean < 0),
     "out of", length(WomenMinusMen.mean), "times.\n")

cat("\n\nmax(women) < max(men) in",
     sum(WomenMinusMen.max < 0),
     "out of", length(WomenMinusMen.max), "times.\n")

# Note: the differences between mean and max are more exaggerated when N=20

# This illustrates the question of statistical power. What's the statistical
# power for detecting (i.e., correctly rejecting the null hypothesis) given
# the above? If you're interested in means?  Or maximums?

# Let's test to see if the standard t-test formula is correct.
# (1) We will assume that the two populations have equal means and sd.
# (2) We will see what proportion of the time the difference will exceed
# the critical value the standard t-test gives for p=.05.
# Formula: SEdiff = sqrt(VARpooled*(1/N1 + 1/N2))
#          VARpooled = ((N1-1)VAR1 + (N2-1)VAR2)/(N1+N2-2)

X1 <- c(16,9,4,23,19,10,5,2)
X2 <- c(20,5,1,16,2,4)
VARpooled <- ((length(X1)-1)*var(X1) + (length(X2)-1)*var(X2)) / (length(X1)+length(X2)-2)
SEdiff <- sqrt(VARpooled*(1/length(X1) + 1/length(X2)))

tvalue <- (mean(X1) - mean(X2))/SEdiff
cat("The t value based on the formula is", round(tvalue,4), "\n")

# Now compute the same thing with R's built-in t.test().
# var.equal=TRUE requests the classic pooled-variance (Student) t-test.
# (Use TRUE/FALSE rather than T/F -- T and F are ordinary variables in R
# and can be reassigned, whereas TRUE/FALSE are reserved words.)
tvalueRfunction <- t.test(X1, X2, var.equal=TRUE)
attributes(tvalueRfunction)
cat("The t value based on the R function t.test() is",
            round(tvalueRfunction$statistic,4), "\n")

# ==============

Nreplications <- 10000

# Preallocate the result vectors (growing them with c() inside the loop
# would copy the whole vector on every iteration).
tvalues <- numeric(Nreplications)
pvalues <- numeric(Nreplications)
pvalues.Welch <- NULL # not used in this equal-variance simulation

for (i in seq_len(Nreplications)) {
  # Both samples come from the SAME population (mean 500, sd 100),
  # so the null hypothesis is true by construction.
  X1 <- rnorm(10,500,100)
  X2 <- rnorm(10,500,100)

  tresults <- t.test(X1,X2,var.equal=TRUE)
  tvalues[i] <- tresults$statistic
  pvalues[i] <- tresults$p.value

  # progress report every 100 replications
  if (i %% 100 == 0) {
    cat("Done",i,"times.\n")
    flush.console()
  }
}

#tcrit <- 2.101 #(two-tailed, p=.05, with df=18)
#tcrit <- 1.734 #(one-tailed, p=.05, with df=18)

cat("\n\nt>",1.734," in",
     sum(tvalues > 1.734),
     "out of",Nreplications,"times.\n")
cat("\n\nt>",1.733," in",
     sum(tvalues > 1.733),
     "out of",Nreplications,"times.\n")
cat("\n\np<",0.05," in",
     sum(pvalues < 0.05),
     "out of",Nreplications,"times.\n")

# ==== What if the population variances are not equal? =====

# Same simulation as above, but now the two populations have VERY
# different standard deviations (100 vs. 1), while the means are equal.
Nreplications <- 10000

# Preallocate the result vectors rather than growing them with c().
tvalues <- numeric(Nreplications)
pvalues <- numeric(Nreplications)
pvalues.Welch <- numeric(Nreplications)

for (i in seq_len(Nreplications)) {
  X1 <- rnorm(10,500,100)
  X2 <- rnorm(10,500,1)

  # Classic pooled-variance (Student) t-test: assumes equal variances.
  tresults <- t.test(X1,X2,var.equal=TRUE)
  tvalues[i] <- tresults$statistic
  pvalues[i] <- tresults$p.value

  # Welch's t-test: does NOT assume equal variances.
  tresults <- t.test(X1,X2,var.equal=FALSE)
  pvalues.Welch[i] <- tresults$p.value

  # progress report every 100 replications
  if (i %% 100 == 0) {
    cat("Done",i,"times.\n")
    flush.console()
  }
}

#tcrit <- 2.101 #(two-tailed, p=.05, with df=18)
#tcrit <- 1.734 #(one-tailed, p=.05, with df=18)

cat("\n\nt>",1.734," in",
     sum(tvalues > 1.734),
     "out of",Nreplications,"times.\n")
cat("\n\nt>",1.733," in",
     sum(tvalues > 1.733),
     "out of",Nreplications,"times.\n")
cat("\n\np<",0.05," in",
     sum(pvalues < 0.05),
     "out of",Nreplications,"times.\n")
cat("\n\np<",0.05," in",
     sum(pvalues.Welch < 0.05),
     "out of",Nreplications,"times.\n")

# hist(tvalues)

# ==== What if the population variances are not equal and sample sizes differ a lot too? =====

# Let's define a new function, called ttesttest()

# ttesttest(): run Nreps simulated two-sample experiments and report how
# often the pooled-variance (Student) t-test and Welch's t-test "reject"
# at the one-tailed t cutoffs and at p < .05.
#
# Arguments:
#   N1, N2 - sample sizes of the two groups
#   M1, M2 - population means of the two groups
#   V1, V2 - values passed to rnorm() as its third argument, which is the
#            STANDARD DEVIATION (despite the "V" in the name suggesting
#            variance -- kept for backward compatibility with callers)
#   Nreps  - number of replications
#
# Prints a summary with cat(); returns nothing useful (invisibly NULL).
ttesttest <- function (N1, N2, M1, M2, V1, V2, Nreps) {

  # Preallocate result vectors rather than growing them with c().
  tvalues <- numeric(Nreps)
  pvalues <- numeric(Nreps)
  pvalues.Welch <- numeric(Nreps)

  for (i in seq_len(Nreps)) {
    X1 <- rnorm(N1, M1, V1)
    X2 <- rnorm(N2, M2, V2)

    # Pooled-variance test (assumes equal variances)...
    tresults <- t.test(X1, X2, var.equal=TRUE)
    tvalues[i] <- tresults$statistic
    pvalues[i] <- tresults$p.value
    # ...and Welch's test (does not assume equal variances).
    tresults <- t.test(X1, X2, var.equal=FALSE)
    pvalues.Welch[i] <- tresults$p.value

    # progress report every 100 replications
    if (i %% 100 == 0) {
      cat("Done",i,"times.\n")
      flush.console()
    }
  }

  # Bug fix: the original printed the GLOBAL variable "Nreplications"
  # here instead of this function's own Nreps argument, so the summary
  # was wrong (or an error) whenever Nreps != Nreplications.
  cat("\n\nt>",1.734," in",
       sum(tvalues > 1.734),
       "out of",Nreps,"times.\n")
  cat("\n\nt>",1.733," in",
       sum(tvalues > 1.733),
       "out of",Nreps,"times.\n")
  cat("\n\np<",0.05," in",
       sum(pvalues < 0.05),
       "out of",Nreps,"times.\n")
  cat("\n\np<",0.05," in",
       sum(pvalues.Welch < 0.05),
       "out of",Nreps,"times.\n")

}

# Two runs with equal means, very unequal sds (100 vs. 1), and unequal
# sample sizes. Case 1: the SMALLER group (N1=10) has the big sd.
ttesttest(N1=10,N2=20,M1=500,M2=500,V1=100,V2=1,Nreps=10000)

# Case 2: now the LARGER group (N1=20) has the big sd. Compare the
# "p<0.05" counts between the two runs for the equal-variance test vs.
# Welch's test and see which one stays close to 5%.
ttesttest(N1=20,N2=10,M1=500,M2=500,V1=100,V2=1,Nreps=10000)