# R considers any line that begins with a "#" sign as a comment.
#
#================= variables and assignments ========================
#
# In the R command window (the one that opens automatically when you
# start R), type the following, followed by a carriage return. (From
# here on, assume every line is followed by a carriage return.)
x <- 5 # this assigns "5" to a variable called "x"
print(x) # This prints the contents of variable "x"
# Let's play with this a bit, for example, if you do
x <- 3
# x now is 3, instead of 5 (i.e., you overwrote what was in x),
# so that if you do:
print(x)
# it says 3.
# Let's assign 7 to a variable called "y":
y <- 7
print(y)
# Now, what would be the value of a new variable z after the following?
z <- x + y
# any guess? Let's find out:
print(z)
#====== assigning a "vector" (i.e., string of numbers) to a variable ======
x <- c(1,3,4,7,8)
# the function "c" combines values into a vector, so that when you do:
print(x) # you'll see it has five values, 1,3,4,7, and 8.
#================= Introducing a random number generator, rnorm() =========
x <- rnorm(10) # This samples 10 values from a normal distribution with a
# mean of 0 and sd of 1
print(x)
# Let's do this a couple of times:
# (rnorm() draws fresh random values on every call, so each of the runs
# below prints a different set of 10 numbers.)
x <- rnorm(10)
print(x)
x <- rnorm(10)
print(x)
# Of course, with only 10 values, it's difficult to see if the values are
# indeed being sampled from a normal distribution.
#
# Let's sample 100 values:
x <- rnorm(100)
print(x)
# Now we have more of a "distribution". Does this look like a normal
# distribution?
#
# By the way, when you did "print(x)", you must have noticed that each line
# of the output was preceded by a number in square brackets. They indicate
# the position of the number just to the right in the vector. So [96] means
# that the number to the right of it is the 96th value of the vector called
# x. The one to the right of that is the 97th, and then the 98th next, etc.
#
# Now you may be realizing that the human ability to glean the shape of a
# distribution is not all that good. What to do? Let's plot a histogram. R
# has a function called "hist()" which does a histogram.
hist(x)
# That looks pretty good, no? How about if you sample 10,000 values, rather
# than 100?
x <- rnorm(10000)
hist(x)
# This should look even better, looking more normally distributed.
# How about if you sample more, for example 100,000 values?
# Warning: don't do the following on a slow computer. Skip the next two
# lines if your computer is slow
x <- rnorm(100000)
hist(x)
# By now, I hope you're now reasonably convinced that rnorm(N) samples N
# values from a normal distribution with a mean of 0 and sd of 1.
#==== computing a sample statistic - mean(), var(), median(), max() etc.====
#
# mean(x) is the mean of vector x, var(x) is the variance of x, and
# median(x) is the median of x, max(x) is the maximum of x, min(x) is
# the minimum of x
# (Note: var() computes the *sample* variance, i.e., it divides by n-1.)
x <- c(1,2,2,4,5) # let's first assign a few numbers here (otherwise x would
# still contain 100,000 numbers you assigned above.
# You don't want to do print(x) when x has 100,000 things
# in it (press ESC to stop the R, if that happens).
# Also, with x <- c(1,2,2,4,5), you can verify the result
# of doing the following calculations by hand.
print(x)
print(mean(x))
print(var(x))
print(median(x))
print(max(x))
print(min(x))
# by the way, I've been writing "print(x)" but if you just type the name of
# the variable, in this case, x, and hit return, it's the same as typing
# "print(x)". See below.
x <- rnorm(10)
print(x)
x
print(mean(x))
mean(x)
# A note on assignment: very old versions of R (and S-PLUS) also accepted
# an underscore "_" as an assignment operator, which saved one keystroke.
# That shortcut was removed from R long ago -- "x _ 7" is now a syntax
# error -- so always use the standard assignment operator "<-" instead.
x <- 5 # this assigns 5 to x
x
x <- 7 # this assigns 7 to x
x
# =========================== "Homework" =============================
# One other thing: The rnorm() function can take a couple of optional
# parameters, "mean", and "sd":
x <- rnorm(10, mean=10, sd=2)
# the above samples 10 values from a normal distribution with a mean of 10
# and sd of 2.
# Using the functions introduced so far, do the following hypothetical
# study:
# (1) Sample 10 students and look at the mean of their GRE scores. Assume
# that in the population, GRE scores have a mean of 500 and sd of 100.
# (1b) Replicate the above study a few times and see how your results change
# from one replication to another
# (2) Suppose that women's GRE scores were on average 20 points higher than
# men's (suppose in the population, women's mean is 510 with an sd of 100,
# and men's mean is 490 with an sd of 100). Do a thought experiment in which
# you take a sample of 10 men and 10 women and compare their mean scores. In
# this hypothetical study, do women turn out to have higher mean than men?
# (2b) Replicate the above study 10 times or so and see how your results change
# from one replication to another
# (2c) What if you used max instead of mean to compare men and women?
#
#
# =========================== "Preview" =============================
# Below is a preview of a systematic way to answer the question above.
# I'll go over this in class, but you're welcome to try it ahead of time.
# Copy the whole thing below, paste in R, run it and see what happens.
# (If nothing happens, hit return -- sometimes copying/pasting
# doesn't include the carriage return at the end of the last line.)
#
# Run the women-vs-men thought experiment 1000 times, recording the
# difference in sample means and in sample maximums for each replication.
WomenMinusMen.mean <- NULL
WomenMinusMen.max <- NULL
for (i in c(1:1000)) {
  women <- rnorm(10, 510, 100)
  men <- rnorm(10, 490, 100)
  WomenMinusMen.mean <- c(WomenMinusMen.mean, mean(women) - mean(men))
  WomenMinusMen.max <- c(WomenMinusMen.max, max(women) - max(men))
}
# Report how often the women's sample statistic came out BELOW the men's,
# even though the women's population mean is 20 points higher.
cat("\n\nmean(women) < mean(men) in",
    length(WomenMinusMen.mean[WomenMinusMen.mean < 0]),
    "out of", length(WomenMinusMen.mean), "times.\n")
cat("\n\nmax(women) < max(men) in",
    length(WomenMinusMen.max[WomenMinusMen.max < 0]),
    "out of", length(WomenMinusMen.max), "times.\n")
# Note: the differences between mean and max are more exaggerated when N=20
# This illustrates the question of statistical power. What's the statistical
# power for detecting (i.e., correctly rejecting the null hypothesis) given
# the above? If you're interested in means? Or maximums?
# Let's test to see if the standard t-test formula is correct.
# (1) We will assume that the two populations have equal means and sd.
# (2) We will see what proportion of the time the difference will exceed
# the critical value the standard t-test gives for p=.05.
# Formula: SEdiff = sqrt(VARpooled*(1/N1 + 1/N2))
#          VARpooled = ((N1-1)*VAR1 + (N2-1)*VAR2)/(N1+N2-2)
# Fixed example data, small enough to check the formula by hand:
X1 <- c(16,9,4,23,19,10,5,2)
X2 <- c(20,5,1,16,2,4)
# Pooled variance: each sample's variance weighted by its df (N-1).
VARpooled <- ((length(X1)-1)*var(X1) + (length(X2)-1)*var(X2)) /
  (length(X1)+length(X2)-2)
# Standard error of the difference between the two sample means.
SEdiff <- sqrt(VARpooled*(1/length(X1) + 1/length(X2)))
tvalue <- (mean(X1) - mean(X2))/SEdiff
cat("The t value based on the formula is",
    round(tvalue,4),"\n")
# Cross-check against R's built-in t.test() (equal-variance version);
# the $statistic component holds the t value it computed.
tvalueRfunction <- t.test(X1, X2, var.equal=TRUE)
attributes(tvalueRfunction)
cat("The t value based on the R function t.test() is",
    round(tvalueRfunction$statistic,4),"\n")
# ==============
# Simulation: when the null hypothesis is actually true (both populations
# have mean 500 and sd 100), how often does the equal-variance t-test
# cross its critical value?  It should be about 5% of the time.
tvalues <- NULL
pvalues <- NULL
pvalues.Welch <- NULL
Nreplications <- 10000
for (i in c(1:Nreplications)) {
  X1 <- rnorm(10, 500, 100)
  X2 <- rnorm(10, 500, 100)
  tresults <- t.test(X1, X2, var.equal=TRUE)
  tvalues <- c(tvalues, tresults$statistic)
  pvalues <- c(pvalues, tresults$p.value)
  # Progress report every 100 replications.
  if (i > 0 && (i %% 100) == 0) {
    cat("Done", i, "times.\n")
    flush.console()
  }
}
# tcrit <- 2.101 # (two-tailed, p=.05, with df=18)
tcrit <- 1.734   # (one-tailed, p=.05, with df=18)
cat("\n\nt>", 1.734, " in",
    length(tvalues[tvalues > 1.734]),
    "out of", Nreplications, "times.\n")
cat("\n\nt>", 1.733, " in",
    length(tvalues[tvalues > 1.733]),
    "out of", Nreplications, "times.\n")
cat("\n\np<", 0.05, " in",
    length(pvalues[pvalues < 0.05]),
    "out of", Nreplications, "times.\n")
# ==== What if the population variances are not equal? =====
# Same null-hypothesis simulation, but now the second population has a
# much smaller sd (1 vs 100).  We record both the classic equal-variance
# t-test and Welch's unequal-variance version for comparison.
tvalues <- NULL
pvalues <- NULL
pvalues.Welch <- NULL
Nreplications <- 10000
for (i in c(1:Nreplications)) {
  X1 <- rnorm(10, 500, 100)
  X2 <- rnorm(10, 500, 1)
  tresults <- t.test(X1, X2, var.equal=TRUE)   # classic pooled-variance t
  tvalues <- c(tvalues, tresults$statistic)
  pvalues <- c(pvalues, tresults$p.value)
  tresults <- t.test(X1, X2, var.equal=FALSE)  # Welch's t
  pvalues.Welch <- c(pvalues.Welch, tresults$p.value)
  # Progress report every 100 replications.
  if (i > 0 && (i %% 100) == 0) {
    cat("Done", i, "times.\n")
    flush.console()
  }
}
# tcrit <- 2.101 # (two-tailed, p=.05, with df=18)
# tcrit <- 1.734 # (one-tailed, p=.05, with df=18)
cat("\n\nt>", 1.734, " in",
    length(tvalues[tvalues > 1.734]),
    "out of", Nreplications, "times.\n")
cat("\n\nt>", 1.733, " in",
    length(tvalues[tvalues > 1.733]),
    "out of", Nreplications, "times.\n")
cat("\n\np<", 0.05, " in",
    length(pvalues[pvalues < 0.05]),
    "out of", Nreplications, "times.\n")
cat("\n\np<", 0.05, " in",
    length(pvalues.Welch[pvalues.Welch < 0.05]),
    "out of", Nreplications, "times.\n")
# hist(tvalues)
# ==== What if the population variances are not equal and sample sizes differ a lot too? =====
# Let's define a new function, called ttesttest()
#
# Arguments:
#   N1, N2 -- sample sizes for the two groups
#   M1, M2 -- population means for the two groups
#   V1, V2 -- population standard deviations (passed as the "sd" argument
#             of rnorm(), despite the "V" in the name)
#   Nreps  -- number of simulated experiments to run
#
# For each replication it records the classic (equal-variance) t statistic
# and p value plus the Welch (unequal-variance) p value, then reports how
# often each crossed the usual cutoffs.
ttesttest <- function (N1, N2, M1, M2, V1, V2, Nreps) {
  tvalues <- NULL
  pvalues <- NULL
  pvalues.Welch <- NULL
  for (i in c(1:Nreps)) {
    X1 <- rnorm(N1, M1, V1)
    X2 <- rnorm(N2, M2, V2)
    tresults <- t.test(X1, X2, var.equal=TRUE)   # classic pooled-variance t
    tvalues <- c(tvalues, tresults$statistic)
    pvalues <- c(pvalues, tresults$p.value)
    tresults <- t.test(X1, X2, var.equal=FALSE)  # Welch's t
    pvalues.Welch <- c(pvalues.Welch, tresults$p.value)
    # Progress report every 100 replications.
    if (i > 0 && (i %% 100) == 0) {
      cat("Done", i, "times.\n")
      flush.console()
    }
  }
  # Report counts out of Nreps (the original printed the unrelated global
  # "Nreplications" here, which was a bug).
  cat("\n\nt>", 1.734, " in",
      length(tvalues[tvalues > 1.734]),
      "out of", Nreps, "times.\n")
  cat("\n\nt>", 1.733, " in",
      length(tvalues[tvalues > 1.733]),
      "out of", Nreps, "times.\n")
  cat("\n\np<", 0.05, " in",
      length(pvalues[pvalues < 0.05]),
      "out of", Nreps, "times.\n")
  cat("\n\np<", 0.05, " in",
      length(pvalues.Welch[pvalues.Welch < 0.05]),
      "out of", Nreps, "times.\n")
}
# Run the simulation both ways: first with the larger-sd population
# (V1=100) getting the SMALLER sample (N1=10), then with it getting the
# LARGER sample (N1=20).  Compare how often each test "rejects".
ttesttest(N1=10,N2=20,M1=500,M2=500,V1=100,V2=1,Nreps=10000)
ttesttest(N1=20,N2=10,M1=500,M2=500,V1=100,V2=1,Nreps=10000)