# permtest.R
#
# created 11/10 by M. Frazier
#
# last modified 11/11 by M. Dillon
#
# demonstrate a permutation t-test
#
# Data are test scores ("Score") for students who did or did not
# receive training ("Treatment")
#########################################################
## Set the working directory to the folder holding the data file.
## NOTE(review): setwd() with an absolute path makes this script
## non-portable -- prefer launching R from the project directory, or
## building paths with file.path() relative to the project root.
setwd("/home/michael/Documents/Courses/R/Hands on R/Fall 2011/week 11-some basic stats/4-Permutations")
dir()  # list the directory contents to confirm RData.txt is present
# Read the tab-delimited data: "Score" (numeric test score) and
# "Treatment" ("Training" / "NoTraining") per student.
data <- read.table("RData.txt", header = TRUE, sep = "\t")
# Since R 4.0, read.table() leaves strings as character (the old
# stringsAsFactors default changed). Convert Treatment to a factor
# explicitly; otherwise plot(Score ~ Treatment) fails with
# "invalid type" instead of drawing per-group boxplots.
data$Treatment <- factor(data$Treatment)
data
str(data)
unique(data$Treatment)  # what are the treatment levels?
plot(data$Score ~ data$Treatment)  # boxplots of Score by Treatment
# Stack the two group histograms in one column. Save the previous
# par() settings and restore them afterwards so later plots are not
# silently stuck in a 2x1 layout.
old_par <- par(mfrow = c(2, 1))
hist(data$Score[data$Treatment == "Training"])
hist(data$Score[data$Treatment == "NoTraining"])
par(old_par)
##########################################################
# Step 1: calculate the observed test statistic from the data
##########################################################
# The test statistic is the difference in mean Score between the
# trained and untrained groups (Training minus NoTraining).
MeanTraining <- with(data, mean(Score[Treatment == "Training"]))
MeanNoTraining <- with(data, mean(Score[Treatment == "NoTraining"]))
Obs <- MeanTraining - MeanNoTraining
Obs
########################################################################
# Step 2: shuffle the treatment labels once and compute a single
# randomized test statistic under the null hypothesis.
########################################################################
# sample() with no size argument returns a random permutation, so it
# reshuffles the treatment labels while the scores stay put.
RandTreat <- sample(data$Treatment)  # inspect it to confirm it worked
# tapply() computes the mean Score within each shuffled label.
rand_means <- tapply(data$Score, RandTreat, mean)
RandMeanTraining <- rand_means[["Training"]]
RandMeanNoTraining <- rand_means[["NoTraining"]]
RandStats <- RandMeanTraining - RandMeanNoTraining
# QUESTION: how does this randomized statistic compare with Obs?
RandStats; Obs
#########################################################################
# Step 3: one randomized statistic is not enough. Repeat the
# shuffle-and-compute step from Step 2 many times to build up a null
# distribution of the test statistic, by wrapping a for loop around
# the Step 2 code.
#########################################################################
# A. How many randomizations to perform:
RandNum <- 1000
# B. Preallocate a vector of NAs to hold each randomized statistic.
#    Filling a preallocated vector is much faster than growing one
#    inside the loop.
RandStats <- rep(NA_real_, RandNum)
# The for loop below repeats its braced body once per randomization;
# seq_len(RandNum) generates the indices 1, 2, ..., RandNum safely.
for (i in seq_len(RandNum)) {
  # Shuffle the treatment labels -- under the null hypothesis there is
  # no association between treatment and score, so any labeling is
  # equally likely.
  RandTreat <- sample(data$Treatment)
  RandMeanTraining <- mean(data$Score[RandTreat == "Training"])
  RandMeanNoTraining <- mean(data$Score[RandTreat == "NoTraining"])
  # Store this randomization's statistic in slot i of the vector.
  RandStats[i] <- RandMeanTraining - RandMeanNoTraining
}
RandStats
# You should now have a vector of 1000 values; if not, something went
# wrong in the loop above.
# Draw the null distribution of the randomized test statistics:
hist(RandStats)
# Mark where the observed statistic falls relative to that distribution.
abline(v = Obs, col = "red")
# If the red line sits far out in the tail, it is unlikely we got our
# result by chance alone.
# One-tailed p-value: the proportion of randomized statistics at least
# as large as the observed one. The mean of a logical vector is the
# proportion of TRUEs, so this equals sum(RandStats >= Obs) / RandNum.
mean(RandStats >= Obs)
# Two-tailed p-value: compare absolute values so results that are just
# as extreme in EITHER tail of the distribution are counted.
mean(abs(RandStats) >= abs(Obs))
# The p-value comes out around 0.02, which suggests the group that
# received training scored statistically higher.
# Compare the permutation p-value with the classical t-test p-value:
t.test(Score ~ Treatment, data = data)