# permtest.R # # created 11/10 by M. Frazier # # last modified 11/11 by M. Dillon # # demonstrate a permutation t-test # # Data are test scores ("Score") for students who did or did not # receive training ("Treatment") ######################################################### ## set working directory setwd("/home/michael/Documents/Courses/R/Hands on R/Fall 2011/week 11-some basic stats/4-Permutations") dir() data <- read.table("RData.txt", header=TRUE, sep="\t") data str(data) unique(data$Treatment) #what are the treatment levels? plot(data$Score ~ data$Treatment) par(mfrow=c(2,1)) hist(data$Score[data$Treatment == "Training"]) hist(data$Score[data$Treatment == "NoTraining"]) ########################################################## #Step 1: calculate observed test statistic from data ########################################################## MeanTraining <- mean(data$Score[data$Treatment == "Training"]) MeanNoTraining <- mean(data$Score[data$Treatment == "NoTraining"]) Obs <- MeanTraining - MeanNoTraining Obs ######################################################################## #Step 2: figure out how to randomize the treatments and calculate a ### Randomized test statistic ######################################################################## # The "sample" function will randomly shuffle the treatments! RandTreat <- sample(data$Treatment) #check it out to make sure it worked RandMeanTraining <- mean(data$Score[RandTreat == "Training"]) RandMeanNoTraining <- mean(data$Score[RandTreat == "NoTraining"]) RandStats <- RandMeanTraining - RandMeanNoTraining ### QUESTION: how does RandStats compare to the Obs test statistic? RandStats; Obs ######################################################################### #Step 3: we were able to calculate a single randomized test statistic. ### We need to repeat the above step several hundred times to get ### a distribution of random statistics To do this, we need to ### add "wrap" the loop code around the code in Step 2. ### ######################################################################### # A. Make a variable that describes how many Randomizations you want: RandNum <- 1000 # B. Make an empty vector of NA (i.e. missing values). # This will be where we place the RandStat values we randomly generate. RandStats <- rep(NA, RandNum) ######The following is the "wrapper" we need to place around the code in Step 2. ######This tells R to repeat this step 1 through the length of RandNum (which is 1000) ######Curly brackets are used for loops in R for(i in 1:RandNum){ ##### Cut and paste the code from Step 2 down here: RandTreat <- sample(data$Treatment) #this function shuffles the Treatments! RandMeanTraining <- mean(data$Score[RandTreat == "Training"]) RandMeanNoTraining <- mean(data$Score[RandTreat == "NoTraining"]) ###We make one change below, by adding the [i] after RandStats to tell R where to ###place the value in the RandStats vector that we created. RandStats[i] <- RandMeanTraining - RandMeanNoTraining #We need a final curly bracket to tell R that we are done with the loop: } RandStats #you should have a vector with 1000 values in it. If you don't, something went wrong. #make a distribution of the random test statistics: hist(RandStats) #compare the observed value to the random distribution. abline(v=Obs, col="red") #compare observed value from data to distribution. # It appears unlikely that we got our result by chance. # Calculate the p-value: # What percentage of the random samples were greater than the observed statistic? sum(RandStats >= Obs)/RandNum #This is the 1-tailed p-value because we are asking how many were greater. # To get a 2-tailed p-value, we # use the absolute (abs) values. # This determines the percentage of samples that had such an extreme value on either # end of the distribution. sum(abs(RandStats) >= abs(Obs))/RandNum # P-value is around 0.023, which suggests that the group that received # training had a statistically higher score. # We can compare our p-value to the standard t-test p-values: t.test(Score~Treatment, data=data)