# regression.R # # This script runs through an example regression analysis # # Depends: ipomopsis.txt # # created 11/1/2010 by M. Frazier # # last modified: 11/1/2011 by M. Dillon # ####################################################################### ## set working directory setwd("/home/michael/Documents/Courses/R/Hands on R/Fall 2011/week 11-some basic stats/3-Regression Example") dir() ## Load data ## We can read data from a website! But if this fails, the data is also in ## the directory folder. You can access it using alternative 2 below. ipomopsis<- read.table('http://www.unc.edu/courses/2007spring/enst/562/001/data/lab5/ipomopsis.txt', header=T,sep='\t') ## Alternative 2 (if you need this, uncomment it) #ipomopsis<- read.table("ipomopsis.txt", header=T,sep='\t') ## Run some functions to check out the data head(ipomopsis) # first 6 rows of dataframe dim(ipomopsis) # dimensions: number of rows, columns summary(ipomopsis) lapply(ipomopsis, class) #variable classes...no surprises here! ## Some quick visual checks of the data hist(ipomopsis$Root) hist(ipomopsis$Fruit) # Oh yay! These variables pass the visual test for normality (but see Zuur et al., 2010) plot(ipomopsis$Grazing, ipomopsis$Root) plot(ipomopsis$Grazing, ipomopsis$Fruit) # There doesn't appear to be much of an effect of grazing on Fruit size. # But lets do the full analysis. ################################ ## Plotting the point data ################################ plot(ipomopsis$Fruit ~ ipomopsis$Root, col = c("red", "blue")[ipomopsis$Grazing], #a little color coding pch = 16, main = "Effects of grazing", ylab = "Fruit size (g)", xlab = "Root size (g)", las = 1, #las rotates the labels on the y axis cex = 1.5 #cex scales the point size up or down ) # We can get real fancy, and add a legend. # ?legend to get a summary of the options. legend("topleft", legend=c("grazed", "ungrazed"), col=c("red", "blue"), pch=16) ############################################################### ## Linear models using ordinary least squares regression models ############################################################### #### Here is the simplest possible model mod1 <- lm(Fruit ~ Root, data=ipomopsis) summary(mod1) abline(mod1) #this plots the regression model on the figure # this works on simple models, but fails on more complex models with # multiple effects. plot(mod1) # look at some diagnostic plots #abline is a flexible function, try these abline(v = 7) abline(h = 40) #### Varying intercepts model # there is a strong positive correlation between root size and fruit size. # but it looks like we can get a better model. mod2 <- lm(Fruit ~ Root + Grazing, data=ipomopsis) summary(mod2) # we can also use abline(intercept, slope) to plot the model fit # In this case, the "grazed" category is the default category that # the model fits (i.e. dummy variable is 1 for this category). The # "ungrazed" category is compared to the "grazed" category. # line for the "grazed" category abline(-127.829, 23.560, col="red", lwd=2) #lwd = line width # we have to calculate the intercept for the "ungrazed" category # by adding the intercept and GrazingUngrazed Estimate abline((-127.829 + 36.103), 23.560, col="blue", lwd=2) # notice we use the same slope for the grazed and ungrazed caegories. ### Interaction model mod3 <- lm(Fruit ~ Root*Grazing, data=ipomopsis) summary(mod3) ############################## #### Model selection ############################## # The fact that the p-value for the Root:GrazingUngrazed coefficient # is not significant suggests that this more complex model is not supported. # Rather than using p-values to determine which model is better, we should # be using AIC values, or other more robust methods. AIC(mod1, mod2, mod3) # the model with the lowest AIC value is considered the best supported. # According to this, mod2 is the best one. # Another way to compare models is to use the anova function. anova(mod1, mod2) #mod 2 is significantly better than mod1 anova(mod2, mod3) #mod 3 is NOT significantly better than mod2 # Given this, we will report mod2. Now, we want to export a nice picture # for our paper/talk ############################################################### ## Peaking into the linear model ############################################################### # there is more information in the regression model that we can access names(mod2) #here is a list of some of them # they can be accessed like this: mod2$coefficients # one thing that can be interesting is to look at what the model # predicts for each sample. For example, the first sample has a # Root size of 6.225 and is Ungrazed. How big should the fruit be based # on the model? mod2$fitted.values #provides a predicted fruit size for each sample # According to the model, the predicted size of fruit for that sample # was 54.935. The observed weight was 59.77, so the model guessed a little low # in that case. The difference between the observed and predicted value is # the "residual". For the first sample, the residual would be: 59.77 - 54.935 #residual for sample 1 # We can also get these values: mod2$residuals # It can be useful to compare the predicted and observed values for the data plot(ipomopsis$Fruit, mod2$fitted.values) #it looks like the model does a good job. ### Here are some plots to evaluate the model assumptions plot(mod2) # you will cycle through 4 different figures. # Fig. 1: A plot of residuals vs. fitted values (predicted values). # Ideally, these points will be symmetric around the zero line. # There may be a pattern that suggests we should be fitting a curved line. # Fig. 2: Normal Q-Q plot. If residuals are from a normal distribution the # points should lie close to the line. # Fig. 3: Similar to Fig. 1, but the square root of standardized residuals # is used. I guess there are theoretical reasons why this is better for # evaluating whether the variance is constant (but usually doesn't matter). # A funnel shape to the points might indicate heteroscedasticity (unequal variance). # Fig. 4: Identifies residuals that are influential in driving the # regression line. Cook's distance (the red contours) measures the extent # to which a line would change if the point were omitted. Values >1 are # considered influential. In this case, everything is less than 0.5. # Note: outliers may or may not be influential. Influential outliers can # be problematic. # To get confidence intervals around predictions confint(mod2) ### Nice shortcut figures # You can run this if you know how to install packages. # If not, don't worry, we will cover that concept next. library(car) scatterplot(Fruit ~ Root, data=ipomopsis, groups=ipomopsis$Grazing, by.groups=TRUE) #fits a linear regression model and a splinal fit to each group.