# regression.R
#
# This script runs through an example regression analysis
#
# Depends: ipomopsis.txt
#
# created 11/1/2010 by M. Frazier
#
# last modified: 11/1/2011 by M. Dillon
#
#######################################################################
## set working directory
## NOTE(review): a hard-coded personal absolute path makes this script
## non-portable -- edit it to match your machine, or run R from the
## project directory and delete the setwd() call.
setwd("/home/michael/Documents/Courses/R/Hands on R/Fall 2011/week 11-some basic stats/3-Regression Example")
dir() # list files in the working directory (ipomopsis.txt should be here)
## Load data
## We can read data from a website! But if this fails, the data is also in
## the directory folder. You can access it using alternative 2 below.
## header = TRUE is spelled out (T and F are reassignable and unsafe).
## stringsAsFactors = TRUE keeps Grazing as a factor, which the plotting
## code below relies on -- the default changed to FALSE in R >= 4.0.
ipomopsis <- read.table('http://www.unc.edu/courses/2007spring/enst/562/001/data/lab5/ipomopsis.txt',
                        header = TRUE, sep = '\t', stringsAsFactors = TRUE)
## Alternative 2 (if you need this, uncomment it)
#ipomopsis <- read.table("ipomopsis.txt", header = TRUE, sep = '\t', stringsAsFactors = TRUE)
## Run some functions to check out the data
head(ipomopsis) # first 6 rows of dataframe
dim(ipomopsis) # dimensions: number of rows, columns
summary(ipomopsis)
lapply(ipomopsis, class) # variable classes...no surprises here!
## Some quick visual checks of the data
hist(ipomopsis$Root)
hist(ipomopsis$Fruit)
# Oh yay! These variables pass the visual test for normality (but see Zuur et al., 2010)
# Wrap Grazing in factor() so plot() draws grouped box plots even when the
# column was read as character (the default since R 4.0); plotting a bare
# character vector against a numeric one would otherwise fail.
plot(factor(ipomopsis$Grazing), ipomopsis$Root)
plot(factor(ipomopsis$Grazing), ipomopsis$Fruit)
# There doesn't appear to be much of an effect of grazing on Fruit size.
# But let's do the full analysis.
################################
## Plotting the point data
################################
# factor() guarantees the color lookup works whether Grazing was read as a
# factor or as character (the default since R 4.0): level 1 ("Grazed")
# indexes "red", level 2 ("Ungrazed") indexes "blue". Indexing with a bare
# character vector would return NA colors.
plot(ipomopsis$Fruit ~ ipomopsis$Root,
     col = c("red", "blue")[factor(ipomopsis$Grazing)], # color-code by group
     pch = 16,
     main = "Effects of grazing",
     ylab = "Fruit size (g)",
     xlab = "Root size (g)",
     las = 1,   # las = 1 rotates the y-axis labels to horizontal
     cex = 1.5  # cex scales the point size up or down
)
# We can get real fancy, and add a legend.
# ?legend to get a summary of the options.
legend("topleft", legend = c("grazed", "ungrazed"), col = c("red", "blue"), pch = 16)
###############################################################
## Linear models using ordinary least squares regression models
###############################################################
#### Here is the simplest possible model
mod1 <- lm(Fruit ~ Root, data = ipomopsis)
summary(mod1)
abline(mod1) # this adds the fitted regression line to the scatterplot
# this works on simple models, but fails on more complex models with
# multiple effects.
# abline is a flexible function, try these (still on the scatterplot):
abline(v = 7)  # vertical line at Root = 7
abline(h = 40) # horizontal line at Fruit = 40
# Diagnostic plots come last: plot(mod1) replaces the scatterplot on the
# graphics device, so abline() calls made after it would be drawn on the
# wrong figure (the original script had this ordering bug).
plot(mod1) # look at some diagnostic plots (cycles through 4 figures)
#### Varying intercepts model
# there is a strong positive correlation between root size and fruit size.
# but it looks like we can get a better model.
mod2 <- lm(Fruit ~ Root + Grazing, data = ipomopsis)
summary(mod2)
# we can also use abline(intercept, slope) to plot the model fit.
# "Grazed" is the reference (baseline) level, so the model intercept and
# the Root slope describe the grazed line directly; the GrazingUngrazed
# coefficient is the *shift* in intercept for ungrazed plants.
# Extract the fitted coefficients instead of hard-coding rounded numbers --
# this stays correct if the data (and hence the estimates) ever change.
cf <- coef(mod2) # named vector: (Intercept), Root, GrazingUngrazed
# line for the "Grazed" (reference) category
abline(cf["(Intercept)"], cf["Root"], col = "red", lwd = 2) # lwd = line width
# the "Ungrazed" intercept is the baseline intercept plus the
# GrazingUngrazed estimate; the Root slope is shared between groups
abline(cf["(Intercept)"] + cf["GrazingUngrazed"], cf["Root"], col = "blue", lwd = 2)
# notice we use the same slope for the grazed and ungrazed categories.
### Interaction model
mod3 <- lm(Fruit ~ Root*Grazing, data = ipomopsis)
summary(mod3)
##############################
#### Model selection
##############################
# The fact that the p-value for the Root:GrazingUngrazed coefficient
# is not significant suggests that this more complex model is not supported.
# Rather than using p-values to determine which model is better, we should
# be using AIC values, or other more robust methods.
# AIC trades off goodness of fit against model complexity (number of
# parameters); comparisons are only meaningful between models fit to the
# same response data.
AIC(mod1, mod2, mod3)
# the model with the lowest AIC value is considered the best supported.
# According to this, mod2 is the best one.
# Another way to compare models is to use the anova function, which
# performs an F-test between nested models.
anova(mod1, mod2) #mod 2 is significantly better than mod1
anova(mod2, mod3) #mod 3 is NOT significantly better than mod2
# Given this, we will report mod2. Now, we want to export a nice picture
# for our paper/talk
###############################################################
## Peeking into the linear model
###############################################################
# there is more information in the regression model that we can access
names(mod2) # here is a list of some of them
# they can be accessed like this:
mod2$coefficients
# the preferred, more robust way is the extractor function, which works
# the same for every model class:
coef(mod2)
# one thing that can be interesting is to look at what the model
# predicts for each sample. For example, the first sample has a
# Root size of 6.225 and is Ungrazed. How big should the fruit be based
# on the model?
fitted(mod2) # provides a predicted fruit size for each sample
# According to the model, the predicted size of fruit for that sample
# was 54.935. The observed weight was 59.77, so the model guessed a little low
# in that case. The difference between the observed and predicted value is
# the "residual". For the first sample, the residual would be:
59.77 - 54.935 # residual for sample 1
# We can also get these values with the residuals() extractor:
residuals(mod2)
# It can be useful to compare the predicted and observed values for the data
plot(ipomopsis$Fruit, fitted(mod2)) # it looks like the model does a good job.
### Here are some plots to evaluate the model assumptions
plot(mod2) # you will cycle through 4 different figures.
# Fig. 1: A plot of residuals vs. fitted values (predicted values).
# Ideally, these points will be symmetric around the zero line.
# There may be a pattern that suggests we should be fitting a curved line.
# Fig. 2: Normal Q-Q plot. If residuals are from a normal distribution the
# points should lie close to the line.
# Fig. 3: Similar to Fig. 1, but the square root of standardized residuals
# is used. I guess there are theoretical reasons why this is better for
# evaluating whether the variance is constant (but usually doesn't matter).
# A funnel shape to the points might indicate heteroscedasticity (unequal variance).
# Fig. 4: Identifies residuals that are influential in driving the
# regression line. Cook's distance (the red contours) measures the extent
# to which a line would change if the point were omitted. Values >1 are
# considered influential. In this case, everything is less than 0.5.
# Note: outliers may or may not be influential. Influential outliers can
# be problematic.
# To get confidence intervals on the model *coefficients*.
# (If you want intervals around predictions instead, use
#  predict(mod2, interval = "confidence").)
confint(mod2)
### Nice shortcut figures
# You can run this if you know how to install packages.
# If not, don't worry, we will cover that concept next.
# NOTE: car is a third-party package -- install.packages("car") first.
library(car)
scatterplot(Fruit ~ Root, data=ipomopsis,
groups=ipomopsis$Grazing, by.groups=TRUE)
# fits a linear regression model and a smooth (spline) fit to each group.