-
Notifications
You must be signed in to change notification settings - Fork 2
/
CSE891_LinearModelsIntro_old_SomeStuffNeedsToBEExtracted.R
216 lines (132 loc) · 9.69 KB
/
CSE891_LinearModelsIntro_old_SomeStuffNeedsToBEExtracted.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# BEACON CSE845 Feb 28 2013
# First let us bring in the course source file
source('http://beaconcourse.pbworks.com/w/file/fetch/36678466/Source_useful_R_function_ID_Feb24_2011.R')# to see what functions this has....
# The data we are going to be using for this tutorial were provided by Luis (who else!).
# Luis set up some experiments to look at the joint influences of recombination, and mutation rate with and without parasitism on species(?) diversity and richness, and whether or not parasites were lost (for runs that had them). F
parasite_data <- read.csv("http://beaconcourse.pbworks.com/w/file/fetch/37890621/sexAsex.csv", h=T)
# remember to always check your data coming in
str(parasite_data)
summary(parasite_data)
head(parasite_data)
dim(parasite_data) # check against wc -l sexAsex.csv
# First we need to change some variable types
parasite_data$Sex <- factor(parasite_data$Sex) # was sex turned on
parasite_data$Parasites <- factor(parasite_data$Parasites) # did we have parasites initially in the experiment
levels(parasite_data$Sex) <- c("NoSex", "Sex") # This is the easiest way of recoding the levels of the factor.. below is another approach.
levels(parasite_data$Parasites) <- c("NoParasites","Parasites")
# parasite_data$Sex <- ifelse(parasite_data$Sex=="0", "NoSex","Sex") # remember ifelse is vectorized!
# parasite_data$Parasites <- ifelse(parasite_data$Parasites=="0", "NoParasites","Parasites")
# parasite_data$Sex <- factor(parasite_data$Sex) # was sex turned on
# parasite_data$Parasites <- factor(parasite_data$Parasites) # did we have parasites initially in the experiment
# Quick reminder of the structure of the experiment.
with(parasite_data, table(factor(HostMutationRate), Sex, Parasites))
# A plot to remind us of what is going on
require(lattice)
bwplot(FinalDiversityIndex ~ factor(HostMutationRate)|Sex*Parasites, data=parasite_data)
# So We are asking the basic question (to start with) on how FinalDiversity is influenced by several underlying covariates including mutation rate, whether there is sexual recombination and whether parasites occur. The latter two variables are categorical, so we will treat these as factors (although as you will soon realize it does not really matter for linear models, when there are only two levels.)
# So we have one continuous covariate (mutation rate)
# and two treatment factors with two levels (on or off).
# Let's start by just looking at a subset of the data, with the parasites non present and no sex.
parasite_data_0_0 <- parasite_data[parasite_data$Sex =="NoSex" & parasite_data$Parasites =="NoParasites",]
str(parasite_data_0_0) # Do you notice anything weird about this?
edit(parasite_data_0_0)
levels(parasite_data_0_0$Sex)
# It turns out R has a "feature" which most people consider annoying, that is, it reports levels of a factor from the original data set, even if you subset on levels of that factor.
# It turns out this will not influence most analyses, but it is safest to just "fix" this
#parasite_data_0_0$Sex <- factor(parasite_data_0_0$Sex) # one method to drop levels.....
parasite_data_0_0$Sex <- droplevels(parasite_data_0_0$Sex) # this does the same as the line above
levels(parasite_data_0_0$Sex)
# Note this can also be done using the droplevels()
parasite_data_0_0$Parasites <- droplevels(parasite_data_0_0$Parasites)
plot(FinalDiversityIndex ~ jitter(HostMutationRate, factor=0.5), data=parasite_data_0_0, pch=16)
# HEre is the classic regression of FinalDiversity on mutation rate
parasites_lm_1 <- lm(FinalDiversityIndex ~ 1 + HostMutationRate, data=parasite_data_0_0)
# Before we do anything else, let's think about the design matrix
# y[i] ~ a + bx[i] + e[i]
# Y = BetaX + E
edit(model.matrix(~1+ parasite_data_0_0$HostMutationRate))
#or
model.matrix(parasites_lm_1)
# Now let's take account of what this all means....
summary(parasites_lm_1)
# Let's step through this..
#####Output from summary
# Call:
# lm(formula = FinalDiversityIndex ~ HostMutationRate, data = parasite_data_0_0)
# Residuals:
# Min 1Q Median 3Q Max
# -1.44135 -0.64813 -0.03321 0.47003 2.29853
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.9857 0.1616 6.101 2.11e-08 ***
# HostMutationRate 0.6452 0.2813 2.294 0.0239 *
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residual standard error: 0.7955 on 98 degrees of freedom
# Multiple R-squared: 0.05096, Adjusted R-squared: 0.04127 # 1 - (residual SS/total SS) OR #regression SS/total SS
# F-statistic: 5.262 on 1 and 98 DF, p-value: 0.02393
# The fitted line
abline(parasites_lm_1, lwd=3)
confint(parasites_lm_1)
# As you have read about, and we have discussed briefly, there are a number of assumptions that are made in a linear model. There are a number of graphical tools we can use to evaluate them. Some of them are even automated in R.
par(mfrow=c(2,2))
plot(parasites_lm_1)
# Let's talk about these. By far the first two are the most useful...
#There are other ways of looking at these.
plot(resid(parasites_lm_1) ~ parasite_data_0_0$HostMutationRate)
boxplot(resid(parasites_lm_1) ~ as.factor(parasite_data_0_0$HostMutationRate), ylab="residuals", xlab="HostMutationRate")
#Sometimes I find a histogram or density plot is most useful for the residuals as opposed to a q-q plot
data_plotter.fun(resid(parasites_lm_1))
# So we have clearly violated one assumption (homogeneity of variance). The residuals look more or less normally distributed, but clearly increase with mutation rate. It is sometimes difficult to know the exact impact of this. There are a number of things we can do to address this if we are worried though (GLS, non-parametric models, inference using resampling.)
# One of the most useful (but not so frequently used) plots is fitted values ~ observed.
plot(fitted(parasites_lm_1) ~ parasite_data_0_0$FinalDiversityIndex, xlim=c(0,4), ylim=c(0,4))
abline(a=0,b=1)
# In many ways this provides a graphical interpretation of R^2. As the numbers suggested, we have not really done a good job of fitting the data with such a simple model.
anova(parasites_lm_1)
# To understand this, we need to think about a couple of things.
# F distributions...
# So we need to think about the sum of squares of two models.
parasites.lm.0 <- lm(FinalDiversityIndex ~ 1, data=parasite_data_0_0) # What does this model mean?
anova(parasites.lm.0, parasites_lm_1) # Compare the two models, We are asking how much of the variance have we accounted for in the model/ residual variance (unaccounted for by the model.)
# Now let us take a look at a different subset of the data, where we are including Parasites
parasite_data_p_0 <- parasite_data[parasite_data$Sex =="NoSex" & parasite_data$HostMutationRate==0.1,]
dim(parasite_data_P_0)
plot(FinalDiversityIndex ~ Parasites, data=parasite_data_p_0)
# So since the only factor (parasites present or not) has only two levels we could try a t-test...
# But we could instead ask the same question we asked above, how much variation can we account for by fitting a model
lm_parasites_1 <- lm(FinalDiversityIndex ~ 1 + Parasites, data=parasite_data_p_0)
lm_parasites_0 <- lm(FinalDiversityIndex ~ 1, data=parasite_data_p_0)
# In this linear model framework, it is important to think about what the design matrix might look like..
# This is something we will come back to alot, and is probably the most important thing
anova(lm.parasites_1)
anova(lm.parasites_0, lm.parasites_1) # comparing the models
# Let's think about what we are doing here. We are computing the ratio of the variation between groups (Parasites VS NoParasites) to the variation within the population (or within groups.)
# We know SS.total = SS.model + SS.E
# we can think about this is a slightly different manner for the ANOVA.
# in particular the relative ratio of SS.model and SS.E (although we will need to adjust these for degrees of freedom.)
# We can calculate SS.model and error as before, but there is a different "traditional" approach for anova.
# variation between groups VS. variation within group.s
# variation between groups. Often called SS.treatment
# first we calculate each treatment mean (in this case, with and without parasite).
treatment_means <- with(parasite_data_p_0, tapply(X=FinalDiversityIndex,INDEX=Parasites, FUN=mean))
# Then for each we have to compute the sum of squared differences between the treatment mean, and the sample (grand mean for sample)
TreatmentSquaredDifferences <- (treatment_means- mean(parasite_data_p_0$FinalDiversityIndex))^2
# It is important to think about what we have just done (we will draw it on the board)...
# The treatment means are the fitted values for ALL of the observations..
# So the difference we have computed has to multipied by the number of individuals we have for each group.
TreatmentSampleSize <- with(parasite_data_p_0, tapply(X=FinalDiversityIndex,INDEX=Parasites, FUN=length))
TreatmentSumSquares <- sum( TreatmentSquaredDifferences*TreatmentSampleSize )
# The same logic is essentially applied for the error, but computing the sum of squares within each group, and summing across those.
summary(lm_parasites_1)
# Let's think about what these parameters mean, as it depends ALOT on how we specified the design matrix.
# SO what does this mean?
lm_parasites_1_cell <- lm(FinalDiversityIndex ~ 0 + Parasites, data=parasite_data_p_0)
##### The whole shebang #!
LmParasiteDataComplete <- lm(FinalDiversityIndex ~ (Parasites*HostMutationRate*Sex) - Parasites:HostMutationRate:Sex , data=parasite_data)
summary(LmParasiteDataComplete)
par(mfrow=c(1,1))
plot(parasite_data$FinalDiversityIndex ~fitted(LmParasiteDataComplete), xlim=c(0,5), ylim=c(0,5))
abline(a=0, b=1)
par(mfrow=c(2,2))
plot(LmParasiteDataComplete)
hist(resid(LmParasiteDataComplete))