-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_day2.Rmd
365 lines (280 loc) · 8.78 KB
/
03_day2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
```{r, include = FALSE}
knitr::opts_chunk$set(fig.align = "center", fig.show = TRUE,
class.source = "fold-show")
```
# Simple data analysis in R
R is a free software environment for statistical computing and graphics.
The stats package, which is automatically loaded when R starts up, contains
functions for statistical calculations and random number generation. For a
complete list of functions, see `library(help = stats)`.
## $t$ tests
```{r}
# One-sample t test
rt <- rnorm(100, mean = 400, sd = 20)
t.test(rt, mu = 400)
# Two-sample t test for independent groups
t.test(weight ~ group, PlantGrowth[PlantGrowth$group != "ctrl", ],
var.equal = TRUE)
# Two-sample t test for dependent groups
sleep2 <- reshape(sleep, direction = "wide", idvar = "ID", timevar = "group")
## Traditional interface
t.test(sleep2$extra.1, sleep2$extra.2, paired = TRUE)
## Formula interface
t.test(Pair(extra.1, extra.2) ~ 1, data = sleep2)
```
## Linear models
Let us first fit a simple linear regression.
```{r, fig.dim = c(4, 4.5)}
dat <- data.frame(
x = c(18, 23, 25, 35, 65, 54, 34, 56, 72, 19, 23, 42, 18, 39, 37),
y = c(202, 186, 187, 180, 156, 169, 174, 172, 153, 199, 193, 174, 198, 183, 178)
)
lm1 <- lm(y ~ x, dat)
plot(y ~ x, dat)
abline(lm1)
coef(lm1)
predict(lm1)
resid(lm1)
names(lm1)
summary(lm1)
```
```{r, fig.dim = c(8, 8)}
# model diagnostics
par(mfrow = c(2, 2))
plot(lm1)
```
## Analysis of Variance (ANOVA)
An ANOVA is just a special case of a regression where all predictors are
categorical. Using the function `aov()` instead of `lm()` gives us
different results for the extractor functions.
```{r anova}
dat <- read.table(text = "
id group score
1 control 8
2 control 12
3 control 7
4 control 10
5 control 11
6 control 12
7 pro 7
8 pro 9
9 pro 15
10 pro 13
11 pro 11
12 pro 16
13 pro 12
14 pro 8
15 pro 13
16 pro 16
17 contra 4
18 contra 5
19 contra 6
20 contra 3
21 contra 8
22 contra 10
23 contra 3
24 contra 9
", header = TRUE)
aov1 <- aov(score ~ group, data = dat)
summary(aov1)
```
We can again look at the model diagnostics.
```{r, fig.dim = c(8, 8)}
# model diagnostics
par(mfrow = c(2, 2))
plot(aov1)
```
# Plotting
There are two graphics systems in R
* Traditional graphics
* Grid graphics
## Traditional plots
```{r, fig.dim = c(4, 4.5)}
plot(cars)
```
Stepwise plotting -- first example:
```{r, fig.dim = c(4, 4.5)}
dat <- read.table(header = TRUE, text = "
A B rt
a1 b1 825
a1 b2 792
a1 b3 840
a2 b1 997
a2 b2 902
a2 b3 786
", stringsAsFactors = TRUE)
plot(rt ~ as.numeric(B), dat, type = "n", axes = FALSE, xlim = c(.8, 3.2),
ylim = c(750, 1000), xlab = "Difficulty", ylab = "Mean reaction time (ms)")
# Plot the data points separately for each level of factor A.
points(rt ~ as.numeric(B), dat[dat$A == "a1", ], type = "b", pch = 16)
points(rt ~ as.numeric(B), dat[dat$A == "a2", ], type = "b", pch = 4)
# Add axes and a legend.
axis(side = 1, at = 1:3, expression(B[1], B[2], B[3]))
axis(side = 2)
legend(2.5, 975, expression(A[1], A[2]), pch = c(16, 4), bty = "n",
title = "Task")
```
Second example:
```{r, fig.dim = c(5, 4.5)}
plot(Sepal.Length ~ Sepal.Width, iris, axes = FALSE, type = "n",
xlab = "Sepal width", ylab = "Sepal Length")
points(Sepal.Length ~ Sepal.Width, subset(iris, iris$Species == "setosa"),
col = "magenta", pch = 21)
points(Sepal.Length ~ Sepal.Width, subset(iris, iris$Species == "versicolor"),
col = "red", pch = 22)
points(Sepal.Length ~ Sepal.Width, subset(iris, iris$Species == "virginica"),
col = "purple", pch = 23)
axis(1)
axis(2)
legend("topleft", c("setosa", "versicolor", "virginica"),
pch = 21:23, col = c("magenta", "red", "purple"), bty = "n")
lm1 <- lm(Sepal.Length ~ Sepal.Width, subset(iris, iris$Species == "setosa"))
lm2 <- lm(Sepal.Length ~ Sepal.Width, subset(iris, iris$Species == "versicolor"))
lm3 <- lm(Sepal.Length ~ Sepal.Width, subset(iris, iris$Species == "virginica"))
abline(lm1, col = "magenta")
abline(lm2, col = "red")
abline(lm3, col = "purple")
```
## Lattice plots
The lattice package implements Trellis plots in R. These are plots
conditional on other variables. They are perfectly suited for visualizing
complex relationships.
```{r, fig.dim = c(5.3, 5)}
library(lattice)
states <- data.frame(state.x77, state.name = state.name,
state.region = state.region) # built-in data sets
xyplot(Murder ~ Population | state.region, states)
```
```{r, fig.dim = c(6, 6), include = FALSE}
# Fine control
xyplot(Murder ~ Population | state.region, states, groups = state.name,
xlim = c(-5000, 25000), ylab = "Murder rate (per 100000)",
# panel function
panel = function(x, y, subscripts, groups){
panel.grid(-1, -1) # align with ticks
panel.abline(lm(y ~ x)) # regression line
ltext(x, y, labels = groups[subscripts], cex = 1)
},
scales = list(tck = .5, cex = .7, x = list(at = seq(0, 20000, 5000))),
strip = strip.custom(bg = "gray96"), # strip function
par.strip.text = list(cex = .7)
)
```
```{r, fig.dim = c(6, 3.5)}
xyplot(Sepal.Length ~ Sepal.Width, iris, groups = Species,
type = c("p", "r"), auto.key = TRUE)
```
Other example to "quickly" look at data:
```{r, fig.dim = c(4, 4), include = FALSE}
xyplot(extra ~ group, sleep, groups = ID, type = "b")
```
```{r, fig.dim = c(5.3, 4)}
xyplot(weight ~ Time | Diet, ChickWeight, groups = Chick,
type = c("g", "p", "r"))
```
## The Grammar of Graphics `ggplot2`
```{r, include = FALSE}
library(ggplot2)
theme_set(theme_test())
```
R has very powerful graphics function. Creating beautiful (publication
ready) plots might be one of the best reasons to learn R. The R package
`ggplot2` gives you endless possibilities. In recent years it has become
the state of the art way to create plots in R. It has a steeper learning
curve than the functions above (and I recommend to get some knowledge about
them, too, so you understand how plotting in R works), but it is well worth
the effort to invest some time. A good place to start are the following
(online) books:
* Chang, W. (2018). R graphics cookbook: practical recipes for visualizing
data. O'Reilly Media. https://r-graphics.org/
* Wickham, H. (2016). ggplot2: elegant graphics for data analysis
Springer-Verlag New York. https://ggplot2-book.org/
The grammar of graphics has a simple structure
```
ggplot(data = <DATA>) +
<GEOM_FUNCTION>(
mapping = aes(<MAPPINGS>),
stat = <STAT>,
position = <POSITION>
) +
<COORDINATE_FUNCTION> +
<FACET_FUNCTION>
```
```{r}
# load package
library(ggplot2)
# load data
data(mpg)
# ?mpg
mpg
```
### Some examples
```{r, fig.dim = c(3.5, 3.5)}
# start your plot with coordinate system
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) + # then add layers
theme_bw() # and a theme
```
```{r, fig.dim = c(4, 3.5)}
# even more information
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, shape = fl))
```
What is the difference between these plots?
```{r, fig.dim = c(5, 3.5)}
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color = class))
```
```{r, fig.dim = c(4, 3.5)}
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
```
What happens when we do this?
```{r, fig.dim = c(5, 3.5)}
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color = "blue"))
```
### Facets
* Way to visualize additional variables
* Split plot into several facets
* Usually only meaningful for categorical variables
* Used with R's formula notation `y ~ x`
```{r, fig.dim = c(6, 4)}
# one variable
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~ class, nrow = 2)
```
```{r, fig.dim = c(6, 4)}
# two variables
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(drv ~ cyl)
```
### Geometric objects
* A `geom` is the geometrical object that a plot uses to represent data
* For example bar charts, box plots, line charts, ...
* Every `geom` function takes a `mapping` argument.
* `ggplot2` provides over 40 `geoms`, and extension packages provide even more
(see https://exts.ggplot2.tidyverse.org/gallery/ for a sampling)
```{r, fig.dim = c(4, 3)}
# scatter plot
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
```
```{r, fig.dim = c(4, 3)}
# fitted line
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy))
```
```{r, fig.dim = c(4.5, 3)}
# add another variable
ggplot(data = mpg) +
geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
```
```{r, fig.dim = c(4.5, 3)}
# more layers: points and regression lines
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy, color = drv)) +
geom_smooth(aes(x = displ, y = hwy, color = drv), method = "lm")
```