-
Notifications
You must be signed in to change notification settings - Fork 1
/
predicting_blood_donations.Rmd
298 lines (239 loc) · 9.76 KB
/
predicting_blood_donations.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
---
title: "Predict Blood Donations"
author: "Ben Herndon-Miller"
date: "1/29/2019"
output: html_document
---
# Load dependencies and data
```{r}
# Install and load pacakges
list.of.packages <- c("dplyr", "caret", "e1071", "esquisse")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(dplyr)
library(caret)
library(e1071)
library(esquisse)
# Load data
train <- read.csv("data/BloodDonationTrainingData.csv", header = TRUE, stringsAsFactors = FALSE)
test <- read.csv("data/BloodDonationTestData.csv", header = TRUE, stringsAsFactors = FALSE)
example_submission <- read.csv("data/BloodDonationSubmissionFormat.csv", header = TRUE, stringsAsFactors = FALSE)
# Rename labels for the caret package (doesn't allow 1 and 0 as label names)
train$Made.Donation.in.March.2007[train$Made.Donation.in.March.2007 ==1] <- "yes"
train$Made.Donation.in.March.2007[train$Made.Donation.in.March.2007 ==0] <- "no"
```
# Exploratory Analysis
```{r}
esquisse::esquisser()
```
# Tests for Marginal Associations
```{r}
# Initialize data frame to store results of t-tests
marginal_results <- data.frame(matrix(nrow = 4, ncol = 4))
colnames(marginal_results) <- c("mean_no", "mean_yes", "t_statistic", "p_value")
rownames(marginal_results) <- colnames(train)[2:5]
# Perform Welch's t-test to test marginal association between all variables
for (i in 2:5){
tmp_test <- t.test(train[,i]~train$Made.Donation.in.March.2007)
marginal_results$mean_no[i-1] <- tmp_test$estimate[1]
marginal_results$mean_yes[i-1] <- tmp_test$estimate[2]
marginal_results$t_statistic[i-1] <- tmp_test$statistic
marginal_results$p_value[i-1] <- tmp_test$p.value
}
marginal_results
```
# Define log loss function
```{r}
# Define function to calculate log loss
log_loss <- function(actual, predicted, eps = 1e-15){
actual[actual == "yes"] <- 1
actual[actual == "no"] <- 0
actual <- as.numeric(actual)
# Bound probabilities (0,1) for computational purposes
predicted[predicted < eps] <- eps
predicted[predicted > 1 - eps] <- 1 - eps
result=-1/length(actual)*(sum((actual*log(predicted)+(1-actual)*log(1-predicted))))
return(result)
}
```
# Train KNN algorithm
```{r}
# Set seed for reproducibility
set.seed(123)
# Initialize data frame of cross-validation log loss
knn_cv_results <- data.frame(matrix(ncol = 6, nrow = 20))
knn_cv_results[,1] <- c(1:20)
colnames(knn_cv_results) <- c("k", "iter1", "iter2", "iter3", "iter4", "iter5")
# Perform repeated cross-validation for KNN to tune K
for (i in 1:20){
for (j in 1:5){
# Set random index for subsampling
cv_idx <- sample(nrow(train), nrow(train)*.7, replace = FALSE)
# Split data into training and validation sets, drop ID column
cv_tr <- train[cv_idx,-1]
cv_val <- train[-cv_idx,-1]
# Set K
cv_grid <- expand.grid(k = c(i))
# Train KNN model
knn_cv <- train(as.factor(Made.Donation.in.March.2007) ~ Months.since.Last.Donation + Number.of.Donations + Total.Volume.Donated..c.c.. + Months.since.First.Donation,
data = cv_tr,
method = "knn",
tuneGrid = cv_grid)
# Predict on validation set
pred_cv <- predict(knn_cv, cv_val, type = "prob")
# Record results
knn_cv_results[i,j+1] <- log_loss(cv_val$Made.Donation.in.March.2007, pred_cv$yes)
}
}
# Calculate average log loss at each value of K
knn_cv_results$avg_log_loss <- rowSums(knn_cv_results[,2:6])/5
```
# Define custom log loss function for Caret Package
```{r}
# Code borrowed from https://www.kaggle.com/c/otto-group-product-classification-challenge/discussion/13064#69102
LogLossSummary <- function (data, lev = NULL, model = NULL) {
LogLoss <- function(actual, pred, eps = 1e-15) {
# Check to see that two vectors are the same length
stopifnot(all(dim(actual) == dim(pred)))
# Bound probabilities (0,1) for computational purposes
pred[pred < eps] <- eps
pred[pred > 1 - eps] <- 1 - eps
# Compute log loss
-sum(actual * log(pred)) / nrow(pred)}
# Convert into factors
if (is.character(data$obs)) data$obs <- factor(data$obs, levels = lev)
pred <- data[, "pred"]
obs <- data[, "obs"]
isNA <- is.na(pred)
pred <- pred[!isNA]
obs <- obs[!isNA]
data <- data[!isNA, ]
cls <- levels(obs)
# Contruct loss function summary output
if (length(obs) + length(pred) == 0) {out <- rep(NA, 2)}
else {
pred <- factor(pred, levels = levels(obs))
out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag", "kappa")]
probs <- data[, cls]
actual <- model.matrix(~ obs - 1)
out2 <- LogLoss(actual = actual, pred = probs)
}
out <- c(out, out2)
names(out) <- c("Accuracy", "Kappa", "LogLoss")
if (any(is.nan(out))) out[is.nan(out)] <- NA
out
}
```
# K-Nearest Neighbors
```{r}
# Set seed for reproducibility
set.seed(123)
# Create training controls for repeated cross-validation for KNN
knn_control <- trainControl(method = "repeatedcv",
number = 5,
classProbs = TRUE,
summaryFunction = LogLossSummary)
# Create vector of hyper-parameter values to tune over
knn_grid <- expand.grid(k = c(1:20))
# Train KNN model
knn_fit <- train(as.factor(Made.Donation.in.March.2007) ~ Months.since.Last.Donation + Number.of.Donations + Total.Volume.Donated..c.c.. + Months.since.First.Donation,
data = train,
method = "knn",
metric = "LogLoss",
maximize = FALSE,
trControl = knn_control,
tuneGrid = knn_grid)
# Predict on test data to output class probabilities
knn_pred <- predict(knn_fit, test, type = "prob")
# Create data frame to submit
knn_submission <- data.frame(test$X, knn_pred$yes)
colnames(knn_submission) <- c("", "Made Donation in March 2007")
# Write CSV to submit
write.csv(knn_submission, "knn_submission.csv", row.names = FALSE)
```
# Penalized Logistic Regression (Elastic Net)
```{r}
# Set seed for reproducibility
set.seed(123)
# Create training controls for repeated cross-validation for Elastic Net
elnet_control <- trainControl(method = "repeatedcv",
number = 10,
classProbs = TRUE,
summaryFunction = LogLossSummary)
# Create grid of hyper-parameter values to tune over
elnet_grid <- expand.grid(alpha = seq(0,1,0.1),
lambda = 10^seq(-4,-1,length=100))
# Train Elastic Net model
elnet_fit <- train(as.factor(Made.Donation.in.March.2007) ~ Months.since.Last.Donation + Number.of.Donations + Total.Volume.Donated..c.c.. + Months.since.First.Donation,
data = train,
method = "glmnet",
metric = "LogLoss",
maximize = FALSE,
trControl = elnet_control,
tuneGrid = elnet_grid)
# Predict on test data to output class probabilities
elnet_pred <- predict(elnet_fit, test, type = "prob")
# Create data frame to submit
elnet_submission <- data.frame(test$X, elnet_pred$yes)
colnames(elnet_submission) <- c("", "Made Donation in March 2007")
# Write CSV to submit
write.csv(elnet_submission, "elnet_submission.csv", row.names = FALSE)
```
### Support Vector Machine with Linear Kernel
```{r}
# Set seed for reproducibility
set.seed(123)
# Create training controls for K-Fold cross-validation for SVM
svm_control <- trainControl(method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = LogLossSummary)
# Create grid of hyper-parameter values to tune over
svm_grid <- expand.grid(C = 2^seq(-5,5))
# Train SVM model with Linear Kernel
svm_fit <- train(as.factor(Made.Donation.in.March.2007) ~ Months.since.Last.Donation + Number.of.Donations + Total.Volume.Donated..c.c.. + Months.since.First.Donation,
data = train,
method = "svmLinear",
# We can add options to pre-process the data
preProc = c("center", "scale"),
metric = "LogLoss",
maximize = FALSE,
trControl = svm_control,
tuneGrid = svm_grid)
# Predict on test data to output class probabilities
svm_pred <- predict(svm_fit, test, type = "prob")
# Create data frame to submit
svm_submission <- data.frame(test$X, svm_pred$yes)
colnames(svm_submission) <- c("", "Made Donation in March 2007")
# Write CSV to submit
write.csv(svm_submission, "svm_submission.csv", row.names = FALSE)
```
### Random Forest
```{r}
# Set seed for reproducibility
set.seed(123)
# Create training controls for K-Fold cross-validation for Random Forest
rf_control <- trainControl(method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = LogLossSummary)
# Create vector of hyper-parameter values to tune over
rf_grid <- expand.grid(mtry = c(1:4))
# Train Random Forest model
rf_fit <- train(as.factor(Made.Donation.in.March.2007) ~ Months.since.Last.Donation + Number.of.Donations + Total.Volume.Donated..c.c.. + Months.since.First.Donation,
data = train,
method = "rf",
# We can add options to pre-process the data
preProc = c("center", "scale"),
metric = "LogLoss",
maximize = FALSE,
trControl = rf_control,
tuneGrid = rf_grid)
# Predict on test data to output class probabilities
rf_pred <- predict(rf_fit, test, type = "prob")
# Create data frame to submit
rf_submission <- data.frame(test$X, rf_pred$yes)
colnames(rf_submission) <- c("", "Made Donation in March 2007")
# Write CSV to submit
write.csv(rf_submission, "rf_submission.csv", row.names = FALSE)
```