-
Notifications
You must be signed in to change notification settings - Fork 0
/
.Rhistory
316 lines (316 loc) · 14.2 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# Load data, explore survival rates, build the rule-based prediction ----
#
# Cleaned from the console history: commands that were typos, duplicates,
# or assignments overwritten before use have been removed. Objects left
# behind for later sections: `train` (with Child and Fare2 columns) and
# `test` (with a rule-based Survived column).
library(readr)

# The history loaded `train` twice; the later load is the one kept, since
# it is what the following commands actually operated on.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
View(train)
str(train)

# Overall survival: counts, proportions, percentages.
table(train$Survived)
prop.table(table(train$Survived))
prop.table(table(train$Survived)) * 100

test <- read_csv("C:/Users/Abhay Goyal/Downloads/test.csv")
View(test)

# Survival by sex; the last call normalises within each row (each sex).
table(train$Sex)
prop.table(table(train$Sex))
prop.table(table(train$Sex, train$Survived))
prop.table(table(train$Sex, train$Survived), 1)

# Flag passengers younger than 18. which() drops rows where Age is NA,
# so those rows keep the default of 0.
summary(train$Age)
train$Child <- 0
train$Child[which(train$Age < 18)] <- 1

# Survivors, group sizes, survival rate and survival percentage per
# Sex/Child group.
aggregate(Survived ~ Sex + Child, data = train, FUN = sum)
aggregate(Survived ~ Sex + Child, data = train, FUN = length)
aggregate(Survived ~ Sex + Child, data = train,
          FUN = function(x) sum(x) / length(x))
aggregate(Survived ~ Sex + Child, data = train,
          FUN = function(x) sum(x) / length(x) * 100)

# Bin fares into four labelled bands; "30+" is the default and the lower
# bands overwrite it.
train$Fare
train$Fare2 <- "30+"
train$Fare2[train$Fare < 30 & train$Fare >= 20] <- "20-30"
train$Fare2[train$Fare < 20 & train$Fare >= 10] <- "10-20"
train$Fare2[train$Fare < 10] <- "<10"
aggregate(Survived ~ Sex + Pclass + Fare2, data = train,
          FUN = function(x) sum(x) / length(x))
aggregate(Survived ~ Sex + Pclass + Fare2, data = train,
          FUN = function(x) sum(x) / length(x) * 100)

# Rule-based prediction: nobody survives, except females, except
# third-class females who paid a fare of 20 or more.
test$Survived <- 0
test$Survived[test$Sex == "female"] <- 1
test$Survived[test$Sex == "female" & test$Pclass == 3 & test$Fare >= 20] <- 0
# Decision trees with rpart ----
#
# Cleaned from the console history: the unquoted install.packages(rattle)
# call, the repeated install/library/plot commands and the attempts to
# load an `hplot` package (nothing from it is called here) are removed.
# Uses `train` and `test` from the earlier part of the session.
# Leaves behind: `fit` (the fully grown tree), `Prediction` and `submit`
# (both from the default-settings tree).

# Install the plotting helpers only when they are missing.
for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rpart)
library(RColorBrewer)
library(rattle)
library(rpart.plot)

# Classification tree on all the basic predictors.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train, method = "class"
)
plot(fit)
text(fit)
fancyRpartPlot(fit)

# Tree on Sex alone, for comparison.
fit <- rpart(Survived ~ Sex, data = train, method = "class")
fancyRpartPlot(fit)
rpart.plot(fit)

# Back to the full predictor set; predict the test set from this tree.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train, method = "class"
)
rpart.plot(fit)
Prediction <- predict(fit, test, type = "class")
# Column name fixed from the history's "SUrvived".
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)

# Fully grown tree. The history spelled the argument `missplit`, which
# rpart.control() absorbs through `...`, so the default minsplit stayed
# in force; `minsplit = 2` is the setting that was meant.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train, method = "class",
  control = rpart.control(minsplit = 2, cp = 0)
)
rpart.plot(fit)
# Write the tree-based submission ----
#
# Cleaned from the console history. Removed:
#   * rbind(train, test): at this point `train` carries the extra Child
#     and Fare2 columns that `test` lacks, so the frames cannot be stacked
#     (the combined frame is rebuilt later from freshly loaded data);
#   * adding Child/Fare2 to `test` and then dropping them again, including
#     the subset(test, -...) calls that passed a column to the row filter
#     instead of using `select =`. The net effect on `test` was nil.
train$Name[1]
test$Survived <- NA
dim(train)
dim(test)

# NOTE(review): `Prediction` is not recomputed in this section. It holds
# whatever the most recent predict() call produced; confirm that it comes
# from the model this file name refers to.
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "fullgrowntree.csv", row.names = FALSE)
# Feature engineering: Title, FamilySize, Surname, FamilyID ----
#
# Cleaned from the console history: lines that could not parse, the
# misspelled `Suurvuved`/`predit` attempts and repeated commands are
# removed. Leaves behind `combi` (train and test stacked, with the
# engineered columns), `train`, `test`, `fit`, `Prediction`, `submit`,
# and writes engineeredfeatures.csv.
library(readr)
library(rpart)
library(rpart.plot)

train <- read_csv("C:/Users/Abhay Goyal/Downloads/train.csv")
View(train)
test <- read_csv("C:/Users/Abhay Goyal/Downloads/test.csv")
View(test)

# Stack train and test so engineered columns are derived once for both.
# `test` gets a placeholder Survived column so the column sets match.
train$Name[1]
test$Survived <- NA
combi <- rbind(train, test)
combi$Name <- as.character(combi$Name)
combi$Name
combi$Name[1]

# Splitting a name on "," or "." yields the pieces used below: piece 1
# becomes Surname and piece 2 becomes Title.
strsplit(combi$Name[1], split = "[,.]")
strsplit(combi$Name[1], split = "[,.]")[[1]]
strsplit(combi$Name[1], split = "[,.]")[[1]][2]

name_parts <- strsplit(combi$Name, split = "[,.]")
combi$Title <- vapply(name_parts, function(parts) parts[2], character(1))
combi$Title <- sub(" ", "", combi$Title)  # drop the first space
combi$Title
table(combi$Title)

# Merge rare titles into broader groups.
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[
  combi$Title %in% c("Jonkheer", "Lady", "Dona", "the Countess")
] <- "Lady"
table(combi$Title)
# The history re-extracted Title after this point, which replaced the
# cleaned factor with the raw strings; that re-extraction is dropped so
# the merged factor is what the model sees.
combi$Title <- factor(combi$Title)

# Family size = siblings/spouses + parents/children + the passenger.
combi$FamilySize <- combi$SibSp + combi$Parch + 1

# FamilyID = family size pasted onto the surname; families of size <= 2,
# and IDs that occur at most twice, are pooled as "Small".
combi$Surname <- vapply(name_parts, function(parts) parts[1], character(1))
combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)
combi$FamilyID
combi$FamilyID[combi$FamilySize <= 2] <- "Small"
table(combi$FamilySize)
table(combi$FamilyID)
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2, ]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"
combi$FamilyID <- factor(combi$FamilyID)

# Split back into the original train/test rows.
train <- combi[1:891, ]
test <- combi[892:1309, ]

fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID,
  data = train, method = "class"
)
rpart.plot(fit)
Prediction <- predict(fit, test, type = "class")
# Header spelled like the source column, `PassengerId` (the history wrote
# `PassengerID` here).
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "engineeredfeatures.csv", row.names = FALSE)
# Fill in missing Age, Embarked and Fare; derive FamilyID2 ----
#
# Cleaned from the console history. Removed: the read_csv(NULL) import
# stub, the `TRUe` typo, predict() calls that indexed `combi` without the
# row comma, and the trial-and-error probing of Embarked (`== NA`,
# is.na() with no argument, and similar). The raw reloads of `train` and
# `test` are dropped as well: this section only modifies `combi`, and the
# reloads discarded the engineered columns.
# Uses `combi` with Title, FamilySize and FamilyID already present.
library(rpart)
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}

sample(1:10, replace = TRUE)

# Age: fit a regression tree on rows with a known Age and predict the rest.
summary(combi$Age)
Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ], method = "anova"
)
age_missing <- is.na(combi$Age)
combi$Age[age_missing] <- predict(Agefit, combi[age_missing, ])
summary(combi)

# Embarked: fill missing entries with "S". The history hard-coded rows 62
# and 830 straight after this lookup (presumably its output); indexing by
# the lookup itself avoids the magic numbers.
summary(combi$Embarked)
embarked_missing <- which(is.na(combi$Embarked))
embarked_missing
combi$Embarked[embarked_missing] <- "S"
combi$Embarked <- factor(combi$Embarked)

# Fare: replace only the missing entries with the median. The history
# assigned the median to the entire column, wiping every real fare.
which(is.na(combi$Fare))
combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)
summary(combi)
summary(combi$Age)
summary(combi$Pclass)

# FamilyID2: like FamilyID, but families of size <= 3 are also "Small".
combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- "Small"
combi$FamilyID2 <- factor(combi$FamilyID2)
combi$FamilyID2
combi$FamilyID
# Random forest model and submission ----
#
# Cleaned from the console history: the many fits that were retried
# unchanged, the sapply()/strsplit attempts that could not parse, the
# invalid `which(...) <- 0` assignments, the `combi[1:89,]` typo and a
# cforest() call issued before its package was loaded are removed.
# Uses `combi` with Name, Surname, FamilyID2 and imputed predictors.
# Leaves behind `train`, `test`, `fit`, `Prediction`, `submit`, and writes
# firstforest.csv.
library(randomForest)

combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Rebuild Title from the name: the piece between "," and ".", with the
# first space removed.
combi$Title <- vapply(
  strsplit(combi$Name, split = "[,.]"),
  function(parts) parts[2],
  character(1)
)
combi$Title <- sub(" ", "", combi$Title)
# Combine small title groups
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[
  combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")
] <- "Lady"
# Convert to a factor
# The history had this heading with no code under it, then re-extracted
# Title (losing the merge) and converted train and test separately after
# the split, re-levelling test against train. Converting on the combined
# frame gives both halves one shared set of levels.
combi$Title <- factor(combi$Title)
combi$Sex <- factor(combi$Sex)
combi$Surname <- factor(combi$Surname)

# Rows with a missing Survived value are the test rows.
which(is.na(combi$Survived))

train <- combi[1:891, ]
test <- combi[892:1309, ]
which(is.na(test$Surname))
str(train)
str(test)

# Seed set immediately before the fit so the forest is reproducible.
set.seed(415)
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID2,
  data = train, importance = TRUE, ntree = 2000
)
varImpPlot(fit)

Prediction <- predict(fit, test)
# Header spelled like the source column, `PassengerId` (the history wrote
# `PassengerID` here).
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "firstforest.csv", row.names = FALSE)
# Conditional inference forest (party::cforest) and submission ----
#
# Uses `train` and `test` with the engineered Title, FamilySize and
# FamilyID columns. Writes secondforest.csv.

# Install `party` only when it is missing.
if (!requireNamespace("party", quietly = TRUE)) {
  install.packages("party")
}
library(party)

set.seed(415)
str(train$Survived)
fit <- cforest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID,
  data = train,
  controls = cforest_unbiased(ntree = 2000, mtry = 3)
)
Prediction <- predict(fit, test, OOB = TRUE, type = "response")
# Header spelled like the source column, `PassengerId` (the history wrote
# `PassengerID` here).
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "secondforest.csv", row.names = FALSE)