-
Notifications
You must be signed in to change notification settings - Fork 0
/
mlfuncs.py
260 lines (230 loc) · 10.8 KB
/
mlfuncs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# import base libraries and dependencies
from re import UNICODE
from numpy.lib import utils
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import funcs
import plots
import utils
# import machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import sklearn.externals
import joblib
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
# Fixed seed passed as random_state to the train/test split and to the
# estimators in this module.
SEED = 42
def pre_process_split_data(df):
    """Remove price outliers and split the data into train and test sets.

    Rows whose ``price`` is more than three standard deviations away from
    the mean price are discarded. ``price`` is used as the target; ``price``
    and ``year`` are both removed from the feature matrix.

    Args:
        df: pandas DataFrame with at least ``price`` and ``year`` columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` produced by a split
        with ``test_size=0.3`` and ``random_state=SEED``.

    Raises:
        utils.InvalidDataFrame: if ``df`` is not a pandas DataFrame.
    """
    # reject anything that is not a DataFrame before touching its columns
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)

    # keep only the rows within 3 standard deviations of the mean price
    prices = df['price']
    within_bounds = np.abs(prices - prices.mean()) <= (3 * prices.std())
    trimmed = df[within_bounds].reset_index(drop=True)

    # features exclude the target and the year column
    features = trimmed.drop(['price', 'year'], axis=1)
    target = trimmed['price']

    return tuple(
        train_test_split(features, target, test_size=0.3, random_state=SEED)
    )
def extract_cat_num(X_train):
    """Split the columns of ``X_train`` into categorical and numerical names.

    A column is treated as categorical when its dtype is ``object`` or a
    string dtype, and as numerical when it has any integer or floating
    dtype (not only ``int64``/``float64``/``int32``, so ``float32``,
    ``int16`` and similar columns are no longer silently skipped).
    Columns of any other dtype (for example bool or datetime) are placed in
    neither list.

    Args:
        X_train: pandas DataFrame of features.

    Returns:
        Tuple ``(cat_feats, num_feats)`` of column-name lists, each in the
        column order of ``X_train``.

    Raises:
        utils.InvalidDataFrame: if ``X_train`` is not a pandas DataFrame.
    """
    if not isinstance(X_train, pd.DataFrame):
        raise utils.InvalidDataFrame(X_train)

    dtype_checks = pd.api.types
    cat_feats = []
    num_feats = []
    # iterate over (column, dtype) pairs so the column order is preserved
    for col, dtype in X_train.dtypes.items():
        if dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
            cat_feats.append(col)
        elif dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype):
            # integer/float checks exclude bool, matching the original intent
            num_feats.append(col)
    return cat_feats, num_feats
def preprocess_col_transformer(cat_feats, num_feats):
    """Build the column transformer used in front of every regressor.

    Args:
        cat_feats: names of the categorical columns.
        num_feats: names of the numerical columns.

    Returns:
        A column transformer that applies constant-fill imputation followed
        by one-hot encoding to ``cat_feats``, mean imputation followed by
        standard scaling to ``num_feats``, and passes every other column
        through unchanged.
    """
    # categorical branch: fill gaps with the label 'missing', then one-hot encode
    categorical_steps = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(handle_unknown='ignore'),
    )

    # numerical branch: replace NaN with the column mean, then standardise
    numerical_steps = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy='mean'),
        StandardScaler(),
    )

    return make_column_transformer(
        (categorical_steps, cat_feats),
        (numerical_steps, num_feats),
        remainder='passthrough',
    )
def run_multi_models(df):
    """Fit several baseline regressors and return the name of the best one.

    The data is split and preprocessed with the helpers in this module,
    each regressor is fitted inside its own pipeline, and its R squared on
    the test split is recorded. The scores are printed, plotted with
    ``plots.bar_plot`` and the name of the top scorer is returned.

    Args:
        df: pandas DataFrame accepted by ``pre_process_split_data``.

    Returns:
        Name (str) of the regressor with the highest test R squared.
    """
    # extract train and test data
    X_train, X_test, y_train, y_test = pre_process_split_data(df)
    # extract cat and num features
    cat_feats, num_feats = extract_cat_num(X_train)
    # instantiate preprocessor
    preprocessor = preprocess_col_transformer(cat_feats, num_feats)

    # Every estimator is seeded so the comparison is reproducible;
    # SGDRegressor shuffles its data and was previously left unseeded.
    # LinearRegression is currently left out of the comparison.
    regressors = [
        ('SGDRegressor', SGDRegressor(random_state=SEED)),
        ('Ridge', Ridge(random_state=SEED)),
        ('Lasso', Lasso(max_iter=5000, random_state=SEED)),
        ('ElasticNet', ElasticNet(random_state=SEED)),
        ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=SEED)),
        ('RandomForestRegressor', RandomForestRegressor(random_state=SEED)),
    ]

    # regressor name -> R squared on the test split
    acc_scores = {}
    for reg_name, reg in regressors:
        print("Creating pipeline for {}.".format(reg_name))
        pipe = make_pipeline(preprocessor, reg)

        print("Fitting training data to pipeline for {}.".format(reg_name))
        pipe.fit(X_train, y_train)

        print("Predicting test values for {}.".format(reg_name))
        y_pred = pipe.predict(X_test)

        # the "accuracy score" reported here is the R squared of the predictions
        print("Calculating accuracy score for {}.".format(reg_name))
        reg_acc_scr = r2_score(y_test, y_pred)
        print("Accuracy Score for {}: {}".format(reg_name, reg_acc_scr))
        acc_scores[reg_name] = reg_acc_scr

    # tabulate the scores and pick the first row holding the maximum
    df_scores = pd.DataFrame(list(acc_scores.items()), columns=['Regressor', 'R Squared'])
    best_row = df_scores.loc[df_scores['R Squared'].idxmax()]
    best_regressor = best_row['Regressor']

    print("####################################################################################")
    print("{} model yielded the highest R Squared of: {:.2f}".format(best_regressor, best_row['R Squared']))

    # plot scores
    plots.bar_plot(df_scores, 'Regressor', 'R Squared')
    return best_regressor
def regressor_hyperparameters(best_regressor):
    """Return the parameter grid and a fresh estimator for a regressor name.

    Grid keys are prefixed with the lower-cased class name followed by a
    double underscore, which is how the rest of this module addresses the
    regressor step of a pipeline built with ``make_pipeline``.

    Args:
        best_regressor: one of 'SGDRegressor', 'Ridge', 'Lasso',
            'ElasticNet', 'DecisionTreeRegressor', 'RandomForestRegressor'.

    Returns:
        Tuple ``(grid_params, grid_model)``: the parameter grid dict and an
        unfitted estimator seeded with ``SEED``.

    Raises:
        ValueError: if ``best_regressor`` is not one of the supported names.
    """
    alphas = [0.0001, 0.001, 0.01, 1, 5, 10, 100, 1000]
    iterations = [1000, 5000, 10000]
    tree_sizes = [2, 4, 8, 10, 12, 16, 20]

    # regressor name -> (estimator, parameter grid)
    params_dict = {
        'SGDRegressor': (
            SGDRegressor(random_state=SEED),
            {
                'sgdregressor__penalty': ['l2'],
                'sgdregressor__alpha': list(alphas),
                'sgdregressor__max_iter': list(iterations),
            },
        ),
        'Ridge': (
            Ridge(random_state=SEED),
            {'ridge__alpha': list(alphas)},
        ),
        'Lasso': (
            Lasso(random_state=SEED),
            {
                # was misspelled 'lasso_aplha', which is not a valid
                # pipeline parameter and made the grid search fail
                'lasso__alpha': list(alphas),
                'lasso__max_iter': list(iterations),
            },
        ),
        'ElasticNet': (
            ElasticNet(random_state=SEED),
            {'elasticnet__alpha': list(alphas)},
        ),
        'DecisionTreeRegressor': (
            DecisionTreeRegressor(random_state=SEED),
            {
                'decisiontreeregressor__max_depth': list(tree_sizes),
                'decisiontreeregressor__min_samples_leaf': list(tree_sizes),
            },
        ),
        'RandomForestRegressor': (
            RandomForestRegressor(random_state=SEED),
            {
                'randomforestregressor__max_depth': [2, 15, 22],
                'randomforestregressor__min_samples_leaf': [1, 2, 4],
                'randomforestregressor__min_samples_split': [2, 4, 6],
                # min_weight_fraction_leaf, max_features and max_leaf_nodes
                # grids are currently disabled
            },
        ),
    }

    try:
        grid_model, grid_params = params_dict[best_regressor]
    except (KeyError, TypeError):
        # fail with a clear message instead of an UnboundLocalError on return
        raise ValueError(
            "{} is not among list of regressors.".format(best_regressor)
        ) from None
    return grid_params, grid_model
def best_regressor_hyperparameter(df, best_regressor):
    """Tune the chosen regressor with a grid search and report its features.

    The data is split and preprocessed with the helpers in this module, a
    10-fold ``GridSearchCV`` scored on R squared is run over the grid from
    ``regressor_hyperparameters``, and the fitted model's coefficients (or
    feature importances for models without coefficients) are tabulated.

    Args:
        df: pandas DataFrame accepted by ``pre_process_split_data``.
        best_regressor: regressor name accepted by
            ``regressor_hyperparameters``.

    Returns:
        Tuple ``(GridSearchCV_model_output, plot)`` where the first item is
        a dict with the keys 'best_regressor_grid_object', 'best_params',
        'best_regressor_feature_importances' and, for models exposing
        ``coef_``, 'best_regressor_intercept'; the second item is whatever
        ``plots.bar_plot`` returns.
    """
    print("Hyperparameter Tuning for the Best Regressor: {}".format(best_regressor))
    # only the training split is needed for the cross-validated search
    X_train, _X_test, y_train, _y_test = pre_process_split_data(df)
    cat_feats, num_feats = extract_cat_num(X_train)
    preprocessor = preprocess_col_transformer(cat_feats, num_feats)
    grid_params, grid_model = regressor_hyperparameters(best_regressor)

    print("Creating pipeline for {}.".format(best_regressor))
    pipe = make_pipeline(preprocessor, grid_model)

    grid_search = GridSearchCV(
        pipe,
        param_grid=grid_params,
        cv=10,
        scoring='r2',
        refit=True,
        n_jobs=4,
        return_train_score=True,
    )
    grid_search.fit(X_train, y_train)

    best_pipeline = grid_search.best_estimator_
    GridSearchCV_model_output = {'best_regressor_grid_object': best_pipeline}

    grid_best_params = grid_search.best_params_
    print("Best parameters after GridSearchCV:\n {}".format(grid_best_params))
    GridSearchCV_model_output['best_params'] = grid_best_params

    grid_best_score = grid_search.best_score_
    print("Best score after GridSearchCV:\n {:.2f}".format(grid_best_score))

    # names of the one-hot encoded columns followed by the numerical columns
    if cat_feats:
        encoder = (
            best_pipeline.named_steps['columntransformer']
            .named_transformers_['pipeline-1']
            .named_steps['onehotencoder']
        )
        if hasattr(encoder, 'get_feature_names_out'):
            cat_feature_names = list(encoder.get_feature_names_out(cat_feats))
        else:
            # older scikit-learn releases only provide get_feature_names
            cat_feature_names = list(encoder.get_feature_names(input_features=cat_feats))
    else:
        # with no categorical columns there is no fitted encoder to query
        cat_feature_names = []
    all_feature_names = cat_feature_names + list(num_feats)

    # the regressor is the last step of the pipeline
    fitted_model = best_pipeline.steps[-1][1]
    if hasattr(fitted_model, 'coef_'):
        # models with coefficients: report them and keep the intercept
        weights = np.ravel(fitted_model.coef_).tolist()
        GridSearchCV_model_output['best_regressor_intercept'] = fitted_model.intercept_
    else:
        # models without coefficients (the tree-based ones here) expose
        # feature importances instead
        weights = list(fitted_model.feature_importances_)

    # create a dataframe of feature names and coefficients, largest first
    coef_dict = dict(zip(all_feature_names, weights))
    df_coef = pd.DataFrame(list(coef_dict.items()), columns=['Feature', 'Coefficient'])
    df_coef = df_coef.sort_values(by=['Coefficient'], ascending=False)
    df_coef = df_coef.reset_index(drop=True)
    GridSearchCV_model_output['best_regressor_feature_importances'] = df_coef

    return GridSearchCV_model_output, plots.bar_plot(df_coef, 'Feature', 'Coefficient')
def dump_estimator(GridSearchCV_model_output):
    """Persist the tuned estimator to 'models/best_regressor_dump.pkl'.

    Args:
        GridSearchCV_model_output: either the ``(output_dict, plot)`` tuple
            returned by ``best_regressor_hyperparameter`` or the output
            dict itself; the dict must hold the fitted estimator under the
            key 'best_regressor_grid_object'.

    Returns:
        The value returned by ``joblib.dump`` (per the joblib documentation,
        the list of file names the data was written to).
    """
    if isinstance(GridSearchCV_model_output, dict):
        model_output = GridSearchCV_model_output
    else:
        # the tuple form carries the output dict as its first element
        model_output = list(GridSearchCV_model_output)[0]

    best_estimator = model_output['best_regressor_grid_object']

    # joblib.dump does not create missing directories
    os.makedirs('models', exist_ok=True)
    model = joblib.dump(best_estimator, 'models/best_regressor_dump.pkl')
    return model