Skip to content

Commit

Permalink
FIX import notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Aurelien Massiot committed Sep 19, 2023
1 parent 61a7b99 commit 88c2153
Show file tree
Hide file tree
Showing 3 changed files with 220 additions and 19 deletions.
Binary file modified models/preprocessor.pkl
Binary file not shown.
237 changes: 219 additions & 18 deletions notebook/titanic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"_cell_guid": "9ae4a31b-44ce-72b7-375b-1376bcc81142"
},
Expand All @@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"_cell_guid": "e117a178-539a-d880-ab8c-5306d6d671f0"
},
Expand All @@ -95,11 +95,37 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"_cell_guid": "166004fb-0092-7fb9-f890-1b764a7f6da9"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 891 entries, 0 to 890\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 PassengerId 891 non-null int64 \n",
" 1 Survived 891 non-null int64 \n",
" 2 Pclass 891 non-null int64 \n",
" 3 Name 891 non-null object \n",
" 4 Sex 891 non-null object \n",
" 5 Age 714 non-null float64\n",
" 6 SibSp 891 non-null int64 \n",
" 7 Parch 891 non-null int64 \n",
" 8 Ticket 891 non-null object \n",
" 9 Fare 891 non-null float64\n",
" 10 Cabin 204 non-null object \n",
" 11 Embarked 889 non-null object \n",
"dtypes: float64(2), int64(5), object(5)\n",
"memory usage: 83.7+ KB\n"
]
}
],
"source": [
"train.info()"
]
Expand Down Expand Up @@ -133,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -143,11 +169,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from indus.feature_engineering import Preprocessor"
"from feature_engineering import Preprocessor"
]
},
{
Expand All @@ -161,7 +187,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"_cell_guid": "d004f91d-1c6b-e281-6e4c-45b44eadbcca"
},
Expand All @@ -188,11 +214,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"_cell_guid": "09391302-b621-4730-7589-7eb017286e7f"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"55\n"
]
}
],
"source": [
"print(len(train_processed.columns))"
]
Expand Down Expand Up @@ -281,11 +315,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"_cell_guid": "5593980a-4145-9594-299c-f4d1a9f01970"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8373\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
Expand All @@ -312,11 +354,170 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {
"_cell_guid": "d77e221b-352d-8669-05d9-f7defce05709"
},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>variable</th>\n",
" <th>importance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Sex_male</td>\n",
" <td>0.113878</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>Name_Title_Mr.</td>\n",
" <td>0.111428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Sex_female</td>\n",
" <td>0.108093</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Fare</td>\n",
" <td>0.087661</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Name_Len</td>\n",
" <td>0.081891</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Age</td>\n",
" <td>0.077145</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Pclass_3</td>\n",
" <td>0.044393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>Name_Title_Mrs.</td>\n",
" <td>0.031899</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Ticket_Length</td>\n",
" <td>0.031384</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>Name_Title_Miss.</td>\n",
" <td>0.029865</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Cabin_Letter_n</td>\n",
" <td>0.027167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>Family_Size_Big</td>\n",
" <td>0.026689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>Family_Size_Nuclear</td>\n",
" <td>0.022733</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Pclass_1</td>\n",
" <td>0.018575</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Ticket_Category_1</td>\n",
" <td>0.018411</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Ticket_Category_3</td>\n",
" <td>0.013439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Pclass_2</td>\n",
" <td>0.012920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>Name_Title_Master.</td>\n",
" <td>0.012221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Ticket_Category_Low_ticket</td>\n",
" <td>0.011784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>Family_Size_Solo</td>\n",
" <td>0.011101</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" variable importance\n",
"12 Sex_male 0.113878\n",
"45 Name_Title_Mr. 0.111428\n",
"11 Sex_female 0.108093\n",
"1 Fare 0.087661\n",
"2 Name_Len 0.081891\n",
"0 Age 0.077145\n",
"10 Pclass_3 0.044393\n",
"46 Name_Title_Mrs. 0.031899\n",
"4 Ticket_Length 0.031384\n",
"42 Name_Title_Miss. 0.029865\n",
"33 Cabin_Letter_n 0.027167\n",
"51 Family_Size_Big 0.026689\n",
"52 Family_Size_Nuclear 0.022733\n",
"8 Pclass_1 0.018575\n",
"16 Ticket_Category_1 0.018411\n",
"18 Ticket_Category_3 0.013439\n",
"9 Pclass_2 0.012920\n",
"41 Name_Title_Master. 0.012221\n",
"21 Ticket_Category_Low_ticket 0.011784\n",
"53 Family_Size_Solo 0.011101"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat((pd.DataFrame(train_processed.iloc[:, 1:].columns, columns=['variable']), \n",
" pd.DataFrame(rf.feature_importances_, columns=['importance'])), \n",
Expand All @@ -334,7 +535,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {
"_cell_guid": "14dc0e66-9fc4-86bf-8927-46d366d4bbcf"
},
Expand All @@ -356,7 +557,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -365,7 +566,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -408,7 +609,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.0"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ uvicorn==0.22.0
notebook==6.5.4
jupyter-nbextensions-configurator==0.6.2

# Fix SAJO AUMA https://github.com/microsoft/azuredatastudio/issues/24443
# Fix SAJO AUMA
traitlets==5.9.0

0 comments on commit 88c2153

Please sign in to comment.