diff --git a/.gitignore b/.gitignore index e7506f3..6d95421 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,160 @@ -.idea/* -.vscode +# Byte-compiled / optimized / DLL files __pycache__/ -.ipynb_checkpoints/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ \ No newline at end of file diff --git a/README.md b/README.md index 3adf769..17397bc 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,7 @@ Pour suivre ce TP, nous allons utiliser les GitHub pages suivantes : [TP 3 Documenter avec Sphinx](https://octo-technology.github.io/Formation-MLOps-1/tp3#0) -[TP 4 Écrire un script de CI](https://octo-technology.github.io/Formation-MLOps-1/tp4#0) +[TP 4 Créer un package python](https://octo-technology.github.io/Formation-MLOps-1/tp4#0) -[TP 5 Créer un package python](https://octo-technology.github.io/Formation-MLOps-1/tp5#0) - -[TP 6 Créer une API, et la conteneuriser](https://octo-technology.github.io/Formation-MLOps-1/tp6#0) +[TP 5 Créer une API, et la conteneuriser](https://octo-technology.github.io/Formation-MLOps-1/tp5#0) diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..87c2788 --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,3 @@ +import os + +MODELS_PATH = os.path.dirname(__file__) \ No newline at end of file diff --git a/notebook/titanic.ipynb b/notebook/titanic.ipynb index e831c11..e210ea7 100644 --- a/notebook/titanic.ipynb +++ b/notebook/titanic.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "_cell_guid": "9ae4a31b-44ce-72b7-375b-1376bcc81142" }, @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "_cell_guid": "e117a178-539a-d880-ab8c-5306d6d671f0" }, @@ -95,11 +95,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "_cell_guid": "166004fb-0092-7fb9-f890-1b764a7f6da9" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 891 entries, 0 to 890\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 PassengerId 891 non-null int64 \n", + " 1 Survived 891 non-null int64 \n", + " 2 Pclass 891 non-null int64 \n", + " 3 Name 891 non-null object \n", + " 4 Sex 891 non-null object \n", + " 5 Age 714 non-null float64\n", + " 6 SibSp 891 non-null int64 \n", + " 7 Parch 891 non-null int64 \n", + " 8 Ticket 891 non-null object \n", + " 9 Fare 891 non-null float64\n", + " 10 Cabin 204 non-null object \n", + " 11 Embarked 889 non-null object \n", + "dtypes: float64(2), int64(5), object(5)\n", + "memory usage: 83.7+ KB\n" + ] + } + ], "source": [ "train.info()" ] @@ -115,11 +141,154 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "_cell_guid": "2fee872c-6233-57b1-d6a8-d017ef15edbd" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "train.head()" ] @@ -144,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" @@ -153,16 +322,16 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append(\"../src/\")" + "sys.path.append(\"../\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "from feature_engineering import *" + "from src.feature_engineering import *" ] }, { @@ -176,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "_cell_guid": "d004f91d-1c6b-e281-6e4c-45b44eadbcca" }, @@ -200,11 +369,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "_cell_guid": "09391302-b621-4730-7589-7eb017286e7f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "55\n" + ] + } + ], "source": [ "print(len(train_processed.columns))" ] @@ -302,11 +479,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "_cell_guid": "5593980a-4145-9594-299c-f4d1a9f01970" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8373\n" + ] + } + ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", @@ -333,11 +518,170 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "_cell_guid": "d77e221b-352d-8669-05d9-f7defce05709" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variableimportance
12Sex_male0.113878
45Name_Title_Mr.0.111428
11Sex_female0.108093
1Fare0.087661
2Name_Len0.081891
0Age0.077145
10Pclass_30.044393
46Name_Title_Mrs.0.031899
4Ticket_Length0.031384
42Name_Title_Miss.0.029865
33Cabin_Letter_n0.027167
51Family_Size_Big0.026689
52Family_Size_Nuclear0.022733
8Pclass_10.018575
16Ticket_Category_10.018411
18Ticket_Category_30.013439
9Pclass_20.012920
41Name_Title_Master.0.012221
21Ticket_Category_Low_ticket0.011784
53Family_Size_Solo0.011101
\n", + "
" + ], + "text/plain": [ + " variable importance\n", + "12 Sex_male 0.113878\n", + "45 Name_Title_Mr. 0.111428\n", + "11 Sex_female 0.108093\n", + "1 Fare 0.087661\n", + "2 Name_Len 0.081891\n", + "0 Age 0.077145\n", + "10 Pclass_3 0.044393\n", + "46 Name_Title_Mrs. 0.031899\n", + "4 Ticket_Length 0.031384\n", + "42 Name_Title_Miss. 0.029865\n", + "33 Cabin_Letter_n 0.027167\n", + "51 Family_Size_Big 0.026689\n", + "52 Family_Size_Nuclear 0.022733\n", + "8 Pclass_1 0.018575\n", + "16 Ticket_Category_1 0.018411\n", + "18 Ticket_Category_3 0.013439\n", + "9 Pclass_2 0.012920\n", + "41 Name_Title_Master. 0.012221\n", + "21 Ticket_Category_Low_ticket 0.011784\n", + "53 Family_Size_Solo 0.011101" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.concat((pd.DataFrame(train_processed.iloc[:, 1:].columns, columns=['variable']), \n", " pd.DataFrame(rf.feature_importances_, columns=['importance'])), \n", @@ -355,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "_cell_guid": "14dc0e66-9fc4-86bf-8927-46d366d4bbcf" }, @@ -371,9 +715,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## Conclusion" - ] + "source": [] }, { "cell_type": "markdown", @@ -383,15 +725,22 @@ "\n", "I welcome any comments and suggestions." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "_change_revision": 0, "_is_fork": false, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "PythonIndus", "language": "python", - "name": "python3" + "name": "pythonindus" }, "language_info": { "codemirror_mode": { @@ -403,9 +752,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.10.0" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/notebook/y_test_predictions.csv b/notebook/y_test_predictions.csv new file mode 100644 index 0000000..ffe908d --- /dev/null +++ b/notebook/y_test_predictions.csv @@ -0,0 +1,419 @@ +PassengerId,Survived +892,0 +893,0 +894,0 +895,0 +896,1 +897,0 +898,0 +899,0 +900,1 +901,0 +902,0 +903,0 +904,1 +905,0 +906,1 +907,1 +908,0 +909,0 +910,0 +911,1 +912,0 +913,1 +914,1 +915,0 +916,1 +917,0 +918,1 +919,0 +920,0 +921,0 +922,0 +923,0 +924,1 +925,0 +926,1 +927,1 +928,0 +929,0 +930,0 +931,1 +932,0 +933,0 +934,0 +935,1 +936,1 +937,0 +938,0 +939,0 +940,1 +941,1 +942,0 +943,0 +944,1 +945,1 +946,0 +947,0 +948,0 +949,0 +950,0 +951,1 +952,0 +953,0 +954,0 +955,1 +956,1 +957,1 +958,1 +959,0 +960,0 +961,1 +962,1 +963,0 +964,1 +965,1 +966,1 +967,0 +968,0 +969,1 +970,0 +971,1 +972,1 +973,0 +974,0 +975,0 +976,0 +977,0 +978,0 +979,0 +980,1 +981,1 +982,1 +983,0 +984,1 +985,0 +986,0 +987,0 +988,1 +989,0 +990,1 +991,0 +992,1 +993,0 +994,0 +995,0 +996,1 +997,0 +998,0 +999,0 +1000,0 +1001,0 +1002,0 +1003,1 +1004,1 +1005,1 +1006,1 +1007,0 +1008,0 +1009,1 +1010,0 +1011,1 +1012,1 +1013,0 +1014,1 +1015,0 +1016,0 +1017,1 +1018,0 +1019,1 +1020,0 +1021,0 +1022,0 +1023,0 +1024,0 +1025,0 +1026,0 +1027,0 +1028,0 +1029,0 +1030,1 +1031,0 +1032,0 +1033,1 +1034,0 +1035,0 +1036,1 +1037,0 +1038,0 +1039,0 +1040,0 +1041,0 +1042,1 +1043,0 +1044,0 +1045,1 +1046,0 +1047,0 +1048,1 +1049,1 +1050,0 +1051,1 +1052,1 +1053,1 +1054,1 +1055,0 +1056,0 +1057,1 +1058,0 +1059,0 +1060,1 +1061,0 +1062,0 +1063,0 +1064,0 +1065,0 +1066,0 +1067,1 +1068,1 +1069,0 +1070,1 +1071,1 +1072,0 +1073,1 +1074,1 +1075,0 +1076,1 +1077,0 +1078,1 +1079,0 +1080,0 +1081,0 +1082,0 +1083,0 +1084,1 +1085,0 +1086,1 +1087,0 +1088,1 +1089,1 +1090,0 +1091,0 +1092,1 +1093,1 +1094,1 +1095,1 +1096,0 +1097,0 +1098,1 +1099,0 +1100,1 +1101,0 +1102,0 +1103,0 +1104,0 +1105,1 +1106,0 +1107,0 +1108,1 +1109,0 +1110,1 +1111,0 +1112,1 +1113,0 +1114,1 +1115,0 +1116,1 +1117,1 +1118,0 +1119,1 +1120,0 +1121,0 +1122,0 +1123,1 +1124,0 +1125,0 +1126,0 +1127,0 +1128,0 +1129,0 +1130,1 +1131,1 +1132,1 +1133,1 +1134,1 +1135,0 +1136,1 +1137,1 +1138,1 +1139,0 +1140,1 +1141,1 +1142,1 +1143,0 +1144,1 +1145,0 +1146,0 +1147,0 +1148,0 +1149,0 +1150,1 +1151,0 +1152,0 +1153,0 +1154,1 +1155,1 +1156,0 +1157,0 +1158,0 +1159,0 +1160,0 +1161,0 +1162,0 +1163,0 +1164,1 +1165,1 +1166,0 +1167,1 +1168,0 +1169,0 +1170,0 +1171,0 +1172,0 +1173,1 +1174,1 +1175,1 +1176,1 +1177,0 +1178,0 +1179,0 +1180,0 +1181,0 +1182,0 +1183,0 +1184,0 +1185,0 +1186,0 +1187,0 +1188,1 +1189,0 +1190,0 +1191,0 +1192,0 +1193,0 +1194,0 +1195,0 +1196,1 +1197,1 +1198,0 +1199,1 +1200,0 +1201,0 +1202,0 +1203,0 +1204,0 +1205,0 +1206,1 +1207,1 +1208,0 +1209,0 +1210,0 +1211,0 +1212,0 +1213,0 +1214,0 +1215,0 +1216,1 +1217,0 +1218,1 +1219,0 +1220,0 +1221,0 +1222,1 +1223,0 +1224,0 +1225,1 +1226,0 +1227,0 +1228,0 +1229,0 +1230,0 +1231,1 +1232,0 +1233,0 +1234,0 +1235,1 +1236,1 +1237,1 +1238,0 +1239,1 +1240,0 +1241,1 +1242,1 +1243,0 +1244,0 +1245,0 +1246,1 +1247,0 +1248,1 +1249,0 +1250,0 +1251,1 +1252,0 +1253,1 +1254,1 +1255,0 +1256,1 +1257,0 +1258,0 +1259,0 +1260,1 +1261,0 +1262,0 +1263,1 +1264,0 +1265,0 +1266,1 +1267,1 +1268,0 +1269,0 +1270,0 +1271,0 +1272,0 +1273,0 +1274,1 +1275,1 +1276,0 +1277,1 +1278,0 +1279,0 +1280,0 +1281,0 +1282,0 +1283,1 +1284,1 +1285,0 +1286,0 +1287,1 +1288,0 +1289,1 +1290,0 +1291,0 +1292,1 +1293,0 +1294,1 +1295,0 +1296,1 +1297,0 +1298,0 +1299,0 +1300,1 +1301,1 +1302,1 +1303,1 +1304,0 +1305,0 +1306,1 +1307,0 +1308,0 +1309,1 diff --git a/requirements.txt b/requirements.txt index 3d1fbf7..246ec81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ jupyter==1.0.0 -jupyter_contrib_nbextensions==0.5.1 +notebook==7.0.3 nbconvert==7.7.2 numpy==1.23.3 matplotlib==3.7.1 @@ -8,7 +8,3 @@ scikit-learn==1.1.3 seaborn==0.12.0 fastapi==0.95.2 uvicorn==0.22.0 - -# Temp fix due to broken new version of jupyter-nbextensions-configurator 0.6.3 incompatible with notebook 7.0.0 -notebook==6.5.4 -jupyter-nbextensions-configurator==0.6.2 \ No newline at end of file diff --git a/src/__pycache__/__init__.cpython-37.pyc b/src/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 9c66a9d..0000000 Binary files a/src/__pycache__/__init__.cpython-37.pyc and /dev/null differ diff --git a/src/__pycache__/feature_engineering.cpython-37.pyc b/src/__pycache__/feature_engineering.cpython-37.pyc deleted file mode 100644 index cdd8d91..0000000 Binary files a/src/__pycache__/feature_engineering.cpython-37.pyc and /dev/null differ diff --git a/tests/__pycache__/__init__.cpython-37.pyc b/tests/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 75b7061..0000000 Binary files a/tests/__pycache__/__init__.cpython-37.pyc and /dev/null differ diff --git a/tests/__pycache__/test_feature_engineering.cpython-37-pytest-6.2.5.pyc b/tests/__pycache__/test_feature_engineering.cpython-37-pytest-6.2.5.pyc deleted file mode 100644 index 4ed207f..0000000 Binary files a/tests/__pycache__/test_feature_engineering.cpython-37-pytest-6.2.5.pyc and /dev/null differ