From 8d0e170ceb4ab45943de4e5211b6d6bedf3bb325 Mon Sep 17 00:00:00 2001 From: nitro381penta <151678037+nitro381penta@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:11:28 +0200 Subject: [PATCH] Lab Day 22: Ensembles Julia's solution --- your-code/main-julia.ipynb | 471 +++++++++++++++++++++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100644 your-code/main-julia.ipynb diff --git a/your-code/main-julia.ipynb b/your-code/main-julia.ipynb new file mode 100644 index 0000000..672bb61 --- /dev/null +++ b/your-code/main-julia.ipynb @@ -0,0 +1,471 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c9975797-fbde-4f96-8aac-85a4f950c421", + "metadata": {}, + "source": [ + "# Challenge 1\n", + "\n", + "The heart disease dataset is a classic dataset that contains various health metrics (age, sex, chest pain type, blood pressure, cholesterol, etc.) related to diagnosing heart disease (binary classification: presence or absence of heart disease)." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "00cf591d-8a5b-499e-8715-1ad140867934", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# Load the dataset (change the path if needed)\n", + "df = pd.read_csv('../data/heart.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0bb5ea1c-a4e5-4419-bae8-661fe2d82711", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
\n", + "
" + ], + "text/plain": [ + " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", + "0 63 1 3 145 233 1 0 150 0 2.3 0 \n", + "1 37 1 2 130 250 0 1 187 0 3.5 0 \n", + "2 41 0 1 130 204 0 0 172 0 1.4 2 \n", + "3 56 1 1 120 236 0 1 178 0 0.8 2 \n", + "4 57 0 0 120 354 0 1 163 1 0.6 2 \n", + "\n", + " ca thal target \n", + "0 0 1 1 \n", + "1 0 2 1 \n", + "2 0 2 1 \n", + "3 0 2 1 \n", + "4 0 2 1 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "870ebc45-d873-4c37-b1e4-ce2b0ebc08f2", + "metadata": {}, + "source": [ + "We are going to try to predict the presence of heart disease suing this features, starting with a classical baseline method and trying to improve on that result with a series of ensembled approaches." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "23ad7e40-87f3-4b93-bef9-a9ddb5881ddc", + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop(columns=\"target\")\n", + "y = df[\"target\"]\n", + "\n", + "# Train-test split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)\n", + "\n", + "# Feature scaling (for certain models, e.g., SVM or logistic regression, not always necessary for trees)\n", + "scaler = StandardScaler()\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "id": "c0153586-1242-43a0-bb61-78b1234434a6", + "metadata": {}, + "source": [ + "# Baseline model : decision Tree\n", + "\n", + "We'll train a decision tree as our baseline model and evaluate it using accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d39376f1-b4ca-44c0-8364-d11b9a7605f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Decision Tree - MSE: 0.21052631578947367, R2: 0.14305849189570097\n" + ] + } + ], + "source": [ + "# Create and Train a Decision Tree Classifier and print the train and test accuracy\n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import accuracy_score, mean_squared_error, r2_score\n", + "\n", + "\n", + "# Initialize Decision Tree model\n", + "tree_model = DecisionTreeClassifier(random_state=42)\n", + "\n", + "# Train the model\n", + "tree_model.fit(X_train, y_train)\n", + "\n", + "# Predict on test data\n", + "y_pred = tree_model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "mse_tree = mean_squared_error(y_test, y_pred)\n", + "r2_tree = r2_score(y_test, y_pred)\n", + "\n", + "print(f\"Decision Tree - MSE: {mse_tree}, R2: {r2_tree}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e9502aa2-06b0-4cf2-bc6b-cb91390ff1a8", + "metadata": {}, + "source": [ + "We can see that this model is overfitting. This is expected, decision trees, especially deep ones are notorious agressive at exploiting the data available. But that also makes them highly variant: a small change on the tree/data makes for potentially large changes in performance." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9c60160a-b179-4896-a026-4beab803bb4e", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the same code again a couple of times. \n", + "# You can see that the Train Accuracy is always 100% (overfitting) and the Test Accuracy is all over the place. 
\n", + "# This is undesirable: our method is not generalizing and has high variance" + ] + }, + { + "cell_type": "markdown", + "id": "6cfb71cb-fc65-4c49-a1d6-1abd9a1085c1", + "metadata": {}, + "source": [ + "# Bagging: reducing variance" + ] + }, + { + "cell_type": "markdown", + "id": "828f3606-bb0b-4567-8583-11c38ab02579", + "metadata": {}, + "source": [ + "Bagging improves models because it reduces variance by averaging the predictions of multiple models trained on different subsets of the training data. This averaging effect reduces the sensitivity of the overall model to any one dataset or model, making the final prediction more stable and less prone to overfitting.\n", + "\n", + "- High-variance models, like decision trees, tend to overfit the training data. This means that small changes in the training data can lead to large changes in the model’s predictions. For example, a decision tree trained on one subset of data might look completely different from a decision tree trained on another subset. This leads to high variance, where the model’s performance fluctuates a lot depending on the specific data it was trained on.\n", + "- Once all the individual models are trained, Bagging combines their predictions by averaging them (for regression) or using a majority vote (for classification). The key idea here is that the errors in each individual model are somewhat independent because they are trained on different bootstrap samples. Some models will make errors in one direction, while others might make errors in another. When you average these predictions, the errors cancel out, reducing the overall variability (variance) of the final model." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8fc76766-a90c-47ed-bd02-66827a1dc115", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bagging with Decision Tree - MSE: 0.21052631578947367, R2: 0.14305849189570097\n" + ] + } + ], + "source": [ + "# Create and Train a BaggingClassifier. \n", + "# Use as base estimator a weak decision tree (max_depth=1) and 100 estimators to really over a lot of different data samples\n", + "# Print the train and test accuracy\n", + "from sklearn.ensemble import BaggingClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "\n", + "# Initialize Decision Tree model\n", + "tree_model = DecisionTreeClassifier(random_state=42)\n", + "\n", + "# Apply Bagging using older versions of sklearn\n", + "bagging_model = BaggingClassifier(estimator=tree_model, n_estimators=10, random_state=42)\n", + "\n", + "# Train the Bagging model\n", + "bagging_model.fit(X_train, y_train)\n", + "\n", + "# Predict on test data\n", + "y_pred = bagging_model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "mse_bagging = mean_squared_error(y_test, y_pred)\n", + "r2_bagging = r2_score(y_test, y_pred)\n", + "\n", + "print(f\"Bagging with Decision Tree - MSE: {mse_bagging}, R2: {r2_bagging}\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "7efabafd-e27b-4a70-85e2-159170853f0b", + "metadata": {}, + "source": [ + "You can probably see a modest improvement in score, but most importantly, the overfitting is mostly gone. This is because averaging over multiple datasets stabilizes the high variance of the base model. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "9f892484-618a-46fe-8e56-0a18fa652ed8", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the same code again a couple of times. \n", + "# You can see that consistently the Train Accuracy is close to the Test Accuracy. " + ] + }, + { + "cell_type": "markdown", + "id": "9e99849f-20fe-4eac-be80-b43dd56ba374", + "metadata": {}, + "source": [ + "# Boosting: reducing bias\n", + "\n", + "Now we’ll apply AdaBoost with decision trees as weak learners. This will sequentially improve the model by focusing on difficult cases.\n", + "\n", + "Boosting reduces bias by sequentially training a series of weak learners (often simple models like decision trees) where each subsequent model focuses on the mistakes made by the previous models. The key idea behind boosting is to incrementally improve the model by correcting errors, which helps to reduce bias, especially when the initial model is too simple and underfits the data.\n", + "\n", + "- Boosting typically uses weak learners, which are models that perform only slightly better than random guessing. For example, in classification, a weak learner might be a shallow decision tree (a \"stump\") with just a few levels. Weak learners usually have high bias, meaning they are too simplistic and don't capture the underlying patterns in the data well. As a result, they underfit the data.\n", + "\n", + "- In each iteration, boosting trains a new model that tries to correct the errors made by the earlier models. If an instance was misclassified by the first weak learner, it will receive a higher weight, so the next model pays more attention to it. As the sequence of models progresses, the ensemble collectively focuses more on the difficult-to-predict instances. Over time, the combined models become better at fitting the data, as they successively reduce the bias (systematic error) by adjusting for earlier mistakes." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "4bba1773-b0b0-44ba-a838-58b8c466ff88", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AdaBoost with Decision Tree - MSE: 0.19736842105263158, R2: 0.19661733615221966, Accuracy: 0.8026315789473685\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py:527: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# Create and Train a AdaBoostClassifier. 
\n", + "# Use as base estimator a weak decision tree (max_depth=1) and 100 estimators to really target the specific behaviors of this phenomenon\n", + "# Print the train and test accuracy\n", + "\n", + "from sklearn.ensemble import AdaBoostClassifier\n", + "\n", + "# Train AdaBoost\n", + "tree_model = DecisionTreeClassifier(random_state=42)\n", + "\n", + "# Initialize AdaBoost using the base Decision Tree model\n", + "ada_boost_model = AdaBoostClassifier(estimator=tree_model, n_estimators=50, random_state=42)\n", + "\n", + "# Train the AdaBoost model\n", + "ada_boost_model.fit(X_train, y_train)\n", + "\n", + "# Predict on test data\n", + "y_pred = ada_boost_model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "mse_ada = mean_squared_error(y_test, y_pred)\n", + "r2_ada = r2_score(y_test, y_pred)\n", + "accuracy_ada = accuracy_score(y_test, y_pred)\n", + "\n", + "print(f\"AdaBoost with Decision Tree - MSE: {mse_ada}, R2: {r2_ada}, Accuracy: {accuracy_ada}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4427e8a3-478b-4260-b2ac-74aa80983e50", + "metadata": {}, + "source": [ + "You can probably see a good improvement in score, but overfitting rearing it's ugly head a gain (not as much as in the base model). This is because the iterative correction of adaboost really allows the model to focus on the specifics of this problem, at a cost of overexploiting the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4b5e21fe-0a8f-45f6-a2d3-74261941f9c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the same code again a couple of times. \n", + "# You can see that the test Accuracy will mostly be pretty good, even if some times it get's lower or higher scores (high variance, low bias)\n", + "# You can see also that consistently the Train Accuracy is higher than the Test Accuracy,indicating some (not extreme) overfitting " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}