From 0e9e2670002facc9fc4975444d97e038164e01db Mon Sep 17 00:00:00 2001 From: De Busschere Yanis Date: Mon, 5 Dec 2022 08:47:02 +0100 Subject: [PATCH 1/6] Add BERT experiments --- experiments/cs433/README.md | 16 + .../cs433/en-spam-classifier/README.md | 13 + .../en-spam-classifier/training_eng_sc.ipynb | 5616 +++++++++++++++++ .../cs433/multi-spam-classifier/README.md | 13 + .../traing_multi_sc.ipynb | 4252 +++++++++++++ 5 files changed, 9910 insertions(+) create mode 100644 experiments/cs433/README.md create mode 100644 experiments/cs433/en-spam-classifier/README.md create mode 100644 experiments/cs433/en-spam-classifier/training_eng_sc.ipynb create mode 100644 experiments/cs433/multi-spam-classifier/README.md create mode 100644 experiments/cs433/multi-spam-classifier/traing_multi_sc.ipynb diff --git a/experiments/cs433/README.md b/experiments/cs433/README.md new file mode 100644 index 0000000..5047347 --- /dev/null +++ b/experiments/cs433/README.md @@ -0,0 +1,16 @@ +# Experiments + +Luka Secilmis, Thomas Ecabert, Yanis De Busschere + +## Abstract + +This folder contains all the notebooks used for experimenting with the differents models during the project. + +## Summary of experiments + +| Experiment | Folder | Training Time | Prediction Time | Accuracy | F1-Score | +|---------------------|----------------------------------------------------|---------------|-----------------|----------|----------| +| BERT (english only) | [./en-spam-classifier](./en-spam-classifier) | 1h02 | 0.005s | 98.759 | 98.600 | +| BERT (multilingual) | [./multi-spam-classifier](./multi-spam-classifier) | 1h09 | 0.005 | 98.814 | 98.779 | + +All computation and time measurement were made using an NVIDIA RTX A5000. 
diff --git a/experiments/cs433/en-spam-classifier/README.md b/experiments/cs433/en-spam-classifier/README.md new file mode 100644 index 0000000..8c1d980 --- /dev/null +++ b/experiments/cs433/en-spam-classifier/README.md @@ -0,0 +1,13 @@ +### BERT (English only) + +## Abstract + +We developed an NLP-based spam classifier through a transfer learning approach, by fine-tuning a pre-trained English DistilBERT model on the Zenodo dataset for text classification. + +## Results + +| Training Time | Prediction Time | Accuracy | F1-Score | +|---------------|-----------------|----------|----------| +| 1h02 | 0.005s | 98.759 | 98.600 | + +All computation and time measurement were made using an NVIDIA RTX A5000. diff --git a/experiments/cs433/en-spam-classifier/training_eng_sc.ipynb b/experiments/cs433/en-spam-classifier/training_eng_sc.ipynb new file mode 100644 index 0000000..3e8e90f --- /dev/null +++ b/experiments/cs433/en-spam-classifier/training_eng_sc.ipynb @@ -0,0 +1,5616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "3ILVeQZ5Gfew" + }, + "source": [ + "### EPFL CS-433 - Machine Learning Project 2\n", + "#### CERN - Zenodo: Adaptable Spam Filter Modelling for Digital Scientific Research Repository \n", + "Training a DistilBERT model for the task of english spam detection.\n", + "\n", + "Authors: Luka Secilmis, Yanis De Busschere, Thomas Ecabert" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "731Nc_OQF3bl" + }, + "outputs": [], + "source": [ + "# Install required packages\n", + "!pip install transformers\n", + "!pip install pandas\n", + "!pip install numpy\n", + "!pip install datasets\n", + "!pip install sklearn\n", + "!pip install torch" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "4gGUsIXLF_TI" + }, + "outputs": [], + "source": [ + "# Import required packages\n", + "import transformers\n", + "import pandas as pd\n", + "import numpy as np\n", + "import 
datasets\n", + "from sklearn.model_selection import train_test_split\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5xpevkPtGCoZ", + "outputId": "9d35a50a-b3ec-46a2-d2b7-fa08ab5c9bd2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPU: NVIDIA RTX A5000\n" + ] + } + ], + "source": [ + "# Set Hardware accelerator to GPU in Edit: Notebook Settings (in Google Colab)\n", + "# Check if GPU is available, this will significantly speed up fine-tuning\n", + "if torch.cuda.is_available(): \n", + " device = torch.device(\"cuda\") \n", + " print('GPU:', torch.cuda.get_device_name(0))\n", + "else:\n", + " print('No GPU available, do not train')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dBlaE9efGbfW" + }, + "source": [ + "## Import pre-processed data\n", + "Run script *feat-eng-esc.py* to generate the pre-processed data." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "4z1ZyD0uGbyJ" + }, + "outputs": [], + "source": [ + "# Load the processed dataset\n", + "df = pd.read_csv('dataset-esc.csv')\n", + "df = df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WobbJ86FQMVP", + "outputId": "dcf6efbd-cae4-4e3c-e20d-6685e37a5fbc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(83778, 2)\n" + ] + } + ], + "source": [ + "print(df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "nHN7dKsdQT7s", + "outputId": "ce8aafbb-34cc-4922-861b-c755f358cebd" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " description label\n", + "0 FonePaw iPhone Data Recovery features in recov... 
1\n", + "1 FonePaw iOS Transfer is mainly designed to tra... 1\n", + "2 This is my first upload 1\n", + "3 Lost photos from iPhone can be recovered with ... 1\n", + "4 I can’t play WLMP file directly, what player d... 1" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descriptionlabel
0FonePaw iPhone Data Recovery features in recov...1
1FonePaw iOS Transfer is mainly designed to tra...1
2This is my first upload1
3Lost photos from iPhone can be recovered with ...1
4I can’t play WLMP file directly, what player d...1
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YZPyhMFbQXiq", + "outputId": "a4568b61-28b6-4612-a717-09afbe632205" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 55847\n", + "1 27931\n", + "Name: label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "df['label'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eN38xG9dQZhz" + }, + "source": [ + "## Training Set-up" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "K1SyRzDPQcld" + }, + "outputs": [], + "source": [ + "# Split data into train and test sets\n", + "# Note: stratify on 'label' to preserve the same proportions of labels in each set as observed in the original dataset\n", + "train, test = train_test_split(df, test_size=0.2, stratify=df[['label']], random_state=42)\n", + "test_en = test.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "1f37d39909ba411c84ba4d587502808f", + "83985d283ba747ac813204d89b39fc19", + "d83687033a1f4c549a258eedb8fbc13f", + "c5506388a6c841d0b8e0b8959444b1bd", + "42e80e5f55d849adae1694dd26c48596", + "f25e47228131400aa23b1b9d61ab1a8f", + "9599953932cd4eebad2e7b8aa3a10ebf", + "f9011bab69da4abc8116463fc937d667", + "a4d860f540d547f28c124e47b540ef32", + "9fec8a0b4b3e4697b83ce2d63795c30f", + "5af0450d396048edbdefa772a927b0d3", + "8bc429f2cb80478792c3b9296067f09a", + "4204b749b4d844ca92254e693be1ea4c", + "370954df41134e03a9ede5aa55141b6e", + "b08c290cffaf466cb53baf9b7bbf11ca", + "deddcc7ebc6b4250b2b1e28cfed07b6f", + "384ddeb98b9c4775bc6b9344b60b9554", + "c7630c218de0423da30c3a59936b484f", + 
"d4b63a54f77c4af79ccd3b5a8460352e", + "2b63341f028a4cf4b4efee73d77f8f61", + "2eae3ad820254e589596f34191eaa992", + "198299a2a67b46c8a1e9e9c77e930207", + "53d774b07f434041a9beb99e594760d4", + "95073dd374104a12ba1bf0bb7ef9d9d5", + "c47ed49a6f644018af40bb2a5f2ced9e", + "79c3275a7abf473db6ce9d9d8780bc71", + "7cad65e1d5154139b7775dc57dffa4e1", + "bded28391bd9442da0ebbb9461002046", + "3c47d4afdb89459aaeadd6e93d337dac", + "b80b8b33903f48d5a5276d648cf529f5", + "9f1b87dece70453787bd40fc7902945b", + "e611e960e4c842eb801d723d3735bcba", + "5b6f4eab91fb41129900f797262255a3", + "2b45c435aacf41cf98833abcfcc80989", + "59d498db7b914a5781d8f256514438e3", + "8c086ffc64ac42f88fbc20265c85f036", + "189ff3fbd931458bb1038d5f96b8ff17", + "5fabbacbefc7495892cf71289b7bea54", + "e72af544a87f433b961dc235bb53623a", + "832d878db7ad4b1f9f9162126b9d9974", + "2e800df0d89242b09f66a00be949a7e7", + "042be1e691574df3ac3f3d06b14f8170", + "45f189446ee84bc5b8cf691da54dbb01", + "e70aba335af14f59ac44e051a0a5a51b", + "7960a7016bf84079aa86437e41cb42f0", + "db75e31c410c4aa186839c44d7eab912", + "6f34f1bdb26e4cd1ab6099e92f70de80", + "64f7945ea3c54998b41ec032ab9b3528" + ] + }, + "id": "_UdZwkQ1QdgD", + "outputId": "b10abe35-6ba0-43d1-bcbb-72105b568915" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7960a7016bf84079aa86437e41cb42f0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/29.0 [00:00\n", + " \n", + " \n", + " [25134/25134 1:02:22, Epoch 3/3]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
5000.140700
10000.095800
15000.098900
20000.073800
25000.054100
30000.076100
35000.081300
40000.081500
45000.074500
50000.075700
55000.074200
60000.084900
65000.080400
70000.075000
75000.056700
80000.061200
85000.075700
90000.061200
95000.047900
100000.055100
105000.057500
110000.047500
115000.056500
120000.055900
125000.052100
130000.060200
135000.042800
140000.046900
145000.033600
150000.049600
155000.036200
160000.049500
165000.037800
170000.040900
175000.033900
180000.036600
185000.028000
190000.024500
195000.032300
200000.031600
205000.027300
210000.022200
215000.022400
220000.030700
225000.023000
230000.021400
235000.027500
240000.021600
245000.035700
250000.027800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving model checkpoint to test_trainer/checkpoint-500\n", + "Configuration saved in test_trainer/checkpoint-500/config.json\n", + "Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin\n", + "Saving model checkpoint to test_trainer/checkpoint-1000\n", + "Configuration saved in test_trainer/checkpoint-1000/config.json\n", + "Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin\n", + "Saving model checkpoint to test_trainer/checkpoint-1500\n", + "Configuration saved in test_trainer/checkpoint-1500/config.json\n", + "Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin\n", + "Saving model checkpoint to test_trainer/checkpoint-2000\n", + "Configuration saved in test_trainer/checkpoint-2000/config.json\n", + "Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-2500\n", + "Configuration saved in test_trainer/checkpoint-2500/config.json\n", + "Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-1000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-3000\n", + "Configuration saved in test_trainer/checkpoint-3000/config.json\n", + "Model weights saved in test_trainer/checkpoint-3000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-1500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-3500\n", + "Configuration saved in test_trainer/checkpoint-3500/config.json\n", + "Model weights saved in test_trainer/checkpoint-3500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-2000] due to 
args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-4000\n", + "Configuration saved in test_trainer/checkpoint-4000/config.json\n", + "Model weights saved in test_trainer/checkpoint-4000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-2500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-4500\n", + "Configuration saved in test_trainer/checkpoint-4500/config.json\n", + "Model weights saved in test_trainer/checkpoint-4500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-3000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-5000\n", + "Configuration saved in test_trainer/checkpoint-5000/config.json\n", + "Model weights saved in test_trainer/checkpoint-5000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-3500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-5500\n", + "Configuration saved in test_trainer/checkpoint-5500/config.json\n", + "Model weights saved in test_trainer/checkpoint-5500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-4000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-6000\n", + "Configuration saved in test_trainer/checkpoint-6000/config.json\n", + "Model weights saved in test_trainer/checkpoint-6000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-4500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-6500\n", + "Configuration saved in test_trainer/checkpoint-6500/config.json\n", + "Model weights saved in test_trainer/checkpoint-6500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-5000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-7000\n", + "Configuration saved in test_trainer/checkpoint-7000/config.json\n", + "Model 
weights saved in test_trainer/checkpoint-7000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-5500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-7500\n", + "Configuration saved in test_trainer/checkpoint-7500/config.json\n", + "Model weights saved in test_trainer/checkpoint-7500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-6000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-8000\n", + "Configuration saved in test_trainer/checkpoint-8000/config.json\n", + "Model weights saved in test_trainer/checkpoint-8000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-6500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-8500\n", + "Configuration saved in test_trainer/checkpoint-8500/config.json\n", + "Model weights saved in test_trainer/checkpoint-8500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-7000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-9000\n", + "Configuration saved in test_trainer/checkpoint-9000/config.json\n", + "Model weights saved in test_trainer/checkpoint-9000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-7500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-9500\n", + "Configuration saved in test_trainer/checkpoint-9500/config.json\n", + "Model weights saved in test_trainer/checkpoint-9500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-8000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-10000\n", + "Configuration saved in test_trainer/checkpoint-10000/config.json\n", + "Model weights saved in test_trainer/checkpoint-10000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-8500] due to args.save_total_limit\n", + "Saving 
model checkpoint to test_trainer/checkpoint-10500\n", + "Configuration saved in test_trainer/checkpoint-10500/config.json\n", + "Model weights saved in test_trainer/checkpoint-10500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-9000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-11000\n", + "Configuration saved in test_trainer/checkpoint-11000/config.json\n", + "Model weights saved in test_trainer/checkpoint-11000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-9500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-11500\n", + "Configuration saved in test_trainer/checkpoint-11500/config.json\n", + "Model weights saved in test_trainer/checkpoint-11500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-10000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-12000\n", + "Configuration saved in test_trainer/checkpoint-12000/config.json\n", + "Model weights saved in test_trainer/checkpoint-12000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-10500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-12500\n", + "Configuration saved in test_trainer/checkpoint-12500/config.json\n", + "Model weights saved in test_trainer/checkpoint-12500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-11000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-13000\n", + "Configuration saved in test_trainer/checkpoint-13000/config.json\n", + "Model weights saved in test_trainer/checkpoint-13000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-11500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-13500\n", + "Configuration saved in test_trainer/checkpoint-13500/config.json\n", + "Model weights saved in 
test_trainer/checkpoint-13500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-12000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-14000\n", + "Configuration saved in test_trainer/checkpoint-14000/config.json\n", + "Model weights saved in test_trainer/checkpoint-14000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-12500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-14500\n", + "Configuration saved in test_trainer/checkpoint-14500/config.json\n", + "Model weights saved in test_trainer/checkpoint-14500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-13000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-15000\n", + "Configuration saved in test_trainer/checkpoint-15000/config.json\n", + "Model weights saved in test_trainer/checkpoint-15000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-13500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-15500\n", + "Configuration saved in test_trainer/checkpoint-15500/config.json\n", + "Model weights saved in test_trainer/checkpoint-15500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-14000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-16000\n", + "Configuration saved in test_trainer/checkpoint-16000/config.json\n", + "Model weights saved in test_trainer/checkpoint-16000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-14500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-16500\n", + "Configuration saved in test_trainer/checkpoint-16500/config.json\n", + "Model weights saved in test_trainer/checkpoint-16500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-15000] due to args.save_total_limit\n", + 
"Saving model checkpoint to test_trainer/checkpoint-17000\n", + "Configuration saved in test_trainer/checkpoint-17000/config.json\n", + "Model weights saved in test_trainer/checkpoint-17000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-15500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-17500\n", + "Configuration saved in test_trainer/checkpoint-17500/config.json\n", + "Model weights saved in test_trainer/checkpoint-17500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-16000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-18000\n", + "Configuration saved in test_trainer/checkpoint-18000/config.json\n", + "Model weights saved in test_trainer/checkpoint-18000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-16500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-18500\n", + "Configuration saved in test_trainer/checkpoint-18500/config.json\n", + "Model weights saved in test_trainer/checkpoint-18500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-17000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-19000\n", + "Configuration saved in test_trainer/checkpoint-19000/config.json\n", + "Model weights saved in test_trainer/checkpoint-19000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-17500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-19500\n", + "Configuration saved in test_trainer/checkpoint-19500/config.json\n", + "Model weights saved in test_trainer/checkpoint-19500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-18000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-20000\n", + "Configuration saved in test_trainer/checkpoint-20000/config.json\n", + "Model weights 
saved in test_trainer/checkpoint-20000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-18500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-20500\n", + "Configuration saved in test_trainer/checkpoint-20500/config.json\n", + "Model weights saved in test_trainer/checkpoint-20500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-19000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-21000\n", + "Configuration saved in test_trainer/checkpoint-21000/config.json\n", + "Model weights saved in test_trainer/checkpoint-21000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-19500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-21500\n", + "Configuration saved in test_trainer/checkpoint-21500/config.json\n", + "Model weights saved in test_trainer/checkpoint-21500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-20000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-22000\n", + "Configuration saved in test_trainer/checkpoint-22000/config.json\n", + "Model weights saved in test_trainer/checkpoint-22000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-20500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-22500\n", + "Configuration saved in test_trainer/checkpoint-22500/config.json\n", + "Model weights saved in test_trainer/checkpoint-22500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-21000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-23000\n", + "Configuration saved in test_trainer/checkpoint-23000/config.json\n", + "Model weights saved in test_trainer/checkpoint-23000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-21500] due to 
args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-23500\n", + "Configuration saved in test_trainer/checkpoint-23500/config.json\n", + "Model weights saved in test_trainer/checkpoint-23500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-22000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-24000\n", + "Configuration saved in test_trainer/checkpoint-24000/config.json\n", + "Model weights saved in test_trainer/checkpoint-24000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-22500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-24500\n", + "Configuration saved in test_trainer/checkpoint-24500/config.json\n", + "Model weights saved in test_trainer/checkpoint-24500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-23000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-25000\n", + "Configuration saved in test_trainer/checkpoint-25000/config.json\n", + "Model weights saved in test_trainer/checkpoint-25000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-23500] due to args.save_total_limit\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=25134, training_loss=0.052604342072317574, metrics={'train_runtime': 3743.4451, 'train_samples_per_second': 53.711, 'train_steps_per_second': 6.714, 'total_flos': 2.6634689978167296e+16, 'train_loss': 0.052604342072317574, 'epoch': 3.0})" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Finally, we fine-tune our model\n", + "trainer.train() # Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 236 + }, + "id": "gmYZ24ZTQqj2", + "outputId": "21940fd4-9e3b-4f3a-ddf2-2d43ac0a1cf6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 16756\n", + " Batch size = 8\n", + "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [2095/2095 01:38]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'eval_loss': 0.05530316382646561,\n", + " 'eval_accuracy': 0.9875865361661494,\n", + " 'eval_runtime': 98.4031,\n", + " 'eval_samples_per_second': 170.279,\n", + " 'eval_steps_per_second': 21.29,\n", + " 'epoch': 3.0}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Evaluate model on test set\n", + "trainer.evaluate()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nDHgPCeiQs23", + "outputId": "14f66abb-48bd-4551-f15a-e8e066efb57a" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving model checkpoint to model-esc\n", + "Configuration saved in model-esc/config.json\n", + "Model weights saved in model-esc/pytorch_model.bin\n" + ] + } + ], + "source": [ + "# Save model\n", + "trainer.save_model(\"model-esc\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "opFYrx7FyK2V" + }, + "source": [ + "## Performance on Test Set: English Spam Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "8843f019aebd467196068f4309f25cc9", + "10ae828e4419422185cb5bf0e834a1b7", + "12eb1eab035e4a11bac666b5107b71c5", + "fd86ec91921b4616b7e44b88ac227e2b", + "fd416c21c4234cf08f3fbf35d919081d", + "56d5fa4a7beb44fc970ef4be853bd7d1", + "e30fdf93ef214dc590ab2aaefa839ec4", + "2659476ea6a7422d984d833b6ee4a559", + "a609503e6d93432fa87bdebcd1d41c44", + "11ac3ef5cec1446fb1e6e6fbdcd786fc", + "b4ed709d84934f6b8834736c0827325e", + "8c0244cc8ddd415baef0af9a26e99c2a", + "55d43e8681f34cf8b647e3640e65b5c3", + "f6e9075e09374f0c8d73cbc23fe2c37f", + "d452fd4dadf44c17958db8d708babaaa", + "2a67e7e31ee64b2783875c8a6503f3f7", 
+ "029110c1b2944668b989034bfe44bdcf", + "7a31e43bee4f4ac9a6348a9fd5193e8a", + "706d7c5ee2704629a8818aea8f1de9ad", + "f2c936c647fd4150add769a8346ff487", + "6302b8231099405b9878315047f7ecca", + "c81150576005465a9f7501b4daf958a0", + "1201e64261d940b7879ca4ac4edaf132", + "a8c994bbd11f498fb045a8fb3133734e", + "36433ddc6ff346eda8b000f4f8b99af8", + "4066b311bb7f4df795695dc9e7e3b01e", + "409352ab5143416b8100af93fe5ff874", + "985900082d00472286fbd731de7d4cb5", + "9ed8efeef04a4a24b3c20db522ee881f", + "60619fc65d5b49179dce1e4ebfb240e9", + "4e1b10bfb2d5480dab2f72a7882e6007", + "7bf7c794b5414e54b4bb1fabdd5d5872", + "a379f0161b2a4f2a854280720c8a386a", + "55195cb575f9411fb8025257b4bb7e6c", + "ae027b9b103040ebb67c958f8c4bbc04", + "a68734e56e6743279bffd85f658dd6f0", + "f2fd3178ae9e456f94d40bc5aebfdc30", + "8bce862f7f37454d9bd5642f3dc2126d", + "a444ba3c963f4d678462ff1463ab622c", + "918b4c69019f43ecac1af0b4a7e07496", + "a513c748eae3413cb985de8926f5c393", + "e45dfb3b48b340d0afb90c4639127353", + "b4c5f02e8c47456ca61de0dc1558b32c", + "21ddfc64c103402d90e4944255c53189" + ] + }, + "id": "Ogo5rLn1yK2V", + "outputId": "ec4ab69a-fa40-4caf-bead-949dba4b26b9" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/29.0 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descriptionlabel
0Masih bingung dalam hal makanan penambah berat...1
1FonePaw iPhone Data Recovery features in recov...1
2FonePaw iOS Transfer is mainly designed to tra...1
3This is my first upload1
4Doyantoto merupakan sebuah website Agen Togel ...1
\n", + "" + ], + "text/plain": [ + " description label\n", + "0 Masih bingung dalam hal makanan penambah berat... 1\n", + "1 FonePaw iPhone Data Recovery features in recov... 1\n", + "2 FonePaw iOS Transfer is mainly designed to tra... 1\n", + "3 This is my first upload 1\n", + "4 Doyantoto merupakan sebuah website Agen Togel ... 1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YZPyhMFbQXiq", + "outputId": "c23765ac-aaf1-4ccd-eaae-fb84d0643295" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 52834\n", + "1 37784\n", + "Name: label, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['label'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eN38xG9dQZhz" + }, + "source": [ + "## Training Set-up" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "K1SyRzDPQcld" + }, + "outputs": [], + "source": [ + "# Split data into train and test sets\n", + "# Note: stratify on 'label' to preserve the same proportions of labels in each set as observed in the original dataset\n", + "train, test = train_test_split(df, test_size=0.2, stratify=df[['label']], random_state=42)\n", + "test_multi = test.copy()\n", + "test_no_en = test.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "1e6d4ba2b7b94f4dac9284a4f093d321", + "0cf595effd0e4cdcae790f2aec3fc176", + "80b55e96450648a880766c0492299029", + "7cd75a5a8b9e4365bb7adb7ce339b533", + "9dc7769a72244901883af1e209c02ab0", + "6e500e22b9ac47f29a2b60695eaeee1b", + "499d54d5f9d64ec5a7297780d01d7542", + "ccad7e6b754643d1872cbafef7158603", + 
"301745323ce14a49884c61ac37c36542", + "d848f3cd8fc648988e18a8569eb0bc48", + "dca20772f5b24c2591870d1da7dfbbff", + "ad44115b21dc46ef8ca008a738b8fbfe", + "1f9eb0a390484302a45317810f2e829d", + "c7fd5da448114c02a63c8d32b1d2991c", + "0cf205ec4c82488ba13f32519fab56b3", + "2eea7465105a4b4ea00cd57c6c16e78f", + "eef2d41094f34f18b811b7079f043220", + "6d992fc1ec634c19a794b7eda835c4a1", + "c17c34d0a0f5454ea2e12cef84aa4e27", + "745b1b2a04d1422d96d6c54934d771b5", + "8af7ffff71ce445e8b437bccf46b96f2", + "ee5a378bf6384328bb59b3f3bdb0f3d2", + "389833ff94f9462f8270fcec2396b19e", + "95d668b851c24aea988177972de6b0fa", + "cbfee6447535490a80fab030669c6eae", + "76ea2f1266a54708939acb5a96e8dfdf", + "a6f64f1c4078400a9df4bb35dd97d0b4", + "2fce782be7da430dba8014f58ac00b02", + "2490c7a9dfe04ff48995d77e7cb749d4", + "1918d586490e46ce8da9416064096151", + "ffcfba2fe7f94b5c982eaaedf6f3febc", + "8666b83039ed4cc3bd3896485ec09d09", + "cf46fefd82f24b12ad0c3145c4924acb", + "5633ff0eba2a48f48c69ebb23271d4ea", + "7440b1b3c178410ebeb5bd0dc1547a1b", + "2ef2d786b88c4f04b3d9bd73e6e945d0", + "da353539b2474d6caffc38af0241fecf", + "a5f108b19ab149ccb4e2b028e6d62422", + "b3928b364fb24476be243bde37e65469", + "2105aab8494c4a9d94f01acf7f7e357d", + "264853c4fdf44d25819027c2effa3b0f", + "d04a2a562c5b4aaea83248e1ecf44c70", + "a195f4aee30044d38af9d9e20c2dc490", + "25295a5688474c219fa43c9a0b7482ab" + ] + }, + "id": "_UdZwkQ1QdgD", + "outputId": "0c16a6eb-d0d7-478e-c8a9-4779d9bf853b" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "445f2eedb4f447bbbf87992d680508ee", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/29.0 [00:00\n", + " \n", + " \n", + " [27186/27186 1:09:26, Epoch 3/3]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
5000.175200
10000.096900
15000.089600
20000.087400
25000.085300
30000.084000
35000.089200
40000.088300
45000.094400
50000.065800
55000.075000
60000.080100
65000.080600
70000.075900
75000.081400
80000.062000
85000.069600
90000.056100
95000.048700
100000.047400
105000.041600
110000.032300
115000.040400
120000.045100
125000.052600
130000.053700
135000.052600
140000.043300
145000.042600
150000.044300
155000.034900
160000.038800
165000.045900
170000.037800
175000.046500
180000.036300
185000.029500
190000.029300
195000.027500
200000.029400
205000.024800
210000.013800
215000.027000
220000.020900
225000.039700
230000.026800
235000.031600
240000.024400
245000.027200
250000.016400
255000.027900
260000.031200
265000.023400
270000.025200

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving model checkpoint to test_trainer/checkpoint-500\n", + "Configuration saved in test_trainer/checkpoint-500/config.json\n", + "Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin\n", + "Saving model checkpoint to test_trainer/checkpoint-1000\n", + "Configuration saved in test_trainer/checkpoint-1000/config.json\n", + "Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin\n", + "Saving model checkpoint to test_trainer/checkpoint-1500\n", + "Configuration saved in test_trainer/checkpoint-1500/config.json\n", + "Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin\n", + "Saving model checkpoint to test_trainer/checkpoint-2000\n", + "Configuration saved in test_trainer/checkpoint-2000/config.json\n", + "Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-2500\n", + "Configuration saved in test_trainer/checkpoint-2500/config.json\n", + "Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-1000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-3000\n", + "Configuration saved in test_trainer/checkpoint-3000/config.json\n", + "Model weights saved in test_trainer/checkpoint-3000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-1500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-3500\n", + "Configuration saved in test_trainer/checkpoint-3500/config.json\n", + "Model weights saved in test_trainer/checkpoint-3500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-2000] due to 
args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-4000\n", + "Configuration saved in test_trainer/checkpoint-4000/config.json\n", + "Model weights saved in test_trainer/checkpoint-4000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-2500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-4500\n", + "Configuration saved in test_trainer/checkpoint-4500/config.json\n", + "Model weights saved in test_trainer/checkpoint-4500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-3000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-5000\n", + "Configuration saved in test_trainer/checkpoint-5000/config.json\n", + "Model weights saved in test_trainer/checkpoint-5000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-3500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-5500\n", + "Configuration saved in test_trainer/checkpoint-5500/config.json\n", + "Model weights saved in test_trainer/checkpoint-5500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-4000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-6000\n", + "Configuration saved in test_trainer/checkpoint-6000/config.json\n", + "Model weights saved in test_trainer/checkpoint-6000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-4500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-6500\n", + "Configuration saved in test_trainer/checkpoint-6500/config.json\n", + "Model weights saved in test_trainer/checkpoint-6500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-5000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-7000\n", + "Configuration saved in test_trainer/checkpoint-7000/config.json\n", + "Model 
weights saved in test_trainer/checkpoint-7000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-5500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-7500\n", + "Configuration saved in test_trainer/checkpoint-7500/config.json\n", + "Model weights saved in test_trainer/checkpoint-7500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-6000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-8000\n", + "Configuration saved in test_trainer/checkpoint-8000/config.json\n", + "Model weights saved in test_trainer/checkpoint-8000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-6500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-8500\n", + "Configuration saved in test_trainer/checkpoint-8500/config.json\n", + "Model weights saved in test_trainer/checkpoint-8500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-7000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-9000\n", + "Configuration saved in test_trainer/checkpoint-9000/config.json\n", + "Model weights saved in test_trainer/checkpoint-9000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-7500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-9500\n", + "Configuration saved in test_trainer/checkpoint-9500/config.json\n", + "Model weights saved in test_trainer/checkpoint-9500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-8000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-10000\n", + "Configuration saved in test_trainer/checkpoint-10000/config.json\n", + "Model weights saved in test_trainer/checkpoint-10000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-8500] due to args.save_total_limit\n", + "Saving 
model checkpoint to test_trainer/checkpoint-10500\n", + "Configuration saved in test_trainer/checkpoint-10500/config.json\n", + "Model weights saved in test_trainer/checkpoint-10500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-9000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-11000\n", + "Configuration saved in test_trainer/checkpoint-11000/config.json\n", + "Model weights saved in test_trainer/checkpoint-11000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-9500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-11500\n", + "Configuration saved in test_trainer/checkpoint-11500/config.json\n", + "Model weights saved in test_trainer/checkpoint-11500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-10000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-12000\n", + "Configuration saved in test_trainer/checkpoint-12000/config.json\n", + "Model weights saved in test_trainer/checkpoint-12000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-10500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-12500\n", + "Configuration saved in test_trainer/checkpoint-12500/config.json\n", + "Model weights saved in test_trainer/checkpoint-12500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-11000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-13000\n", + "Configuration saved in test_trainer/checkpoint-13000/config.json\n", + "Model weights saved in test_trainer/checkpoint-13000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-11500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-13500\n", + "Configuration saved in test_trainer/checkpoint-13500/config.json\n", + "Model weights saved in 
test_trainer/checkpoint-13500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-12000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-14000\n", + "Configuration saved in test_trainer/checkpoint-14000/config.json\n", + "Model weights saved in test_trainer/checkpoint-14000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-12500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-14500\n", + "Configuration saved in test_trainer/checkpoint-14500/config.json\n", + "Model weights saved in test_trainer/checkpoint-14500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-13000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-15000\n", + "Configuration saved in test_trainer/checkpoint-15000/config.json\n", + "Model weights saved in test_trainer/checkpoint-15000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-13500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-15500\n", + "Configuration saved in test_trainer/checkpoint-15500/config.json\n", + "Model weights saved in test_trainer/checkpoint-15500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-14000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-16000\n", + "Configuration saved in test_trainer/checkpoint-16000/config.json\n", + "Model weights saved in test_trainer/checkpoint-16000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-14500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-16500\n", + "Configuration saved in test_trainer/checkpoint-16500/config.json\n", + "Model weights saved in test_trainer/checkpoint-16500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-15000] due to args.save_total_limit\n", + 
"Saving model checkpoint to test_trainer/checkpoint-17000\n", + "Configuration saved in test_trainer/checkpoint-17000/config.json\n", + "Model weights saved in test_trainer/checkpoint-17000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-15500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-17500\n", + "Configuration saved in test_trainer/checkpoint-17500/config.json\n", + "Model weights saved in test_trainer/checkpoint-17500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-16000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-18000\n", + "Configuration saved in test_trainer/checkpoint-18000/config.json\n", + "Model weights saved in test_trainer/checkpoint-18000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-16500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-18500\n", + "Configuration saved in test_trainer/checkpoint-18500/config.json\n", + "Model weights saved in test_trainer/checkpoint-18500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-17000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-19000\n", + "Configuration saved in test_trainer/checkpoint-19000/config.json\n", + "Model weights saved in test_trainer/checkpoint-19000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-17500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-19500\n", + "Configuration saved in test_trainer/checkpoint-19500/config.json\n", + "Model weights saved in test_trainer/checkpoint-19500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-18000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-20000\n", + "Configuration saved in test_trainer/checkpoint-20000/config.json\n", + "Model weights 
saved in test_trainer/checkpoint-20000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-18500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-20500\n", + "Configuration saved in test_trainer/checkpoint-20500/config.json\n", + "Model weights saved in test_trainer/checkpoint-20500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-19000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-21000\n", + "Configuration saved in test_trainer/checkpoint-21000/config.json\n", + "Model weights saved in test_trainer/checkpoint-21000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-19500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-21500\n", + "Configuration saved in test_trainer/checkpoint-21500/config.json\n", + "Model weights saved in test_trainer/checkpoint-21500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-20000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-22000\n", + "Configuration saved in test_trainer/checkpoint-22000/config.json\n", + "Model weights saved in test_trainer/checkpoint-22000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-20500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-22500\n", + "Configuration saved in test_trainer/checkpoint-22500/config.json\n", + "Model weights saved in test_trainer/checkpoint-22500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-21000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-23000\n", + "Configuration saved in test_trainer/checkpoint-23000/config.json\n", + "Model weights saved in test_trainer/checkpoint-23000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-21500] due to 
args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-23500\n", + "Configuration saved in test_trainer/checkpoint-23500/config.json\n", + "Model weights saved in test_trainer/checkpoint-23500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-22000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-24000\n", + "Configuration saved in test_trainer/checkpoint-24000/config.json\n", + "Model weights saved in test_trainer/checkpoint-24000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-22500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-24500\n", + "Configuration saved in test_trainer/checkpoint-24500/config.json\n", + "Model weights saved in test_trainer/checkpoint-24500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-23000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-25000\n", + "Configuration saved in test_trainer/checkpoint-25000/config.json\n", + "Model weights saved in test_trainer/checkpoint-25000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-23500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-25500\n", + "Configuration saved in test_trainer/checkpoint-25500/config.json\n", + "Model weights saved in test_trainer/checkpoint-25500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-24000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-26000\n", + "Configuration saved in test_trainer/checkpoint-26000/config.json\n", + "Model weights saved in test_trainer/checkpoint-26000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-24500] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-26500\n", + "Configuration saved in 
test_trainer/checkpoint-26500/config.json\n", + "Model weights saved in test_trainer/checkpoint-26500/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-25000] due to args.save_total_limit\n", + "Saving model checkpoint to test_trainer/checkpoint-27000\n", + "Configuration saved in test_trainer/checkpoint-27000/config.json\n", + "Model weights saved in test_trainer/checkpoint-27000/pytorch_model.bin\n", + "Deleting older checkpoint [test_trainer/checkpoint-25500] due to args.save_total_limit\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=27186, training_loss=0.05157783799180919, metrics={'train_runtime': 4167.7462, 'train_samples_per_second': 52.182, 'train_steps_per_second': 6.523, 'total_flos': 2.880927479450419e+16, 'train_loss': 0.05157783799180919, 'epoch': 3.0})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Finally, we fine-tune our model\n", + "trainer.train() # Training took about 1h15 with GPU: NVIDIA RTX A5000" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "gmYZ24ZTQqj2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 18124\n", + " Batch size = 8\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [2266/2266 01:38]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'eval_loss': 0.05935591459274292,\n", + " 'eval_accuracy': 0.9881372765393953,\n", + " 'eval_runtime': 98.1932,\n", + " 'eval_samples_per_second': 184.575,\n", + " 'eval_steps_per_second': 23.077,\n", + " 'epoch': 3.0}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Evaluate model on test set\n", + "trainer.evaluate()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "nDHgPCeiQs23" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving model checkpoint to msc-model\n", + "Configuration saved in msc-model/config.json\n", + "Model weights saved in msc-model/pytorch_model.bin\n" + ] + } + ], + "source": [ + "# Save model\n", + "trainer.save_model(\"msc-model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance on Test Set: Multilingual Spam Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file msc-model/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"msc-model\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForSequenceClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"problem_type\": \"single_label_classification\",\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": 
false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.25.1\",\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n", + "loading weights file msc-model/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing DistilBertForSequenceClassification.\n", + "\n", + "All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at msc-model.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.\n", + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-multilingual-cased/snapshots/fb240273126596a03b35c85793d2e82a5b13ac79/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"distilbert-base-multilingual-cased\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForMaskedLM\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"transformers_version\": \"4.25.1\",\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n", + "loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-multilingual-cased/snapshots/fb240273126596a03b35c85793d2e82a5b13ac79/vocab.txt\n", + "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-multilingual-cased/snapshots/fb240273126596a03b35c85793d2e82a5b13ac79/tokenizer.json\n", + 
"loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at None\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-multilingual-cased/snapshots/fb240273126596a03b35c85793d2e82a5b13ac79/tokenizer_config.json\n", + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-multilingual-cased/snapshots/fb240273126596a03b35c85793d2e82a5b13ac79/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"distilbert-base-multilingual-cased\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForMaskedLM\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"initializer_range\": 0.02,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"transformers_version\": \"4.25.1\",\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "from transformers import pipeline\n", + "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", + "# Load both models\n", + "model = AutoModelForSequenceClassification.from_pretrained('msc-model')\n", + "model_tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-multilingual-cased\")\n", + "\n", + "# Create a pipeline to facilitate the use of the model for classification\n", + "classifier = pipeline(\"text-classification\", model=model, tokenizer=model_tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"Disabling tokenizer parallelism, we're using DataLoader multithreading already\n" + ] + } + ], + "source": [ + "y_true = test_multi['label'].tolist()\n", + "y_predict = classifier(test_multi['description'].map(lambda x: str(x)).tolist(), padding=True, truncation=True)\n", + "y_predict = [1 if pred['label'] == 'LABEL_1' else 0 for pred in y_predict]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 98.81372765393954 %\n", + "\n", + "F1 Score: 98.77898987594016 %\n", + "\n", + "True positives: 98.26650787349477 %\n", + "True negatives: 99.20507239519259 %\n" + ] + } + ], + "source": [ + "# compute accuracy\n", + "from sklearn.metrics import accuracy_score\n", + "accuracy_score(y_true, y_predict)\n", + "print(f'Accuracy: {100 * accuracy_score(y_true, y_predict)} %')\n", + "print()\n", + "# compute f1 score\n", + "from sklearn.metrics import f1_score\n", + "f1_score(y_true, y_predict, average='macro')\n", + "print(f'F1 Score: {100 * f1_score(y_true, y_predict, average=\"macro\")} %')\n", + "print()\n", + "# compute confusion matrix\n", + "from sklearn.metrics import confusion_matrix\n", + "confusion_matrix = confusion_matrix(y_true, y_predict)\n", + "\n", + "print('True positives: ', 100 * confusion_matrix[1][1]/(confusion_matrix[1][1] + confusion_matrix[1][0]), '%')\n", + "print('True negatives: ', 100 * confusion_matrix[0][0]/(confusion_matrix[0][0] + confusion_matrix[0][1]), '%')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance on Test Set: all non-english languages Spam Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" + ] + } + ], + "source": [ + "# 
Filter out all the english texts\n", + "import re\n", + "from ftlangdetect import detect\n", + "CLEANING_REGEX = re.compile(r'[^a-zA-Z0-9\\s]', re.MULTILINE)\n", + "def detect_lang(descr):\n", + " d = CLEANING_REGEX.sub('', str(descr))\n", + " d = d.replace('\\r', ' ').replace('\\n', ' ')\n", + " lang = detect(d)['lang']\n", + " return lang\n", + "\n", + "test_no_en['lang'] = test_no_en['description'].map(lambda x: detect_lang(x))\n", + "test_no_en = test_no_en[test_no_en['lang'] != 'en']\n", + "test_no_en = test_no_en.drop(columns=['lang'])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "y_true = test_no_en['label'].tolist()\n", + "y_predict = classifier(test_no_en['description'].map(lambda x: str(x)).tolist(), padding=True, truncation=True)\n", + "y_predict = [1 if pred['label'] == 'LABEL_1' else 0 for pred in y_predict]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 98.31041257367387 %\n", + "\n", + "F1 Score: 97.48498193971817 %\n", + "\n", + "True positives: 98.50746268656717 %\n", + "True negatives: 97.57009345794393 %\n" + ] + } + ], + "source": [ + "# compute accuracy\n", + "from sklearn.metrics import accuracy_score\n", + "accuracy_score(y_true, y_predict)\n", + "print(f'Accuracy: {100 * accuracy_score(y_true, y_predict)} %')\n", + "print()\n", + "# compute f1 score\n", + "from sklearn.metrics import f1_score\n", + "f1_score(y_true, y_predict, average='macro')\n", + "print(f'F1 Score: {100 * f1_score(y_true, y_predict, average=\"macro\")} %')\n", + "print()\n", + "# compute confusion matrix\n", + "from sklearn.metrics import confusion_matrix\n", + "confusion_matrix = confusion_matrix(y_true, y_predict)\n", + "print('True positives: ', 100 * confusion_matrix[1][1]/(confusion_matrix[1][1] + confusion_matrix[1][0]), '%')\n", + "print('True negatives: ', 100 * 
confusion_matrix[0][0]/(confusion_matrix[0][0] + confusion_matrix[0][1]), '%')" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + }, + "vscode": { + "interpreter": { + "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0cf205ec4c82488ba13f32519fab56b3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8af7ffff71ce445e8b437bccf46b96f2", + "placeholder": "​", + "style": "IPY_MODEL_ee5a378bf6384328bb59b3f3bdb0f3d2", + "value": " 466/466 [00:00<00:00, 15.4kB/s]" + } + }, + "0cf595effd0e4cdcae790f2aec3fc176": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6e500e22b9ac47f29a2b60695eaeee1b", + "placeholder": "​", + "style": 
"IPY_MODEL_499d54d5f9d64ec5a7297780d01d7542", + "value": "Downloading: 100%" + } + }, + "164d488a4b2242ff9d8104052fc0240f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17b3166a46f54ab5a2c725d8355b8759": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c139d1e7843247f0bfb30a916a0233d7", + "IPY_MODEL_d2d9bce8fafa4c188c401ed7bee7fd49", + "IPY_MODEL_ae3c0f882ae14194931a8387f82f9fd4" + ], + "layout": 
"IPY_MODEL_7e60c1d8816b49d5a5b1ff59dfde5ae0" + } + }, + "1918d586490e46ce8da9416064096151": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1e6d4ba2b7b94f4dac9284a4f093d321": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0cf595effd0e4cdcae790f2aec3fc176", + "IPY_MODEL_80b55e96450648a880766c0492299029", + "IPY_MODEL_7cd75a5a8b9e4365bb7adb7ce339b533" + ], + "layout": "IPY_MODEL_9dc7769a72244901883af1e209c02ab0" + } + }, + 
"1f9eb0a390484302a45317810f2e829d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_eef2d41094f34f18b811b7079f043220", + "placeholder": "​", + "style": "IPY_MODEL_6d992fc1ec634c19a794b7eda835c4a1", + "value": "Downloading: 100%" + } + }, + "2105aab8494c4a9d94f01acf7f7e357d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2131431357ca4b2e963de9b004e21102": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2267c81aaeea44ab87bf7ba3d1beff03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee4a8572a2234f5fb374584c2af87a1f", + "placeholder": "​", + "style": "IPY_MODEL_6dcc977075ad4cf49dd22eea0ccdf449", + "value": "Downloading: 100%" + } + }, + "238696e1b9d34281b94c08a8d4d858ba": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": 
null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2490c7a9dfe04ff48995d77e7cb749d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "25295a5688474c219fa43c9a0b7482ab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "264853c4fdf44d25819027c2effa3b0f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2bdb17153fc648a1aa24704e1f5779e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_31847ecd09c44c93b19f54c651f3df48", + "placeholder": "​", + "style": "IPY_MODEL_51a29e88dc884eae901226444148576a", + "value": "100%" + } + }, + "2eea7465105a4b4ea00cd57c6c16e78f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, 
+ "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ef2d786b88c4f04b3d9bd73e6e945d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_264853c4fdf44d25819027c2effa3b0f", + "max": 1961828, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d04a2a562c5b4aaea83248e1ecf44c70", + "value": 1961828 + } + }, + "2fce782be7da430dba8014f58ac00b02": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "301745323ce14a49884c61ac37c36542": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31847ecd09c44c93b19f54c651f3df48": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34c6d9d585f84f56b638d780ba2cc293": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "389833ff94f9462f8270fcec2396b19e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_95d668b851c24aea988177972de6b0fa", + "IPY_MODEL_cbfee6447535490a80fab030669c6eae", + "IPY_MODEL_76ea2f1266a54708939acb5a96e8dfdf" + ], + "layout": 
"IPY_MODEL_a6f64f1c4078400a9df4bb35dd97d0b4" + } + }, + "43ba716e2e50446eb5f7dfaca3d81b36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34c6d9d585f84f56b638d780ba2cc293", + "placeholder": "​", + "style": "IPY_MODEL_8c050559ad244002afc9f84ea33a09c6", + "value": " 542M/542M [00:09<00:00, 59.3MB/s]" + } + }, + "499d54d5f9d64ec5a7297780d01d7542": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4e0808e7c9a94bd89dbe568f9795afc6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "50a4df1c2ce9404987f4625f193929d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7af3f089337047368e55d96926a2e3bd", + "placeholder": "​", + "style": "IPY_MODEL_635922d86fc4405c826ff2b19cd7fc74", + "value": "Downloading builder script: " + } + }, + "51a29e88dc884eae901226444148576a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5633ff0eba2a48f48c69ebb23271d4ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7440b1b3c178410ebeb5bd0dc1547a1b", + "IPY_MODEL_2ef2d786b88c4f04b3d9bd73e6e945d0", + "IPY_MODEL_da353539b2474d6caffc38af0241fecf" + ], + "layout": "IPY_MODEL_a5f108b19ab149ccb4e2b028e6d62422" + } + }, + "5ec669e823294afe8897f7beeb446c69": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "635922d86fc4405c826ff2b19cd7fc74": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "63e769008e724def9f8f9b3410c1de1c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2267c81aaeea44ab87bf7ba3d1beff03", + "IPY_MODEL_ed2f673d3b354e29aaac004b5203de8f", + "IPY_MODEL_43ba716e2e50446eb5f7dfaca3d81b36" + ], + "layout": "IPY_MODEL_238696e1b9d34281b94c08a8d4d858ba" + } + }, + "6821955b03724206a785bee03e1a2452": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6d992fc1ec634c19a794b7eda835c4a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6dcc977075ad4cf49dd22eea0ccdf449": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6e500e22b9ac47f29a2b60695eaeee1b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "710f69706ef143cfa7513f13afd7f1fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7440b1b3c178410ebeb5bd0dc1547a1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b3928b364fb24476be243bde37e65469", + "placeholder": "​", + "style": "IPY_MODEL_2105aab8494c4a9d94f01acf7f7e357d", + "value": "Downloading: 100%" + } + }, + "745b1b2a04d1422d96d6c54934d771b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "76ea2f1266a54708939acb5a96e8dfdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8666b83039ed4cc3bd3896485ec09d09", + "placeholder": "​", + "style": "IPY_MODEL_cf46fefd82f24b12ad0c3145c4924acb", + "value": " 996k/996k [00:00<00:00, 2.02MB/s]" + } + }, + "77c3fafe158b48dd9624f66e01383665": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7af3f089337047368e55d96926a2e3bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7cd75a5a8b9e4365bb7adb7ce339b533": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d848f3cd8fc648988e18a8569eb0bc48", + "placeholder": "​", + "style": "IPY_MODEL_dca20772f5b24c2591870d1da7dfbbff", + "value": " 
29.0/29.0 [00:00<00:00, 958B/s]" + } + }, + "7dea9f79416447c4b67c36239d4e3973": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e60c1d8816b49d5a5b1ff59dfde5ae0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "80b55e96450648a880766c0492299029": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ccad7e6b754643d1872cbafef7158603", + "max": 29, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_301745323ce14a49884c61ac37c36542", + "value": 29 + } + }, + "86167df488b1420bb8332345ae9a052e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_77c3fafe158b48dd9624f66e01383665", + "max": 91, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4e0808e7c9a94bd89dbe568f9795afc6", + "value": 91 + } + }, + 
"864458a3a69e4dfda01d55f0143ba8b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8666b83039ed4cc3bd3896485ec09d09": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "89fe56d17960451286261aa5bf308b06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8af7ffff71ce445e8b437bccf46b96f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8c050559ad244002afc9f84ea33a09c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "95d668b851c24aea988177972de6b0fa": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2fce782be7da430dba8014f58ac00b02", + "placeholder": "​", + "style": "IPY_MODEL_2490c7a9dfe04ff48995d77e7cb749d4", + "value": "Downloading: 100%" + } + }, + "9766005031f442e0a6b4aa4235478198": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_50a4df1c2ce9404987f4625f193929d4", + "IPY_MODEL_d52fa063f955471da01b7ee6b5e29be0", + "IPY_MODEL_f436622afdba40d8abb46590de18d218" + ], + "layout": "IPY_MODEL_c2612158281943a9b6777fe893807542" + } + }, + "9c39358f86c14301bfbf3ef3b80bdafc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9dc7769a72244901883af1e209c02ab0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9fa95444dd8e46a5800a727e257ca14c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a195f4aee30044d38af9d9e20c2dc490": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5f108b19ab149ccb4e2b028e6d62422": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a6f64f1c4078400a9df4bb35dd97d0b4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"a7a446784fbd49e69097ce2d65f7ced7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ff9535a15bfe4feab5d1d379bc9fe135", + "placeholder": "​", + "style": "IPY_MODEL_aaa3e3da8a98422ba236f1609c4ed42c", + "value": " 91/91 [02:01<00:00, 1.46s/ba]" + } + }, + "aaa3e3da8a98422ba236f1609c4ed42c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ad44115b21dc46ef8ca008a738b8fbfe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1f9eb0a390484302a45317810f2e829d", + "IPY_MODEL_c7fd5da448114c02a63c8d32b1d2991c", + "IPY_MODEL_0cf205ec4c82488ba13f32519fab56b3" + ], + "layout": "IPY_MODEL_2eea7465105a4b4ea00cd57c6c16e78f" + } + }, + "ae3c0f882ae14194931a8387f82f9fd4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f8983be97d254c82b6eb7ed93f41c5f6", + "placeholder": "​", + "style": "IPY_MODEL_864458a3a69e4dfda01d55f0143ba8b4", + "value": " 23/23 [00:29<00:00, 1.09ba/s]" + } + }, + "b3928b364fb24476be243bde37e65469": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c139d1e7843247f0bfb30a916a0233d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_164d488a4b2242ff9d8104052fc0240f", + "placeholder": "​", + "style": "IPY_MODEL_710f69706ef143cfa7513f13afd7f1fc", + "value": "100%" + } + }, + "c17c34d0a0f5454ea2e12cef84aa4e27": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c2612158281943a9b6777fe893807542": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c7fd5da448114c02a63c8d32b1d2991c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c17c34d0a0f5454ea2e12cef84aa4e27", + "max": 466, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_745b1b2a04d1422d96d6c54934d771b5", + "value": 466 + } + }, + "c928e4b904844c50b6fe2b0b7c864174": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2bdb17153fc648a1aa24704e1f5779e8", + "IPY_MODEL_86167df488b1420bb8332345ae9a052e", + "IPY_MODEL_a7a446784fbd49e69097ce2d65f7ced7" + ], + "layout": "IPY_MODEL_2131431357ca4b2e963de9b004e21102" + } + }, + "cbfee6447535490a80fab030669c6eae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1918d586490e46ce8da9416064096151", + "max": 995526, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ffcfba2fe7f94b5c982eaaedf6f3febc", + "value": 995526 + } + }, + "ccad7e6b754643d1872cbafef7158603": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cee3f2161609479dbe05730d718498db": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf19df6dada74286bfe708bd13a9149f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"bar_color": null, + "description_width": "" + } + }, + "cf46fefd82f24b12ad0c3145c4924acb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d04a2a562c5b4aaea83248e1ecf44c70": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d2d9bce8fafa4c188c401ed7bee7fd49": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5ec669e823294afe8897f7beeb446c69", + "max": 23, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cf19df6dada74286bfe708bd13a9149f", + "value": 23 + } + }, + "d52fa063f955471da01b7ee6b5e29be0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cee3f2161609479dbe05730d718498db", + "max": 1652, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9fa95444dd8e46a5800a727e257ca14c", + "value": 1652 + } + }, + "d848f3cd8fc648988e18a8569eb0bc48": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da353539b2474d6caffc38af0241fecf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a195f4aee30044d38af9d9e20c2dc490", + "placeholder": "​", + "style": "IPY_MODEL_25295a5688474c219fa43c9a0b7482ab", + "value": " 1.96M/1.96M [00:00<00:00, 1.97MB/s]" + } + }, + "dca20772f5b24c2591870d1da7dfbbff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ed2f673d3b354e29aaac004b5203de8f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c39358f86c14301bfbf3ef3b80bdafc", + "max": 541808922, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6821955b03724206a785bee03e1a2452", + "value": 541808922 + } + }, + "ee4a8572a2234f5fb374584c2af87a1f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee5a378bf6384328bb59b3f3bdb0f3d2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eef2d41094f34f18b811b7079f043220": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f436622afdba40d8abb46590de18d218": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7dea9f79416447c4b67c36239d4e3973", + "placeholder": "​", + "style": "IPY_MODEL_89fe56d17960451286261aa5bf308b06", + "value": " 4.21k/? 
[00:00<00:00, 119kB/s]" + } + }, + "f8983be97d254c82b6eb7ed93f41c5f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff9535a15bfe4feab5d1d379bc9fe135": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ffcfba2fe7f94b5c982eaaedf6f3febc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 6287a918eb4a8ec9c65ae7bc5f15fbb3aba94bc9 Mon Sep 17 00:00:00 2001 From: De Busschere Yanis Date: Tue, 13 Dec 2022 17:44:42 +0100 Subject: [PATCH 2/6] Add BERT multilingual classification --- .gitignore | 13 ++ Makefile | 12 +- README.md | 55 ++++-- requirements.txt | 40 +---- src/features/process_dataset.py | 111 ++++++++++++ src/models/predict_model.py | 64 +++++++ src/models/train_model.py | 164 ++++++++++++++++++ .../build_features.py => utils/__init__.py} | 0 src/utils/datasets_utils.py | 42 +++++ src/utils/file_utils.py | 116 +++++++++++++ src/utils/init_utils.py | 77 ++++++++ 11 files changed, 644 insertions(+), 50 deletions(-) create mode 100644 src/features/process_dataset.py rename src/{features/build_features.py => utils/__init__.py} (100%) create mode 100644 src/utils/datasets_utils.py create mode 100644 src/utils/file_utils.py create mode 100644 src/utils/init_utils.py diff --git a/.gitignore 
b/.gitignore index 6774deb..de67ed6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,15 @@ +# Project specific files data/* models/* + +# Logs +*.log + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Environments +.env +.venv diff --git a/Makefile b/Makefile index 61d4f29..2b1ae36 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,21 @@ endif ################################################################################# ## Install Python Dependencies -requirements: test_environment +requirements: #test_environment #TODO: uncomment test_environment when it is ready $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel $(PYTHON_INTERPRETER) -m pip install -r requirements.txt ## Make Dataset data: requirements - $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed + export PYTHONPATH=$(PROJECT_DIR) && $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed + +## Process Dataset +process: data + export PYTHONPATH=$(PROJECT_DIR) && $(PYTHON_INTERPRETER) src/features/process_dataset.py + +## Train model +train: process + export PYTHONPATH=$(PROJECT_DIR) && $(PYTHON_INTERPRETER) src/models/train_model.py ## Delete all compiled Python files clean: diff --git a/README.md b/README.md index 0e58929..74651f2 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,50 @@ -# Zenodo spam classifiers +## Zenodo spam classifiers Spam classification machine learning models for Zenodo records and communities. ## Usage -First of all, create a virtualenv, install the depencencies, and run the Jupyter notebook server: +First of all, create a virtual environment (the make script will install the required dependencies in it): ```bash -# Create a virtual environment - mkvirtualenv --python python3.9 zenodo-classifier - (zenodo-classifier) pip install -e . 
- -# This will also open Jupyter notebook in your browser - (zenodo-classifier) jupyter notebook + mkvirtualenv --python python3.10 zenodo-classifier # Create the virtual environment ``` -To re-train the model: +To train/re-train the model: -1. Go to Zenodo Open Metadata record at to acces all dataset versions. -2. Download the latest dump locally under `data` -3. Open the `model_spam_detection_record.ipynb` notebook -4. Update the `data_file` and `model_path` variables to point to the new dump location -5. Run all the cells up to `4. Dump model`. +```bash + make train +``` + +The `make train` command will install all the necessary dependencies and run the following python scripts: + +- `make_dataset.py`: download/create the Zenodo dataset and store it in `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl`. +- `process_dataset.py`: extract the features/process them and store the new dataset in `data/processed/zenodo_open_metadata_processed_YYYY-MM-DD.csv`. +- `train_model.py`: train the model and store it in `models/zenodo_msc_YYYY-MM-DD`. + +Note: each of these files can be called as a script (using `make` or manually) or imported as module. As a script, they don't take any parameters, the `process_dataset.py` (resp. `train_model.py`) will automatically search for the latest dataset in `data/raw/` (resp. `data/processed/`) and use it. The latest dataset is found by comparing the date present in the file name. If the data is placed manually in `data/raw` (resp. `data/processed`) it should follow the naming convention, that is, `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl` (resp. `zenodo_open_metadata_processed_YYYY-MM-DD.csv`) to ensure that it is found automatically. 
+ +To make a prediction on a new record you can proceed in two ways: + +- Use the `predict_model.py` script: + ```bash + export PYTHONPATH=/path/to/zenodo-classifier # you can use "PYTHONPATH=$(pwd)" if you are in the zenodo-classifier directory + python3 predict.py "Some description of the record that is not preprocess (but can be)" + ``` +- Import `predict_model.py` in your python script: + ```python + from src.models.predict_model import load_model, make_prediction + # You need to load the model only once + # You must pass the path to the model as argument + # You can get the path to the latest model with `find_latest_model()` or pass the path to the model you want to use + model = load_model(model_path) + # You can make some predictions + make_prediction(model, "Some description of the record that is not preprocess (but can be)") + ``` + +To visualize the results of the model: + +> TODO To compare with older models: @@ -64,7 +87,7 @@ To compare with older models: │   │   └── make_dataset.py │ │ │   ├── features <- Scripts to turn raw data into features for modeling -│   │   └── build_features.py +│   │   └── process_dataset.py │ │ │   ├── models <- Scripts to train models and then use trained models to make │ │ │ predictions @@ -75,5 +98,3 @@ To compare with older models: │   └── visualize.py ``` - - diff --git a/requirements.txt b/requirements.txt index dc2a07e..3df58a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,9 @@ -bleach==3.3.0 -certifi==2020.6.20 -chardet==3.0.4 -decorator==4.4.2 -idna==2.9 -importlib-metadata==1.6.1 -joblib==0.15.1 -numpy==1.19.0 -pandas==1.0.5 -pexpect==4.8.0 -pickleshare==0.7.5 -ptyprocess==0.6.0 -pyparsing==2.4.7 -pyrsistent==0.16.0 -python-dateutil==2.8.1 -pytz==2020.1 -PyYAML==5.4 -pyzmq==19.0.1 -qtconsole==4.7.5 -QtPy==1.9.0 -requests==2.24.0 -scikit-learn==0.23.1 -scipy==1.4.1 -six==1.15.0 -tensorflow==2.4.0 -traitlets==4.3.3 -urllib3==1.25.9 -wcwidth==0.2.5 -webencodings==0.5.1 -wrapt==1.12.1 -zipp==3.1.0 
+bs4==0.0.1 +click==8.1.2 +datasets==2.7.1 +evaluate==0.3.0 +numpy==1.23.5 +pandas==1.5.2 +torch==1.12.1 +transformers==4.25.1 +python-dotenv==0.21.0 diff --git a/src/features/process_dataset.py b/src/features/process_dataset.py new file mode 100644 index 0000000..3a8d5ea --- /dev/null +++ b/src/features/process_dataset.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ------------------------------ +# File : process_dataset.py +# Created by : Yanis De Busschere, Luka Secilmis, Thomas Ecabert +"""Module/script used to process the raw dataset.""" + +import logging +import sys +from math import ceil + +from bs4 import BeautifulSoup +import pandas as pd + +from src.utils.file_utils import ( + PROCESSED_DATASETS_EXTENSION, + PROCESSED_DATASETS_PREFIX, + RAW_DATASETS_PATH, + PROCESSED_DATASETS_PATH, + extract_date, + find_latest_raw_dataset, +) + +from src.utils.init_utils import init_logger, init_seed, SEED + +KEPT_FIELDS = ["description", "spam"] +CHUNK_SIZE = 100000 + + +def parse_description(description: str) -> str: + """Cleans up the description by removing HTML tags. + + Args: + description (str): Description to clean up. + + Returns: + str: Cleaned up description. + """ + return BeautifulSoup(description, "html.parser").get_text() + + +def process_dataset(raw_dataset_path: str, processed_dataset_path: str) -> None: + """Process the raw dataset, process it and save it. + + Args: + raw_dataset_path (str): Path to the raw dataset. + processed_dataset_path (str): Path to the processed dataset. 
+ """ + + spams = pd.DataFrame() + hams = pd.DataFrame() + + logging.info(f'Processing dataset from "{raw_dataset_path}".') + logging.info(f'Processed dataset will be saved to "{processed_dataset_path}".') + + nb_lines = 0 + with open(raw_dataset_path, "r") as f: + for _ in f: + nb_lines += 1 + chunk_done = 0 + + for chunk in pd.read_json(raw_dataset_path, lines=True, chunksize=CHUNK_SIZE): + logging.info( + f"Processing chunk {chunk_done + 1} out of {ceil(nb_lines / CHUNK_SIZE)}." + ) + + chunk = chunk[KEPT_FIELDS].dropna() + + chunk_spams = chunk[chunk["spam"] == True] + chunk_spams["description"] = chunk_spams["description"].map(parse_description) + chunk_spams = chunk_spams.assign(spam=1) + spams = pd.concat([spams, chunk_spams]).dropna() + + chunk_hams = chunk[chunk["spam"] == False] + chunk_hams["description"] = chunk_hams["description"].map(parse_description) + chunk_hams = chunk_hams.assign(spam=0) + hams = pd.concat([hams, chunk_hams]).dropna() + + chunk_done += 1 + + hams = hams.sample(n=2 * len(spams), random_state=SEED) + + df = pd.concat([spams, hams]).rename(columns={"spam": "label"}) + df.to_csv(processed_dataset_path, index=False) + logging.info(f'Dataset processed and saved to "{processed_dataset_path}".') + + +if __name__ == "__main__": + init_logger("process_dataset") + init_seed() + + raw_dataset_path = find_latest_raw_dataset() + if raw_dataset_path is None or not raw_dataset_path.exists(): + logging.critical(f"No raw dataset found in {RAW_DATASETS_PATH}.") + sys.exit(1) + + date = extract_date(raw_dataset_path.name) + processed_dataset_path = ( + PROCESSED_DATASETS_PATH + / f"{PROCESSED_DATASETS_PREFIX}{date}{PROCESSED_DATASETS_EXTENSION}" + ) + + if processed_dataset_path.exists(): + # This is critical because we quit the program but we return 0 to ensure + # that the pipeline does not fail entirely. + logging.critical( + f"Processed dataset already exists at {processed_dataset_path}." 
+ ) + sys.exit(0) + + process_dataset(raw_dataset_path, processed_dataset_path) diff --git a/src/models/predict_model.py b/src/models/predict_model.py index e69de29..2a46bd5 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ------------------------------ +# File : predict_model.py +# Created by : Yanis De Busschere, Luka Secilmis, Thomas Ecabert +"""Module/script used to load a model and make predictions on new data.""" + +import logging +import sys + +from transformers import ( + Pipeline, + pipeline, + AutoModelForSequenceClassification, + AutoTokenizer, +) +from src.features.process_dataset import parse_description +from src.utils.file_utils import MODELS_PATH, find_latest_model +from src.utils.init_utils import check_gpu, init_logger + + +def load_model(model_path: str) -> Pipeline: + model = AutoModelForSequenceClassification.from_pretrained(model_path) + model_tokenizer = AutoTokenizer.from_pretrained( + "distilbert-base-multilingual-cased" + ) + classifier = pipeline("text-classification", model=model, tokenizer=model_tokenizer) + return classifier + + +def make_prediction(description: str, model: Pipeline) -> bool: + """Makes a prediction on a description. The description doesn't have to be + preprocessed. + + Args: + description (str): Description to predict. + + Returns: + bool: Prediction. True if the description is spam, False otherwise. 
+ """ + description = parse_description(description) + prediction = model(description, padding=True, truncation=True) + is_spam = prediction[0]["label"] == "LABEL_1" + score = prediction[0]["score"] + logging.info(f'Prediction for "{description}": {is_spam=}, {score=}') + return is_spam + + +if __name__ == "__main__": + init_logger("predict_model") + + if len(sys.argv) != 2: + logging.critical("Usage: predict_model ") + + model_path = find_latest_model() + if model_path is None or not model_path.exists(): + logging.critical(f"Model not found in {MODELS_PATH}.") + sys.exit(1) + + check_gpu() + + model = load_model(model_path) + description = sys.argv[1] + make_prediction(description, model) diff --git a/src/models/train_model.py b/src/models/train_model.py index e69de29..58446ae 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ------------------------------ +# File : train_dataset.py +# Created by : Yanis De Busschere, Luka Secilmis, Thomas Ecabert +"""Module/script used to train a new model on the processed dataset.""" + +import logging + +import datasets +import evaluate +import numpy as np +import pandas as pd +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + Trainer, + TrainingArguments, + logging as transformers_logging, +) + +from src.utils.datasets_utils import load_processed_dataset, split_train_test +from src.utils.file_utils import ( + MODEL_CHECKPOINTS_PATH, + MODELS_PATH, + extract_date, + find_latest_processed_dataset, +) +from src.utils.init_utils import SEED, check_gpu, init_logger, init_seed + + +def tokenize(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame: + """Tokenizes the dataset. + + Args: + train (pd.DataFrame): Train dataset. + test (pd.DataFrame): Test dataset. 
+ + Returns: + pd.DataFrame: Tokenized dataset + """ + logging.info("Tokenizing dataset.") + tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased") + + train = pd.DataFrame( + { + "label": [int(x) for x in train["label"].tolist()], + "text": [str(x) for x in train["description"].tolist()], + } + ) + + test = pd.DataFrame( + { + "label": [int(x) for x in test["label"].tolist()], + "text": [str(x) for x in test["description"].tolist()], + } + ) + + dataset = datasets.DatasetDict( + { + "train": datasets.Dataset.from_dict(train), + "test": datasets.Dataset.from_dict(test), + } + ) + dataset_tokenized = dataset.map( + lambda e: tokenizer(e["text"], padding="max_length", truncation=True), + batched=True, + ) + + logging.info("Tokenization finished.") + return dataset_tokenized + + +def compute_metrics(eval_pred): + """Computes metrics for the model. + + Args: + eval_pred (tuple): Tuple containing the predictions and the labels. + + Returns: + """ + metric = evaluate.combine(["accuracy", "f1"]) + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels, seed=SEED) + + +def train_model( + processed_dataset_path: str, + model_path: str, + model_checkpoints_path: str, + resune_from_checkpoint: bool = False, +) -> None: + """Trains the model and save it. + + Args: + processed_dataset_path (str): Path to the processed dataset. + model_path (str): Path to the model. + model_checkpoints_path (str): Path to the model checkpoints. 
+ """ + + dataset = load_processed_dataset(processed_dataset_path) + train, test = split_train_test(dataset) + + dataset_tokenized = tokenize(train, test) + + model = AutoModelForSequenceClassification.from_pretrained( + "distilbert-base-multilingual-cased", num_labels=2 + ) + + training_args = TrainingArguments( + output_dir=model_checkpoints_path, + overwrite_output_dir=True, + save_total_limit=2, + seed=SEED, + log_level="info", + logging_strategy="steps", + logging_steps=500, + evaluation_strategy="steps", + eval_steps=500, + save_strategy="steps", + save_steps=500, + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset_tokenized["train"], + eval_dataset=dataset_tokenized["test"], + compute_metrics=compute_metrics, + ) + + trainer.train(resume_from_checkpoint=resune_from_checkpoint) + trainer.evaluate() + trainer.save_model(model_path) + + +if __name__ == "__main__": + init_logger("train_model") + transformers_logging.set_verbosity_info() + transformers_logging.disable_default_handler() + transformers_logging.disable_progress_bar() + transformers_logging.add_handler(logging.getLogger()) + + init_seed() + check_gpu() + + processed_dataset_path = find_latest_processed_dataset() + if not processed_dataset_path.exists(): + logging.critical(f"Processed dataset not found at {processed_dataset_path}") + exit(1) + + date = extract_date(processed_dataset_path.name) + + model_path = MODELS_PATH / f"model_msc_{date}" + if model_path.exists(): + # This is critical because we quit the program but we return 0 to ensure + # that the pipeline does not fail entirely. 
+ logging.critical(f"Model already exists at {model_path}") + exit(0) + + model_checkpoints_path = MODEL_CHECKPOINTS_PATH / f"checkpoints_model_msc_{date}" + + train_model(processed_dataset_path, model_path, model_checkpoints_path) diff --git a/src/features/build_features.py b/src/utils/__init__.py similarity index 100% rename from src/features/build_features.py rename to src/utils/__init__.py diff --git a/src/utils/datasets_utils.py b/src/utils/datasets_utils.py new file mode 100644 index 0000000..ecfeb88 --- /dev/null +++ b/src/utils/datasets_utils.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# ------------------------------ +# File : datasets_utils.py +# Created by : Yanis De Busschere, Luka Secilmis, Thomas Ecabert +""" +Module containing utility functions to handle datasets, for example, load +and split. +""" + +from typing import Tuple + +import pandas as pd + +from src.utils.init_utils import SEED + + +def load_processed_dataset(processed_dataset_path: str) -> pd.DataFrame: + """Load the processed dataset. + + Returns: + pd.DataFrame: Preprocessed dataset. + """ + return pd.read_csv(processed_dataset_path) + + +def split_train_test( + dataset: pd.DataFrame, test_size=0.2 +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Split the dataset into train and test. + + Args: + dataset (pd.DataFrame): Dataset to split. + test_size (float, optional): Percentage of the dataset to use for the + test set. + + Returns: + pd.DataFrame, pd.DataFrame: Train and test sets. 
+ """ + dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True) + train = dataset.iloc[: int(len(dataset) * (1 - test_size))] + test = dataset.iloc[int(len(dataset) * (1 - test_size)) :] + return train, test diff --git a/src/utils/file_utils.py b/src/utils/file_utils.py new file mode 100644 index 0000000..e64b5da --- /dev/null +++ b/src/utils/file_utils.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +# ------------------------------ +# File : file_utils.py +# Created by : Yanis De Busschere, Luka Secilmis, Thomas Ecabert +""" +Module containing utility functions and constants to access and find +files in the project. +""" + +import logging +from pathlib import Path +import re + + +ROOT_PATH = Path(__file__).parents[2] + +DATA_PATH = ROOT_PATH / "data" + +RAW_DATASETS_PATH = DATA_PATH / "raw" +RAW_DATASETS_PREFIX = "zenodo_open_metadata_" +RAW_DATASETS_EXTENSION = ".jsonl" + +PROCESSED_DATASETS_PATH = DATA_PATH / "processed" +PROCESSED_DATASETS_PREFIX = RAW_DATASETS_PREFIX + "processed_" +PROCESSED_DATASETS_EXTENSION = ".csv" + + +MODELS_PATH = ROOT_PATH / "models" +MODELS_PREFIX = "model_msc_" +MODELS_EXTENSION = "" + +MODEL_CHECKPOINTS_PATH = MODELS_PATH / "checkpoints" + + +def extract_date(filename: str) -> str | None: + """Extract the date from the filename. + + Args: + filename (str): Filename + + Returns: + str: Date + """ + match = re.match(r".*_(\d{4}-\d{1,2}-\d{1,2}).*", filename) + if match: + return match.group(1) + return None + + +def find_latest_in_directory( + directory: str, prefix: str, extension: str +) -> Path | None: + """Find the latest file in a directory. The latest file is the one with + the latest timestamp in its name. + + Args: + directory (str): Directory where to search for the latest file. + prefix (str): Prefix of the file to search. + extension (str): Extension of the file to search, e.g. ".csv". + + + Returns: + str: Absolute path of the latest file. 
+ """ + logging.debug(f"Searching for latest file in {directory}") + oldest = None + oldest_date = None + for file in directory.iterdir(): + pattern = prefix + r"(\d{4}-\d{1,2}-\d{1,2})" + extension + match = re.match(pattern, file.name) + if match: + logging.debug(f"Found file {file.name}") + date = match.group(1) + if not oldest_date or date < oldest_date: + logging.debug(f"File {file.name} is the oldest so far.") + oldest_date = date + oldest = file + else: + if not file.name.startswith("."): + logging.debug( + f"File {file.name} does not match the pattern: {pattern} but is present in the directory." + ) + if oldest: + return oldest.absolute() + return None + + +def find_latest_raw_dataset() -> str: + """Find the latest raw dataset in the raw datasets directory. + + Returns: + str: Absolute path of the latest raw dataset. + """ + return find_latest_in_directory( + RAW_DATASETS_PATH, RAW_DATASETS_PREFIX, RAW_DATASETS_EXTENSION + ) + + +def find_latest_processed_dataset() -> str: + """Find the latest processed dataset in the processed datasets directory. + + Returns: + str: Absolute path of the latest processed dataset. + """ + return find_latest_in_directory( + PROCESSED_DATASETS_PATH, PROCESSED_DATASETS_PREFIX, PROCESSED_DATASETS_EXTENSION + ) + + +def find_latest_model() -> str: + """Find the latest model in the models directory. + + Returns: + str: Absolute path of the latest model. + """ + return find_latest_in_directory(MODELS_PATH, MODELS_PREFIX, MODELS_EXTENSION) diff --git a/src/utils/init_utils.py b/src/utils/init_utils.py new file mode 100644 index 0000000..bc5d32d --- /dev/null +++ b/src/utils/init_utils.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +# ------------------------------ +# File : init_utils.py +# Created by : Yanis De Busschere, Luka Secilmis, Thomas Ecabert +""" +Module containing utility functions and constants to initialize a scripts. 
+ +This necessary because the project is made of several standalone scripts that +need to be initialized in the same way and need to perform the same checks. +""" + +import datetime +import logging +import random +import sys + +import numpy as np +import torch + +from src.utils.file_utils import ROOT_PATH + +SEED = 42 +LOG_PATH = ROOT_PATH / "logs" + + +def check_gpu() -> bool: + """Check if GPU is available and log it. + + Returns: + bool: True if GPU is available, False otherwise. + """ + if torch.cuda.is_available(): + device = torch.device("cuda") + logging.info(f"GPU is available. Using: {torch.cuda.get_device_name(0)}") + return True + else: + logging.warning("No GPU available. Using the CPU instead.") + return False + + +def init_seed(seed=SEED) -> None: + """Set seed for reproducibility. + Use this function and keep the same seed for all the experiments to ensure + reproducibility. + + Args: + seed (int, optional): Seed value. Defaults to 42. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + logging.info(f'Seed initialized to "{seed}".') + + +def init_logger(file_name: str, level=logging.INFO) -> None: + """Initialize logger. + + Args: + file_name (str): Name of the file where the logs will be stored. 
+ """ + if not LOG_PATH.exists(): + LOG_PATH.mkdir(parents=True) + + file_path = ( + LOG_PATH + / f"{file_name}_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log" + ) + logging.basicConfig( + format="%(asctime)s:%(levelname)s:%(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=level, + handlers=[logging.FileHandler(file_path), logging.StreamHandler()], + ) + + sys.excepthook = lambda *args: logging.critical("Uncaught exception", exc_info=args) From 250bcf82fd4d067e694c46aa385419b5949b935c Mon Sep 17 00:00:00 2001 From: De Busschere Yanis Date: Thu, 15 Dec 2022 14:16:30 +0100 Subject: [PATCH 3/6] Fix train and restart from checkpoints --- README.md | 2 ++ requirements.txt | 5 +++-- src/models/predict_model.py | 3 ++- src/models/train_model.py | 43 +++++++++++++++++++++++++++++-------- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 74651f2..bb0a587 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ The `make train` command will install all the necessary dependencies and run the Note: each of these files can be called as a script (using `make` or manually) or imported as module. As a script, they don't take any parameters, the `process_dataset.py` (resp. `train_model.py`) will automatically search for the latest dataset in `data/raw/` (resp. `data/processed/`) and use it. The latest dataset is found by comparing the date present in the file name. If the data is placed manually in `data/raw` (resp. `data/processed`) it should follow the naming convention, that is, `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl` (resp. `zenodo_open_metadata_processed_YYYY-MM-DD.csv`) to ensure that it is found automatically. +Note: checkpoints are automatically saved in `models/checkpoints/` during training. If there are some checkpoints, the training will automatically resume from there. If you want to start over for some reason, delete them. 
+ To make a prediction on a new record you can proceed in two ways: - Use the `predict_model.py` script: diff --git a/requirements.txt b/requirements.txt index 3df58a3..f73560d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ datasets==2.7.1 evaluate==0.3.0 numpy==1.23.5 pandas==1.5.2 -torch==1.12.1 -transformers==4.25.1 python-dotenv==0.21.0 +scikit-learn==1.1.3 +torch==1.13.0 +transformers==4.25.1 diff --git a/src/models/predict_model.py b/src/models/predict_model.py index 2a46bd5..b819c85 100644 --- a/src/models/predict_model.py +++ b/src/models/predict_model.py @@ -34,6 +34,7 @@ def make_prediction(description: str, model: Pipeline) -> bool: Args: description (str): Description to predict. + model (Pipeline): Model to use for prediction. Returns: bool: Prediction. True if the description is spam, False otherwise. @@ -42,7 +43,7 @@ def make_prediction(description: str, model: Pipeline) -> bool: prediction = model(description, padding=True, truncation=True) is_spam = prediction[0]["label"] == "LABEL_1" score = prediction[0]["score"] - logging.info(f'Prediction for "{description}": {is_spam=}, {score=}') + logging.debug(f'Prediction for "{description}": {is_spam=}, {score=}') return is_spam diff --git a/src/models/train_model.py b/src/models/train_model.py index 58446ae..2c30905 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -6,6 +6,8 @@ """Module/script used to train a new model on the processed dataset.""" import logging +from pathlib import Path +import shutil import datasets import evaluate @@ -82,14 +84,13 @@ def compute_metrics(eval_pred): metric = evaluate.combine(["accuracy", "f1"]) logits, labels = eval_pred predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels, seed=SEED) + results = metric.compute(predictions=predictions, references=labels, seed=SEED) + logging.info(f"Metrics: {results}") + return results def train_model( - processed_dataset_path: str, - 
model_path: str, - model_checkpoints_path: str, - resune_from_checkpoint: bool = False, + processed_dataset_path: Path, model_path: Path, model_checkpoints_path: Path ) -> None: """Trains the model and save it. @@ -115,11 +116,11 @@ def train_model( seed=SEED, log_level="info", logging_strategy="steps", - logging_steps=500, + logging_steps=2500, evaluation_strategy="steps", - eval_steps=500, + eval_steps=2500, save_strategy="steps", - save_steps=500, + save_steps=2500, ) trainer = Trainer( @@ -130,9 +131,31 @@ def train_model( compute_metrics=compute_metrics, ) - trainer.train(resume_from_checkpoint=resune_from_checkpoint) + if ( + model_checkpoints_path.exists() + and len(list(model_checkpoints_path.iterdir())) > 0 + ): + logging.info( + f"Files found in {model_checkpoints_path}, trying to resume training." + ) + resume_from_checkpoint = True + else: + logging.info( + f"No files found in {model_checkpoints_path}, starting training from scratch." + ) + resume_from_checkpoint = False + + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + logging.info("Training finished.") + trainer.evaluate() + trainer.save_model(model_path) + logging.info(f"Model saved in {model_path}.") + + logging.info(f"Deleting checkpoints in {model_checkpoints_path}.") + shutil.rmtree(model_checkpoints_path) + logging.info(f"Deleted {model_checkpoints_path}.") if __name__ == "__main__": @@ -141,6 +164,8 @@ def train_model( transformers_logging.disable_default_handler() transformers_logging.disable_progress_bar() transformers_logging.add_handler(logging.getLogger()) + evaluate.logging.set_verbosity_info() + evaluate.logging.get_logger().addHandler(logging.getLogger()) init_seed() check_gpu() From 6130d89d3e54d12b28fa6a65b5b93d92cc9f94ea Mon Sep 17 00:00:00 2001 From: De Busschere Yanis Date: Thu, 15 Dec 2022 15:49:08 +0100 Subject: [PATCH 4/6] Add visualization --- Makefile | 4 + README.md | 6 +- reports/report_2020-10-19.md | 103 +++++++++++++++++++ src/utils/file_utils.py | 2 + 
src/visualization/visualize.py | 176 +++++++++++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 1 deletion(-) create mode 100644 reports/report_2020-10-19.md diff --git a/Makefile b/Makefile index 2b1ae36..c8d3210 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,10 @@ process: data train: process export PYTHONPATH=$(PROJECT_DIR) && $(PYTHON_INTERPRETER) src/models/train_model.py +## Visualize model +visualize: train + export PYTHONPATH=$(PROJECT_DIR) && $(PYTHON_INTERPRETER) src/visualization/visualize.py $(N) + ## Delete all compiled Python files clean: find . -type f -name "*.py[co]" -delete diff --git a/README.md b/README.md index bb0a587..df469fd 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,11 @@ To make a prediction on a new record you can proceed in two ways: To visualize the results of the model: -> TODO +```bash +make visualize +``` + +This will generate a `report_YYYY-MM-DD.md` file in the `reports/`. To make the generate faster you can use `make visualize N=1000` to compute the results on only 1000 tests samples. To compare with older models: diff --git a/reports/report_2020-10-19.md b/reports/report_2020-10-19.md new file mode 100644 index 0000000..44f8898 --- /dev/null +++ b/reports/report_2020-10-19.md @@ -0,0 +1,103 @@ +# Report for model model_msc_2020-10-19 + +## Predictions + - Number of examples: 10000 + - Number of spam labels: 4198 + - Number of ham examples: 5802 + - Average prediction time: 0.22s + +## Confusion matrix +| | Pred. Ham | Pred. Spam | +|:------------|------------:|-------------:| +| Actual Ham | 5751 | 51 | +| Actual Spam | 77 | 4121 | + +| | Pred. Ham | Pred. Spam | +|:------------|------------:|-------------:| +| Actual Ham | 0.99121 | 0.00879007 | +| Actual Spam | 0.0183421 | 0.981658 | + +## Examples of False Positive + +Samples that were classified as spam but are actually ham. 
+ +``` +Please see the PDF +``` +``` +VivoLab Sys 1 +``` +``` +Magnesium gluconate is a pharmaceutical/nutraceutical compound used as a source of magnesium ion. The recent study described the impact of The Trivedi Effect®-Energy of Consciousness Healing Treatment on magnesium gluconate for the variation in physicochemical, structural, thermal and behavioral properties using PXRD, PSD, FT-IR, UV-vis spectroscopy, TGA, and DSC analysis. Magnesium gluconate was divided into two parts – one part was control without any Biofield Energy Treatment, while another part was treated with The Trivedi Effect®-Energy of Consciousness Healing Treatment remotely by twenty renowned Biofield Energy Healers and defined as The Trivedi Effect® Treated sample. The PXRD analysis exhibited that the crystallite size of the treated sample was remarkably altered from -70% to 130% compared with the control sample. The average crystallite size was significantly decreased by 23.74% in the treated sample compared to the control sample. Biofield Energy Healing Treatment significantly reduced the particle size of magnesium gluconate at d10, d50, and d90 values by 12.15%, 8.98% and 15.35%, respectively compared to the control sample. The surface area analysis showed that surface area of the treated sample was significantly increased by 11.76% compared with the control sample. The FT-IR and UV-vis analysis displayed that structure of the magnesium gluconate persisted identical in both the treated and control samples. The TGA analysis exhibited four steps thermal degradation in both samples and the total weight loss of the Biofield Energy Treated sample was reduced by 0.19% compared with the control sample. The melting temperature of the Biofield Energy Treated sample (171.25ºC) was slightly (0.16%) higher from the control sample (170.97ºC). The latent heat of fusion was significantly decreased by 7.76% in the treated sample compared to the control sample. 
The TGA and DSC analysis revealed that the thermal stability of the treated sample was enhanced compared with the control sample. The current study revealed that The Trivedi Effect®-Energy of Consciousness Healing Treatment might produce a new polymorphic form of magnesium gluconate, which could be more soluble and bioavailable along with improved thermal stability compared with the untreated compound. The Biofield Treated sample could be more stable during manufacturing, delivery or storage conditions than the untreated sample. Hence, The Trivedi Effect® Treated magnesium gluconate would be very useful to design better nutraceutical/pharmaceutical formulations that might offer better therapeutic responses against inflammatory diseases, immunological disorders, stress, aging, and other chronic infections. +  +``` +``` +Sebagian karakter yang diterangkan dalam syariat Islam adalah karakter kemandirian, kemandirian dalam mempertanggungjawabkan prilaku dan perbuatannya di hadapan Allah kemudian di hadapan hukum dan perundang-undangan yang berlaku dimana seseorang berada. Kemandirian ini diberlakukan mulai dari seorang anak sampai pada status akil baligh. Ditinjau dari perspektif pendidikan, bahwa masa akil baligh adalah masa ia telah mendapatkan bekal pemahaman yang cukup untuk bekal ia sebagai seorang muslim yang mulai berdiri sendiri dengan tanggungjawab personal dihadapan Tuhannya. Masa Baligh adalah sebuah fase baru dalam kehidupan setiap muslim yang harus dihadapi oleh mereka dengan bekal karakter kemandirian yang memadahi, dan karakter kemandirian ini ternyata harus by design disebabkan tuntutan umur, yaitu minimal pada umur 15 tahun, setiap pribadi muslim sudah mempunyai kemampuan yang menjadikan mereka berdiri sendiri dalam mempertanggungjawabkan semua sikap, tindakan dan prilakunya. Berangkat dari hal di atas, pembentukan karakter kemandirian pada seorang anak adalah tanggungjawab orangtua dan pemerintah melalui kurikulum pendidikan. 
Metode penulisan yang digunakan adalah kajian pustaka Dr. Yusuf Al Qaradhawi terhadap Al Quran dan Sunnah terkait dengan aktivitas usaha yang bisa di lakukan seseorang, sehingga dari tulisan ini diharapkan dapat memudahkan dalam upaya membuat panduan program dan muatan kemandirian dalam kurikulum pendidikan kedepannya. +``` +``` +tbc +``` +``` +Asırlar boyu ilim ve medeniyet alanında insanlıga çok büyük hizmetler sunmuş olan Türk milleti. son derece zengin bir ilim ve kültür mirasına sahiptir. Yaşadığı her devirde ilmiyle. ırfanıyla, düşünce ve yaşantısıyla insanlığa örnek olmasını bilen şerefli milletimiz İslamtyet'le müşerref olduktan sonra bu özellıklerıni daha da kuvvetlendirmiş ve asırlarca gerçek medeniyetin tımsali olmuştur. O. bu özel\iklere sahip oluşunu, hiç şüphezis elinde bulundurdugu sayısiZ uM degerlere borçludur. +``` +``` +Данная статья посвящена вопросам необходимости реформирования нормативно-правовой базы, регламентирующей деятельность некоммерческих организаций, а также отдельным недочетам в существующем законодательстве. +This paper focuses on the need to reform the legislation, governing the activities of non–profit organizations, as well as some shortcomings in existing legislation. +  +``` +``` +test this function! +``` +``` +While moving between assets from Planet Inc and Google Earth Engine it was imperative to create a pipeline that allows for easy transitions between the two service endpoints and this tool is designed to act as a step by step process chain from Planet Assets to batch upload and modification within the Google Earth Engine environment. The ambition is apart from helping user with batch actions on assets along with interacting and extending capabilities of existing GEE CLI. It is developed case by case basis to include more features in the future as it becomes available or as need arises. 
+This release also contains a windows installer which bypasses the need for you to have admin permission, it does, however, require you to have python in the system path meaning when you open up command prompt you should be able to type python and start it within the command prompt window. Post-installation using the installer you can just call ppipe using the command prompt similar to calling python. Give it a go post installation type +ppipe -h +``` +``` +Market Value of Share of a company is determined by various factors. So many studies were conducted in finding the determinants of share price. Some of the studies found that Capital Structure is a determinant of the market price of the share and some of the studies found that Earning per Share is a determinant of the market price of the share. Even some studies proved that Capital Structure is also determining the Earning per Share. With this ideology an attempt was made in this study by using Structural equation modeling to see how for the Capital Structure of the firm has got a direct and indirect effect over the Market value of share. Debt to Total Assets, Equity to Total Assets, Coverage Ratio, Earning per share and Market price of share of eleven Indian public sector banks were taken for the study for five years from March 2013 to March 2017. +``` + +## Examples of False Negative + +Samples that were classified as ham but are actually spam. + +``` +taratata +``` +``` +dslhsdkhdskhd +``` +``` +Withania somnifera (Ashwagandha) root extract is very popular ancient herbal medicine. The objective of the study was to characterize and evaluate the impact of The Trivedi Effect® - Energy of Consciousness Healing Treatment (Biofield Energy Healing) on phytoconstituents present in the ashwagandha root extract using GC-MS and NMR. Ashwagandha root extract was divided into two parts. 
One part was denoted as the control, while the other part was defined as The Trivedi Effect® - Biofield Energy Treated sample, which received The Trivedi Effect® - Energy of Consciousness Healing Treatment remotely from eighteen renowned Biofield Energy Healers. The GC-MS data indicated that the peak height and peak area of The Trivedi Effect® treated sample were found to be altered compared with the control sample. The peak height of the phytoconstituents present in the treated ashwagandha sample was altered significantly in the range of -8.32% to 89.25% compared with the control sample. Similarly, the peak area of the treated sample was altered significantly in the range of - 4.28% to 216.30% compared with the control sample. Overall, the change in the peak area% of the treated sample was significantly altered in the range of -18.29% to 170.18% compared with the control sample. The GC-MS and NMR analysis results identified the presence of withanolides such as glyco-withanolides, alkaloids, and sugars in the root extract in both the sample. The peak area of 2,3,4,5-tetrahydropyridazine (1), methyl ethyl sulfoxide (2), 5,6-dihydro-2-methyl-4(H)pyran-3,4-dione (4), diethoxy-2-methyl-propane (5), 2,3,4,5-tetrahydroxy-tetrahydro-pyran (6), and 3,4-dimethyl-2(3H)-furanone (7) were significantly increased by 170.18%, 58.21%, 7.74%, 139.50%, 23.16%, and 45.63%, respectively in the treated sample compared with the control sample. On the contrary, the peak area% of 2-hydroxy-γ-butyrolactone (3) was decreased by - 14.96% in the treated ashwagandha compared with the control sample. From the results, it can be hypothesized that The Trivedi Effect® - Biofield Energy Treatment might have the impact on the intrinsic physicochemical properties of the phytoconstituents present in the ashwagandha root extract and responsible for the alteration in the relative peak height/area of treated sample compared with the control sample. 
As a result, the concentrations of the phytoconstituents assumed to be increased in treated sample compared with the control sample. This treated ashwagandha root extract would be helpful for designing better nutraceutical/pharmaceutical formulations which might be providing a better therapeutic response against autoimmune diseases, nervous and sexual disorders, infectious diseases, antiaging, diabetes, cancer, immunological disorders, stress, arthritis, etc. +Source: https://www.trivedieffect.com/the-science/biotech/publication/healers-science/nutraceuticals/effect-of-the-energy-of-consciousness-the-trivedi-effect-on-withania-somnifera-root-extract-using-gas-chromatography-mass-spectrometry-gc-ms-and-nuclear-magnetic-resonance-nmr/ +http://www.sciencepublishinggroup.com/journal/paperinfo?journalid=320&doi=10.11648/j.jdmp.20170302.11 +``` +``` +Men who have sex with men (MSM) who use crystal methamphetamine (CM) are at increased risk for HIV infection. Post-exposure prophylaxis (PEP) is a useful HIV prevention strategy if individuals can identify high-risk exposures and seek timely care, however, to date there has been limited data on the use of PEP by CM users. • Few studies have investigated biomedical HIV prevention interventions for men who have sex with men who use crystal methamphetamine (CM). • This study investigated the use of CM by MSM who sought post-exposure prophylaxis (PEP). • MSM who use CM had higher risk exposures during the event that led to seeking PEP. • MSM who use CM also had greater recurrent exposure to HIV following PEP use. • HIV prevention interventions that address both recurrent HIV exposure and substance use may be particularly effective +``` +``` +Herbomineral formulations have increased in recognition and popularity due to their high safety and better therapeutic action. A new proprietary herbomineral formulation was formulated with a mixture of the herbal root extract of ashwagandha and three minerals viz. 
zinc, magnesium, and selenium. The aim of the study was to evaluate the immunomodulatory potential of Biofield Energy Healing (The Trivedi Effect®) on the test formulation when applied to splenocyte cells isolated from the Biofield Treated mice. The test formulation was divided into two parts. One part was denoted as the control without any Biofield Energy Treatment. The other part was defined as the Biofield Energy Treated sample, which received the Biofield Energy Healing Treatment remotely by seven renowned Biofield Energy Healers. A wide concentration range (0.00001053 to 10.53 µg/mL) of the test formulation was used to determine non-cytotoxic concentrations using MTT assay. Further, the expression of pro-inflammatory cytokines (TNF-α, MIP-1α, and IL-1β) was determined by ELISA method. The test formulation was evaluated and found to be safe up to 1.053 µg/mL with a percentage cell viability range of 73% to 97% using MTT assay. The Biofield Treated formulation improved the cell viability up to 6.61% compared with the untreated test formulation. TNF-α expression was significantly inhibited by 16.72% at 0.1053 µg/mL compared with the untreated test formulation, however expression was significantly altered by 53.67% and 25.62% at 0.01053 and 1.053 µg/mL, respectively compared to the untreated test formulation. TNF-α expression was also suppressed in the Biofield Treated test formulation at 0.001053 and 0.1053 µg/mL by 4.0% and 8.56%, respectively as compared with the vehicle control. MIP-1α suppression was reported in the Biofield Treated test formulation at 0.00001053 to 1.053 µg/mL by 8.43%, 22.02%, 21.92%, 20.54%, 5.40%, and 19.82%, respectively compared with the vehicle control. However, the Biofield Treated formulation further exhibited substantial suppression of MIP-1α at 0.0001053, 0.001053, 0.01053, and 0.1053 µg/mL by 13.50%, 7.38%, 36.83% (p≤0.001), and 2.53%, respectively compared with the untreated test formulation. 
In addition, significant inhibition of IL-1β secretion was reported in the Biofield Treated formulation at 0.0001053, 0.001053, 0.01053, and 0.01053 µg/mL by 32.40%, 14.99%, 60.42%, and 15.15%, respectively compared with the untreated test formulation. The Biofield Energy Healing Treatment significantly potentiated the immunosuppressive effect of the test formulation in Biofield Treated mouse splenocytes, which can be used for autoimmune and inflammatory diseases, stress management and anti-aging by improving overall health. +Source: https://www.trivedieffect.com/the-science/publications/nutraceuticals-publications/an-impact-of-the-trivedi-effect-biofield-energy-healing-on-herbomineral-formulation-for-immunomodulation-of-pro-inflammatory-cytokines-in-biofield-treated-mouse-splenocytes/ +http://www.sciencepublishinggroup.com/journal/paperinfo?journalid=110&doi=10.11648/j.ab.20160406.12 +``` +``` +Cryptocurrency Quasi cooperative refers to a condition when the cooperative is not a cooperative. The legal cooperative is bound to the basic principles of cooperatives and the prevailing laws and regulations. Thus it is quite easy to find and recognize a quasi cooperative. If a cooperative does not implement the basic principles of cooperatives and not in accordance with applicable legislation, then the cooperative can be categorized as a quasi cooperative. The inability of the cooperative as a self-help organization is partly due to the non involvement of members in the decision making process and the preparation of cooperative programs. Member participation is only taken into account when attending Annual Members Meeting (RAT), paying cooperative deposits, and making transactions with cooperatives. The implementation of cooperative activities and businesses is more dominated by the management and managers who seem to know what is best for the cooperative. 
Currently there are many cooperative management patterns applied in almost all companies including some state owned enterprises that have applied the concept of profit sharing and employee participation, namely moving the participation of workers/employees in corporate decision making, especially related to the principle of profit and risk sharing. The purpose of this study is to reduce the negative impact of quasi cooperatives that greatly make big losses for the community as members of the cooperative who trust the pattern of management. This study uses Literature study methods collected from previous studies on the pattern of cooperative management and the influence of trust members of the cooperative on the development of cooperatives in Indonesia. The results of this study indicate that many quasi cooperatives that flourish in Indonesia develops and this is due to the level of trust members of the cooperative against the quasi cooperative. It is hoped that this research will be one of the sources of comparison for all components of cooperative managers who have been given trust by members, in order not to run things that mengarhkan cooperatives managed to be a quasi cooperative, because of course this will be bad for the development of cooperatives in Indonesia. +``` +``` +ABSTRACT +The study aimed to identify deterioration problems, repair and conservation needs of andesites on the walls of the Ankara Castle. Decay forms of walls were documented by visual examination. Samples taken from the surface of the weathered andesites were examined for their basic physical, mechanical compositional and minerological properties. The bulk density and total porosity were determined as basic physical properties. The mechanical properties were expressed as ultrasonic velocity and modulus of elasticity (Emod). Compositional and mineralogical properties were determined by optical microscopy and XRD analyses. 
Soluble salt content of the andesite samples was determined by spot tests of anions and electrical conductivity measurements. Findings were evaluated in terms of the long-term weathering behaviour of andesites under the effect of the prevailing climate, air pollution problems of Ankara, dampness problems of the structure, previous repairs with incompatible cement mortars. The surfaces of Ankara Castle andesite blocks were heavily weathered. The results were compared with the physical and mechanical properties of fresh andesites from Gölbaşı- Ankara quarry. The surface of the andesite blocks at the Ankara Castle, had low bulk density and high porosity, low ultrasonic velocity and low Emod values. Thin section and XRD analyses supported those results by revealing the presence of physical and chemical weathering on feldspars and other main minerals of andesite, as well as the presence of amorphous minerals at the surface. +ÖZ +``` +``` +A proprietary herbomineral formulation was formulated with four ingredients; a mixture of the minerals (zinc, magnesium, and selenium) and the herbal root extract ashwagandha. The aim of the study was to evaluate the immunomodulatory potential of Biofield Energy Healing (The Trivedi Effect®) on the herbomineral formulation in splenocyte cells, which were isolated from Biofield Treated mice. The test formulation was divided into two parts. One part was denoted as the control without any Biofield Energy Treatment, while the other part was defined as the Biofield Energy Treated sample, which received the Biofield Energy Healing Treatment remotely from seven renowned Biofield Energy Healers. The splenocyte cells were treated with the test formulation at concentrations ranges from 0.00001053 to 10.53 µg/mL and analyzed after 48 hours of treatment by MTT assay. The cell viability data showed safe concentrations up to 1.053 µg/mL with viability ranges from 69.22% to 123.88% in the test formulation groups. 
The expression of TNF-α was decreased by 4.82% at 1.053 µg/mL in the Biofield Energy Treated test formulation compared with the vehicle control. The level of TNF-α was significantly decreased by 2.02%, 4.92%, and 18.78% at 0.00001053, 0.001053, and 1.053 µg/mL, respectively in the Biofield Energy Treated test formulation group as compared to the untreated test formulation. The expression of IL-1β was significantly reduced by 83.65%, 92.15%, 27.30%, and 41.88% at 0.00001053, 0.0001053, 0.001053, and 1.053 µg/mL, respectively in the Biofield Energy Treated test formulation compared with the vehicle control. The Biofield Treated test formulation showed significant reduction of IL-1β by 17.26%, 92.61% (p≤0.001), 34.62% (p≤0.05), and 16.13% at 0.00001053, 0.0001053, 0.001053, and 1.053 µg/mL, respectively compared with the untreated test formulation. Additionally, the expression of chemokine MIP-1α was significantly reduced by 17.03%, 10.99%, 22.33%, 24.21%, 21.61%, and 30.67% at 0.00001053, 0.0001053, 0.001053, 0.01053, 0.1053, and 1.053 µg/mL, respectively in the Biofield Treated test formulation compared with the vehicle control. The MIP-1α expression was significantly reduced by 19.32% and 12.56% at 0.01053 and 0.1053 µg/mL, respectively in the Biofield Treated test formulation compared with the untreated test formulation. The overall results demonstrated that the Biofield Energy Treated test formulation significantly down-regulated the expression of TNF-α, IL-1β, and MIP-1α in the Biofield Treated mice splenocyte cells compared to the untreated test formulation. These data suggest that the Biofield Treated test formulation can be used for autoimmune and inflammatory diseases, stress management and anti-aging by improving overall health. 
+Source: https://www.trivedieffect.com/the-science/publications/nutraceuticals-publications/effect-of-biofield-energy-healing-based-herbomineral-formulation-on-pro-inflammatory-cytokines-expression-in-biofield-treated-mouse-splenocyte-cells-impact-of-the-trivedi-effect/ +http://www.sciencepublishinggroup.com/journal/paperinfo?journalid=219&doi=10.11648/j.ajbio.20160406.11 +``` +``` +Drug discovery and development involve the utilization of in vitro and in vivo experimental models. Different models, ranging +from test tube experiments to cell cultures, animals, healthy human subjects, and even small numbers of patients that are +involved in clinical trials, are used at different stages of drug discovery and development for determination of efficacy and +``` +``` +As per the latest research citings of National Cancer Institute, in 2016 there were approximately 15.5 million cancer survivors due to early intervention of chemotherapy. Business analysts predict the rise in survivors to 20.3 million by 2030. +``` diff --git a/src/utils/file_utils.py b/src/utils/file_utils.py index e64b5da..9fd8215 100644 --- a/src/utils/file_utils.py +++ b/src/utils/file_utils.py @@ -31,6 +31,8 @@ MODEL_CHECKPOINTS_PATH = MODELS_PATH / "checkpoints" +REPORTS_PATH = ROOT_PATH / "reports" + def extract_date(filename: str) -> str | None: """Extract the date from the filename. 
diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py index e69de29..7c67a62 100644 --- a/src/visualization/visualize.py +++ b/src/visualization/visualize.py @@ -0,0 +1,176 @@ +import time +import logging +from pathlib import Path +import pandas as pd +import sys + +from sklearn.metrics import confusion_matrix + +from src.utils.init_utils import init_logger, init_seed +from src.utils.file_utils import ( + REPORTS_PATH, + MODELS_PATH, + extract_date, + find_latest_model, + PROCESSED_DATASETS_PATH, + PROCESSED_DATASETS_EXTENSION, + PROCESSED_DATASETS_PREFIX, +) +from src.models.predict_model import load_model, make_prediction +from src.utils.datasets_utils import load_processed_dataset, split_train_test +from src.features.process_dataset import parse_description + + +PDF_WIDTH = 210 +PDF_HEIGHT = 297 +NUM_FP_TO_SHOW = 10 +NUM_FN_TO_SHOW = 10 +MAX_LINES_TO_SHOW = 3 + + +def make_predictions_with_time(model, expected_df): + total_time = 0 + count = 0 + predictions_df = pd.DataFrame( + { + "description": pd.Series(dtype=str), + "label": pd.Series(dtype=bool), + } + ) + for _, row in expected_df.iterrows(): + start_time = time.time() + prediction = make_prediction(row["description"], model) + prediction_time = time.time() - start_time + total_time += prediction_time + predictions_df = pd.concat( + [ + predictions_df, + pd.DataFrame( + { + "description": [row["description"]], + "label": [prediction], + } + ), + ] + ) + + count += 1 + logging.info( + f"Example {count} done in {prediction_time:.4f} seconds. Average time: {total_time / (count):.4f} seconds." + ) + predictions_df.reset_index(drop=True, inplace=True) + return predictions_df, total_time / count + + +def generate_report(model_path: Path, N=None): + """Generate a report for a model. + + Args: + model_path (Path): Path to the model. + N (int, optional): Number of examples to use for the report. None means + all examples. Defaults to None. 
+ """ + logging.info(f"Generating report for model {model_path} and N={N}.") + model = load_model(model_path) + + date = extract_date(model_path.name) + processed_dataset_path = ( + PROCESSED_DATASETS_PATH + / f"{PROCESSED_DATASETS_PREFIX}{date}{PROCESSED_DATASETS_EXTENSION}" + ) + if not processed_dataset_path.exists(): + logging.critical(f"Processed dataset {processed_dataset_path} not found.") + sys.exit(1) + + expected_df = split_train_test(load_processed_dataset(processed_dataset_path)) + expected_df = expected_df[1].dropna() + expected_df = expected_df[:N] + expected_df["label"] = expected_df["label"].astype(bool) + expected_df.reset_index(drop=True, inplace=True) + + predictions_df, average_time = make_predictions_with_time(model, expected_df) + + report_path = REPORTS_PATH / f"report_{date}.md" + logging.info(f"Saving report to {report_path}.") + if report_path.exists(): + logging.warning(f"Report {report_path} already exists. Overwriting.") + + with open(report_path, "w") as f: + f.write(f"# Report for model {model_path.name}\n") + + f.write("\n") + f.write("## Predictions") + f.write("\n") + f.write(f" - Number of examples: {len(expected_df)}\n") + f.write(f" - Number of spam labels: {expected_df['label'].sum()}\n") + f.write(f" - Number of ham examples: {len(expected_df) - expected_df['label'].sum()}\n") + f.write(f" - Average prediction time: {average_time:.2f}s\n") + + f.write("\n") + f.write("## Confusion matrix") + f.write("\n") + confusion_matrix_df = pd.DataFrame( + confusion_matrix(expected_df["label"], predictions_df["label"]), + columns=["Pred. Ham", "Pred. Spam"], + index=["Actual Ham", "Actual Spam"], + ) + f.write(confusion_matrix_df.to_markdown()) + + f.write("\n\n") + confusion_matrix_df = pd.DataFrame( + confusion_matrix( + expected_df["label"], predictions_df["label"], normalize="true" + ), + columns=["Pred. Ham", "Pred. 
Spam"], + index=["Actual Ham", "Actual Spam"], + ) + f.write(confusion_matrix_df.to_markdown()) + + f.write("\n\n") + f.write("## Examples of False Positive\n") + f.write("\n") + f.write("Samples that were classified as spam but are actually ham.\n") + f.write("\n") + count = 0 + for _, row in predictions_df.iterrows(): + if count >= NUM_FP_TO_SHOW: + break + if row["label"] and not expected_df.iloc[row.name]["label"]: + count += 1 + f.write("```\n") + split_lines = parse_description(row["description"]).split("\n") + for line in split_lines[:MAX_LINES_TO_SHOW]: + f.write(line + "\n") + f.write("```\n") + + f.write("\n") + f.write("## Examples of False Negative\n") + f.write("\n") + f.write("Samples that were classified as ham but are actually spam.\n") + f.write("\n") + count = 0 + for _, row in predictions_df.iterrows(): + if count >= NUM_FN_TO_SHOW: + break + if not row["label"] and expected_df.iloc[row.name]["label"]: + count += 1 + f.write("```\n") + split_lines = parse_description(row["description"]).split("\n") + for line in split_lines[:MAX_LINES_TO_SHOW]: + f.write(line + "\n") + f.write("```\n") + logging.info(f"Report saved to {report_path}.") + +if __name__ == "__main__": + init_logger("visualize") + init_seed() + + if len(sys.argv) > 2: + logging.critical("Usage: python visualize.py []") + sys.exit(1) + + N = None + if len(sys.argv) == 2: + N = int(sys.argv[1]) + + generate_report(find_latest_model(), N=N) From 536f3dfbdad00576ac92796f0bbeb8b974151b5f Mon Sep 17 00:00:00 2001 From: De Busschere Yanis Date: Wed, 21 Dec 2022 08:55:53 +0100 Subject: [PATCH 5/6] Add missing scripts for experiments --- .../cs433/en-spam-classifier/feat-eng-esc.py | 39 +++++++++++++++++++ .../multi-spam-classifier/feat-eng-msc.py | 24 ++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 experiments/cs433/en-spam-classifier/feat-eng-esc.py create mode 100644 experiments/cs433/multi-spam-classifier/feat-eng-msc.py diff --git 
a/experiments/cs433/en-spam-classifier/feat-eng-esc.py b/experiments/cs433/en-spam-classifier/feat-eng-esc.py new file mode 100644 index 0000000..c28c097 --- /dev/null +++ b/experiments/cs433/en-spam-classifier/feat-eng-esc.py @@ -0,0 +1,39 @@ +import pandas as pd +from bs4 import BeautifulSoup +from ftlangdetect import detect +import re + +KEEP = ['description', 'spam'] +SPAMS = pd.DataFrame() +HAMS = pd.DataFrame() +CLEANING_REGEX = re.compile(r'[^a-zA-Z0-9\s]', re.MULTILINE) + +def detect_lang(descr): + descr = CLEANING_REGEX.sub('', descr) + descr = descr.replace('\r', ' ').replace('\n', ' ') + lang = detect(descr)['lang'] + return lang + +for chunk in pd.read_json('zenodo_open_metadata_2020-10-19.jsonl', lines=True, chunksize=100000): + chunk = chunk[KEEP].dropna() + + chunk_spams = chunk[chunk['spam'] == True] + chunk_spams['description'] = chunk_spams['description'].map(lambda x: BeautifulSoup(x, 'html.parser').get_text()) + chunk_spams['lang'] = chunk_spams['description'].map(lambda x: detect_lang(x) if not pd.isna(x) else None).dropna() + chunk_spams = chunk_spams[chunk_spams['lang'] == 'en'] + chunk_spams = chunk_spams.drop(columns=['lang']) + chunk_spams['spam'] = chunk_spams['spam'].map(lambda x: 1) + SPAMS = pd.concat([SPAMS, chunk_spams]) + + chunk_hams = chunk[chunk['spam'] == False] + chunk_hams['description'] = chunk_hams['description'].map(lambda x: BeautifulSoup(x, 'html.parser').get_text()) + chunk_hams['lang'] = chunk_hams['description'].map(lambda x: detect_lang(x) if not pd.isna(x) else None).dropna() + chunk_hams = chunk_hams[chunk_hams['lang'] == 'en'] + chunk_hams = chunk_hams.drop(columns=['lang']) + chunk_hams['spam'] = chunk_hams['spam'].map(lambda x: 0) + HAMS = pd.concat([HAMS, chunk_hams]) + +HAMS = HAMS.sample(n= 2*len(SPAMS)) +df = pd.concat([SPAMS, HAMS]).rename(columns={'spam': 'label'}) + +df.to_csv('dataset-esc.csv', index=False) \ No newline at end of file diff --git a/experiments/cs433/multi-spam-classifier/feat-eng-msc.py 
b/experiments/cs433/multi-spam-classifier/feat-eng-msc.py new file mode 100644 index 0000000..1bdba2c --- /dev/null +++ b/experiments/cs433/multi-spam-classifier/feat-eng-msc.py @@ -0,0 +1,24 @@ +import pandas as pd +from bs4 import BeautifulSoup + +KEEP = ['description', 'spam'] +SPAMS = pd.DataFrame() +HAMS = pd.DataFrame() + +for chunk in pd.read_json('zenodo_open_metadata_2020-10-19.jsonl', lines=True, chunksize=100000): + chunk = chunk[KEEP].dropna() + + chunk_spams = chunk[chunk['spam'] == True] + chunk_spams['description'] = chunk_spams['description'].map(lambda x: BeautifulSoup(x, 'html.parser').get_text()) + chunk_spams['spam'] = chunk_spams['spam'].map(lambda x: 1) + SPAMS = pd.concat([SPAMS, chunk_spams]) + + chunk_hams = chunk[chunk['spam'] == False] + chunk_hams['description'] = chunk_hams['description'].map(lambda x: BeautifulSoup(x, 'html.parser').get_text()) + chunk_hams['spam'] = chunk_hams['spam'].map(lambda x: 0) + HAMS = pd.concat([HAMS, chunk_hams]) + +HAMS = HAMS.sample(n= 2*len(SPAMS)) +df = pd.concat([SPAMS, HAMS]).rename(columns={'spam': 'label'}) + +df.to_csv('dataset-msc.csv', index=False) \ No newline at end of file From 4bf2a8e4b1bd2c65b132198469e470397203f9e1 Mon Sep 17 00:00:00 2001 From: De Busschere Yanis Date: Tue, 17 Jan 2023 18:39:58 +0100 Subject: [PATCH 6/6] Few fixes that we forgot to push --- README.md | 4 +++- requirements.txt | 2 +- src/features/process_dataset.py | 2 +- src/utils/datasets_utils.py | 25 ++++++++++++++++++++----- src/utils/file_utils.py | 2 +- src/visualization/visualize.py | 2 +- 6 files changed, 27 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index df469fd..1c8aeb1 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ Note: each of these files can be called as a script (using `make` or manually) o Note: checkpoints are automatically saved in `models/checkpoints/` during training. If there are some checkpoints, the training will automatically resume from there. 
If you want to start over for some reason, delete them. +**Note: the `make_dataset.py` script is not ready yet. In the future, it will effectively dump the Zenodo dataset in `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl` but will remain unavailable for generic users. You should download the dataset from [this Zenodo record](https://zenodo.org/record/7438358/files/zenodo_open_metadata_2022-12-14.jsonl.gz?download=1). The results given in our report used the Zenodo dataset from 2020/10/21.** + To make a prediction on a new record you can proceed in two ways: - Use the `predict_model.py` script: @@ -44,7 +46,7 @@ To make a prediction on a new record you can proceed in two ways: make_prediction(model, "Some description of the record that is not preprocess (but can be)") ``` -To visualize the results of the model: +To visualize the results of the model, i.e., see its performance on the test set, you can use the `visualize_results.py` script: ```bash make visualize diff --git a/requirements.txt b/requirements.txt index f73560d..f97fa64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,5 @@ numpy==1.23.5 pandas==1.5.2 python-dotenv==0.21.0 scikit-learn==1.1.3 -torch==1.13.0 +torch==1.13.1 transformers==4.25.1 diff --git a/src/features/process_dataset.py b/src/features/process_dataset.py index 3a8d5ea..d89fcfe 100644 --- a/src/features/process_dataset.py +++ b/src/features/process_dataset.py @@ -81,7 +81,7 @@ def process_dataset(raw_dataset_path: str, processed_dataset_path: str) -> None: hams = hams.sample(n=2 * len(spams), random_state=SEED) df = pd.concat([spams, hams]).rename(columns={"spam": "label"}) - df.to_csv(processed_dataset_path, index=False) + df.to_pickle(processed_dataset_path) logging.info(f'Dataset processed and saved to "{processed_dataset_path}".') diff --git a/src/utils/datasets_utils.py b/src/utils/datasets_utils.py index ecfeb88..55b1078 100644 --- a/src/utils/datasets_utils.py +++ b/src/utils/datasets_utils.py @@ -20,13 +20,14 @@ def
load_processed_dataset(processed_dataset_path: str) -> pd.DataFrame: Returns: pd.DataFrame: Preprocessed dataset. """ - return pd.read_csv(processed_dataset_path) + return pd.read_pickle(processed_dataset_path) def split_train_test( dataset: pd.DataFrame, test_size=0.2 ) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Split the dataset into train and test. + """Split the dataset into train and test. It will keep exactly the same + distribution of classes in both train and test. Args: dataset (pd.DataFrame): Dataset to split. @@ -36,7 +37,21 @@ def split_train_test( Returns: pd.DataFrame, pd.DataFrame: Train and test sets. """ - dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True) - train = dataset.iloc[: int(len(dataset) * (1 - test_size))] - test = dataset.iloc[int(len(dataset) * (1 - test_size)) :] + spams = dataset[dataset["label"] == 1] + hams = dataset[dataset["label"] == 0] + + spams = spams.sample(frac=1, random_state=SEED) + hams = hams.sample(frac=1, random_state=SEED) + + spams_train = spams[: int(len(spams) * (1 - test_size))] + spams_test = spams[int(len(spams) * (1 - test_size)) :] + hams_train = hams[: int(len(hams) * (1 - test_size))] + hams_test = hams[int(len(hams) * (1 - test_size)) :] + + train = pd.concat([spams_train, hams_train]) + test = pd.concat([spams_test, hams_test]) + + train = train.sample(frac=1, random_state=SEED) + test = test.sample(frac=1, random_state=SEED) + return train, test diff --git a/src/utils/file_utils.py b/src/utils/file_utils.py index 9fd8215..552e988 100644 --- a/src/utils/file_utils.py +++ b/src/utils/file_utils.py @@ -22,7 +22,7 @@ PROCESSED_DATASETS_PATH = DATA_PATH / "processed" PROCESSED_DATASETS_PREFIX = RAW_DATASETS_PREFIX + "processed_" -PROCESSED_DATASETS_EXTENSION = ".csv" +PROCESSED_DATASETS_EXTENSION = ".pkl" MODELS_PATH = ROOT_PATH / "models" diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py index 7c67a62..1fad4aa 100644 --- a/src/visualization/visualize.py +++ 
b/src/visualization/visualize.py @@ -83,7 +83,7 @@ def generate_report(model_path: Path, N=None): sys.exit(1) expected_df = split_train_test(load_processed_dataset(processed_dataset_path)) - expected_df = expected_df[1].dropna() + expected_df = expected_df[1] expected_df = expected_df[:N] expected_df["label"] = expected_df["label"].astype(bool) expected_df.reset_index(drop=True, inplace=True)