zenodo · yanisdb · Dec 5, 2022 · Dec 13, 2022 · Dec 15, 2022 · Dec 15, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,15 @@
+# Project specific files
 data/*
 models/*
+
+# Logs
+*.log
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Environments
+.env
+.venv
diff --git a/Makefile b/Makefile
@@ -21,13 +21,31 @@ endif
 #################################################################################
 
 ## Install Python Dependencies
-requirements: test_environment
+requirements: #test_environment #TODO: uncomment test_environment when it is ready
 	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
 	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
 
 ## Make Dataset
 data: requirements
-	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
+	PYTHONPATH="$(PROJECT_DIR)" $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
+
+# The pipeline targets below name commands, not files. Declaring them phony
+# keeps a stray file called `process`, `train` or `visualize` from making
+# make report the target as up to date and skip the step.
+.PHONY: process train visualize
+
+## Process Dataset
+process: data
+	PYTHONPATH="$(PROJECT_DIR)" $(PYTHON_INTERPRETER) src/features/process_dataset.py
+
+## Train model
+train: process
+	PYTHONPATH="$(PROJECT_DIR)" $(PYTHON_INTERPRETER) src/models/train_model.py
+
+# Optional N=<count> is passed through to visualize.py (e.g. `make visualize N=1000`).
+## Visualize model
+visualize: train
+	PYTHONPATH="$(PROJECT_DIR)" $(PYTHON_INTERPRETER) src/visualization/visualize.py $(N)
 
 ## Delete all compiled Python files
 clean:

diff --git a/README.md b/README.md
@@ -1,27 +1,58 @@
-# Zenodo spam classifiers
+## Zenodo spam classifiers
 
 Spam classification machine learning models for Zenodo records and communities.
 
 ## Usage
 
-First of all, create a virtualenv, install the depencencies, and run the Jupyter notebook server:
+First of all, create a virtual environment (the make script will install the required dependencies in it):
 
 ```bash
-# Create a virtual environment
-   mkvirtualenv --python python3.9 zenodo-classifier
-   (zenodo-classifier) pip install -e .
-
-# This will also open Jupyter notebook in your browser
-   (zenodo-classifier) jupyter notebook
+   mkvirtualenv --python python3.10 zenodo-classifier # Create the virtual environment
 ```
 
-To re-train the model:
+To train/re-train the model:
 
-1. Go to Zenodo Open Metadata record at <https://doi.org/10.5281/zenodo.787062> to acces all dataset versions.
-2. Download the latest dump locally under `data`
-3. Open the `model_spam_detection_record.ipynb` notebook
-4. Update the `data_file` and `model_path` variables to point to the new dump location
-5. Run all the cells up to `4. Dump model`.
+```bash
+   make train
+```
+
+The `make train` command will install all the necessary dependencies and run the following python scripts:
+
+- `make_dataset.py`: download/create the Zenodo dataset and store it in `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl`.
+- `process_dataset.py`: extract the features/process them and store the new dataset in `data/processed/zenodo_open_metadata_processed_YYYY-MM-DD.csv`.
+- `train_model.py`: train the model and store it in `models/zenodo_msc_YYYY-MM-DD`.
+
+Note: each of these files can be called as a script (using `make` or manually) or imported as a module. As scripts, they don't take any parameters: `process_dataset.py` (resp. `train_model.py`) automatically searches for the latest dataset in `data/raw/` (resp. `data/processed/`) and uses it. The latest dataset is found by comparing the dates present in the file names. If the data is placed manually in `data/raw` (resp. `data/processed`), it should follow the naming convention, that is, `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl` (resp. `data/processed/zenodo_open_metadata_processed_YYYY-MM-DD.csv`), to ensure that it is found automatically.
+
+Note: checkpoints are automatically saved in `models/checkpoints/` during training. If there are some checkpoints, the training will automatically resume from there. If you want to start over for some reason, delete them.
+
+**Note: the `make_dataset.py` script is not ready yet. In the future, it will actually dump the Zenodo dataset in `data/raw/zenodo_open_metadata_YYYY-MM-DD.jsonl`, but it will remain unavailable to generic users. For now, you should download the dataset from [this Zenodo record](https://zenodo.org/record/7438358/files/zenodo_open_metadata_2022-12-14.jsonl.gz?download=1). The results given in our report used the Zenodo dataset from 2020/10/21.**
+
+To make a prediction on a new record you can proceed in two ways:
+
+- Use the `predict_model.py` script:
+  ```bash
+  export PYTHONPATH=/path/to/zenodo-classifier # you can use "PYTHONPATH=$(pwd)" if you are in the zenodo-classifier directory
+  python3 src/models/predict_model.py "Some description of the record that is not preprocessed (but can be)"
+  ```
+- Import `predict_model.py` in your python script:
+  ```python
+  from src.models.predict_model import load_model, make_prediction
+  # You need to load the model only once
+  # You must pass the path to the model as argument
+  # You can get the path to the latest model with `find_latest_model()` or pass the path to the model you want to use
+  model = load_model(model_path)
+  # You can make some predictions
+  make_prediction(model, "Some description of the record that is not preprocessed (but can be)")
+  ```
+
+To visualize the results of the model, i.e., see its performance on the test set, you can use the `visualize.py` script:
+
+```bash
+make visualize
+```
+
+This will generate a `report_YYYY-MM-DD.md` file in the `reports/` directory. To make the generation faster, you can use `make visualize N=1000` to compute the results on only 1000 test samples.
 
 To compare with older models:
 
@@ -64,7 +95,7 @@ To compare with older models:
 │   │   └── make_dataset.py
 │   │
 │   ├── features       <- Scripts to turn raw data into features for modeling
-│   │   └── build_features.py
+│   │   └── process_dataset.py
 │   │
 │   ├── models         <- Scripts to train models and then use trained models to make
 │   │   │                 predictions
@@ -75,5 +106,3 @@ To compare with older models:
 │       └── visualize.py
 
 ```
-
-
diff --git a/experiments/cs433/README.md b/experiments/cs433/README.md
@@ -0,0 +1,16 @@
+# Experiments
+
+Luka Secilmis, Thomas Ecabert, Yanis De Busschere
+
+## Abstract
+
+This folder contains all the notebooks used for experimenting with the different models during the project.
+
+## Summary of experiments
+
+| Experiment          | Folder                                             | Training Time | Prediction Time | Accuracy | F1-Score |
+|---------------------|----------------------------------------------------|---------------|-----------------|----------|----------|
+| BERT (english only) | [./en-spam-classifier](./en-spam-classifier)       | 1h02          | 0.005s          | 98.759   | 98.600   |
+| BERT (multilingual) | [./multi-spam-classifier](./multi-spam-classifier) | 1h09          | 0.005s          | 98.814   | 98.779   |
+
+All computations and time measurements were made using an NVIDIA RTX A5000.
diff --git a/experiments/cs433/en-spam-classifier/README.md b/experiments/cs433/en-spam-classifier/README.md
@@ -0,0 +1,13 @@
+# BERT (English only)
+
+## Abstract
+
+We developed an NLP-based spam classifier through a transfer learning approach, by fine-tuning a pre-trained English DistilBERT model on the Zenodo dataset for text classification.
+
+## Results
+
+| Training Time | Prediction Time | Accuracy | F1-Score |
+|---------------|-----------------|----------|----------|
+| 1h02          | 0.005s          | 98.759   | 98.600   |
+
+All computations and time measurements were made using an NVIDIA RTX A5000.
diff --git a/experiments/cs433/en-spam-classifier/feat-eng-esc.py b/experiments/cs433/en-spam-classifier/feat-eng-esc.py
@@ -0,0 +1,51 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+from ftlangdetect import detect
+import re
+
+KEEP = ['description', 'spam']
+CLEANING_REGEX = re.compile(r'[^a-zA-Z0-9\s]', re.MULTILINE)
+
+
+def detect_lang(descr):
+    """Return the language code ftlangdetect reports for `descr`.
+
+    Characters other than ASCII letters, digits and whitespace are stripped and
+    line breaks become spaces before the text is handed to `detect`.
+    """
+    descr = CLEANING_REGEX.sub('', descr)
+    descr = descr.replace('\r', ' ').replace('\n', ' ')
+    return detect(descr)['lang']
+
+
+def english_subset(chunk, is_spam):
+    """Return the English rows of `chunk` whose `spam` flag equals `is_spam`.
+
+    Descriptions are reduced to their text content with BeautifulSoup, and the
+    `spam` column is rewritten as an integer label (1 for spam, 0 for ham).
+    """
+    # Copy first: assigning columns on a filtered slice of `chunk` triggers
+    # pandas' SettingWithCopyWarning and may not write where intended.
+    subset = chunk[chunk['spam'] == is_spam].copy()
+    subset['description'] = subset['description'].map(
+        lambda html: BeautifulSoup(html, 'html.parser').get_text())
+    subset['spam'] = int(is_spam)
+    return subset[subset['description'].map(detect_lang) == 'en']
+
+
+spam_parts, ham_parts = [], []
+for chunk in pd.read_json('zenodo_open_metadata_2020-10-19.jsonl', lines=True, chunksize=100000):
+    chunk = chunk[KEEP].dropna()
+    spam_parts.append(english_subset(chunk, True))
+    ham_parts.append(english_subset(chunk, False))
+
+# Concatenate once at the end rather than re-copying the accumulated frame per chunk.
+SPAMS = pd.concat(spam_parts) if spam_parts else pd.DataFrame(columns=KEEP)
+HAMS = pd.concat(ham_parts) if ham_parts else pd.DataFrame(columns=KEEP)
+
+# Keep two hams per spam, capped at the hams available: sample() raises
+# ValueError when n exceeds the number of rows (replace=False).
+HAMS = HAMS.sample(n=min(len(HAMS), 2 * len(SPAMS)))
+df = pd.concat([SPAMS, HAMS]).rename(columns={'spam': 'label'})
+
+df.to_csv('dataset-esc.csv', index=False)