generated from sharif-ml-lab/IMDb-IR-System
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
56 changed files
with
2,831 additions
and
205 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Build the Sphinx documentation and publish it to GitHub Pages.
name: documentation

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  # Allow the workflow to be triggered manually from the Actions tab.
  workflow_dispatch:

# The deploy step pushes to the gh-pages branch, so write access is required.
permissions:
  contents: write

jobs:
  docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
          # Pin to the latest Python 3 release instead of relying on
          # whatever interpreter happens to be on the runner's PATH.
          python-version: "3.x"
      - name: Install dependencies
        run: |
          cd Logic/
          pip install sphinx myst_parser sphinx-book-theme
          cd ../UI/
          pip install -r requirements.txt
      - name: Sphinx build
        run: |
          sphinx-build documentation/source _build
      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@v3
        # Only publish on direct pushes to main; PR runs just verify the build.
        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
        with:
          publish_branch: gh-pages
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: _build/
          force_orphan: true
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Package initializer: re-export the public names of the subpackages
# so callers can import them directly from this package.
from .core import *
from .utils import *

# Declare the public API as every non-underscore name pulled in above.
__all__ = [k for k in globals().keys() if not k.startswith("_")]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# core package initializer: re-export the public names of each phase's
# subpackage so callers can import them directly from `core`.
from .indexer import *
from .utility import *
from .search import *
from .link_analysis import *
from .classification import *
from .clustering import *
from .word_embedding import *

# Declare the public API as every non-underscore name pulled in above.
__all__ = [k for k in globals().keys() if not k.startswith("_")]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Classification | ||
|
||
This package contains the code for the classification phase of the project. | ||
The classification phase is responsible for classifying the comment data into two classes: positive and negative. | ||
You have to train the models on the training data and then use the trained model to classify the comment data that you crawled in the first phase. You can access the training data using [this link](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/code).
|
||
## Classes | ||
|
||
Here is a brief description of the files in this package: | ||
|
||
### 1. [Basic Classifier](basic_classifier.py) | ||
This file contains an abstract class `BasicClassifier` which is the base class for all the classifiers in this package. You should implement the function `get_percent_of_positive_reviews` in this class, which is responsible for computing the percentage of positive reviews in a list of reviews. In all classifiers, you have to use the fastText embeddings as the input to the classifier, except for the Naive Bayes classifier. In the Naive Bayes classifier, you have to use the count vectorizer to convert the text data into the input for the classifier.
|
||
### 2. [Naive Bayes](naive_bayes.py) | ||
This file contains the implementation of the Naive Bayes classifier. | ||
|
||
### 3. [SVM](svm.py) | ||
This file contains the implementation of the Support Vector Machine classifier. You can use the scikit-learn library to implement the SVM classifier.
|
||
### 4. [KNN](knn.py) | ||
This file contains the implementation of the K-Nearest Neighbors classifier. | ||
|
||
### 5. [Deep Model](deep.py) | ||
This file contains the implementation of the MLP model using the PyTorch library. | ||
|
||
### 6. [Data Loader](data_loader.py) | ||
This file contains the implementation of the data loader class, which is responsible for loading the data from disk and using the fastText model to generate the word embeddings. You have to split the data into training and testing data in this file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# classification package initializer: re-export the classifier
# implementations and the data loader as the package's public API.
from .basic_classifier import *
from .data_loader import *
from .deep import *
from .knn import *
from .naive_bayes import *
from .svm import *

# Declare the public API as every non-underscore name pulled in above.
__all__ = [k for k in globals().keys() if not k.startswith("_")]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import numpy as np | ||
import sklearn | ||
from tqdm import tqdm | ||
|
||
from ..word_embedding.fasttext_model import FastText | ||
|
||
|
||
class BasicClassifier:
    """Base class for all classifiers in this package.

    Subclasses implement ``fit``, ``predict`` and ``prediction_report``;
    ``get_percent_of_positive_reviews`` is shared logic that relies on
    ``predict`` returning one label per input sentence.
    """

    def __init__(self):
        pass

    def fit(self, x, y):
        """Train the classifier on features ``x`` and labels ``y`` (subclass hook)."""
        pass

    def predict(self, x):
        """Return one predicted label per element of ``x`` (subclass hook)."""
        pass

    def prediction_report(self, x, y):
        """Report prediction quality on ``x`` against true labels ``y`` (subclass hook)."""
        pass

    def get_percent_of_positive_reviews(self, sentences):
        """
        Get the percentage of positive reviews in the given sentences

        Parameters
        ----------
        sentences: list
            The list of sentences to get the percentage of positive reviews

        Returns
        -------
        float
            The percentage of positive reviews (0.0 for an empty input)
        """
        if not sentences:
            # Guard against ZeroDivisionError on an empty review list.
            return 0.0
        # NOTE(review): assumes the positive class is encoded as 1
        # (LabelEncoder maps 'negative'/'positive' -> 0/1 in the data
        # loader) — confirm against the trained subclass.
        predictions = np.asarray(self.predict(sentences))
        return 100.0 * float(np.sum(predictions == 1)) / len(sentences)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import numpy | ||
import numpy as np | ||
import pandas as pd | ||
import sklearn | ||
import tqdm | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.preprocessing import LabelEncoder | ||
|
||
from Logic.core.word_embedding.fasttext_model import FastText | ||
|
||
|
||
class ReviewLoader:
    """Load the IMDB review CSV, encode sentiment labels and produce
    fastText embeddings for downstream classifiers."""

    def __init__(self, file_path: str):
        # Path to the reviews CSV (expects 'review' and 'sentiment' columns).
        self.file_path = file_path
        self.fasttext_model = None
        self.review_tokens = []
        self.sentiments = []
        self.embeddings = []

    def load_data(self):
        """
        Load the data from the csv file and preprocess the text. Then save the normalized tokens and the sentiment labels.
        Also, load the fasttext model.

        Returns
        -------
        np.ndarray, np.ndarray
            The review texts and the integer-encoded sentiment labels.
        """
        # Load the pretrained fastText model used later for embeddings.
        self.fasttext_model = FastText(method='skipgram')
        self.fasttext_model.prepare(None, mode="load")
        df = pd.read_csv(self.file_path)
        # Encode the sentiment strings as integers
        # (LabelEncoder sorts alphabetically: 'negative' -> 0, 'positive' -> 1).
        le = LabelEncoder()
        df['sentiment'] = le.fit_transform(df['sentiment'])
        self.review_tokens = df['review'].to_numpy()
        self.sentiments = df['sentiment'].to_numpy()
        return self.review_tokens, self.sentiments

    def get_embeddings(self):
        """
        Get the embeddings for the reviews using the fasttext model.

        Returns
        -------
        np.ndarray
            One embedding row per review, in load order.
        """
        # Use the file's `np` alias (was `numpy.array`) for consistency.
        self.embeddings = np.array(
            [self.fasttext_model.get_query_embedding(token)
             for token in tqdm.tqdm(self.review_tokens)]
        )
        return self.embeddings

    def split_data(self, test_data_ratio=0.2):
        """
        Split the data into training and testing data.

        Parameters
        ----------
        test_data_ratio: float
            The ratio of the test data

        Returns
        -------
        np.ndarray, np.ndarray, np.ndarray, np.ndarray
            Return the training and testing data for the embeddings and the sentiments.
            in the order of x_train, x_test, y_train, y_test
        """
        # Fixed random_state keeps the split reproducible across runs.
        x_train, x_test, y_train, y_test = train_test_split(
            self.embeddings, self.sentiments,
            test_size=test_data_ratio, random_state=42,
        )
        return x_train, x_test, y_train, y_test
Oops, something went wrong.