-
Notifications
You must be signed in to change notification settings - Fork 0
/
ML_Algorithm.py
87 lines (64 loc) · 2.87 KB
/
ML_Algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os #to assist csv-import
import pandas as pd #to read csv-import
import time #to measure how long processes take
import re #to use regular expressions to clean data if needed
from pprint import pprint #pretty print
import numpy as np #for mathematical calculations
import matplotlib #python plotting library
import matplotlib.pyplot as plt #matplot for donut charts
import sklearn #scikit learn
import mglearn #for plotting purposes
from sklearn.preprocessing import LabelEncoder #encode the labels pos, neg, neut into numbers
from sklearn.feature_extraction.text import CountVectorizer #featureextraction, features aus Texten extrahieren
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
import string
import time
# Multiprocessing: to be able to use all available cores
import multiprocessing
# Number of worker processes: every available core except one, but never fewer
# than one (a single-core host would otherwise yield 0).
NUM_JOBS = max(1, multiprocessing.cpu_count() - 1)

# Wall-clock start; read again at the end of the script to report the run time.
start = time.time()

# User input
enter_text = input('Please enter a swiss german text: ')
type_enter_text = type(enter_text)  # input() always returns str

# int() is used purely as a probe: if the whole entry parses as an integer the
# user typed a number instead of a sentence. Only ValueError is caught so that
# unrelated failures (Ctrl-C, broken stdout, ...) are not silently swallowed.
try:
    int(enter_text)
except ValueError:
    entered_number = False
else:
    entered_number = True

print('-'*50)
if entered_number:
    print('Warning: Please make sure to enter text, not numbers. Thanks!')
else:
    # Normalise the sentence: drop ASCII punctuation and lower-case the rest.
    enter_text = "".join(
        char.lower() for char in enter_text if char not in string.punctuation
    )
    print('Checking sentiment for:', enter_text)
# NOTE(review): the original indentation of this script could not be recovered
# from the listing. This assumes the rest of the script runs for both branches
# above -- confirm whether a numeric entry should stop the script instead.
# Load the labelled corpus (csv import).
text = pd.read_csv('data/HateSpeechDetectionLR.csv', sep=',')

# Drop the 'Dialekt' column.
text = text.drop(columns=['Dialekt'])

# Remove every row whose Sentiment is 'neut'.
is_neutral = text['Sentiment'] == 'neut'
text = text.drop(index=text[is_neutral].index)
def clean_text(text):
    """Return *text* lower-cased and with ASCII punctuation removed.

    Each element of ``text`` (each character, when ``text`` is a ``str``) is
    dropped if it occurs in ``string.punctuation`` and lower-cased otherwise;
    the surviving pieces are concatenated in their original order.
    """
    kept_chars = (
        char.lower() for char in text if char not in string.punctuation
    )
    return "".join(kept_chars)
# Store a cleaned version of every sentence in a new 'new_text' column.
text['new_text'] = text['Text'].apply(clean_text)

# Pull the labels and the cleaned sentences out as plain Python lists.
labels = text['Sentiment'].tolist()
full_text = text['new_text'].tolist()

# Encode the string labels as integers (fit and transform in one call).
lbl_enc = LabelEncoder()
enc_label = lbl_enc.fit_transform(labels)

# Learn the vocabulary from the corpus and build its document-term matrix.
vect = CountVectorizer()
vect_text = vect.fit_transform(full_text)

# Fit a logistic regression on the full corpus.
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(C=10.0, solver='lbfgs', n_jobs=-1)
lr_model_full = logisticRegr.fit(vect_text, enc_label)
# The vectorizer takes an iterable of documents, so the single user sentence
# is wrapped in a one-element list.
user_text = [enter_text]
vectorized_text = vect.transform(user_text)
predict_test = lr_model_full.predict(vectorized_text).tolist()

# Map the numeric predictions back to their label names and report each one;
# sentences are numbered from 0.
for n, label in enumerate(lbl_enc.inverse_transform(predict_test)):
    print('The prediction of sentence', n,'is:', str(label))

# Report the elapsed wall-clock time since `start` was taken.
end = time.time()
time_result = (end - start)
print('Seconds to run the Logistic Regression: ', round(time_result,2))