dataset.py

import os
import pickle
import multiprocessing

import langid
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm
from joblib import Parallel, delayed
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

from constants import DATAFRAME_FNAME, STOP_WORDS
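
# `constants.py` is not shown in this file; DATAFRAME_FNAME is assumed to be
# a pickle cache path and STOP_WORDS an extra list of stop words, e.g.:
#
#   DATAFRAME_FNAME = 'dataframe.pkl'        # hypothetical value
#   STOP_WORDS = ['chorus', 'verse', 'la']   # hypothetical value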

def drop_or_not(index, row):
    """
    Determine whether the current document (lyrics) should be dropped
    Returns the row index if it should be dropped, otherwise -1
    """
    # Note: row['lyrics'] is NaN when empty, so guard with pd.isna()
    lyric = '' if pd.isna(row['lyrics']) else str(row['lyrics'])[:100]
    if not lyric:
        return index
    # Classify language from the first 100 characters of the lyrics
    lang, _ = langid.classify(lyric)
    return -1 if lang == 'en' else index
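
# A minimal sketch of how drop_or_not behaves (the sample rows below are
# hypothetical, not taken from the dataset):
#
#   row = pd.Series({'lyrics': 'hello darkness my old friend'})
#   drop_or_not(0, row)   # -> -1: English lyrics are kept
#   row = pd.Series({'lyrics': 'bonjour tout le monde mes amis'})
#   drop_or_not(0, row)   # -> 0: non-English lyrics are dropped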

def load_data(filename):
    """
    Load the lyrics data frame, from the pickle cache if one exists,
    otherwise from the CSV file, cleaning and caching it along the way
    """
    if os.path.isfile(DATAFRAME_FNAME):
        print('Cached dataframe found.')
        df = pd.read_pickle(DATAFRAME_FNAME)
    else:
        print('Loading data...')
        df = pd.read_csv(filename)
        # Remove rows with missing values
        df = df.dropna()
        # Remove rows with lyrics that don't contain any letters
        df = df[df['lyrics'].str.contains('[A-Za-z]', na=False)]
        # Remove rows with non-English lyrics
        drop_indices = Parallel(n_jobs=multiprocessing.cpu_count(), prefer='threads')(
            delayed(drop_or_not)(index, row)
            for index, row in tqdm(df.iterrows(), total=df.shape[0]))
        drop_indices = [i for i in drop_indices if i >= 0]
        df = df.drop(drop_indices)
        # Remove songs released before 1970
        df = remove_old_songs(df)
        # Remove songs whose genre is 'Not Available'
        df = remove_not_available(df)
        # Cache dataframe
        df.to_pickle(DATAFRAME_FNAME)
    return df
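
# Usage note: the cleaned frame is cached at DATAFRAME_FNAME, so delete that
# file to force a full rebuild from the CSV.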

def remove_old_songs(df, too_old=1970):
    """
    Remove songs released before `too_old` (1970 by default) from df
    """
    drop_indices = df.index[df['year'] < too_old].tolist()
    df = df.drop(drop_indices)
    return df

def remove_not_available(df):
    """
    Remove songs whose genre is 'Not Available'
    """
    drop_indices = df.index[df['genre'] == 'Not Available'].tolist()
    df = df.drop(drop_indices)
    return df

def remove_stop_words(stop_list, tokens):
    """
    Remove stop words from tokens, along with any token shorter than
    three characters (e.g. 's' or 'll')
    """
    return [t for t in tokens if len(t) > 2 and t not in stop_list]

def stem_tokens(stemmer, tokens):
    """
    Stem tokens with the given stemmer
    """
    return [stemmer.stem(t) for t in tokens]

def remove_low_freq_tokens(freq_list, tokens):
    """
    Remove tokens that occur only once across the whole corpus
    """
    return [t for t in tokens if freq_list[t] > 1]
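
# A minimal sketch of the token filters chained together (the sample tokens
# and stop list are hypothetical):
#
#   stemmer = PorterStemmer()
#   tokens = ['the', 'runners', 'running', 'running', 's']
#   tokens = remove_stop_words(['the'], tokens)  # ['runners', 'running', 'running']
#   tokens = stem_tokens(stemmer, tokens)        # ['runner', 'run', 'run']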

def tokenize_corpus(corpus, tokens_fname):
    """
    Segment each document in the corpus into words, then apply stop-word
    removal, stemming, and low-frequency filtering to the tokens
    """
    if os.path.isfile(tokens_fname):
        print('Cached tokens found.')
        with open(tokens_fname, 'rb') as f:
            final_tokens = pickle.load(f)
    else:
        print('Tokenizing data...')
        tokens_corpus = []
        # TweetTokenizer doesn't split words with apostrophes
        # tokenizer = TweetTokenizer()
        tokenizer = RegexpTokenizer(r'\w+')
        # Create English stop words list
        en_stop = get_stop_words('en') + STOP_WORDS
        # Create p_stemmer of class PorterStemmer
        p_stemmer = PorterStemmer()
        for doc in corpus:
            raw = doc.lower()
            tokens = tokenizer.tokenize(raw)
            tokens_corpus.append(tokens)
        print('Removing stop words from tokens...')
        stopped_tokens = Parallel(n_jobs=multiprocessing.cpu_count(), prefer='threads')(
            delayed(remove_stop_words)(en_stop, tokens) for tokens in tqdm(tokens_corpus))
        print('Stemming tokens...')
        stemmed_tokens = Parallel(n_jobs=multiprocessing.cpu_count(), prefer='threads')(
            delayed(stem_tokens)(p_stemmer, tokens) for tokens in tqdm(stopped_tokens))
        print('Removing low frequency tokens...')
        freq_list = defaultdict(int)
        for doc in stemmed_tokens:
            for token in doc:
                freq_list[token] += 1
        final_tokens = Parallel(n_jobs=multiprocessing.cpu_count(), prefer='threads')(
            delayed(remove_low_freq_tokens)(freq_list, tokens) for tokens in tqdm(stemmed_tokens))
        # Cache tokens
        with open(tokens_fname, 'wb') as f:
            pickle.dump(final_tokens, f)
    return final_tokens
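
# A minimal sketch of tokenize_corpus on a toy corpus (output is illustrative;
# exact tokens depend on the stop list and on corpus-wide frequencies):
#
#   docs = ['Hello hello world', 'Goodbye world']
#   tokenize_corpus(docs, 'toy_tokens.pkl')
#   # -> e.g. [['hello', 'hello', 'world'], ['world']]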

def visualize_data(df):
    """
    Take in a data frame and plot the number of songs per genre
    """
    # Count songs per genre, dropping 'Not Available'; taking both labels and
    # counts from value_counts() keeps them in the same order
    counts = df.genre.value_counts().drop('Not Available', errors='ignore')
    genres = counts.index.tolist()
    genre_counts = counts.tolist()
    print('Genres: ', genres)
    print('Counts: ', genre_counts)
    # Plot bar graph
    plt.bar(genres, genre_counts)
    plt.xlabel('Genres')
    plt.ylabel('Count')
    plt.show()

def main():
    df = load_data('lyrics.csv')
    # visualize_data(df)
    lyrics_list = df['lyrics'].tolist()
    # tokenize_corpus requires a cache filename; 'tokens.pkl' is an assumed
    # path, not one defined elsewhere in the project
    tokens = tokenize_corpus(lyrics_list, 'tokens.pkl')
    print(tokens[:1])
    # en_stop = get_stop_words('en') + STOP_WORDS
    # print('Stop words: ', en_stop, len(en_stop))


if __name__ == '__main__':
    main()