tf_transformer.py
import random

import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

nltk.download('movie_reviews')  # fetch the corpus on first run
from nltk.corpus import movie_reviews
## Build (text, label) pairs: one string per review, labelled 'pos' or 'neg'
documents = [
    (' '.join(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)  # in-place shuffle so the two classes are interleaved
train_set, test_set = train_test_split(documents, test_size=0.1, random_state=12)
## Hyperparameters
vocab_size = 10000  # keep only the 10,000 most frequent words
maxlen = 200        # only consider the last 200 words of each movie review
embed_dim = 32      # embedding size for each token
num_heads = 2       # number of attention heads
ff_dim = 32         # hidden size of the feed-forward network in the Transformer block
batch_size = 128
epochs = 10
## Split texts vs. labels ('pos' -> 1, 'neg' -> 0)
texts = np.array([t for (t, l) in train_set])
labels = np.array([1 if l == 'pos' else 0 for (t, l) in train_set])
## Fit the tokenizer on the training texts only
## (keras.preprocessing.text.Tokenizer is the legacy Keras preprocessing API)
tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
## Convert each text to a sequence of integer ids
texts_to_int = tokenizer.texts_to_sequences(texts)
## Pad/truncate at the front so every sequence has length maxlen
texts_to_int_pad = keras.preprocessing.sequence.pad_sequences(
    texts_to_int, maxlen=maxlen, truncating='pre', padding='pre')
x_train = texts_to_int_pad
y_train = labels
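## Illustrative sanity check (not in the original script): inspect how the
## fitted tokenizer maps a short text to integer ids; the exact ids depend on
## the corpus word frequencies.
# print(tokenizer.texts_to_sequences(["this movie was great"]))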
## Perform the same vectorization on the testing set
x_val_text = np.array([t for (t, l) in test_set])
x_val = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_val_text),
    maxlen=maxlen,
    truncating='pre',
    padding='pre')
y_val = np.array([1 if l == 'pos' else 0 for (t, l) in test_set])
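## Illustrative check (not in the original script): with the 2,000 reviews in
## movie_reviews and a 10% test split, the padded shapes should come out as
## (1800, 200) and (200, 200).
# print(x_train.shape, x_val.shape)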
class Transformer(layers.Layer):
    """One Transformer encoder block: multi-head self-attention plus a
    feed-forward network, each with dropout, a residual connection, and
    layer normalization."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads,
                                             key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)  # self-attention: query = value = inputs
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # second residual connection
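## Illustrative check (not in the original script): a Transformer block is
## shape-preserving -- (batch, maxlen, embed_dim) in and out.
# block = Transformer(embed_dim, num_heads, ff_dim)
# print(block(tf.zeros((2, maxlen, embed_dim))).shape)  # (2, 200, 32)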
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size,
                                          output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)  # (maxlen, embed_dim), broadcast over the batch
        x = self.token_emb(x)                # (batch, maxlen, embed_dim)
        return x + positions
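## Illustrative check (not in the original script): the combined embedding maps
## (batch, maxlen) integer ids to (batch, maxlen, embed_dim) vectors.
# emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
# print(emb(tf.zeros((2, maxlen), dtype=tf.int32)).shape)  # (2, 200, 32)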
## Using the Sequential API
model = keras.Sequential([
    layers.Input(shape=(maxlen,)),
    TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim),
    Transformer(embed_dim, num_heads, ff_dim),
    layers.GlobalAveragePooling1D(),  # average over the sequence dimension
    layers.Dropout(0.1),
    layers.Dense(ff_dim, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')  # probability of the 'pos' class
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"])
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val))
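## Possible follow-up (not in the original script): evaluate on the held-out
## split and score a new review. `evaluate` and `predict` are standard Keras
## Model methods; the example sentence is made up.
# loss, acc = model.evaluate(x_val, y_val, verbose=0)
# print(f"validation accuracy: {acc:.3f}")
# new = keras.preprocessing.sequence.pad_sequences(
#     tokenizer.texts_to_sequences(["a thoroughly enjoyable film"]),
#     maxlen=maxlen, truncating='pre', padding='pre')
# print(model.predict(new))  # probability of the 'pos' class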