Visual attention img captioning ar #22

Open · wants to merge 3 commits into base: devel
22 changes: 22 additions & 0 deletions scenes/image_captioning.py
@@ -0,0 +1,22 @@
import numpy as np
from PIL import Image

import image_captioning_interface

def init():
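    # build the captioning model once at startup; the weights file is added in this PR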
    global model
    model = image_captioning_interface.image_captioning(mobile_net_v2_weights='mobilenet_v2_weights_1.4.h5')

def generate_caption(np_RGB_image):
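    # expects an RGB image as a numpy array of shape (H, W, 3)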
    return model.generate_from_img_nparray_encode(np_RGB_image)

if __name__ == '__main__':
    init()
    while True:
        test_image = input("enter path: ")
        im = Image.open(test_image)
        im = np.array(im)
        print(generate_caption(im))
145 changes: 145 additions & 0 deletions scenes/image_captioning_interface.py
@@ -0,0 +1,145 @@
from models import ImageFeatureExtract, CNN_Encoder, RNN_Decoder
import tensorflow as tf
import numpy as np
import pickle
from PIL import Image
import os
import cv2

# abspath = os.path.abspath(__file__)
# dname = os.path.dirname(abspath)
# os.chdir(dname)

# max length of the training caption sequences
max_length = 46

# parameters used in the training process
embedding_dim = 256
units = 512
len_tokenizer_word_index = 26555
vocab_size = len_tokenizer_word_index + 1

class image_captioning():
    def __init__(self,
                 mobile_net_v2_weights='mobilenet',
                 alpha=1.4):
        self.image_features_extract_model = ImageFeatureExtract(mobile_net_v2_weights=mobile_net_v2_weights, alpha=alpha)
        self.E = CNN_Encoder(embedding_dim)
        self.D = RNN_Decoder(embedding_dim, units, vocab_size)
        with open('tokenizer.pickle', 'rb') as infile:
            self.tokenizer = pickle.load(infile)

    def preprocess_image_nparray(self, image):
        img = tf.convert_to_tensor(image, dtype=tf.uint8)
        # earlier DCT-preprocessing experiments, kept for reference:
        # img = tf.convert_to_tensor(image, dtype=tf.float32)
        # s0, s1, s2 = tf.split(img, num_or_size_splits=3, axis=2)
        # s0 = tf.signal.dct(s0, type=3)
        # s1 = tf.signal.dct(s1, type=3)
        # s2 = tf.signal.dct(s2, type=3)
        # img = tf.concat([s0, s1, s2], 2)
        # img = tf.reshape(img, [720*1280*3])
        # img = tf.signal.dct(img, type=2)
        # img = tf.reshape(img, [720, 1280, 3])
        # img = tf.cast(img, tf.uint8)
        img = tf.image.resize(img, (224, 224))
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        return img

    def load_image(self, image_path):
        img = tf.io.read_file(image_path)
        # img = tf.image.decode_jpeg(img, dct_method="INTEGER_ACCURATE", channels=3)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, (224, 224))
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        return img

    def encode(self, image):
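        # Round-trip the array through an in-memory JPEG encode/decode before
        # preprocessing, presumably so the network sees the same compression
        # artifacts as the JPEG training images (an assumption from the settings).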
        img = tf.convert_to_tensor(image, dtype=tf.uint8)
        img = tf.image.encode_jpeg(img,
                                   quality=100,
                                   progressive=False,
                                   optimize_size=False,
                                   chroma_downsampling=True,
                                   density_unit='in',
                                   x_density=300,
                                   y_density=300)
        # img = tf.image.encode_png(img, compression=-1, name=None)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, (224, 224))
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        return img

    def generate_from_img_nparray_encode(self, image):
        im = tf.expand_dims(self.encode(image), 0)
        return self.generate_caption(im)

    def generate_from_saving_img_opencv(self, image):
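        # Saves the array to a temporary JPEG via OpenCV, then captions from the file.
        # Note: cv2.imwrite expects BGR channel order, so an RGB array is written
        # with its red and blue channels swapped.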
        image_path = "temp.jpg"
        cv2.imwrite(image_path, image,
                    [cv2.IMWRITE_JPEG_QUALITY, 100,
                     cv2.IMWRITE_JPEG_PROGRESSIVE, 1,
                     cv2.IMWRITE_JPEG_OPTIMIZE, 1,
                     cv2.IMWRITE_JPEG_LUMA_QUALITY, 100,
                     cv2.IMWRITE_JPEG_CHROMA_QUALITY, 100])
        return self.generate_from_img_path(image_path)

    def generate_from_saving_img_pillow(self, image):
        image_path = "temp.jpg"
        image = Image.fromarray(image)
        image.save(image_path)
        return self.generate_from_img_path(image_path)

    def generate_from_img_nparray(self, image):
        im = tf.expand_dims(self.preprocess_image_nparray(image), 0)
        return self.generate_caption(im)

    def generate_from_img_path(self, image_path):
        im = tf.expand_dims(self.load_image(image_path), 0)
        return self.generate_caption(im)

    def generate_caption(self, temp_input):
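        # Greedy decoding: start from the <start> token, repeatedly feed the
        # argmax prediction back into the decoder, and stop at <end> or after
        # max_length steps.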
        hidden = self.D.reset_state(batch_size=1)

        img_tensor_val = self.image_features_extract_model.extract(temp_input)
        img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
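        # flatten spatial dims: (1, 7, 7, 1792) -> (1, 49, 1792)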

        features = self.E(img_tensor_val)

        dec_input = tf.expand_dims([self.tokenizer.word_index["<start>"]], 0)
        result = []
        for i in range(max_length):
            predictions, hidden, attention_weights = self.D(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions[0]).numpy()

            result.append(self.tokenizer.index_word[predicted_id])
            if self.tokenizer.index_word[predicted_id] == '<end>':
                # drop the trailing <end> token
                return ' '.join(result[:-1])
            dec_input = tf.expand_dims([predicted_id], 0)
        return ' '.join(result)

# For testing purposes
if __name__ == '__main__':
    model = image_captioning(mobile_net_v2_weights='mobilenet_v2_weights_1.4.h5')
    while True:
        test_image = input("enter path: ")

        # OpenCV alternative:
        # im = cv2.imread(test_image)
        # im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        # Pillow
        im = Image.open(test_image)
        im = np.array(im)

        print("generate_from_img_nparray_encode\n" + model.generate_from_img_nparray_encode(im))
        print("generate_from_img_nparray\n" + model.generate_from_img_nparray(im))
        print("generate_from_saving_img_opencv\n" + model.generate_from_saving_img_opencv(im))
        print("generate_from_saving_img_pillow\n" + model.generate_from_saving_img_pillow(im))
        print("generate_from_img_path\n" + model.generate_from_img_path(test_image))  # takes a path
        print('\n')

Binary file added scenes/mobilenet_v2_weights_1.4.h5
Binary file not shown.
114 changes: 114 additions & 0 deletions scenes/models.py
@@ -0,0 +1,114 @@
import numpy as np
import tensorflow as tf


# The feature map extracted from MobileNetV2 (alpha=1.4, 224x224 input) has
# shape (7, 7, 1792); it is flattened to (49, 1792) before the encoder.

class ImageFeatureExtract():
    def __init__(self, mobile_net_v2_weights='mobilenet', alpha=1.4):
        image_model = tf.keras.applications.MobileNetV2(include_top=False, weights=mobile_net_v2_weights, alpha=alpha)
        new_input = image_model.input
        hidden_layer = image_model.layers[-1].output
        self.model = tf.keras.Model(new_input, hidden_layer)

    def extract(self, temp_input):
        return self.model(temp_input)

class CNN_Encoder(tf.keras.Model):
    # The image features are extracted beforehand; this encoder only passes
    # them through a fully connected layer.
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 49, embedding_dim)
        C = tf.keras.initializers.Constant
        # load the pretrained encoder weights exported as .npy files
        w1, w2 = [np.load("encoder_layer_weights/layer_%s_%s_weights_%s.npy" % (0, "dense", j))
                  for j in range(2)]
        self.fc = tf.keras.layers.Dense(embedding_dim, kernel_initializer=C(w1), bias_initializer=C(w2))

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        C = tf.keras.initializers.Constant
        w1, w2, w3, w4, w5, w6 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (4, "bahdanau_attention", j))
                                  for j in range(6)]
        self.W1 = tf.keras.layers.Dense(units, kernel_initializer=C(w1), bias_initializer=C(w2))
        self.W2 = tf.keras.layers.Dense(units, kernel_initializer=C(w3), bias_initializer=C(w4))
        self.V = tf.keras.layers.Dense(1, kernel_initializer=C(w5), bias_initializer=C(w6))

    def call(self, features, hidden):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)
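        # score shape == (batch_size, 49, units)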
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 49, 1)
        # the last axis is 1 because score is passed through self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        C = tf.keras.initializers.Constant
        # load the pretrained decoder weights exported as .npy files
        w_emb = np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (0, "embedding", 0))
        w_gru_1, w_gru_2, w_gru_3 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (1, "gru", j)) for j in range(3)]
        w1, w2 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (2, "dense_1", j)) for j in range(2)]
        w3, w4 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (3, "dense_2", j)) for j in range(2)]

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, embeddings_initializer=C(w_emb))
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       kernel_initializer=C(w_gru_1),
                                       recurrent_initializer=C(w_gru_2),
                                       bias_initializer=C(w_gru_3))
        self.fc1 = tf.keras.layers.Dense(self.units, kernel_initializer=C(w1), bias_initializer=C(w2))
        self.fc2 = tf.keras.layers.Dense(vocab_size, kernel_initializer=C(w3), bias_initializer=C(w4))

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # attention over the image features, conditioned on the previous hidden state
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        # zero initial hidden state for the GRU
        return tf.zeros((batch_size, self.units))



Binary file added scenes/tokenizer.pickle
Binary file not shown.