Visual attention img captioning ar #22

Open · wants to merge 3 commits into base: devel
22 changes: 22 additions & 0 deletions scenes/image_captioning.py
@@ -0,0 +1,22 @@
import numpy as np
from PIL import Image

import image_captioning_interface

def init():
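    # build the captioning model once at startup; the weights file is added in this PR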
    global model
    model = image_captioning_interface.image_captioning(mobile_net_v2_weights='mobilenet_v2_weights_1.4.h5')

def generate_caption(np_RGB_image):
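    # expects an RGB image as a numpy array of shape (H, W, 3)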
    return model.generate_from_img_nparray_encode(np_RGB_image)

if __name__ == '__main__':
    init()
    while True:
        test_image = input("enter path: ")
        im = Image.open(test_image)
        im = np.array(im)
        print(generate_caption(im))
145 changes: 145 additions & 0 deletions scenes/image_captioning_interface.py
@@ -0,0 +1,145 @@
from models import ImageFeatureExtract, CNN_Encoder, RNN_Decoder
import tensorflow as tf
import numpy as np
import pickle
from PIL import Image
import os
import cv2

# abspath = os.path.abspath(__file__)
# dname = os.path.dirname(abspath)
# os.chdir(dname)

# max length of the training caption sequences
max_length = 46

# parameters used in the training process
embedding_dim = 256
units = 512
len_tokenizer_word_index = 26555
vocab_size = len_tokenizer_word_index + 1

class image_captioning():
    def __init__(self,
                 mobile_net_v2_weights='mobilenet',
                 alpha=1.4):
        self.image_features_extract_model = ImageFeatureExtract(mobile_net_v2_weights=mobile_net_v2_weights, alpha=alpha)
        self.E = CNN_Encoder(embedding_dim)
        self.D = RNN_Decoder(embedding_dim, units, vocab_size)
        with open('tokenizer.pickle', 'rb') as infile:
            self.tokenizer = pickle.load(infile)

    def preprocess_image_nparray(self, image):
        img = tf.convert_to_tensor(image, dtype=tf.uint8)
        # earlier DCT-preprocessing experiments, kept for reference:
        # img = tf.convert_to_tensor(image, dtype=tf.float32)
        # s0, s1, s2 = tf.split(img, num_or_size_splits=3, axis=2)
        # s0 = tf.signal.dct(s0, type=3)
        # s1 = tf.signal.dct(s1, type=3)
        # s2 = tf.signal.dct(s2, type=3)
        # img = tf.concat([s0, s1, s2], 2)
        # img = tf.reshape(img, [720*1280*3])
        # img = tf.signal.dct(img, type=2)
        # img = tf.reshape(img, [720, 1280, 3])
        # img = tf.cast(img, tf.uint8)
        img = tf.image.resize(img, (224, 224))
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        return img

    def load_image(self, image_path):
        img = tf.io.read_file(image_path)
        # img = tf.image.decode_jpeg(img, dct_method="INTEGER_ACCURATE", channels=3)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, (224, 224))
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        return img

    def encode(self, image):
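        # Round-trip the array through an in-memory JPEG encode/decode before
        # preprocessing, presumably so the network sees the same compression
        # artifacts as the JPEG training images (an assumption from the settings).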
        img = tf.convert_to_tensor(image, dtype=tf.uint8)
        img = tf.image.encode_jpeg(img,
                                   quality=100,
                                   progressive=False,
                                   optimize_size=False,
                                   chroma_downsampling=True,
                                   density_unit='in',
                                   x_density=300,
                                   y_density=300)
        # img = tf.image.encode_png(img, compression=-1, name=None)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, (224, 224))
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        return img

    def generate_from_img_nparray_encode(self, image):
        im = tf.expand_dims(self.encode(image), 0)
        return self.generate_caption(im)

    def generate_from_saving_img_opencv(self, image):
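        # Saves the array to a temporary JPEG via OpenCV, then captions from the file.
        # Note: cv2.imwrite expects BGR channel order, so an RGB array is written
        # with its red and blue channels swapped.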
        image_path = "temp.jpg"
        cv2.imwrite(image_path, image,
                    [cv2.IMWRITE_JPEG_QUALITY, 100,
                     cv2.IMWRITE_JPEG_PROGRESSIVE, 1,
                     cv2.IMWRITE_JPEG_OPTIMIZE, 1,
                     cv2.IMWRITE_JPEG_LUMA_QUALITY, 100,
                     cv2.IMWRITE_JPEG_CHROMA_QUALITY, 100])
        return self.generate_from_img_path(image_path)

    def generate_from_saving_img_pillow(self, image):
        image_path = "temp.jpg"
        image = Image.fromarray(image)
        image.save(image_path)
        return self.generate_from_img_path(image_path)

    def generate_from_img_nparray(self, image):
        im = tf.expand_dims(self.preprocess_image_nparray(image), 0)
        return self.generate_caption(im)

    def generate_from_img_path(self, image_path):
        im = tf.expand_dims(self.load_image(image_path), 0)
        return self.generate_caption(im)

    def generate_caption(self, temp_input):
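        # Greedy decoding: start from the <start> token, repeatedly feed the
        # argmax prediction back into the decoder, and stop at <end> or after
        # max_length steps.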
        hidden = self.D.reset_state(batch_size=1)

        img_tensor_val = self.image_features_extract_model.extract(temp_input)
        img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
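        # flatten spatial dims: (1, 7, 7, 1792) -> (1, 49, 1792)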

        features = self.E(img_tensor_val)

        dec_input = tf.expand_dims([self.tokenizer.word_index["<start>"]], 0)
        result = []
        for i in range(max_length):
            predictions, hidden, attention_weights = self.D(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions[0]).numpy()

            result.append(self.tokenizer.index_word[predicted_id])
            if self.tokenizer.index_word[predicted_id] == '<end>':
                # drop the trailing <end> token
                return ' '.join(result[:-1])
            dec_input = tf.expand_dims([predicted_id], 0)
        return ' '.join(result)

# For testing purposes
if __name__ == '__main__':
    model = image_captioning(mobile_net_v2_weights='mobilenet_v2_weights_1.4.h5')
    while True:
        test_image = input("enter path: ")

        # OpenCV alternative:
        # im = cv2.imread(test_image)
        # im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        # Pillow
        im = Image.open(test_image)
        im = np.array(im)

        print("generate_from_img_nparray_encode\n" + model.generate_from_img_nparray_encode(im))
        print("generate_from_img_nparray\n" + model.generate_from_img_nparray(im))
        print("generate_from_saving_img_opencv\n" + model.generate_from_saving_img_opencv(im))
        print("generate_from_saving_img_pillow\n" + model.generate_from_saving_img_pillow(im))
        print("generate_from_img_path\n" + model.generate_from_img_path(test_image))  # takes a path
        print('\n')

Binary file added scenes/mobilenet_v2_weights_1.4.h5
Binary file not shown.
114 changes: 114 additions & 0 deletions scenes/models.py
@@ -0,0 +1,114 @@
import numpy as np
import tensorflow as tf


# The feature map extracted from MobileNetV2 (alpha=1.4, 224x224 input) has
# shape (7, 7, 1792); it is flattened to (49, 1792) before the encoder.

class ImageFeatureExtract():
    def __init__(self, mobile_net_v2_weights='mobilenet', alpha=1.4):
        image_model = tf.keras.applications.MobileNetV2(include_top=False, weights=mobile_net_v2_weights, alpha=alpha)
        new_input = image_model.input
        hidden_layer = image_model.layers[-1].output
        self.model = tf.keras.Model(new_input, hidden_layer)

    def extract(self, temp_input):
        return self.model(temp_input)

class CNN_Encoder(tf.keras.Model):
    # The image features are extracted beforehand; this encoder only passes
    # them through a fully connected layer.
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 49, embedding_dim)
        C = tf.keras.initializers.Constant
        # load the pretrained encoder weights exported as .npy files
        w1, w2 = [np.load("encoder_layer_weights/layer_%s_%s_weights_%s.npy" % (0, "dense", j))
                  for j in range(2)]
        self.fc = tf.keras.layers.Dense(embedding_dim, kernel_initializer=C(w1), bias_initializer=C(w2))

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        C = tf.keras.initializers.Constant
        w1, w2, w3, w4, w5, w6 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (4, "bahdanau_attention", j))
                                  for j in range(6)]
        self.W1 = tf.keras.layers.Dense(units, kernel_initializer=C(w1), bias_initializer=C(w2))
        self.W2 = tf.keras.layers.Dense(units, kernel_initializer=C(w3), bias_initializer=C(w4))
        self.V = tf.keras.layers.Dense(1, kernel_initializer=C(w5), bias_initializer=C(w6))

    def call(self, features, hidden):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)
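        # score shape == (batch_size, 49, units)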
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 49, 1)
        # the last axis is 1 because score is passed through self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        C = tf.keras.initializers.Constant
        # load the pretrained decoder weights exported as .npy files
        w_emb = np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (0, "embedding", 0))
        w_gru_1, w_gru_2, w_gru_3 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (1, "gru", j)) for j in range(3)]
        w1, w2 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (2, "dense_1", j)) for j in range(2)]
        w3, w4 = [np.load("decoder_layer_weights/layer_%s_%s_weights_%s.npy" % (3, "dense_2", j)) for j in range(2)]

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, embeddings_initializer=C(w_emb))
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       kernel_initializer=C(w_gru_1),
                                       recurrent_initializer=C(w_gru_2),
                                       bias_initializer=C(w_gru_3))
        self.fc1 = tf.keras.layers.Dense(self.units, kernel_initializer=C(w1), bias_initializer=C(w2))
        self.fc2 = tf.keras.layers.Dense(vocab_size, kernel_initializer=C(w3), bias_initializer=C(w4))

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # attention over the image features, conditioned on the previous hidden state
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        # zero initial hidden state for the GRU
        return tf.zeros((batch_size, self.units))



Binary file added scenes/tokenizer.pickle
Binary file not shown.