在python中使用keras方法的图像字幕中的完整标题 [英] full caption in Image captioning using keras approach in python

查看:234
本文介绍了在python中使用keras方法的图像字幕中的完整标题的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我尝试了基于 Keras 的图像字幕(image captioning)方法,但只能得到序列中的下一个单词,怎样才能得到图像的完整标题?
我得到的是下一个单词的值,例如 res 的输出是 (5,5)(测试集中有两张图像),这些数字是与单词相关联的索引。

I have tried Image captioning using keras approach , I only get the next word in the sequence, how do I get the full caption of the images ? I got the next word value like the output in res is (5,5)(two images in test) which is number associated with the words.

这是我的代码。

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,TimeDistributed,RepeatVector,Merge
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
import cv2
import numpy as np
from keras import backend as K 
# Select the 'th' image dimension ordering; the network input further down is
# declared as (3, 224, 224), i.e. channels come first.
K.set_image_dim_ordering('th')

# Fixed caption length: used as the Embedding input length, the RepeatVector
# repeat count and the pad_sequences target length.
max_caption_len = 15
# Size of the word-id space: Embedding input dimension and width of the final
# softmax layer / next-word target vectors.
vocab_size = 20
def VGG_16(weights_path=None):
    """Assemble the VGG-16 classifier as a Sequential model.

    The network takes channels-first input of shape (3, 224, 224) and is made
    of five convolutional stages (each a run of zero-padded 3x3 ReLU
    convolutions followed by a 2x2 max-pool), then two 4096-unit ReLU dense
    layers with 0.5 dropout and a 1000-way softmax.

    weights_path: optional path handed to ``model.load_weights``; weights are
        only loaded when a truthy value is given.

    Returns the constructed model.
    """
    # (number of filters, number of padded conv layers) for each stage.
    stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = Sequential()
    is_input_layer = True
    for filters, depth in stages:
        for _ in range(depth):
            if is_input_layer:
                # Only the very first layer declares the input shape.
                model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
                is_input_layer = False
            else:
                model.add(ZeroPadding2D((1,1)))
            model.add(Convolution2D(filters, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2,2), strides=(2,2),dim_ordering='th'))

    # Classifier head.
    model.add(Flatten())
    for _ in range(2):
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model
# Build the captioning network: a frozen VGG-16 image branch and a GRU text
# branch are concatenated step by step and fed to a GRU that predicts the next
# word. print() is called with a single argument so the script parses on
# Python 3 and still prints the same text on Python 2.
print("VGG loading")
# VGG_16 loads the pretrained weights itself when given a path.
image_model = VGG_16('vgg16_weights_th_dim_ordering_th_kernels.h5')
image_model.trainable = False
print("VGG loaded")

# next, let's define a RNN model that encodes sequences of word ids
# (embedded as 256-dimensional vectors) into sequences of 128-dimensional vectors.
print("Text model loading")
language_model = Sequential()
language_model.add(Embedding(vocab_size, 256, input_length=max_caption_len))
language_model.add(GRU(units=128, return_sequences=True))
language_model.add(TimeDistributed(Dense(128)))
print("Text model loaded")
# let's repeat the image vector to turn it into a sequence.
print("Repeat model loading")
image_model.add(RepeatVector(max_caption_len))
print("Repeat model loaded")
# the image branch now outputs (samples, max_caption_len, 1000) -- the repeated
# 1000-way VGG softmax -- and the language branch (samples, max_caption_len, 128).
# let's concatenate these 2 vector sequences along the feature axis.
print("Merging")
model = Sequential()
model.add(Merge([image_model, language_model], mode='concat', concat_axis=-1))
# let's encode this vector sequence into a single vector
model.add(GRU(256, return_sequences=False))
# which will be used to compute a probability
# distribution over what the next word in the caption should be!
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print("Merged")
# "images" is a numpy float array of shape (nb_samples, nb_channels=3, width, height).
# "captions" is a numpy integer array of shape (nb_samples, max_caption_len)
# containing word index sequences representing partial captions.
# "next_words" is a numpy float array of shape (nb_samples, vocab_size)
# containing a categorical encoding (0s and 1s) of the next word in the corresponding
# partial caption.
# Prepare the training set and fit the model.
# Every caption is expanded into (image, caption prefix) -> next word samples,
# which is the structure the model and its softmax/categorical-crossentropy
# loss are built for.
print("Data preprocessig")
Texts = ["START No MP seen END",
        "START No MP seen END",
        "START No abnormality seen END",
        "START No abnormality seen END",
        "START Overall Finding suggest possiblity of fungal lesion END"]

Images = ['Images/General Report_1864_b135a.jpg',
          'Images/General Report_1623_f3bee.jpg',
          'Images/General Report_3678_02bbc0.jpg',
          'Images/General Report_3678_56949.jpg',
          'Images/General Report_3998_21c27.jpg']

caption_images = []
for image_path in Images:
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise IOError("Could not read image: %s" % image_path)
    # ndarray.resize() only truncates/zero-pads the raw buffer; rescale the
    # picture with cv2.resize and then move channels first: (224,224,3) -> (3,224,224).
    img = cv2.resize(img, (224, 224))
    caption_images.append(img.transpose((2, 0, 1)))

words = [txt.split() for txt in Texts]
# Sorted so that word ids are reproducible between runs.
unique = sorted(set(word for caption in words for word in caption))
if len(unique) + 1 > vocab_size:
    raise ValueError("vocab_size=%d is too small for %d words plus padding"
                     % (vocab_size, len(unique)))
word_index = {}
index_word = {}
# Ids start at 1: id 0 is what pad_sequences fills in, so it must not be a word.
for i, word in enumerate(unique, 1):
    word_index[word] = i
    index_word[i] = word

sample_images = []
partial_captions = []
next_word_ids = []
for img, caption in zip(caption_images, words):
    ids = [word_index[word] for word in caption]
    # One sample per prefix: ids[:pos] is the partial caption, ids[pos] the target.
    for pos in range(1, len(ids)):
        sample_images.append(img)
        partial_captions.append(ids[:pos])
        next_word_ids.append(ids[pos])

images = np.asarray(sample_images)
partial_captions = sequence.pad_sequences(partial_captions, maxlen=max_caption_len,padding='post')
# One-hot encoding of the single next word of each sample.
next_words = np.zeros((len(next_word_ids), vocab_size))
next_words[np.arange(len(next_word_ids)), next_word_ids] = 1

print("Data preprocessing done")
# The structure is as follows:
#(image, partial_caption)->(next_words)
#(image, "word1 word2")->("word3")
model.fit([images, partial_captions], next_words, batch_size=1, nb_epoch=5)

# testing
# Load the test images, encode the test captions with the TRAINING vocabulary
# (the ids the model was fitted on) and print the predicted next-word id for
# each test sample.
Test_images=['testing Images/General Report_2361_e5399.jpg',
             'testing Images/General Report_2660_04a446.jpg']
test_texts= ["START No MP seen END",
             "START No intra/axial collections seen END"]
test_images=[]
for test_path in Test_images:
    im = cv2.imread(test_path)
    if im is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise IOError("Could not read image: %s" % test_path)
    # Rescale with cv2.resize (ndarray.resize would only truncate the buffer),
    # then move channels first: (224,224,3) -> (3,224,224).
    im = cv2.resize(im, (224, 224))
    test_images.append(im.transpose((2, 0, 1)))
test_images = np.asarray(test_images)
test_words = [txt.split() for txt in test_texts]

# Reuse word_index from training; a vocabulary rebuilt from the test captions
# would assign unrelated ids. Words never seen in training are skipped.
test_caption_ids = [[word_index[word] for word in caption if word in word_index]
                    for caption in test_words]

# Pad the test captions themselves (the original padded the training captions here).
test_partial_captions = sequence.pad_sequences(test_caption_ids, maxlen=max_caption_len,padding='post')
# Reference encoding of the test captions; not consumed by the prediction below.
test_next_words = np.zeros((len(test_texts),vocab_size))
for i, ids in enumerate(test_caption_ids):
    test_next_words[i,ids] = 1

res=model.predict_classes([test_images,test_partial_captions])
print(res)


推荐答案

你可以使用 model.predict 来代替 model.predict_classes。它会返回下一个单词在 vocab_size(即你在 next_words 形状中给定的词表大小)个候选词上的 softmax 概率。获得 softmax 概率后,您可以从中获得更多的单词。

Instead of model.predict_classes, you can use model.predict. This will return the softmax probabilities of next words for the vocab_size that you have given in the next_words shape. After getting the softmax probabilities, you can get more words from it.

这篇关于在python中使用keras方法的图像字幕中的完整标题的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆