在python中使用keras方法的图像字幕中的完整标题 [英] full caption in Image captioning using keras approach in python

查看：234 发布时间：2018/7/25 17:37:27 python image keras caption

本文介绍了在python中使用keras方法的图像字幕中的完整标题的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我尝试过使用keras方法的图像字幕，我只得到序列中的下一个字，我如何获得图像的完整标题？
我得到了下一个字值
，就像res中的输出是（5,5）（测试中的两个图像），这是与单词相关联的数字。

I have tried Image captioning using keras approach , I only get the next word in the sequence, how do I get the full caption of the images ? I got the next word value like the output in res is (5,5)(two images in test) which is number associated with the words.

这是我的代码。

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,TimeDistributed,RepeatVector,Merge
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
import cv2
import numpy as np
from keras import backend as K 
K.set_image_dim_ordering('th')

max_caption_len = 15
vocab_size = 20
def VGG_16(weights_path=None):
    model = Sequential()
    model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(64, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2),dim_ordering='th'))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(128, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2),dim_ordering='th'))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(256, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2),dim_ordering='th'))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2),dim_ordering='th'))

    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Convolution2D(512, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2),dim_ordering='th'))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model
print "VGG loading"
image_model = VGG_16('vgg16_weights_th_dim_ordering_th_kernels.h5')
image_model.trainable = False
print "VGG loaded"
# let's load the weights from a save file.
# image_model.load_weights('weight_file.h5')

# next, let's define a RNN model that encodes sequences of words
# into sequences of 128-dimensional word vectors.
print "Text model loading"
language_model = Sequential()
language_model.add(Embedding(vocab_size, 256, input_length=max_caption_len))
language_model.add(GRU(units=128, return_sequences=True))
language_model.add(TimeDistributed(Dense(128)))
print "Text model loaded"
# let's repeat the image vector to turn it into a sequence.
print "Repeat model loading"
image_model.add(RepeatVector(max_caption_len))
print "Repeat model loaded"
# the output of both models will be tensors of shape (samples, max_caption_len, 128).
# let's concatenate these 2 vector sequences.
print "Merging"
model = Sequential()
model.add(Merge([image_model, language_model], mode='concat', concat_axis=-1))
# let's encode this vector sequence into a single vector
model.add(GRU(256, return_sequences=False))
# which will be used to compute a probability
# distribution over what the next word in the caption should be!
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print "Merged"
# "images" is a numpy float array of shape (nb_samples, nb_channels=3, width, height).
# "captions" is a numpy integer array of shape (nb_samples, max_caption_len)
# containing word index sequences representing partial captions.
# "next_words" is a numpy float array of shape (nb_samples, vocab_size)
# containing a categorical encoding (0s and 1s) of the next word in the corresponding
# partial caption.
print "Data preprocessig"
Texts = ["START No MP seen END",
        "START No MP seen END",
        "START No abnormality seen END",
        "START No abnormality seen END",
        "START Overall Finding suggest possiblity of fungal lesion END"]

Images = ['Images/General Report_1864_b135a.jpg',
          'Images/General Report_1623_f3bee.jpg',
          'Images/General Report_3678_02bbc0.jpg',
          'Images/General Report_3678_56949.jpg',
          'Images/General Report_3998_21c27.jpg']
images = []
for image in Images:
    img = cv2.imread(image)
    img.resize((3,224,224))
    images.append(img)
images = np.asarray(images)

words = [txt.split() for txt in Texts]
unique = []
for word in words:
    unique.extend(word)
unique = list(set(unique))
word_index = {}
index_word = {}
for i,word in enumerate(unique):
    word_index[word] = i
    index_word[i] = word

partial_captions = []
for text in Texts:
    one = [word_index[txt] for txt in text.split()]
    partial_captions.append(one)

partial_captions = sequence.pad_sequences(partial_captions, maxlen=max_caption_len,padding='post')
next_words = np.zeros((5,vocab_size))
for i,text in enumerate(Texts):
    text = text.split()
    x = [word_index[txt] for txt in text]
    x = np.asarray(x)
    next_words[i,x] = 1

print "Data preprocessing done"
# The structure is as follows:
#(image, partial_caption)->(next_words)
#(image, "word1 word2")->("word3")    
model.fit([images, partial_captions], next_words, batch_size=1, nb_epoch=5)

# testing
Test_images=['testing Images/General Report_2361_e5399.jpg', 
             'testing Images/General Report_2660_04a446.jpg']
test_texts= ["START No MP seen END",
             "START No intra/axial collections seen END"]
test_images=[]
for i in Test_images:
    im=cv2.imread(i)
    im.resize((3,224,224))
    test_images.append(im)
test_images = np.asarray(test_images)
test_words = [txt.split() for txt in test_texts]
unique = []
for word in test_words:
    unique.extend(word)
unique = list(set(unique))
test_word_index = {}
test_index_word = {}
for i,word in enumerate(unique):
    test_word_index[word] = i
    test_index_word[i] = word

test_partial_captions = []
for text in test_texts:
    one = [test_word_index[txt] for txt in text.split()]
    test_partial_captions.append(one)

test_partial_captions = sequence.pad_sequences(partial_captions, maxlen=max_caption_len,padding='post')
test_next_words = np.zeros((5,vocab_size))
for i,text in enumerate(test_texts):
    text = text.split()
    x = [test_word_index[txt] for txt in text]
    x = np.asarray(x)
    test_next_words[i,x] = 1

res=model.predict_classes([test_images,test_partial_captions])
print res

在python中使用keras方法的图像字幕中的完整标题 [英] full caption in Image captioning using keras approach in python

问题描述

推荐答案

相关文章

Python最新文章

热门教程

热门工具

登录关闭

在python中使用keras方法的图像字幕中的完整标题 [英] full caption in Image captioning using keras approach in python

问题描述

推荐答案

相关文章

Python最新文章

热门教程

热门工具

登录 关闭

登录关闭