Describing Videos by Exploiting Temporal Structure

Note: the dataset preparation described in this post also applies to the following two papers.

 Attention-based LSTM with Semantic Consistency for Video Captioning

 Hierarchical LSTM with Adjusted Temporal Attention for Video Captioning

The authors provide the code on GitHub, and the experiments can be reproduced by downloading the datasets listed in their README. However, only the preprocessed MSVD data is provided; to test on other datasets you have to build the data yourself, and the authors do not provide code for that. Based on my own experiments, this post gives my code for building the data (using MSR-VTT as the example), shared here for mutual reference.

1. Convert the videos into frames

import os

video_path = '/data/MSRVTTClips/train-video/'
frame_path = '/data/MSRVTTFrames/'

for video in os.listdir(video_path):
    # MSR-VTT clips are named video0.mp4 ... video9999.mp4; the scripts in
    # steps 2 and 3 index videos as vid1 ... vid10000, so shift the index by one.
    num = int(video.split('video')[-1].split('.mp4')[0]) + 1
    out_dir = frame_path + 'vid' + str(num)
    os.mkdir(out_dir)
    # %04d gives zero-padded frame names (frame-0001.jpg, ...) that match the
    # reader in step 2.
    os.system("ffmpeg -i " + video_path + video + " " + out_dir + "/frame-%04d.jpg")
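Before running the feature extraction, it is worth checking that every video produced a non-empty frame folder. The sketch below is just my own quick sanity check and only assumes the frame_path used above:

import os

frame_path = '/data/MSRVTTFrames/'
counts = []
for d in sorted(os.listdir(frame_path)):
    n = len(os.listdir(os.path.join(frame_path, d)))
    counts.append(n)
    if n == 0:
        print('empty folder: ' + d)
print('videos: {}, min/max frames: {}/{}'.format(len(counts), min(counts), max(counts)))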
2. Extract frame features

 

import os
import sys

# Make pycaffe importable
caffe_root = '/home/caffe_cudnn/python/'
sys.path.insert(0, caffe_root)

import numpy as np
import caffe

gpu_id = 2
caffe.set_device(gpu_id)
caffe.set_mode_gpu()

# ResNet-152, features taken from the pool5 layer (2048-d per frame)
layer_num = 152
extract_from_layer = 'pool5'
model_def = "/home/caffe_cudnn/models/resnet/ResNet-" + str(layer_num) + "-deploy.prototxt"
pretrained_model = "/home/caffe_cudnn/models/resnet/ResNet-" + str(layer_num) + "-model.caffemodel"
batch_size = 1
folder_path = '/data/MSRVTTFrames/'
save_path = '/data/msrvtt/resnet' + str(layer_num) + '/'
mean_file = "/home/caffe_cudnn/models/resnet/ResNet_mean.npy"

net = caffe.Net(model_def, pretrained_model, caffe.TEST)

# Standard Caffe preprocessing: HWC->CHW, RGB->BGR, [0,1]->[0,255], mean subtraction
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))
transformer.set_channel_swap('data', (2, 1, 0))
transformer.set_raw_scale('data', 255)
transformer.set_mean('data', np.reshape(np.load(mean_file), (3, 224, 224)))

# For each video, run every frame through the network and stack the pool5
# features into a (num_frames, 2048) array saved as vidN.npy
for i in range(1, 10001):
    video_path = os.path.join(folder_path, 'vid' + str(i) + '/')
    feature = []
    for idx in range(1, len(os.listdir(video_path)) + 1):
        frame = caffe.io.load_image(video_path + 'frame-' + str(idx).zfill(4) + '.jpg')
        net.blobs['data'].data[0] = transformer.preprocess('data', frame)
        temp = net.forward()
        feat = net.blobs[extract_from_layer].data[0].copy()
        feat = np.reshape(feat, (2048,))
        feature.append(feat)
    feature = np.asarray(feature)
    np.save(save_path + 'vid' + str(i) + '.npy', feature)
    print(video_path)
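After extraction, every vidN.npy should contain one 2048-d pool5 vector per frame, i.e. an array of shape (num_frames, 2048). A quick check along these lines (vid1 is only an example; the path follows save_path above):

import numpy as np

feat = np.load('/data/msrvtt/resnet152/vid1.npy')
print(feat.shape)   # expected: (num_frames, 2048)
print(feat.dtype)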
3. Build CAP.pkl, worddict.pkl and the split files

import json
import nltk
import pickle
import collections
import random
from collections import Counter

# Path to the MSR-VTT annotation json (this path is my own setup; point it at
# the annotation file that ships with MSR-VTT)
anno_json_path = '/data/msrvtt/videodatainfo.json'

with open(anno_json_path, 'r') as f:
    anno_json = json.load(f)
anno_data = anno_json['sentences']
sentences = anno_data

# Build the word-frequency dictionary over all captions
counter = Counter()
ncaptions = len(sentences)
for i, row in enumerate(sentences):
    caption = row['caption']
    # split directly on spaces:
    # tokens = caption.lower().split(' ')
    # use nltk for tokenization instead:
    tokens = nltk.tokenize.word_tokenize(caption.lower())
    counter.update(tokens)
    if i % 10000 == 0:
        print('[{}/{}] tokenized the captions.'.format(i, ncaptions))

with open('/data/msrvtt/worddict.pkl', 'w') as f:
    pickle.dump(counter, f)

# Group the captions by video; MSR-VTT names videos video0 ... video9999,
# which are mapped to vid1 ... vid10000 here
temp = {}
for j in range(1, 10001):
    temp['vid' + str(j)] = []
for i in range(len(sentences)):
    tmp = {}
    tmp['caption'] = sentences[i]['caption']
    tmp['cap_id'] = sentences[i]['sen_id']
    tmp['image_id'] = 'vid' + str(int(sentences[i]['video_id'].split('video')[-1]) + 1)
    tmp['tokenized'] = ' '.join(nltk.tokenize.word_tokenize(sentences[i]['caption'].lower()))
    temp['vid' + str(int(sentences[i]['video_id'].split('video')[-1]) + 1)].append(tmp)

# Renumber the caption ids of each video so that they start from 0
tp = {}
for j in range(1, 10001):
    tp['vid' + str(j)] = []
for k in range(1, 10001):
    tmp = temp['vid' + str(k)]
    min_id = min(tmp[i]['cap_id'] for i in range(len(tmp)))
    for m in range(len(tmp)):
        tmp[m]['cap_id'] -= min_id
        tmp[m]['cap_id'] = str(tmp[m]['cap_id'])
    tp['vid' + str(k)] = tmp

d = collections.OrderedDict()
for i in range(1, 10001):
    d['vid' + str(i)] = tp['vid' + str(i)]
with open('/data/msrvtt/CAP.pkl', 'w') as f:
    pickle.dump(d, f)

# Train / valid / test splits: 6513 / 497 / 2990 videos, 20 captions each
tmp = []
for i in range(1, 6514):
    for j in range(20):
        tmp.append('vid' + str(i) + '_' + str(j))
random.shuffle(tmp)
with open('/data/msrvtt/train.pkl', 'w') as f:
    pickle.dump(tmp, f)

tmp = []
for i in range(6514, 7011):
    for j in range(20):
        tmp.append('vid' + str(i) + '_' + str(j))
random.shuffle(tmp)
with open('/data/msrvtt/valid.pkl', 'w') as f:
    pickle.dump(tmp, f)

tmp = []
for i in range(7011, 10001):
    for j in range(20):
        tmp.append('vid' + str(i) + '_' + str(j))
random.shuffle(tmp)
with open('/data/msrvtt/test.pkl', 'w') as f:
    pickle.dump(tmp, f)
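Finally, I load the generated pickle files back to make sure they look like the MSVD data expected by the author's code. This is only a sketch under the paths used above and, like the scripts themselves, it is written for Python 2:

import pickle

with open('/data/msrvtt/CAP.pkl', 'r') as f:
    cap = pickle.load(f)
with open('/data/msrvtt/worddict.pkl', 'r') as f:
    worddict = pickle.load(f)
with open('/data/msrvtt/train.pkl', 'r') as f:
    train = pickle.load(f)

print(len(cap))        # 10000 videos
print(cap['vid1'][0])  # one caption entry: caption, cap_id, image_id, tokenized
print(len(worddict))   # vocabulary size (word -> frequency)
print(train[:5])       # entries like 'vid123_4' (video id + caption id)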








