diff --git a/README.md b/README.md
index 1272153..4204b75 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,12 @@ This repo contains the Pytorch implementation of the AAAI'18 paper - [Deep Reinf
 The main requirements are [pytorch](http://pytorch.org/) (`v0.4.0`) and python `2.7`. Some dependencies that may not be installed in your machine are [tabulate](https://pypi.org/project/tabulate/) and [h5py](https://github.com/h5py/h5py). Please install other missing dependencies.
 
+## TODO
+1. Edit README
+2. Restructure repository
+3. ~~Add KTS (Kernel Temporal Segmentation)~~
+4. Edit code for test
+
 ## Get started
 1. Download preprocessed datasets
 ```bash
@@ -16,6 +22,11 @@ cd pytorch-vsumm-reinforce
 wget http://www.eecs.qmul.ac.uk/~kz303/vsumm-reinforce/datasets.tar.gz
 tar -xvzf datasets.tar.gz
 ```
+* If the download fails, use the following link (the same dataset is mirrored on OneDrive):
+```
+https://onedrive.live.com/?authkey=%21AO1tsqjDVCeakGg&cid=6FD3437627D709EE&id=6FD3437627D709EE%212809&parId=root&action=locate
+```
+
 2. Make splits
 ```bash
 python create_split.py -d datasets/eccv16_dataset_summe_google_pool5.h5 --save-dir datasets --save-name summe_splits --num-splits 5
@@ -25,13 +36,23 @@ As a result, the dataset is randomly split for 5 times, which are saved as json
-Train and test codes are written in `main.py`. To see the detailed arguments, please do `python main.py -h`.
+Train and test codes are written in `video_summarization.py`; the arguments that used to be CLI flags now live in `config/config.py` (see `config/README.txt`).
 
 ## How to train
+* Edit `config/config.py` (see `config/README.txt`)
+
 ```bash
-python main.py -d datasets/eccv16_dataset_summe_google_pool5.h5 -s datasets/summe_splits.json -m summe --gpu 0 --save-dir log/summe-split0 --split-id 0 --verbose
+python video_summarization.py
+```
+## How to evaluate
+* Edit `config/config.py` (see `config/README.txt`)
+
+```bash
+python video_summarization.py
 ```
 
 ## How to test
+* Edit `config/config.py` (see `config/README.txt`)
+
 ```bash
-python main.py -d datasets/eccv16_dataset_summe_google_pool5.h5 -s datasets/summe_splits.json -m summe --gpu 0 --save-dir log/summe-split0 --split-id 0 --evaluate --resume path_to_your_model.pth.tar --verbose --save-results
+python video_summarization.py
 ```
 
-If argument `--save-results` is enabled, output results will be saved to `results.h5` under the same folder specified by `--save-dir`. To visualize the score-vs-gtscore, simple do
+If `SAVE_RESULTS` is enabled in the config, output results will be saved to `result.h5` under the folder specified by `SAVE_DIR`. To visualize the score-vs-gtscore, simply do
 
@@ -73,6 +94,7 @@ Please remember to specify the naming format of your video frames on this [line]
 ## How to use your own data
 We preprocess data by extracting image features for videos and save them to `h5` file. The file format looks like [this](https://github.com/KaiyangZhou/vsumm-reinforce/issues/1#issuecomment-363492711). After that, you can make split via `create_split.py`. If you wanna train policy network using the entire dataset, just do `train_keys = dataset.keys()`. [Here](https://github.com/KaiyangZhou/pytorch-vsumm-reinforce/blob/master/main.py#L75) is the code where we initialize dataset. If you have any problems, feel free to contact me by email or raise an `issue`.
 
+
 ## Citation
 ```
 @article{zhou2017reinforcevsumm,
@@ -81,4 +103,4 @@ We preprocess data by extracting image features for videos and save them to `h5`
   title={Deep Reinforcement Learning for Unsupervised Video Summarization with Diversity-Representativeness Reward},
   journal={arXiv:1801.00054},
   year={2017}
 }
-```
\ No newline at end of file
+```
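Note: all three README modes invoke the same entry point; the mode is selected by the boolean flags in `config/config.py`. A minimal sketch (not part of the PR) of how those flags map onto the modes, mirroring the dispatch order in `video_summarization.py` below, which checks `TEST` before `EVALUATE`:

```python
# Hypothetical helper, not in this PR: reports which mode a given
# config/config.py produces, following video_summarization.py's dispatch.
from config import config

def run_mode():
    if config.TEST:        # inference on unlabelled videos, writes result_test.h5
        return 'test'
    if config.EVALUATE:    # F-score evaluation on a labelled split
        return 'evaluate'
    return 'train'         # default: REINFORCE training

print(run_mode())
```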
diff --git a/config/README.txt b/config/README.txt
new file mode 100644
index 0000000..54dcc3b
--- /dev/null
+++ b/config/README.txt
@@ -0,0 +1,47 @@
+Configuration README
+====================
+
+1. example config to train
+# Dataset options
+DATASET = 'datasets/eccv16_dataset_summe_google_pool5.h5'
+SPLIT = 'datasets/summe_splits.json'
+SPLIT_ID = 0
+METRIC = 'summe'
+
+# Misc
+GPU = '0'
+EVALUATE = False
+TEST = False
+VERBOSE = True
+SAVE_DIR = 'log/summe-split0'
+
+2. example config to evaluate
+# Dataset options
+DATASET = 'datasets/eccv16_dataset_summe_google_pool5.h5'
+SPLIT = 'datasets/summe_splits.json'
+SPLIT_ID = 0
+METRIC = 'summe'
+
+# Misc
+GPU = '0'
+EVALUATE = True
+TEST = False
+RESUME = 'log/summe-split0/model_epoch60.pth.tar'
+VERBOSE = True
+SAVE_DIR = 'log/summe-split0'
+SAVE_RESULTS = True
+
+3. example config to test
+# Dataset options
+DATASET = 'summe_dataset.h5'    # h5 file produced by utils/generate_dataset.py
+SPLIT = 'datasets/summe_splits.json'
+METRIC = 'summe'
+
+# Misc
+GPU = '0'
+EVALUATE = False
+TEST = True
+RESUME = 'log/summe-split0/model_epoch60.pth.tar'
+VERBOSE = True
+SAVE_DIR = 'log/summe-split0'
+SAVE_RESULTS = True
\ No newline at end of file
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000..635cf56
--- /dev/null
+++ b/config/__init__.py
@@ -0,0 +1 @@
+from config import *
\ No newline at end of file
diff --git a/config/config.py b/config/config.py
new file mode 100644
index 0000000..d2d8e76
--- /dev/null
+++ b/config/config.py
@@ -0,0 +1,32 @@
+# ============ TRAIN CONFIG ==============
+# Dataset options
+DATASET = 'datasets/eccv16_dataset_summe_google_pool5.h5' # path to h5 dataset (required)
+SPLIT = 'datasets/summe_splits.json' # path to split file (required)
+SPLIT_ID = 0 # split index (default: 0)
+METRIC = 'summe' # evaluation metric ['tvsum', 'summe']
+
+# Model options
+INPUT_DIM = 1024 # input dimension (default: 1024)
+HIDDEN_DIM = 256 # hidden unit dimension of DSN (default: 256)
+NUM_LAYERS = 1 # number of RNN layers (default: 1)
+RNN_CELL = 'lstm' # RNN cell type (default: lstm)
+
+# Optimization options
+LR = 1e-05 # learning rate (default: 1e-05)
+WEIGHT_DECAY = 1e-05 # weight decay rate (default: 1e-05)
+MAX_EPOCH = 60 # maximum epoch for training (default: 60)
+STEP_SIZE = 30 # how many steps to decay learning rate (default: 30)
+GAMMA = 0.1 # learning rate decay (default: 0.1)
+NUM_EPISODE = 5 # number of episodes (default: 5)
+BETA = 0.01 # weight for summary length penalty term (default: 0.01)
+
+# Misc
+SEED = 1 # random seed (default: 1)
+GPU = '0' # which gpu devices to use (default: 0)
+USE_CPU = False # use cpu device
+EVALUATE = False # whether to do evaluation only
+TEST = False # whether to generate summaries for unlabelled videos (no evaluation)
+RESUME = '' # path to resume file
+VERBOSE = True # whether to show detailed test results
+SAVE_DIR = 'log/summe-split0' # path to save output (default: log/)
+SAVE_RESULTS = True # whether to save output results
\ No newline at end of file
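For reference, a short sketch of inspecting the split file that `create_split.py` (next diff) writes; the `train_keys`/`test_keys` layout is taken from `split_random`/`create` below, the printed values are illustrative.

```python
# Sketch: inspect datasets/summe_splits.json produced by create_split.py.
from utils.file_process import read_json

splits = read_json('datasets/summe_splits.json')
print(len(splits))                  # one entry per split (--num-splits, default 5)
print(splits[0]['train_keys'][:3])  # ~80% of the video keys, e.g. ['video_1', ...]
print(splits[0]['test_keys'][:3])   # the remaining ~20%
```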
diff --git a/create_split.py b/create_split.py
index e7559a4..f6e1c63 100644
--- a/create_split.py
+++ b/create_split.py
@@ -1,26 +1,29 @@
+# Dataset Split
+
 from __future__ import print_function
+
 import os
-import os.path as osp
 import argparse
 import h5py
 import math
 import numpy as np
-from utils import write_json
+from utils.file_process import write_json
 
 parser = argparse.ArgumentParser("Code to create splits in json form")
-parser.add_argument('-d', '--dataset', type=str, required=True, help="path to h5 dataset (required)")
-parser.add_argument('--save-dir', type=str, default='datasets', help="path to save output json file (default: 'datasets/')")
-parser.add_argument('--save-name', type=str, default='splits', help="name to save as, excluding extension (default: 'splits')")
-parser.add_argument('--num-splits', type=int, default=5, help="how many splits to generate (default: 5)")
-parser.add_argument('--train-percent', type=float, default=0.8, help="percentage of training data (default: 0.8)")
+parser.add_argument("-d", "--dataset", type=str, required=True, help="path to h5 dataset (required)")
+parser.add_argument("--save-dir", type=str, default='datasets', help="path to save output json file (default: 'datasets/')")
+parser.add_argument("--save-name", type=str, default="splits", help="name to save as, excluding extension (default: 'splits')")
+parser.add_argument("--num-splits", type=int, default=5, help="how many splits to generate (default: 5)")
+parser.add_argument("--train-percent", type=float, default=0.8, help="percentage of training data (default: 0.8)")
 args = parser.parse_args()
 
 def split_random(keys, num_videos, num_train):
-    """Random split"""
+    """ Random split """
     train_keys, test_keys = [], []
     rnd_idxs = np.random.choice(range(num_videos), size=num_train, replace=False)
+
     for key_idx, key in enumerate(keys):
         if key_idx in rnd_idxs:
             train_keys.append(key)
@@ -32,29 +35,32 @@ def split_random(keys, num_videos, num_train):
     return train_keys, test_keys
 
 def create():
     print("==========\nArgs:{}\n==========".format(args))
     print("Goal: randomly split data for {} times, {:.1%} for training and the rest for testing".format(args.num_splits, args.train_percent))
-    print("Loading dataset from {}".format(args.dataset))
+    print("Loading dataset from: {}".format(args.dataset))
+
     dataset = h5py.File(args.dataset, 'r')
     keys = dataset.keys()
     num_videos = len(keys)
     num_train = int(math.ceil(num_videos * args.train_percent))
     num_test = num_videos - num_train
     print("Split breakdown: # total videos {}. # train videos {}. # test videos {}".format(num_videos, num_train, num_test))
 
     splits = []
     for split_idx in range(args.num_splits):
         train_keys, test_keys = split_random(keys, num_videos, num_train)
+
         splits.append({
             'train_keys': train_keys,
             'test_keys': test_keys,
-            })
+        })
 
-    saveto = osp.join(args.save_dir, args.save_name + '.json')
-    write_json(splits, saveto)
-    print("Splits saved to {}".format(saveto))
+    save_path = os.path.join(args.save_dir, args.save_name + '.json')
+    write_json(splits, save_path)
+    print("Splits saved to {}".format(save_path))
 
     dataset.close()
 
 if __name__ == '__main__':
-    create()
\ No newline at end of file
+    create()
+
diff --git a/main.py b/main.py
deleted file mode 100644
index 19c642d..0000000
--- a/main.py
+++ /dev/null
@@ -1,208 +0,0 @@
-from __future__ import print_function
-import os
-import os.path as osp
-import argparse
-import sys
-import h5py
-import time
-import datetime
-import numpy as np
-from tabulate import tabulate
-
-import torch
-import torch.nn as nn
-import torch.backends.cudnn as cudnn
-from torch.optim import lr_scheduler
-from torch.distributions import Bernoulli
-
-from utils import Logger, read_json, write_json, save_checkpoint
-from models import *
-from rewards import compute_reward
-import vsum_tools
-
-parser = argparse.ArgumentParser("Pytorch code for unsupervised video summarization with REINFORCE")
-# Dataset options
-parser.add_argument('-d', '--dataset', type=str, required=True, help="path to h5 dataset (required)")
-parser.add_argument('-s', '--split', type=str, required=True, help="path to split file (required)")
-parser.add_argument('--split-id', type=int, default=0, help="split index (default: 0)")
-parser.add_argument('-m', '--metric', type=str, required=True, choices=['tvsum', 'summe'],
-                    help="evaluation metric ['tvsum', 'summe']")
-# Model options
-parser.add_argument('--input-dim', type=int,
default=1024, help="input dimension (default: 1024)") -parser.add_argument('--hidden-dim', type=int, default=256, help="hidden unit dimension of DSN (default: 256)") -parser.add_argument('--num-layers', type=int, default=1, help="number of RNN layers (default: 1)") -parser.add_argument('--rnn-cell', type=str, default='lstm', help="RNN cell type (default: lstm)") -# Optimization options -parser.add_argument('--lr', type=float, default=1e-05, help="learning rate (default: 1e-05)") -parser.add_argument('--weight-decay', type=float, default=1e-05, help="weight decay rate (default: 1e-05)") -parser.add_argument('--max-epoch', type=int, default=60, help="maximum epoch for training (default: 60)") -parser.add_argument('--stepsize', type=int, default=30, help="how many steps to decay learning rate (default: 30)") -parser.add_argument('--gamma', type=float, default=0.1, help="learning rate decay (default: 0.1)") -parser.add_argument('--num-episode', type=int, default=5, help="number of episodes (default: 5)") -parser.add_argument('--beta', type=float, default=0.01, help="weight for summary length penalty term (default: 0.01)") -# Misc -parser.add_argument('--seed', type=int, default=1, help="random seed (default: 1)") -parser.add_argument('--gpu', type=str, default='0', help="which gpu devices to use") -parser.add_argument('--use-cpu', action='store_true', help="use cpu device") -parser.add_argument('--evaluate', action='store_true', help="whether to do evaluation only") -parser.add_argument('--save-dir', type=str, default='log', help="path to save output (default: 'log/')") -parser.add_argument('--resume', type=str, default='', help="path to resume file") -parser.add_argument('--verbose', action='store_true', help="whether to show detailed test results") -parser.add_argument('--save-results', action='store_true', help="whether to save output results") - -args = parser.parse_args() - -torch.manual_seed(args.seed) -os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu -use_gpu = torch.cuda.is_available() -if args.use_cpu: use_gpu = False - -def main(): - if not args.evaluate: - sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt')) - else: - sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt')) - print("==========\nArgs:{}\n==========".format(args)) - - if use_gpu: - print("Currently using GPU {}".format(args.gpu)) - cudnn.benchmark = True - torch.cuda.manual_seed_all(args.seed) - else: - print("Currently using CPU") - - print("Initialize dataset {}".format(args.dataset)) - dataset = h5py.File(args.dataset, 'r') - num_videos = len(dataset.keys()) - splits = read_json(args.split) - assert args.split_id < len(splits), "split_id (got {}) exceeds {}".format(args.split_id, len(splits)) - split = splits[args.split_id] - train_keys = split['train_keys'] - test_keys = split['test_keys'] - print("# total videos {}. # train videos {}. 
# test videos {}".format(num_videos, len(train_keys), len(test_keys))) - - print("Initialize model") - model = DSN(in_dim=args.input_dim, hid_dim=args.hidden_dim, num_layers=args.num_layers, cell=args.rnn_cell) - print("Model size: {:.5f}M".format(sum(p.numel() for p in model.parameters())/1000000.0)) - - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) - if args.stepsize > 0: - scheduler = lr_scheduler.StepLR(optimizer, step_size=args.stepsize, gamma=args.gamma) - - if args.resume: - print("Loading checkpoint from '{}'".format(args.resume)) - checkpoint = torch.load(args.resume) - model.load_state_dict(checkpoint) - else: - start_epoch = 0 - - if use_gpu: - model = nn.DataParallel(model).cuda() - - if args.evaluate: - print("Evaluate only") - evaluate(model, dataset, test_keys, use_gpu) - return - - print("==> Start training") - start_time = time.time() - model.train() - baselines = {key: 0. for key in train_keys} # baseline rewards for videos - reward_writers = {key: [] for key in train_keys} # record reward changes for each video - - for epoch in range(start_epoch, args.max_epoch): - idxs = np.arange(len(train_keys)) - np.random.shuffle(idxs) # shuffle indices - - for idx in idxs: - key = train_keys[idx] - seq = dataset[key]['features'][...] # sequence of features, (seq_len, dim) - seq = torch.from_numpy(seq).unsqueeze(0) # input shape (1, seq_len, dim) - if use_gpu: seq = seq.cuda() - probs = model(seq) # output shape (1, seq_len, 1) - - cost = args.beta * (probs.mean() - 0.5)**2 # minimize summary length penalty term [Eq.11] - m = Bernoulli(probs) - epis_rewards = [] - for _ in range(args.num_episode): - actions = m.sample() - log_probs = m.log_prob(actions) - reward = compute_reward(seq, actions, use_gpu=use_gpu) - expected_reward = log_probs.mean() * (reward - baselines[key]) - cost -= expected_reward # minimize negative expected reward - epis_rewards.append(reward.item()) - - optimizer.zero_grad() - cost.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0) - optimizer.step() - baselines[key] = 0.9 * baselines[key] + 0.1 * np.mean(epis_rewards) # update baseline reward via moving average - reward_writers[key].append(np.mean(epis_rewards)) - - epoch_reward = np.mean([reward_writers[key][epoch] for key in train_keys]) - print("epoch {}/{}\t reward {}\t".format(epoch+1, args.max_epoch, epoch_reward)) - - write_json(reward_writers, osp.join(args.save_dir, 'rewards.json')) - evaluate(model, dataset, test_keys, use_gpu) - - elapsed = round(time.time() - start_time) - elapsed = str(datetime.timedelta(seconds=elapsed)) - print("Finished. Total elapsed time (h:m:s): {}".format(elapsed)) - - model_state_dict = model.module.state_dict() if use_gpu else model.state_dict() - model_save_path = osp.join(args.save_dir, 'model_epoch' + str(args.max_epoch) + '.pth.tar') - save_checkpoint(model_state_dict, model_save_path) - print("Model saved to {}".format(model_save_path)) - - dataset.close() - -def evaluate(model, dataset, test_keys, use_gpu): - print("==> Test") - with torch.no_grad(): - model.eval() - fms = [] - eval_metric = 'avg' if args.metric == 'tvsum' else 'max' - - if args.verbose: table = [["No.", "Video", "F-score"]] - - if args.save_results: - h5_res = h5py.File(osp.join(args.save_dir, 'result.h5'), 'w') - - for key_idx, key in enumerate(test_keys): - seq = dataset[key]['features'][...] 
- seq = torch.from_numpy(seq).unsqueeze(0) - if use_gpu: seq = seq.cuda() - probs = model(seq) - probs = probs.data.cpu().squeeze().numpy() - - cps = dataset[key]['change_points'][...] - num_frames = dataset[key]['n_frames'][()] - nfps = dataset[key]['n_frame_per_seg'][...].tolist() - positions = dataset[key]['picks'][...] - user_summary = dataset[key]['user_summary'][...] - - machine_summary = vsum_tools.generate_summary(probs, cps, num_frames, nfps, positions) - fm, _, _ = vsum_tools.evaluate_summary(machine_summary, user_summary, eval_metric) - fms.append(fm) - - if args.verbose: - table.append([key_idx+1, key, "{:.1%}".format(fm)]) - - if args.save_results: - h5_res.create_dataset(key + '/score', data=probs) - h5_res.create_dataset(key + '/machine_summary', data=machine_summary) - h5_res.create_dataset(key + '/gtscore', data=dataset[key]['gtscore'][...]) - h5_res.create_dataset(key + '/fm', data=fm) - - if args.verbose: - print(tabulate(table)) - - if args.save_results: h5_res.close() - - mean_fm = np.mean(fms) - print("Average F-score {:.1%}".format(mean_fm)) - - return mean_fm - -if __name__ == '__main__': - main() diff --git a/networks/CNN.py b/networks/CNN.py new file mode 100644 index 0000000..0695e5e --- /dev/null +++ b/networks/CNN.py @@ -0,0 +1,48 @@ +import torch.nn as nn +from torchvision import transforms, models +from torch.autograd import Variable + +""" +pre-trained ResNet +""" + +class ResNet(nn.Module): + """ + Args: + fea_type: string, resnet101 or resnet 152 + """ + + def __init__(self, fea_type = 'resnet152'): + super(ResNet, self).__init__() + self.fea_type = fea_type + # rescale and normalize transformation + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + if fea_type == 'resnet101': + resnet = models.resnet101(pretrained=True) # dim of pool5 is 2048 + elif fea_type == 'resnet152': + resnet = models.resnet152(pretrained=True) + else: + raise Exception('No such ResNet!') + + resnet.float() + resnet.cuda() + resnet.eval() + + module_list = list(resnet.children()) + self.conv5 = nn.Sequential(*module_list[: -2]) + self.pool5 = module_list[-2] + + # rescale and normalize image, then pass it through ResNet + def forward(self, x): + x = self.transform(x) + x = x.unsqueeze(0) # reshape the single image s.t. 
it has a batch dim
+        x = Variable(x).cuda()
+        res_conv5 = self.conv5(x)
+        res_pool5 = self.pool5(res_conv5)
+        res_pool5 = res_pool5.view(res_pool5.size(0), -1)
+
+        return res_pool5
\ No newline at end of file
diff --git a/models.py b/networks/DSN.py
similarity index 82%
rename from models.py
rename to networks/DSN.py
index 532e8be..9b02433 100644
--- a/models.py
+++ b/networks/DSN.py
@@ -1,21 +1,25 @@
-import torch
+import torch as T
 import torch.nn as nn
 from torch.nn import functional as F
 
 __all__ = ['DSN']
 
 class DSN(nn.Module):
-    """Deep Summarization Network"""
+    """ Deep Summarization Network """
+
     def __init__(self, in_dim=1024, hid_dim=256, num_layers=1, cell='lstm'):
         super(DSN, self).__init__()
         assert cell in ['lstm', 'gru'], "cell must be either 'lstm' or 'gru'"
+
         if cell == 'lstm':
             self.rnn = nn.LSTM(in_dim, hid_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
-        else:
+        elif cell == 'gru':
             self.rnn = nn.GRU(in_dim, hid_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
+
         self.fc = nn.Linear(hid_dim*2, 1)
 
     def forward(self, x):
         h, _ = self.rnn(x)
-        p = F.sigmoid(self.fc(h))
+        p = T.sigmoid(self.fc(h))
+
         return p
\ No newline at end of file
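A quick smoke test for the renamed `networks/DSN.py` (a sketch, not part of the PR); the input/output shapes follow the comments in `video_summarization.py`.

```python
# Sketch: forward a random feature sequence through DSN on CPU.
import torch
from networks.DSN import DSN

model = DSN(in_dim=1024, hid_dim=256, num_layers=1, cell='lstm')
seq = torch.randn(1, 50, 1024)  # (batch, seq_len, dim): e.g. 50 subsampled frames
probs = model(seq)              # per-frame importance scores in (0, 1)
print(probs.shape)              # torch.Size([1, 50, 1])
```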
diff --git a/rewards.py b/networks/RL.py
similarity index 56%
rename from rewards.py
rename to networks/RL.py
index 18118ac..a3c5cb3 100644
--- a/rewards.py
+++ b/networks/RL.py
@@ -1,9 +1,8 @@
 import torch
-import sys
 
 def compute_reward(seq, actions, ignore_far_sim=True, temp_dist_thre=20, use_gpu=False):
     """
-    Compute diversity reward and representativeness reward
+    Compute Diversity reward and Representativeness reward
 
     Args:
         seq: sequence of features, shape (1, seq_len, dim)
@@ -11,46 +10,58 @@ def compute_reward(seq, actions, ignore_far_sim=True, temp_dist_thre=20, use_gpu
         ignore_far_sim (bool): whether to ignore temporally distant similarity (default: True)
         temp_dist_thre (int): threshold for ignoring temporally distant similarity (default: 20)
         use_gpu (bool): whether to use GPU
+
     """
+
     _seq = seq.detach()
     _actions = actions.detach()
-    pick_idxs = _actions.squeeze().nonzero().squeeze()
-    num_picks = len(pick_idxs) if pick_idxs.ndimension() > 0 else 1
-
+
+    # get selected frame indices
+    pick_indices = _actions.squeeze().nonzero().squeeze()
+    num_picks = len(pick_indices) if pick_indices.ndimension() > 0 else 1
+
     if num_picks == 0: # give zero reward is no frames are selected
         reward = torch.tensor(0.)
         if use_gpu: reward = reward.cuda()
         return reward
 
     _seq = _seq.squeeze()
     n = _seq.size(0)
 
     # compute diversity reward
+    # Rdiv = 1 / (|Y| * (|Y| - 1)) * sum over t, t' in Y of d(x_t, x_t')
+    # d(x_t, x_t') = 1 - x_t^T x_t' / (||x_t|| * ||x_t'||)
     if num_picks == 1:
         reward_div = torch.tensor(0.)
         if use_gpu: reward_div = reward_div.cuda()
+
     else:
         normed_seq = _seq / _seq.norm(p=2, dim=1, keepdim=True)
-        dissim_mat = 1. - torch.matmul(normed_seq, normed_seq.t()) # dissimilarity matrix [Eq.4]
-        dissim_submat = dissim_mat[pick_idxs,:][:,pick_idxs]
+        dissim_mat = 1. - torch.matmul(normed_seq, normed_seq.t()) # dissimilarity matrix [Eq.4]
+
+        # Y: indices of the selected frames (pick_indices)
+        dissim_submat = dissim_mat[pick_indices, :][:, pick_indices]
+
         if ignore_far_sim:
             # ignore temporally distant similarity
-            pick_mat = pick_idxs.expand(num_picks, num_picks)
+            pick_mat = pick_indices.expand(num_picks, num_picks)
             temp_dist_mat = torch.abs(pick_mat - pick_mat.t())
             dissim_submat[temp_dist_mat > temp_dist_thre] = 1.
-        reward_div = dissim_submat.sum() / (num_picks * (num_picks - 1.)) # diversity reward [Eq.3]
+
+        reward_div = dissim_submat.sum() / (num_picks * (num_picks - 1.)) # diversity reward [Eq.3]
 
     # compute representativeness reward
     dist_mat = torch.pow(_seq, 2).sum(dim=1, keepdim=True).expand(n, n)
     dist_mat = dist_mat + dist_mat.t()
     dist_mat.addmm_(1, -2, _seq, _seq.t())
-    dist_mat = dist_mat[:,pick_idxs]
+
+    dist_mat = dist_mat[:, pick_indices]
     dist_mat = dist_mat.min(1, keepdim=True)[0]
-    #reward_rep = torch.exp(torch.FloatTensor([-dist_mat.mean()]))[0] # representativeness reward [Eq.5]
-    reward_rep = torch.exp(-dist_mat.mean())
-    # combine the two rewards
+    reward_rep = torch.exp(-dist_mat.mean()) # representativeness reward [Eq.5]
+
+    # combine the two rewards
     reward = (reward_div + reward_rep) * 0.5
 
-    return reward
+    return reward
\ No newline at end of file
diff --git a/networks/__init__.py b/networks/__init__.py
new file mode 100644
index 0000000..2eea161
--- /dev/null
+++ b/networks/__init__.py
@@ -0,0 +1,3 @@
+from CNN import *
+from DSN import *
+from RL import *
\ No newline at end of file
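To tie the two modules together, a condensed sketch of the policy-gradient step that `video_summarization.py` (later in this PR) performs with `compute_reward`; the moving-average baseline and multi-episode averaging are omitted here for brevity.

```python
# Sketch: one simplified REINFORCE update (no baseline, single episode).
import torch
from torch.distributions import Bernoulli
from networks.DSN import DSN
from networks.RL import compute_reward

model = DSN()
seq = torch.randn(1, 50, 1024)           # stand-in for the h5 'features' array
probs = model(seq)

m = Bernoulli(probs)
actions = m.sample()                      # binary keep/skip decision per frame
reward = compute_reward(seq, actions)     # diversity + representativeness
loss = -(m.log_prob(actions).mean() * reward)
loss.backward()                           # gradients for an optimizer step
```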
diff --git a/parse_json.py b/parse_json.py
index 8f4e4b2..68f67c2 100644
--- a/parse_json.py
+++ b/parse_json.py
@@ -1,27 +1,25 @@
 import os
 import argparse
-import re
-import os.path as osp
 import matplotlib
-matplotlib.use('Agg')
+matplotlib.use("Agg")
 from matplotlib import pyplot as plt
-from utils import read_json
+from utils.file_process import read_json
 
 """
-Parse json file (.json) to extract rewards for specific videos.
-
-How to use:
-# image will be saved in path: blah_blah_blah
-$ python parse_json.py -p blah_blah_blah/rewards.json -i 0
+    Parse json file (.json) to extract rewards for specific videos.
+
+    How to use:
+    # image will be saved in the same directory as the json file
+    $ python parse_json.py -p log/summe-split0/rewards.json -i 0
 """
 
 parser = argparse.ArgumentParser()
-parser.add_argument('-p', '--path', type=str, required=True, help="path to rewards.json; output saved to the same dir")
-parser.add_argument('-i', '--idx', type=int, default=0, help="choose which video to visualize, index starts from 0 (default: 0)")
+parser.add_argument("-p", "--path", type=str, required=True, help="path to rewards.json; output saved to the same dir")
+parser.add_argument("-i", "--idx", type=int, default=0, help="choose which video to visualize, index starts from 0 (default: 0)")
 args = parser.parse_args()
 
 reward_writers = read_json(args.path)
-keys = reward_writers.keys()
+keys = sorted(reward_writers.keys()) # sort for a deterministic index -> video mapping
 assert args.idx < len(keys)
 key = keys[args.idx]
 rewards = reward_writers[key]
@@ -30,5 +28,5 @@ plt.xlabel('epoch')
 plt.ylabel('reward')
 
 plt.title("{}".format(key))
-plt.savefig(osp.join(osp.dirname(args.path), 'epoch_reward_' + str(args.idx) + '.png'))
+plt.savefig(os.path.join(os.path.dirname(args.path), 'epoch_reward_' + str(args.idx) + '.png'))
 plt.close()
\ No newline at end of file
diff --git a/parse_json.sh b/parse_json.sh
deleted file mode 100644
index c8002a4..0000000
--- a/parse_json.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!bin/sh
-
-# Note: index starts from 0, so if you wanna visualize all videos,
-# say there are 10 videos, NUM should be 9
-NUM=39;
-
-for i in $(seq 0 $NUM);
-do
-    echo "do: parse_json.py -p path_to/rewards.json -i $i"
-    python parse_json.py -p log/rewards.json -i $i
-done
\ No newline at end of file
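`parse_json.sh` is dropped above; if you still want to batch-plot every video's reward curve, a Python equivalent (a sketch, assuming `rewards.json` lives under `log/summe-split0`) could be:

```python
# Sketch: replacement for the deleted parse_json.sh loop.
import subprocess
from utils.file_process import read_json

path = 'log/summe-split0/rewards.json'
for idx in range(len(read_json(path))):   # one entry per training video
    subprocess.call(['python', 'parse_json.py', '-p', path, '-i', str(idx)])
```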
diff --git a/parse_log.py b/parse_log.py
index 9c6baee..c04462c 100644
--- a/parse_log.py
+++ b/parse_log.py
@@ -1,28 +1,27 @@
 import os
 import argparse
 import re
-import os.path as osp
 import matplotlib
-matplotlib.use('Agg')
+matplotlib.use("Agg")
 from matplotlib import pyplot as plt
 
 """
-Parse log file (.txt) to extract rewards.
-
-How to use:
-# image will be saved in path: blah_blah_blah
-$ python parse_log.py -p blah_blah_blah/log_train.txt
+    Parse log file (.txt) to extract rewards.
+
+    How to use:
+    # image will be saved in the same directory as the log file
+    $ python parse_log.py -p log/summe-split0/log_train.txt
 """
 
 parser = argparse.ArgumentParser()
-parser.add_argument('-p', '--path', type=str, required=True, help="path to log.txt; output saved to the same dir")
+parser.add_argument("-p", "--path", type=str, required=True, help="path to log.txt; output saved to the same dir")
 args = parser.parse_args()
 
-if not osp.exists(args.path):
-    raise ValueError("Given path is invalid: {}".format(args.path))
+if not os.path.exists(args.path):
+    raise ValueError("Given path is invalid: {}".format(args.path))
 
-if osp.splitext(osp.basename(args.path))[-1] != '.txt':
-    raise ValueError("File found does not end with .txt: {}".format(args.path))
+if os.path.splitext(os.path.basename(args.path))[-1] != '.txt':
+    raise ValueError("File found does not end with .txt: {}".format(args.path))
 
 regex_reward = re.compile('reward ([\.\deE+-]+)')
 rewards = []
@@ -36,8 +35,8 @@ rewards.append(reward)
 
 plt.plot(rewards)
-plt.xlabel('epoch')
-plt.ylabel('reward')
+plt.xlabel("epoch")
+plt.ylabel("reward")
 plt.title("Overall rewards")
-plt.savefig(osp.join(osp.dirname(args.path), 'overall_reward.png'))
-plt.close()
+plt.savefig(os.path.join(os.path.dirname(args.path), 'overall_reward.png'))
+plt.close()
\ No newline at end of file
diff --git a/summary2video.py b/summary2video.py
index 21ba48f..da091a3 100644
--- a/summary2video.py
+++ b/summary2video.py
@@ -1,8 +1,6 @@
 import h5py
 import cv2
 import os
-import os.path as osp
-import numpy as np
 import argparse
 
 parser = argparse.ArgumentParser()
@@ -22,16 +20,16 @@ def frm2video(frm_dir, summary, vid_writer):
             # here frame name starts with '000001.jpg'
             # change according to your need
             frm_name = str(idx+1).zfill(6) + '.jpg'
-            frm_path = osp.join(frm_dir, frm_name)
+            frm_path = os.path.join(frm_dir, frm_name)
             frm = cv2.imread(frm_path)
             frm = cv2.resize(frm, (args.width, args.height))
             vid_writer.write(frm)
 
 if __name__ == '__main__':
-    if not osp.exists(args.save_dir):
+    if not os.path.exists(args.save_dir):
         os.mkdir(args.save_dir)
     vid_writer = cv2.VideoWriter(
-        osp.join(args.save_dir, args.save_name),
+        os.path.join(args.save_dir, args.save_name),
         cv2.VideoWriter_fourcc(*'MP4V'),
         args.fps,
         (args.width, args.height),
diff --git a/utils/KTS/README.txt b/utils/KTS/README.txt
new file mode 100644
index 0000000..38a2d39
--- /dev/null
+++ b/utils/KTS/README.txt
@@ -0,0 +1,10 @@
+Kernel temporal segmentation
+============================
+
+This archive contains the following files:
+cpd_nonlin.py - kernel temporal segmentation with fixed number of segments
+cpd_auto.py - kernel temporal segmentation with autocalibration
+demo.py - demo on synthetic examples
+
+Dependencies:
+python + libraries: numpy, scipy, matplotlib (for demo)
diff --git a/utils/KTS/__init__.py b/utils/KTS/__init__.py
new file mode 100644
index 0000000..fd84190
--- /dev/null
+++ b/utils/KTS/__init__.py
@@ -0,0 +1 @@
+from cpd_auto import *
\ No newline at end of file
diff --git a/utils/KTS/cpd_auto.py b/utils/KTS/cpd_auto.py
new file mode 100644
index 0000000..d551936
--- /dev/null
+++ b/utils/KTS/cpd_auto.py
@@ -0,0 +1,86 @@
+import numpy as np
+from cpd_nonlin import cpd_nonlin
+
+def cpd_auto(K, ncp, vmax, desc_rate=1, **kwargs):
+    """Main interface
+
+    Detect change points automatically selecting their number
+        K    - kernel between each pair of frames in video
+        ncp  - maximum number of change points
+        vmax - special parameter
+    Optional arguments:
+        lmin - minimum segment length
+        lmax - maximum segment length
+        desc_rate - rate
of descriptor sampling (vmax always corresponds to 1x) + + Note: + - cps are always calculated in subsampled coordinates irrespective to + desc_rate + - lmin and m should be in agreement + --- + Returns: (cps, costs) + cps - best selected change-points + costs - costs for 0,1,2,...,m change-points + + Memory requirement: ~ (3*N*N + N*ncp)*4 bytes ~= 16 * N^2 bytes + That is 1,6 Gb for the N=10000. + """ + m = ncp + (_, scores) = cpd_nonlin(K, m, backtrack=False, **kwargs) + + N = K.shape[0] + N2 = N*desc_rate # length of the video before subsampling + + penalties = np.zeros(m+1) + # Prevent division by zero (in case of 0 changes) + ncp = np.arange(1, m+1) + penalties[1:] = (vmax*ncp/(2.0*N2))*(np.log(float(N2)/ncp)+1) + + costs = scores/float(N) + penalties + m_best = np.argmin(costs) + (cps, scores2) = cpd_nonlin(K, m_best, **kwargs) + + return (cps, costs) + + +# ------------------------------------------------------------------------------ +# Extra functions (currently not used) + +def estimate_vmax(K_stable): + """K_stable - kernel between all frames of a stable segment""" + n = K_stable.shape[0] + vmax = np.trace(centering(K_stable)/n) + return vmax + + +def centering(K): + """Apply kernel centering""" + mean_rows = np.mean(K, 1)[:, np.newaxis] + return K - mean_rows - mean_rows.T + np.mean(mean_rows) + + +def eval_score(K, cps): + """ Evaluate unnormalized empirical score + (sum of kernelized scatters) for the given change-points """ + N = K.shape[0] + cps = [0] + list(cps) + [N] + V1 = 0 + V2 = 0 + for i in range(len(cps)-1): + K_sub = K[cps[i]:cps[i+1], :][:, cps[i]:cps[i+1]] + V1 += np.sum(np.diag(K_sub)) + V2 += np.sum(K_sub) / float(cps[i+1] - cps[i]) + return (V1 - V2) + + +def eval_cost(K, cps, score, vmax): + """ Evaluate cost function for automatic number of change points selection + K - kernel between all frames + cps - selected change-points + score - unnormalized empirical score (sum of kernelized scatters) + vmax - vmax parameter""" + + N = K.shape[0] + penalty = (vmax*len(cps)/(2.0*N))*(np.log(float(N)/len(cps))+1) + return score/float(N) + penalty + diff --git a/utils/KTS/cpd_nonlin.py b/utils/KTS/cpd_nonlin.py new file mode 100644 index 0000000..115eafe --- /dev/null +++ b/utils/KTS/cpd_nonlin.py @@ -0,0 +1,108 @@ +import numpy as np + +import weave + +def calc_scatters(K): + """ + Calculate scatter matrix: + scatters[i,j] = {scatter of the sequence with starting frame i and ending frame j} + """ + n = K.shape[0] + K1 = np.cumsum([0] + list(np.diag(K))) + K2 = np.zeros((n+1, n+1)) + K2[1:, 1:] = np.cumsum(np.cumsum(K, 0), 1); # TODO: use the fact that K - symmetric + + scatters = np.zeros((n, n)); + + code = r""" + for (int i = 0; i < n; i++) { + for (int j = i; j < n; j++) { + scatters(i,j) = K1(j+1)-K1(i) - (K2(j+1,j+1)+K2(i,i)-K2(j+1,i)-K2(i,j+1))/(j-i+1); + } + } + """ + weave.inline(code, ['K1','K2','scatters','n'], global_dict = \ + {'K1':K1, 'K2':K2, 'scatters':scatters, 'n':n}, type_converters=weave.converters.blitz) + + return scatters + +def cpd_nonlin(K, ncp, lmin=1, lmax=100000, backtrack=True, verbose=True, + out_scatters=None): + """ Change point detection with dynamic programming + K - square kernel matrix + ncp - number of change points to detect (ncp >= 0) + lmin - minimal length of a segment + lmax - maximal length of a segment + backtrack - when False - only evaluate objective scores (to save memory) + + Returns: (cps, obj) + cps - detected array of change points: mean is thought to be constant on [ cps[i], cps[i+1] ) + obj_vals - values of the 
objective function for 0..m changepoints
+
+    """
+    m = int(ncp)  # prevent numpy.int64
+
+    (n, n1) = K.shape
+    assert(n == n1), "Kernel matrix awaited."
+
+    assert(n >= (m + 1)*lmin)
+    assert(n <= (m + 1)*lmax)
+    assert(lmax >= lmin >= 1)
+
+    if verbose:
+        #print "n =", n
+        print ("Precomputing scatters...")
+    J = calc_scatters(K)
+
+    if out_scatters != None:
+        out_scatters[0] = J
+
+    if verbose:
+        print ("Inferring best change points...")
+    # I[k, l] - value of the objective for k change-points and l first frames
+    I = 1e101*np.ones((m+1, n+1))
+    I[0, lmin:lmax] = J[0, lmin-1:lmax-1]
+
+    if backtrack:
+        # p[k, l] --- "previous change" --- best t[k] when t[k+1] equals l
+        p = np.zeros((m+1, n+1), dtype=int)
+    else:
+        p = np.zeros((1,1), dtype=int)
+
+    code = r"""
+    #define max(x,y) ((x)>(y)?(x):(y))
+    for (int k=1; k<m+1; k++) {
+        for (int l=(k+1)*lmin; l<n+1; l++) {
+            I(k, l) = 1e100; // nearly infinity
+            for (int t=max(k*lmin, l-lmax); t<l-lmin+1; t++) {
+                double c = I(k-1, t) + J(t, l-1);
+                if (c < I(k, l)) {
+                    I(k, l) = c;
+                    if (backtrack == 1)
+                        p(k, l) = t;
+                }
+            }
+        }
+    }
+    """
+    weave.inline(code, ['m', 'n', 'p', 'I', 'J', 'lmin', 'lmax', 'backtrack'],
+        global_dict={'m':m, 'n':n, 'p':p, 'I':I, 'J':J,
+            'lmin':lmin, 'lmax':lmax, 'backtrack':int(backtrack)},
+        type_converters=weave.converters.blitz)
+
+    # Collect change points by backtracking through p
+    cps = np.zeros(m, dtype=int)
+    if backtrack:
+        cur = n
+        for k in range(m, 0, -1):
+            cps[k-1] = p[k, cur]
+            cur = cps[k-1]
+
+    scores = I[:, n].copy()
+    scores[scores > 1e99] = np.inf
+    return cps, scores
+
+
diff --git a/utils/KTS/demo.py b/utils/KTS/demo.py
new file mode 100644
index 0000000..c6f023a
--- /dev/null
+++ b/utils/KTS/demo.py
@@ -0,0 +1,80 @@
+import numpy as np
+from cpd_nonlin import cpd_nonlin
+from cpd_auto import cpd_auto
+
+def gen_data(n, m, d=1):
+    """Generates data with change points
+    n - number of samples
+    m - number of change-points
+    WARN: sigma is proportional to m
+    Returns:
+        X - data array (n X d)
+        cps - change-points array, including 0 and n"""
+    np.random.seed(1)
+    # Select changes at some distance from the boundaries
+    cps = np.random.permutation((n*3/4)-1)[0:m] + 1 + n/8
+    cps = np.sort(cps)
+    cps = [0] + list(cps) + [n]
+    mus = np.random.rand(m+1, d)*(m/2)  # make sigma = m/2
+    X = np.zeros((n, d))
+    for k in range(m+1):
+        X[cps[k]:cps[k+1], :] = mus[k, :][np.newaxis, :] + np.random.rand(cps[k+1]-cps[k], d)
+    return (X, np.array(cps))
+
+
+if __name__ == "__main__":
+    from matplotlib import pyplot as plt
+    plt.ioff()
+
+    print ("Test 1: 1-dimensional signal")
+    plt.figure("Test 1: 1-dimensional signal")
+    n = 1000
+    m = 10
+    (X, cps_gt) = gen_data(n, m)
+    print ("Ground truth:", cps_gt)
+    plt.plot(X)
+    K = np.dot(X, X.T)
+    cps, scores = cpd_nonlin(K, m, lmin=1, lmax=10000)
+    print ("Estimated:", cps)
+    mi = np.min(X)
+    ma = np.max(X)
+    for cp in cps:
+        plt.plot([cp, cp], [mi, ma], 'r')
+    plt.show()
+    print ("="*79)
+
+
+    print ("Test 2: multidimensional signal")
+    plt.figure("Test 2: multidimensional signal")
+    n = 1000
+    m = 20
+    (X, cps_gt) = gen_data(n, m, d=50)
+    print ("Ground truth:", cps_gt)
+    plt.plot(X)
+    K = np.dot(X, X.T)
+    cps, scores = cpd_nonlin(K, m, lmin=1, lmax=10000)
+    print ("Estimated:", cps)
+    mi = np.min(X)
+    ma = np.max(X)
+    for cp in cps:
+        plt.plot([cp, cp], [mi, ma], 'r')
+    plt.show()
+    print ("="*79)
+
+
+    print ("Test 3: automatic selection of the number of change-points")
+    plt.figure("Test 3: automatic selection of the number of change-points")
+    (X, cps_gt) = gen_data(n, m)
+    print ("Ground truth: (m=%d)" % m, cps_gt)
+    plt.plot(X)
+    K = np.dot(X, X.T)
+    cps, scores = cpd_auto(K, 2*m, 1)
+    print ("Estimated: (m=%d)" % len(cps), cps)
+    mi = np.min(X)
+    ma = np.max(X)
+    for cp in cps:
+        plt.plot([cp, cp], [mi, ma], 'r')
+    plt.show()
+    print ("="*79)
+
+
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..457e1b5
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,3 @@
+from file_process import *
+from knapsack import *
+from vsum_tool import *
\ No newline at end of file
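The KTS entry point above is consumed by `utils/generate_dataset.py` later in this PR; a compact sketch of that pipeline, with random features standing in for real ResNet pool5 vectors:

```python
# Sketch: automatic shot boundaries from frame features, following
# Generate_Dataset._get_change_points below (vmax=1, m = ceil(duration_sec / 2)).
import math
import numpy as np
from utils.KTS.cpd_auto import cpd_auto

n_frames, fps = 300, 25.0
video_feat = np.random.rand(n_frames, 2048)   # one feature vector per frame
K = np.dot(video_feat, video_feat.T)          # linear kernel between frame pairs
m = int(math.ceil((n_frames / fps) / 2.0))    # maximum number of change points
cps, costs = cpd_auto(K, m, 1)
print(cps)                                     # estimated segment boundaries
```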
diff --git a/utils.py b/utils/file_process.py
old mode 100755
new mode 100644
similarity index 73%
rename from utils.py
rename to utils/file_process.py
index c4da3c7..ccd2d73
--- a/utils.py
+++ b/utils/file_process.py
@@ -1,26 +1,26 @@
-from __future__ import absolute_import
-import os
-import sys
-import errno
-import shutil
+import sys, os
 import json
-import os.path as osp
-
 import torch
 
-def mkdir_if_missing(directory):
-    if not osp.exists(directory):
-        try:
-            os.makedirs(directory)
-        except OSError as e:
-            if e.errno != errno.EEXIST:
-                raise
+def write_json(splits, save_path):
+    save_dir = os.path.dirname(save_path)
+    if save_dir and not os.path.exists(save_dir):
+        os.makedirs(save_dir)  # makedirs handles nested dirs such as 'log/summe-split0'
+
+    with open(save_path, 'w') as f:
+        json.dump(splits, f, indent=4, separators=(', ', ': '))
+
+def read_json(fpath):
+    with open(fpath, 'r') as f:
+        obj = json.load(f)
+    return obj
 
 class AverageMeter(object):
     """Computes and stores the average and current value.
-    
+
     Code imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
     """
+
     def __init__(self):
         self.reset()
@@ -36,20 +36,29 @@ def update(self, val, n=1):
         self.count += n
         self.avg = self.sum / self.count
 
+
 def save_checkpoint(state, fpath='checkpoint.pth.tar'):
-    mkdir_if_missing(osp.dirname(fpath))
+    save_dir = os.path.dirname(fpath)
+    if save_dir and not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
     torch.save(state, fpath)
 
+
 class Logger(object):
     """
     Write console output to external text file.
     Code imported from https://github.com/Cysu/open-reid/blob/master/reid/utils/logging.py.
     """
+
    def __init__(self, fpath=None):
         self.console = sys.stdout
         self.file = None
         if fpath is not None:
-            mkdir_if_missing(os.path.dirname(fpath))
+            save_dir = os.path.dirname(fpath)
+            if save_dir and not os.path.exists(save_dir):
+                os.makedirs(save_dir)
+
             self.file = open(fpath, 'w')
 
     def __del__(self):
@@ -77,18 +86,3 @@ def close(self):
         if self.file is not None:
             self.file.close()
 
-def read_json(fpath):
-    with open(fpath, 'r') as f:
-        obj = json.load(f)
-    return obj
-
-def write_json(obj, fpath):
-    mkdir_if_missing(osp.dirname(fpath))
-    with open(fpath, 'w') as f:
-        json.dump(obj, f, indent=4, separators=(',', ': '))
-
-
-
-
-
-
diff --git a/utils/generate_dataset.py b/utils/generate_dataset.py
new file mode 100644
index 0000000..4e3c4de
--- /dev/null
+++ b/utils/generate_dataset.py
@@ -0,0 +1,147 @@
+"""
+    Generate Dataset
+
+    1. Converting video to frames
+    2. Extracting features
+    3. Getting change points
+    4.
User Summary ( for evaluation ) + +""" +import os, sys +sys.path.append('../') +from networks.CNN import ResNet +from utils.KTS.cpd_auto import cpd_auto +from tqdm import tqdm +import math +import cv2 +import numpy as np +import h5py + +class Generate_Dataset: + def __init__(self, video_path, save_path): + self.resnet = ResNet() + self.dataset = {} + self.video_list = [] + self.video_path = '' + self.frame_root_path = './frames' + self.h5_file = h5py.File(save_path, 'w') + + self._set_video_list(video_path) + + def _set_video_list(self, video_path): + if os.path.isdir(video_path): + self.video_path = video_path + self.video_list = os.listdir(video_path) + self.video_list.sort() + else: + self.video_path = '' + self.video_list.append(video_path) + + for idx, file_name in enumerate(self.video_list): + self.dataset['video_{}'.format(idx+1)] = {} + self.h5_file.create_group('video_{}'.format(idx+1)) + + + def _extract_feature(self, frame): + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = cv2.resize(frame, (224, 224)) + res_pool5 = self.resnet(frame) + frame_feat = res_pool5.cpu().data.numpy().flatten() + + return frame_feat + + def _get_change_points(self, video_feat, n_frame, fps): + n = n_frame / fps + m = int(math.ceil(n/2.0)) + K = np.dot(video_feat, video_feat.T) + change_points, _ = cpd_auto(K, m, 1) + change_points = np.concatenate(([0], change_points, [n_frame-1])) + + temp_change_points = [] + for idx in range(len(change_points)-1): + segment = [change_points[idx], change_points[idx+1]-1] + if idx == len(change_points)-2: + segment = [change_points[idx], change_points[idx+1]] + + temp_change_points.append(segment) + change_points = np.array(list(temp_change_points)) + + temp_n_frame_per_seg = [] + for change_points_idx in range(len(change_points)): + n_frame = change_points[change_points_idx][1] - change_points[change_points_idx][0] + temp_n_frame_per_seg.append(n_frame) + n_frame_per_seg = np.array(list(temp_n_frame_per_seg)) + + return change_points, n_frame_per_seg + + # TODO : save dataset + def _save_dataset(self): + pass + + def generate_dataset(self): + for video_idx, video_filename in enumerate(tqdm(self.video_list)): + video_path = video_filename + if os.path.isdir(self.video_path): + video_path = os.path.join(self.video_path, video_filename) + + video_basename = os.path.basename(video_path).split('.')[0] + + if not os.path.exists(os.path.join(self.frame_root_path, video_basename)): + os.mkdir(os.path.join(self.frame_root_path, video_basename)) + + video_capture = cv2.VideoCapture(video_path) + + fps = video_capture.get(cv2.CAP_PROP_FPS) + n_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT)) + + frame_list = [] + picks = [] + video_feat = None + video_feat_for_train = None + for frame_idx in tqdm(range(n_frames-1)): + success, frame = video_capture.read() + if success: + frame_feat = self._extract_feature(frame) + + if frame_idx % 15 == 0: + picks.append(frame_idx) + + if video_feat_for_train is None: + video_feat_for_train = frame_feat + else: + video_feat_for_train = np.vstack((video_feat_for_train, frame_feat)) + + if video_feat is None: + video_feat = frame_feat + else: + video_feat = np.vstack((video_feat, frame_feat)) + + img_filename = "{}.jpg".format(str(frame_idx).zfill(5)) + cv2.imwrite(os.path.join(self.frame_root_path, video_basename, img_filename), frame) + + else: + break + + video_capture.release() + + change_points, n_frame_per_seg = self._get_change_points(video_feat, n_frames, fps) + + # self.dataset['video_{}'.format(video_idx+1)]['frames'] 
= list(frame_list) + # self.dataset['video_{}'.format(video_idx+1)]['features'] = list(video_feat) + # self.dataset['video_{}'.format(video_idx+1)]['picks'] = np.array(list(picks)) + # self.dataset['video_{}'.format(video_idx+1)]['n_frames'] = n_frames + # self.dataset['video_{}'.format(video_idx+1)]['fps'] = fps + # self.dataset['video_{}'.format(video_idx+1)]['change_points'] = change_points + # self.dataset['video_{}'.format(video_idx+1)]['n_frame_per_seg'] = n_frame_per_seg + + self.h5_file['video_{}'.format(video_idx+1)]['features'] = list(video_feat_for_train) + self.h5_file['video_{}'.format(video_idx+1)]['picks'] = np.array(list(picks)) + self.h5_file['video_{}'.format(video_idx+1)]['n_frames'] = n_frames + self.h5_file['video_{}'.format(video_idx+1)]['fps'] = fps + self.h5_file['video_{}'.format(video_idx+1)]['change_points'] = change_points + self.h5_file['video_{}'.format(video_idx+1)]['n_frame_per_seg'] = n_frame_per_seg + +if __name__ == "__main__": + gen = Generate_Dataset('/data/video_summarization/dataset_SumMe/videos/Air_Force_One.mp4', 'summe_dataset.h5') + gen.generate_dataset() + gen.h5_file.close() \ No newline at end of file diff --git a/knapsack.py b/utils/knapsack.py old mode 100755 new mode 100644 similarity index 93% rename from knapsack.py rename to utils/knapsack.py index 633d6ce..a856ae9 --- a/knapsack.py +++ b/utils/knapsack.py @@ -28,8 +28,8 @@ def knapsack_dp(values,weights,n_items,capacity,return_all=False): table = np.zeros((n_items+1,capacity+1),dtype=np.float32) keep = np.zeros((n_items+1,capacity+1),dtype=np.float32) - for i in xrange(1,n_items+1): - for w in xrange(0,capacity+1): + for i in range(1,n_items+1): + for w in range(0,capacity+1): wi = weights[i-1] # weight of current item vi = values[i-1] # value of current item if (wi <= w) and (vi + table[i-1,w-wi] > table[i-1,w]): @@ -41,7 +41,7 @@ def knapsack_dp(values,weights,n_items,capacity,return_all=False): picks = [] K = capacity - for i in xrange(n_items,0,-1): + for i in range(n_items,0,-1): if keep[i,K] == 1: picks.append(i) K -= weights[i-1] @@ -54,7 +54,7 @@ def knapsack_dp(values,weights,n_items,capacity,return_all=False): return picks,max_val return picks -def check_inputs(values,weights,n_items,capacity): +def check_inputs(values, weights, n_items, capacity): # check variable type assert(isinstance(values,list)) assert(isinstance(weights,list)) @@ -74,4 +74,4 @@ def check_inputs(values,weights,n_items,capacity): n_items = 3 capacity = 3 picks = knapsack_dp(values,weights,n_items,capacity) - print picks + print (picks) diff --git a/vsum_tools.py b/utils/vsum_tool.py similarity index 61% rename from vsum_tools.py rename to utils/vsum_tool.py index 797b0bf..981bb5f 100644 --- a/vsum_tools.py +++ b/utils/vsum_tool.py @@ -1,36 +1,45 @@ import numpy as np -from knapsack import knapsack_dp +from utils.knapsack import knapsack_dp import math def generate_summary(ypred, cps, n_frames, nfps, positions, proportion=0.15, method='knapsack'): - """Generate keyshot-based video summary i.e. a binary vector. + """ + Generate keyshot-based video summary. i.e. a binary vector + Args: - --------------------------------------------- - - ypred: predicted importance scores. - - cps: change points, 2D matrix, each row contains a segment. - - n_frames: original number of frames. - - nfps: number of frames per segment. - - positions: positions of subsampled frames in the original video. - - proportion: length of video summary (compared to original video length). 
- - method: defines how shots are selected, ['knapsack', 'rank']. + ypred: predicted importance scores. + cps: change points, 2D matrix, each row contains a segment. + n_frames: original number of frames. + nfps: number of frames per segment. + positions: positions of subsampled frames in the original video. + proportion: length of video summary (compared to original video length). + method: defines how shots are selected, ['knapsack', 'rank']. + """ + n_segs = cps.shape[0] + + # Frame Score frame_scores = np.zeros((n_frames), dtype=np.float32) if positions.dtype != int: positions = positions.astype(np.int32) + if positions[-1] != n_frames: positions = np.concatenate([positions, [n_frames]]) - for i in xrange(len(positions) - 1): - pos_left, pos_right = positions[i], positions[i+1] - if i == len(ypred): - frame_scores[pos_left:pos_right] = 0 + + for idx in range(len(positions) - 1): + pos_cur, pos_next = positions[idx], positions[idx+1] + + if idx == len(ypred): + frame_scores[pos_cur:pos_next] = 0 else: - frame_scores[pos_left:pos_right] = ypred[i] + frame_scores[pos_cur:pos_next] = ypred[idx] + # Segment Score seg_score = [] - for seg_idx in xrange(n_segs): - start, end = int(cps[seg_idx,0]), int(cps[seg_idx,1]+1) - scores = frame_scores[start:end] + for seg_idx in range(n_segs): + pos_start, pos_end = int(cps[seg_idx, 0]), int(cps[seg_idx, 1]+1) + scores = frame_scores[pos_start: pos_end] seg_score.append(float(scores.mean())) limits = int(math.floor(n_frames * proportion)) @@ -41,37 +50,43 @@ def generate_summary(ypred, cps, n_frames, nfps, positions, proportion=0.15, met order = np.argsort(seg_score)[::-1].tolist() picks = [] total_len = 0 - for i in order: - if total_len + nfps[i] < limits: - picks.append(i) - total_len += nfps[i] + + for idx in order: + if total_len + nfps[idx] < limits: + picks.append(idx) + total_len += nfps[idx] + else: raise KeyError("Unknown method {}".format(method)) summary = np.zeros((1), dtype=np.float32) # this element should be deleted - for seg_idx in xrange(n_segs): + for seg_idx in range(n_segs): nf = nfps[seg_idx] if seg_idx in picks: tmp = np.ones((nf), dtype=np.float32) else: tmp = np.zeros((nf), dtype=np.float32) + summary = np.concatenate((summary, tmp)) summary = np.delete(summary, 0) # delete the first element return summary def evaluate_summary(machine_summary, user_summary, eval_metric='avg'): - """Compare machine summary with user summary (keyshot-based). + """ + Compare machine summary with user summary (Keyshot-based). + Args: - -------------------------------- - machine_summary and user_summary should be binary vectors of ndarray type. - eval_metric = {'avg', 'max'} - 'avg' averages results of comparing multiple human summaries. - 'max' takes the maximum (best) out of multiple comparisons. + machine_summary: summary by machine + user_summary: summary by user(annotation) + eval_metric: {'avg', 'max'} + 'avg' : average results of comparing multiple human summaries. + 'max' : takes the maximum(best) out of multiple comparisons. 
""" + machine_summary = machine_summary.astype(np.float32) user_summary = user_summary.astype(np.float32) - n_users,n_frames = user_summary.shape + n_users, n_frames = user_summary.shape # binarization machine_summary[machine_summary > 0] = 1 @@ -87,8 +102,8 @@ def evaluate_summary(machine_summary, user_summary, eval_metric='avg'): prec_arr = [] rec_arr = [] - for user_idx in xrange(n_users): - gt_summary = user_summary[user_idx,:] + for user_idx in range(n_users): + gt_summary = user_summary[user_idx, :] overlap_duration = (machine_summary * gt_summary).sum() precision = overlap_duration / (machine_summary.sum() + 1e-8) recall = overlap_duration / (gt_summary.sum() + 1e-8) @@ -96,6 +111,7 @@ def evaluate_summary(machine_summary, user_summary, eval_metric='avg'): f_score = 0. else: f_score = (2 * precision * recall) / (precision + recall) + f_scores.append(f_score) prec_arr.append(precision) rec_arr.append(recall) @@ -104,10 +120,11 @@ def evaluate_summary(machine_summary, user_summary, eval_metric='avg'): final_f_score = np.mean(f_scores) final_prec = np.mean(prec_arr) final_rec = np.mean(rec_arr) + elif eval_metric == 'max': final_f_score = np.max(f_scores) max_idx = np.argmax(f_scores) final_prec = prec_arr[max_idx] final_rec = rec_arr[max_idx] - + return final_f_score, final_prec, final_rec \ No newline at end of file diff --git a/video_summarization.py b/video_summarization.py new file mode 100644 index 0000000..e962ef6 --- /dev/null +++ b/video_summarization.py @@ -0,0 +1,229 @@ +from __future__ import print_function +import os +import sys +import h5py +import time +import datetime +import numpy as np +from tabulate import tabulate + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.optim import lr_scheduler +from torch.distributions import Bernoulli + +from config import config +from utils.file_process import Logger, read_json, write_json, save_checkpoint +from networks.DSN import DSN +from networks.RL import compute_reward +from utils import vsum_tool + +torch.manual_seed(config.SEED) +os.environ["CUDA_VISIBLE_DEVCIES"] = config.GPU +use_gpu = torch.cuda.is_available() +if config.USE_CPU: use_gpu = False + +def main(): + if not config.EVALUATE: + sys.stdout = Logger(os.path.join(config.SAVE_DIR, 'log_train.txt')) + else: + sys.stdout = Logger(os.path.join(config.SAVE_DIR, 'log_test.txt')) + + + if use_gpu: + print("Currently using GPU {}".format(config.GPU)) + cudnn.benchmark = True + torch.cuda.manual_seed(config.SEED) + else: + print("Currently using CPU") + + print("Initialize dataset {}".format(config.DATASET)) + dataset = h5py.File(config.DATASET, 'r') + num_videos = len(dataset.keys()) + + splits = read_json(config.SPLIT) + + if not config.TEST: + assert config.SPLIT_ID < len(splits), "split_id (got {}) exceeds {}".format(config.SPLIT_ID, len(splits )) + split = splits[config.SPLIT_ID] + train_keys = split["train_keys"] + test_keys = split["test_keys"] + print("# total videos {}. # train videos {}. 
# test videos {}.".format(num_videos, len(train_keys), len(test_keys))) + + print("Initialize model") + model = DSN(in_dim=config.INPUT_DIM, hid_dim=config.HIDDEN_DIM, num_layers = config.NUM_LAYERS, cell=config.RNN_CELL) + print("Model Size: {:.5f}M".format(sum(p.numel() for p in model.parameters())/1000000.0)) + + optimizer = torch.optim.Adam(model.parameters(), lr=config.LR, weight_decay=config.WEIGHT_DECAY) + if config.STEP_SIZE > 0: + scheduler = lr_scheduler.StepLR(optimizer, step_size= config.STEP_SIZE, gamma=config.GAMMA) + + if config.RESUME: + print("Loading checkpoint from '{}'".format(config.RESUME)) + checkpoint = torch.load(config.RESUME) + model.load_state_dict(checkpoint) + else: + start_epoch = 0 + + if use_gpu: + model = nn.DataParallel(model).cuda() + + if config.TEST: + print("Test only") + test(model, dataset, ['video_1'], use_gpu) + return + + + # Evaluate + if config.EVALUATE: + print("Evaluate only") + evaluate(model, dataset, test_keys, use_gpu) + return + + # Train + print("===> Start training") + start_time = time.time() + model.train() + baselines = {key: 0. for key in train_keys} # baseline rewards for videos + reward_writers = {key: [] for key in train_keys} # record reward changes for each video + + for epoch in range(start_epoch, config.MAX_EPOCH): + indices = np.arange(len(train_keys)) + np.random.shuffle(indices) + + # Input each Video to Model + for idx in indices: + key = train_keys[idx] + seq = dataset[key]['features'][...] # sequence of features, (seq_len, dim) + seq = torch.from_numpy(seq).unsqueeze(0) # input shape (1, seq_len, dim) + + if use_gpu: seq = seq.cuda() + probs = model(seq) # output shape (1, seq_len, 1) + + cost = config.BETA * (probs.mean() - 0.5) ** 2 # minimize summary length penalty term [Eq.11] + m = Bernoulli(probs) + + epis_rewards = [] + for _ in range(config.NUM_EPISODE): + actions = m.sample() + log_probs = m.log_prob(actions) + reward = compute_reward(seq, actions, use_gpu=use_gpu) + + expected_reward = log_probs.mean() * (reward - baselines[key]) + cost -= expected_reward # minimize negative expected reward + epis_rewards.append(reward.item()) + + optimizer.zero_grad() + cost.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0) + optimizer.step() + + baselines[key] = 0.9 * baselines[key] + 0.1 * np.mean(epis_rewards) # update baseline reward via moving average + reward_writers[key].append(np.mean(epis_rewards)) + + epoch_reward = np.mean([reward_writers[key][epoch] for key in train_keys]) + print("epoch {}/{}\t reward {}\t".format(epoch+1, config.MAX_EPOCH, epoch_reward)) + + write_json(reward_writers, os.path.join(config.SAVE_DIR, 'rewards.json')) + evaluate(model, dataset, test_keys, use_gpu) + + elapsed = round(time.time() - start_time) + elapsed = str(datetime.timedelta(seconds=elapsed)) + print("Finished. 
Total elapsed time (h:m:s): {}".format(elapsed))
+
+    model_state_dict = model.module.state_dict() if use_gpu else model.state_dict()
+    model_save_path = os.path.join(config.SAVE_DIR, 'model_epoch' + str(config.MAX_EPOCH) + '.pth.tar')
+    save_checkpoint(model_state_dict, model_save_path)
+    print("Model saved to {}".format(model_save_path))
+
+    dataset.close()
+
+
+def evaluate(model, dataset, test_keys, use_gpu):
+    print("===> Evaluation")
+    with torch.no_grad():
+        model.eval()
+        fms = []
+        eval_metric = 'avg' if config.METRIC == 'tvsum' else 'max'
+
+        if config.VERBOSE: table = [["No.", "Video", "F-Score"]]
+
+        if config.SAVE_RESULTS:
+            h5_res = h5py.File(os.path.join(config.SAVE_DIR, 'result.h5'), 'w')
+
+        for key_idx, key in enumerate(test_keys):
+            seq = dataset[key]['features'][...]
+            seq = torch.from_numpy(seq).unsqueeze(0)
+
+            if use_gpu: seq = seq.cuda()
+            probs = model(seq)
+            probs = probs.data.cpu().squeeze().numpy()
+
+            cps = dataset[key]['change_points'][...]
+            num_frames = dataset[key]['n_frames'][()]
+            nfps = dataset[key]['n_frame_per_seg'][...].tolist()
+            positions = dataset[key]['picks'][...]
+            user_summary = dataset[key]['user_summary'][...]
+
+            machine_summary = vsum_tool.generate_summary(probs, cps, num_frames, nfps, positions)
+            fm, _, _ = vsum_tool.evaluate_summary(machine_summary, user_summary, eval_metric)
+            fms.append(fm)
+
+            if config.VERBOSE:
+                table.append([key_idx+1, key, "{:.1%}".format(fm)])
+
+            if config.SAVE_RESULTS:
+                h5_res.create_dataset(key + '/score', data=probs)
+                h5_res.create_dataset(key + '/machine_summary', data=machine_summary)
+                h5_res.create_dataset(key + '/gtscore', data=dataset[key]['gtscore'][...])
+                h5_res.create_dataset(key + '/fm', data=fm)
+
+        if config.VERBOSE:
+            print(tabulate(table))
+
+        if config.SAVE_RESULTS: h5_res.close()
+
+    mean_fm = np.mean(fms)
+    print("Average F-Score {:.1%}".format(mean_fm))
+
+    return mean_fm
+
+def test(model, dataset, test_data, use_gpu):
+    print("===> Test")
+    with torch.no_grad():
+        model.eval()
+
+        if config.SAVE_RESULTS:
+            h5_res = h5py.File(os.path.join(config.SAVE_DIR, 'result_test.h5'), 'w')
+
+        for key_idx, key in enumerate(test_data):
+            seq = dataset[key]['features'][...]
+            seq = torch.from_numpy(seq).unsqueeze(0)
+
+            if use_gpu: seq = seq.cuda()
+            probs = model(seq)
+            probs = probs.data.cpu().squeeze().numpy()
+
+            cps = dataset[key]['change_points'][...]
+            num_frames = dataset[key]['n_frames'][()]
+            nfps = dataset[key]['n_frame_per_seg'][...].tolist()
+            positions = dataset[key]['picks'][...]
+
+            machine_summary = vsum_tool.generate_summary(probs, cps, num_frames, nfps, positions)
+
+            if config.SAVE_RESULTS:
+                h5_res.create_dataset(key + '/score', data=probs)
+                h5_res.create_dataset(key + '/machine_summary', data=machine_summary)
+
+        if config.SAVE_RESULTS:
+            h5_res.close()
+
+if __name__ == '__main__':
+    main()
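`visualize_results.py` is deleted in the next hunk; if you still need the score-vs-gtscore information, the `result.h5` layout written by `evaluate()` above can be read back like this (a sketch):

```python
# Sketch: read the result.h5 written by evaluate()
# (per-video datasets: score, machine_summary, gtscore, fm).
import h5py

with h5py.File('log/summe-split0/result.h5', 'r') as h5_res:
    for key in h5_res.keys():
        score = h5_res[key]['score'][...]
        gtscore = h5_res[key]['gtscore'][...]
        fm = h5_res[key]['fm'][()]
        print("{}: F-score {:.1%} over {} scored frames".format(key, fm, len(score)))
```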
diff --git a/visualize_results.py b/visualize_results.py
deleted file mode 100644
index 1027b97..0000000
--- a/visualize_results.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import h5py
-from matplotlib import pyplot as plt
-import argparse
-import os
-import os.path as osp
-
-parser = argparse.ArgumentParser()
-parser.add_argument('-p', '--path', type=str, required=True,
-                    help="path to h5 file containing summarization results")
-args = parser.parse_args()
-
-h5_res = h5py.File(args.path, 'r')
-keys = h5_res.keys()
-
-for key in keys:
-    score = h5_res[key]['score'][...]
-    machine_summary = h5_res[key]['machine_summary'][...]
-    gtscore = h5_res[key]['gtscore'][...]
-    fm = h5_res[key]['fm'][()]
-
-    # plot score vs gtscore
-    fig, axs = plt.subplots(2)
-    n = len(gtscore)
-    axs[0].plot(range(n), gtscore, color='red')
-    axs[0].set_xlim(0, n)
-    axs[0].set_yticklabels([])
-    axs[0].set_xticklabels([])
-    axs[1].set_title("video {} F-score {:.1%}".format(key, fm))
-    axs[1].plot(range(n), score, color='blue')
-    axs[1].set_xlim(0, n)
-    axs[1].set_yticklabels([])
-    axs[1].set_xticklabels([])
-    fig.savefig(osp.join(osp.dirname(args.path), 'score_' + key + '.png'), bbox_inches='tight')
-    plt.close()
-
-    print "Done video {}. # frames {}.".format(key, len(machine_summary))
-
-h5_res.close()
\ No newline at end of file