example.py
import numpy as np
import skvideo.io
import torch

from mmpt.models import MMPTModel
# Alternative loader via torchvision, kept for reference:
#   from torchvision.io import read_video
#   video_frames, _, info = read_video(
#       "tennis-game.mkv", start_pts=60.0, end_pts=62.38, pts_unit="sec"
#   )  # (T, H, W, C) uint8 frames; info["video_fps"] holds the frame rate
# Load the clip with skvideo and take a 60-frame window (2 s at 30 fps).
videodata = skvideo.io.vread("tennis-game.mkv")
videodata = videodata[1000:1060, :, :, :]
_, H, W, C = videodata.shape
# Reshape to (bsz, t, fps, H, W, C): 60 frames -> 2 one-second chunks of 30 frames.
videodata = np.reshape(videodata, (1, 2, 30, H, W, C))
print(videodata.shape)
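# Minimal sanity check (a sketch): the reshape above only works when the
# sliced window holds exactly bsz * t * fps = 60 frames of identical size.
assert videodata.shape == (1, 2, 30, H, W, C)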
# Load the pretrained VideoCLIP model with its tokenizer and text aligner.
model, tokenizer, aligner = MMPTModel.from_pretrained(
    "projects/retri/videoclip/how2.yaml")
model.eval()
# Optional torchvision preprocessing (e.g. CenterCrop(224) and conversion to
# float) could be applied here; this example feeds the raw frames directly.
# Expected input layout: (bsz, t, fps, H, W, C) -- VideoCLIP is trained on
# 30-fps S3D features.
# Candidate captions to score against the clip.
text_to_try = ["cats and dogs", "ordering a meal", "two men",
               "men playing sports", "tennis", "men playing tennis"]
# Scale uint8 pixels to [0, 1] and convert to float32.
video_frames = torch.from_numpy(videodata / 255.0).float()
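# Quick guard (a sketch, assuming the layout built above): the model expects a
# float tensor shaped (bsz, t, fps, H, W, C).
assert video_frames.dtype == torch.float32 and video_frames.shape[:3] == (1, 2, 30)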
for text in text_to_try:
    # Tokenize the caption and build the padded/masked text sequence.
    caps, cmasks = aligner._build_text_seq(
        tokenizer(text, add_special_tokens=False)["input_ids"]
    )
    caps, cmasks = caps[None, :], cmasks[None, :]  # bsz=1
    with torch.no_grad():
        output = model(video_frames, caps, cmasks, return_score=True)
    print(text, output["score"])  # dot-product of video and text embeddings
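
# Sketch: collect the same per-caption scores as the loop above into a sorted
# ranking. Assumes output["score"] is a single-element tensor, so .item()
# extracts it; the helper name rank_captions is illustrative, not part of MMPT.
def rank_captions(texts):
    scores = {}
    for text in texts:
        caps, cmasks = aligner._build_text_seq(
            tokenizer(text, add_special_tokens=False)["input_ids"]
        )
        caps, cmasks = caps[None, :], cmasks[None, :]
        with torch.no_grad():
            out = model(video_frames, caps, cmasks, return_score=True)
        scores[text] = out["score"].item()
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

for text, score in rank_captions(text_to_try):
    print(f"{score:8.4f}  {text}")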