-
Notifications
You must be signed in to change notification settings - Fork 0
/
meccano_mm.py
103 lines (87 loc) · 4.7 KB
/
meccano_mm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import csv
import os
from PIL import Image
from torch.utils.data import Dataset
import numpy as np
import torch
class Meccano(Dataset):
    """Two-modality clip dataset driven by a MECCANO action-annotation CSV.

    Each item is one annotated action segment. For that segment a fixed
    number of frames is sampled and loaded once per modality from
    ``<data_dir>/<modality>/<mode>/<video>/<frame>.jpg``.
    """

    def __init__(self, data_dir, mode, modality_1, modality_2, step_size, num_frames, transform):
        """Store the configuration and read the annotation file.

        Args:
            data_dir: Root directory holding the annotations and frame folders.
            mode: Split name; used verbatim as a folder name and lower-cased
                in the annotation file name.
            modality_1: Folder name of the first modality.
            modality_2: Folder name of the second modality.
            step_size: Distance (in frames) between consecutive sampled frames.
            num_frames: Number of frames returned per clip.
            transform: Callable applied to the list of PIL images of one
                modality; its result is passed to ``torch.stack``.
        """
        super().__init__()
        self.data_dir = data_dir
        self.mode = mode
        self.modality_1 = modality_1
        self.modality_2 = modality_2
        self.transform = transform
        self.step_size = step_size
        self.num_frames = num_frames

        print('Constructing MECCANO {}...'.format(mode))
        self._construct_loader()

    def _construct_loader(self):
        """Read ``action_annotations/MECCANO_<mode>_actions.csv`` into lists.

        The first row is treated as a header and skipped. Every other
        non-empty row must have exactly five fields: video path, action
        label, action noun, first frame name, last frame name.
        """
        path_to_file = os.path.join(
            self.data_dir, 'action_annotations/MECCANO_{}_actions.csv'.format(self.mode.lower())
        )
        self._path_to_videos = []
        self._labels = []
        self._frame_start = []
        self._frame_end = []
        # newline='' is what the csv module asks for when given a file object.
        with open(path_to_file, 'r', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # header row
            for row in reader:
                if not row:
                    # Blank lines (e.g. a trailing newline) carry no annotation.
                    continue
                video_path, action_label, _action_noun, frame_start, frame_end = row
                self._path_to_videos.append(video_path)
                self._frame_start.append(frame_start)
                self._frame_end.append(frame_end)
                self._labels.append(int(action_label))

    @staticmethod
    def _frame_number(frame_name):
        """Return the integer frame number encoded in an annotation frame name.

        The last four characters are dropped before conversion (presumably
        a ``.jpg`` extension -- confirm against the annotation files).
        """
        return int(frame_name[:-4])

    def _frame_path(self, modality, index, frame_number):
        """Build the image path of one frame of clip ``index`` for ``modality``."""
        # Frame files are named with the number zero-padded to five digits.
        return '{}/{}/{}/{}/{:05d}.jpg'.format(
            self.data_dir, modality, self.mode, self._path_to_videos[index], frame_number
        )

    def __getitem__(self, index):
        """Load the sampled frames of clip ``index`` for both modalities.

        Returns:
            A tuple ``(mod1, mod2, label, path, start, end)``. ``mod1`` and
            ``mod2`` are the stacked transform outputs with axes permuted by
            ``(1, 0, 2, 3)`` (presumably channels-first ``(C, T, H, W)`` --
            confirm against the transform). ``label`` is the integer action
            label, ``path`` the video path, and ``start`` / ``end`` the frame
            names exactly as they appear in the annotation file.
        """
        first_frame = self._frame_number(self._frame_start[index])
        last_frame = self._frame_number(self._frame_end[index])

        # Offsets are relative to the first frame of the clip.
        offsets = self.temporal_sampling(first_frame, last_frame, self.step_size, self.num_frames)
        frame_numbers = [first_frame + int(offset) for offset in offsets]

        # Only the sampled frames are turned into paths and opened.
        mod1_frames = [
            Image.open(self._frame_path(self.modality_1, index, number))
            for number in frame_numbers
        ]
        mod2_frames = [
            Image.open(self._frame_path(self.modality_2, index, number))
            for number in frame_numbers
        ]

        mod1_out_frames = torch.stack(self.transform(mod1_frames)).permute(1, 0, 2, 3)
        mod2_out_frames = torch.stack(self.transform(mod2_frames)).permute(1, 0, 2, 3)

        label = self._labels[index]
        path = self._path_to_videos[index]
        start = self._frame_start[index]
        end = self._frame_end[index]
        return mod1_out_frames, mod2_out_frames, label, path, start, end

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self._path_to_videos)

    def temporal_sampling(self, start_idx, end_idx, step_size, num_samples):
        """Pick ``num_samples`` frame offsets inside ``[start_idx, end_idx]``.

        If the clip is long enough, a window of ``num_samples`` frames spaced
        ``step_size`` apart is placed at a random position inside the clip
        (one ``np.random.randint`` draw). Otherwise sampling starts at
        ``start_idx`` and the last reachable frame is repeated to pad the
        result to ``num_samples`` entries.

        Returns:
            A NumPy integer array of offsets relative to ``start_idx``.
        """
        # How many frames fit in the clip at this step, starting from start_idx.
        available = (end_idx - start_idx) // step_size + 1
        # How many times the last frame must be repeated to reach num_samples.
        num_repeats = max(0, num_samples - available)

        # Keep the clip start untouched: the returned offsets are relative to
        # it, so the random shift must not be subtracted away again.
        window_start = start_idx
        if num_repeats == 0:
            latest_start = end_idx - (num_samples - 1) * step_size
            window_start = np.random.randint(start_idx, latest_start + 1)

        frame_ids = np.arange(
            window_start,
            window_start + (num_samples - num_repeats) * step_size,
            step_size,
        )
        if num_repeats > 0:
            frame_ids = np.append(frame_ids, np.full(num_repeats, frame_ids[-1]))

        return frame_ids - start_idx