-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsampler.py
148 lines (108 loc) · 5.55 KB
/
sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import torch
from collections import defaultdict
import numpy as np
class BatchSampler:
def __init__(self, samples: torch.Tensor, labels: torch.Tensor, batch_size: int, num_class_eff=None, num_class_eff_min=None, num_class_eff_max=None, online=False):
self.samples = samples
self.labels = labels
self.batch_size = batch_size
self.label_dict = defaultdict(list)
self.classes = sorted(list(set(labels.tolist())))
for i, label in enumerate(labels):
self.label_dict[label.item()].append(i)
self.num_class = len(self.classes)
self.num_class_eff = num_class_eff if num_class_eff else self.num_class
self.num_class_eff_min = num_class_eff_min
self.num_class_eff_max = num_class_eff_max
self.max_samples_per_class = [len(self.label_dict[label]) for label in self.classes]
self.online = online
self.remaining_indices_per_class = None
if self.online:
# In online mode, initialize remaining_indices_per_class with all available indices per class
self.reset_remaining_indices()
def reset_remaining_indices(self):
"""Resets the remaining indices for online sampling."""
self.remaining_indices_per_class = {label: indices[:] for label, indices in self.label_dict.items()}
for label in self.remaining_indices_per_class:
np.random.shuffle(self.remaining_indices_per_class[label])
def generate_indices(self) -> list[int] | None:
final_indices = []
# Determine the number of classes to sample (num_class_eff)
if self.num_class_eff is None and self.num_class_eff_min is not None and self.num_class_eff_max is not None:
num_class_eff = np.random.randint(self.num_class_eff_min, self.num_class_eff_max + 1)
else:
num_class_eff = self.num_class_eff
# Offline setting: select classes first
if num_class_eff > 0 and num_class_eff < self.num_class:
selected_classes = np.random.choice(self.classes, num_class_eff, replace=False)
else:
selected_classes = self.classes
# Gather all indices from the selected classes
all_indices = []
for label in selected_classes:
all_indices.extend(self.label_dict[label])
if self.batch_size == -1 or len(all_indices) < self.batch_size:
final_indices = all_indices
else:
# Randomly select batch_size indices from all available indices
final_indices = np.random.choice(all_indices, self.batch_size, replace=False).tolist()
np.random.shuffle(final_indices)
return final_indices
class OnlineSampler:
def __init__(self, samples: torch.Tensor, labels: torch.Tensor, gamma: float, slots:int, batch_size: int, num_class_eff=None, num_class_eff_min=None, num_class_eff_max=None):
self.label_dict = defaultdict(list)
self.classes = sorted(list(set(labels.tolist())))
for i, label in enumerate(labels):
self.label_dict[label.item()].append(i)
self.labels = list(self.label_dict.keys())
self.labels.sort()
self.gamma = gamma
self.batch_size = batch_size
self.num_class = len(self.classes)
if slots is not None:
self.num_slots = slots
else:
self.num_slots = self.num_class_eff #if self.num_class_eff <= 100 else 100
self.c = 0
if self.gamma == -1:
self.create_indices_class(labels)
else:
self.create_indices(labels)
def generate_indices(self) -> list[int] | None:
if self.c + self.batch_size <= len(self.indices):
batch = self.indices[self.c:self.c + self.batch_size]
self.c += self.batch_size
return batch
else:
return None
def create_indices(self, test_labels):
selected_classes = self.classes
num_selected_classes = len(selected_classes)
final_indices = []
rng = np.random.default_rng()
label_distribution = rng.dirichlet([self.gamma] * self.num_slots, self.num_class)
slot_indices = [[] for _ in range(self.num_slots)]
for l,label in enumerate(selected_classes):
indices = np.array(self.label_dict[label])
partition = label_distribution[self.labels.index(label),:]
samples_per_splits = np.split(indices, (np.cumsum(partition)[:-1] * len(indices)).astype(int))
for s, ids in enumerate(samples_per_splits):
slot_indices[s].extend(ids)
for s_ids in slot_indices:
permutation = np.random.permutation(range(len(s_ids)))
ids = []
for i in permutation:
ids.extend(s_ids[i] if isinstance(s_ids[i], list) else [s_ids[i]])
final_indices.extend(ids)
self.indices = final_indices
def create_indices_class(self, test_labels):
# When gamma == -1, we want to order the classes separately and shuffle both classes and samples
final_indices = []
# Shuffle the order of the classes
shuffled_classes = np.random.permutation(self.classes)
for label in shuffled_classes:
indices = np.array(self.label_dict[label])
# Shuffle the samples within each class
np.random.shuffle(indices)
final_indices.extend(indices)
self.indices = final_indices