-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoco2017_download.py
185 lines (137 loc) · 6.82 KB
/
coco2017_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from pycocotools.coco import COCO
import json
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
def annotations_to_json(new_json, original_json, filenames):
# Create dictionaries for new annotations
new_annotations = {
"images": [],
"annotations": []
}
# Load original annotations
with open(original_json, 'r') as annotation_file:
original_annotations = json.load(annotation_file)
# Filter annotations for the selected images
for image_info in original_annotations['images']:
if image_info['file_name'] in filenames:
new_annotations['images'].append(image_info)
annotations = [ann for ann in original_annotations['annotations'] if ann['image_id'] == image_info['id']]
new_annotations['annotations'].extend(annotations)
print("\nWriting annotations.json for images...\n")
with open(new_json, "w") as json_file:
json.dump(new_annotations, json_file, indent=4)
# Function that lets you view a cluster (based on identifier)
def view_cluster(cluster, dict, image_dir):
plt.figure(figsize=(25, 25))
# Gets the list of filenames for a cluster
files = list(dict[cluster])
print(f"Task {cluster} size = {len(files)}")
# Only allow up to 30 images to be shown at a time
if len(files) > 30:
files = files[:29]
# Plot each image in the cluster
for index, file in enumerate(files):
plt.subplot(10, 10, index + 1)
img = Image.open(image_dir + file)
img = np.array(img)
plt.imshow(img)
plt.axis('off')
plt.show()
# Split to tasks using disjoint supercategories
def split2tasks_supercats(image_dir, new_json, caption_annFile, instance_annFile, path2txt, train=True):
# Define five disjoint supercategories
five_supercats = {
'Living Things': ['person', 'animal'],
'Vehicles and Outdoors': ['vehicle', 'outdoor', 'sports'],
'Home and Furniture': ['furniture', 'indoor', 'appliance'],
'Personal Items': ['accessory', 'electronic'],
'Food and Kitchen': ['food', 'kitchen']
}
# Create a mapping from original supercategories to new ones
original_to_new_supercat = {}
for new_supercat, original_supercats in five_supercats.items():
for original_supercat in original_supercats:
original_to_new_supercat[original_supercat] = new_supercat
# Load COCO dataset
coco = COCO(instance_annFile)
# Get all super-categories
cats = coco.loadCats(coco.getCatIds())
cat_to_supercat = {cat['id']: original_to_new_supercat[cat['supercategory']] for cat in cats}
# Get all images and filter for our subset
all_images = coco.loadImgs(coco.getImgIds())
# Initialize clusters
clusters = defaultdict(set)
image_filenames = []
# Cluster images by supercat (an image may be part of multiple supercategories, so an image belongs to more than one tasks)
print('\n Clustering images to disjoint supercategories...')
for img in tqdm(all_images):
ann_ids = coco.getAnnIds(imgIds=img['id'])
anns = coco.loadAnns(ann_ids)
supercats = set(cat_to_supercat[ann['category_id']] for ann in anns)
# If image belongs to multiple new supercategories, put it in 'Other'
if len(supercats) > 1:
clusters['Other'].add(img['file_name'])
# Check if the file exists before deleting
if os.path.exists(image_dir + img['file_name']):
os.remove(image_dir + img['file_name'])
else:
# If it belongs to exactly one category, add it to that category
try:
clusters[list(supercats)[0]].add(img['file_name'])
image_filenames.append(img['file_name'])
except IndexError:
# Check if the file exists before deleting
if os.path.exists(image_dir + img['file_name']):
os.remove(image_dir + img['file_name'])
continue
# Save the filenames selected
with open(path2txt, 'w') as file:
for filename in image_filenames:
file.write(f"{filename}\n")
# Create annotation file for selected images
annotations_to_json(new_json, caption_annFile, image_filenames)
# Print the distribution
total_images = sum(len(images) for images in clusters.values())
print(f"Total images: {total_images}")
print("\nDistribution:")
for supercat, images in clusters.items():
print(f"{supercat}: {len(images)} images ({len(images) / total_images * 100:.2f}%)")
# Calculate percentage of images in 'Other' category
other_percentage = len(clusters['Other']) / total_images * 100 if 'Other' in clusters else 0
print(f"\nPercentage of images in 'Other' category: {other_percentage:.2f}%")
print('\nCreating directories and annotations for each task...')
for cat in five_supercats.keys():
# view_cluster(cat, five_supercats, image_dir)
files = clusters[cat]
if train:
os.mkdir(f'./data/MSCOCO_annotations/task_{cat}_train_images')
for img in files:
image_data = Image.open(image_dir + img)
image_data.save(f'./data/MSCOCO_annotations/task_{cat}_train_images/' + img)
annotations_to_json(f'./data/MSCOCO_annotations/task_{cat}_train_annotations.json', new_json,
files)
else:
os.mkdir(f'./data/MSCOCO_annotations/task_{cat}_test_images')
for img in files:
image_data = Image.open(image_dir + img)
image_data.save(f'./data/MSCOCO_annotations/task_{cat}_test_images/' + img)
annotations_to_json(f'./data/MSCOCO_annotations/task_{cat}_test_annotations.json', new_json,
files)
# Training dataset arguments
train_cap_annFile = './data/MSCOCO_annotations/captions_train2017.json'
train_inst_annFile = './data/MSCOCO_annotations/instances_train2017.json'
train_image_dir = './data/MSCOCO_annotations/MSCOCO2017_train_images/'
new_train_json = './data/MSCOCO_annotations/filtered_train_annotations.json'
path2txt_train = './data/MSCOCO_annotations/train_image_filenames.txt'
split2tasks_supercats(train_image_dir, new_train_json, train_cap_annFile, train_inst_annFile, path2txt_train, train=True)
# Test dataset arguments
test_cap_annFile = './data/MSCOCO_annotations/captions_val2017.json'
test_inst_annFile = './data/MSCOCO_annotations/instances_val2017.json'
test_image_dir = './data/MSCOCO_annotations/MSCOCO2017_test_images/'
new_test_json = './data/MSCOCO_annotations/filtered_test_annotations.json'
path2txt_test = './data/MSCOCO_annotations/test_image_filenames.txt'
split2tasks_supercats(test_image_dir, new_test_json, test_cap_annFile, test_inst_annFile, path2txt_test, train=False)