-
Notifications
You must be signed in to change notification settings - Fork 512
/
wordnet_tagged_classification.py
489 lines (397 loc) · 17.7 KB
/
wordnet_tagged_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#
import argparse
import fcntl
import glob
import io
import os
import pickle
import random
import shutil
import tarfile
from pathlib import Path
from typing import Any, List, Mapping, Tuple
from urllib.parse import urlsplit
import pybase64
import torch
from PIL import Image, ImageFile
try:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
from nltk.stem import WordNetLemmatizer
NLTK_INSTALLED = True
except ModuleNotFoundError:
wn = None
Synset = None
WordNetLemmatizer = None
NLTK_INSTALLED = False
from corenet.constants import DATA_CACHE_DIR
from corenet.data.datasets import DATASET_REGISTRY
from corenet.data.datasets.classification.base_image_classification_dataset import (
BaseImageClassificationDataset,
)
from corenet.data.datasets.dataset_base import BaseImageDataset
from corenet.data.datasets.utils.text import caption_preprocessing
from corenet.data.io.transfer_clients import BaseClient, get_transfer_client
from corenet.data.transforms import BaseTransformation
from corenet.utils import logger
from corenet.utils.download_utils import get_local_path
# Naming conventions for the dataset files: each shard is a gzip-compressed tar
# archive (opened with the matching tarfile mode), and every sample inside an
# extracted shard is stored as an individual pickle file.
SAMPLE_FILE_EXTN = "pkl"
TAR_FILE_EXTN = "tar.gz"
TAR_FILE_EXTRACTION_CODE = "r:gz"

# PIL's module-level defaults are overridden at import time so that truncated
# image files can still be loaded and no upper bound is set on the pixel count.
# TODO: Investigate later if below Image flags can be moved to where Image is read.
ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = None
def extract_pos_offset_info_from_synset(synset: Synset) -> str:
    """Builds the pos-offset identifier of a WordNet synset.

    Args:
        synset: WordNet synset exposing 'offset()' and 'pos()' accessors.

    Returns:
        The part-of-speech tag immediately followed by the offset, as a single string.
        For example, a synset with pos 'n' and offset 5928118 yields 'n5928118'.
    """
    synset_offset, synset_pos = synset.offset(), synset.pos()
    return "{}{}".format(synset_pos, synset_offset)
def check_valid_noun_synset(synset: Synset, word: str) -> bool:
    """Tells whether @synset is named after @word as its first noun entry.

    Args:
        synset: Input synset; only its 'name()' is inspected.
        word: Input word.

    Returns:
        True when the synset's name is exactly '<word>.n.01', False otherwise.
    """
    actual_name = synset.name()
    expected_name = "{}.n.01".format(word)
    return actual_name == expected_name
@DATASET_REGISTRY.register(name="wordnet_tagged_classification", type="classification")
class WordnetTaggedClassificationDataset(BaseImageDataset):
    """WordNet tagged classification dataset.

    This class converts the image-text dataset into multi-label classification dataset. The expected data structure of
    the input data should be the same as 'corenet.data.datasets.multi_modal_img_text.img_text_tar_dataset.ImgTextTarDataset'

    Args:
        opts: Command-line arguments.
    """

    def __init__(
        self,
        opts: argparse.Namespace,
        *args,
        **kwargs,
    ) -> None:
        if not NLTK_INSTALLED:
            logger.error(
                "Please install NLTK library using 'pip install nltk==3.8.1' and 'python3 -m nltk.downloader all' commands."
            )

        super().__init__(opts=opts, *args, **kwargs)
        # The transfer client is created lazily (see '_get_transfer_client') because
        # some clients are not pickle-able.
        self._transfer_client = None
        self.vocab = self._get_vocab()
        self.__post_init__()

    def __post_init__(self) -> None:
        """Post init checks."""
        if not self.is_training:
            raise NotImplementedError("Only training is supported for now.")

    def _get_vocab(self) -> List[str]:
        """Retrieves the vocabulary as a list.

        The vocabulary is structured as a dictionary where synsets are represented as keys in pos-offset format,
        and their corresponding frequencies in the dataset are stored as values. The key-value pairs are arranged
        in descending order based on the frequencies. An example is shown below:
            {
                "n5928118": 52418327,
                "n13333833": 46393897,
                "n4960277": 38781582,
                "n7947958": 36532096,
                "n9638875": 34564013,
                "n928077": 30290822,
                "n7996689": 28076676,
                "n10787470": 24182531,
                "n5938976": 23817664,
                "n8559508": 23476398
            }

        Returns:
            A list containing the name of top-k synsets. The value of 'k' is specified using
            'dataset.wordnet_tagged_classification.vocab_size' argument.
        """
        vocab_file_path = getattr(
            self.opts, "dataset.wordnet_tagged_classification.vocab_file"
        )
        if vocab_file_path is None:
            logger.error(f"Vocab path can't be None in {self.__class__.__name__}.")

        vocab_file_path = get_local_path(
            self.opts,
            path=vocab_file_path,
            force_delete=False,
            use_start_rank=True,
            sync_ranks=False,
        )

        # NOTE(review): pickle.load can execute arbitrary code; the vocab file must
        # come from a trusted source.
        with open(vocab_file_path, "rb") as f:
            vocab = pickle.load(f)

        vocab_size = getattr(
            self.opts, "dataset.wordnet_tagged_classification.vocab_size"
        )
        # A vocabulary of size zero yields an empty target vector, so zero is rejected
        # as well, in line with the "positive number" requirement in the message.
        if vocab_size is None or vocab_size <= 0:
            logger.error(
                f"Vocabulary size should be a positive number. Got: {vocab_size}. Please specify by 'dataset.wordnet_tagged_classification.vocab_size' argument."
            )
        # Keys are expected to be ordered by descending frequency, so slicing keeps
        # the top-k synsets.
        return list(vocab.keys())[:vocab_size]

    def _synset_to_label_map(self) -> Mapping[str, int]:
        """Returns a mapping from synset name (pos-offset format) to its index in @self.vocab.

        The mapping is built once and re-used for as long as @self.vocab refers to the same list
        object; it is rebuilt if @self.vocab is re-assigned. For duplicated entries the first
        index is kept, which matches the behavior of 'list.index'.
        """
        cached = getattr(self, "_synset_to_label_cache", None)
        if cached is not None and cached[0] is self.vocab:
            return cached[1]

        synset_to_label = {}
        for label, synset_name in enumerate(self.vocab):
            synset_to_label.setdefault(synset_name, label)
        self._synset_to_label_cache = (self.vocab, synset_to_label)
        return synset_to_label

    def _metadata_file_path(self) -> str:
        """Returns metadata file path from command-line arguments."""
        opts = self.opts
        metadata_file_path = getattr(
            opts, "dataset.wordnet_tagged_classification.metadata_file"
        )
        if not metadata_file_path:
            logger.error(
                "Please specify metadata file path using 'dataset.wordnet_tagged_classification.metadata_file'."
            )
        return metadata_file_path

    def _metadata(self) -> Mapping[str, Any]:
        """Reads the metadata content.

        The parsed content is cached on the instance after the first read, so that repeated
        accesses (e.g., once per sample through @max_files_per_tar) do not re-read the file.

        ...note:
            The metadata file is expected to have following keys:
                1. total_tar_files: Total number of tar files in the dataset.
                2. max_files_per_tar: Maximum number of files inside each tar.
                3. tar_file_names: List containing names of the tar files.
        """
        cached_metadata = getattr(self, "_metadata_cache", None)
        if cached_metadata is not None:
            return cached_metadata

        opts = self.opts
        metadata_file_path = self._metadata_file_path()
        # download the metadata file
        metadata_file_local_path = get_local_path(
            opts,
            path=metadata_file_path,
            force_delete=False,
            use_start_rank=True,
            sync_ranks=False,
        )
        # NOTE(review): pickle.load can execute arbitrary code; the metadata file must
        # come from a trusted source.
        with open(metadata_file_local_path, "rb") as handle:
            metadata = pickle.load(handle)

        if not {"total_tar_files", "max_files_per_tar", "tar_file_names"}.issubset(
            metadata.keys()
        ):
            logger.error(
                f"Metadata file in {self.__class__.__name__} should have following keys: "
                "total_tar_files, max_files_per_tar, tar_file_names"
            )
        self._metadata_cache = metadata
        return metadata

    def _download_and_extract_tar_file(self, sample_index: int) -> int:
        """Downloads and extracts the tar file.

        The tar files are pre-assumably stored in remote location (e.g., S3 bucket) and, if required, are downloaded and
        extracted to local directory @self.cache_loc. Because of distributed and multi-process training, we first extract
        them in the same location as downloaded, and then move to @self.cache_loc.

        Args:
            sample_index: Sample index.

        Returns:
            Index of the folder in which sample may be present.

        ...note:
            Each tar file may have samples less than @self.max_files_per_tar because of filtering criteria.
        """
        # Retrieve the folder index that may contain the sample.
        folder_idx = sample_index // self.max_files_per_tar

        # Tar files are looked up next to the metadata file in the remote location.
        metadata_file_path = self._metadata_file_path()
        remote_directory = os.path.dirname(metadata_file_path)
        remote_file_path = f"{remote_directory}/{folder_idx}.{TAR_FILE_EXTN}"

        # A per-tar lock file with an exclusive flock serializes processes that need the
        # same tar file; the lock is released in the 'finally' clause on every exit path.
        with open(
            f"{self.cache_loc}/{folder_idx}.{TAR_FILE_EXTN}.lock", "a"
        ) as lock_file:
            try:
                fcntl.flock(lock_file, fcntl.LOCK_EX)
                # Checked after acquiring the lock: another process may have already
                # extracted this folder while we were waiting.
                if os.path.isdir(f"{self.cache_loc}/{folder_idx}"):
                    return folder_idx

                transfer_client = self._get_transfer_client(
                    file_path=metadata_file_path
                )
                local_tar_file_path = transfer_client.download(
                    remote_file_paths=remote_file_path, dst_dir=self.cache_loc
                )
                # extract the tar file in the same location where tar file is downloaded
                tar_file_basename = os.path.basename(local_tar_file_path)
                # NOTE(review): extractall trusts the member paths stored in the archive;
                # only archives from a trusted location should be used here.
                with tarfile.open(local_tar_file_path, TAR_FILE_EXTRACTION_CODE) as tar:
                    tar.extractall(
                        path=local_tar_file_path.replace(tar_file_basename, "")
                    )
                # move extracted tar file to @self.cache_loc
                shutil.move(
                    local_tar_file_path.replace(f".{TAR_FILE_EXTN}", ""), self.cache_loc
                )

                # Delete the tar file
                if os.path.exists(local_tar_file_path):
                    os.remove(local_tar_file_path)
            finally:
                fcntl.flock(lock_file, fcntl.LOCK_UN)
        return folder_idx

    def _convert_caption_to_labels(self, captions_str: str) -> List[int]:
        """Converts the caption into multi-class labels.

        The input caption is tokenized into words, and noun synsets are extracted for each word. Subsequently, the
        parts of speech (POS) and offsets of the extracted noun synsets are compared with those in the vocabulary
        to generate a list of multi-class labels.

        Args:
            captions_str: Input caption as a string.

        Returns:
            A list of integers, where each integer corresponds to the index of the matching synset in the vocabulary.
            In case there are no matching synsets, an empty list is returned. A synset index appears once per
            matching word, so the list may contain duplicates.
        """
        captions_str = caption_preprocessing(captions_str)
        # process caption and find synsets
        # NOTE(review): the POS tags produced here are currently not used; only the tokens are.
        tagged_words = nltk.pos_tag(nltk.word_tokenize(captions_str))
        lemmatzr = WordNetLemmatizer()
        # Dictionary lookup instead of scanning the vocabulary list for every word.
        synset_to_label = self._synset_to_label_map()

        labels = []
        for word, _pos in tagged_words:
            # use lemmatizer to reduce text ambiguity.
            # words like bicycle and bicycles are converted to bicycle
            try:
                word = lemmatzr.lemmatize(word)
                noun_synset = wn.synset(f"{word}.n.01")
            except Exception:
                # No lemma 'is' with part of speech 'n', then nltk.corpus.reader.wordnet.WordNetError is raised.
                # Skip such cases
                continue

            if not check_valid_noun_synset(noun_synset, word):
                continue

            label = synset_to_label.get(
                extract_pos_offset_info_from_synset(noun_synset)
            )
            if label is not None:
                # add the indices of the labels
                labels.append(label)
        return labels

    def _read_sample_with_wordnet_label_mining(
        self, sample_index: int
    ) -> Tuple[Image.Image, List[int]]:
        """Reads the sample with WordNet derived labels.

        The function extracts the image and caption corresponding to input @sample_index. It then
        converts the caption into multi-class labels.

        Args:
            sample_index: Sample index.

        Returns:
            Returns a tuple of image and multi-class labels (vocabulary indices) for a given sample index.
        """
        # Check if this folder exists. If not, then download the tar file and extract it.
        folder_idx = self._download_and_extract_tar_file(sample_index=sample_index)

        file_name = f"{self.cache_loc}/{folder_idx}/{sample_index}.{SAMPLE_FILE_EXTN}"
        if not Path(file_name).exists():
            # Each tar file is supposed to have certain number of samples, but
            # it may not have all samples (because some samples may be corrupted and are filtered).
            # Therefore, if file does not exist, we randomly sample the file from a folder and return its content.
            # This helps in avoiding errors related to tensor mismatch shapes (usually arises when each GPU has different batch size)
            # when gathering the image and text embeddings from all GPUs in contrastive loss.
            files_in_folder = glob.glob(
                f"{self.cache_loc}/{folder_idx}/*.{SAMPLE_FILE_EXTN}"
            )
            assert (
                len(files_in_folder) > 0
            ), f"No '{SAMPLE_FILE_EXTN}' files found in {self.cache_loc}/{folder_idx}."
            file_name = random.choice(files_in_folder)

        # NOTE(review): pickle.load can execute arbitrary code; sample files must
        # come from a trusted source.
        with open(file_name, "rb") as handle:
            data = pickle.load(handle)

        # The image is stored base64-encoded inside the sample.
        img_bytes = pybase64.b64decode(data["image"], validate=True)
        image = Image.open(io.BytesIO(img_bytes)).convert("RGBA").convert("RGB")

        if "texts" in data:
            caption_str = data["texts"]
        elif "text" in data:
            caption_str = data["text"]
        else:
            raise NotImplementedError("Text key not found.")

        labels = self._convert_caption_to_labels(captions_str=caption_str)
        return image, labels

    def _training_transforms(self, *args, **kwargs) -> BaseTransformation:
        """Image transformations to be applied on input image during training.

        See 'BaseImageClassificationDataset' for the supported transformations for the classification task.
        """
        return BaseImageClassificationDataset._training_transforms(
            self, *args, **kwargs
        )

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Add dataset-specific arguments to the parser."""
        if cls == WordnetTaggedClassificationDataset:
            group = parser.add_argument_group(title=cls.__name__)
            group.add_argument(
                "--dataset.wordnet-tagged-classification.vocab-file",
                type=str,
                default=None,
                help="Location of vocab pickle file. Defaults to None.",
            )
            group.add_argument(
                "--dataset.wordnet-tagged-classification.metadata-file",
                type=str,
                default=None,
                help="Metadata file containing information about img-text pairs. Defaults to None.",
            )
            group.add_argument(
                "--dataset.wordnet-tagged-classification.vocab-size",
                type=int,
                default=None,
                help="Vocabulary threshold. Synsets in the ordered vocabulary dictionary beyond this threshold will not be used. Defaults to None (i.e., user needs to specify the value).",
            )
        return parser

    def _get_transfer_client(self, file_path: str) -> BaseClient:
        """Get transfer client for a given file path.

        The client is selected from the URL scheme of @file_path, created on first use, and
        re-used for subsequent calls.

        Args:
            file_path: File path.

        Returns:
            An instance of BaseClient.

        ...note:
            Some of the clients are not pickle-able (e.g., S3). Therefore, this function should not be
            called inside the '__init__' function.
        """
        if self._transfer_client is None:
            opts = self.opts
            client_name = urlsplit(file_path).scheme.lower()
            self._transfer_client = get_transfer_client(
                opts,
                transfer_client_name=client_name,
                force_delete=False,
                only_download_on_start_rank=False,
                synchronize_distributed_ranks=False,
                parallel_download=False,
            )
        return self._transfer_client

    def __getitem__(
        self, sample_size_and_index: Tuple[int, int, int]
    ) -> Mapping[str, Any]:
        """Returns the sample corresponding to the input sample index.

        Returned sample is transformed into the size specified by the input.

        Args:
            sample_size_and_index: Tuple of the form (crop_size_h, crop_size_w, sample_index).

        Returns:
            A dictionary with 'samples', 'targets', and 'sample_id' as keys corresponding to input,
            label, and index of a sample, respectively.

        Shapes:
            The shape of values in output dictionary, output_data, are as follows:
                output_data["samples"]: Shape is [Channels, Height, Width]
                output_data["targets"]: Shape is [vocab_size]
                output_data["sample_id"]: Shape is [1]
        """
        crop_size_h, crop_size_w, sample_index = sample_size_and_index
        transform_fn = self.get_augmentation_transforms(size=(crop_size_h, crop_size_w))

        image, labels = self._read_sample_with_wordnet_label_mining(sample_index)

        # convert labels to multi-hot vector (one entry per synset in the vocabulary)
        targets = torch.zeros((self.vocab_size), dtype=torch.long)
        if labels is not None and len(labels) > 0:
            targets[labels] = 1

        output_data = {
            "samples": transform_fn({"image": image})["image"],
            "targets": targets,
            "sample_id": sample_index,
        }
        return output_data

    def __len__(self) -> int:
        # Upper bound on the number of samples; individual tar files may hold fewer
        # than 'max_files_per_tar' samples.
        return self.total_tar_files * self.max_files_per_tar

    @property
    def cache_loc(self) -> str:
        """Local directory in which tar files are downloaded and extracted."""
        return DATA_CACHE_DIR

    @property
    def vocab_size(self) -> int:
        """Number of synsets in the vocabulary."""
        return len(self.vocab)

    @property
    def total_tar_files(self) -> int:
        """Total number of tar files in the dataset."""
        metadata = self._metadata()
        return metadata["total_tar_files"]

    @property
    def max_files_per_tar(self) -> int:
        """Maximum number of files inside each tar file."""
        metadata = self._metadata()
        return metadata["max_files_per_tar"]

    def extra_repr(self) -> str:
        """Extends the parent representation with dataset-specific statistics."""
        return super().extra_repr() + (
            f"\n\ttotal_tar_files={self.total_tar_files}"
            f"\n\tmax_files_per_tar={self.max_files_per_tar}"
            f"\n\tnum_synsets={self.vocab_size}"
        )