diff --git a/README.md b/README.md index 924c712a..82e3fe58 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ print(entry.split, entry.patient) | Name | Entries | Body region | Modality | |:-----------------------------------------------------------------------------------------------------------------------------------|----------:|:------------------------------------|:-----------------------------------------------------------------------| -| AMOS | 600 | Abdomen | CT, MRI | +| AMOS | 2465 | Abdomen | CT, MRI | | BIMCVCovid19 | 16335 | Chest | CT | | BraTS2021 | 5880 | Head | MRI T1, MRI T1Gd, MRI T2, MRI T2-FLAIR | | CC359 | 359 | Head | MRI T1 | diff --git a/amid/amos/dataset.py b/amid/amos/dataset.py index 24086793..9c567828 100644 --- a/amid/amos/dataset.py +++ b/amid/amos/dataset.py @@ -13,8 +13,10 @@ from .utils import label -ARCHIVE_NAME = 'amos22.zip' +ARCHIVE_NAME_SEG = 'amos22.zip' ARCHIVE_ROOT_NAME = 'amos22' +ERRORS = ['5514', '5437'] # these ids are damaged in the zip archives +# TODO: add MRI class AMOSBase(Source): @@ -56,24 +58,44 @@ def _base(_root: Silent): return Path(_root) @meta - def ids(_id2split): - return sorted(_id2split) - - def image(i, _id2split, _base): - file = f'images{_id2split[i]}/amos_{i}.nii.gz' - - with unpack(_base / ARCHIVE_NAME, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked): + def ids(_id2split, _ids_unlabelled): + labelled = sorted(_id2split) + unlabelled = sorted(_ids_unlabelled) + return labelled + unlabelled + + def image(i, _id2split, _base, _archive_name): + """Corresponding 3D image.""" + if i in ERRORS: + return None # this image is damaged in the archive + + archive_name, archive_root = _archive_name + if i in _id2split: + archive_name = ARCHIVE_NAME_SEG + archive_root = ARCHIVE_ROOT_NAME + file = f'images{_id2split[i]}/amos_{i}.nii.gz' + else: + file = f'amos_{i}.nii.gz' + + with unpack(_base / archive_name, file, archive_root, '.zip') as (unpacked, is_unpacked): if is_unpacked: return 
np.asarray(nibabel.load(unpacked).dataobj) else: with open_nii_gz_file(unpacked) as image: return np.asarray(image.dataobj) - def affine(i, _id2split, _base): - """The 4x4 matrix that gives the image's spatial orientation""" - file = f'images{_id2split[i]}/amos_{i}.nii.gz' - - with unpack(_base / ARCHIVE_NAME, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked): + def affine(i, _id2split, _base, _archive_name): + """The 4x4 matrix that gives the image's spatial orientation.""" + if i in ERRORS: + return None # this image is damaged in the archive + archive_name, archive_root = _archive_name + if i in _id2split: + archive_name = ARCHIVE_NAME_SEG + archive_root = ARCHIVE_ROOT_NAME + file = f'images{_id2split[i]}/amos_{i}.nii.gz' + else: + file = f'amos_{i}.nii.gz' + + with unpack(_base / archive_name, file, archive_root, '.zip') as (unpacked, is_unpacked): if is_unpacked: return nibabel.load(unpacked).affine else: @@ -81,17 +103,26 @@ def affine(i, _id2split, _base): return image.affine def mask(i, _id2split, _base): - file = f'labels{_id2split[i]}/amos_{i}.nii.gz' + if i in _id2split: + file = f'labels{_id2split[i]}/amos_{i}.nii.gz' + else: + return try: - with unpack(_base / ARCHIVE_NAME, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked): + with unpack(_base / ARCHIVE_NAME_SEG, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked): if is_unpacked: return np.asarray(nibabel.load(unpacked).dataobj) else: with open_nii_gz_file(unpacked) as image: return np.asarray(image.dataobj) except FileNotFoundError: - return None + return + + def image_modality(i): + """Returns image modality, `CT` or `MRI`.""" + if 500 < int(i) <= 600: + return 'MRI' + return 'CT' # labels @@ -107,7 +138,7 @@ def mask(i, _id2split, _base): def _id2split(_base): id2split = {} - with ZipFile(_base / ARCHIVE_NAME) as zf: + with ZipFile(_base / ARCHIVE_NAME_SEG) as zf: for x in zf.namelist(): if (len(x.strip('/').split('/')) == 3) and x.endswith('.nii.gz'): file, split = 
x.split('/')[-1], x.split('/')[-2][-2:] @@ -117,12 +148,46 @@ def _id2split(_base): return id2split + def _ids_unlabelled(_base): + ids_unlabelled = [] + for archive in [ + 'amos22_unlabeled_ct_5000_5399.zip', + 'amos22_unlabeled_ct_5400_5899.zip', + 'amos22_unlabeled_ct_5900_6199.zip', + 'amos22_unlabeled_ct_6200_6899.zip', + ]: + with ZipFile(_base / archive) as zf: + for x in zf.namelist(): + if x.endswith('.nii.gz'): + file = x.split('/')[-1] + id_ = file.split('.')[0].split('_')[-1] + ids_unlabelled.append(id_) + return ids_unlabelled + @lru_cache(None) def _meta(_base): - file = 'labeled_data_meta_0000_0599.csv' - - with unpack(_base, file) as (unpacked, _): - return pd.read_csv(unpacked) + files = [ + 'labeled_data_meta_0000_0599.csv', + 'unlabeled_data_meta_5400_5899.csv', + 'unlabeled_data_meta_5000_5399.csv', + 'unlabeled_data_meta_5900_6199.csv', + ] + + dfs = [] + for file in files: + with unpack(_base, file) as (unpacked, _): + dfs.append(pd.read_csv(unpacked)) + return pd.concat(dfs) + + def _archive_name(i): + if 5000 <= int(i) < 5400: + return 'amos22_unlabeled_ct_5000_5399.zip', 'amos_unlabeled_ct_5000_5399' + elif 5400 <= int(i) < 5900: + return 'amos22_unlabeled_ct_5400_5899.zip', 'amos_unlabeled_ct_5400_5899' + elif 5900 <= int(i) < 6200: + return 'amos22_unlabeled_ct_5900_6199.zip', 'amos22_unlabeled_ct_5900_6199' + else: + return 'amos22_unlabeled_ct_6200_6899.zip', 'amos22_unlabeled_6200_6899' class SpacingFromAffine(Transform): @@ -140,7 +205,7 @@ def spacing(affine): license=licenses.CC_BY_40, link='https://zenodo.org/record/7262581', modality=('CT', 'MRI'), - raw_data_size='23G', + raw_data_size='23G', # TODO: update size with unlabelled prep_data_size='89,5G', task='Supervised multi-modality abdominal multi-organ segmentation', normalizers=[SpacingFromAffine()], diff --git a/amid/amos/utils.py b/amid/amos/utils.py index 5bd5fb14..9902f20b 100644 --- a/amid/amos/utils.py +++ b/amid/amos/utils.py @@ -7,6 +7,8 @@ def loader(column, i, 
_meta): # ambiguous data in meta if int(i) in [500, 600]: return None + elif int(i) not in _meta['amos_id'].values: # `in` on a Series tests index labels, so compare against the column values + return None return _meta[_meta['amos_id'] == int(i)][column].item() diff --git a/amid/data/amos.hash b/amid/data/amos.hash index 4a31b6f1..d2d30959 100644 --- a/amid/data/amos.hash +++ b/amid/data/amos.hash @@ -1 +1 @@ -T:9311589f6b781685cc82e84bc605f7ea679cc2bf68a278b0de823ef57b1d7d9af42692059f19c51c31ca3f247bf0f9281d8487bdbb88feefd973b1cf62b54a6d \ No newline at end of file +T:6db6448dc4627ae833ad594f50152357582fac226c933b88bb754f1db422d1780b81d3375c2707d83023b52ff228112ffe6379e41debcbbedf0b1fb36c544eee \ No newline at end of file