neuro-ml · kurmukovai · Oct 17, 2023 · Oct 14, 2023 · Oct 14, 2023 · Oct 14, 2023
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ print(entry.split, entry.patient)
 
 | Name                                                                                                                               |   Entries | Body region                         | Modality                                                               |
 |:-----------------------------------------------------------------------------------------------------------------------------------|----------:|:------------------------------------|:-----------------------------------------------------------------------|
-| <a href="https://neuro-ml.github.io/amid/latest/datasets-api/#amid.amos.dataset.AMOS">AMOS</a>                                     |       600 | Abdomen                             | CT, MRI                                                                |
+| <a href="https://neuro-ml.github.io/amid/latest/datasets-api/#amid.amos.dataset.AMOS">AMOS</a>                                     |      2465 | Abdomen                             | CT, MRI                                                                |
 | <a href="https://neuro-ml.github.io/amid/latest/datasets-api/#amid.bimcv.BIMCVCovid19">BIMCVCovid19</a>                            |     16335 | Chest                               | CT                                                                     |
 | <a href="https://neuro-ml.github.io/amid/latest/datasets-api/#amid.brats2021.BraTS2021">BraTS2021</a>                              |      5880 | Head                                | MRI T1, MRI T1Gd, MRI T2, MRI T2-FLAIR                                 |
 | <a href="https://neuro-ml.github.io/amid/latest/datasets-api/#amid.cc359.dataset.CC359">CC359</a>                                  |       359 | Head                                | MRI T1                                                                 |

diff --git a/amid/amos/dataset.py b/amid/amos/dataset.py
@@ -13,8 +13,10 @@
 from .utils import label
 
 
-ARCHIVE_NAME = 'amos22.zip'
+# archive that `_id2split` scans and that `mask` reads the labels from
+ARCHIVE_NAME_SEG = 'amos22.zip'
 ARCHIVE_ROOT_NAME = 'amos22'
+# ids that are damaged in the zip archives; `image` and `affine` return None for them
+ERRORS = ['5514', '5437']
+# TODO: add MRI
 
 
 class AMOSBase(Source):
@@ -56,42 +58,71 @@ def _base(_root: Silent):
         return Path(_root)
 
     @meta
-    def ids(_id2split):
-        return sorted(_id2split)
-
-    def image(i, _id2split, _base):
-        file = f'images{_id2split[i]}/amos_{i}.nii.gz'
-
-        with unpack(_base / ARCHIVE_NAME, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked):
+    def ids(_id2split, _ids_unlabelled):
+        """All ids: the sorted labelled ones first, followed by the sorted unlabelled ones."""
+        return [*sorted(_id2split), *sorted(_ids_unlabelled)]
+
+    def image(i, _id2split, _base, _archive_name):
+        """
+        Corresponding 3D image.
+
+        Returns None for the ids listed in `ERRORS`.
+        """
+        # damaged in the archive: nothing to load
+        if i in ERRORS:
+            return None
+
+        unlabelled_archive, unlabelled_root = _archive_name
+        is_labelled = i in _id2split
+        # labelled ids are read from the segmentation archive, under `images<split>/`
+        archive_name = ARCHIVE_NAME_SEG if is_labelled else unlabelled_archive
+        archive_root = ARCHIVE_ROOT_NAME if is_labelled else unlabelled_root
+        file = f'images{_id2split[i]}/amos_{i}.nii.gz' if is_labelled else f'amos_{i}.nii.gz'
+
+        with unpack(_base / archive_name, file, archive_root, '.zip') as (unpacked, is_unpacked):
             if is_unpacked:
                 return np.asarray(nibabel.load(unpacked).dataobj)
             else:
                 with open_nii_gz_file(unpacked) as image:
                     return np.asarray(image.dataobj)
 
-    def affine(i, _id2split, _base):
-        """The 4x4 matrix that gives the image's spatial orientation"""
-        file = f'images{_id2split[i]}/amos_{i}.nii.gz'
-
-        with unpack(_base / ARCHIVE_NAME, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked):
+    def affine(i, _id2split, _base, _archive_name):
+        """
+        The 4x4 matrix that gives the image's spatial orientation.
+
+        Returns None for the ids listed in `ERRORS`.
+        """
+        # damaged in the archive: nothing to load
+        if i in ERRORS:
+            return None
+
+        unlabelled_archive, unlabelled_root = _archive_name
+        is_labelled = i in _id2split
+        # labelled ids are read from the segmentation archive, under `images<split>/`
+        archive_name = ARCHIVE_NAME_SEG if is_labelled else unlabelled_archive
+        archive_root = ARCHIVE_ROOT_NAME if is_labelled else unlabelled_root
+        file = f'images{_id2split[i]}/amos_{i}.nii.gz' if is_labelled else f'amos_{i}.nii.gz'
+
+        with unpack(_base / archive_name, file, archive_root, '.zip') as (unpacked, is_unpacked):
             if is_unpacked:
                 return nibabel.load(unpacked).affine
             else:
                 with open_nii_gz_file(unpacked) as image:
                     return image.affine
 
     def mask(i, _id2split, _base):
-        file = f'labels{_id2split[i]}/amos_{i}.nii.gz'
+        """Label array for `i`; None when `i` has no split entry or the label file is not found."""
+        # ids without an entry in `_id2split` have nothing to look up in the labels folders
+        if i not in _id2split:
+            return
+        file = f'labels{_id2split[i]}/amos_{i}.nii.gz'
 
         try:
-            with unpack(_base / ARCHIVE_NAME, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked):
+            with unpack(_base / ARCHIVE_NAME_SEG, file, ARCHIVE_ROOT_NAME, '.zip') as (unpacked, is_unpacked):
                 if is_unpacked:
                     return np.asarray(nibabel.load(unpacked).dataobj)
                 else:
                     with open_nii_gz_file(unpacked) as image:
                         return np.asarray(image.dataobj)
         except FileNotFoundError:
-            return None
+            return
+
+    def image_modality(i):
+        """Returns image modality, `CT` or `MRI`."""
+        # ids in the half-open range (500, 600] are reported as MRI, every other id as CT
+        return 'MRI' if 500 < int(i) <= 600 else 'CT'
 
     # labels
 
@@ -107,7 +138,7 @@ def mask(i, _id2split, _base):
     def _id2split(_base):
         id2split = {}
 
-        with ZipFile(_base / ARCHIVE_NAME) as zf:
+        with ZipFile(_base / ARCHIVE_NAME_SEG) as zf:
             for x in zf.namelist():
                 if (len(x.strip('/').split('/')) == 3) and x.endswith('.nii.gz'):
                     file, split = x.split('/')[-1], x.split('/')[-2][-2:]
@@ -117,12 +148,46 @@ def _id2split(_base):
 
         return id2split
 
+    def _ids_unlabelled(_base):
+        """Ids parsed from the `.nii.gz` entries of the four unlabelled CT archives, in archive order."""
+        archives = (
+            'amos22_unlabeled_ct_5000_5399.zip',
+            'amos22_unlabeled_ct_5400_5899.zip',
+            'amos22_unlabeled_ct_5900_6199.zip',
+            'amos22_unlabeled_ct_6200_6899.zip',
+        )
+        collected = []
+        for name in archives:
+            with ZipFile(_base / name) as zf:
+                # `root/amos_5000.nii.gz` -> `amos_5000.nii.gz` -> `amos_5000` -> `5000`
+                collected.extend(
+                    entry.split('/')[-1].split('.')[0].split('_')[-1]
+                    for entry in zf.namelist()
+                    if entry.endswith('.nii.gz')
+                )
+        return collected
+
     @lru_cache(None)
     def _meta(_base):
-        file = 'labeled_data_meta_0000_0599.csv'
-
-        with unpack(_base, file) as (unpacked, _):
-            return pd.read_csv(unpacked)
+        """Meta tables of the labelled and unlabelled parts, concatenated in the order listed below."""
+
+        def read_table(name):
+            with unpack(_base, name) as (unpacked, _):
+                return pd.read_csv(unpacked)
+
+        # NOTE(review): no meta csv is listed for ids 6200-6899 - confirm whether one exists
+        names = (
+            'labeled_data_meta_0000_0599.csv',
+            'unlabeled_data_meta_5400_5899.csv',
+            'unlabeled_data_meta_5000_5399.csv',
+            'unlabeled_data_meta_5900_6199.csv',
+        )
+        return pd.concat([read_table(name) for name in names])
+
+    def _archive_name(i):
+        """
+        The (archive file name, archive root) pair that `image` and `affine` use for ids without a split entry.
+
+        Any id outside the three listed ranges falls through to the 6200-6899 archive.
+        """
+        number = int(i)
+        # NOTE(review): the root names differ in prefix (`amos_` vs `amos22_`) and the last one
+        # has no `ct` part - presumably they mirror the real archives; confirm
+        ranges = (
+            (5000, 5400, 'amos22_unlabeled_ct_5000_5399.zip', 'amos_unlabeled_ct_5000_5399'),
+            (5400, 5900, 'amos22_unlabeled_ct_5400_5899.zip', 'amos_unlabeled_ct_5400_5899'),
+            (5900, 6200, 'amos22_unlabeled_ct_5900_6199.zip', 'amos22_unlabeled_ct_5900_6199'),
+        )
+        for low, high, archive, root in ranges:
+            if low <= number < high:
+                return archive, root
+        return 'amos22_unlabeled_ct_6200_6899.zip', 'amos22_unlabeled_6200_6899'
 
 
 class SpacingFromAffine(Transform):
@@ -140,7 +205,7 @@ def spacing(affine):
     license=licenses.CC_BY_40,
     link='https://zenodo.org/record/7262581',
     modality=('CT', 'MRI'),
-    raw_data_size='23G',
+    raw_data_size='23G',  # TODO: update size with unlabelled
     prep_data_size='89,5G',
     task='Supervised multi-modality abdominal multi-organ segmentation',
     normalizers=[SpacingFromAffine()],

diff --git a/amid/amos/utils.py b/amid/amos/utils.py
@@ -7,6 +7,8 @@ def loader(column, i, _meta):
     # ambiguous data in meta
     if int(i) in [500, 600]:
         return None
+    elif int(i) not in _meta['amos_id'].values:  # `in` on a Series tests the index labels, so check the values
+        return None
 
     return _meta[_meta['amos_id'] == int(i)][column].item()
 

diff --git a/amid/data/amos.hash b/amid/data/amos.hash
@@ -1 +1 @@
-T:9311589f6b781685cc82e84bc605f7ea679cc2bf68a278b0de823ef57b1d7d9af42692059f19c51c31ca3f247bf0f9281d8487bdbb88feefd973b1cf62b54a6d
+T:6db6448dc4627ae833ad594f50152357582fac226c933b88bb754f1db422d1780b81d3375c2707d83023b52ff228112ffe6379e41debcbbedf0b1fb36c544eee
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		T:9311589f6b781685cc82e84bc605f7ea679cc2bf68a278b0de823ef57b1d7d9af42692059f19c51c31ca3f247bf0f9281d8487bdbb88feefd973b1cf62b54a6d
		T:6db6448dc4627ae833ad594f50152357582fac226c933b88bb754f1db422d1780b81d3375c2707d83023b52ff228112ffe6379e41debcbbedf0b1fb36c544eee