Europe PMC Publication annotations (#233)

* use DRFJsonApi browsable API * adds Publication detail endpoint to proxy Europe PMC annotations request * limits Europe PMC annotations to metagenomics, groups by type, and sorts alphabetically * moves Europe PMC annotation logic to separate src file, changes grouping/sorting/humanizing * adds unit test / mock for europe pmc annotations * moves Europe PMC annotations endpoint & provider strings to settings.py * version bump -> 2.0.1
EBI-Metagenomics · Oct 12, 2021 · 361a788 · 361a788
1 parent 6b7f348
commit 361a788
Show file tree

Hide file tree

Showing 5 changed files with 138 additions and 4 deletions.
diff --git a/emgapi/europe_pmc.py b/emgapi/europe_pmc.py
@@ -0,0 +1,76 @@
+import itertools
+
+import requests
+from django.conf import settings
+from django.http import Http404
+
+TITLE = 'title'
+DESCRIPTION = 'description'
+ANNOTATIONS = 'annotations'
+
+# based on http://blog.europepmc.org/2020/11/europe-pmc-publications-metagenomics-annotations.html
+annotation_type_humanize_map = {
+    'Sample-Material': {TITLE: 'Sample material', DESCRIPTION: 'Sample from which the microbiome is extracted'},
+    'Body-Site': {TITLE: 'Body site', DESCRIPTION: 'Host body region/structure where microbiome is found'},
+    'Host': {TITLE: 'Host', DESCRIPTION: 'The organism where the microbiome is found'},
+    'Engineered': {TITLE: 'Engineered environment', DESCRIPTION: 'Microbiome’s man-made environment'},
+    'Ecoregion': {TITLE: 'Ecoregion', DESCRIPTION: 'Microbiome’s natural environment'},
+    'Date': {TITLE: 'Date', DESCRIPTION: 'Sampling date'},
+    'Place': {TITLE: 'Place', DESCRIPTION: 'Microbiome’s place or geocoordinates'},
+    'Site': {TITLE: 'Site', DESCRIPTION: 'Microbiome’s site within place'},
+    'State': {TITLE: 'State', DESCRIPTION: 'Host/Environment state'},
+    'Treatment': {TITLE: 'Treatment', DESCRIPTION: 'Host/Environment treatments'},
+    'Kit': {TITLE: 'Kit', DESCRIPTION: 'Nucleic acid extraction-kit'},
+    'Gene': {TITLE: 'Gene', DESCRIPTION: 'Target gene(s) (e.g. hypervariable regions of 16s/18s rRNA gene)'},
+    'Primer': {TITLE: 'Primer', DESCRIPTION: 'PCR primers'},
+    'LS': {TITLE: 'Library strategy', DESCRIPTION: 'e.g. aplicon, whole metagenome'},
+    'LCM': {TITLE: 'Library construction method', DESCRIPTION: 'e.g. paired-end, single-end'},
+    'Sequencing': {TITLE: 'Sequencing platform', DESCRIPTION: ''},
+}
+
+# sample processing annotations tend to be more accurate than others.
+sample_processing_annotation_types = ['Sequencing', 'LS', 'LCM', 'Kit', 'Primer']
+
+
+def get_publication_annotations(pubmed_id):
+    """
+    Fetch EMERALD-provided Europe PMC metagenomics annotations for a paper, and group them by type.
+    :param pubmed_id: the publication identified in pubmed
+    :return: grouped and sorted annotations, dict of lists of dicts
+    """
+    epmc = requests.get(settings.EUROPE_PMC['annotations_endpoint'], params={
+        'articleIds': f'MED:{pubmed_id}',
+        'provider': settings.EUROPE_PMC['annotations_provider']
+    })
+    try:
+        assert epmc.status_code == 200
+        annotations = epmc.json()[0][ANNOTATIONS]
+    except (AssertionError, KeyError, IndexError):
+        raise Http404
+
+    # Group by annotation type, sort within group by icase annotation text
+    grouped_annotations = {
+        anno_type: sorted([anno for anno in annots], key=lambda anno: anno.get('exact', '').lower())
+        for anno_type, annots
+        in itertools.groupby(annotations, key=lambda annotation: annotation.get('type', 'Other'))
+    }
+
+    # Split off special sample processing annotation groups
+    sample_processing_annotations = []
+    other_annotations = []
+
+    for anno_type, annots in grouped_annotations.items():
+        humanized_annotation_group = {
+            **annotation_type_humanize_map.get(anno_type, {TITLE: anno_type, DESCRIPTION: ''}),
+            ANNOTATIONS: annots
+        }
+        if anno_type in sample_processing_annotation_types:
+            sample_processing_annotations.append(humanized_annotation_group)
+        else:
+            other_annotations.append(humanized_annotation_group)
+
+    # Sort each group by highest number of annotations of that type
+    sample_processing_annotations.sort(key=lambda group: len(group.get(ANNOTATIONS, [])), reverse=True)
+    other_annotations.sort(key=lambda group: len(group.get(ANNOTATIONS, [])), reverse=True)
+
+    return {'sample_processing': sample_processing_annotations, 'other': other_annotations}
diff --git a/emgapi/views.py b/emgapi/views.py
@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import os
 import logging
 import inflection
@@ -49,6 +48,7 @@
 from . import utils as emg_utils
 from . import renderers as emg_renderers
 from . import filters as emg_filters
+from .europe_pmc import get_publication_annotations
 from .sourmash import validate_sourmash_signature, save_signature, send_sourmash_jobs, get_sourmash_job_status, \
     get_result_file
 
@@ -1164,6 +1164,15 @@ def list(self, request, *args, **kwargs):
         """
         return super(PublicationViewSet, self).list(request, *args, **kwargs)
 
+    @action(
+        detail=True,
+        methods=['get', ]
+    )
+    def europe_pmc_annotations(self, request, pubmed_id=None):
+        if not pubmed_id:
+            raise Http404
+        return Response(data=get_publication_annotations(pubmed_id))
+
 
 class GenomeCatalogueViewSet(mixins.RetrieveModelMixin,
                              emg_mixins.ListModelMixin,

diff --git a/emgcli/settings.py b/emgcli/settings.py
@@ -201,6 +201,7 @@ def create_secret_key(var_dir):
     'rest_framework_mongoengine',
     'rest_framework_jwt',
     'django_filters',
+    'rest_framework_json_api',
     # apps
     'emgapi',
     'emgena',
@@ -364,7 +365,7 @@ def create_secret_key(var_dir):
         # 'rest_framework_xml.renderers.XMLRenderer',
         # 'rest_framework_yaml.renderers.YAMLRenderer',
         'emgapi.renderers.CSVStreamingRenderer',
-        'rest_framework.renderers.BrowsableAPIRenderer',
+        'rest_framework_json_api.renderers.BrowsableAPIRenderer',
     ),
 
     'DEFAULT_FILTER_BACKENDS': (
@@ -636,3 +637,11 @@ def create_secret_key(var_dir):
         "celery_broker": "redis://localhost:6379/0",
         "celery_backend": "redis://localhost:6379/0",
     }
+
+try:
+    EUROPE_PMC = EMG_CONF['emg']['europe_pmc']
+except KeyError:
+    EUROPE_PMC = {
+        "annotations_endpoint": 'https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds',
+        "annotations_provider": "Metagenomics"
+    }
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 _requirements = os.path.join(_base, 'requirements.txt')
 _requirements_test = os.path.join(_base, 'requirements-test.txt')
 
-version = "2.0.0"
+version = "2.0.1"
 
 install_requirements = []
 with open(_requirements) as f:

diff --git a/tests/api/test_publication.py b/tests/api/test_publication.py
@@ -13,17 +13,57 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from unittest import mock
 
 from django.urls import reverse
+from model_bakery import baker
 
 from rest_framework import status
 from rest_framework.test import APITestCase
 
 
+class MockEuropePMCResponse:
+    status_code = 200
+
+    @staticmethod
+    def json():
+        return [
+            {
+                'annotations': [
+                    {
+                        'prefix': 'Love is required whenever he’s ',
+                        'exact': 'sequenced',
+                        'postfix': '. It comes just before the assembly.',
+                        'type': 'LS',
+                    }
+                ]
+            }
+        ]
+
+
 class TestPublicationAPI(APITestCase):
+    """API tests for the publications list and the Europe PMC annotations detail route."""
+
+    def setUp(self):
+        # Create the publication (pubmed_id '007') that the annotations test looks up.
+        baker.make(
+            'emgapi.Publication',
+            pk=7,
+            pubmed_id='007',
+            pub_title='The man with the golden metagenome',
+            authors='Bond, J; Moneypenny, J; et al'
+        )
 
     def test_default(self):
+        # The publications list endpoint should answer with HTTP 200.
         url = reverse('emgapi_v1:publications-list')
         response = self.client.get(url)
         assert response.status_code == status.HTTP_200_OK
+
+    @mock.patch('emgapi.europe_pmc.requests.get')
+    def test_europe_pmc_annotations(self, mock_get):
+        # Replace the outbound Europe PMC request with the canned mock response,
+        # so the test does not depend on the real service.
+        mock_get.return_value = MockEuropePMCResponse()
+        url = reverse('emgapi_v1:publications-europe-pmc-annotations', args=('007',))
+        response = self.client.get(url)
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        annotations = response.json()
+        # The mocked annotation has type 'LS', so it must land in the
+        # sample-processing list under its humanized title.
+        self.assertIn('sample_processing', annotations['data'])
+        first_group = annotations['data']['sample_processing'][0]
+        self.assertEqual(first_group['title'], 'Library strategy')
+        self.assertEqual(len(first_group['annotations']), 1)