Merge pull request #1739 from CentreForDigitalHumanities/feature/ner-mappings

Feature/ner mappings
CentreForDigitalHumanities · Jan 22, 2025 · c529be3 · c529be3
2 parents f249412 + 7f90281
commit c529be3
Show file tree

Hide file tree

Showing 12 changed files with 57 additions and 69 deletions.
diff --git a/.github/workflows/scheduled-build-and-push.yml b/.github/workflows/scheduled-build-and-push.yml
@@ -39,12 +39,3 @@ jobs:
             tags: ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
             cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
             cache-to: type=inline
-        - name: Build Elasticsearch image, using cache from Github registry
-          uses: docker/build-push-action@v6
-          with:
-            context: .
-            file: DockerfileElastic
-            push: true
-            tags: ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
-            cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
-            cache-to: type=inline
diff --git a/DockerfileElastic b/DockerfileElastic
diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py
@@ -26,7 +26,6 @@ class MappingType(Enum):
     FLOAT = 'float'
     BOOLEAN = 'boolean'
     GEO_POINT = 'geo_point'
-    ANNOTATED_TEXT = 'annotated_text'
 
 
 class VisualizationType(Enum):

diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py
@@ -92,13 +92,12 @@ def float_mapping():
         'type': 'float'
     }
 
-
 def bool_mapping():
     return {'type': 'boolean'}
 
 def geo_mapping():
     return {'type': 'geo_point'}
 
 
-def annotated_text_mapping():
-    return {'type': 'annotated_text'}
+def non_indexed_text_mapping():
+    return {'type': 'text', 'index': False}
diff --git a/backend/addcorpus/permissions.py b/backend/addcorpus/permissions.py
@@ -1,9 +1,10 @@
 from rest_framework import permissions
 from rest_framework.exceptions import NotFound
 from rest_framework.request import Request
-from addcorpus.models import Corpus
+from addcorpus.models import Corpus, CorpusConfiguration
 
-def corpus_name_from_request(request):
+
+def corpus_name_from_request(request: Request):
     '''
     Extract the corpus name from a request
     '''
@@ -24,6 +25,11 @@ def corpus_name_from_request(request):
     return corpus
 
 
+def corpus_config_from_request(request: Request) -> CorpusConfiguration:
+    corpus_name = corpus_name_from_request(request)
+    return CorpusConfiguration.objects.get(corpus__name=corpus_name)
+
+
 class CanSearchCorpus(permissions.BasePermission):
     message = 'You do not have permission to access this corpus'
 

diff --git a/backend/addcorpus/tests/test_validators.py b/backend/addcorpus/tests/test_validators.py
@@ -1,7 +1,7 @@
 import pytest
 from addcorpus.models import Field
 from addcorpus.es_mappings import (
-    annotated_text_mapping,
+    non_indexed_text_mapping,
     date_mapping,
     int_mapping,
     text_mapping,
@@ -29,7 +29,7 @@ def test_validate_ner_slug():
         validate_ner_slug({}, "some:ner_inslug")
     with pytest.raises(ValidationError):
         validate_ner_slug(keyword_mapping(), "slug:ner")
-    validate_ner_slug(annotated_text_mapping(), "slug:ner")
+    validate_ner_slug(non_indexed_text_mapping(), "slug:ner")
     with pytest.raises(ValidationError):
         validate_ner_slug(date_mapping(), "slug:ner-kw")
     validate_ner_slug(keyword_mapping(), "slug:ner-kw")

diff --git a/backend/addcorpus/validation/creation.py b/backend/addcorpus/validation/creation.py
@@ -26,7 +26,6 @@ def supports_full_text_search(es_mapping):
 def _is_text(es_mapping):
     return primary_mapping_type(es_mapping) in [
         MappingType.TEXT.value,
-        MappingType.ANNOTATED_TEXT.value,
     ]
 
 
@@ -146,15 +145,16 @@ def validate_field_name_permissible_characters(slug: str):
 def validate_ner_slug(es_mapping: dict, name: str):
     """
     Checks if colons are in field name, will raise ValidationError if the field does not meet the following requirements:
-    - ends with `:ner` suffix and is an annotated_text field
+    - ends with `:ner` suffix and is a non-indexed text field
     - ends with `:ner-kw` suffix and is a keyword field
     """
     if ":" in name:
         if name.endswith(":ner"):
-            if primary_mapping_type(es_mapping) != MappingType.ANNOTATED_TEXT.value:
-                raise ValidationError(
-                    f"{name} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields"
-        )
+            if primary_mapping_type(es_mapping) != MappingType.TEXT.value:
+                if es_mapping.get('index', True):
+                    raise ValidationError(
+                        f"{name} cannot be used as a field name: the suffix `:ner` is reserved for Named Entity non-indexed text fields"
+                    )
         elif name.endswith(":ner-kw"):
             if primary_mapping_type(es_mapping) != MappingType.KEYWORD.value:
                 raise ValidationError(

diff --git a/backend/corpora/parliament/utils/parlamint.py b/backend/corpora/parliament/utils/parlamint.py
@@ -5,7 +5,7 @@
 from ianalyzer_readers.xml_tag import Tag
 from bs4.element import NavigableString, Tag as Node
 
-from addcorpus.es_mappings import annotated_text_mapping, keyword_mapping
+from addcorpus.es_mappings import non_indexed_text_mapping, keyword_mapping
 from addcorpus.python_corpora.corpus import FieldDefinition
 from addcorpus.python_corpora.filters import MultipleChoiceFilter
 
@@ -208,7 +208,7 @@ def speech_ner():
     return FieldDefinition(
         name="speech:ner",
         hidden=True,
-        es_mapping=annotated_text_mapping(),
+        es_mapping=non_indexed_text_mapping(),
         display_type="text_content",
         searchable=True,
         extractor=XML(

diff --git a/backend/es/es_alias.py b/backend/es/es_alias.py
@@ -54,7 +54,7 @@ def alias(corpus: Corpus, clean=False):
 def get_current_index_name(corpus: CorpusConfiguration, client) -> str:
     """get the name of the current corpus' associated index"""
     alias = corpus.es_alias or corpus.es_index
-    indices = client.indices.get(index="{}".format(alias))
+    indices = client.indices.get(index=alias)
     return max(sorted(indices.keys()))
 
 

diff --git a/backend/es/tests/test_named_entity_search.py b/backend/es/tests/test_named_entity_search.py
@@ -9,22 +9,16 @@ def test_ner_search_view(es_ner_search_client, client):
 
 def test_construct_ner_query():
     viewset = NamedEntitySearchView()
-    fields = ['content:ner']
-    query = viewset.construct_named_entity_query(fields, 'my_identifier')
+    query = viewset.construct_named_entity_query('my_identifier')
     expected = {
         "bool": {
-            "must": [
-                {
-                    "term": {
-                        "id": "my_identifier"
-                    }
-                },
-                {
-                    "terms": {
-                        "content:ner": ["LOC", "PER", "ORG", "MISC"]
-                    }
-                }
-            ]
+            "must": {"term": {"id": "my_identifier"}},
+            "should": [
+                {"exists": {"field": "location:ner-kw"}},
+                {"exists": {"field": "miscellaneous:ner-kw"}},
+                {"exists": {"field": "organization:ner-kw"}},
+                {"exists": {"field": "person:ner-kw"}},
+            ],
         }
     }
     assert query == expected

diff --git a/backend/es/views.py b/backend/es/views.py
@@ -8,8 +8,10 @@
 from addcorpus.permissions import CanSearchCorpus
 from api.save_query import should_save_query
 from addcorpus.models import Corpus
+from addcorpus.permissions import corpus_config_from_request
 from api.models import Query
 from api.api_query import api_query_to_es_query
+from es.es_alias import get_current_index_name
 from es.search import get_index, total_hits, hits
 from ianalyzer.elasticsearch import elasticsearch
 from tag.permissions import CanSearchTags
@@ -115,13 +117,13 @@ class NamedEntitySearchView(APIView):
     permission_classes = [CanSearchCorpus]
 
     def get(self, request, *args, **kwargs):
-        corpus_name = kwargs.get('corpus')
+        corpus_config = corpus_config_from_request(request)
         document_id = kwargs.get('id')
-        client = elasticsearch(corpus_name)
-        index = get_index(corpus_name)
+        client = elasticsearch(corpus_config.corpus.name)
+        index = get_current_index_name(corpus_config, client)
         fields = self.find_named_entity_fields(client, index)
-        query = self.construct_named_entity_query(fields, document_id)
-        response = client.search(index=index, query=query, fields=fields)
+        query = self.construct_named_entity_query(document_id)
+        response = client.search(index=index, query=query)
         results = hits(response)
         annotations = {}
         response = {}
@@ -137,28 +139,26 @@ def find_named_entity_fields(self, client, index: str) -> list[str]:
         mapping = client.indices.get_mapping(index=index)
         fields = mapping[index]['mappings']['properties']
         field_names = fields.keys()
-        return [name for name in field_names if name.endswith(':ner') and fields[name].get('type') == 'annotated_text']
+        return [name for name in field_names if name.endswith(':ner')]
 
-    def construct_named_entity_query(self, fields: list[str], document_id: str) -> dict:
+    def construct_named_entity_query(self, document_id: str) -> dict:
+        """construct a query in which the document_id is obligatory, and any of the :ner-kw fields is present"""
         return {
             "bool": {
-                "must": [
-                    {
-                        "term": {
-                            "id": document_id
-                        }
-                    }, *self.add_terms(fields)
-                ]
+                "must": {"term": {"id": document_id}},
+                "should": [*self.add_terms()],
             }
         }
 
-    def add_terms(self, fields: list[str]) -> list[dict]:
+    def add_terms(self) -> list[dict]:
         return [
-            {
-                "terms": {
-                    field: ["LOC", "PER", "ORG", "MISC"]
-                }
-            } for field in fields
+            {"exists": {"field": field_name}}
+            for field_name in [
+                "location:ner-kw",
+                "miscellaneous:ner-kw",
+                "organization:ner-kw",
+                "person:ner-kw",
+            ]
         ]
 
     def find_entities(self, input_text: str) -> str:
@@ -170,8 +170,12 @@ def find_entities(self, input_text: str) -> str:
             if annotation.startswith('('):
                 continue
             elif annotation.startswith('['):
-               output.append(
-                   {'entity': self.entity_dict.get(annotations[index+1][1:-1]), 'text': annotation[1:-1]})
+                output.append(
+                    {
+                        'entity': self.entity_dict.get(annotations[index + 1][1:-1]),
+                        'text': annotation[1:-1],
+                    }
+                )
             else:
                 if annotation:
                     output.append({'entity': 'flat', 'text': annotation})

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   db:
-    image: docker.io/library/postgres
+    image: docker.io/library/postgres:17
     environment:
       - POSTGRES_DB=${SQL_DATABASE}
       - POSTGRES_USER=${SQL_USER}
@@ -52,9 +52,7 @@ services:
         target: /frontend/build
     command: sh -c "yarn prebuild && yarn start-docker"
   elasticsearch:
-    build:
-      context: .
-      dockerfile: DockerfileElastic
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.16.3
     environment:
       - node.name=ianalyzer-node
       - discovery.type=single-node