Skip to content

Commit

Permalink
Merge pull request #1739 from CentreForDigitalHumanities/feature/ner-mappings
Browse files: browse the repository at this point in the history

Feature/ner mappings
  • Loading branch information
BeritJanssen authored Jan 22, 2025
2 parents f249412 + 7f90281 commit c529be3
Show file tree
Hide file tree
Showing 12 changed files with 57 additions and 69 deletions.
9 changes: 0 additions & 9 deletions .github/workflows/scheduled-build-and-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,3 @@ jobs:
tags: ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
cache-to: type=inline
- name: Build Elasticsearch image, using cache from Github registry
uses: docker/build-push-action@v6
with:
context: .
file: DockerfileElastic
push: true
tags: ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
cache-to: type=inline
3 changes: 0 additions & 3 deletions DockerfileElastic

This file was deleted.

1 change: 0 additions & 1 deletion backend/addcorpus/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class MappingType(Enum):
FLOAT = 'float'
BOOLEAN = 'boolean'
GEO_POINT = 'geo_point'
ANNOTATED_TEXT = 'annotated_text'


class VisualizationType(Enum):
Expand Down
5 changes: 2 additions & 3 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,12 @@ def float_mapping():
'type': 'float'
}


def bool_mapping() -> dict:
    """Return the Elasticsearch mapping for a boolean field."""
    return {'type': 'boolean'}

def geo_mapping() -> dict:
    """Return the Elasticsearch mapping for a `geo_point` field."""
    return {'type': 'geo_point'}


def annotated_text_mapping() -> dict:
    """
    Return the Elasticsearch mapping for an `annotated_text` field.

    NOTE(review): `annotated_text` is not a core Elasticsearch field type; it
    is presumably provided by the mapper-annotated-text plugin - confirm the
    plugin is installed on every cluster this mapping is sent to.
    """
    return {'type': 'annotated_text'}
def non_indexed_text_mapping() -> dict:
    """
    Return the Elasticsearch mapping for a text field with indexing disabled.

    The mapping sets `index` to False, so the field's content is kept with
    the document but is not added to the search index.
    """
    return {'type': 'text', 'index': False}
10 changes: 8 additions & 2 deletions backend/addcorpus/permissions.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from rest_framework import permissions
from rest_framework.exceptions import NotFound
from rest_framework.request import Request
from addcorpus.models import Corpus
from addcorpus.models import Corpus, CorpusConfiguration

def corpus_name_from_request(request):

def corpus_name_from_request(request: Request):
'''
Extract the corpus name from a request
'''
Expand All @@ -24,6 +25,11 @@ def corpus_name_from_request(request):
return corpus


def corpus_config_from_request(request: Request) -> CorpusConfiguration:
    '''
    Look up the configuration of the corpus named in a request

    Raises NotFound when no configuration exists for that corpus, so that the
    client receives a 404 response instead of an unhandled
    `CorpusConfiguration.DoesNotExist`.
    '''
    corpus_name = corpus_name_from_request(request)
    try:
        return CorpusConfiguration.objects.get(corpus__name=corpus_name)
    except CorpusConfiguration.DoesNotExist as err:
        raise NotFound(
            f'No configuration found for corpus {corpus_name}'
        ) from err


class CanSearchCorpus(permissions.BasePermission):
message = 'You do not have permission to access this corpus'

Expand Down
4 changes: 2 additions & 2 deletions backend/addcorpus/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest
from addcorpus.models import Field
from addcorpus.es_mappings import (
annotated_text_mapping,
non_indexed_text_mapping,
date_mapping,
int_mapping,
text_mapping,
Expand Down Expand Up @@ -29,7 +29,7 @@ def test_validate_ner_slug():
validate_ner_slug({}, "some:ner_inslug")
with pytest.raises(ValidationError):
validate_ner_slug(keyword_mapping(), "slug:ner")
validate_ner_slug(annotated_text_mapping(), "slug:ner")
validate_ner_slug(non_indexed_text_mapping(), "slug:ner")
with pytest.raises(ValidationError):
validate_ner_slug(date_mapping(), "slug:ner-kw")
validate_ner_slug(keyword_mapping(), "slug:ner-kw")
Expand Down
12 changes: 6 additions & 6 deletions backend/addcorpus/validation/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def supports_full_text_search(es_mapping):
def _is_text(es_mapping):
    """Whether the primary type of an Elasticsearch mapping is `text`."""
    # `text` is the only text-like mapping type left: the `annotated_text`
    # member is removed from MappingType in the same change set.
    return primary_mapping_type(es_mapping) == MappingType.TEXT.value


Expand Down Expand Up @@ -146,15 +145,16 @@ def validate_field_name_permissible_characters(slug: str):
def validate_ner_slug(es_mapping: dict, name: str):
"""
Checks if colons are in field name, will raise ValidationError if the field does not meet the following requirements:
- ends with `:ner` suffix and is an annotated_text field
- ends with `:ner` suffix and is a non-indexed text field
- ends with `:ner-kw` suffix and is a keyword field
"""
if ":" in name:
if name.endswith(":ner"):
if primary_mapping_type(es_mapping) != MappingType.ANNOTATED_TEXT.value:
raise ValidationError(
f"{name} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields"
)
if primary_mapping_type(es_mapping) != MappingType.TEXT.value:
if es_mapping.get('index', True):
raise ValidationError(
f"{name} cannot be used as a field name: the suffix `:ner` is reserved for Named Entity non-indexed text fields"
)
elif name.endswith(":ner-kw"):
if primary_mapping_type(es_mapping) != MappingType.KEYWORD.value:
raise ValidationError(
Expand Down
4 changes: 2 additions & 2 deletions backend/corpora/parliament/utils/parlamint.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ianalyzer_readers.xml_tag import Tag
from bs4.element import NavigableString, Tag as Node

from addcorpus.es_mappings import annotated_text_mapping, keyword_mapping
from addcorpus.es_mappings import non_indexed_text_mapping, keyword_mapping
from addcorpus.python_corpora.corpus import FieldDefinition
from addcorpus.python_corpora.filters import MultipleChoiceFilter

Expand Down Expand Up @@ -208,7 +208,7 @@ def speech_ner():
return FieldDefinition(
name="speech:ner",
hidden=True,
es_mapping=annotated_text_mapping(),
es_mapping=non_indexed_text_mapping(),
display_type="text_content",
searchable=True,
extractor=XML(
Expand Down
2 changes: 1 addition & 1 deletion backend/es/es_alias.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def alias(corpus: Corpus, clean=False):
def get_current_index_name(corpus: CorpusConfiguration, client) -> str:
    """
    Get the name of the index currently associated with a corpus.

    The corpus' alias is looked up on the cluster (falling back to its index
    name when no alias is set), and the greatest matching index name in
    string order is returned.

    NOTE(review): string ordering ranks "name-9" above "name-10"; confirm
    this is acceptable for the version suffixes used in index names.
    """
    alias = corpus.es_alias or corpus.es_index
    # One lookup is enough; the alias is already a string, so no formatting
    # is needed before passing it on.
    indices = client.indices.get(index=alias)
    # `max` does not need pre-sorted input.
    return max(indices.keys())


Expand Down
22 changes: 8 additions & 14 deletions backend/es/tests/test_named_entity_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,16 @@ def test_ner_search_view(es_ner_search_client, client):

def test_construct_ner_query():
viewset = NamedEntitySearchView()
fields = ['content:ner']
query = viewset.construct_named_entity_query(fields, 'my_identifier')
query = viewset.construct_named_entity_query('my_identifier')
expected = {
"bool": {
"must": [
{
"term": {
"id": "my_identifier"
}
},
{
"terms": {
"content:ner": ["LOC", "PER", "ORG", "MISC"]
}
}
]
"must": {"term": {"id": "my_identifier"}},
"should": [
{"exists": {"field": "location:ner-kw"}},
{"exists": {"field": "miscellaneous:ner-kw"}},
{"exists": {"field": "organization:ner-kw"}},
{"exists": {"field": "person:ner-kw"}},
],
}
}
assert query == expected
Expand Down
48 changes: 26 additions & 22 deletions backend/es/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
from addcorpus.permissions import CanSearchCorpus
from api.save_query import should_save_query
from addcorpus.models import Corpus
from addcorpus.permissions import corpus_config_from_request
from api.models import Query
from api.api_query import api_query_to_es_query
from es.es_alias import get_current_index_name
from es.search import get_index, total_hits, hits
from ianalyzer.elasticsearch import elasticsearch
from tag.permissions import CanSearchTags
Expand Down Expand Up @@ -115,13 +117,13 @@ class NamedEntitySearchView(APIView):
permission_classes = [CanSearchCorpus]

def get(self, request, *args, **kwargs):
corpus_name = kwargs.get('corpus')
corpus_config = corpus_config_from_request(request)
document_id = kwargs.get('id')
client = elasticsearch(corpus_name)
index = get_index(corpus_name)
client = elasticsearch(corpus_config.corpus.name)
index = get_current_index_name(corpus_config, client)
fields = self.find_named_entity_fields(client, index)
query = self.construct_named_entity_query(fields, document_id)
response = client.search(index=index, query=query, fields=fields)
query = self.construct_named_entity_query(document_id)
response = client.search(index=index, query=query)
results = hits(response)
annotations = {}
response = {}
Expand All @@ -137,28 +139,26 @@ def find_named_entity_fields(self, client, index: str) -> list[str]:
mapping = client.indices.get_mapping(index=index)
fields = mapping[index]['mappings']['properties']
field_names = fields.keys()
return [name for name in field_names if name.endswith(':ner') and fields[name].get('type') == 'annotated_text']
return [name for name in field_names if name.endswith(':ner')]

def construct_named_entity_query(self, document_id: str) -> dict:
    """
    Construct the query used to fetch one document's named-entity data.

    A `term` match of `document_id` on the `id` field is obligatory (`must`);
    the `exists` clauses produced by `add_terms` are listed under `should`.

    NOTE(review): Elasticsearch treats `should` clauses as optional when a
    `must` clause is present, unless `minimum_should_match` is set - confirm
    whether documents without any :ner-kw field are meant to match.
    """
    return {
        "bool": {
            "must": {"term": {"id": document_id}},
            "should": list(self.add_terms()),
        }
    }

def add_terms(self) -> list[dict]:
    """Build one `exists` clause for each named-entity keyword field."""
    ner_keyword_fields = (
        "location:ner-kw",
        "miscellaneous:ner-kw",
        "organization:ner-kw",
        "person:ner-kw",
    )
    return [
        {"exists": {"field": field_name}}
        for field_name in ner_keyword_fields
    ]

def find_entities(self, input_text: str) -> str:
Expand All @@ -170,8 +170,12 @@ def find_entities(self, input_text: str) -> str:
if annotation.startswith('('):
continue
elif annotation.startswith('['):
output.append(
{'entity': self.entity_dict.get(annotations[index+1][1:-1]), 'text': annotation[1:-1]})
output.append(
{
'entity': self.entity_dict.get(annotations[index + 1][1:-1]),
'text': annotation[1:-1],
}
)
else:
if annotation:
output.append({'entity': 'flat', 'text': annotation})
Expand Down
6 changes: 2 additions & 4 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
services:
db:
image: docker.io/library/postgres
image: docker.io/library/postgres:17
environment:
- POSTGRES_DB=${SQL_DATABASE}
- POSTGRES_USER=${SQL_USER}
Expand Down Expand Up @@ -52,9 +52,7 @@ services:
target: /frontend/build
command: sh -c "yarn prebuild && yarn start-docker"
elasticsearch:
build:
context: .
dockerfile: DockerfileElastic
image: docker.elastic.co/elasticsearch/elasticsearch:8.16.3
environment:
- node.name=ianalyzer-node
- discovery.type=single-node
Expand Down

0 comments on commit c529be3

Please sign in to comment.