Skip to content

Commit

Permalink
Merge pull request #1677 from CentreForDigitalHumanities/feature/standard-term-vector
Browse files Browse the repository at this point in the history

Always add term vector to main content field and fix mappings in some parliamentary corpora
  • Loading branch information
BeritJanssen authored Oct 17, 2024
2 parents 3431d3c + 6f9c28f commit 1ef29a6
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 22 deletions.
14 changes: 5 additions & 9 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
def primary_mapping_type(es_mapping: Dict) -> str:
    '''
    Return the value stored under the `type` key of an Elasticsearch mapping.

    Parameters:
    - `es_mapping`: the mapping dictionary to inspect.

    Returns the value of `es_mapping['type']`. If the mapping has no `type`
    key, the result is `None` rather than an exception, so callers should be
    prepared for a missing type.
    '''
    # `dict.get` already falls back to None; no explicit default is needed.
    return es_mapping.get('type')

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):

def main_content_mapping(
token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None
):
'''
Mapping for the main content field. Options:
Expand All @@ -14,14 +17,7 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
The field is always indexed with the term vector set to 'with_positions_offsets', which the highlighter requires.
'''

mapping = {
'type': 'text'
}

if updated_highlighting:
mapping.update({
'term_vector': 'with_positions_offsets' # include char positions on _source (in addition to the multifields) for highlighting
})
mapping = {"type": "text", "term_vector": "with_positions_offsets"}

if any([token_counts, stopword_analysis, stemming_analysis]):
multifields = {}
Expand Down
2 changes: 1 addition & 1 deletion backend/corpora/parliament/finland.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def sources(self, start, end):
speaker_birth_year = field_defaults.speaker_birth_year()
speaker_birth_year.extractor = person_attribute_extractor('birth_year')

speech = field_defaults.speech()
speech = field_defaults.speech(language="fi")
speech.extractor = XML(transform = clean_value)

speech_id = field_defaults.speech_id()
Expand Down
13 changes: 2 additions & 11 deletions backend/corpora/parliament/ireland.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from addcorpus.python_corpora.corpus import CorpusDefinition, CSVCorpusDefinition, XMLCorpusDefinition
from addcorpus.python_corpora.extract import Constant, CSV, XML, Metadata, Combined, Backup
from addcorpus.es_mappings import main_content_mapping
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults
import corpora.utils.formatting as formatting
Expand Down Expand Up @@ -149,7 +150,6 @@ def sources(self, start, end):
source_archive = field_defaults.source_archive()
source_archive.extractor = Constant('1919-2013')


fields = [
date,
country,
Expand Down Expand Up @@ -495,17 +495,8 @@ def source2dicts(self, source):
speaker_id = field_defaults.speaker_id()
speaker_constituency = field_defaults.speaker_constituency()

speech = field_defaults.speech()
# no language-specific analysers since the corpus is mixed-language
speech.es_mapping = {
"type" : "text",
"fields": {
"length": {
"type": "token_count",
"analyzer": "standard"
}
}
}
speech = field_defaults.speech()

speech_id = field_defaults.speech_id()
topic = field_defaults.topic()
Expand Down
1 change: 0 additions & 1 deletion backend/corpora/parliament/utils/field_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ def speech(language=None):
stopword_analysis=has_language,
stemming_analysis=has_language,
language=language,
updated_highlighting=True
),
results_overview=True,
search_field_core=True,
Expand Down

0 comments on commit 1ef29a6

Please sign in to comment.