-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'develop' into feature/netherlands-citation-page
- Loading branch information
Showing
89 changed files
with
1,869 additions
and
1,012 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Base image: Elasticsearch 8.10.2 pulled from the docker.elastic.co registry.
FROM docker.elastic.co/elasticsearch/elasticsearch:8.10.2 | ||
|
||
# Install the mapper-annotated-text plugin into the image at build time.
RUN bin/elasticsearch-plugin install mapper-annotated-text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import pytest | ||
|
||
@pytest.fixture()
def content_field_json():
    """
    JSON definition of a 'text_content' field named 'content'.

    The field is in English ('en'), searchable, shown in the preview and
    visualised, has no filter, and is extracted from the 'content' column.
    """
    options = {
        'search': True,
        'filter': 'none',
        'preview': True,
        'visualize': True,
        'sort': False,
        'hidden': False
    }
    definition = {
        'name': 'content',
        'display_name': 'Content',
        'description': 'Bla bla bla',
        'type': 'text_content',
        'language': 'en',
        'options': options,
        'extract': {'column': 'content'}
    }
    return definition
|
||
@pytest.fixture()
def keyword_field_json():
    """
    JSON definition of a 'text_metadata' field named 'author'.

    The field is searchable, has a visible filter, is shown in the preview
    and visualised, and is extracted from the 'author' column.
    """
    options = {
        'search': True,
        'filter': 'show',
        'preview': True,
        'visualize': True,
        'sort': False,
        'hidden': False
    }
    definition = {
        'name': 'author',
        'display_name': 'Author',
        'description': 'Author of the text',
        'type': 'text_metadata',
        'options': options,
        'extract': {'column': 'author'}
    }
    return definition
|
||
@pytest.fixture()
def int_field_json():
    """
    JSON definition of an 'integer' field named 'year'.

    The field is not searchable, has a visible filter, is visualised and
    sortable, and is extracted from the 'year' column.
    """
    options = {
        'search': False,
        'filter': 'show',
        'preview': False,
        'visualize': True,
        'sort': True,
        'hidden': False
    }
    definition = {
        'name': 'year',
        'display_name': 'Year',
        'description': 'Year in which the text was written',
        'type': 'integer',
        'options': options,
        'extract': {'column': 'year'}
    }
    return definition
|
||
@pytest.fixture()
def float_field_json():
    """
    JSON definition of a 'float' field named 'ocr_confidence'.

    Every boolean option is off and the filter is set to 'hide'; the value
    is extracted from the 'ocr' column.
    """
    options = {
        'search': False,
        'filter': 'hide',
        'preview': False,
        'visualize': False,
        'sort': False,
        'hidden': False
    }
    definition = {
        'name': 'ocr_confidence',
        'display_name': 'OCR confidence',
        'description': 'Confidence level of optical character recognition output',
        'type': 'float',
        'options': options,
        'extract': {'column': 'ocr'}
    }
    return definition
|
||
@pytest.fixture()
def date_field_json():
    """
    JSON definition of a 'date' field named 'date'.

    The field is not searchable, has a visible filter, is shown in the
    preview, visualised and sortable, and is extracted from the 'date' column.
    """
    options = {
        'search': False,
        'filter': 'show',
        'preview': True,
        'visualize': True,
        'sort': True,
        'hidden': False
    }
    definition = {
        'name': 'date',
        'display_name': 'Date',
        'description': 'Date on which the text was written',
        'type': 'date',
        'options': options,
        'extract': {'column': 'date'}
    }
    return definition
|
||
@pytest.fixture()
def boolean_field_json():
    """
    JSON definition of a 'boolean' field named 'author_known'.

    The field has a visible filter and is visualised; it is neither
    searchable, previewed nor sortable. It is extracted from the
    'author_known' column.
    """
    options = {
        'search': False,
        'filter': 'show',
        'preview': False,
        'visualize': True,
        'sort': False,
        'hidden': False
    }
    definition = {
        'name': 'author_known',
        'display_name': 'Author known',
        'description': 'Whether the author of the text is known',
        'type': 'boolean',
        'options': options,
        'extract': {'column': 'author_known'}
    }
    return definition
|
||
@pytest.fixture()
def geo_field_json():
    """
    JSON definition of a 'geo_point' field named 'location'.

    Every boolean option is off and there is no filter; the value is
    extracted from the 'location' column.
    """
    options = {
        'search': False,
        'filter': 'none',
        'preview': False,
        'visualize': False,
        'sort': False,
        'hidden': False
    }
    definition = {
        'name': 'location',
        'display_name': 'Location',
        'description': 'Location where the text was published',
        'type': 'geo_point',
        'options': options,
        'extract': {'column': 'location'}
    }
    return definition
|
||
@pytest.fixture(
    params=['content', 'keyword', 'int', 'float', 'date', 'boolean', 'geo']
)
def any_field_json(
    request, content_field_json, keyword_field_json, int_field_json, float_field_json,
    date_field_json, boolean_field_json, geo_field_json
):
    """
    Parametrized fixture that returns one field definition per parameter.

    A test requesting this fixture runs once for each kind listed in
    ``params``; ``request.param`` selects which of the per-type fixture
    values is returned.
    """
    kinds = ('content', 'keyword', 'int', 'float', 'date', 'boolean', 'geo')
    definitions = (
        content_field_json,
        keyword_field_json,
        int_field_json,
        float_field_json,
        date_field_json,
        boolean_field_json,
        geo_field_json,
    )
    # Pair every parameter name with the matching fixture value, then look
    # up the one selected for this run.
    definition_by_kind = dict(zip(kinds, definitions))
    return definition_by_kind[request.param]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Delimiter assumed for CSV source data when none is specified explicitly.
DEFAULT_CSV_DELIMITER = ','

# strftime/strptime pattern for dates: four-digit year, month, day.
DATE_FORMAT = '%Y-%m-%d'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from typing import Dict | ||
from datetime import date | ||
from addcorpus.models import Corpus, CorpusConfiguration, Field | ||
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT | ||
from addcorpus.es_mappings import primary_mapping_type | ||
|
||
def export_json_corpus(corpus: Corpus) -> Dict:
    """
    Build the JSON-style dict describing a corpus.

    The result always contains 'name', 'id', 'meta', 'source_data' and
    'fields'. The 'options' key is included only when
    export_corpus_options returns a non-empty dict.
    """
    configuration = corpus.configuration
    exported = {
        'name': corpus.name,
        'id': corpus.pk,
        'meta': export_corpus_meta(configuration),
        'source_data': export_corpus_source_data(configuration),
    }

    corpus_options = export_corpus_options(configuration)
    if corpus_options:
        exported['options'] = corpus_options

    exported['fields'] = list(map(export_json_field, configuration.fields.all()))
    return exported
|
||
def export_corpus_meta(configuration: CorpusConfiguration) -> Dict:
    """
    Collect the descriptive metadata of a corpus configuration.

    Copies title, category, description and languages as they are, and
    adds a 'date_range' whose 'min' and 'max' are the configuration's
    min_date and max_date formatted by export_date.
    """
    copied_attributes = ('title', 'category', 'description', 'languages')
    meta = {
        attribute: getattr(configuration, attribute)
        for attribute in copied_attributes
    }
    meta['date_range'] = {
        'min': export_date(configuration.min_date),
        'max': export_date(configuration.max_date),
    }
    return meta
|
||
def export_date(date: date):
    """Format a date as a string according to DATE_FORMAT."""
    formatted = date.strftime(DATE_FORMAT)
    return formatted
|
||
def export_corpus_source_data(configuration: CorpusConfiguration) -> Dict:
    """
    Describe the source data of a corpus configuration.

    The type is always 'csv'. The delimiter is included under 'options'
    only when it differs from DEFAULT_CSV_DELIMITER.
    """
    delimiter = configuration.source_data_delimiter
    if delimiter == DEFAULT_CSV_DELIMITER:
        return {'type': 'csv'}
    return {'type': 'csv', 'options': {'delimiter': delimiter}}
|
||
def export_corpus_options(configuration: CorpusConfiguration) -> Dict:
    """
    Collect the optional corpus settings that have a truthy value.

    Checks document_context, default_sort and language_field, in that
    order; falsy values are left out, so the result may be an empty dict.
    """
    optional_settings = ('document_context', 'default_sort', 'language_field')
    exported = {}
    for setting in optional_settings:
        value = getattr(configuration, setting)
        if value:
            exported[setting] = value
    return exported
|
||
|
||
def export_json_field(field: Field) -> Dict:
    """
    Build the JSON-style dict describing a single field.

    The result always has 'name', 'display_name', 'description', 'type',
    'options' and 'extract'. The 'language' key is added only when the
    field's language is truthy.
    """
    exported = {
        'name': field.name,
        'display_name': field.display_name,
        'description': field.description,
        'type': export_field_type(field),
        'options': export_field_options(field),
        'extract': export_field_extract(field)
    }
    language = field.language
    if not language:
        return exported
    exported['language'] = language
    return exported
|
||
|
||
def export_field_type(field: Field) -> str:
    """
    Return the exported type name for a field.

    Display types 'text' and 'keyword' both export as 'text_metadata';
    every other display type is returned unchanged.
    """
    display_type = field.display_type
    return 'text_metadata' if display_type in ('text', 'keyword') else display_type
|
||
|
||
def export_field_options(field: Field) -> Dict:
    """
    Build the 'options' dict for an exported field.

    'filter' comes from export_field_filter; 'hidden', 'preview', 'search'
    and 'sort' are copied from the field's hidden, results_overview,
    searchable and sortable attributes. 'visualize' is True when the field
    has at least one visualization.
    """
    filter_setting = export_field_filter(field)
    options = {
        'filter': filter_setting,
        'hidden': field.hidden,
        'preview': field.results_overview,
        'search': field.searchable,
        'sort': field.sortable,
    }
    options['visualize'] = len(field.visualizations) > 0
    return options
|
||
|
||
def export_field_filter(field: Field) -> str:
    """
    Return the exported filter setting for a field.

    Returns:
        'show' when the field has a non-empty search_filter;
        'hide' when it has none, but its primary mapping type is
        filterable and its display type is not 'url';
        'none' otherwise.
    """
    if field.search_filter != {}:
        return 'show'

    # Elasticsearch has no 'int' field type; integer fields are mapped as
    # 'integer'. 'int' is kept so any value that matched before still does.
    # NOTE(review): assumes primary_mapping_type returns the Elasticsearch
    # mapping type name - confirm against addcorpus.es_mappings.
    filterable_mappings = ('keyword', 'int', 'integer', 'float', 'date', 'boolean')
    mapping_type = primary_mapping_type(field.es_mapping)
    if mapping_type in filterable_mappings and field.display_type != 'url':
        return 'hide'
    return 'none'
|
||
|
||
def export_field_extract(field: Field) -> Dict:
    """Return the extraction settings of a field: its source column."""
    column = field.extract_column
    return {'column': column}
Oops, something went wrong.