Skip to content

Commit

Permalink
Merge branch 'develop' into feature/dbnl-footnotes
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas authored Nov 15, 2024
2 parents ff4fca2 + bc3194a commit e291513
Show file tree
Hide file tree
Showing 98 changed files with 4,021 additions and 1,949 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/backend-build-and-push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Builds the backend image and pushes it to the GitHub Container Registry
# after a pull request into develop that changes backend/requirements.txt
# or docker-compose.yaml has been merged.

name: Backend build and push after merge of requirements.txt

on:
  pull_request:
    branches:
      - develop
    types:
      - closed
    paths:
      - 'backend/requirements.txt'
      - 'docker-compose.yaml'

# Pushing to ghcr.io with GITHUB_TOKEN requires write access to packages.
# Declare it explicitly so the job does not depend on the repository default.
permissions:
  contents: read
  packages: write

jobs:
  if_merged:
    name: Build and push backend image
    # `closed` also fires for pull requests that were closed without merging
    if: github.event.pull_request.merged == true
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push Backend
        uses: docker/build-push-action@v6
        with:
          context: backend/.
          push: true
          tags: ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
26 changes: 26 additions & 0 deletions .github/workflows/backend-build-and-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Builds the backend container and then runs the unit tests.
# Triggered manually, or by pushes to the listed branch patterns that change
# backend/requirements.txt or docker-compose.yaml.

name: Build backend and run unit tests

on:
  workflow_dispatch:
  push:
    branches:
      - 'feature/**'
      - 'bugfix/**'
      - 'hotfix/**'
      - 'dependabot/**'
    paths:
      - 'backend/requirements.txt'
      - 'docker-compose.yaml'

jobs:
  backend-test:
    name: Test Backend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run backend tests
        # presumably /ci-data is the data directory expected by .env-ci - confirm
        # `--build` rebuilds the backend image before running pytest
        run: |
          sudo mkdir -p /ci-data
          docker compose --env-file .env-ci run --build backend pytest
34 changes: 3 additions & 31 deletions .github/workflows/backend-test.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# This workflow will run backend tests on the Python version defined in the backend/Dockerfile
# This workflow will run backend tests using the `ianalyzer-backend:latest` image

name: Backend unit tests

Expand All @@ -12,10 +12,9 @@ on:
- 'bugfix/**'
- 'hotfix/**'
- 'release/**'
- 'dependabot/**'
paths:
- 'backend/**'
- '.github/workflows/backend*'
- '.github/workflows/backend-test.yml'
- 'docker-compose.yaml'

jobs:
Expand All @@ -24,34 +23,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Elasticsearch image
uses: docker/build-push-action@v6
with:
context: .
file: DockerfileElastic
push: true
tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest
cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest
cache-to: type=inline
- name: Build and push Backend
uses: docker/build-push-action@v6
with:
context: backend/.
push: true
tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest
cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest
cache-to: type=inline
- name: Run backend tests
run: |
sudo mkdir -p /ci-data
docker compose pull elasticsearch
docker compose pull backend
docker compose --env-file .env-ci run --rm backend pytest
docker compose --env-file .env-ci run backend pytest
35 changes: 35 additions & 0 deletions .github/workflows/frontend-build-and-push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Builds the frontend image and pushes it to the GitHub Container Registry
# after a pull request into develop that changes frontend/yarn.lock or
# docker-compose.yaml has been merged.

name: Frontend build and push after merge of yarn.lock

on:
  pull_request:
    branches:
      - develop
    types:
      - closed
    paths:
      - 'frontend/yarn.lock'
      - 'docker-compose.yaml'

# Pushing to ghcr.io with GITHUB_TOKEN requires write access to packages.
# Declare it explicitly so the job does not depend on the repository default.
permissions:
  contents: read
  packages: write

jobs:
  if_merged:
    name: Build and push frontend image
    # `closed` also fires for pull requests that were closed without merging
    if: github.event.pull_request.merged == true
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build frontend image, using cache from Github registry
        uses: docker/build-push-action@v6
        with:
          context: frontend/.
          push: true
          tags: ghcr.io/centrefordigitalhumanities/ianalyzer-frontend:latest
          # reuse layers from the previously pushed image and embed cache
          # metadata in the new one
          cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-frontend:latest
          cache-to: type=inline
25 changes: 25 additions & 0 deletions .github/workflows/frontend-build-and-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Builds the frontend container and then runs the unit tests.
# Triggered manually, or by pushes to the listed branch patterns that change
# frontend/yarn.lock or docker-compose.yaml.

# Named differently from the workflow in frontend-test.yml ("Frontend unit
# tests") so the two can be told apart in the Actions overview.
name: Build frontend and run unit tests

on:
  workflow_dispatch:
  push:
    branches:
      - 'feature/**'
      - 'bugfix/**'
      - 'hotfix/**'
      - 'dependabot/**'
    paths:
      - 'frontend/yarn.lock'
      - 'docker-compose.yaml'

jobs:
  frontend-test:
    name: Test Frontend
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run frontend unit tests
        # `--build` rebuilds the frontend image before running the tests
        run: |
          docker compose --env-file .env-ci run --build frontend yarn test
23 changes: 3 additions & 20 deletions .github/workflows/frontend-test.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# This workflow will run frontend tests on the Node version defined in the Dockerfiles
# This workflow will run frontend tests on the `ianalyzer-frontend:latest` image

name: Frontend unit tests

Expand All @@ -15,7 +15,7 @@ on:
- 'dependabot/**'
paths:
- 'frontend/**'
- '.github/workflows/frontend*'
- '.github/workflows/frontend-test.yml'
- 'docker-compose.yaml'

jobs:
Expand All @@ -24,23 +24,6 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build frontend image, using cache from Github registry
uses: docker/build-push-action@v6
with:
context: frontend/.
push: true
tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest
cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest
cache-to: type=inline
- name: Run frontend unit tests
run: |
docker compose pull frontend
docker compose --env-file .env-ci run --rm frontend yarn test
docker compose --env-file .env-ci run --build frontend yarn test
48 changes: 48 additions & 0 deletions .github/workflows/scheduled-build-and-push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Runs on the first day of every month (and on manual dispatch) to rebuild and
# push all images, so that the underlying images and libraries get updated.
# NOTE(review): every build below uses the registry cache, so unchanged layers
# may be reused instead of rebuilt - confirm this matches the intended refresh.

name: Scheduled build and push of all images

on:
  workflow_dispatch:
  schedule:
    # 00:00 on day 1 of every month
    - cron: '0 0 1 * *'

# Pushing to ghcr.io with GITHUB_TOKEN requires write access to packages.
# Declare it explicitly so the job does not depend on the repository default.
permissions:
  contents: read
  packages: write

jobs:
  rebuild-scheduled:
    name: Rebuild images
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build frontend image, using cache from Github registry
        uses: docker/build-push-action@v6
        with:
          context: frontend/.
          push: true
          tags: ghcr.io/centrefordigitalhumanities/ianalyzer-frontend:latest
          cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-frontend:latest
          cache-to: type=inline
      - name: Build backend image, using cache from Github registry
        uses: docker/build-push-action@v6
        with:
          context: backend/.
          push: true
          tags: ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
          cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-backend:latest
          cache-to: type=inline
      - name: Build Elasticsearch image, using cache from Github registry
        uses: docker/build-push-action@v6
        with:
          context: .
          file: DockerfileElastic
          push: true
          tags: ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
          cache-from: type=registry,ref=ghcr.io/centrefordigitalhumanities/ianalyzer-elasticsearch:latest
          cache-to: type=inline
6 changes: 3 additions & 3 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ authors:
identifiers:
- type: doi
value: 10.5281/zenodo.8064133
repository-code: 'https://github.com/UUDigitalHumanitieslab/I-analyzer'
repository-code: 'https://github.com/CentreForDigitalHumanities/I-analyzer'
url: 'https://ianalyzer.hum.uu.nl'
abstract: >-
I-analyzer is a tool for exploring corpora (large
Expand All @@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.11.0
date-released: '2024-08-08'
version: 5.14.0
date-released: '2024-11-06'
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# I-analyzer

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8064133.svg)](https://doi.org/10.5281/zenodo.8064133)
[![Actions Status](https://github.com/UUDigitalHumanitiesLab/I-analyzer/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/I-analyzer/actions)
[![Actions Status](https://github.com/CentreForDigitalHumanities/I-analyzer/workflows/Unit%20tests/badge.svg)](https://github.com/CentreForDigitalHumanities/I-analyzer/actions)

> "The great text mining tool that obviates all others."
> — Julian Gonggrijp
Expand Down Expand Up @@ -41,7 +41,7 @@ If you wish to cite material that you accessed through I-analyzer, or you are no

## Contact

For questions, small feature suggestions, and bug reports, feel free to [create an issue](https://github.com/UUDigitalHumanitieslab/I-analyzer/issues/new/choose). If you don't have a Github account, you can also [contact the Centre for Digital Humanities](https://cdh.uu.nl/contact/).
For questions, small feature suggestions, and bug reports, feel free to [create an issue](https://github.com/CentreForDigitalHumanities/I-analyzer/issues/new/choose). If you don't have a GitHub account, you can also [contact the Centre for Digital Humanities](https://cdh.uu.nl/contact/).

If you want to add a new corpus to I-analyzer, or have an idea for a project, please [contact the Centre for Digital Humanities](https://cdh.uu.nl/contact/) rather than making an issue, so we can discuss the possibilities with you.

8 changes: 2 additions & 6 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ ianalyzer/config.py
# csv downloads
download/csv_files/

# word models
corpora/*/wm/*
!corpora/*/wm/documentation.md

# file storage
test_data/
data/
/test_data/
/data/
14 changes: 5 additions & 9 deletions backend/addcorpus/es_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
def primary_mapping_type(es_mapping: Dict) -> str:
return es_mapping.get('type', None)

def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None, updated_highlighting=True):

def main_content_mapping(
token_counts=True, stopword_analysis=False, stemming_analysis=False, language=None
):
'''
Mapping for the main content field. Options:
Expand All @@ -14,14 +17,7 @@ def main_content_mapping(token_counts=True, stopword_analysis=False, stemming_an
- The field is always indexed with the term vector set to 'with_positions_offsets', which the highlighter requires.
'''

mapping = {
'type': 'text'
}

if updated_highlighting:
mapping.update({
'term_vector': 'with_positions_offsets' # include char positions on _source (in addition to the multifields) for highlighting
})
mapping = {"type": "text", "term_vector": "with_positions_offsets"}

if any([token_counts, stopword_analysis, stemming_analysis]):
multifields = {}
Expand Down
35 changes: 19 additions & 16 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import warnings

from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import UniqueConstraint

from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
Expand All @@ -12,15 +19,10 @@
)
from addcorpus.validation.indexing import (validate_essential_fields,
validate_has_configuration, validate_language_field, validate_has_data_directory)
from addcorpus.validation.publishing import (validate_default_sort,
validate_ngram_has_date_field)
from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import UniqueConstraint

from addcorpus.validation.publishing import (
validate_default_sort,
validate_ngram_has_date_field,
)
from ianalyzer.elasticsearch import elasticsearch

MAX_LENGTH_NAME = 126
Expand Down Expand Up @@ -264,14 +266,15 @@ def clean(self):

@property
def has_named_entities(self):
client = elasticsearch(self.es_index)
from es.search import total_hits

client = elasticsearch(self.corpus.name)
try:
mapping = client.indices.get_mapping(
index=self.es_index)
# in production, the index name can be different from the object's es_index value
index_name = list(mapping.keys())[0]
fields = mapping[index_name].get('mappings', {}).get('properties', {}).keys()
if any(field.endswith(':ner') for field in fields):
# we check if any fields exist for filtering named entities
ner_exists = client.search(
index=self.es_index, query={"exists": {"field": "ner:*"}}, size=0
)
if total_hits(ner_exists):
return True
except:
return False
Expand Down
Loading

0 comments on commit e291513

Please sign in to comment.