Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create previously non-existent folios during mei indexing #892

Merged
merged 3 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions app/public/cantusdata/management/commands/index_manuscript_mei.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
from typing import Any, Dict
from os import path, listdir
import re

from django.core.management.base import BaseCommand, CommandParser
from django.conf import settings
from solr.core import SolrConnection # type: ignore[import-untyped]

from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer
from cantusdata.models.folio import Folio

from solr.core import SolrConnection # type: ignore

# Filesystem location of the MEI files: "/code" joined with
# "production-mei-files".
# NOTE(review): presumably the default for the --mei-dir option; the argument
# definition is not visible in this view -- confirm.
MEI4_DIR = path.join("/code", "production-mei-files")
# Expected shape of the folio-number part of an MEI file name: an optional
# single ASCII letter, one or more digits, then an optional lowercase letter.
# NOTE(review): the command applies this with `.match()`, which anchors only
# at the start of the string, so trailing characters after a valid prefix are
# not rejected. Confirm whether `.fullmatch()` is the intended check.
FOLIO_NUMBER_REGEX = re.compile(r"[a-zA-Z]?\d+[a-z]?")


class Command(BaseCommand):
help = (
"This command indexes the contents of MEI files in Solr, using"
"the MEITokenizer class to extract n-grams from the MEI files."
"Files must be named in the format [some string]_[folio number].mei."
"Files must be named in the format [some string]_[folio number].mei,"
"where [folio number] is an optional single letter followed by "
"some number of digits followed by an optional"
"lowercase single letter. The command currently has a workaround for folios "
"that have MEI files but are NOT in CantusDB. See #891 for details "
"about how to handle this case -- the command will alert the user "
"when it encounters this case."
)

def add_arguments(self, parser: CommandParser) -> None:
Expand Down Expand Up @@ -74,18 +83,27 @@ def handle(self, *args: Any, **options: Any) -> None:
raise ValueError(f"No folios found for manuscript {manuscript_id}.")
manuscript_mei_path = path.join(options["mei_dir"], str(manuscript_id))
if not path.exists(manuscript_mei_path):
raise FileNotFoundError(f"--mei-dir path does not exist.")
raise FileNotFoundError("--mei-dir path does not exist.")
manuscript_mei_files = [
f for f in listdir(manuscript_mei_path) if f.endswith(".mei")
]
if len(manuscript_mei_files) == 0:
raise FileNotFoundError(f"No MEI files found in {manuscript_mei_path}.")
for mei_file in manuscript_mei_files:
folio_number: str = mei_file.split("_")[-1].split(".")[0]
if not folio_number in folio_map:
if not FOLIO_NUMBER_REGEX.match(folio_number):
raise ValueError(
f"Folio number {folio_number} in MEI file {mei_file} does not exist in the database."
f"MEI file {mei_file} does not match the expected format."
)
if not folio_number in folio_map or folio_map[folio_number] == "":
self.stdout.write(
self.style.WARNING(
f"Folio number {folio_number} in MEI file "
f"{mei_file} did not exist in the database. Creating record. "
"See #891 for details on how to handle this case."
)
)
Folio.objects.create(manuscript_id=manuscript_id, number=folio_number)
tokenizer = MEITokenizer(
path.join(manuscript_mei_path, mei_file),
min_ngram=options["min_ngram"],
Expand All @@ -98,6 +116,7 @@ def handle(self, *args: Any, **options: Any) -> None:
doc["image_uri"] = folio_map.get(folio_number, "")
solr_conn.add_many(ngram_docs)
solr_conn.commit()
return None

def flush_manuscript_ngrams_from_index(
self, solr_conn: SolrConnection, manuscript_id: int
Expand Down
Loading
Loading