From a3812c765fb0a74117affd55e23b4529ae7726f5 Mon Sep 17 00:00:00 2001 From: Dylan Hillerbrand Date: Tue, 6 Aug 2024 12:50:08 -0400 Subject: [PATCH 1/3] feat(omr search): create non-existent folios when indexing manuscript mei When chants end on a folio where no other chants begin, the folio does not exist in CantusDB but does have an MEI file. For example, see folio A14r in Salzinnes. Here, we modify the index_manuscript_mei command to create a folio in such cases (we check that it's a "real" folio by making sure the mei file follows a naming convention and then create the folio if it doesn't exist). The user is alerted to this, and they must manually add the image_uri to the folio (either through the admin panel or the map folios process) and then reindex the mei. This is detailed in issue #891. This is convoluted, but given that we're going to change the structure of the CU database soon so that it is more closely coupled with CantusDB, we'll figure out a more permanent solution then -- solving #891. 
--- .../commands/index_manuscript_mei.py | 29 +- .../123723/cdn-hsmu-m2149l4_999r.mei | 1150 +++++++++++++++++ .../commands/test_index_manuscript_mei.py | 14 +- 3 files changed, 1187 insertions(+), 6 deletions(-) create mode 100644 app/public/cantusdata/test/core/helpers/mei_processing/test_mei_files/123723/cdn-hsmu-m2149l4_999r.mei diff --git a/app/public/cantusdata/management/commands/index_manuscript_mei.py b/app/public/cantusdata/management/commands/index_manuscript_mei.py index 88e7d95e..77c9f683 100644 --- a/app/public/cantusdata/management/commands/index_manuscript_mei.py +++ b/app/public/cantusdata/management/commands/index_manuscript_mei.py @@ -1,21 +1,30 @@ from typing import Any, Dict from os import path, listdir +import re from django.core.management.base import BaseCommand, CommandParser from django.conf import settings +from solr.core import SolrConnection # type: ignore + from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer from cantusdata.models.folio import Folio -from solr.core import SolrConnection # type: ignore MEI4_DIR = path.join("/code", "production-mei-files") +FOLIO_NUMBER_REGEX = re.compile(r"[a-zA-Z]?\d+[a-z]?") class Command(BaseCommand): help = ( "This command indexes the contents of MEI files in Solr, using" "the MEITokenizer class to extract n-grams from the MEI files." - "Files must be named in the format [some string]_[folio number].mei." + "Files must be named in the format [some string]_[folio number].mei," + "where [folio number] is an optional single letter followed by " + "some number of digits followed by an optional" + "lowercase single letter. The command currently has a workaround for folios " + "that have MEI files but are NOT in CantusDB. See #891 for details " + "about how to handle this case -- the command will alert the user " + "when it encounters this case." 
) def add_arguments(self, parser: CommandParser) -> None: @@ -74,7 +83,7 @@ def handle(self, *args: Any, **options: Any) -> None: raise ValueError(f"No folios found for manuscript {manuscript_id}.") manuscript_mei_path = path.join(options["mei_dir"], str(manuscript_id)) if not path.exists(manuscript_mei_path): - raise FileNotFoundError(f"--mei-dir path does not exist.") + raise FileNotFoundError("--mei-dir path does not exist.") manuscript_mei_files = [ f for f in listdir(manuscript_mei_path) if f.endswith(".mei") ] @@ -82,10 +91,19 @@ def handle(self, *args: Any, **options: Any) -> None: raise FileNotFoundError(f"No MEI files found in {manuscript_mei_path}.") for mei_file in manuscript_mei_files: folio_number: str = mei_file.split("_")[-1].split(".")[0] - if not folio_number in folio_map: + if not FOLIO_NUMBER_REGEX.match(folio_number): raise ValueError( - f"Folio number {folio_number} in MEI file {mei_file} does not exist in the database." + f"MEI file {mei_file} does not match the expected format." + ) + if not folio_number in folio_map: + self.stdout.write( + self.style.WARNING( + f"Folio number {folio_number} in MEI file " + f"{mei_file} did not exist in the database. Creating record. " + "See #891 for details on how to handle this case." 
+ ) ) + Folio.objects.create(manuscript_id=manuscript_id, number=folio_number) tokenizer = MEITokenizer( path.join(manuscript_mei_path, mei_file), min_ngram=options["min_ngram"], @@ -98,6 +116,7 @@ def handle(self, *args: Any, **options: Any) -> None: doc["image_uri"] = folio_map.get(folio_number, "") solr_conn.add_many(ngram_docs) solr_conn.commit() + return None def flush_manuscript_ngrams_from_index( self, solr_conn: SolrConnection, manuscript_id: int diff --git a/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_files/123723/cdn-hsmu-m2149l4_999r.mei b/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_files/123723/cdn-hsmu-m2149l4_999r.mei new file mode 100644 index 00000000..1a1d7b5a --- /dev/null +++ b/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_files/123723/cdn-hsmu-m2149l4_999r.mei @@ -0,0 +1,1150 @@ + + + + + + + + MEI Encoding Output
+ + + + + + Ec + + + + + + ce + + + + + + + no + + + + + + men + + + + + + do + + + + + + mi + + + + + + + ni + + + + + + ve + + + + + + + + + + + nit + + + + + + + + + + + de + + + + + + lon + + + + + + + gin + + + + + + + + + + + + + quo + + + + + + + et + + + + + + cla + + + + + + ri + + + + + + tas + + + + + + e + + + + + + + + + + jus + + + + + + + re + + + + + + + plet + + + + + + or + + + + + + + bem + + + + + + ter + + + + + + + ra + + + + + + rum + + + + + + E + + + + + + u + + + + + + + + + o + + + + + + u + + + + + + + a + + + + + + e + + + + + Quem + + + + + + ter + + + + + + + ra + + + + + + + pon + + + + + + thus + + + + + + + + + et + + + + + + + eth + + + + + + + + + + ra + + + col + + + + + + lunt + + + + + + a + + + + + + do + + + + + + + rant + + + + + + prae + + + + + + di + + + + + + + cant + + + + + + tri + + + + + + + + + nam + + + + + + re + + + + + + + gen + + + + + + + tem + + + + + + ma + + + + + + + chi + + + + + + + + nam + + + + + + claus + + + + + + trum + + + + + + ma + + + + + + + + + + + ri + + + e + + + + + + + + + ba + + + + + + + + iu + + + + + + + lat + + + + + + Cui + + + + + + + lu + + + + + + + na + + + + + + + sol + + + + + + et + + + + + + sy + + + + + + + + + + + de + + + + + + ra + + + de + + + + + + ser + + + + + + vi + + + + + + + + + unt + + + + + + + per + + + + + + tem + + + + + + po + + + + + + + ra + + + + + + per + + + + + + fu + + + + + + sa + + + + + + + ce + + + + + + + li + + + + + + gra + + + + + + + + + + + + ti + + + a + + + + + + ges + + + + + + tant + + + + + + + + + pu + + + + + + + + + + + el + + + le + + + + + + + + + + + vis + + + + + + + ce + + + ra + + + + + + Be + + + + + + + + + + a + + + ta + + + + + + + ma + + + + + + ter + + + + + + mu + + + + + + + + + + + ne + + + + + + re + + + + + + cu + + + + + + ius + + + + + + su + + + + + + per + + + + + + + nus + + + + + + ar + + + + + + ti + + + + + + + fex + + + + + + mun + + + + + + dum + + + + + + pu + + + + + + + gil + + + + + + + + +
+
+
+ +
+
\ No newline at end of file diff --git a/app/public/cantusdata/test/core/management/commands/test_index_manuscript_mei.py b/app/public/cantusdata/test/core/management/commands/test_index_manuscript_mei.py index 9c5b9fbd..f2f12e5f 100644 --- a/app/public/cantusdata/test/core/management/commands/test_index_manuscript_mei.py +++ b/app/public/cantusdata/test/core/management/commands/test_index_manuscript_mei.py @@ -27,6 +27,10 @@ def setUpTestData(cls) -> None: Folio.objects.create(number="001v", manuscript=manuscript) def test_index_manuscript_mei(self) -> None: + # Assert that prior to the command run, the folio "999r" does not + # exist in the database + with self.assertRaises(Folio.DoesNotExist): + Folio.objects.get(manuscript_id=123723, number="999r") call_command( "index_manuscript_mei", "123723", @@ -37,6 +41,10 @@ def test_index_manuscript_mei(self) -> None: "--mei-dir", TEST_MEI_FILES_PATH, ) + # Assert that the folio "999r" now exists in the database + # (will raise exception if it does not) + with self.subTest("Test creation of non-existent folio"): + Folio.objects.get(manuscript_id=123723, number="999r") results = self.solr_conn.query("*:*", fq="type:omr_ngram") with self.subTest("Test total number of indexed documents"): total_exp_ngrams_001r = calculate_expected_total_ngrams( @@ -45,8 +53,12 @@ def test_index_manuscript_mei(self) -> None: total_exp_ngrams_001v = calculate_expected_total_ngrams( f"{TEST_MEI_FILES_PATH}/123723/cdn-hsmu-m2149l4_001v.mei", 1, 5 ) + total_exp_ngrams_999r = calculate_expected_total_ngrams( + f"{TEST_MEI_FILES_PATH}/123723/cdn-hsmu-m2149l4_999r.mei", 1, 5 + ) self.assertEqual( - results.numFound, total_exp_ngrams_001r + total_exp_ngrams_001v + results.numFound, + total_exp_ngrams_001r + total_exp_ngrams_001v + total_exp_ngrams_999r, ) def test_flush_option(self) -> None: From 229aa349aa5bfcb0399d6732325f1363459469a6 Mon Sep 17 00:00:00 2001 From: Dylan Hillerbrand Date: Tue, 6 Aug 2024 13:23:56 -0400 Subject: [PATCH 2/3] 
fix(index omr): check that image_uri has been added for additional folios --- .../cantusdata/management/commands/index_manuscript_mei.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/public/cantusdata/management/commands/index_manuscript_mei.py b/app/public/cantusdata/management/commands/index_manuscript_mei.py index 77c9f683..e4e1539d 100644 --- a/app/public/cantusdata/management/commands/index_manuscript_mei.py +++ b/app/public/cantusdata/management/commands/index_manuscript_mei.py @@ -95,7 +95,7 @@ def handle(self, *args: Any, **options: Any) -> None: raise ValueError( f"MEI file {mei_file} does not match the expected format." ) - if not folio_number in folio_map: + if not folio_number in folio_map or folio_map[folio_number] == "": self.stdout.write( self.style.WARNING( f"Folio number {folio_number} in MEI file " From 3a45e15dd05a4532fa130b1f68b08e635321e029 Mon Sep 17 00:00:00 2001 From: Dylan Hillerbrand Date: Thu, 15 Aug 2024 12:11:19 -0400 Subject: [PATCH 3/3] test(index_manuscript_mei): add more specific type-ignore comment Specifically silences the `import-untyped` error on the import of `SolrConnection` from `solr.core` --- .../cantusdata/management/commands/index_manuscript_mei.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/public/cantusdata/management/commands/index_manuscript_mei.py b/app/public/cantusdata/management/commands/index_manuscript_mei.py index e4e1539d..6f7b09a4 100644 --- a/app/public/cantusdata/management/commands/index_manuscript_mei.py +++ b/app/public/cantusdata/management/commands/index_manuscript_mei.py @@ -4,7 +4,7 @@ from django.core.management.base import BaseCommand, CommandParser from django.conf import settings -from solr.core import SolrConnection # type: ignore +from solr.core import SolrConnection # type: ignore[import-untyped] from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer from cantusdata.models.folio import Folio