DDMAL · dchiller · Aug 15, 2024 · Aug 6, 2024 · Aug 6, 2024 · Aug 15, 2024
diff --git a/app/public/cantusdata/management/commands/index_manuscript_mei.py b/app/public/cantusdata/management/commands/index_manuscript_mei.py
@@ -1,21 +1,30 @@
 from typing import Any, Dict
 from os import path, listdir
+import re
 
 from django.core.management.base import BaseCommand, CommandParser
 from django.conf import settings
+from solr.core import SolrConnection  # type: ignore[import-untyped]
+
 from cantusdata.helpers.mei_processing.mei_tokenizer import MEITokenizer
 from cantusdata.models.folio import Folio
 
-from solr.core import SolrConnection  # type: ignore
 
 MEI4_DIR = path.join("/code", "production-mei-files")
+FOLIO_NUMBER_REGEX = re.compile(r"[a-zA-Z]?\d+[a-z]?")
 
 
 class Command(BaseCommand):
     help = (
         "This command indexes the contents of MEI files in Solr, using"
         "the MEITokenizer class to extract n-grams from the MEI files."
-        "Files must be named in the format [some string]_[folio number].mei."
+        "Files must be named in the format [some string]_[folio number].mei, "
+        "where [folio number] is an optional single letter followed by "
+        "some number of digits followed by an optional "
+        "lowercase single letter. The command currently has a workaround for folios "
+        "that have MEI files but are NOT in CantusDB. See #891 for details "
+        "about how to handle this case -- the command will alert the user "
+        "when it encounters this case."
     )
 
     def add_arguments(self, parser: CommandParser) -> None:
@@ -74,18 +83,27 @@ def handle(self, *args: Any, **options: Any) -> None:
             raise ValueError(f"No folios found for manuscript {manuscript_id}.")
         manuscript_mei_path = path.join(options["mei_dir"], str(manuscript_id))
         if not path.exists(manuscript_mei_path):
-            raise FileNotFoundError(f"--mei-dir path does not exist.")
+            raise FileNotFoundError("--mei-dir path does not exist.")
         manuscript_mei_files = [
             f for f in listdir(manuscript_mei_path) if f.endswith(".mei")
         ]
         if len(manuscript_mei_files) == 0:
             raise FileNotFoundError(f"No MEI files found in {manuscript_mei_path}.")
         for mei_file in manuscript_mei_files:
             folio_number: str = mei_file.split("_")[-1].split(".")[0]
-            if not folio_number in folio_map:
+            if not FOLIO_NUMBER_REGEX.match(folio_number):
                 raise ValueError(
-                    f"Folio number {folio_number} in MEI file {mei_file} does not exist in the database."
+                    f"MEI file {mei_file} does not match the expected format."
+                )
+            if not folio_number in folio_map or folio_map[folio_number] == "":
+                self.stdout.write(
+                    self.style.WARNING(
+                        f"Folio number {folio_number} in MEI file "
+                        f"{mei_file} did not exist in the database. Creating record. "
+                        "See #891 for details on how to handle this case."
+                    )
                 )
+                Folio.objects.create(manuscript_id=manuscript_id, number=folio_number)
             tokenizer = MEITokenizer(
                 path.join(manuscript_mei_path, mei_file),
                 min_ngram=options["min_ngram"],
@@ -98,6 +116,7 @@ def handle(self, *args: Any, **options: Any) -> None:
                 doc["image_uri"] = folio_map.get(folio_number, "")
             solr_conn.add_many(ngram_docs)
             solr_conn.commit()
+        return None
 
     def flush_manuscript_ngrams_from_index(
         self, solr_conn: SolrConnection, manuscript_id: int