Merge pull request #845 from dchiller/mei-parsing-updates

Update MEI parsing and creation of OMR search tokens
DDMAL · May 15, 2024 · 37bf9ae · 37bf9ae
2 parents 04600af + 526011d
commit 37bf9ae
Show file tree

Hide file tree

Showing 7 changed files with 638 additions and 315 deletions.
diff --git a/app/public/cantusdata/helpers/mei_processing/mei_parser.py b/app/public/cantusdata/helpers/mei_processing/mei_parser.py
@@ -8,20 +8,20 @@
         between two neume components.
     - get_contour_from_interval: Computes the contour of an interval.
     - analyze_neume: Analyzes a neume (a list of neume components) to determine its
-        neume type, its intervals, and its contour.
+        neume name, its intervals, and its contour.
 
 Defines associated types for the data structures used by the parser.
 """
 
 from typing import Tuple, Dict, List, Iterator, Optional
-from lxml import etree
+from lxml import etree  # pylint: disable=no-name-in-module
 from .mei_parsing_types import (
     Zone,
     SyllableText,
     NeumeComponentElementData,
     NeumeComponent,
     ContourType,
-    NeumeType,
+    NeumeName,
     Neume,
     Syllable,
 )
@@ -31,24 +31,24 @@
 PITCH_CLASS = {"c": 0, "d": 2, "e": 4, "f": 5, "g": 7, "a": 9, "b": 11}
 
 # Mapping from neume contours to neume names
-NEUME_GROUPS: Dict[str, NeumeType] = {
-    "": "Punctum",
-    "u": "Pes",
-    "d": "Clivis",
-    "uu": "Scandicus",
-    "ud": "Torculus",
-    "du": "Porrectus",
-    "s": "Distropha",
-    "ss": "Tristopha",
-    "sd": "Pressus",
-    "dd": "Climacus",
-    "ddu": "Climacus resupinus",
-    "udu": "Torculus resupinus",
-    "dud": "Porrectus flexus",
-    "udd": "Pes subpunctis",
-    "uud": "Scandicus flexus",
-    "uudd": "Scandicus subpunctis",
-    "dudd": "Porrectus subpunctis",
+NEUME_GROUPS: Dict[str, NeumeName] = {
+    "": "punctum",
+    "u": "pes",
+    "d": "clivis",
+    "uu": "scandicus",
+    "ud": "torculus",
+    "du": "porrectus",
+    "r": "distropha",
+    "rr": "tristopha",
+    "rd": "pressus",
+    "dd": "climacus",
+    "ddu": "climacus_resupinus",
+    "udu": "torculus_resupinus",
+    "dud": "porrectus_flexus",
+    "udd": "pes_subpunctis",
+    "uud": "scandicus_flexus",
+    "uudd": "scandicus_subpunctis",
+    "dudd": "porrectus_subpunctis",
 }
 
 
@@ -75,6 +75,7 @@ class MEIParser:
     def __init__(self, mei_file: str):
         self.mei_file = mei_file
         self.mei = etree.parse(self.mei_file)
+        self._remove_empty_neumes_and_syllables()
         self.zones = self.parse_zones()
         self.syllables = self.parse_mei()
 
@@ -182,7 +183,7 @@ def _parse_neume(
             )
             if parsed_neume_component:
                 parsed_nc_elements.append(parsed_neume_component)
-        neume_type, intervals, contours = analyze_neume(parsed_nc_elements)
+        neume_name, intervals, contours = analyze_neume(parsed_nc_elements)
         # If the first neume component of the next syllable can be parsed,
         # add the interval and contour between the final neume component of
         # the current syllable and the first neume component of the next syllable.
@@ -193,7 +194,7 @@ def _parse_neume(
             if parsed_next_neume_comp:
                 last_neume_comp = parsed_nc_elements[-1]
                 intervals.append(
-                    get_interval_between_neume_components(
+                    get_semitones_between_neume_components(
                         last_neume_comp, parsed_next_neume_comp
                     )
                 )
@@ -211,12 +212,13 @@ def _parse_neume(
                     "pname": nc["pname"],
                     "octave": nc["octave"],
                     "bounding_box": nc["bounding_box"],
-                    "interval": intervals[i] if i < len(intervals) else None,
+                    "semitone_interval": intervals[i] if i < len(intervals) else None,
                     "contour": contours[i] if i < len(contours) else None,
+                    "system": neume_system,
                 }
             )
         parsed_neume: Neume = {
-            "neume_type": neume_type,
+            "neume_name": neume_name,
             "neume_components": parsed_neume_components,
             "bounding_box": combined_bounding_box,
             "system": neume_system,
@@ -323,6 +325,26 @@ def _syllable_iterator(
                     system += 1
                 current_elem = next(elem_iterator, None)
 
+    def _remove_empty_neumes_and_syllables(self) -> None:
+        """
+        Apparently, for a while Rodan was creating invalid MEI files that
+        contained empty neumes (i.e., neumes with no neume components) and
+        empty syllables (i.e., syllables with no neumes or only empty neumes).
+        This method removes those empty neumes and syllables from the MEI being parsed;
+        it was added as a preprocessing step so that it can, once the base
+        MEI files are corrected, be removed.
+        """
+        for neume in self.mei.iter(f"{self.MEINS}neume"):
+            if len(neume.findall(f"{self.MEINS}nc")) == 0:
+                # Ignoring type because we know that getparent() will
+                # return an element in this case.
+                neume.getparent().remove(neume)  # type: ignore
+        for syllable in self.mei.iter(f"{self.MEINS}syllable"):
+            if len(syllable.findall(f"{self.MEINS}neume")) == 0:
+                # Ignoring type because we know that getparent() will
+                # return an element in this case.
+                syllable.getparent().remove(syllable)  # type: ignore
+
     def parse_mei(self) -> List[Syllable]:
         """
         Parses the MEI file into a list of syllables.
@@ -351,7 +373,7 @@ def parse_mei(self) -> List[Syllable]:
         return syllables
 
 
-def get_interval_between_neume_components(
+def get_semitones_between_neume_components(
     neume_component_1: NeumeComponentElementData,
     neume_component_2: NeumeComponentElementData,
 ) -> int:
@@ -369,8 +391,8 @@ def get_interval_between_neume_components(
     try:
         pc1 = PITCH_CLASS[neume_component_1["pname"]]
         pc2 = PITCH_CLASS[neume_component_2["pname"]]
-    except KeyError:
-        raise ValueError("Invalid pitch name in neume component.")
+    except KeyError as err:
+        raise ValueError("Invalid pitch name in neume component.") from err
     # In MIDI note numbers, C0 = 12.
     pitch_1 = pc1 + (12 * (neume_component_1["octave"] + 1))
     pitch_2 = pc2 + (12 * (neume_component_2["octave"] + 1))
@@ -382,34 +404,36 @@ def get_contour_from_interval(interval: int) -> ContourType:
     Compute the contour of an interval.
 
     :param interval: The size of the interval in semitones
-    :return: The contour of the interval ("u"[p], "d"[own], or "s"[tay])
+    :return: The contour of the interval ("u"[p], "d"[own], or "r"[epeat])
     """
     if interval < 0:
         return "d"
     if interval > 0:
         return "u"
-    return "s"
+    return "r"
 
 
 def analyze_neume(
     neume: List[NeumeComponentElementData],
-) -> Tuple[NeumeType, List[int], List[ContourType]]:
+) -> Tuple[NeumeName, List[int], List[ContourType]]:
     """
     Analyze a neume (a list of neume components) to determine:
-    - Neume type
-    - Neume intervals
-    - Neume contour
+    - The neume type (e.g., punctum, pes, clivis, etc.)
+    - The intervals in the neume in semitones
+    - The contour of the neume
 
     :param neume: A list of neume components (a list of NeumeComponentsType dictionaries)
     :return: A tuple of information about the neume:
                 - Neume type (str)
-                - Neume intervals (list of ints)
-                - Neume contour (list of "u"[p], "d"[own], or "s"[tay])
+                - Neume intervals in semitones (list of ints)
+                - Neume contour (list of "u"[p], "d"[own], or "r"[epeat])
     """
-    intervals: List[int] = [
-        get_interval_between_neume_components(nc1, nc2)
+    semitone_intervals: List[int] = [
+        get_semitones_between_neume_components(nc1, nc2)
         for nc1, nc2 in zip(neume[:-1], neume[1:])
     ]
-    contours: List[ContourType] = [get_contour_from_interval(i) for i in intervals]
-    neume_type: NeumeType = NEUME_GROUPS.get("".join(contours), "Compound")
-    return neume_type, intervals, contours
+    contours: List[ContourType] = [
+        get_contour_from_interval(i) for i in semitone_intervals
+    ]
+    neume_type: NeumeName = NEUME_GROUPS.get("".join(contours), "compound")
+    return neume_type, semitone_intervals, contours
diff --git a/app/public/cantusdata/helpers/mei_processing/mei_parsing_types.py b/app/public/cantusdata/helpers/mei_processing/mei_parsing_types.py
@@ -2,7 +2,7 @@
 Contains type definitions used in the MEI parsing process.
 """
 
-from typing import Tuple, TypedDict, Literal, List, Optional
+from typing import Tuple, TypedDict, Literal, List, Optional, NotRequired
 from typing_extensions import TypeAlias
 
 # A type for coordinates of bounding boxes
@@ -30,26 +30,26 @@ class Zone(TypedDict):
     rotate: float
 
 
-ContourType = Literal["u", "d", "s"]
-NeumeType = Literal[
-    "Punctum",
-    "Pes",
-    "Clivis",
-    "Scandicus",
-    "Torculus",
-    "Porrectus",
-    "Distropha",
-    "Tristopha",
-    "Pressus",
-    "Climacus",
-    "Climacus resupinus",
-    "Torculus resupinus",
-    "Porrectus flexus",
-    "Pes subpunctis",
-    "Scandicus flexus",
-    "Scandicus subpunctis",
-    "Porrectus subpunctis",
-    "Compound",
+ContourType = Literal["u", "d", "r"]
+NeumeName = Literal[
+    "punctum",
+    "pes",
+    "clivis",
+    "scandicus",
+    "torculus",
+    "porrectus",
+    "distropha",
+    "tristopha",
+    "pressus",
+    "climacus",
+    "climacus_resupinus",
+    "torculus_resupinus",
+    "porrectus_flexus",
+    "pes_subpunctis",
+    "scandicus_flexus",
+    "scandicus_subpunctis",
+    "porrectus_subpunctis",
+    "compound",
 ]
 
 
@@ -74,27 +74,29 @@ class NeumeComponent(NeumeComponentElementData):
     """A type extending NeumeComponentElementData with interval and contour information.
 
 
-    interval: The interval (in semitones) between the neume component and the
+    semitone_interval: The interval in semitones between the neume component and the
         following neume component. If there is no following neume component,
         this is None.
-    contour: The contour ("u"[p], "d"[own], or "s"[tay]) of 'interval'. If there is no
+    contour: The contour ("u"[p], "d"[own], or "r"[epeat]) of 'semitone_interval'. If there is no
         following neume component, this is None.
+    system: The system number that the neume component is on
     """
 
-    interval: Optional[int]
+    semitone_interval: Optional[int]
     contour: Optional[ContourType]
+    system: int
 
 
 class Neume(TypedDict):
     """A type for neumes
 
-    neume_type: The name of the neume (ie. "Punctum", "Pes", "Clivis", etc.)
+    neume_name: The name of the neume (e.g., "punctum", "pes", "clivis", etc.)
     neume_components: A list of neume components (containing pitch infomation)
     bounding_box: The bounding box of the neume
     system: The system number that the neume is on
     """
 
-    neume_type: NeumeType
+    neume_name: NeumeName
     neume_components: List[NeumeComponent]
     bounding_box: Zone
     system: int
@@ -112,3 +114,41 @@ class Syllable(TypedDict):
 
     text: SyllableText
     neumes: List[Neume]
+
+
+class NgramDocument(TypedDict):
+    """
+    A generic type for documents containing n-grams
+    of information extracted from MEI files.
+
+    ngram_unit: The unit of the n-gram
+    location: The location of the n-gram in the MEI file (MEI Zones
+        converted to JSON strings according to bounding_box_utils.stringify_bounding_boxes)
+    pitch_names: A string containing the pitch names of the neume components in the n-gram,
+        separated by underscores.
+    contour: A string containing the contours of the neume components in the n-gram, separated
+        by underscores.
+    semitone_intervals: A string containing the semitone intervals between the neume components
+        in the n-gram, separated by underscores.
+    neume_names: A string containing the names of the neumes in the n-gram,
+        separated by underscores. This field is not required, and is only present when
+        the n-gram contains complete neumes.
+
+    The following may be part of an NgramDocument, but are optional because
+    they will be added when the document is indexed:
+        manuscript_id: The ID of the manuscript the n-gram belongs to.
+        folio: The number of the folio on which the n-gram exists.
+        id: The unique ID of the document (corresponds to solr schema's id field)
+        type: The type of the document (corresponds to solr schema's type field)
+    """
+
+    location: str
+    pitch_names: str
+    contour: str
+    semitone_intervals: str
+    neume_names: NotRequired[str]
+    manuscript_id: NotRequired[str]
+    folio: NotRequired[str]
+    id: NotRequired[str]
+    type: NotRequired[Literal["omr_ngram"]]
+    image_uri: NotRequired[str]