Skip to content

Commit

Permalink
Merge pull request #845 from dchiller/mei-parsing-updates
Browse files Browse the repository at this point in the history
Update MEI parsing and creation of OMR search tokens
  • Loading branch information
dchiller authored May 15, 2024
2 parents 04600af + 526011d commit 37bf9ae
Show file tree
Hide file tree
Showing 7 changed files with 638 additions and 315 deletions.
106 changes: 65 additions & 41 deletions app/public/cantusdata/helpers/mei_processing/mei_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,20 @@
between two neume components.
- get_contour_from_interval: Computes the contour of an interval.
- analyze_neume: Analyzes a neume (a list of neume components) to determine its
neume type, its intervals, and its contour.
neume name, its intervals, and its contour.
Defines associated types for the data structures used by the parser.
"""

from typing import Tuple, Dict, List, Iterator, Optional
from lxml import etree
from lxml import etree # pylint: disable=no-name-in-module
from .mei_parsing_types import (
Zone,
SyllableText,
NeumeComponentElementData,
NeumeComponent,
ContourType,
NeumeType,
NeumeName,
Neume,
Syllable,
)
Expand All @@ -31,24 +31,24 @@
PITCH_CLASS = {"c": 0, "d": 2, "e": 4, "f": 5, "g": 7, "a": 9, "b": 11}

# Mapping from neume contours to neume names
NEUME_GROUPS: Dict[str, NeumeType] = {
"": "Punctum",
"u": "Pes",
"d": "Clivis",
"uu": "Scandicus",
"ud": "Torculus",
"du": "Porrectus",
"s": "Distropha",
"ss": "Tristopha",
"sd": "Pressus",
"dd": "Climacus",
"ddu": "Climacus resupinus",
"udu": "Torculus resupinus",
"dud": "Porrectus flexus",
"udd": "Pes subpunctis",
"uud": "Scandicus flexus",
"uudd": "Scandicus subpunctis",
"dudd": "Porrectus subpunctis",
NEUME_GROUPS: Dict[str, NeumeName] = {
"": "punctum",
"u": "pes",
"d": "clivis",
"uu": "scandicus",
"ud": "torculus",
"du": "porrectus",
"r": "distropha",
"rr": "tristopha",
"rd": "pressus",
"dd": "climacus",
"ddu": "climacus_resupinus",
"udu": "torculus_resupinus",
"dud": "porrectus_flexus",
"udd": "pes_subpunctis",
"uud": "scandicus_flexus",
"uudd": "scandicus_subpunctis",
"dudd": "porrectus_subpunctis",
}


Expand All @@ -75,6 +75,7 @@ class MEIParser:
def __init__(self, mei_file: str):
self.mei_file = mei_file
self.mei = etree.parse(self.mei_file)
self._remove_empty_neumes_and_syllables()
self.zones = self.parse_zones()
self.syllables = self.parse_mei()

Expand Down Expand Up @@ -182,7 +183,7 @@ def _parse_neume(
)
if parsed_neume_component:
parsed_nc_elements.append(parsed_neume_component)
neume_type, intervals, contours = analyze_neume(parsed_nc_elements)
neume_name, intervals, contours = analyze_neume(parsed_nc_elements)
# If the first neume component of the next syllable can be parsed,
# add the interval and contour between the final neume component of
# the current syllable and the first neume component of the next syllable.
Expand All @@ -193,7 +194,7 @@ def _parse_neume(
if parsed_next_neume_comp:
last_neume_comp = parsed_nc_elements[-1]
intervals.append(
get_interval_between_neume_components(
get_semitones_between_neume_components(
last_neume_comp, parsed_next_neume_comp
)
)
Expand All @@ -211,12 +212,13 @@ def _parse_neume(
"pname": nc["pname"],
"octave": nc["octave"],
"bounding_box": nc["bounding_box"],
"interval": intervals[i] if i < len(intervals) else None,
"semitone_interval": intervals[i] if i < len(intervals) else None,
"contour": contours[i] if i < len(contours) else None,
"system": neume_system,
}
)
parsed_neume: Neume = {
"neume_type": neume_type,
"neume_name": neume_name,
"neume_components": parsed_neume_components,
"bounding_box": combined_bounding_box,
"system": neume_system,
Expand Down Expand Up @@ -323,6 +325,26 @@ def _syllable_iterator(
system += 1
current_elem = next(elem_iterator, None)

def _remove_empty_neumes_and_syllables(self) -> None:
"""
Apparently, for a while Rodan was creating invalid MEI files that
contained empty neumes (i.e., neumes with no neume components) and
empty syllables (i.e., syllables with no neumes or only empty neumes).
This method removes those empty neumes and syllables from the MEI being parsed;
it was added as a preprocessing step so that it can, once the base
MEI files are corrected, be removed.
"""
for neume in self.mei.iter(f"{self.MEINS}neume"):
if len(neume.findall(f"{self.MEINS}nc")) == 0:
# Ignoring type because we know that getparent() will
# return an element in this case.
neume.getparent().remove(neume) # type: ignore
for syllable in self.mei.iter(f"{self.MEINS}syllable"):
if len(syllable.findall(f"{self.MEINS}neume")) == 0:
# Ignoring type because we know that getparent() will
# return an element in this case.
syllable.getparent().remove(syllable) # type: ignore

def parse_mei(self) -> List[Syllable]:
"""
Parses the MEI file into a list of syllables.
Expand Down Expand Up @@ -351,7 +373,7 @@ def parse_mei(self) -> List[Syllable]:
return syllables


def get_interval_between_neume_components(
def get_semitones_between_neume_components(
neume_component_1: NeumeComponentElementData,
neume_component_2: NeumeComponentElementData,
) -> int:
Expand All @@ -369,8 +391,8 @@ def get_interval_between_neume_components(
try:
pc1 = PITCH_CLASS[neume_component_1["pname"]]
pc2 = PITCH_CLASS[neume_component_2["pname"]]
except KeyError:
raise ValueError("Invalid pitch name in neume component.")
except KeyError as err:
raise ValueError("Invalid pitch name in neume component.") from err
# In MIDI note numbers, C0 = 12.
pitch_1 = pc1 + (12 * (neume_component_1["octave"] + 1))
pitch_2 = pc2 + (12 * (neume_component_2["octave"] + 1))
Expand All @@ -382,34 +404,36 @@ def get_contour_from_interval(interval: int) -> ContourType:
Compute the contour of an interval.
:param interval: The size of the interval in semitones
:return: The contour of the interval ("u"[p], "d"[own], or "s"[tay])
:return: The contour of the interval ("u"[p], "d"[own], or "r"[epeat])
"""
if interval < 0:
return "d"
if interval > 0:
return "u"
return "s"
return "r"


def analyze_neume(
neume: List[NeumeComponentElementData],
) -> Tuple[NeumeType, List[int], List[ContourType]]:
) -> Tuple[NeumeName, List[int], List[ContourType]]:
"""
Analyze a neume (a list of neume components) to determine:
- Neume type
- Neume intervals
- Neume contour
- The neume type (e.g., punctum, pes, clivis, etc.)
- The intervals in the neume in semitones
- The contour of the neume
:param neume: A list of neume components (a list of NeumeComponentsType dictionaries)
:return: A tuple of information about the neume:
- Neume type (str)
- Neume intervals (list of ints)
- Neume contour (list of "u"[p], "d"[own], or "s"[tay])
- Neume intervals in semitones (list of ints)
- Neume contour (list of "u"[p], "d"[own], or "r"[epeat])
"""
intervals: List[int] = [
get_interval_between_neume_components(nc1, nc2)
semitone_intervals: List[int] = [
get_semitones_between_neume_components(nc1, nc2)
for nc1, nc2 in zip(neume[:-1], neume[1:])
]
contours: List[ContourType] = [get_contour_from_interval(i) for i in intervals]
neume_type: NeumeType = NEUME_GROUPS.get("".join(contours), "Compound")
return neume_type, intervals, contours
contours: List[ContourType] = [
get_contour_from_interval(i) for i in semitone_intervals
]
neume_type: NeumeName = NEUME_GROUPS.get("".join(contours), "compound")
return neume_type, semitone_intervals, contours
92 changes: 66 additions & 26 deletions app/public/cantusdata/helpers/mei_processing/mei_parsing_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Contains type definitions used in the MEI parsing process.
"""

from typing import Tuple, TypedDict, Literal, List, Optional
from typing import Tuple, TypedDict, Literal, List, Optional, NotRequired
from typing_extensions import TypeAlias

# A type for coordinates of bounding boxes
Expand Down Expand Up @@ -30,26 +30,26 @@ class Zone(TypedDict):
rotate: float


ContourType = Literal["u", "d", "s"]
NeumeType = Literal[
"Punctum",
"Pes",
"Clivis",
"Scandicus",
"Torculus",
"Porrectus",
"Distropha",
"Tristopha",
"Pressus",
"Climacus",
"Climacus resupinus",
"Torculus resupinus",
"Porrectus flexus",
"Pes subpunctis",
"Scandicus flexus",
"Scandicus subpunctis",
"Porrectus subpunctis",
"Compound",
ContourType = Literal["u", "d", "r"]
NeumeName = Literal[
"punctum",
"pes",
"clivis",
"scandicus",
"torculus",
"porrectus",
"distropha",
"tristopha",
"pressus",
"climacus",
"climacus_resupinus",
"torculus_resupinus",
"porrectus_flexus",
"pes_subpunctis",
"scandicus_flexus",
"scandicus_subpunctis",
"porrectus_subpunctis",
"compound",
]


Expand All @@ -74,27 +74,29 @@ class NeumeComponent(NeumeComponentElementData):
"""A type extending NeumeComponentElementData with interval and contour information.
interval: The interval (in semitones) between the neume component and the
semitone_interval: The interval in semitones between the neume component and the
following neume component. If there is no following neume component,
this is None.
contour: The contour ("u"[p], "d"[own], or "s"[tay]) of 'interval'. If there is no
contour: The contour ("u"[p], "d"[own], or "r"[epeat]) of 'semitone_interval'. If there is no
following neume component, this is None.
system: The system number that the neume component is on
"""

interval: Optional[int]
semitone_interval: Optional[int]
contour: Optional[ContourType]
system: int


class Neume(TypedDict):
"""A type for neumes
neume_type: The name of the neume (ie. "Punctum", "Pes", "Clivis", etc.)
neume_name: The name of the neume (e.g., "punctum", "pes", "clivis", etc.)
neume_components: A list of neume components (containing pitch information)
bounding_box: The bounding box of the neume
system: The system number that the neume is on
"""

neume_type: NeumeType
neume_name: NeumeName
neume_components: List[NeumeComponent]
bounding_box: Zone
system: int
Expand All @@ -112,3 +114,41 @@ class Syllable(TypedDict):

text: SyllableText
neumes: List[Neume]


class NgramDocument(TypedDict):
"""
A generic type for documents containing n-grams
of information extracted from MEI files.
ngram_unit: The unit of the n-gram
location: The location of the n-gram in the MEI file (MEI Zones
converted to JSON strings according to bounding_box_utils.stringify_bounding_boxes)
pitch_names: A string containing the pitch names of the neume components in the n-gram,
separated by underscores.
contour: A string containing the contours of the neume components in the n-gram, separated
by underscores.
semitone_interval: A string containing the semitone intervals between the neume components
in the n-gram, separated by underscores.
neume_names: A string containing the names of the neumes in the n-gram,
separated by underscores. This field is not required, and is only present when
the n-gram contains complete neumes.
The following may be part of an NgramDocument, but are optional because
they will be added when the document is indexed:
manuscript_id: The ID of the manuscript the n-gram belongs to.
folio_number: The number of the folio on which the n-gram exists.
id: The unique ID of the document (corresponds to solr schema's id field)
type: The type of the document (corresponds to solr schema's type field)
"""

location: str
pitch_names: str
contour: str
semitone_intervals: str
neume_names: NotRequired[str]
manuscript_id: NotRequired[str]
folio: NotRequired[str]
id: NotRequired[str]
type: NotRequired[Literal["omr_ngram"]]
image_uri: NotRequired[str]
Loading

0 comments on commit 37bf9ae

Please sign in to comment.