Refactor lyrics tests, do not search for empty metadata (#5452)

## Description Fixes #2635 Fixes #5133 I realised that #5406 has gotten too big, thus I'm splitting it into several smaller PRs. This PR refactors lyrics plugin tests and fixes an empty metadata issue in the lyrics logic. #### CI - Added `--extras=lyrics` to the Poetry install command to include the lyrics plugin dependencies. - In the main task which measures coverage, set `LYRICS_UPDATED` environment variable based on changes detected in the lyrics files. #### Test setup - Introduced `ConfigMixin` to centralize configuration setup for tests, reducing redundancy. This can be used by tests based on `pytest`. #### Lyrics logic - Trimmed whitespace from `item.title`, `item.artist`, and `item.artist_sort` in `search_pairs` function. - Added checks to avoid searching for lyrics if either the artist or title is missing. - Improved `_scrape_strip_cruft` function to remove Google Ads tags and unnecessary HTML tags. #### Lyrics tests overhaul - Migrated lyrics tests to use `pytest` for better isolation and configuration management. - Deleted redundant lyrics text files and some unused utils. - Marked tests that should only run when lyrics source code is updated (`LYRICS_UPDATED` is set from the CI) using the `on_lyrics_update` marker. #### Documentation and Dependencies - Added `requests-mock` version `1.12.1` to `pyproject.toml` and `poetry.lock` for mocking HTTP requests in tests. - Updated `setup.cfg` to include a new marker `on_lyrics_update`.
beetbox · Jan 19, 2025 · 38c8209 · 38c8209
2 parents bd30439 + e5c006d
commit 38c8209
Show file tree

Hide file tree

Showing 13 changed files with 1,005 additions and 3,081 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -35,6 +35,14 @@ jobs:
           sudo apt update
           sudo apt install ffmpeg gobject-introspection libcairo2-dev libgirepository1.0-dev pandoc
 
+      - name: Get changed lyrics files
+        id: lyrics-update
+        uses: tj-actions/changed-files@v45
+        with:
+          files: |
+            beetsplug/lyrics.py
+            test/plugins/test_lyrics.py
+
       - name: Add pytest annotator
         uses: liskin/gh-problem-matcher-wrap@v3
         with:
@@ -44,13 +52,15 @@ jobs:
       - if: ${{ env.IS_MAIN_PYTHON != 'true' }}
         name: Test without coverage
         run: |
-          poetry install --extras=autobpm
+          poetry install --extras=autobpm --extras=lyrics
           poe test
 
       - if: ${{ env.IS_MAIN_PYTHON == 'true' }}
         name: Test with coverage
+        env:
+          LYRICS_UPDATED: ${{ steps.lyrics-update.outputs.any_changed }}
         run: |
-          poetry install --extras=autobpm --extras=docs --extras=replaygain --extras=reflink
+          poetry install --extras=autobpm --extras=lyrics --extras=docs --extras=replaygain --extras=reflink
           poe docs
           poe test-with-coverage
 

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -382,7 +382,7 @@ to get a basic view on how tests are written. Since we are currently migrating
 the tests from `unittest`_ to `pytest`_, new tests should be written using
 `pytest`_. Contributions migrating existing tests are welcome!
 
-External API requests under test should be mocked with `requests_mock`_,
+External API requests under test should be mocked with `requests-mock`_,
 However, we still want to know whether external APIs are up and that they
 return expected responses, therefore we test them weekly with our `integration
 test`_ suite.

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
@@ -26,12 +26,19 @@
 import unicodedata
 import warnings
 from functools import partial
-from typing import ClassVar
+from typing import TYPE_CHECKING, ClassVar
 from urllib.parse import quote, urlencode
 
 import requests
 from unidecode import unidecode
 
+import beets
+from beets import plugins, ui
+
+if TYPE_CHECKING:
+    from beets.importer import ImportTask
+    from beets.library import Item
+
 try:
     import bs4
     from bs4 import SoupStrainer
@@ -47,10 +54,6 @@
 except ImportError:
     HAS_LANGDETECT = False
 
-
-import beets
-from beets import plugins, ui
-
 DIV_RE = re.compile(r"<(/?)div>?", re.I)
 COMMENT_RE = re.compile(r"<!--.*-->", re.S)
 TAG_RE = re.compile(r"<[^>]*>")
@@ -152,7 +155,13 @@ def generate_alternatives(string, patterns):
                 alternatives.append(match.group(1))
         return alternatives
 
-    title, artist, artist_sort = item.title, item.artist, item.artist_sort
+    title, artist, artist_sort = (
+        item.title.strip(),
+        item.artist.strip(),
+        item.artist_sort.strip(),
+    )
+    if not title or not artist:
+        return ()
 
     patterns = [
         # Remove any featuring artists from the artists name
@@ -161,7 +170,7 @@ def generate_alternatives(string, patterns):
     artists = generate_alternatives(artist, patterns)
     # Use the artist_sort as fallback only if it differs from artist to avoid
     # repeated remote requests with the same search terms
-    if artist != artist_sort:
+    if artist_sort and artist.lower() != artist_sort.lower():
         artists.append(artist_sort)
 
     patterns = [
@@ -222,7 +231,7 @@ def __init__(self, config, log):
         self._log = log
         self.config = config
 
-    def fetch_url(self, url):
+    def fetch_url(self, url, **kwargs):
         """Retrieve the content at a given URL, or return None if the source
         is unreachable.
         """
@@ -240,6 +249,7 @@ def fetch_url(self, url):
                         "User-Agent": USER_AGENT,
                     },
                     timeout=10,
+                    **kwargs,
                 )
         except requests.RequestException as exc:
             self._log.debug("lyrics request failed: {0}", exc)
@@ -250,20 +260,27 @@ def fetch_url(self, url):
             self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
             return None
 
-    def fetch(self, artist, title, album=None, length=None):
-        raise NotImplementedError()
+    def fetch(
+        self, artist: str, title: str, album: str, length: int
+    ) -> str | None:
+        raise NotImplementedError
 
 
 class LRCLib(Backend):
     base_url = "https://lrclib.net/api/get"
 
-    def fetch(self, artist, title, album=None, length=None):
-        params = {
+    def fetch(
+        self, artist: str, title: str, album: str, length: int
+    ) -> str | None:
+        params: dict[str, str | int] = {
             "artist_name": artist,
             "track_name": title,
-            "album_name": album,
-            "duration": length,
         }
+        if album:
+            params["album_name"] = album
+
+        if length:
+            params["duration"] = length
 
         try:
             response = requests.get(
@@ -316,7 +333,7 @@ def encode(cls, text: str) -> str:
 
         return quote(unidecode(text))
 
-    def fetch(self, artist, title, album=None, length=None):
+    def fetch(self, artist: str, title: str, *_) -> str | None:
         url = self.build_url(artist, title)
 
         html = self.fetch_url(url)
@@ -364,7 +381,7 @@ def __init__(self, config, log):
             "User-Agent": USER_AGENT,
         }
 
-    def fetch(self, artist, title, album=None, length=None):
+    def fetch(self, artist: str, title: str, *_) -> str | None:
         """Fetch lyrics from genius.com
 
         Because genius doesn't allow accessing lyrics via the api,
@@ -495,7 +512,7 @@ class Tekstowo(DirectBackend):
     def encode(cls, text: str) -> str:
         return cls.non_alpha_to_underscore(unidecode(text.lower()))
 
-    def fetch(self, artist, title, album=None, length=None):
+    def fetch(self, artist: str, title: str, *_) -> str | None:
         if html := self.fetch_url(self.build_url(artist, title)):
             return self.extract_lyrics(html)
 
@@ -536,6 +553,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
     html = BREAK_RE.sub("\n", html)  # <br> eats up surrounding '\n'.
     html = re.sub(r"(?s)<(script).*?</\1>", "", html)  # Strip script tags.
     html = re.sub("\u2005", " ", html)  # replace unicode with regular space
+    html = re.sub("<aside .+?</aside>", "", html)  # remove Google Ads tags
+    html = re.sub(r"</?(em|strong)[^>]*>", "", html)  # remove italics / bold
 
     if plain_text_out:  # Strip remaining HTML tags
         html = COMMENT_RE.sub("", html)
@@ -586,11 +605,7 @@ class Google(Backend):
     """Fetch lyrics from Google search results."""
 
     REQUIRES_BS = True
-
-    def __init__(self, config, log):
-        super().__init__(config, log)
-        self.api_key = config["google_API_key"].as_str()
-        self.engine_id = config["google_engine_ID"].as_str()
+    SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
 
     def is_lyrics(self, text, artist=None):
         """Determine whether the text seems to be valid lyrics."""
@@ -667,15 +682,14 @@ def is_page_candidate(self, url_link, url_title, title, artist):
         ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
         return ratio >= typo_ratio
 
-    def fetch(self, artist, title, album=None, length=None):
-        query = f"{artist} {title}"
-        url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s" % (
-            self.api_key,
-            self.engine_id,
-            quote(query.encode("utf-8")),
-        )
+    def fetch(self, artist: str, title: str, *_) -> str | None:
+        params = {
+            "key": self.config["google_API_key"].as_str(),
+            "cx": self.config["google_engine_ID"].as_str(),
+            "q": f"{artist} {title}",
+        }
 
-        data = self.fetch_url(url)
+        data = self.fetch_url(self.SEARCH_URL, params=params)
         if not data:
             self._log.debug("google backend returned no data")
             return None
@@ -877,10 +891,7 @@ def func(lib, opts, args):
             for item in items:
                 if not opts.local_only and not self.config["local"]:
                     self.fetch_item_lyrics(
-                        lib,
-                        item,
-                        write,
-                        opts.force_refetch or self.config["force"],
+                        item, write, opts.force_refetch or self.config["force"]
                     )
                 if item.lyrics:
                     if opts.printlyr:
@@ -966,15 +977,13 @@ def writerest_indexes(self, directory):
             with open(conffile, "w") as output:
                 output.write(REST_CONF_TEMPLATE)
 
-    def imported(self, session, task):
+    def imported(self, _, task: ImportTask) -> None:
         """Import hook for fetching lyrics automatically."""
         if self.config["auto"]:
             for item in task.imported_items():
-                self.fetch_item_lyrics(
-                    session.lib, item, False, self.config["force"]
-                )
+                self.fetch_item_lyrics(item, False, self.config["force"])
 
-    def fetch_item_lyrics(self, lib, item, write, force):
+    def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
         """Fetch and store lyrics for a single item. If ``write``, then the
         lyrics will also be written to the file itself.
         """
@@ -983,18 +992,17 @@ def fetch_item_lyrics(self, lib, item, write, force):
             self._log.info("lyrics already present: {0}", item)
             return
 
-        lyrics = None
-        album = item.album
-        length = round(item.length)
+        lyrics_matches = []
+        album, length = item.album, round(item.length)
         for artist, titles in search_pairs(item):
-            lyrics = [
-                self.get_lyrics(artist, title, album=album, length=length)
+            lyrics_matches = [
+                self.get_lyrics(artist, title, album, length)
                 for title in titles
             ]
-            if any(lyrics):
+            if any(lyrics_matches):
                 break
 
-        lyrics = "\n\n---\n\n".join(filter(None, lyrics))
+        lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
 
         if lyrics:
             self._log.info("fetched lyrics: {0}", item)
@@ -1019,18 +1027,20 @@ def fetch_item_lyrics(self, lib, item, write, force):
             item.try_write()
         item.store()
 
-    def get_lyrics(self, artist, title, album=None, length=None):
+    def get_lyrics(self, artist: str, title: str, *args) -> str | None:
         """Fetch lyrics, trying each source in turn. Return a string or
         None if no lyrics were found.
         """
         for backend in self.backends:
-            lyrics = backend.fetch(artist, title, album=album, length=length)
+            lyrics = backend.fetch(artist, title, *args)
             if lyrics:
                 self._log.debug(
                     "got lyrics from backend: {0}", backend.__class__.__name__
                 )
                 return _scrape_strip_cruft(lyrics, True)
 
+        return None
+
     def append_translation(self, text, to_lang):
         from xml.etree import ElementTree
 

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -41,6 +41,13 @@ Bug fixes:
 * Fix ambiguous column name ``sqlite3.OperationalError`` that occured in album
   queries that filtered album track titles, for example ``beet list -a keyword
   title:foo``.
+* :doc:`plugins/lyrics`: Rewrite lyrics tests using pytest to provide isolated
+  configuration for each test case. This fixes the issue where some tests
+  failed because they read developers' local lyrics configuration.
+  :bug:`5133`
+* :doc:`plugins/lyrics`: Do not attempt to search for lyrics if either the
+  artist or title is missing and ignore ``artist_sort`` value if it is empty.
+  :bug:`2635`
 
 For packagers:
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -92,6 +92,7 @@ python3-discogs-client = ">=2.3.15"
 py7zr = "*"
 pyxdg = "*"
 rarfile = "*"
+requests-mock = ">=1.12.1"
 requests_oauthlib = "*"
 responses = ">=0.3.0"
 

diff --git a/setup.cfg b/setup.cfg
@@ -8,6 +8,7 @@ addopts =
     -ra
     --strict-config
 markers =
+    on_lyrics_update: mark a test to run only after lyrics source code is updated
     integration_test: mark a test as an integration test
 
 [coverage:run]

diff --git a/test/conftest.py b/test/conftest.py
@@ -3,10 +3,21 @@
 import pytest
 
 
-def pytest_runtest_setup(item: pytest.Item):
-    """Skip integration tests if INTEGRATION_TEST environment variable is not set."""
-    if os.environ.get("INTEGRATION_TEST"):
-        return
+def skip_marked_items(items: list[pytest.Item], marker_name: str, reason: str):
+    for item in (i for i in items if i.get_closest_marker(marker_name)):
+        test_name = item.nodeid.split("::", 1)[-1]
+        item.add_marker(pytest.mark.skip(f"{reason}: {test_name}"))
 
-    if next(item.iter_markers(name="integration_test"), None):
-        pytest.skip(f"INTEGRATION_TEST=1 required: {item.nodeid}")
+
+def pytest_collection_modifyitems(
+    config: pytest.Config, items: list[pytest.Item]
+):
+    if not os.environ.get("INTEGRATION_TEST") == "true":
+        skip_marked_items(
+            items, "integration_test", "INTEGRATION_TEST=1 required"
+        )
+
+    if not os.environ.get("LYRICS_UPDATED") == "true":
+        skip_marked_items(
+            items, "on_lyrics_update", "No change in lyrics source code"
+        )