feat: download youtube videos & playlists; remove print stmts
newsroomdev committed Jul 1, 2024
1 parent d987346 commit 6173978
Showing 6 changed files with 334 additions and 396 deletions.
1 change: 1 addition & 0 deletions Pipfile
@@ -36,6 +36,7 @@ retry = "*"
 urllib3 = "1.26.18" # pegged to avoid test issue
 typing-extensions = "*"
 us = "*"
+pytube = "*"
 
 [pipenv]
 allow_prereleases = false
631 changes: 256 additions & 375 deletions Pipfile.lock

Large diffs are not rendered by default.
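
As context for the diffs below, here is a minimal sketch of the pytube calls the new helper code relies on (the watch URL is a hypothetical placeholder; this snippet is illustrative and not part of the commit):

from pytube import YouTube

video = YouTube("https://www.youtube.com/watch?v=VIDEO_ID")  # hypothetical URL
stream = video.streams.get_highest_resolution()  # best progressive stream, or None
if stream:
    print(stream.url)  # direct stream URL, fetchable with a plain HTTP GET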

10 changes: 3 additions & 7 deletions clean/ca/sacramento_pd.py
@@ -1,7 +1,6 @@
 import copy
 import time
 from pathlib import Path
-from pprint import pprint
 from typing import List
 from urllib.parse import urlparse

@@ -104,7 +103,7 @@ def _create_metadata_json(self) -> Path:
         html = self.cache.read(html_location)
         soup = BeautifulSoup(html, "html.parser")
         lists = soup.select("#container-392a98e5b6 .paragraph li")
-        print(f"retrieved {len(lists)} lists")
+
         for url in self._extract_index_urls(lists):
             links.append(
                 {
@@ -114,12 +113,10 @@ def _create_metadata_json(self) -> Path:
"name": url["name"],
}
)
print(f"extracted {len(links)} links from parent page")

metadata = self._extract_child_links(links)
print(f"collected {len(metadata)} links total")
outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
pprint(self._contains_repeated_asset_url(metadata))
self._test_repeated_asset_url(metadata)
self.cache.write_json(outfile, metadata)
return outfile

@@ -143,7 +140,6 @@ def _extract_child_links(self, links: List[MetadataDict]) -> List[MetadataDict]:
                 file_stem = url_split[-1]
                 soup = self._download_and_parse(link["asset_url"], file_stem)
                 photo_links = self._extract_photos(soup, file_stem, link)
-                print(f"\t extending links by {len(photo_links)}")
                 modified_links.extend(photo_links)
             else:
                 modified_links.append(
@@ -272,7 +268,7 @@ def _extract_index_urls(self, lists: ResultSet[Tag]):
"href": href_str,
}

def _contains_repeated_asset_url(self, objects: List[MetadataDict]):
def _test_repeated_asset_url(self, objects: List[MetadataDict]):
"""
Check if the given list of objects contains any repeated asset URLs and returns them.
Expand Down
1 change: 0 additions & 1 deletion clean/ca/sonoma_county_sheriff.py
@@ -60,7 +60,6 @@ def scrape_meta(self, throttle=0):
"name": link.strong.string,
}
metadata.append(payload)
print(metadata)
outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
self.cache.write_json(outfile, metadata)
return outfile
30 changes: 18 additions & 12 deletions clean/cache.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 from typing import Union
 
-from .utils import MetadataDict, get_url
+from .utils import MetadataDict, get_url, get_youtube_url
 
 logger = logging.getLogger(__name__)

@@ -109,22 +109,28 @@ def download(
         # Open the local Path
         local_path = Path(self.path, name)
         local_path.parent.mkdir(parents=True, exist_ok=True)
+        url_queue = [url]
         # Request the URL
         if not force and self.exists(name):
             logger.debug(f"File found in cache: {local_path}")
             return local_path
 
-        with get_url(url, stream=True, **kwargs) as r:
-            # If there's no encoding, set it
-            if encoding:
-                r.encoding = encoding
-            elif r.encoding is None:
-                r.encoding = "utf-8"
-            logger.debug(f"Downloading {url} to {local_path}")
-            # Write out the file in little chunks
-            with open(local_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
+        if "youtube" in url:
+            logger.debug("Detected Youtube URL")
+            url_queue = get_youtube_url(url)
+
+        for url in url_queue:
+            with get_url(url, stream=True, **kwargs) as r:
+                # If there's no encoding, set it
+                if encoding:
+                    r.encoding = encoding
+                elif r.encoding is None:
+                    r.encoding = "utf-8"
+                logger.debug(f"Downloading {url} to {local_path}")
+                # Write out the file in little chunks
+                with open(local_path, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
         # Return the path
         return local_path
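
With this change, download() expands a YouTube link into one or more stream URLs before fetching, while any other URL is downloaded exactly as before. A rough usage sketch (the constructor call, slug, and URLs are assumptions, not part of the diff):

cache = Cache()  # assuming the default cache directory
# A plain asset URL streams straight to disk, as before.
cache.download("ca_sacramento_pd/report.pdf", "https://example.com/report.pdf")
# A YouTube URL is first expanded via get_youtube_url(), then each
# resulting stream URL is written out in 8 KB chunks.
cache.download("ca_sacramento_pd/video.mp4", "https://www.youtube.com/watch?v=VIDEO_ID")

Note that every URL in url_queue writes to the same local_path, so for a multi-video playlist the last stream downloaded is the one left on disk.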

57 changes: 56 additions & 1 deletion clean/utils.py
@@ -4,10 +4,12 @@
 import os
 from pathlib import Path
 from time import sleep
-from typing import Literal, Optional, TypedDict
+from typing import List, Literal, Optional, TypedDict
+from urllib.parse import parse_qs, urlparse
 
 import requests
 import us  # type: ignore
+from pytube import Playlist, YouTube  # type: ignore
 from retry import retry
 
 logger = logging.getLogger(__name__)
@@ -200,3 +202,56 @@ def get_url(

     # Return the response
     return response
+
+
+def get_youtube_url(url: str) -> List[str]:
+    """Resolve a YouTube video or playlist URL into the stream URL(s) to be downloaded.
+
+    Args:
+        url (str): The URL of the video or playlist to download
+    """
+    logger.debug(f"Requesting YouTube {url}")
+    stream_urls = []
+
+    try:
+        if is_youtube_playlist(url):
+            logger.debug("Detected Youtube playlist, fetching URLs")
+            playlist = Playlist(url)
+            for video in playlist.videos:
+                stream = video.streams.get_highest_resolution()
+                if stream:
+                    stream_urls.append(stream.url)
+        else:
+            logger.debug("Detected Youtube video, fetching URL")
+            video = YouTube(url)
+            stream = video.streams.get_highest_resolution()
+            if stream:
+                stream_urls.append(stream.url)
+    except Exception as e:
+        logger.error(f"Error fetching YouTube content: {e}")
+
+    return stream_urls


+def is_youtube_playlist(url: str) -> bool:
+    """
+    Check if the given URL is a YouTube playlist URL.
+
+    Args:
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL is a playlist URL, False otherwise.
+    """
+    parsed_url = urlparse(url)
+    query_params = parse_qs(parsed_url.query)
+
+    # Check if 'list' query parameter exists
+    if "list" in query_params:
+        return True
+
+    # Check if URL path contains '/playlist'
+    if "/playlist" in parsed_url.path:
+        return True
+
+    return False
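
Taken together, a quick sketch of how the two helpers behave (video and playlist IDs are hypothetical):

# Single video: at most one stream URL; pytube errors are logged and yield [].
get_youtube_url("https://www.youtube.com/watch?v=VIDEO_ID")
# Playlist: one stream URL per video in the playlist.
get_youtube_url("https://www.youtube.com/playlist?list=PLAYLIST_ID")

is_youtube_playlist("https://www.youtube.com/playlist?list=PLAYLIST_ID")  # True: '/playlist' in path
is_youtube_playlist("https://www.youtube.com/watch?v=VIDEO_ID&list=PLAYLIST_ID")  # True: 'list' param
is_youtube_playlist("https://www.youtube.com/watch?v=VIDEO_ID")  # False

Because a watch URL that carries a list parameter is classified as a playlist, get_youtube_url() resolves the entire playlist for such links.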
