feat: download youtube videos & playlists; remove print stmts
newsroomdev committed Jul 1, 2024
1 parent d987346 commit 6173978
Showing 6 changed files with 334 additions and 396 deletions.
1 change: 1 addition & 0 deletions Pipfile
@@ -36,6 +36,7 @@ retry = "*"
 urllib3 = "1.26.18" # pegged to avoid test issue
 typing-extensions = "*"
 us = "*"
+pytube = "*"
 
 [pipenv]
 allow_prereleases = false
631 changes: 256 additions & 375 deletions Pipfile.lock

Large diffs are not rendered by default.
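
As context for the diffs below, here is a minimal sketch of the pytube calls the new helper code relies on (the watch URL is a hypothetical placeholder; this snippet is illustrative and not part of the commit):

from pytube import YouTube

video = YouTube("https://www.youtube.com/watch?v=VIDEO_ID")  # hypothetical URL
stream = video.streams.get_highest_resolution()  # best progressive stream, or None
if stream:
    print(stream.url)  # direct stream URL, fetchable with a plain HTTP GET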

10 changes: 3 additions & 7 deletions clean/ca/sacramento_pd.py
@@ -1,7 +1,6 @@
 import copy
 import time
 from pathlib import Path
-from pprint import pprint
 from typing import List
 from urllib.parse import urlparse

@@ -104,7 +103,7 @@ def _create_metadata_json(self) -> Path:
         html = self.cache.read(html_location)
         soup = BeautifulSoup(html, "html.parser")
         lists = soup.select("#container-392a98e5b6 .paragraph li")
-        print(f"retrieved {len(lists)} lists")
+
         for url in self._extract_index_urls(lists):
             links.append(
                 {
@@ -114,12 +113,10 @@ def _create_metadata_json(self) -> Path:
"name": url["name"],
}
)
print(f"extracted {len(links)} links from parent page")

metadata = self._extract_child_links(links)
print(f"collected {len(metadata)} links total")
outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
pprint(self._contains_repeated_asset_url(metadata))
self._test_repeated_asset_url(metadata)
self.cache.write_json(outfile, metadata)
return outfile

@@ -143,7 +140,6 @@ def _extract_child_links(self, links: List[MetadataDict]) -> List[MetadataDict]:
                 file_stem = url_split[-1]
                 soup = self._download_and_parse(link["asset_url"], file_stem)
                 photo_links = self._extract_photos(soup, file_stem, link)
-                print(f"\t extending links by {len(photo_links)}")
                 modified_links.extend(photo_links)
             else:
                 modified_links.append(
@@ -272,7 +268,7 @@ def _extract_index_urls(self, lists: ResultSet[Tag]):
"href": href_str,
}

def _contains_repeated_asset_url(self, objects: List[MetadataDict]):
def _test_repeated_asset_url(self, objects: List[MetadataDict]):
"""
Check if the given list of objects contains any repeated asset URLs and returns them.
Expand Down
1 change: 0 additions & 1 deletion clean/ca/sonoma_county_sheriff.py
@@ -60,7 +60,6 @@ def scrape_meta(self, throttle=0):
"name": link.strong.string,
}
metadata.append(payload)
print(metadata)
outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
self.cache.write_json(outfile, metadata)
return outfile
30 changes: 18 additions & 12 deletions clean/cache.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 from typing import Union
 
-from .utils import MetadataDict, get_url
+from .utils import MetadataDict, get_url, get_youtube_url
 
 logger = logging.getLogger(__name__)

@@ -109,22 +109,28 @@ def download(
         # Open the local Path
         local_path = Path(self.path, name)
         local_path.parent.mkdir(parents=True, exist_ok=True)
+        url_queue = [url]
         # Request the URL
         if not force and self.exists(name):
             logger.debug(f"File found in cache: {local_path}")
             return local_path
 
-        with get_url(url, stream=True, **kwargs) as r:
-            # If there's no encoding, set it
-            if encoding:
-                r.encoding = encoding
-            elif r.encoding is None:
-                r.encoding = "utf-8"
-            logger.debug(f"Downloading {url} to {local_path}")
-            # Write out the file in little chunks
-            with open(local_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
+        if "youtube" in url:
+            logger.debug("Detected Youtube URL")
+            url_queue = get_youtube_url(url)
+
+        for url in url_queue:
+            with get_url(url, stream=True, **kwargs) as r:
+                # If there's no encoding, set it
+                if encoding:
+                    r.encoding = encoding
+                elif r.encoding is None:
+                    r.encoding = "utf-8"
+                logger.debug(f"Downloading {url} to {local_path}")
+                # Write out the file in little chunks
+                with open(local_path, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
         # Return the path
         return local_path
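
With this change, download() expands a YouTube link into one or more stream URLs before fetching, while any other URL is downloaded exactly as before. A rough usage sketch (the constructor call, slug, and URLs are assumptions, not part of the diff):

cache = Cache()  # assuming the default cache directory
# A plain asset URL streams straight to disk, as before.
cache.download("ca_sacramento_pd/report.pdf", "https://example.com/report.pdf")
# A YouTube URL is first expanded via get_youtube_url(), then each
# resulting stream URL is written out in 8 KB chunks.
cache.download("ca_sacramento_pd/video.mp4", "https://www.youtube.com/watch?v=VIDEO_ID")

Note that every URL in url_queue writes to the same local_path, so for a multi-video playlist the last stream downloaded is the one left on disk.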

57 changes: 56 additions & 1 deletion clean/utils.py
@@ -4,10 +4,12 @@
 import os
 from pathlib import Path
 from time import sleep
-from typing import Literal, Optional, TypedDict
+from typing import List, Literal, Optional, TypedDict
+from urllib.parse import parse_qs, urlparse
 
 import requests
 import us  # type: ignore
+from pytube import Playlist, YouTube  # type: ignore
 from retry import retry
 
 logger = logging.getLogger(__name__)
@@ -200,3 +202,56 @@ def get_url(

     # Return the response
     return response
+
+
+def get_youtube_url(url: str) -> List[str]:
+    """Resolve a YouTube video or playlist URL into the stream URL(s) to be downloaded.
+
+    Args:
+        url (str): The URL of the video or playlist to download
+    """
+    logger.debug(f"Requesting YouTube {url}")
+    stream_urls = []
+
+    try:
+        if is_youtube_playlist(url):
+            logger.debug("Detected Youtube playlist, fetching URLs")
+            playlist = Playlist(url)
+            for video in playlist.videos:
+                stream = video.streams.get_highest_resolution()
+                if stream:
+                    stream_urls.append(stream.url)
+        else:
+            logger.debug("Detected Youtube video, fetching URL")
+            video = YouTube(url)
+            stream = video.streams.get_highest_resolution()
+            if stream:
+                stream_urls.append(stream.url)
+    except Exception as e:
+        logger.error(f"Error fetching YouTube content: {e}")
+
+    return stream_urls


+def is_youtube_playlist(url: str) -> bool:
+    """
+    Check if the given URL is a YouTube playlist URL.
+
+    Args:
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL is a playlist URL, False otherwise.
+    """
+    parsed_url = urlparse(url)
+    query_params = parse_qs(parsed_url.query)
+
+    # Check if 'list' query parameter exists
+    if "list" in query_params:
+        return True
+
+    # Check if URL path contains '/playlist'
+    if "/playlist" in parsed_url.path:
+        return True
+
+    return False
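
Taken together, a quick sketch of how the two helpers behave (video and playlist IDs are hypothetical):

# Single video: at most one stream URL; pytube errors are logged and yield [].
get_youtube_url("https://www.youtube.com/watch?v=VIDEO_ID")
# Playlist: one stream URL per video in the playlist.
get_youtube_url("https://www.youtube.com/playlist?list=PLAYLIST_ID")

is_youtube_playlist("https://www.youtube.com/playlist?list=PLAYLIST_ID")  # True: '/playlist' in path
is_youtube_playlist("https://www.youtube.com/watch?v=VIDEO_ID&list=PLAYLIST_ID")  # True: 'list' param
is_youtube_playlist("https://www.youtube.com/watch?v=VIDEO_ID")  # False

Because a watch URL that carries a list parameter is classified as a playlist, get_youtube_url() resolves the entire playlist for such links.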
