Sonoma County Scraper (#32)
Add ca_sonoma_county_sheriff
ochezems authored May 20, 2024
1 parent 2828124 commit 54cb7f8
Showing 2 changed files with 91 additions and 2 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTORS.md
@@ -1,3 +1,3 @@

* Ocheze Amuzie ([@ochezems](https://github.com/ochezems))
* Jordan Rynning ([@jrynning](https://github.com/jrynning))
* Serdar Tumgoren ([@zstumgoren](https://github.com/zstumgoren))
89 changes: 89 additions & 0 deletions clean/ca/sonoma_county_sheriff.py
@@ -0,0 +1,89 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache

class Site:
    """Scrape file metadata and download files for the Sonoma County Sheriff's Office for SB16/SB1421/AB748 data.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Sonoma County Sheriff's Office"
    agency_slug = "ca_sonoma_county_sheriff"

    def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        # The start page contains a list of "detail"/child pages with links to the
        # SB16/SB1421/AB748 videos and files, along with additional index pages
        self.base_url = "https://www.sonomasheriff.org/sb1421"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)
        # Use the module path to construct the agency slug, which we'll use downstream
        # to create a subdirectory inside the main cache directory for this agency's files
        mod = Path(__file__)
        state_postal = mod.parent.stem
        self.cache_suffix = f"{state_postal}_{mod.stem}"  # ca_sonoma_county_sheriff

    def scrape_meta(self, throttle=0):
        """Gather metadata on downloadable assets from the agency's SB1421 index page."""
        # Construct a local filename relative to the cache directory (agency slug + page
        # name, e.g. ca_sonoma_county_sheriff/sb1421.html) and download the index page
        # to the cache if it's not already there
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(filename, self.base_url)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        # TODO: Check with Katey Rusch about whether we need to harvest YouTube briefing links
        body = soup.find("div", class_="main-content")
        links = body.find_all("a")
        for link in links:
            if link.strong:
                payload = {
                    "year": link.find_parent("ul").find_previous_sibling("p").strong.string.replace(":", ""),
                    "parent_page": str(self.base_url),
                    "asset_url": link["href"].replace("dl=0", "dl=1"),
                    "name": link.strong.string,
                }
                metadata.append(payload)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
        """Download the file assets listed in the metadata JSON produced by scrape_meta."""
        metadata = self.cache.read_json(
            self.data_dir.joinpath(f"{self.agency_slug}.json")
        )
        dl_assets = []
        for asset in metadata:
            url = asset["asset_url"]
            dl_path = self._make_download_path(asset)
            time.sleep(throttle)
            dl_assets.append(self.cache.download(str(dl_path), url))
        return dl_assets

    def _make_download_path(self, asset):
        """Construct the local cache path for a downloadable asset."""
        # TODO: Update the logic to gracefully handle PDFs in addition to zip files
        url = asset["asset_url"]
        # PDF links already carry a file extension and only need the `?dl=1` suffix
        # stripped; other links are assumed to be zip archives, so give them a `.zip` extension
        if url.find("pdf?dl=1") == -1:
            outfile = url.split("/")[-1].replace("?dl=1", ".zip")
        else:
            outfile = url.split("/")[-1].replace("?dl=1", "")
        dl_path = Path(self.agency_slug, "assets", outfile)
        return dl_path
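
For reference, a minimal usage sketch, not part of this commit, assuming the class is driven directly from Python (field names in the comments come from the payload dict built in scrape_meta):

from clean.ca.sonoma_county_sheriff import Site

site = Site()
# scrape_meta writes ca_sonoma_county_sheriff.json, a list of dicts with
# "year", "parent_page", "asset_url" and "name" keys
metadata_file = site.scrape_meta()
# scrape reads that JSON back and downloads each asset into the cache
downloaded_paths = site.scrape(throttle=4)
print(metadata_file, len(downloaded_paths))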
