Commit

scraper done
irenecasado committed Jan 23, 2025
1 parent 79dac79 commit 2a3f43e
Showing 1 changed file with 167 additions and 59 deletions.
226 changes: 167 additions & 59 deletions clean/ca/san_diego_county_sheriff.py
@@ -1,9 +1,19 @@
import csv
import json
import logging
import re
import time
from pathlib import Path
from typing import Dict, List

from playwright.sync_api import sync_playwright

from .. import utils
@@ -40,28 +50,30 @@ def agency_slug(self) -> str:
state_postal = mod.parent.stem
return f"{state_postal}_{mod.stem}" # e.g., ca_san_diego_county_sheriff

    def scrape_meta(self, throttle: int = 4) -> Path:
"""
        Download the CSV file, extract request numbers, and scrape metadata.
Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 4.
Returns:
Path: Local path of JSON file containing metadata.
"""
logging.info("Starting metadata scraping process.")

        # Step 1: Download and read the CSV file
        request_numbers = self.download_and_parse_csv()

        # Step 2: Scrape metadata for each request number
        metadata = self.scrape_for_request_numbers(request_numbers, throttle)

# Write metadata to a JSON file
outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
@@ -71,40 +83,61 @@ def scrape_meta(self, throttle: int = 0) -> Path:
logging.info(f"Metadata written to {outfile}")
return outfile

    def download_and_parse_csv(self) -> List[str]:
"""
        Download the CSV file, parse it, and extract request numbers.
Returns:
            List[str]: A list of request numbers extracted from the CSV file.
"""
        logging.info("Downloading CSV file.")

with sync_playwright() as p:
browser = p.chromium.launch(headless=False)
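            # accept_downloads=True tells Playwright to keep the exported file instead of discarding it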
context = browser.new_context(accept_downloads=True)
page = context.new_page()

try:
# Navigate to the disclosure page
logging.debug(f"Navigating to {self.disclosure_url}.")
page.goto(self.disclosure_url)
page.wait_for_load_state("networkidle", timeout=10000)

# Locate and click the dropdown arrow to export formats
dropdown_arrow = page.locator("#gridView_DXCTMenu0_DXI1_T")
dropdown_arrow.click()
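                # Brief pause so the export menu has time to render before selecting a format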
page.wait_for_timeout(1000)

# Select and click the CSV option
csv_option = page.locator("text=CSV")
csv_button = page.get_by_role("menuitem").filter(has=csv_option)
csv_button.click()

# Wait for the file to download
download = page.wait_for_event("download")
download_path = (
self.cache_dir / "san_diego_county_sheriff_cases_012325.csv"
)
download.save_as(str(download_path))

logging.info(f"CSV file downloaded to {download_path}.")

# Read and parse the CSV file
with open(download_path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
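                    # Assumes the export includes a "Request Number" column matching the grid header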
request_numbers = [row["Request Number"] for row in reader]

logging.info(f"Extracted {len(request_numbers)} request numbers.")
return request_numbers

except Exception as e:
logging.warning(f"Error while loading the initial page: {e}")
logging.error(f"Error during CSV download or parsing: {e}")
return []

            finally:
                browser.close()

@@ -189,49 +222,124 @@ def get_detail_page_links(
    def scrape_for_request_numbers(
        self, request_numbers: List[str], throttle: int
    ) -> List[MetadataDict]:
        """
        Scrape data for each request number.

Args:
            request_numbers (List[str]): List of request numbers to search for.
throttle (int): Number of seconds to wait between requests.

Returns:
            List[MetadataDict]: A list of metadata dictionaries for all request numbers.
"""
metadata: List[MetadataDict] = []
with sync_playwright() as p:
            browser = p.chromium.launch(headless=False)
page = browser.new_page()
            try:
                for request_number in request_numbers:
logging.debug(f"Processing request number: {request_number}")
# Reload the disclosure page for each request number
page.goto(self.disclosure_url)
page.wait_for_load_state("networkidle", timeout=10000)
                    try:
search_input = page.locator("input#txtRefsearch_I")
search_input.fill(request_number)
search_input.press("Enter")
# Wait for results to load
page.wait_for_selector(
"i.fa.fa-arrow-circle-o-right", timeout=15000
)
time.sleep(throttle)
# Navigate to the detailed page and extract metadata
page_metadata = self.scrape_detail_page(
page, throttle, request_number
)
metadata.extend(page_metadata)
except Exception as e:
logging.error(
f"Failed to process request number {request_number}: {e}"
)
time.sleep(throttle)
finally:
browser.close()
logging.info(f"Extracted metadata for {len(metadata)} detail links.")
        return metadata

    def scrape_detail_page(
self, page, throttle: int, request_number: str
) -> List[MetadataDict]:
"""
Navigate to the detailed page and extract metadata.

Args:
page: The Playwright page object.
throttle (int): Number of seconds to wait between actions.
request_number (str): The request number being processed.

Returns:
List[MetadataDict]: A list of metadata dictionaries from the detail page.
"""
page_metadata = []

try:
# Click the arrow icon to go to the detail page
detail_icon = page.locator("i.fa.fa-arrow-circle-o-right").first
detail_icon.click()
page.wait_for_load_state("networkidle", timeout=10000)

# Extract metadata from the detail page
detail_links = page.locator("a.qac_link")
for i in range(detail_links.count()):
try:
onclick_attr = detail_links.nth(i).get_attribute("onclick")
if onclick_attr:
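                        # The onclick handler embeds the asset URL as a quoted https:// string; pull it out with a regex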
url_match = re.search(r'"(https?://[^"\s]+)"', onclick_attr)
name = detail_links.nth(i).inner_text().strip()
if url_match:
href = url_match.group(1)
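                            # The href is percent-encoded; the path segment after %2Fpublicrecords%2F doubles as a readable title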
case_id_match = re.search(
r"%2Fpublicrecords%2F([^%]+)", href
)
title = (
case_id_match.group(1)
if case_id_match
else "Unknown Title"
)
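                            # Use the digits from the link text as the case id, falling back to the request number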
case_id = "".join(filter(str.isdigit, name))
if not case_id:
case_id = request_number
page_metadata.append(
{
"asset_url": href,
"case_id": case_id,
"name": name + ".zip",
"title": title,
"parent_page": page.url,
}
)
except Exception as e:
logging.warning(f"Error extracting detail link {i}: {e}")

# Go back to the main page
page.goto(self.disclosure_url)
page.wait_for_load_state("networkidle", timeout=10000)
time.sleep(throttle)

except Exception as e:
logging.error(f"Error navigating to the detail page: {e}")

return page_metadata
