
Commit

Merge branch 'ri-gh-413-fix-scraper' of https://github.com/chriszs/wa…
palewire committed Feb 5, 2022
2 parents 72247a6 + f87f7df commit 41ca38c
Showing 1 changed file with 28 additions and 28 deletions.
56 changes: 28 additions & 28 deletions warn/scrapers/ri.py
@@ -2,12 +2,13 @@
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+from openpyxl import load_workbook
 
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212"]
-__tags__ = ["html"]
+__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "chriszs"]
+__tags__ = ["excel"]
 
 logger = logging.getLogger(__name__)
 
@@ -28,14 +29,35 @@ def scrape(
     # Open the cache
     cache = Cache(cache_dir)
 
+    state_code = "ri"
+
     # Get the HTML
-    url = "https://dlt.ri.gov/wds/warn/"
+    base_url = "https://dlt.ri.gov/"
+    url = f"{base_url}/employers/worker-adjustment-and-retraining-notification-warn"
     r = utils.get_url(url)
     html = r.text
-    cache.write("ri/source.html", html)
+    cache.write(f"{state_code}/source.html", html)
 
-    # Scrape out the data
-    row_list = _parse_table(html)
+    # Find links
+    soup = BeautifulSoup(html, "html.parser")
+    links = soup.find_all("a")
+
+    row_list = []
+
+    for link in links:
+        if "WARN Report" in link.text:
+            excel_url = f"{base_url}{link.get('href')}"
+            excel_path = cache.download(f"{state_code}/WARN Report.xlsx", excel_url)
+
+    # Open it up
+    workbook = load_workbook(filename=excel_path)
+
+    # Get the first sheet
+    worksheet = workbook.worksheets[0]
+
+    for r in list(worksheet.rows)[2:]:
+        column = [cell.value for cell in r]
+        row_list.append(column)
 
     # Write out
     data_path = data_dir / "ri.csv"
@@ -45,27 +67,5 @@
     return data_path
 
 
-def _parse_table(html) -> list:
-    # Parse table
-    soup = BeautifulSoup(html, "html.parser")
-    table_list = soup.find_all("table")
-
-    # We expect the first table to be there with our data
-    assert len(table_list) > 0
-    table = table_list[0]
-
-    # Parse the cells
-    row_list = []
-    for row in table.find_all("tr"):
-        cell_list = row.find_all(["th", "td"])
-        if not cell_list:
-            continue
-        cell_list = [c.text.strip() for c in cell_list]
-        row_list.append(cell_list)
-
-    # Return it
-    return row_list
-
-
 if __name__ == "__main__":
     scrape()
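
For readers skimming the diff, the new flow boils down to: fetch the WARN page, find the link whose text contains "WARN Report", download the linked workbook, and read it with openpyxl. Below is a minimal standalone sketch of that pattern, using requests directly in place of the repo's utils and Cache helpers (an assumption for illustration; the local file name is hypothetical).

    import requests
    from bs4 import BeautifulSoup
    from openpyxl import load_workbook

    base_url = "https://dlt.ri.gov/"
    url = f"{base_url}/employers/worker-adjustment-and-retraining-notification-warn"

    # Fetch the WARN page and locate the "WARN Report" link
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    excel_url = None
    for link in soup.find_all("a"):
        if "WARN Report" in link.text:
            excel_url = f"{base_url}{link.get('href')}"

    # Download the workbook to a local file (hypothetical name)
    with open("warn_report.xlsx", "wb") as f:
        f.write(requests.get(excel_url).content)

    # Parse the first sheet, skipping the first two rows as the scraper does
    workbook = load_workbook(filename="warn_report.xlsx")
    worksheet = workbook.worksheets[0]
    row_list = [[cell.value for cell in row] for row in list(worksheet.rows)[2:]]
    print(f"Parsed {len(row_list)} rows")

The [2:] slice mirrors the committed loop, which skips the worksheet's first two rows (presumably a title and header) before collecting cell values.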
