From 7190f2227fcd1a91ddfe37c2522c4f2fe1a4c4e4 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 7 Aug 2024 18:43:02 -0600 Subject: [PATCH 1/2] fix(scrape_pacer_free): correct order of ocr_available function params --- .../management/commands/scrape_pacer_free_opinions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index ff96862bc4..f31ee2ad45 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -360,7 +360,7 @@ def do_everything(courts, date_start, date_end, index, queue): logger.info("Getting PDFs from free document reports") get_pdfs(courts, date_start, date_end, index, queue) logger.info("Doing OCR and saving items to Solr.") - ocr_available(index, queue) + ocr_available(queue, index) class Command(VerboseCommand): From 459084b5aa68f7f2da0d9af89c646c5012c2fabd Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 7 Aug 2024 19:01:01 -0600 Subject: [PATCH 2/2] feat(scrape_pacer_free): log number of documents requiring OCR --- .../management/commands/scrape_pacer_free_opinions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index f31ee2ad45..6aa4d76765 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -327,6 +327,7 @@ def ocr_available(queue: str, index: bool) -> None: .order_by() ) count = rds.count() + logger.info(f"Total documents requiring OCR: {count}") throttle = CeleryThrottle(queue_name=q) for i, pk in enumerate(rds): throttle.maybe_wait()