Merge branch 'main' into 4920-store-unmatched-citations
flooie authored Jan 24, 2025
2 parents 2fa3924 + bf837db commit d3a6a83
Showing 7 changed files with 202 additions and 20 deletions.
13 changes: 9 additions & 4 deletions cl/corpus_importer/signals.py
@@ -119,11 +119,16 @@ def handle_update_latest_case_id_and_schedule_iquery_sweep(
         # pacer_case_id)
         return None
 
-    # Only call update_latest_case_id_and_schedule_iquery_sweep if this is a
-    # new RECAP district or bankruptcy docket with pacer_case_id not added by
-    # iquery sweep tasks.
+    # Only call update_latest_case_id_and_schedule_iquery_sweep if:
+    # - The docket belongs to a RECAP district or bankruptcy court,
+    # - The docket has a pacer_case_id,
+    # - The docket was newly created (when IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=True), or
+    # - The docket was created or updated by the last probe iteration from probe_iquery_pages.
+    check_probe_or_created = (
+        not getattr(instance, "avoid_trigger_signal", False) or created
+    )
     if (
-        created
+        check_probe_or_created
         and instance.pacer_case_id
         and instance.court_id
         in list(

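For reference, the new guard reduces to a small predicate. The sketch below is not part of the commit, and the premise that the iquery tasks set avoid_trigger_signal on the instances they save themselves is an assumption drawn from the comments above.

    # Minimal sketch of the new guard (assumed semantics, not from the commit).
    def should_consider_for_sweep(avoid_trigger_signal: bool, created: bool) -> bool:
        return not avoid_trigger_signal or created

    assert should_consider_for_sweep(False, False)      # probe updated an existing docket
    assert should_consider_for_sweep(False, True)       # brand-new docket
    assert should_consider_for_sweep(True, True)        # created takes precedence
    assert not should_consider_for_sweep(True, False)   # a sweep task's own save is skipped
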
23 changes: 19 additions & 4 deletions cl/corpus_importer/tests.py
@@ -2576,7 +2576,7 @@ def test_update_latest_case_id_and_schedule_iquery_sweep_integration(
             )
             # Probing will add 3 dockets (12, 16, 24) + 2 added for the sweep task (13,18).
             self.assertEqual(
-                dockets.count(), 5, msg="Docket number doesn't match."
+                dockets.count(), 5, msg="Docket count doesn't match."
             )
             # 7 additional PACER HTML files should be stored by now, 3 added by the
             # probing task + 4 added by the sweep task.
@@ -2589,8 +2589,22 @@ def test_update_latest_case_id_and_schedule_iquery_sweep_integration(
 
             ### Integration test probing task + sweep
             # IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED False
+            with override_settings(IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=False):
+                # Create docket pacer_case_id 12, which is the last docket in
+                # the probe. Even though it already exists, it should trigger
+                # a sweep task.
+                DocketFactory(
+                    court=self.court_txed,
+                    source=Docket.RECAP,
+                    case_name="New Incoming Docket 12",
+                    docket_number="2:10-cv-00602",
+                    pacer_case_id="12",
+                )
+
             dockets = Docket.objects.filter(court_id=self.court_txed.pk)
-            self.assertEqual(dockets.count(), 0)
+            self.assertEqual(
+                dockets.count(), 1, msg="Docket count doesn't match for txed."
+            )
             r = get_redis_interface("CACHE")
             # Simulate a highest_known_pacer_case_id = 8
             r.hset("iquery:highest_known_pacer_case_id", self.court_txed.pk, 8)
@@ -2615,9 +2629,10 @@ def test_update_latest_case_id_and_schedule_iquery_sweep_integration(
                 1,
                 msg="Wrong number of sweep task called.",
             )
-            # Probing will add 3 dockets (9,10,12) + 1 added for the sweep task (11).
+            # Probing will add 2 dockets (9,10) + 1 added for the sweep task (11).
+            # Docket 12 already exists; however, it should still trigger the sweep task that adds 11.
             self.assertEqual(
-                dockets.count(), 4, msg="Docket number doesn't match for txed."
+                dockets.count(), 4, msg="Docket count doesn't match for txed."
             )
         finally:
             # Ensure the signal is disconnected after the test

16 changes: 11 additions & 5 deletions cl/recap/mergers.py
@@ -943,8 +943,12 @@ async def add_docket_entries(
         # RDs. The check here ensures that if that happens for a particular
         # entry, we avoid creating the main RD a second+ time when we get the
         # docket sheet a second+ time.
-        appelate_court_id_exists = await ais_appellate_court(d.court_id)
-        if de_created is False and appelate_court_id_exists:
+
+        appellate_court_id_exists = await ais_appellate_court(d.court_id)
+        appellate_rd_att_exists = False
+        if de_created is False and appellate_court_id_exists:
+            # In existing appellate entry merges, check if the entry has at
+            # least one attachment.
             appellate_rd_att_exists = await de.recap_documents.filter(
                 document_type=RECAPDocument.ATTACHMENT
             ).aexists()
@@ -953,14 +957,16 @@ async def add_docket_entries(
             params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
         try:
             get_params = deepcopy(params)
-            if de_created is False and not appelate_court_id_exists:
-                del get_params["document_type"]
+            if de_created is False and not appellate_court_id_exists:
                 get_params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
+            if de_created is False:
+                # Try to match the RD regardless of the document_type.
+                del get_params["document_type"]
             rd = await RECAPDocument.objects.aget(**get_params)
             rds_updated.append(rd)
         except RECAPDocument.DoesNotExist:
             rd = None
-            if de_created is False and not appelate_court_id_exists:
+            if de_created is False and not appellate_court_id_exists:
                 try:
                     # Check for documents with a bad pacer_doc_id
                     rd = await RECAPDocument.objects.aget(**params)

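The ordering of the two new if blocks is the point of the change: the lookup first pins pacer_doc_id, then drops document_type for any re-merge, so a main document that was converted into an attachment in the meantime still matches instead of being created a second time. A runnable sketch with stand-in values (none of these constants come from the commit):

    from copy import deepcopy

    PACER_DOCUMENT, ATTACHMENT = 1, 2   # stand-ins for the RECAPDocument constants
    params = {"pacer_doc_id": "04505578698", "document_type": PACER_DOCUMENT}

    get_params = deepcopy(params)
    del get_params["document_type"]     # re-merge: match regardless of type

    # An RD that a prior PDF upload or attachment-page merge turned into an attachment:
    stored_rd = {"pacer_doc_id": "04505578698", "document_type": ATTACHMENT}
    assert all(stored_rd[k] == v for k, v in get_params.items())
    # With document_type still in the lookup, this match would have failed and
    # a duplicate main RD would have been created.
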
13 changes: 9 additions & 4 deletions cl/recap/tasks.py
@@ -249,10 +249,15 @@ async def process_recap_pdf(pk):
     pq = await ProcessingQueue.objects.aget(pk=pk)
     await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
 
-    if pq.attachment_number is None:
-        document_type = RECAPDocument.PACER_DOCUMENT
-    else:
-        document_type = RECAPDocument.ATTACHMENT
+    document_type = (
+        RECAPDocument.PACER_DOCUMENT
+        if not pq.attachment_number  # This check includes attachment_number set to None or 0
+        else RECAPDocument.ATTACHMENT
+    )
+    # Set attachment_number to None if it is 0
+    pq.attachment_number = (
+        None if not pq.attachment_number else pq.attachment_number
+    )
 
     logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq} ")
     try:

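A quick illustration of the truthiness check above (a sketch, not part of the commit): both None and 0 are falsy in Python, so a PDF upload reporting attachment number 0 is now treated as a main document and its attachment_number is normalized to None.

    for att_no in (None, 0, 2):
        doc_type = "PACER_DOCUMENT" if not att_no else "ATTACHMENT"
        normalized = None if not att_no else att_no
        print(att_no, doc_type, normalized)
    # None PACER_DOCUMENT None
    # 0 PACER_DOCUMENT None
    # 2 ATTACHMENT 2
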
148 changes: 147 additions & 1 deletion cl/recap/tests.py
@@ -168,7 +168,9 @@ def setUpTestData(cls):
         cls.att_data = AppellateAttachmentPageFactory(
             attachments=[
                 AppellateAttachmentFactory(
-                    pacer_doc_id="04505578698", attachment_number=1
+                    pacer_doc_id="04505578698",
+                    attachment_number=1,
+                    description="Order entered",
                 ),
                 AppellateAttachmentFactory(
                     pacer_doc_id="04505578699", attachment_number=2
@@ -182,6 +184,7 @@
             DocketEntryDataFactory(
                 pacer_doc_id="04505578698",
                 document_number=1,
+                short_description="Lorem ipsum",
             )
         ],
     )
@@ -577,6 +580,149 @@ def test_reprocess_appellate_docket_after_adding_attachments(
             self.att_data["attachments"][0]["description"],
         )
 
+    def test_match_appellate_main_rd_with_attachments_and_no_att_data(
+        self, mock_upload
+    ):
+        """Can we match the main RECAPDocument when merging an appellate docket
+        entry from a docket sheet after a PDF upload has added attachments,
+        but before the attachment page for the entry is available?
+        """
+
+        d = DocketFactory(
+            source=Docket.RECAP,
+            court=self.court_appellate,
+            pacer_case_id="104490",
+        )
+        # Merge docket entry #1
+        async_to_sync(add_docket_entries)(d, self.de_data["docket_entries"])
+
+        # Confirm that the main RD has been properly merged.
+        recap_documents = RECAPDocument.objects.all().order_by("date_created")
+        self.assertEqual(recap_documents.count(), 1)
+        main_rd = recap_documents[0]
+        self.assertEqual(main_rd.document_type, RECAPDocument.PACER_DOCUMENT)
+        self.assertEqual(main_rd.attachment_number, None)
+        self.assertEqual(main_rd.description, "Lorem ipsum")
+
+        # Upload a PDF for attachment 2 in the same entry #1.
+        pq = ProcessingQueue.objects.create(
+            court=self.court_appellate,
+            uploader=self.user,
+            pacer_case_id=d.pacer_case_id,
+            pacer_doc_id="04505578699",
+            document_number=1,
+            attachment_number=2,
+            upload_type=UPLOAD_TYPE.PDF,
+            filepath_local=self.f,
+        )
+        async_to_sync(process_recap_upload)(pq)
+
+        entry_rds = RECAPDocument.objects.filter(
+            docket_entry=main_rd.docket_entry
+        )
+        # Confirm a new RD was created by the att PDF upload.
+        self.assertEqual(entry_rds.count(), 2, msg="Wrong number of RDs.")
+
+        pq.refresh_from_db()
+        att_2_rd = pq.recap_document
+        # The new RD should be attachment #2
+        self.assertEqual(att_2_rd.document_type, RECAPDocument.ATTACHMENT)
+        self.assertEqual(att_2_rd.attachment_number, 2)
+
+        # Simulate a docket sheet merge containing entry #1 again:
+        de_data_2 = DocketEntriesDataFactory(
+            docket_entries=[
+                DocketEntryDataFactory(
+                    pacer_doc_id="04505578698",
+                    document_number=1,
+                    short_description="Motion",
+                )
+            ],
+        )
+
+        async_to_sync(add_docket_entries)(d, de_data_2["docket_entries"])
+        self.assertEqual(entry_rds.count(), 2, msg="Wrong number of RDs.")
+        main_rd.refresh_from_db()
+
+        # Confirm the main RD was properly matched and updated.
+        self.assertEqual(main_rd.description, "Motion")
+        self.assertEqual(
+            main_rd.document_type,
+            RECAPDocument.PACER_DOCUMENT,
+            msg="Wrong document type.",
+        )
+        self.assertEqual(main_rd.attachment_number, None)
+
+        # Now merge the Attachment page.
+        pq = ProcessingQueue.objects.create(
+            court=self.court_appellate,
+            uploader=self.user,
+            pacer_case_id="104490",
+            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+            filepath_local=self.f,
+        )
+        with mock.patch(
+            "cl.recap.tasks.get_data_from_appellate_att_report",
+            side_effect=lambda x, y: self.att_data,
+        ):
+            # Process the appellate attachment page containing 2 attachments.
+            async_to_sync(process_recap_appellate_attachment)(pq.pk)
+
+        # Confirm that the main_rd is properly converted into an attachment.
+        self.assertEqual(recap_documents.count(), 2)
+        main_rd.refresh_from_db()
+        self.assertEqual(
+            main_rd.document_type,
+            RECAPDocument.ATTACHMENT,
+            msg="Wrong document type.",
+        )
+        self.assertEqual(main_rd.attachment_number, 1)
+
+    def test_avoid_merging_att_zero_on_pdf_uploads(self, mock_upload):
+        """Confirm that a RECAP PDF upload containing attachment number 0
+        matches the main RD."""
+
+        d = DocketFactory(
+            source=Docket.RECAP,
+            court=self.court_appellate,
+            pacer_case_id="104490",
+        )
+        # Merge docket entry #1
+        async_to_sync(add_docket_entries)(d, self.de_data["docket_entries"])
+
+        # Confirm that the main RD has been properly merged.
+        recap_documents = RECAPDocument.objects.all().order_by("date_created")
+        self.assertEqual(recap_documents.count(), 1)
+        main_rd = recap_documents[0]
+        self.assertEqual(main_rd.document_type, RECAPDocument.PACER_DOCUMENT)
+        self.assertEqual(main_rd.attachment_number, None)
+        self.assertEqual(main_rd.is_available, False)
+
+        # Upload a PDF for attachment number 0.
+        pq = ProcessingQueue.objects.create(
+            court=self.court_appellate,
+            uploader=self.user,
+            pacer_case_id=d.pacer_case_id,
+            pacer_doc_id="04505578698",
+            document_number=1,
+            attachment_number=0,
+            upload_type=UPLOAD_TYPE.PDF,
+            filepath_local=self.f,
+        )
+        async_to_sync(process_recap_upload)(pq)
+        entry_rds = RECAPDocument.objects.filter(
+            docket_entry=main_rd.docket_entry
+        )
+        pq.refresh_from_db()
+        main_rd = pq.recap_document
+
+        # Confirm that the main RD is properly matched and that
+        # attachment_number is not set to 0.
+        self.assertEqual(entry_rds.count(), 1, msg="Wrong number of RDs.")
+        self.assertEqual(main_rd.document_type, RECAPDocument.PACER_DOCUMENT)
+        self.assertEqual(main_rd.attachment_number, None)
+        self.assertEqual(main_rd.is_available, True)
+
     async def test_uploading_a_case_query_result_page(self, mock):
         """Can we upload a case query result page and have it be saved
         correctly?

7 changes: 6 additions & 1 deletion cl/search/factories.py
@@ -307,7 +307,12 @@ def filepath_local(self, create, extracted, **kwargs):
             self.filepath_local = FileField().evaluate(None, None, kwargs)
 
         if create:
-            self.save(update_fields=["filepath_local"])
+            # Use a Docket queryset to persist filepath_local instead of calling
+            # save(), which can trigger duplicate post_save signals, potentially
+            # causing issues in certain testing scenarios.
+            Docket.objects.filter(pk=self.pk).update(
+                filepath_local=self.filepath_local
+            )
 
 
 class DocketWithChildrenFactory(DocketFactory):

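The comment in the hunk relies on a documented Django behavior: QuerySet.update() issues a single SQL UPDATE and never calls Model.save(), so no pre_save/post_save signals fire. A minimal sketch, assuming a saved Docket instance named docket and a hypothetical handler:

    from django.db.models.signals import post_save
    from django.dispatch import receiver

    @receiver(post_save, sender=Docket)   # hypothetical wiring, for illustration only
    def announce_save(sender, instance, created, **kwargs):
        print("post_save fired")

    docket.filepath_local = "recap/example.json"
    docket.save(update_fields=["filepath_local"])   # fires post_save
    Docket.objects.filter(pk=docket.pk).update(
        filepath_local="recap/example.json"
    )                                               # no signal fires
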
2 changes: 1 addition & 1 deletion cl/settings/project/corpus_importer.py
@@ -11,7 +11,7 @@
     "IQUERY_COURT_BLOCKED_MAX_ATTEMPTS", default=6
 )
 IQUERY_EMPTY_PROBES_LIMIT = env.int("IQUERY_EMPTY_PROBES_LIMIT", default=15)
-IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED = env(
+IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED = env.bool(
     "IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED", default=False
 )
 IQUERY_COURT_RATE = env("IQUERY_COURT_RATE", default="100/s")
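
The switch to env.bool is more than cosmetic. With django-environ (which this module already uses via env.int above), a bare env("VAR") call returns the raw string from the environment, and any non-empty string, including "False", is truthy; env.bool parses it into a real boolean. A small sketch:

    import os
    import environ

    os.environ["IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED"] = "False"
    env = environ.Env()

    raw = env("IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED", default=False)
    parsed = env.bool("IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED", default=False)

    print(bool(raw))   # True: the raw string "False" is truthy
    print(parsed)      # False: parsed as an actual boolean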
