Merge branch 'main' into 4920-store-unmatched-citations
flooie authored Jan 24, 2025
2 parents 2fa3924 + bf837db commit d3a6a83
Showing 7 changed files with 202 additions and 20 deletions.
13 changes: 9 additions & 4 deletions cl/corpus_importer/signals.py
@@ -119,11 +119,16 @@ def handle_update_latest_case_id_and_schedule_iquery_sweep(
         # pacer_case_id)
         return None
 
-    # Only call update_latest_case_id_and_schedule_iquery_sweep if this is a
-    # new RECAP district or bankruptcy docket with pacer_case_id not added by
-    # iquery sweep tasks.
+    # Only call update_latest_case_id_and_schedule_iquery_sweep if:
+    # - The docket belongs to a RECAP district or bankruptcy court,
+    # - The docket has a pacer_case_id,
+    # - The docket was newly created (when IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=True), or
+    # - The docket was created or updated by the last probe iteration from probe_iquery_pages.
+    check_probe_or_created = (
+        not getattr(instance, "avoid_trigger_signal", False) or created
+    )
     if (
-        created
+        check_probe_or_created
         and instance.pacer_case_id
         and instance.court_id
         in list(

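For reference, the new guard reduces to a small predicate. The sketch below is not part of the commit, and the premise that the iquery tasks set avoid_trigger_signal on the instances they save themselves is an assumption drawn from the comments above.

    # Minimal sketch of the new guard (assumed semantics, not from the commit).
    def should_consider_for_sweep(avoid_trigger_signal: bool, created: bool) -> bool:
        return not avoid_trigger_signal or created

    assert should_consider_for_sweep(False, False)      # probe updated an existing docket
    assert should_consider_for_sweep(False, True)       # brand-new docket
    assert should_consider_for_sweep(True, True)        # created takes precedence
    assert not should_consider_for_sweep(True, False)   # a sweep task's own save is skipped
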
23 changes: 19 additions & 4 deletions cl/corpus_importer/tests.py
@@ -2576,7 +2576,7 @@ def test_update_latest_case_id_and_schedule_iquery_sweep_integration(
             )
             # Probing will add 3 dockets (12, 16, 24) + 2 added for the sweep task (13,18).
             self.assertEqual(
-                dockets.count(), 5, msg="Docket number doesn't match."
+                dockets.count(), 5, msg="Docket count doesn't match."
             )
             # 7 additional PACER HTML files should be stored by now, 3 added by the
             # probing task + 4 added by the sweep task.
@@ -2589,8 +2589,22 @@ def test_update_latest_case_id_and_schedule_iquery_sweep_integration(
 
             ### Integration test probing task + sweep
             # IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED False
+            with override_settings(IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=False):
+                # Create docket pacer_case_id 12, which is the last docket in
+                # the probe. Even though it already exists, it should trigger
+                # a sweep task.
+                DocketFactory(
+                    court=self.court_txed,
+                    source=Docket.RECAP,
+                    case_name="New Incoming Docket 12",
+                    docket_number="2:10-cv-00602",
+                    pacer_case_id="12",
+                )
+
             dockets = Docket.objects.filter(court_id=self.court_txed.pk)
-            self.assertEqual(dockets.count(), 0)
+            self.assertEqual(
+                dockets.count(), 1, msg="Docket count doesn't match for txed."
+            )
             r = get_redis_interface("CACHE")
             # Simulate a highest_known_pacer_case_id = 8
             r.hset("iquery:highest_known_pacer_case_id", self.court_txed.pk, 8)
@@ -2615,9 +2629,10 @@ def test_update_latest_case_id_and_schedule_iquery_sweep_integration(
                 1,
                 msg="Wrong number of sweep task called.",
             )
-            # Probing will add 3 dockets (9,10,12) + 1 added for the sweep task (11).
+            # Probing will add 2 dockets (9,10) + 1 added for the sweep task (11).
+            # Docket 12 already exists; however, it should still trigger the sweep task that adds 11.
             self.assertEqual(
-                dockets.count(), 4, msg="Docket number doesn't match for txed."
+                dockets.count(), 4, msg="Docket count doesn't match for txed."
             )
         finally:
             # Ensure the signal is disconnected after the test

16 changes: 11 additions & 5 deletions cl/recap/mergers.py
@@ -943,8 +943,12 @@ async def add_docket_entries(
         # RDs. The check here ensures that if that happens for a particular
         # entry, we avoid creating the main RD a second+ time when we get the
         # docket sheet a second+ time.
-        appelate_court_id_exists = await ais_appellate_court(d.court_id)
-        if de_created is False and appelate_court_id_exists:
+
+        appellate_court_id_exists = await ais_appellate_court(d.court_id)
+        appellate_rd_att_exists = False
+        if de_created is False and appellate_court_id_exists:
+            # In existing appellate entry merges, check if the entry has at
+            # least one attachment.
             appellate_rd_att_exists = await de.recap_documents.filter(
                 document_type=RECAPDocument.ATTACHMENT
             ).aexists()
@@ -953,14 +957,16 @@ async def add_docket_entries(
             params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
         try:
             get_params = deepcopy(params)
-            if de_created is False and not appelate_court_id_exists:
-                del get_params["document_type"]
+            if de_created is False and not appellate_court_id_exists:
                 get_params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
+            if de_created is False:
+                # Try to match the RD regardless of the document_type.
+                del get_params["document_type"]
             rd = await RECAPDocument.objects.aget(**get_params)
             rds_updated.append(rd)
         except RECAPDocument.DoesNotExist:
             rd = None
-            if de_created is False and not appelate_court_id_exists:
+            if de_created is False and not appellate_court_id_exists:
                 try:
                     # Check for documents with a bad pacer_doc_id
                     rd = await RECAPDocument.objects.aget(**params)

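The ordering of the two new if blocks is the point of the change: the lookup first pins pacer_doc_id, then drops document_type for any re-merge, so a main document that was converted into an attachment in the meantime still matches instead of being created a second time. A runnable sketch with stand-in values (none of these constants come from the commit):

    from copy import deepcopy

    PACER_DOCUMENT, ATTACHMENT = 1, 2   # stand-ins for the RECAPDocument constants
    params = {"pacer_doc_id": "04505578698", "document_type": PACER_DOCUMENT}

    get_params = deepcopy(params)
    del get_params["document_type"]     # re-merge: match regardless of type

    # An RD that a prior PDF upload or attachment-page merge turned into an attachment:
    stored_rd = {"pacer_doc_id": "04505578698", "document_type": ATTACHMENT}
    assert all(stored_rd[k] == v for k, v in get_params.items())
    # With document_type still in the lookup, this match would have failed and
    # a duplicate main RD would have been created.
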
13 changes: 9 additions & 4 deletions cl/recap/tasks.py
@@ -249,10 +249,15 @@ async def process_recap_pdf(pk):
     pq = await ProcessingQueue.objects.aget(pk=pk)
     await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
 
-    if pq.attachment_number is None:
-        document_type = RECAPDocument.PACER_DOCUMENT
-    else:
-        document_type = RECAPDocument.ATTACHMENT
+    document_type = (
+        RECAPDocument.PACER_DOCUMENT
+        if not pq.attachment_number  # This check includes attachment_number set to None or 0
+        else RECAPDocument.ATTACHMENT
+    )
+    # Set attachment_number to None if it is 0
+    pq.attachment_number = (
+        None if not pq.attachment_number else pq.attachment_number
+    )
 
     logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq} ")
     try:

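A quick illustration of the truthiness check above (a sketch, not part of the commit): both None and 0 are falsy in Python, so a PDF upload reporting attachment number 0 is now treated as a main document and its attachment_number is normalized to None.

    for att_no in (None, 0, 2):
        doc_type = "PACER_DOCUMENT" if not att_no else "ATTACHMENT"
        normalized = None if not att_no else att_no
        print(att_no, doc_type, normalized)
    # None PACER_DOCUMENT None
    # 0 PACER_DOCUMENT None
    # 2 ATTACHMENT 2
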
148 changes: 147 additions & 1 deletion cl/recap/tests.py
@@ -168,7 +168,9 @@ def setUpTestData(cls):
         cls.att_data = AppellateAttachmentPageFactory(
             attachments=[
                 AppellateAttachmentFactory(
-                    pacer_doc_id="04505578698", attachment_number=1
+                    pacer_doc_id="04505578698",
+                    attachment_number=1,
+                    description="Order entered",
                 ),
                 AppellateAttachmentFactory(
                     pacer_doc_id="04505578699", attachment_number=2
@@ -182,6 +184,7 @@
             DocketEntryDataFactory(
                 pacer_doc_id="04505578698",
                 document_number=1,
+                short_description="Lorem ipsum",
             )
         ],
     )
@@ -577,6 +580,149 @@ def test_reprocess_appellate_docket_after_adding_attachments(
             self.att_data["attachments"][0]["description"],
         )
 
+    def test_match_appellate_main_rd_with_attachments_and_no_att_data(
+        self, mock_upload
+    ):
+        """Can we match the main RECAPDocument when merging an appellate docket
+        entry from a docket sheet after a PDF upload has added attachments,
+        but before the attachment page for the entry is available?
+        """
+
+        d = DocketFactory(
+            source=Docket.RECAP,
+            court=self.court_appellate,
+            pacer_case_id="104490",
+        )
+        # Merge docket entry #1
+        async_to_sync(add_docket_entries)(d, self.de_data["docket_entries"])
+
+        # Confirm that the main RD has been properly merged.
+        recap_documents = RECAPDocument.objects.all().order_by("date_created")
+        self.assertEqual(recap_documents.count(), 1)
+        main_rd = recap_documents[0]
+        self.assertEqual(main_rd.document_type, RECAPDocument.PACER_DOCUMENT)
+        self.assertEqual(main_rd.attachment_number, None)
+        self.assertEqual(main_rd.description, "Lorem ipsum")
+
+        # Upload a PDF for attachment 2 in the same entry #1.
+        pq = ProcessingQueue.objects.create(
+            court=self.court_appellate,
+            uploader=self.user,
+            pacer_case_id=d.pacer_case_id,
+            pacer_doc_id="04505578699",
+            document_number=1,
+            attachment_number=2,
+            upload_type=UPLOAD_TYPE.PDF,
+            filepath_local=self.f,
+        )
+        async_to_sync(process_recap_upload)(pq)
+
+        entry_rds = RECAPDocument.objects.filter(
+            docket_entry=main_rd.docket_entry
+        )
+        # Confirm a new RD was created by the att PDF upload.
+        self.assertEqual(entry_rds.count(), 2, msg="Wrong number of RDs.")
+
+        pq.refresh_from_db()
+        att_2_rd = pq.recap_document
+        # The new RD should be attachment #2
+        self.assertEqual(att_2_rd.document_type, RECAPDocument.ATTACHMENT)
+        self.assertEqual(att_2_rd.attachment_number, 2)
+
+        # Simulate a docket sheet merge containing entry #1 again:
+        de_data_2 = DocketEntriesDataFactory(
+            docket_entries=[
+                DocketEntryDataFactory(
+                    pacer_doc_id="04505578698",
+                    document_number=1,
+                    short_description="Motion",
+                )
+            ],
+        )
+
+        async_to_sync(add_docket_entries)(d, de_data_2["docket_entries"])
+        self.assertEqual(entry_rds.count(), 2, msg="Wrong number of RDs.")
+        main_rd.refresh_from_db()
+
+        # Confirm the main RD was properly matched and updated.
+        self.assertEqual(main_rd.description, "Motion")
+        self.assertEqual(
+            main_rd.document_type,
+            RECAPDocument.PACER_DOCUMENT,
+            msg="Wrong document type.",
+        )
+        self.assertEqual(main_rd.attachment_number, None)
+
+        # Now merge the Attachment page.
+        pq = ProcessingQueue.objects.create(
+            court=self.court_appellate,
+            uploader=self.user,
+            pacer_case_id="104490",
+            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+            filepath_local=self.f,
+        )
+        with mock.patch(
+            "cl.recap.tasks.get_data_from_appellate_att_report",
+            side_effect=lambda x, y: self.att_data,
+        ):
+            # Process the appellate attachment page containing 2 attachments.
+            async_to_sync(process_recap_appellate_attachment)(pq.pk)
+
+        # Confirm that the main_rd is properly converted into an attachment.
+        self.assertEqual(recap_documents.count(), 2)
+        main_rd.refresh_from_db()
+        self.assertEqual(
+            main_rd.document_type,
+            RECAPDocument.ATTACHMENT,
+            msg="Wrong document type.",
+        )
+        self.assertEqual(main_rd.attachment_number, 1)
+
+    def test_avoid_merging_att_zero_on_pdf_uploads(self, mock_upload):
+        """Confirm that a RECAP PDF upload containing attachment number 0
+        matches the main RD."""
+
+        d = DocketFactory(
+            source=Docket.RECAP,
+            court=self.court_appellate,
+            pacer_case_id="104490",
+        )
+        # Merge docket entry #1
+        async_to_sync(add_docket_entries)(d, self.de_data["docket_entries"])
+
+        # Confirm that the main RD has been properly merged.
+        recap_documents = RECAPDocument.objects.all().order_by("date_created")
+        self.assertEqual(recap_documents.count(), 1)
+        main_rd = recap_documents[0]
+        self.assertEqual(main_rd.document_type, RECAPDocument.PACER_DOCUMENT)
+        self.assertEqual(main_rd.attachment_number, None)
+        self.assertEqual(main_rd.is_available, False)
+
+        # Upload a PDF for attachment number 0.
+        pq = ProcessingQueue.objects.create(
+            court=self.court_appellate,
+            uploader=self.user,
+            pacer_case_id=d.pacer_case_id,
+            pacer_doc_id="04505578698",
+            document_number=1,
+            attachment_number=0,
+            upload_type=UPLOAD_TYPE.PDF,
+            filepath_local=self.f,
+        )
+        async_to_sync(process_recap_upload)(pq)
+        entry_rds = RECAPDocument.objects.filter(
+            docket_entry=main_rd.docket_entry
+        )
+        pq.refresh_from_db()
+        main_rd = pq.recap_document
+
+        # Confirm that the main RD is properly matched and that
+        # attachment_number is not set to 0.
+        self.assertEqual(entry_rds.count(), 1, msg="Wrong number of RDs.")
+        self.assertEqual(main_rd.document_type, RECAPDocument.PACER_DOCUMENT)
+        self.assertEqual(main_rd.attachment_number, None)
+        self.assertEqual(main_rd.is_available, True)
+
     async def test_uploading_a_case_query_result_page(self, mock):
         """Can we upload a case query result page and have it be saved
         correctly?

7 changes: 6 additions & 1 deletion cl/search/factories.py
@@ -307,7 +307,12 @@ def filepath_local(self, create, extracted, **kwargs):
             self.filepath_local = FileField().evaluate(None, None, kwargs)
 
         if create:
-            self.save(update_fields=["filepath_local"])
+            # Use a Docket queryset to persist filepath_local instead of calling
+            # save(), which can trigger duplicate post_save signals, potentially
+            # causing issues in certain testing scenarios.
+            Docket.objects.filter(pk=self.pk).update(
+                filepath_local=self.filepath_local
+            )
 
 
 class DocketWithChildrenFactory(DocketFactory):

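The comment in the hunk relies on a documented Django behavior: QuerySet.update() issues a single SQL UPDATE and never calls Model.save(), so no pre_save/post_save signals fire. A minimal sketch, assuming a saved Docket instance named docket and a hypothetical handler:

    from django.db.models.signals import post_save
    from django.dispatch import receiver

    @receiver(post_save, sender=Docket)   # hypothetical wiring, for illustration only
    def announce_save(sender, instance, created, **kwargs):
        print("post_save fired")

    docket.filepath_local = "recap/example.json"
    docket.save(update_fields=["filepath_local"])   # fires post_save
    Docket.objects.filter(pk=docket.pk).update(
        filepath_local="recap/example.json"
    )                                               # no signal fires
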
2 changes: 1 addition & 1 deletion cl/settings/project/corpus_importer.py
@@ -11,7 +11,7 @@
     "IQUERY_COURT_BLOCKED_MAX_ATTEMPTS", default=6
 )
 IQUERY_EMPTY_PROBES_LIMIT = env.int("IQUERY_EMPTY_PROBES_LIMIT", default=15)
-IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED = env(
+IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED = env.bool(
     "IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED", default=False
 )
 IQUERY_COURT_RATE = env("IQUERY_COURT_RATE", default="100/s")
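
The switch to env.bool is more than cosmetic. With django-environ (which this module already uses via env.int above), a bare env("VAR") call returns the raw string from the environment, and any non-empty string, including "False", is truthy; env.bool parses it into a real boolean. A small sketch:

    import os
    import environ

    os.environ["IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED"] = "False"
    env = environ.Env()

    raw = env("IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED", default=False)
    parsed = env.bool("IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED", default=False)

    print(bool(raw))   # True: the raw string "False" is truthy
    print(parsed)      # False: parsed as an actual boolean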
