diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py
index 7f8862d596..e00092b791 100644
--- a/cl/corpus_importer/tasks.py
+++ b/cl/corpus_importer/tasks.py
@@ -1130,7 +1130,7 @@ def do_case_query_by_pacer_case_id(
     async_to_sync(update_docket_metadata)(d, docket_data)
     d.save()
 
-    add_tags_to_objs(tag_names, [d])
+    async_to_sync(add_tags_to_objs)(tag_names, [d])
 
     # Add the HTML to the docket in case we need it someday.
     pacer_file = PacerHtmlFiles(
diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 8ccdc4e472..5e726194bd 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -816,7 +816,7 @@ async def get_or_make_docket_entry(
 async def add_docket_entries(
     d: Docket,
     docket_entries: list[dict[str, Any]],
-    tags: list[str] | None = None,
+    tags: list[Tag] | None = None,
     do_not_update_existing: bool = False,
 ) -> tuple[
     tuple[list[DocketEntry], list[RECAPDocument]], list[RECAPDocument], bool
@@ -873,7 +873,7 @@ async def add_docket_entries(
             await de.asave()
             if tags:
                 for tag in tags:
-                    tag.tag_object(de)
+                    await sync_to_async(tag.tag_object)(de)
 
         if de_created:
             content_updated = True
@@ -918,7 +918,10 @@ async def add_docket_entries(
             params["document_type"] = RECAPDocument.ATTACHMENT
         params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
         try:
-            rd = await RECAPDocument.objects.aget(**params)
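+            # When the docket entry already existed in a non-appellate
+            # court, drop document_type from the lookup so a document
+            # whose type has changed (main document <-> attachment) is
+            # still matched rather than duplicated.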
+            get_params = deepcopy(params)
+            if de_created is False and not appelate_court_id_exists:
+                del get_params["document_type"]
+            rd = await RECAPDocument.objects.aget(**get_params)
             rds_updated.append(rd)
         except RECAPDocument.DoesNotExist:
             try:
@@ -950,9 +953,24 @@ async def add_docket_entries(
                await duplicate_rd_queryset.exclude(pk=rd.pk).adelete()
 
        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry["pacer_doc_id"]
-        rd.description = (
-            docket_entry.get("short_description") or rd.description
-        )
+        description = docket_entry.get("short_description") or rd.description
+        if rd.document_type == RECAPDocument.PACER_DOCUMENT:
+            rd.description = description
+        else:
+            rd_qs = de.recap_documents.filter(
+                document_type=RECAPDocument.PACER_DOCUMENT
+            )
+            if await rd_qs.aexists():
+                rd_pd = await rd_qs.afirst()
+                if rd_pd.attachment_number is not None:
+                    continue
+                if rd_pd.description != description:
+                    rd_pd.description = description
+                    try:
+                        await rd_pd.asave()
+                    except ValidationError:
+                        # Happens from race conditions.
+                        continue
        rd.document_number = docket_entry["document_number"] or ""
        try:
            await rd.asave()
@@ -961,7 +979,7 @@ async def add_docket_entries(
            continue
        if tags:
            for tag in tags:
-                tag.tag_object(rd)
+                await sync_to_async(tag.tag_object)(rd)
 
        attachments = docket_entry.get("attachments")
        if attachments is not None:
@@ -1430,7 +1448,7 @@ def add_claims_to_docket(d, new_claims, tag_names=None):
        )
        db_claim.remarks = new_claim.get("remarks") or db_claim.remarks
        db_claim.save()
-        add_tags_to_objs(tag_names, [db_claim])
+        async_to_sync(add_tags_to_objs)(tag_names, [db_claim])
        for new_history in new_claim["history"]:
            add_claim_history_entry(new_history, db_claim)
 
@@ -1457,7 +1475,7 @@ get_data_from_appellate_att_report(
     return att_data
 
 
-async def add_tags_to_objs(tag_names: List[str], objs: Any) -> QuerySet:
+async def add_tags_to_objs(tag_names: List[str], objs: Any) -> list[Tag]:
     """Add tags by name to objects
 
     :param tag_names: A list of tag name strings
@@ -1469,14 +1487,14 @@ async def add_tags_to_objs(tag_names: List[str], objs: Any) -> QuerySet:
     if tag_names is None:
         return []
 
-    tags = []
+    tags: list[Tag] = []
     for tag_name in tag_names:
         tag, _ = await Tag.objects.aget_or_create(name=tag_name)
         tags.append(tag)
 
     for tag in tags:
         for obj in objs:
-            tag.tag_object(obj)
+            await sync_to_async(tag.tag_object)(obj)
 
     return tags
 
@@ -1700,8 +1718,6 @@ async def merge_attachment_page_data(
            attachment["attachment_number"],
            # Missing on sealed items.
            attachment.get("pacer_doc_id", False),
-            # Missing on some restricted docs (see Juriscraper)
-            attachment["page_count"] is not None,
            attachment["description"],
        ]
        if not all(sanity_checks):
@@ -1734,17 +1750,49 @@ async def merge_attachment_page_data(
        try:
            rd = await RECAPDocument.objects.aget(**params)
        except RECAPDocument.DoesNotExist:
-            rd = RECAPDocument(**params)
-            rds_created.append(rd)
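+            # The strict lookup misses documents whose type has changed: a
+            # filing stored as a main document may now appear as an
+            # attachment (or, when attachment_number is 0, the reverse).
+            # Retry on pacer_doc_id alone and convert the matched document
+            # in place instead of creating a new one.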
+            try:
+                doc_id_params = deepcopy(params)
+                del doc_id_params["attachment_number"]
+                del doc_id_params["document_type"]
+                doc_id_params["pacer_doc_id"] = attachment["pacer_doc_id"]
+                rd = await RECAPDocument.objects.aget(**doc_id_params)
+                if attachment.get("attachment_number") == 0:
+                    try:
+                        old_main_rd = await RECAPDocument.objects.aget(
+                            de=de,
+                            document_type=RECAPDocument.PACER_DOCUMENT,
+                        )
+                        rd.description = old_main_rd.description
+                    except RECAPDocument.DoesNotExist:
+                        rd.description = ""
+                    except RECAPDocument.MultipleObjectsReturned:
+                        rd.description = ""
+                        logger.info(
+                            f"Failed to migrate description for "
+                            f"{attachment['pacer_doc_id']}, "
+                            f"multiple source documents found."
+                        )
+                    rd.attachment_number = None
+                    rd.document_type = RECAPDocument.PACER_DOCUMENT
+                else:
+                    rd.attachment_number = attachment["attachment_number"]
+                    rd.document_type = RECAPDocument.ATTACHMENT
+            except RECAPDocument.DoesNotExist:
+                rd = RECAPDocument(**params)
+                rds_created.append(rd)
        rds_affected.append(rd)
 
-        for field in ["description", "pacer_doc_id"]:
-            if attachment[field]:
-                setattr(rd, field, attachment[field])
+        if (
+            attachment["description"]
+            and rd.document_type == RECAPDocument.ATTACHMENT
+        ):
+            rd.description = attachment["description"]
+        if attachment["pacer_doc_id"]:
+            rd.pacer_doc_id = attachment["pacer_doc_id"]
 
        # Only set page_count and file_size if they're blank, in case
        # we got the real value by measuring.
-        if rd.page_count is None:
+        if rd.page_count is None and attachment.get("page_count", None):
            rd.page_count = attachment["page_count"]
 
        # If we have file_size_bytes it should have max precision.
        file_size_bytes = attachment.get("file_size_bytes")
diff --git a/poetry.lock b/poetry.lock
index ec0a881741..f085fbe921 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2236,13 +2236,13 @@ setuptools = "*"
 
 [[package]]
 name = "juriscraper"
-version = "2.6.19"
+version = "2.6.20"
 description = "An API to scrape American court websites for metadata."
 optional = false
 python-versions = "*"
 files = [
-    {file = "juriscraper-2.6.19-py27-none-any.whl", hash = "sha256:fe5807f1ae7ecb4adcc971c5351fb56d7af71547f39d9ee56a78821c80db8276"},
-    {file = "juriscraper-2.6.19.tar.gz", hash = "sha256:ebb7312efa1f6b8ddfb5cf0b9ea678dbe40a248f575559860816ed4af507af9d"},
+    {file = "juriscraper-2.6.20-py27-none-any.whl", hash = "sha256:5de2830f5c0593ad3ea0ebbdfbac203bc4c4da537461a890d49d1d0c439b9ac7"},
+    {file = "juriscraper-2.6.20.tar.gz", hash = "sha256:895744c1edde3828f4ff9f62c1366010bb636e54e59bb6848729e598cc15fd69"},
 ]
 
 [package.dependencies]
@@ -5466,4 +5466,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.12, <3.13"
-content-hash = "7a2f54103ce6aaa8d20563aa306fcfa93e656b8e10f748e88bad288dc0d44ebe"
+content-hash = "7b4647b80299ec79c1146ec44d7eed564bf35e259cf821e2a00450d832250d3c"
diff --git a/pyproject.toml b/pyproject.toml
index 636a556658..19afc96344 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,7 +115,7 @@ hyperscan = "^0.7.7"
 openai = "^1.31.1"
 seal-rookery = "^2.2.3"
 types-pytz = "^2024.1.0.20240417"
-juriscraper = "^2.6.15"
+juriscraper = "^2.6.20"
 
 [tool.poetry.group.dev.dependencies]