From 78468a984f10291dc319480181350bf61f4591e7 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 20 Jan 2025 22:25:07 -0500 Subject: [PATCH] feat(citations): add UnmatchedCitation model and logic Solves #4920 - Add new model UnmatchedCitation on citations app - refactor cl.search.models.Citation to create a BaseCitation abstract model to reuse on the UnmatchedCitation model - updates cl.citations.tasks.store_opinion_citations_and_update_parentheticals to handle storing and updating unmatched citations - updates cl.search.signals to update UnmatchedCitation status when a new Citation is saved - add tests --- .../management/commands/find_citations.py | 4 + cl/citations/migrations/0001_initial.py | 153 ++++++++++++++++++ cl/citations/migrations/__init__.py | 0 cl/citations/models.py | 96 +++++++++++ cl/citations/tasks.py | 74 ++++++++- cl/citations/tests.py | 95 +++++++++++ cl/search/models.py | 24 ++- cl/search/signals.py | 19 +++ 8 files changed, 453 insertions(+), 12 deletions(-) create mode 100644 cl/citations/migrations/0001_initial.py create mode 100644 cl/citations/migrations/__init__.py create mode 100644 cl/citations/models.py diff --git a/cl/citations/management/commands/find_citations.py b/cl/citations/management/commands/find_citations.py index 37bb191b58..4cdbfbdfb0 100644 --- a/cl/citations/management/commands/find_citations.py +++ b/cl/citations/management/commands/find_citations.py @@ -5,6 +5,7 @@ from django.core.management import CommandError from django.core.management.base import CommandParser +from cl.citations.models import UnmatchedCitation from cl.citations.tasks import ( find_citations_and_parentheticals_for_opinion_by_pks, ) @@ -112,6 +113,9 @@ def handle(self, *args: List[str], **options: OptionsType) -> None: query = query.filter(date_modified__gte=options["modified_after"]) if options.get("all"): query = Opinion.objects.all() + sys.stdout.write("Deleting all UnmatchedCitation rows") + UnmatchedCitation.objects.all().delete() + 
self.count = query.count() self.average_per_s = 0.0 self.timings: List[float] = [] diff --git a/cl/citations/migrations/0001_initial.py b/cl/citations/migrations/0001_initial.py new file mode 100644 index 0000000000..25f194d61d --- /dev/null +++ b/cl/citations/migrations/0001_initial.py @@ -0,0 +1,153 @@ +# Generated by Django 5.1.4 on 2025-01-21 03:45 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("search", "0037_alter_citation_type_noop"), + ] + + operations = [ + migrations.CreateModel( + name="UnmatchedCitation", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "volume", + models.SmallIntegerField( + help_text="The volume of the reporter" + ), + ), + ( + "reporter", + models.TextField( + db_index=True, + help_text="The abbreviation for the reporter", + ), + ), + ( + "page", + models.TextField( + help_text="The 'page' of the citation in the reporter. Unfortunately, this is not an integer, but is a string-type because several jurisdictions do funny things with the so-called 'page'. For example, we have seen Roman numerals in Nebraska, 13301-M in Connecticut, and 144M in Montana." + ), + ), + ( + "type", + models.SmallIntegerField( + choices=[ + (1, "A federal reporter citation (e.g. 5 F. 55)"), + ( + 2, + "A citation in a state-based reporter (e.g. Alabama Reports)", + ), + ( + 3, + "A citation in a regional reporter (e.g. Atlantic Reporter)", + ), + ( + 4, + "A citation in a specialty reporter (e.g. Lawyers' Edition)", + ), + ( + 5, + "A citation in an early SCOTUS reporter (e.g. 5 Black. 55)", + ), + ( + 6, + "A citation in the Lexis system (e.g. 5 LEXIS 55)", + ), + ( + 7, + "A citation in the WestLaw system (e.g. 5 WL 55)", + ), + (8, "A vendor neutral citation (e.g. 
2013 FL 1)"), + ( + 9, + "A law journal citation within a scholarly or professional legal periodical (e.g. 95 Yale L.J. 5; 72 Soc.Sec.Rep.Serv. 318)", + ), + ], + help_text="The type of citation that this is.", + ), + ), + ( + "status", + models.SmallIntegerField( + choices=[ + ( + 1, + "The citation does not exist in the search_citation table. We couldn't match the citation to a cluster on the previous citation extractor run", + ), + ( + 2, + "The citation exists on the search_citation table. We haven't updated the citing Opinion.html_with_citations yet", + ), + ( + 3, + "The citing Opinion.html_with_citations was updated successfully", + ), + ( + 4, + "The citing Opinion.html_with_citations update failed because the citation is ambiguous", + ), + ( + 5, + "The citing Opinion.html_with_citations update failed", + ), + ], + help_text="Status of resolution of the initially unmatched citation", + ), + ), + ( + "citation_string", + models.TextField( + help_text="The unparsed citation string in case it doesn't match the regular citation model in BaseCitation" + ), + ), + ( + "court_id", + models.TextField( + help_text="A court_id as identified by eyecite from the opinion's context. 
May be useful to know where to find missing citations" + ), + ), + ( + "year", + models.TextField( + help_text="A year identified by eyecite from the opinion's context" + ), + ), + ( + "citing_opinion", + models.ForeignKey( + help_text="The opinion citing this citation", + on_delete=django.db.models.deletion.CASCADE, + related_name="eyecite_citations", + to="search.opinion", + ), + ), + ], + options={ + "indexes": [ + models.Index( + fields=["volume", "reporter", "page"], + name="citations_u_volume_da4d25_idx", + ) + ], + "unique_together": { + ("citing_opinion", "volume", "reporter", "page") + }, + }, + ), + ] diff --git a/cl/citations/migrations/__init__.py b/cl/citations/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cl/citations/models.py b/cl/citations/models.py new file mode 100644 index 0000000000..d722fff2e8 --- /dev/null +++ b/cl/citations/models.py @@ -0,0 +1,96 @@ +from django.db import models +from eyecite.models import FullCaseCitation + +from cl.citations.utils import map_reporter_db_cite_type +from cl.search.models import BaseCitation, Opinion + + +class UnmatchedCitation(BaseCitation): + """Keep track of citations that could not be resolved to a cluster on the + batch citator run + """ + + UNMATCHED = 1 + FOUND = 2 + RESOLVED = 3 + FAILED_AMBIGUOUS = 4 + FAILED = 5 + STATUS = ( + ( + UNMATCHED, + "The citation does not exist in the search_citation table." + " We couldn't match the citation to a cluster on the " + " previous citation extractor run", + ), + ( + FOUND, + "The citation exists on the search_citation table. 
We " + " haven't updated the citing Opinion.html_with_citations yet", + ), + ( + RESOLVED, + "The citing Opinion.html_with_citations was updated successfully", + ), + ( + FAILED_AMBIGUOUS, + "The citing Opinion.html_with_citations update " + "failed because the citation is ambiguous", + ), + (FAILED, "The citing Opinion.html_with_citations update failed"), + ) + citing_opinion = models.ForeignKey( + Opinion, + help_text="The opinion citing this citation", + on_delete=models.CASCADE, + related_name="eyecite_citations", + ) + status = models.SmallIntegerField( + help_text="Status of resolution of the initially unmatched citation", + choices=STATUS, + ) + citation_string = models.TextField( + help_text="The unparsed citation string in case it doesn't match the " + "regular citation model in BaseCitation" + ) + court_id = models.TextField( + help_text="A court_id as identified by eyecite from the opinion's " + "context. May be useful to know where to find missing citations" + ) + year = models.TextField( + help_text="A year identified by eyecite from the opinion's context" + ) + + class Meta: + indexes = [ + models.Index( + fields=["volume", "reporter", "page"], + ) + ] + # + unique_together = (("citing_opinion", "volume", "reporter", "page"),) + + @classmethod + def create_from_eyecite( + cls, eyecite_citation: FullCaseCitation, citing_opinion: Opinion + ): + """ + Create an UnmatchedCitation instance using an eyecite FullCaseCitation + + Saving is left to the caller + + :param eyecite_citation: a FullCaseCitation as returned by + eyecite.get_citations + :param citing_opinion: the opinion which uses the citation + """ + cite_type_str = eyecite_citation.all_editions[0].reporter.cite_type + return cls( + citing_opinion=citing_opinion, + status=cls.UNMATCHED, + citation_string=eyecite_citation.matched_text(), + court_id=eyecite_citation.metadata.court or "", + year=eyecite_citation.metadata.year or "", + volume=eyecite_citation.groups["volume"], + 
reporter=eyecite_citation.corrected_reporter(), + page=eyecite_citation.groups["page"], + type=map_reporter_db_cite_type(cite_type_str), + ) diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py index 8ac2984752..c3f5ba1499 100644 --- a/cl/citations/tasks.py +++ b/cl/citations/tasks.py @@ -5,7 +5,7 @@ from django.db.models import F from django.db.models.query import QuerySet from eyecite import get_citations -from eyecite.models import CitationBase +from eyecite.models import CitationBase, FullCaseCitation from eyecite.tokenizers import HyperscanTokenizer from cl.celery_init import app @@ -21,6 +21,7 @@ NO_MATCH_RESOURCE, do_resolve_citations, ) +from cl.citations.models import UnmatchedCitation from cl.citations.parenthetical_utils import create_parenthetical_groups from cl.citations.recap_citations import store_recap_citations from cl.citations.score_parentheticals import parenthetical_score @@ -131,7 +132,8 @@ def store_opinion_citations_and_update_parentheticals( opinion: Opinion, ) -> None: """ - Updates counts of citations to other opinions within a given court opinion, as well as parenthetical info for the cited opinions. + Updates counts of citations to other opinions within a given court opinion, + parenthetical info for the cited opinions, and stores unmatched citations :param opinion: A search.Opinion object. :return: None @@ -160,8 +162,8 @@ def store_opinion_citations_and_update_parentheticals( opinion, citation_resolutions ) - # Delete the unmatched citations - citation_resolutions.pop(NO_MATCH_RESOURCE, None) + # Set aside the unmatched citations + unmatched_citations = citation_resolutions.pop(NO_MATCH_RESOURCE, []) # Increase the citation count for the cluster of each matched opinion # if that cluster has not already been cited by this opinion. First, @@ -205,6 +207,12 @@ def store_opinion_citations_and_update_parentheticals( ) ) + # If the opinion has been processed previously, we update its + # associated UnmatchedCitations.status.
If not, we store them all + update_unmatched_status = UnmatchedCitation.objects.filter( + citing_opinion=opinion + ).exists() + # Finally, commit these changes to the database in a single # transcation block. with transaction.atomic(): @@ -215,6 +223,11 @@ def store_opinion_citations_and_update_parentheticals( citation_count=F("citation_count") + 1 ) + if update_unmatched_status: + update_unmatched_citations_status(citation_resolutions, opinion) + else: + store_unmatched_citations(unmatched_citations, opinion) + # Nuke existing citations and parentheticals OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete() Parenthetical.objects.filter(describing_opinion_id=opinion.pk).delete() @@ -249,3 +262,56 @@ def store_opinion_citations_and_update_parentheticals( index_related_cites_fields.delay( OpinionsCited.__name__, opinion.pk, cluster_ids_to_update ) + + +def update_unmatched_citations_status( + citation_resolutions: Dict[ + MatchedResourceType, List[SupportedCitationType] + ], + citing_opinion: Opinion, +) -> None: + """Check if previously unmatched citations have been resolved and + update UnmatchedCitation.status accordingly + + We assume no new UnmatchedCitations will be created after the first run + + :param citation_resolutions: dict whose values are resolved citations + :param citing_opinion: the opinion + :return None: + """ + resolved_citations = { + c.matched_text() for v in citation_resolutions.values() for c in v + } + + # the query will also try to reprocess the previous failures + found_citations = UnmatchedCitation.objects.filter( + citing_opinion=citing_opinion + ).exclude( + status__in=[UnmatchedCitation.UNMATCHED, UnmatchedCitation.RESOLVED] + ) + for found in found_citations: + if found.citation_string in resolved_citations: + found.status = UnmatchedCitation.RESOLVED + else: + found.status = UnmatchedCitation.FAILED + found.save() + + +def store_unmatched_citations( + unmatched_citations: List[CitationBase], opinion: Opinion +) ->
None: + """Bulk create UnmatchedCitation instances cited by an opinion + + Only FullCaseCitations provide useful information for resolution + updates. Other types are discarded + + :param unmatched_citations: + :param opinion: the citing opinion + :return None: + """ + unmatched_citations_to_store = [ + UnmatchedCitation.create_from_eyecite(unmatched_citation, opinion) + for unmatched_citation in unmatched_citations + if isinstance(unmatched_citation, FullCaseCitation) + ] + UnmatchedCitation.objects.bulk_create(unmatched_citations_to_store) diff --git a/cl/citations/tests.py b/cl/citations/tests.py index 59b1e7a986..5bca20ca36 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -45,10 +45,13 @@ do_resolve_citations, resolve_fullcase_citation, ) +from cl.citations.models import UnmatchedCitation from cl.citations.score_parentheticals import parenthetical_score from cl.citations.tasks import ( find_citations_and_parentheticals_for_opinion_by_pks, store_recap_citations, + store_unmatched_citations, + update_unmatched_citations_status, ) from cl.lib.test_helpers import CourtTestCase, PeopleTestCase, SearchTestCase from cl.search.factories import ( @@ -62,6 +65,7 @@ ) from cl.search.models import ( SEARCH_TYPES, + Citation, Opinion, OpinionCluster, OpinionsCited, @@ -2398,3 +2402,94 @@ async def test_can_throttle_user_exceeding_citation_limit_by_big_margin( # times the allowed number of citations. expected_time = test_date + timedelta(minutes=3) self.assertEqual(data["wait_until"], expected_time.isoformat()) + + +class UnmatchedCitationTest(SimpleTestCase): + # this will produce 4 citations: 3 FullCase and 1 Id + plain_text = """ + petition. 62 Tex. Sup. Ct. J. 313 (Jan. 18, 2019). II. Appraisal and the + TPPCA Although presented in... inference and resolving any doubts in the + nonmovants favor. E.g., Frost Natl Bank v. Fernandez, 315 S.W.3d 494, 508 + (Tex. 2010) (citation omitted); Valence Operating Co. v. Dorsett, + 164 S.W.3d 656, 661 (Tex. 
2005) (citation omitted). When both parties move + for summary judgment on the same issue,... does not alter the fact that + State Farm complied with the Insurance Code . . . . Id. Likewise, we hold + in this case that State Farm's invocation' + """ + eyecite_citations = get_citations( + plain_text, tokenizer=HYPERSCAN_TOKENIZER + ) + cluster = None + opinion = None + + @classmethod + def setUpClass(cls): + cls.cluster = OpinionClusterFactoryWithChildrenAndParents() + cls.opinion = cls.cluster.sub_opinions.first() + UnmatchedCitation.objects.all().delete() + + def test_1st_creation(self) -> None: + """Can we save unmatched citations?""" + store_unmatched_citations(self.eyecite_citations, self.opinion) + unmatched_citations = list( + UnmatchedCitation.objects.filter(citing_opinion=self.opinion).all() + ) + self.assertTrue( + len(unmatched_citations) == 3, + "Incorrect number of citations saved", + ) + self.assertTrue( + unmatched_citations[-1].court_id == "texapp", + "court_id was not saved", + ) + self.assertTrue( + unmatched_citations[0].year == "2019", "year was not saved" + ) + + def test_2nd_signal_update(self) -> None: + """Can we update the status of a matched citation?""" + unmatched_citation = UnmatchedCitation.objects.first() + Citation.objects.create( + cluster=self.cluster, + reporter=unmatched_citation.reporter, + volume=unmatched_citation.volume, + page=unmatched_citation.page, + type=unmatched_citation.type, + ) + + unmatched_citation.refresh_from_db() + self.assertTrue( + unmatched_citation.status == UnmatchedCitation.FOUND, + "post_save signal was not executed", + ) + + def test_3rd_resolution_update(self): + """Is UnmatchedCitation.status updated properly?""" + # Only 1 citation was resolved + citation_resolutions = {1: [self.eyecite_citations[0]]} + + should_resolve = UnmatchedCitation.objects.first() + should_not_resolve = UnmatchedCitation.objects.last() + should_not_resolve.status = UnmatchedCitation.FOUND + should_not_resolve.save() + + 
found_count = UnmatchedCitation.objects.filter( + status=UnmatchedCitation.FOUND + ).count() + self.assertTrue( + found_count == 2, + f"There should be 2 found UnmatchedCitations, there are {found_count}", + ) + + update_unmatched_citations_status(citation_resolutions, self.opinion) + should_resolve.refresh_from_db() + should_not_resolve.refresh_from_db() + + self.assertTrue( + should_resolve.status == UnmatchedCitation.RESOLVED, + f"UnmatchedCitation.status should be UnmatchedCitation.RESOLVED, is {should_resolve.status}", + ) + self.assertTrue( + should_not_resolve.status == UnmatchedCitation.FAILED, + f"UnmatchedCitation.status should be UnmatchedCitation.FAILED is {should_not_resolve.status}", + ) diff --git a/cl/search/models.py b/cl/search/models.py index 51084008e3..291d778b74 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2880,8 +2880,7 @@ class Meta: proxy = True -@pghistory.track() -class Citation(models.Model): +class BaseCitation(models.Model): """A simple class to hold citations.""" FEDERAL = 1 @@ -2921,12 +2920,6 @@ class Citation(models.Model): "72 Soc.Sec.Rep.Serv. 318)", ), ) - cluster = models.ForeignKey( - OpinionCluster, - help_text="The cluster that the citation applies to", - related_name="citations", - on_delete=models.CASCADE, - ) volume = models.SmallIntegerField(help_text="The volume of the reporter") reporter = models.TextField( help_text="The abbreviation for the reporter", @@ -2947,6 +2940,21 @@ class Citation(models.Model): help_text="The type of citation that this is.", choices=CITATION_TYPES ) + class Meta: + abstract = True + + +@pghistory.track() +class Citation(BaseCitation): + """A citation to an OpinionCluster""" + + cluster = models.ForeignKey( + OpinionCluster, + help_text="The cluster that the citation applies to", + related_name="citations", + on_delete=models.CASCADE, + ) + def __str__(self) -> str: # Note this representation is used in the front end. 
return "{volume} {reporter} {page}".format(**self.__dict__) diff --git a/cl/search/signals.py b/cl/search/signals.py index abfe8d448c..21a4a901c0 100644 --- a/cl/search/signals.py +++ b/cl/search/signals.py @@ -3,6 +3,7 @@ from django.dispatch import receiver from cl.audio.models import Audio +from cl.citations.models import UnmatchedCitation from cl.citations.tasks import ( find_citations_and_parantheticals_for_recap_documents, ) @@ -575,3 +576,21 @@ def handle_recap_doc_change( and instance.is_available == True ): send_prayer_emails(instance) + + +@receiver( + post_save, + sender=Citation, + dispatch_uid="handle_citation_save_uid", +) +def update_unmatched_citation( + sender, instance: Citation, created: bool, **kwargs +): + """Updates UnmatchedCitation.status to FOUND when a matching Citation is created""" + if not created: + return + UnmatchedCitation.objects.filter( + volume=instance.volume, + reporter=instance.reporter, + page=instance.page, + ).update(status=UnmatchedCitation.FOUND)