feat(citations): add UnmatchedCitation model and logic

Solves #4920 - Add new model UnmatchedCitation on citations app - refactor cl.search.models.Citation to create a BaseCitation abstract model to reuse on the UnmatchedCitation model - updates cl.citations.tasks.store_opinion_citations_and_update_parentheticals to handle storing and updating unmatched citations - updates cl.search.signals to update UnmatchedCitation status when a new Citation is saved - add tests
freelawproject · Jan 21, 2025 · 78468a9 · 78468a9
1 parent 4e18f5a
commit 78468a9
Show file tree

Hide file tree

Showing 8 changed files with 453 additions and 12 deletions.
diff --git a/cl/citations/management/commands/find_citations.py b/cl/citations/management/commands/find_citations.py
@@ -5,6 +5,7 @@
 from django.core.management import CommandError
 from django.core.management.base import CommandParser
 
+from cl.citations.models import UnmatchedCitation
 from cl.citations.tasks import (
     find_citations_and_parentheticals_for_opinion_by_pks,
 )
@@ -112,6 +113,9 @@ def handle(self, *args: List[str], **options: OptionsType) -> None:
             query = query.filter(date_modified__gte=options["modified_after"])
         if options.get("all"):
             query = Opinion.objects.all()
+            sys.stdout.write("Deleting all UnmatchedCitation rows")
+            UnmatchedCitation.objects.all().delete()
+
         self.count = query.count()
         self.average_per_s = 0.0
         self.timings: List[float] = []

diff --git a/cl/citations/migrations/0001_initial.py b/cl/citations/migrations/0001_initial.py
@@ -0,0 +1,153 @@
+# Generated by Django 5.1.4 on 2025-01-21 03:45
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Initial migration of the citations app. It creates the table backing
+    # the UnmatchedCitation model, plus an index on (volume, reporter, page)
+    # and a uniqueness constraint on
+    # (citing_opinion, volume, reporter, page).
+
+    initial = True
+
+    # The citing_opinion foreign key below targets search.opinion, so the
+    # search app's migrations have to be applied first.
+    dependencies = [
+        ("search", "0037_alter_citation_type_noop"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="UnmatchedCitation",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "volume",
+                    models.SmallIntegerField(
+                        help_text="The volume of the reporter"
+                    ),
+                ),
+                (
+                    "reporter",
+                    models.TextField(
+                        db_index=True,
+                        help_text="The abbreviation for the reporter",
+                    ),
+                ),
+                (
+                    "page",
+                    models.TextField(
+                        help_text="The 'page' of the citation in the reporter. Unfortunately, this is not an integer, but is a string-type because several jurisdictions do funny things with the so-called 'page'. For example, we have seen Roman numerals in Nebraska, 13301-M in Connecticut, and 144M in Montana."
+                    ),
+                ),
+                (
+                    "type",
+                    models.SmallIntegerField(
+                        choices=[
+                            (1, "A federal reporter citation (e.g. 5 F. 55)"),
+                            (
+                                2,
+                                "A citation in a state-based reporter (e.g. Alabama Reports)",
+                            ),
+                            (
+                                3,
+                                "A citation in a regional reporter (e.g. Atlantic Reporter)",
+                            ),
+                            (
+                                4,
+                                "A citation in a specialty reporter (e.g. Lawyers' Edition)",
+                            ),
+                            (
+                                5,
+                                "A citation in an early SCOTUS reporter (e.g. 5 Black. 55)",
+                            ),
+                            (
+                                6,
+                                "A citation in the Lexis system (e.g. 5 LEXIS 55)",
+                            ),
+                            (
+                                7,
+                                "A citation in the WestLaw system (e.g. 5 WL 55)",
+                            ),
+                            (8, "A vendor neutral citation (e.g. 2013 FL 1)"),
+                            (
+                                9,
+                                "A law journal citation within a scholarly or professional legal periodical (e.g. 95 Yale L.J. 5; 72 Soc.Sec.Rep.Serv. 318)",
+                            ),
+                        ],
+                        help_text="The type of citation that this is.",
+                    ),
+                ),
+                (
+                    "status",
+                    # The choice labels are the same strings as
+                    # UnmatchedCitation.STATUS in cl/citations/models.py.
+                    models.SmallIntegerField(
+                        choices=[
+                            (
+                                1,
+                                "The citation does not exist in the search_citation table. We couldn't match the citation to a cluster on the  previous citation extractor run",
+                            ),
+                            (
+                                2,
+                                "The citation exists on the search_citation table. We  haven't updated the citing Opinion.html_with_citations yet",
+                            ),
+                            (
+                                3,
+                                "The citing Opinion.html_with_citations was updated successfully",
+                            ),
+                            (
+                                4,
+                                "The citing Opinion.html_with_citations update failed because the citation is ambiguous",
+                            ),
+                            (
+                                5,
+                                "The citing Opinion.html_with_citations update failed",
+                            ),
+                        ],
+                        help_text="Status of resolution of the initially unmatched citation",
+                    ),
+                ),
+                (
+                    "citation_string",
+                    models.TextField(
+                        help_text="The unparsed citation string in case it doesn't match the regular citation model in BaseCitation"
+                    ),
+                ),
+                (
+                    "court_id",
+                    models.TextField(
+                        help_text="A court_id as identified by eyecite from the opinion's context. May be useful to know where to find missing citations"
+                    ),
+                ),
+                (
+                    "year",
+                    models.TextField(
+                        help_text="A year identified by eyecite from the opinion's context"
+                    ),
+                ),
+                (
+                    "citing_opinion",
+                    # Rows are removed together with the citing opinion
+                    # (on_delete=CASCADE).
+                    models.ForeignKey(
+                        help_text="The opinion citing this citation",
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="eyecite_citations",
+                        to="search.opinion",
+                    ),
+                ),
+            ],
+            options={
+                "indexes": [
+                    models.Index(
+                        fields=["volume", "reporter", "page"],
+                        name="citations_u_volume_da4d25_idx",
+                    )
+                ],
+                # A citing opinion can hold a given (volume, reporter, page)
+                # citation at most once.
+                "unique_together": {
+                    ("citing_opinion", "volume", "reporter", "page")
+                },
+            },
+        ),
+    ]
diff --git a/cl/citations/migrations/__init__.py b/cl/citations/migrations/__init__.py
diff --git a/cl/citations/models.py b/cl/citations/models.py
@@ -0,0 +1,96 @@
+from django.db import models
+from eyecite.models import FullCaseCitation
+
+from cl.citations.utils import map_reporter_db_cite_type
+from cl.search.models import BaseCitation, Opinion
+
+
+class UnmatchedCitation(BaseCitation):
+    """Keep track of citations that could not be resolved to a cluster on the
+    batch citator run
+
+    Each row ties a citing Opinion to one citation and records, through
+    `status`, how far the resolution of that citation has progressed.
+    The volume, reporter, page and type fields are not declared here;
+    presumably they are inherited from BaseCitation (verify in
+    cl.search.models).
+    """
+
+    # Values stored in the `status` field; STATUS pairs each one with its
+    # human readable description.
+    UNMATCHED = 1
+    FOUND = 2
+    RESOLVED = 3
+    FAILED_AMBIGUOUS = 4
+    FAILED = 5
+    STATUS = (
+        (
+            UNMATCHED,
+            "The citation does not exist in the search_citation table."
+            " We couldn't match the citation to a cluster on the "
+            " previous citation extractor run",
+        ),
+        (
+            FOUND,
+            "The citation exists on the search_citation table. We "
+            " haven't updated the citing Opinion.html_with_citations yet",
+        ),
+        (
+            RESOLVED,
+            "The citing Opinion.html_with_citations was updated successfully",
+        ),
+        (
+            FAILED_AMBIGUOUS,
+            "The citing Opinion.html_with_citations update "
+            "failed because the citation is ambiguous",
+        ),
+        (FAILED, "The citing Opinion.html_with_citations update failed"),
+    )
+    # Deleting the citing opinion also deletes its unmatched citations
+    citing_opinion = models.ForeignKey(
+        Opinion,
+        help_text="The opinion citing this citation",
+        on_delete=models.CASCADE,
+        related_name="eyecite_citations",
+    )
+    status = models.SmallIntegerField(
+        help_text="Status of resolution of the initially unmatched citation",
+        choices=STATUS,
+    )
+    citation_string = models.TextField(
+        help_text="The unparsed citation string in case it doesn't match the "
+        "regular citation model in BaseCitation"
+    )
+    court_id = models.TextField(
+        help_text="A court_id as identified by eyecite from the opinion's "
+        "context. May be useful to know where to find missing citations"
+    )
+    year = models.TextField(
+        help_text="A year identified by eyecite from the opinion's context"
+    )
+
+    class Meta:
+        indexes = [
+            models.Index(
+                fields=["volume", "reporter", "page"],
+            )
+        ]
+        # A citing opinion can hold a given (volume, reporter, page)
+        # citation at most once
+        unique_together = (("citing_opinion", "volume", "reporter", "page"),)
+
+    @classmethod
+    def create_from_eyecite(
+        cls, eyecite_citation: FullCaseCitation, citing_opinion: Opinion
+    ):
+        """
+        Create an UnmatchedCitation instance using an eyecite FullCaseCitation
+
+        Saving is left to the caller
+
+        :param eyecite_citation: a FullCaseCitation as returned by
+            eyecite.get_citations
+        :param citing_opinion: the opinion which uses the citation
+        :return: an unsaved instance of this class with status UNMATCHED
+        """
+        # The cite type is read from the first edition listed for the
+        # citation and converted with map_reporter_db_cite_type
+        cite_type_str = eyecite_citation.all_editions[0].reporter.cite_type
+        return cls(
+            citing_opinion=citing_opinion,
+            status=cls.UNMATCHED,
+            citation_string=eyecite_citation.matched_text(),
+            # court and year are stored as empty strings when eyecite's
+            # metadata has no (truthy) value for them
+            court_id=eyecite_citation.metadata.court or "",
+            year=eyecite_citation.metadata.year or "",
+            volume=eyecite_citation.groups["volume"],
+            reporter=eyecite_citation.corrected_reporter(),
+            page=eyecite_citation.groups["page"],
+            type=map_reporter_db_cite_type(cite_type_str),
+        )
diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py
@@ -5,7 +5,7 @@
 from django.db.models import F
 from django.db.models.query import QuerySet
 from eyecite import get_citations
-from eyecite.models import CitationBase
+from eyecite.models import CitationBase, FullCaseCitation
 from eyecite.tokenizers import HyperscanTokenizer
 
 from cl.celery_init import app
@@ -21,6 +21,7 @@
     NO_MATCH_RESOURCE,
     do_resolve_citations,
 )
+from cl.citations.models import UnmatchedCitation
 from cl.citations.parenthetical_utils import create_parenthetical_groups
 from cl.citations.recap_citations import store_recap_citations
 from cl.citations.score_parentheticals import parenthetical_score
@@ -131,7 +132,8 @@ def store_opinion_citations_and_update_parentheticals(
     opinion: Opinion,
 ) -> None:
     """
-    Updates counts of citations to other opinions within a given court opinion, as well as parenthetical info for the cited opinions.
+    Updates counts of citations to other opinions within a given court opinion,
+    parenthetical info for the cited opinions, and stores unmatched citations
 
     :param opinion: A search.Opinion object.
     :return: None
@@ -160,8 +162,8 @@ def store_opinion_citations_and_update_parentheticals(
         opinion, citation_resolutions
     )
 
-    # Delete the unmatched citations
-    citation_resolutions.pop(NO_MATCH_RESOURCE, None)
+    # Put apart the unmatched citations
+    unmatched_citations = citation_resolutions.pop(NO_MATCH_RESOURCE, [])
 
     # Increase the citation count for the cluster of each matched opinion
     # if that cluster has not already been cited by this opinion. First,
@@ -205,6 +207,12 @@ def store_opinion_citations_and_update_parentheticals(
                     )
                 )
 
+    # If the opinion has been processed previously, we update its
+    # associated UnmatchedCitations.status. If not, we store them all
+    update_unmatched_status = UnmatchedCitation.objects.filter(
+        citing_opinion=opinion
+    ).exists()
+
     # Finally, commit these changes to the database in a single
     # transcation block.
     with transaction.atomic():
@@ -215,6 +223,11 @@ def store_opinion_citations_and_update_parentheticals(
             citation_count=F("citation_count") + 1
         )
 
+        if update_unmatched_status:
+            update_unmatched_citations_status(citation_resolutions, opinion)
+        else:
+            store_unmatched_citations(unmatched_citations, opinion)
+
         # Nuke existing citations and parentheticals
         OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()
         Parenthetical.objects.filter(describing_opinion_id=opinion.pk).delete()
@@ -249,3 +262,56 @@ def store_opinion_citations_and_update_parentheticals(
     index_related_cites_fields.delay(
         OpinionsCited.__name__, opinion.pk, cluster_ids_to_update
     )
+
+
+def update_unmatched_citations_status(
+    citation_resolutions: Dict[
+        MatchedResourceType, List[SupportedCitationType]
+    ],
+    citing_opinion: Opinion,
+) -> None:
+    """Refresh the status of the opinion's stored UnmatchedCitations
+
+    Every stored citation of the opinion whose status is neither UNMATCHED
+    nor RESOLVED is marked RESOLVED when its citation_string is the matched
+    text of one of the resolved citations, and FAILED otherwise. Rows that
+    failed on an earlier run are therefore tried again.
+
+    We assume no new UnmatchedCitations will be created after the first run
+
+    :param citation_resolutions: dict whose values are resolved citations
+    :param citing_opinion: the opinion
+    :return None:
+    """
+    # Collect the matched text of every citation that got resolved
+    resolved_texts = set()
+    for citations in citation_resolutions.values():
+        for citation in citations:
+            resolved_texts.add(citation.matched_text())
+
+    skipped_statuses = [
+        UnmatchedCitation.UNMATCHED,
+        UnmatchedCitation.RESOLVED,
+    ]
+    pending = UnmatchedCitation.objects.filter(
+        citing_opinion=citing_opinion
+    ).exclude(status__in=skipped_statuses)
+
+    for unmatched in pending:
+        was_resolved = unmatched.citation_string in resolved_texts
+        unmatched.status = (
+            UnmatchedCitation.RESOLVED
+            if was_resolved
+            else UnmatchedCitation.FAILED
+        )
+        unmatched.save()
+
+
+def store_unmatched_citations(
+    unmatched_citations: List[CitationBase], opinion: Opinion
+) -> None:
+    """Bulk create UnmatchedCitation instances cited by an opinion
+
+    Only FullCaseCitations provide useful information for resolution
+    updates. Other types are discarded
+
+    UnmatchedCitation is unique on (citing_opinion, volume, reporter, page),
+    and an opinion may contain the same full citation more than once. Only
+    the first occurrence of each (volume, reporter, page) is stored, so the
+    bulk insert cannot violate that constraint and abort the caller's
+    transaction.
+
+    :param unmatched_citations: citations that could not be matched
+    :param opinion: the citing opinion
+    :return None:
+    """
+    unmatched_citations_to_store = []
+    seen_keys = set()
+    for unmatched_citation in unmatched_citations:
+        if not isinstance(unmatched_citation, FullCaseCitation):
+            continue
+
+        citation = UnmatchedCitation.create_from_eyecite(
+            unmatched_citation, opinion
+        )
+        # The citing opinion is the same for every row, so these three
+        # values are enough to identify a duplicate
+        key = (str(citation.volume), citation.reporter, citation.page)
+        if key in seen_keys:
+            continue
+        seen_keys.add(key)
+        unmatched_citations_to_store.append(citation)
+
+    UnmatchedCitation.objects.bulk_create(unmatched_citations_to_store)