Skip to content

Commit

Permalink
feat(citations): add UnmatchedCitation model and logic
Browse files Browse the repository at this point in the history
Solves #4920

- Add new model UnmatchedCitation on citations app
- refactor cl.search.models.Citation to create a BaseCitation abstract model to reuse on the UnmatchedCitation model
- updates cl.citations.tasks.store_opinion_citations_and_update_parentheticals to handle storing and updating unmatched citations
- updates cl.search.signals to update UnmatchedCitation status when a new Citation is saved
- add tests
  • Loading branch information
grossir committed Jan 21, 2025
1 parent 4e18f5a commit 78468a9
Show file tree
Hide file tree
Showing 8 changed files with 453 additions and 12 deletions.
4 changes: 4 additions & 0 deletions cl/citations/management/commands/find_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from django.core.management import CommandError
from django.core.management.base import CommandParser

from cl.citations.models import UnmatchedCitation
from cl.citations.tasks import (
find_citations_and_parentheticals_for_opinion_by_pks,
)
Expand Down Expand Up @@ -112,6 +113,9 @@ def handle(self, *args: List[str], **options: OptionsType) -> None:
query = query.filter(date_modified__gte=options["modified_after"])
if options.get("all"):
query = Opinion.objects.all()
sys.stdout.write("Deleting all UnmatchedCitation rows")
UnmatchedCitation.objects.all().delete()

self.count = query.count()
self.average_per_s = 0.0
self.timings: List[float] = []
Expand Down
153 changes: 153 additions & 0 deletions cl/citations/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Generated by Django 5.1.4 on 2025-01-21 03:45

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    # Creates the table backing the UnmatchedCitation model: one row per
    # citation found in an opinion that could not be matched to a cluster.

    # First migration of this app
    initial = True

    # The ForeignKey below targets "search.opinion", so the search app's
    # migrations up to 0037 have to be applied first
    dependencies = [
        ("search", "0037_alter_citation_type_noop"),
    ]

    operations = [
        migrations.CreateModel(
            name="UnmatchedCitation",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # volume / reporter / page / type: the parsed parts of the
                # citation
                (
                    "volume",
                    models.SmallIntegerField(
                        help_text="The volume of the reporter"
                    ),
                ),
                (
                    "reporter",
                    models.TextField(
                        db_index=True,
                        help_text="The abbreviation for the reporter",
                    ),
                ),
                (
                    "page",
                    models.TextField(
                        help_text="The 'page' of the citation in the reporter. Unfortunately, this is not an integer, but is a string-type because several jurisdictions do funny things with the so-called 'page'. For example, we have seen Roman numerals in Nebraska, 13301-M in Connecticut, and 144M in Montana."
                    ),
                ),
                (
                    "type",
                    models.SmallIntegerField(
                        choices=[
                            (1, "A federal reporter citation (e.g. 5 F. 55)"),
                            (
                                2,
                                "A citation in a state-based reporter (e.g. Alabama Reports)",
                            ),
                            (
                                3,
                                "A citation in a regional reporter (e.g. Atlantic Reporter)",
                            ),
                            (
                                4,
                                "A citation in a specialty reporter (e.g. Lawyers' Edition)",
                            ),
                            (
                                5,
                                "A citation in an early SCOTUS reporter (e.g. 5 Black. 55)",
                            ),
                            (
                                6,
                                "A citation in the Lexis system (e.g. 5 LEXIS 55)",
                            ),
                            (
                                7,
                                "A citation in the WestLaw system (e.g. 5 WL 55)",
                            ),
                            (8, "A vendor neutral citation (e.g. 2013 FL 1)"),
                            (
                                9,
                                "A law journal citation within a scholarly or professional legal periodical (e.g. 95 Yale L.J. 5; 72 Soc.Sec.Rep.Serv. 318)",
                            ),
                        ],
                        help_text="The type of citation that this is.",
                    ),
                ),
                # Resolution state of the citation; the integer values are
                # the ones stored in the database
                (
                    "status",
                    models.SmallIntegerField(
                        choices=[
                            (
                                1,
                                "The citation does not exist in the search_citation table. We couldn't match the citation to a cluster on the previous citation extractor run",
                            ),
                            (
                                2,
                                "The citation exists on the search_citation table. We haven't updated the citing Opinion.html_with_citations yet",
                            ),
                            (
                                3,
                                "The citing Opinion.html_with_citations was updated successfully",
                            ),
                            (
                                4,
                                "The citing Opinion.html_with_citations update failed because the citation is ambiguous",
                            ),
                            (
                                5,
                                "The citing Opinion.html_with_citations update failed",
                            ),
                        ],
                        help_text="Status of resolution of the initially unmatched citation",
                    ),
                ),
                (
                    "citation_string",
                    models.TextField(
                        help_text="The unparsed citation string in case it doesn't match the regular citation model in BaseCitation"
                    ),
                ),
                # court_id and year are free text columns, not references to
                # other tables
                (
                    "court_id",
                    models.TextField(
                        help_text="A court_id as identified by eyecite from the opinion's context. May be useful to know where to find missing citations"
                    ),
                ),
                (
                    "year",
                    models.TextField(
                        help_text="A year identified by eyecite from the opinion's context"
                    ),
                ),
                # Deleting the citing opinion removes its unmatched citations
                (
                    "citing_opinion",
                    models.ForeignKey(
                        help_text="The opinion citing this citation",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="eyecite_citations",
                        to="search.opinion",
                    ),
                ),
            ],
            options={
                # Composite index over the citation's parsed parts
                "indexes": [
                    models.Index(
                        fields=["volume", "reporter", "page"],
                        name="citations_u_volume_da4d25_idx",
                    )
                ],
                # At most one row per citation for each citing opinion
                "unique_together": {
                    ("citing_opinion", "volume", "reporter", "page")
                },
            },
        ),
    ]
Empty file.
96 changes: 96 additions & 0 deletions cl/citations/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from django.db import models
from eyecite.models import FullCaseCitation

from cl.citations.utils import map_reporter_db_cite_type
from cl.search.models import BaseCitation, Opinion


class UnmatchedCitation(BaseCitation):
    """Keep track of citations that could not be resolved to a cluster on the
    batch citator run

    Each row links one unresolved citation to the opinion citing it and
    records, through ``status``, how far its resolution has progressed.
    """

    # Values stored in the `status` column
    UNMATCHED = 1
    FOUND = 2
    RESOLVED = 3
    FAILED_AMBIGUOUS = 4
    FAILED = 5
    # Adjacent string literals are concatenated verbatim, so each fragment
    # boundary carries exactly one separating space
    STATUS = (
        (
            UNMATCHED,
            "The citation does not exist in the search_citation table. "
            "We couldn't match the citation to a cluster on the "
            "previous citation extractor run",
        ),
        (
            FOUND,
            "The citation exists on the search_citation table. We "
            "haven't updated the citing Opinion.html_with_citations yet",
        ),
        (
            RESOLVED,
            "The citing Opinion.html_with_citations was updated successfully",
        ),
        (
            FAILED_AMBIGUOUS,
            "The citing Opinion.html_with_citations update "
            "failed because the citation is ambiguous",
        ),
        (FAILED, "The citing Opinion.html_with_citations update failed"),
    )
    citing_opinion = models.ForeignKey(
        Opinion,
        help_text="The opinion citing this citation",
        on_delete=models.CASCADE,
        related_name="eyecite_citations",
    )
    status = models.SmallIntegerField(
        help_text="Status of resolution of the initially unmatched citation",
        choices=STATUS,
    )
    citation_string = models.TextField(
        help_text="The unparsed citation string in case it doesn't match the "
        "regular citation model in BaseCitation"
    )
    court_id = models.TextField(
        help_text="A court_id as identified by eyecite from the opinion's "
        "context. May be useful to know where to find missing citations"
    )
    year = models.TextField(
        help_text="A year identified by eyecite from the opinion's context"
    )

    class Meta:
        indexes = [
            models.Index(
                fields=["volume", "reporter", "page"],
            )
        ]
        # An opinion holds at most one row per (volume, reporter, page);
        # callers creating rows in bulk must not pass duplicates
        unique_together = (("citing_opinion", "volume", "reporter", "page"),)

    @classmethod
    def create_from_eyecite(
        cls, eyecite_citation: FullCaseCitation, citing_opinion: Opinion
    ) -> "UnmatchedCitation":
        """
        Create an UnmatchedCitation instance using an eyecite FullCaseCitation

        Saving is left to the caller

        :param eyecite_citation: a FullCaseCitation as returned by
            eyecite.get_citations
        :param citing_opinion: the opinion which uses the citation
        :return: an unsaved UnmatchedCitation with status UNMATCHED
        """
        # The cite type is read from the first edition of the citation
        cite_type_str = eyecite_citation.all_editions[0].reporter.cite_type
        return cls(
            citing_opinion=citing_opinion,
            status=cls.UNMATCHED,
            citation_string=eyecite_citation.matched_text(),
            # Missing metadata is stored as an empty string
            court_id=eyecite_citation.metadata.court or "",
            year=eyecite_citation.metadata.year or "",
            # volume, reporter, page and type are not declared in this class;
            # presumably they come from BaseCitation
            volume=eyecite_citation.groups["volume"],
            reporter=eyecite_citation.corrected_reporter(),
            page=eyecite_citation.groups["page"],
            type=map_reporter_db_cite_type(cite_type_str),
        )
74 changes: 70 additions & 4 deletions cl/citations/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from django.db.models import F
from django.db.models.query import QuerySet
from eyecite import get_citations
from eyecite.models import CitationBase
from eyecite.models import CitationBase, FullCaseCitation
from eyecite.tokenizers import HyperscanTokenizer

from cl.celery_init import app
Expand All @@ -21,6 +21,7 @@
NO_MATCH_RESOURCE,
do_resolve_citations,
)
from cl.citations.models import UnmatchedCitation
from cl.citations.parenthetical_utils import create_parenthetical_groups
from cl.citations.recap_citations import store_recap_citations
from cl.citations.score_parentheticals import parenthetical_score
Expand Down Expand Up @@ -131,7 +132,8 @@ def store_opinion_citations_and_update_parentheticals(
opinion: Opinion,
) -> None:
"""
Updates counts of citations to other opinions within a given court opinion, as well as parenthetical info for the cited opinions.
Updates counts of citations to other opinions within a given court opinion,
parenthetical info for the cited opinions, and stores unmatched citations
:param opinion: A search.Opinion object.
:return: None
Expand Down Expand Up @@ -160,8 +162,8 @@ def store_opinion_citations_and_update_parentheticals(
opinion, citation_resolutions
)

# Delete the unmatched citations
citation_resolutions.pop(NO_MATCH_RESOURCE, None)
# Put apart the unmatched citations
unmatched_citations = citation_resolutions.pop(NO_MATCH_RESOURCE, [])

# Increase the citation count for the cluster of each matched opinion
# if that cluster has not already been cited by this opinion. First,
Expand Down Expand Up @@ -205,6 +207,12 @@ def store_opinion_citations_and_update_parentheticals(
)
)

# If the opinion has been processed previously, we update its
# associated UnmatchedCitations.status. If not, we store them all
update_unmatched_status = UnmatchedCitation.objects.filter(
citing_opinion=opinion
).exists()

# Finally, commit these changes to the database in a single
# transaction block.
with transaction.atomic():
Expand All @@ -215,6 +223,11 @@ def store_opinion_citations_and_update_parentheticals(
citation_count=F("citation_count") + 1
)

if update_unmatched_status:
update_unmatched_citations_status(citation_resolutions, opinion)
else:
store_unmatched_citations(unmatched_citations, opinion)

# Nuke existing citations and parentheticals
OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()
Parenthetical.objects.filter(describing_opinion_id=opinion.pk).delete()
Expand Down Expand Up @@ -249,3 +262,56 @@ def store_opinion_citations_and_update_parentheticals(
index_related_cites_fields.delay(
OpinionsCited.__name__, opinion.pk, cluster_ids_to_update
)


def update_unmatched_citations_status(
    citation_resolutions: Dict[
        MatchedResourceType, List[SupportedCitationType]
    ],
    citing_opinion: Opinion,
) -> None:
    """Refresh the status of the UnmatchedCitations stored for an opinion

    Every stored row whose status is neither UNMATCHED nor RESOLVED is
    re-evaluated: it becomes RESOLVED when its citation string is among the
    citations resolved on this run, and FAILED otherwise. Rows that failed
    on an earlier run are therefore retried.

    We assume no new UnmatchedCitations will be created after the first run

    :param citation_resolutions: dict whose values are resolved citations
    :param citing_opinion: the opinion
    :return None:
    """
    # Collect the matched text of everything resolved on this run
    resolved_texts = set()
    for citations in citation_resolutions.values():
        for citation in citations:
            resolved_texts.add(citation.matched_text())

    skipped_statuses = [
        UnmatchedCitation.UNMATCHED,
        UnmatchedCitation.RESOLVED,
    ]
    pending_rows = UnmatchedCitation.objects.filter(
        citing_opinion=citing_opinion
    ).exclude(status__in=skipped_statuses)

    for row in pending_rows:
        row.status = (
            UnmatchedCitation.RESOLVED
            if row.citation_string in resolved_texts
            else UnmatchedCitation.FAILED
        )
        row.save()


def store_unmatched_citations(
    unmatched_citations: List[CitationBase], opinion: Opinion
) -> None:
    """Bulk create UnmatchedCitation instances cited by an opinion

    Only FullCaseCitations provide useful information for resolution
    updates. Other types are discarded. Citations that could not be stored
    are discarded too: those without a page, those whose volume is missing
    or not a plain number, and repeats of a citation already collected for
    this opinion.

    :param unmatched_citations: citations that could not be resolved
    :param opinion: the citing opinion
    :return None:
    """
    # UnmatchedCitation is unique on (citing_opinion, volume, reporter, page).
    # An opinion citing the same case twice would otherwise make bulk_create
    # raise an IntegrityError
    seen_keys = set()
    unmatched_citations_to_store = []

    for unmatched_citation in unmatched_citations:
        if not isinstance(unmatched_citation, FullCaseCitation):
            continue

        groups = unmatched_citation.groups
        volume = groups.get("volume")
        # `volume` is stored in an integer column and `page` is required;
        # skip citations that cannot satisfy either
        if not volume or not str(volume).isdecimal() or not groups.get("page"):
            continue

        citation_object = UnmatchedCitation.create_from_eyecite(
            unmatched_citation, opinion
        )
        # Compare volumes as integers so "01" and "1" count as the same
        key = (int(volume), citation_object.reporter, citation_object.page)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unmatched_citations_to_store.append(citation_object)

    UnmatchedCitation.objects.bulk_create(unmatched_citations_to_store)
Loading

0 comments on commit 78468a9

Please sign in to comment.