From b701bc05516840bf9eb1793f304674c4085595e5 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 15 Jun 2023 14:05:55 -0400 Subject: [PATCH 01/50] feat(search.models): Add django-ordered-model Add django-ordered-model Add django-ordered-model to Opinions Update poetry --- cl/search/migrations/0019_order_opinions.py | 71 ++++++++++ cl/search/migrations/0019_order_opinions.sql | 129 +++++++++++++++++++ cl/search/models.py | 4 +- cl/settings/django.py | 1 + poetry.lock | 13 +- pyproject.toml | 1 + 6 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 cl/search/migrations/0019_order_opinions.py create mode 100644 cl/search/migrations/0019_order_opinions.sql diff --git a/cl/search/migrations/0019_order_opinions.py b/cl/search/migrations/0019_order_opinions.py new file mode 100644 index 0000000000..5e446056cc --- /dev/null +++ b/cl/search/migrations/0019_order_opinions.py @@ -0,0 +1,71 @@ +# Generated by Django 4.2.1 on 2023-06-15 17:56 + +from django.db import migrations, models +import pgtrigger.compiler +import pgtrigger.migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("search", "0018_update_cluster_model"), + ] + + operations = [ + migrations.AlterModelOptions( + name="opinion", + options={"ordering": ("order",)}, + ), + pgtrigger.migrations.RemoveTrigger( + model_name="opinion", + name="update_or_delete_snapshot_delete", + ), + pgtrigger.migrations.RemoveTrigger( + model_name="opinion", + name="update_or_delete_snapshot_update", + ), + migrations.AddField( + model_name="opinion", + name="order", + field=models.PositiveIntegerField( + db_index=True, default=1, editable=False, verbose_name="order" + ), + preserve_default=False, + ), + migrations.AddField( + model_name="opinionevent", + name="order", + field=models.PositiveIntegerField( + default=1, editable=False, verbose_name="order" + ), + preserve_default=False, + ), + pgtrigger.migrations.AddTrigger( + model_name="opinion", + trigger=pgtrigger.compiler.Trigger( + name="update_or_delete_snapshot_update", + sql=pgtrigger.compiler.UpsertTriggerSql( + condition='WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."order" IS DISTINCT FROM (NEW."order") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr"))', + func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", 
"page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', + hash="bcac41027f469bbd394e8671cb0b2fa33e7035f3", + operation="UPDATE", + pgid="pgtrigger_update_or_delete_snapshot_update_67ecd", + table="search_opinion", + when="AFTER", + ), + ), + ), + pgtrigger.migrations.AddTrigger( + model_name="opinion", + trigger=pgtrigger.compiler.Trigger( + name="update_or_delete_snapshot_delete", + sql=pgtrigger.compiler.UpsertTriggerSql( + func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', + hash="79bebd7cda3c6ed3bc40f28799cf9c0f2638e2ad", + operation="DELETE", + pgid="pgtrigger_update_or_delete_snapshot_delete_1f4fd", + table="search_opinion", + when="AFTER", + ), + ), + ), + ] diff --git a/cl/search/migrations/0019_order_opinions.sql b/cl/search/migrations/0019_order_opinions.sql new file mode 100644 index 0000000000..3226cb510b --- /dev/null +++ b/cl/search/migrations/0019_order_opinions.sql @@ -0,0 +1,129 @@ +BEGIN; +-- +-- Change Meta options on opinion +-- +-- (no-op) +-- +-- Remove trigger update_or_delete_snapshot_delete from model opinion +-- +DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion"; +-- +-- Remove trigger update_or_delete_snapshot_update from model opinion +-- +DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion"; +-- +-- Add field order to opinion +-- +ALTER TABLE "search_opinion" ADD COLUMN "order" integer DEFAULT 1 NOT NULL CHECK ("order" >= 0); +ALTER TABLE "search_opinion" ALTER COLUMN "order" DROP DEFAULT; +-- +-- Add field order to opinionevent +-- +ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer DEFAULT 1 NOT NULL CHECK ("order" >= 0); +ALTER TABLE "search_opinionevent" ALTER COLUMN "order" DROP DEFAULT; +-- +-- Create trigger update_or_delete_snapshot_update on model opinion +-- + + CREATE OR REPLACE FUNCTION "public"._pgtrigger_should_ignore( + trigger_name NAME + ) + RETURNS BOOLEAN AS $$ + DECLARE + _pgtrigger_ignore TEXT[]; + _result BOOLEAN; + BEGIN + BEGIN + SELECT INTO _pgtrigger_ignore + CURRENT_SETTING('pgtrigger.ignore'); + EXCEPTION WHEN OTHERS THEN + END; + IF 
_pgtrigger_ignore IS NOT NULL THEN + SELECT trigger_name = ANY(_pgtrigger_ignore) + INTO _result; + RETURN _result; + ELSE + RETURN FALSE; + END IF; + END; + $$ LANGUAGE plpgsql; + + CREATE OR REPLACE FUNCTION pgtrigger_update_or_delete_snapshot_update_67ecd() + RETURNS TRIGGER AS $$ + + BEGIN + IF ("public"._pgtrigger_should_ignore(TG_NAME) IS TRUE) THEN + IF (TG_OP = 'DELETE') THEN + RETURN OLD; + ELSE + RETURN NEW; + END IF; + END IF; + INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; + END; + $$ LANGUAGE plpgsql; + + DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion"; + CREATE TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd + AFTER UPDATE ON "search_opinion" + + + FOR EACH ROW WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."order" IS DISTINCT FROM (NEW."order") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr")) + EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_update_67ecd(); + + COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion" IS 'bcac41027f469bbd394e8671cb0b2fa33e7035f3'; + +-- +-- Create trigger update_or_delete_snapshot_delete on model opinion +-- + + CREATE OR REPLACE FUNCTION "public"._pgtrigger_should_ignore( + trigger_name NAME + ) + RETURNS BOOLEAN AS $$ + DECLARE + _pgtrigger_ignore TEXT[]; + _result BOOLEAN; + BEGIN + BEGIN + SELECT INTO _pgtrigger_ignore + CURRENT_SETTING('pgtrigger.ignore'); + EXCEPTION WHEN OTHERS THEN + END; + IF _pgtrigger_ignore IS NOT NULL THEN + SELECT trigger_name = ANY(_pgtrigger_ignore) + INTO _result; + RETURN _result; + ELSE + RETURN FALSE; + END IF; + END; + $$ LANGUAGE plpgsql; + + CREATE OR 
REPLACE FUNCTION pgtrigger_update_or_delete_snapshot_delete_1f4fd() + RETURNS TRIGGER AS $$ + + BEGIN + IF ("public"._pgtrigger_should_ignore(TG_NAME) IS TRUE) THEN + IF (TG_OP = 'DELETE') THEN + RETURN OLD; + ELSE + RETURN NEW; + END IF; + END IF; + INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; + END; + $$ LANGUAGE plpgsql; + + DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion"; + CREATE TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd + AFTER DELETE ON "search_opinion" + + + FOR EACH ROW + EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_delete_1f4fd(); + + COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion" IS '79bebd7cda3c6ed3bc40f28799cf9c0f2638e2ad'; + +CREATE INDEX "search_opinion_order_d54dd126" ON "search_opinion" ("order"); +COMMIT; diff --git a/cl/search/models.py b/cl/search/models.py index d04587edef..fc6aa75414 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -14,6 +14,7 @@ from django.utils.encoding import force_str from django.utils.text import slugify from eyecite import get_citations +from ordered_model.models import OrderedModel from cl.citations.utils import get_citation_depth_between_clusters from cl.custom_filters.templatetags.text_filters import best_case_name @@ -2815,7 +2816,7 @@ def sort_cites(c): @pghistory.track(AfterUpdateOrDeleteSnapshot()) -class Opinion(AbstractDateTimeModel): +class Opinion(OrderedModel, AbstractDateTimeModel): COMBINED = "010combined" UNANIMOUS = "015unamimous" LEAD = "020lead" @@ -2965,6 +2966,7 @@ class Opinion(AbstractDateTimeModel): default=False, db_index=True, ) + order_with_respect_to = "cluster" @property def siblings(self) -> QuerySet: diff --git a/cl/settings/django.py b/cl/settings/django.py index 21b1ba4a7c..a522d824df 100644 --- a/cl/settings/django.py +++ b/cl/settings/django.py @@ -162,6 +162,7 @@ "admin_cursor_paginator", "pghistory", "pgtrigger", + "ordered_model", # CourtListener Apps "cl.alerts", "cl.audio", diff --git a/poetry.lock b/poetry.lock index 7f91780e6a..b8f82b7eed 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1054,6 +1054,17 @@ files = [ {file = "django_mathfilters-1.0.0-py3-none-any.whl", hash = "sha256:64200a21bb249fbf27be601d4bbb788779e09c6e063170c097cd82c4d18ebb83"}, ] +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." 
+optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -4576,4 +4587,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11, <3.12" -content-hash = "674af32861e1e5bf9c31401f02a3af0b698be8b60b9492cd89ab5464218efd3e" +content-hash = "2b4d76ce134a241162a25c9634a4f9fdbf140d261750fdfca63a87ccbac4fcfd" diff --git a/pyproject.toml b/pyproject.toml index 91020cf1e0..ee00cd8366 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,6 +108,7 @@ types-dateparser = "^1.1.4.6" juriscraper = "^2.5.49" uvicorn = {extras = ["standard"], version = "^0.22.0"} daphne = "^4.0.0" +django-ordered-model = "^3.7.4" [tool.poetry.group.dev.dependencies] From 6cf0d7581be8241eda3d0b8b4a46833efb7de979 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 15 Jun 2023 14:57:38 -0400 Subject: [PATCH 02/50] fix(tests): Update fixtures for opinion model --- cl/search/fixtures/functest_opinions.json | 12 +++-- cl/search/fixtures/opinions-issue-412.json | 6 ++- cl/search/fixtures/opinions-issue-550.json | 6 ++- cl/search/fixtures/test_objects_search.json | 18 ++++--- .../fixtures/api_scotus_map_data.json | 6 ++- .../fixtures/scotus_map_data.json | 51 ++++++++++++------- 6 files changed, 66 insertions(+), 33 deletions(-) diff --git a/cl/search/fixtures/functest_opinions.json b/cl/search/fixtures/functest_opinions.json index e4fa89a260..45f5f0b759 100644 --- a/cl/search/fixtures/functest_opinions.json +++ b/cl/search/fixtures/functest_opinions.json @@ -64,7 +64,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 10 @@ -134,7 +135,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 11 @@ -184,7 +186,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 12 @@ -254,7 +257,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 12 diff --git a/cl/search/fixtures/opinions-issue-412.json b/cl/search/fixtures/opinions-issue-412.json index ca6ac33971..2e429ebecf 100644 --- a/cl/search/fixtures/opinions-issue-412.json +++ b/cl/search/fixtures/opinions-issue-412.json @@ -64,7 +64,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 10 @@ -134,7 +135,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 11 diff --git a/cl/search/fixtures/opinions-issue-550.json b/cl/search/fixtures/opinions-issue-550.json index b0163eb8f8..829a94c7d2 100644 --- a/cl/search/fixtures/opinions-issue-550.json +++ b/cl/search/fixtures/opinions-issue-550.json @@ -64,7 
+64,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 10 @@ -86,7 +87,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "020lead" + "type": "020lead", + "order": 1 }, "model": "search.opinion", "pk": 11 diff --git a/cl/search/fixtures/test_objects_search.json b/cl/search/fixtures/test_objects_search.json index 2255c7edcf..9fddb84fca 100644 --- a/cl/search/fixtures/test_objects_search.json +++ b/cl/search/fixtures/test_objects_search.json @@ -239,7 +239,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "020lead" + "type": "020lead", + "order": 1 }, "model": "search.opinion", "pk": 1 @@ -261,7 +262,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 2 @@ -283,7 +285,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 3 @@ -305,7 +308,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 4 @@ -327,7 +331,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 5 @@ -349,7 +354,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 6 diff --git a/cl/visualizations/fixtures/api_scotus_map_data.json b/cl/visualizations/fixtures/api_scotus_map_data.json index 5b4b19fe73..46dc2f9856 100644 --- a/cl/visualizations/fixtures/api_scotus_map_data.json +++ b/cl/visualizations/fixtures/api_scotus_map_data.json @@ -121,7 +121,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "020lead" + "type": "020lead", + "order": 1 }, "model": "search.opinion", "pk": 1 @@ -143,7 +144,8 @@ "date_created": "2015-08-15T14:10:56.801Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 2 diff --git a/cl/visualizations/fixtures/scotus_map_data.json b/cl/visualizations/fixtures/scotus_map_data.json index ce504fe2c9..a885e4df54 100644 --- a/cl/visualizations/fixtures/scotus_map_data.json +++ b/cl/visualizations/fixtures/scotus_map_data.json @@ -902,7 +902,8 @@ "date_created": "2016-02-16T19:49:54.525Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 111014 @@ -924,7 +925,8 @@ "date_created": "2016-02-16T19:49:54.545Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 111113 @@ -946,7 +948,8 @@ "date_created": "2016-02-16T19:49:54.565Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 111464 @@ -968,7 +971,8 @@ "date_created": "2016-02-16T19:49:54.610Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + 
"order": 1 }, "model": "search.opinion", "pk": 111505 @@ -990,7 +994,8 @@ "date_created": "2016-02-16T19:49:54.629Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 111924 @@ -1012,7 +1017,8 @@ "date_created": "2016-02-16T19:49:54.575Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 112331 @@ -1034,7 +1040,8 @@ "date_created": "2016-02-16T19:49:54.537Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 112646 @@ -1056,7 +1063,8 @@ "date_created": "2016-02-16T19:49:54.583Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 112779 @@ -1078,7 +1086,8 @@ "date_created": "2016-02-16T19:49:54.592Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 112874 @@ -1100,7 +1109,8 @@ "date_created": "2016-02-16T19:49:54.602Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 117967 @@ -1122,7 +1132,8 @@ "date_created": "2016-02-16T19:49:54.553Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 118377 @@ -1144,7 +1155,8 @@ "date_created": "2016-02-16T19:49:54.621Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 121168 @@ -1166,7 +1178,8 @@ "date_created": "2016-02-16T19:49:54.658Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 136984 @@ -1188,7 +1201,8 @@ "date_created": "2016-02-16T19:49:54.647Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 142900 @@ -1210,7 +1224,8 @@ "date_created": "2016-02-16T19:49:54.666Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 799990 @@ -1232,7 +1247,8 @@ "date_created": "2016-02-16T19:49:54.636Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 799993 @@ -1254,7 +1270,8 @@ "date_created": "2016-02-16T19:49:54.513Z", "html_lawbox": "", "per_curiam": false, - "type": "010combined" + "type": "010combined", + "order": 1 }, "model": "search.opinion", "pk": 2674862 From 05e9d9856b543579c28de371dcf8823c8ee7e666 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 15 Jun 2023 15:23:23 -0400 Subject: [PATCH 03/50] fix(tests): Update fixtures for opinion model Take 2 --- .../fixtures/test_objects_query_counts.json | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cl/search/fixtures/test_objects_query_counts.json b/cl/search/fixtures/test_objects_query_counts.json index aa909b2fb2..b51117602a 100644 --- a/cl/search/fixtures/test_objects_query_counts.json +++ b/cl/search/fixtures/test_objects_query_counts.json @@ -300,7 +300,8 @@ "date_created":"2015-08-15T14:10:56.801Z", "html_lawbox":"", "per_curiam":false, - "type":"020lead" + "type":"020lead", + "order": 1 }, 
"model":"search.opinion", "pk":1 @@ -324,7 +325,8 @@ "date_created":"2015-08-15T14:10:56.801Z", "html_lawbox":"", "per_curiam":false, - "type":"010combined" + "type":"010combined", + "order": 1 }, "model":"search.opinion", "pk":2 @@ -348,7 +350,8 @@ "date_created":"2015-08-15T14:10:56.801Z", "html_lawbox":"", "per_curiam":false, - "type":"010combined" + "type":"010combined", + "order": 1 }, "model":"search.opinion", "pk":3 @@ -371,7 +374,8 @@ "date_created":"2015-08-15T14:10:56.801Z", "html_lawbox":"", "per_curiam":false, - "type":"010combined" + "type":"010combined", + "order": 1 }, "model":"search.opinion", "pk":4 @@ -395,7 +399,8 @@ "date_created":"2015-08-15T14:10:56.801Z", "html_lawbox":"", "per_curiam":false, - "type":"010combined" + "type":"010combined", + "order": 1 }, "model":"search.opinion", "pk":5 @@ -418,7 +423,8 @@ "date_created":"2015-08-15T14:10:56.801Z", "html_lawbox":"", "per_curiam":false, - "type":"010combined" + "type":"010combined", + "order": 1 }, "model":"search.opinion", "pk":6 From b0fc70a56055699c551b59a3ed38a005459905e3 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 16 Jun 2023 12:46:45 -0400 Subject: [PATCH 04/50] feat(models): Override django-ordered-model default By default it sorts by order - so if we dont want that feature we simply need to override the django order with a custom ordered manager in on the opinion class. (I think) --- cl/search/models.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cl/search/models.py b/cl/search/models.py index fc6aa75414..7fc2c03458 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -14,7 +14,7 @@ from django.utils.encoding import force_str from django.utils.text import slugify from eyecite import get_citations -from ordered_model.models import OrderedModel +from ordered_model.models import OrderedModel, OrderedModelManager from cl.citations.utils import get_citation_depth_between_clusters from cl.custom_filters.templatetags.text_filters import best_case_name @@ -2815,6 +2815,13 @@ def sort_cites(c): return 8 +class CustomOrderedManager(OrderedModelManager): + """Override the django ordered model default ordering""" + + def get_queryset(self): + return super().get_queryset().order_by() + + @pghistory.track(AfterUpdateOrDeleteSnapshot()) class Opinion(OrderedModel, AbstractDateTimeModel): COMBINED = "010combined" @@ -2968,6 +2975,8 @@ class Opinion(OrderedModel, AbstractDateTimeModel): ) order_with_respect_to = "cluster" + objects = CustomOrderedManager() + @property def siblings(self) -> QuerySet: # These are other sub-opinions of the current cluster. 
From b8fa44563ac4bb42d6ad3020c604da8f8940f187 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 16 Jun 2023 13:44:51 -0400 Subject: [PATCH 05/50] fix(models): Different override for ordering on OP --- cl/search/models.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/cl/search/models.py b/cl/search/models.py index 7fc2c03458..be645bc5e8 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -14,7 +14,7 @@ from django.utils.encoding import force_str from django.utils.text import slugify from eyecite import get_citations -from ordered_model.models import OrderedModel, OrderedModelManager +from ordered_model.models import OrderedModel from cl.citations.utils import get_citation_depth_between_clusters from cl.custom_filters.templatetags.text_filters import best_case_name @@ -2815,13 +2815,6 @@ def sort_cites(c): return 8 -class CustomOrderedManager(OrderedModelManager): - """Override the django ordered model default ordering""" - - def get_queryset(self): - return super().get_queryset().order_by() - - @pghistory.track(AfterUpdateOrDeleteSnapshot()) class Opinion(OrderedModel, AbstractDateTimeModel): COMBINED = "010combined" @@ -2975,7 +2968,8 @@ class Opinion(OrderedModel, AbstractDateTimeModel): ) order_with_respect_to = "cluster" - objects = CustomOrderedManager() + class Meta: + ordering = () @property def siblings(self) -> QuerySet: From 7429eba0290bc2f931489b5799e90de318cd1512 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 28 Jul 2023 19:30:18 -0600 Subject: [PATCH 06/50] fix(poetry): Fix merge conflicts --- poetry.lock | 13 ++++++++++++- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index e034727158..24dc7977e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1072,6 +1072,17 @@ files = [ {file = "django_mathfilters-1.0.0-py3-none-any.whl", hash = "sha256:64200a21bb249fbf27be601d4bbb788779e09c6e063170c097cd82c4d18ebb83"}, ] +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." 
+optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -4690,4 +4701,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11, <3.12" -content-hash = "10446165560282337aada87c0f3a9324dc904777bbfcc0f7e35db5c9d13a10a9" +content-hash = "7c0448e0852dba4f13177892cc0e619e2b58470f4d82707d8069fbeceb1cb919" diff --git a/pyproject.toml b/pyproject.toml index f9d568defa..2caee093ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,7 @@ daphne = "^4.0.0" psycopg2 = "^2.9.6" juriscraper = "^2.5.51" httpx = {extras = ["http2"], version = "^0.24.1"} +django-ordered-model = "^3.7.4" [tool.poetry.group.dev.dependencies] From f45a093c6b02ed5ae4a1077062295fa25f1c4894 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 31 Jul 2023 19:43:51 -0600 Subject: [PATCH 07/50] fix(models): Add 'order' field as default ordering for Opinion model Test added for django-ordered-model library Optimize imports in search/tests.py --- cl/search/models.py | 2 +- cl/search/tests.py | 69 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/cl/search/models.py b/cl/search/models.py index 5024bdcc3d..e50987c3f6 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2986,7 +2986,7 @@ class Opinion(OrderedModel, AbstractDateTimeModel): order_with_respect_to = "cluster" class Meta: - ordering = () + ordering = ("order",) @property def siblings(self) -> QuerySet: diff --git a/cl/search/tests.py b/cl/search/tests.py index b5c71c9b16..c4edb24b31 100644 --- a/cl/search/tests.py +++ b/cl/search/tests.py @@ -5,7 +5,7 @@ from datetime import date from functools import reduce from pathlib import Path -from unittest import mock, skipUnless +from unittest import mock import pytz from asgiref.sync import sync_to_async @@ -19,9 +19,8 @@ from django.db import IntegrityError, transaction from django.http import HttpRequest from django.test import AsyncRequestFactory, override_settings -from django.test.utils import captured_stderr from django.urls import reverse -from elasticsearch_dsl import Q, connections +from elasticsearch_dsl import Q from factory import RelatedFactory from lxml import etree, html from rest_framework.status import HTTP_200_OK @@ -58,6 +57,7 @@ DocketFactory, OpinionClusterFactory, OpinionClusterFactoryWithChildrenAndParents, + OpinionFactory, OpinionsCitedWithParentsFactory, OpinionWithChildrenFactory, OpinionWithParentsFactory, @@ -283,6 +283,69 @@ def test_custom_manager_chained_filter(self) -> None: ) self.assertEqual(cluster_count, expected_count) + def test_opinions_order(self) -> None: + """Test django-ordered-model library""" + + # Create court + court = CourtFactory(id="nyappdiv") + + # Create cluster + cluster = OpinionClusterFactory( + case_name="Foo v. Bar", + case_name_short="Foo v. 
Bar", + docket=DocketFactory( + court=court, + ), + date_filed=date(1978, 3, 10), + source="U", + precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, + ) + + # Create three opinions + op_1 = OpinionFactory( + cluster=cluster, + type="Concurrence Opinion", + ) + + op_2 = OpinionFactory( + cluster=cluster, + type="Dissent", + ) + + op_3 = OpinionFactory( + cluster=cluster, + type="Lead Opinion", + ) + + # Test that the value of the order field matches the order in which + # they were created + self.assertEqual(op_1.order, 0) + self.assertEqual(op_2.order, 1) + self.assertEqual(op_3.order, 2) + + # Use library method to move lead opinion to first position, we can + # use this function to easily reorder existing opinions + op_3.to(0) + + # The position of the elements was modified, we refresh the objects + op_1.refresh_from_db() + op_2.refresh_from_db() + op_3.refresh_from_db() + + # Test new order + self.assertEqual(op_3.order, 0) + self.assertEqual(op_1.order, 1) + self.assertEqual(op_2.order, 2) + + # Add new opinion to cluster + op_4 = OpinionFactory( + cluster=cluster, + type="Dissent", + ) + + # Test that the new opinion is in last place + self.assertEqual(op_4.order, 3) + class DocketValidationTest(TestCase): @classmethod From 37dee19fcfacf95a79aac71c21ccc507d10289b4 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 30 Aug 2023 11:27:50 -0600 Subject: [PATCH 08/50] fix(opinion_order): fix merge conflicts with main --- poetry.lock | 17 ++++++++++++++--- pyproject.toml | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index a23818c13e..9b7321deb1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "amqp" @@ -1094,6 +1094,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." 
+optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -2718,7 +2729,7 @@ name = "ndg-httpsclient" version = "0.5.1" description = "Provides enhanced HTTPS support for httplib and urllib2 using PyOpenSSL" optional = false -python-versions = ">=2.7,<3.0.0 || >=3.4.0" +python-versions = ">=2.7,<3.0.dev0 || >=3.4.dev0" files = [ {file = "ndg_httpsclient-0.5.1-py2-none-any.whl", hash = "sha256:d2c7225f6a1c6cf698af4ebc962da70178a99bcde24ee6d1961c4f3338130d57"}, {file = "ndg_httpsclient-0.5.1-py3-none-any.whl", hash = "sha256:dd174c11d971b6244a891f7be2b32ca9853d3797a72edb34fa5d7b07d8fff7d4"}, @@ -5091,4 +5102,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11, <3.12" -content-hash = "5257a6d2a26b74054bac82d0c5700a55f1e2e2ec580608921e8a27a76d015f52" +content-hash = "46adbdc75bf4ad70aa4d6531f4d71a8f22f1e85ee9886408e921e7147aab7a36" diff --git a/pyproject.toml b/pyproject.toml index 87d6e90ff9..ef5970143f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,6 +113,7 @@ juriscraper = "^2.5.51" httpx = {extras = ["http2"], version = "^0.24.1"} django-model-utils = "^4.3.1" inflection = "^0.5.1" # necessary for DRF schema generation - remove after drf-spectacular +django-ordered-model = "^3.7.4" [tool.poetry.group.dev.dependencies] From 3b4cb06ef8724d5052f9868f8d77388acfe18be1 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 30 Aug 2023 11:55:03 -0600 Subject: [PATCH 09/50] fix(opinion_order): rename migrations --- .../{0019_order_opinions.py => 0020_order_opinions.py} | 2 +- .../{0019_order_opinions.sql => 0020_order_opinions.sql} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cl/search/migrations/{0019_order_opinions.py => 0020_order_opinions.py} (99%) rename cl/search/migrations/{0019_order_opinions.sql => 0020_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0019_order_opinions.py b/cl/search/migrations/0020_order_opinions.py similarity index 99% rename from cl/search/migrations/0019_order_opinions.py rename to cl/search/migrations/0020_order_opinions.py index 5e446056cc..f614156360 100644 --- a/cl/search/migrations/0019_order_opinions.py +++ b/cl/search/migrations/0020_order_opinions.py @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("search", "0018_update_cluster_model"), + ("search", "0019_add_docket_source_noop"), ] operations = [ diff --git a/cl/search/migrations/0019_order_opinions.sql b/cl/search/migrations/0020_order_opinions.sql similarity index 100% rename from cl/search/migrations/0019_order_opinions.sql rename to cl/search/migrations/0020_order_opinions.sql From 878b9479e9c95b429b16c6bd044a2315b6cce3f3 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 30 Aug 2023 17:11:30 -0600 Subject: [PATCH 10/50] feat(opinion_order): management command to update the order of harvard and columbia opinions --- .../commands/update_opinions_order.py | 598 ++++++++++++++++++ 1 file changed, 598 insertions(+) create mode 100644 cl/corpus_importer/management/commands/update_opinions_order.py diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py 
new file mode 100644 index 0000000000..f48de154a0 --- /dev/null +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -0,0 +1,598 @@ +import re +from typing import Any, Optional + +from bs4 import BeautifulSoup, NavigableString, Tag +from django.core.management import BaseCommand +from django.db.models import Count + +from cl.corpus_importer.utils import similarity_scores +from cl.lib.command_utils import logger +from cl.lib.string_diff import get_cosine_similarity +from cl.search.models import Opinion, OpinionCluster + +# TODO Should we add a flag to know that the cluster has been processed? + + +def match_text_lists( + file_opinions_list: list[str], cl_opinions_list: list[str] +) -> dict[int, Any]: + """Generate matching lists above threshold + :param file_opinions_list: Opinions from file + :param cl_opinions_list: CL opinions + :return: Matches if found or False + """ + # We import this here to avoid a circular import + from cl.corpus_importer.management.commands.harvard_opinions import ( + compare_documents, + ) + + scores = similarity_scores(file_opinions_list, cl_opinions_list) + + matches = {} + for i, row in enumerate(scores): + j = row.argmax() # type: ignore + # Lower threshold for small opinions. + if ( + get_cosine_similarity(file_opinions_list[i], cl_opinions_list[j]) + < 0.60 + ): + continue + percent_match = compare_documents( + file_opinions_list[i], cl_opinions_list[j] + ) + if percent_match < 60: + continue + matches[i] = j + + # Key is opinion position from file, Value is opinion position from cl opinion + # e.g. matches {0: 1, 1: 2} 0 is file opinion and 1 in cl opinion, 1 is file + # opinion and 2 is cl opinion + return matches + + +def get_opinion_content( + cluster_id, +) -> tuple[Optional[str], list[dict], int, bool]: + """Get the opinions content for a cluster object + :param cluster_id: Cluster ID for a set of opinions + :return: (xml path, list of extracted opinions, start position, True if combined + opinions exists in cluster) + """ + cl_cleaned_opinions = [] + # by default the opinions are ordered by pk + opinions_from_cluster = Opinion.objects.filter( + cluster_id=cluster_id + ).order_by("id") + combined_opinions_cluster = opinions_from_cluster.filter( + type="010combined" + ) + xml_path = None + combined_opinion = False + if combined_opinions_cluster: + # the combined opinion will be displayed at beginning + start_position = combined_opinions_cluster.count() + combined_opinion = True + else: + # we don't have combined opinions, we start ordering from 0 to n + start_position = 0 + + for i, op in enumerate(opinions_from_cluster.exclude(type="010combined")): + if op.local_path and not xml_path: + xml_path = op.local_path + content = None + if len(op.html_with_citations) > 1: + content = op.html_with_citations + elif len(op.html_columbia) > 1: + content = op.html_columbia + elif len(op.html_lawbox) > 1: + content = op.html_lawbox + elif len(op.plain_text) > 1: + content = op.plain_text + elif len(op.html) > 1: + content = op.html + elif len(op.xml_harvard) > 1: + content = op.xml_harvard + if content: + soup = BeautifulSoup(content, features="html.parser") + prep_text = re.sub( + r"[^a-zA-Z0-9 ]", "", soup.getText(separator=" ").lower() + ) + prep_text = re.sub(" +", " ", prep_text) + cl_cleaned_opinions.append( + { + "id": op.id, + "byline": op.author_str, + "type": op.type, + "opinion": prep_text, + "order": i, + } + ) + + return xml_path, cl_cleaned_opinions, start_position, combined_opinion + + +def get_opinions_columbia_xml(xml_filepath: 
str) -> list: + """Convert xml data into dict + :param xml_filepath: path of xml file + :return: dict with data + """ + + SIMPLE_TAGS = [ + "attorneys", + "caption", + "citation", + "court", + "date", + "docket", + "hearing_date", + "panel", + "posture", + "reporter_caption", + ] + + data = {} # type: dict + + with open(xml_filepath, "r", encoding="utf-8") as f: + file_content = f.read() + + data["unpublished"] = False + + if "" in file_content: + file_content = file_content.replace( + "", "" + ) + file_content = file_content.replace("", "").replace( + "", "" + ) + + data["unpublished"] = True + + # Sometimes opening and ending tag mismatch (e.g. c6b39dcb29c9c.xml) + file_content = file_content.replace( + "", "" + ) + + soup = BeautifulSoup(file_content, "lxml") + + # Find the outer tag to have all elements inside + find_opinion = soup.find("opinion") + + step_one_opinions = [] # type: list + opinions = [] # type: list + order = 0 + + if find_opinion: + untagged_content = [] + + # We iterate all content, with and without tags + # STEP 1: Extract all content in multiple dict elements + for i, content in enumerate(find_opinion): # type: int, Tag + if type(content) == NavigableString: + # We found a raw string, store it + untagged_content.append(str(content)) + + else: + if content.name in SIMPLE_TAGS + [ + "citation_line", + "opinion_byline", + "dissent_byline", + "concurrence_byline", + ]: + # Ignore these tags, it will be processed later + continue + elif content.name in [ + "opinion_text", + "dissent_text", + "concurrence_text", + ]: + if untagged_content: + # We found something other than a navigable string that is + # not an opinion, but now we have found an opinion, + # let's create this content first + + # default type + op_type = "opinion" + if step_one_opinions: + if step_one_opinions[-1].get("type"): + # use type of previous opinion if exists + op_type = step_one_opinions[-1].get("type") + + # Get rid of double spaces + opinion_content = re.sub( + " +", " ", "\n".join(untagged_content) + ).strip() # type: str + if opinion_content: + step_one_opinions.append( + { + "opinion": opinion_content, + "order": order, + "byline": "", + "type": op_type, + } + ) + order = order + 1 + untagged_content = [] + + byline = content.find_previous_sibling() + opinion_author = "" + if byline and "_byline" in byline.name: + opinion_author = byline.get_text() + + opinion_content = re.sub( + " +", " ", content.decode_contents() + ).strip() + if opinion_content: + step_one_opinions.append( + { + "opinion": opinion_content, + "order": order, + "byline": opinion_author, + "type": content.name.replace("_text", ""), + } + ) + order = order + 1 + + else: + # Content not inside _text tag, we store it + untagged_content.append(str(content)) + + if untagged_content: + # default type + op_type = "opinion" + if step_one_opinions: + if step_one_opinions[-1].get("type"): + # use type of previous opinion if exists + op_type = step_one_opinions[-1].get("type") + + opinion_content = re.sub( + " +", " ", "\n".join(untagged_content) + ).strip() + if opinion_content: + step_one_opinions.append( + { + "opinion": opinion_content, + "order": order, + "byline": "", + "type": op_type, + } + ) + + # Step 2: Merge found content in the xml file + new_order = 0 + authorless_content = [] + + for i, found_content in enumerate(step_one_opinions, start=1): + byline = found_content.get("byline") + if not byline: + # Opinion has no byline, store it + authorless_content.append(found_content) + + if byline: + # Opinion has byline + 
opinion_type = found_content.get("type") + opinion_content = found_content.get("opinion", "") + # Store content that doesn't match the current type + alternative_authorless_content = [ + z + for z in authorless_content + if z.get("type") != opinion_type + ] + # Keep content that matches the current type + authorless_content = [ + z + for z in authorless_content + if z.get("type") == opinion_type + ] + + if alternative_authorless_content: + # Keep floating text that are not from the same type, + # we need to create a separate opinion for those, + # for example: in 2713f39c5a8e8684.xml we have an opinion + # without an author, and the next opinion with an author is + # a dissent opinion, we can't combine both + + # We check if the previous stored opinion matches the type of the + # content + relevant_opinions = ( + [opinions[-1]] + if opinions + and opinions[-1]["type"] + == alternative_authorless_content[0].get("type") + else [] + ) + + if relevant_opinions: + previous_opinion = relevant_opinions[-1] + if previous_opinion.get( + "type" + ) == alternative_authorless_content[0].get("type"): + # Merge last opinion with previous opinion, it probably + # belongs the same author + relevant_opinions[-1][ + "opinion" + ] += "\n" + "\n".join( + [ + f.get("opinion") + for f in alternative_authorless_content + if f.get("opinion") + ] + ) + authorless_content = [] + + else: + # No relevant opinions found, create a new opinion + new_opinion = { + "byline": None, + "type": alternative_authorless_content[0].get( + "type" + ), + "opinion": "\n".join( + [ + f.get("opinion") + for f in alternative_authorless_content + if f.get("opinion") + ] + ), + "order": new_order, + } + new_order = new_order + 1 + opinions.append(new_opinion) + + # Add new opinion + new_opinion = { + "byline": byline, + "type": opinion_type, + "opinion": "\n".join( + [ + f.get("opinion") + for f in authorless_content + if f.get("type") == opinion_type + ] + ) + + "\n\n" + + opinion_content, + "order": new_order, + } + + opinions.append(new_opinion) + new_order = new_order + 1 + authorless_content = [] + + if len(step_one_opinions) == i and authorless_content: + # If is the last opinion, and we still have opinions without + # byline, create an opinion without an author and the contents + # that couldn't be merged + + # We check if the previous stored opinion matches the type of the + # content + relevant_opinions = ( + [opinions[-1]] + if opinions + and opinions[-1]["type"] + == authorless_content[0].get("type") + else [] + ) + + if relevant_opinions: + previous_opinion = relevant_opinions[-1] + if previous_opinion.get("type") == authorless_content[ + 0 + ].get("type"): + # Merge last opinion with previous opinion, it probably + # belongs the same author + relevant_opinions[-1]["opinion"] += "\n" + "\n".join( + [ + f.get("opinion") + for f in authorless_content + if f.get("opinion") + ] + ) + + else: + # Create last floating opinion + new_opinion = { + "byline": None, + "type": authorless_content[0].get("type"), + "opinion": "\n".join( + [ + f.get("opinion") + for f in authorless_content + if f.get("opinion") + ] + ), + "order": new_order, + } + opinions.append(new_opinion) + + for op in opinions: + opinion_content = op.get("opinion") + opinion_content = BeautifulSoup( + opinion_content, "html.parser" + ).getText() + opinion_content = re.sub(r"[^a-zA-Z0-9 ]", "", opinion_content.lower()) + op["opinion"] = opinion_content + + return opinions + + +def run_harvard(): + """ + We assume that harvard data is already ordered, we just need to 
fill the order + field in each opinion + """ + + # Get all harvard clusters with more than one opinion + clusters = ( + OpinionCluster.objects.prefetch_related("sub_opinions") + .annotate(opinions_count=Count("sub_opinions")) + .filter(opinions_count__gt=1, source="U") + ) + # print(clusters.query) + print("clusters", len(clusters)) + + # cluster_id: 4697264, the combined opinion will go to the last position + for oc in clusters: + combined_opinions_cluster = oc.sub_opinions.filter( + type="010combined" + ).order_by("id") + if combined_opinions_cluster: + # the combined opinion will be displayed at first + start_position = combined_opinions_cluster.count() + else: + # we don't have combined opinions, we start ordering from 0 to n + start_position = 0 + + print("combined_opinions_cluster", combined_opinions_cluster) + for opinion_order, cluster_op in enumerate( + oc.sub_opinions.exclude(type="010combined").order_by("id"), + start=start_position, + ): + cluster_op.order = opinion_order + cluster_op.save() + + # Show combined opinions at beginning + for opinion_order, cluster_op in enumerate(combined_opinions_cluster): + cluster_op.order = opinion_order + cluster_op.save() + + logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") + + +def run_columbia(): + """ + Update opinion order for columbia clusters + """ + + # Get all columbia cluster ids with more than one opinion + clusters = ( + OpinionCluster.objects.annotate(opinions_count=Count("sub_opinions")) + .filter(opinions_count__gt=1, source="Z") + .order_by("id") + .values_list("id") + ) + + for cluster_id in clusters: + logger.info(f"Processing cluster id: {cluster_id}") + ( + xml_path, + cl_cleaned_opinions, + start_position, + combined_opinion, + ) = get_opinion_content(cluster_id) + + columbia_opinions = None + if xml_path: + columbia_opinions = get_opinions_columbia_xml(xml_path) + + if cl_cleaned_opinions and columbia_opinions: + matches = match_text_lists( + [op.get("opinion") for op in columbia_opinions], + [op.get("opinion") for op in cl_cleaned_opinions], + ) + + if matches: + if len(matches.values()) != len(set(matches.values())): + # We don't have a unique match for each opinion, they were + # probably combined incorrectly + logger.info( + f"We can't infer opinions order for cluster id: {cluster_id}" + ) + # Go to next cluster id + continue + + if len(cl_cleaned_opinions) > len(set(matches.values())): + # We have more opinions than matches + logger.info( + f"We couldn't match all cl opinions to the file's " + f"content, cluster id: {cluster_id}" + ) + # Go to next cluster id + continue + + failed = False + for file_pos, cl_pos in matches.items(): + # file_pos is the correct index to find the opinion id to update + file_opinion = columbia_opinions[file_pos] + # the order was calculated using the xml file + file_order = file_opinion.get("order") + start_position + cl_opinion = cl_cleaned_opinions[cl_pos] + opinion_id_to_update = cl_opinion.get("id") + + if opinion_id_to_update: + try: + # Save opinion + op = Opinion.objects.get(id=opinion_id_to_update) + op.order = file_order + op.save() + logger.info( + f"Cluster id processed: {cluster_id} Update opinion id: {opinion_id_to_update} with position: {file_order}" + ) + except Opinion.DoesNotExist: + logger.warning( + f"We can't update opinion, opinion doesn't exist with " + f"id: {opinion_id_to_update}" + ) + failed = True + break + else: + logger.warning( + f"We can't update opinion, empty opinion id " + f"from cluster: {cluster_id}" + ) + failed = True + break + + if 
combined_opinion and not failed: + combined_opinions_cluster = Opinion.objects.filter( + cluster_id=cluster_id, type="010combined" + ).order_by("id") + + # Show combined opinions at beginning + for opinion_order, cluster_op in enumerate( + combined_opinions_cluster + ): + cluster_op.order = opinion_order + cluster_op.save() + + else: + # No matches found + logger.warning( + f"Failed to match opinions from cluster id: {cluster_id}" + ) + continue + + +class Command(BaseCommand): + help = "Fill order field in Opinion objects" + + def __init__(self, *args, **kwargs): + super(Command, self).__init__(*args, **kwargs) + + def add_arguments(self, parser): + parser.add_argument( + "--process-harvard", + action="store_true", + help="Fix harvard opinions order", + ) + + parser.add_argument( + "--process-columbia", + action="store_true", + help="Fix columbia opinions order", + ) + + def handle(self, *args, **options): + print("harvard", options["process_harvard"]) + print("columbia", options["process_columbia"]) + + if options["process_harvard"] and options["process_columbia"]: + print( + "You can only select one option process-harvard or process-columbia" + ) + return + + if options["process_harvard"]: + run_harvard() + + if options["process_columbia"]: + run_columbia() From c3a5c4a2a0ad002b075ea69b3a0757bbef684a1f Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 30 Aug 2023 19:13:02 -0600 Subject: [PATCH 11/50] feat(opinion_order): exception when xml file not found --- .../commands/update_opinions_order.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index f48de154a0..0560c506ba 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -4,6 +4,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag from django.core.management import BaseCommand from django.db.models import Count +from django.db.models.fields.files import FieldFile from cl.corpus_importer.utils import similarity_scores from cl.lib.command_utils import logger @@ -52,7 +53,7 @@ def match_text_lists( def get_opinion_content( cluster_id, -) -> tuple[Optional[str], list[dict], int, bool]: +) -> tuple[Optional[FieldFile], list[dict], int, bool]: """Get the opinions content for a cluster object :param cluster_id: Cluster ID for a set of opinions :return: (xml path, list of extracted opinions, start position, True if combined @@ -78,6 +79,8 @@ def get_opinion_content( for i, op in enumerate(opinions_from_cluster.exclude(type="010combined")): if op.local_path and not xml_path: + # We store the field because we are using S3 for storage and that backend + # doesn't support absolute paths xml_path = op.local_path content = None if len(op.html_with_citations) > 1: @@ -111,7 +114,7 @@ def get_opinion_content( return xml_path, cl_cleaned_opinions, start_position, combined_opinion -def get_opinions_columbia_xml(xml_filepath: str) -> list: +def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: """Convert xml data into dict :param xml_filepath: path of xml file :return: dict with data @@ -132,8 +135,8 @@ def get_opinions_columbia_xml(xml_filepath: str) -> list: data = {} # type: dict - with open(xml_filepath, "r", encoding="utf-8") as f: - file_content = f.read() + with xml_filepath.open("r") as f: + file_content = f.read().decode("utf-8") data["unpublished"] = False @@ 
-432,6 +435,7 @@ def run_harvard(): # cluster_id: 4697264, the combined opinion will go to the last position for oc in clusters: + logger.info(f"Processing cluster id: {oc}") combined_opinions_cluster = oc.sub_opinions.filter( type="010combined" ).order_by("id") @@ -468,7 +472,7 @@ def run_columbia(): OpinionCluster.objects.annotate(opinions_count=Count("sub_opinions")) .filter(opinions_count__gt=1, source="Z") .order_by("id") - .values_list("id") + .values_list("id", flat=True) ) for cluster_id in clusters: @@ -482,7 +486,11 @@ def run_columbia(): columbia_opinions = None if xml_path: - columbia_opinions = get_opinions_columbia_xml(xml_path) + try: + columbia_opinions = get_opinions_columbia_xml(xml_path) + except FileNotFoundError: + logger.warning(f"Xml file not found, cluster id: {cluster_id}") + continue if cl_cleaned_opinions and columbia_opinions: matches = match_text_lists( From 6ba8d3d3b1048ba4dfaf79ef60b72bf5fff8e55f Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 30 Aug 2023 19:37:30 -0600 Subject: [PATCH 12/50] feat(opinion_order): add param to resume command to order opinions --- .../commands/update_opinions_order.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 0560c506ba..d4d915695d 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -82,6 +82,7 @@ def get_opinion_content( # We store the field because we are using S3 for storage and that backend # doesn't support absolute paths xml_path = op.local_path + # print("url", op.local_path.url) content = None if len(op.html_with_citations) > 1: content = op.html_with_citations @@ -136,7 +137,7 @@ def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: data = {} # type: dict with xml_filepath.open("r") as f: - file_content = f.read().decode("utf-8") + file_content = f.read() data["unpublished"] = False @@ -418,10 +419,11 @@ def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: return opinions -def run_harvard(): +def run_harvard(start_id: int): """ We assume that harvard data is already ordered, we just need to fill the order field in each opinion + :param start_id: skip any id lower than this value """ # Get all harvard clusters with more than one opinion @@ -429,9 +431,11 @@ def run_harvard(): OpinionCluster.objects.prefetch_related("sub_opinions") .annotate(opinions_count=Count("sub_opinions")) .filter(opinions_count__gt=1, source="U") + .order_by("id") ) - # print(clusters.query) - print("clusters", len(clusters)) + + if start_id: + clusters = clusters.filter(pk__gte=start_id) # cluster_id: 4697264, the combined opinion will go to the last position for oc in clusters: @@ -446,7 +450,6 @@ def run_harvard(): # we don't have combined opinions, we start ordering from 0 to n start_position = 0 - print("combined_opinions_cluster", combined_opinions_cluster) for opinion_order, cluster_op in enumerate( oc.sub_opinions.exclude(type="010combined").order_by("id"), start=start_position, @@ -462,9 +465,10 @@ def run_harvard(): logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") -def run_columbia(): +def run_columbia(start_id: int): """ Update opinion order for columbia clusters + :param start_id: skip any id lower than this value """ # Get all columbia cluster ids with more than one opinion @@ -475,6 +479,9 @@ def run_columbia(): 
.values_list("id", flat=True) ) + if start_id: + clusters = filter(lambda x: x >= start_id, clusters) + for cluster_id in clusters: logger.info(f"Processing cluster id: {cluster_id}") ( @@ -589,10 +596,14 @@ def add_arguments(self, parser): help="Fix columbia opinions order", ) - def handle(self, *args, **options): - print("harvard", options["process_harvard"]) - print("columbia", options["process_columbia"]) + parser.add_argument( + "--start-id", + type=int, + default=0, + help="Skip any id lower than this value", + ) + def handle(self, *args, **options): if options["process_harvard"] and options["process_columbia"]: print( "You can only select one option process-harvard or process-columbia" @@ -600,7 +611,7 @@ def handle(self, *args, **options): return if options["process_harvard"]: - run_harvard() + run_harvard(options["start_id"]) if options["process_columbia"]: - run_columbia() + run_columbia(options["start_id"]) From 71ec6241cc0c06d4aaebfb71a0cec188eb39a11a Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 30 Aug 2023 19:47:56 -0600 Subject: [PATCH 13/50] feat(opinion_order): add new param for command --- .../commands/update_opinions_order.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index d4d915695d..7a46530a82 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -11,8 +11,6 @@ from cl.lib.string_diff import get_cosine_similarity from cl.search.models import Opinion, OpinionCluster -# TODO Should we add a flag to know that the cluster has been processed? - def match_text_lists( file_opinions_list: list[str], cl_opinions_list: list[str] @@ -419,11 +417,12 @@ def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: return opinions -def run_harvard(start_id: int): +def run_harvard(start_id: int, end_id: int): """ We assume that harvard data is already ordered, we just need to fill the order field in each opinion :param start_id: skip any id lower than this value + :param end_id: skip any id greater than this value """ # Get all harvard clusters with more than one opinion @@ -437,6 +436,9 @@ def run_harvard(start_id: int): if start_id: clusters = clusters.filter(pk__gte=start_id) + if end_id: + clusters = clusters.filter(pk__lte=end_id) + # cluster_id: 4697264, the combined opinion will go to the last position for oc in clusters: logger.info(f"Processing cluster id: {oc}") @@ -465,10 +467,11 @@ def run_harvard(start_id: int): logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") -def run_columbia(start_id: int): +def run_columbia(start_id: int, end_id: int): """ Update opinion order for columbia clusters :param start_id: skip any id lower than this value + :param end_id: skip any id greater than this value """ # Get all columbia cluster ids with more than one opinion @@ -482,6 +485,9 @@ def run_columbia(start_id: int): if start_id: clusters = filter(lambda x: x >= start_id, clusters) + if end_id: + clusters = filter(lambda x: x <= end_id, clusters) + for cluster_id in clusters: logger.info(f"Processing cluster id: {cluster_id}") ( @@ -600,7 +606,14 @@ def add_arguments(self, parser): "--start-id", type=int, default=0, - help="Skip any id lower than this value", + help="Start id for a range of clusters (inclusive)", + ) + + parser.add_argument( + "--end-id", + type=int, + default=0, + help="End id 
for a range of clusters (inclusive)", ) def handle(self, *args, **options): @@ -611,7 +624,7 @@ def handle(self, *args, **options): return if options["process_harvard"]: - run_harvard(options["start_id"]) + run_harvard(options["start_id"], options["end_id"]) if options["process_columbia"]: - run_columbia(options["start_id"]) + run_columbia(options["start_id"], options["end_id"]) From f4615b07d931f93b7a2409438d17f85d6582f4a9 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 31 Aug 2023 13:35:04 -0600 Subject: [PATCH 14/50] feat(opinion_order): update typing --- .../commands/update_opinions_order.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 7a46530a82..480f2ef6d6 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -1,5 +1,5 @@ import re -from typing import Any, Optional +from typing import Any, List, Optional from bs4 import BeautifulSoup, NavigableString, Tag from django.core.management import BaseCommand @@ -13,12 +13,12 @@ def match_text_lists( - file_opinions_list: list[str], cl_opinions_list: list[str] -) -> dict[int, Any]: + file_opinions_list: List[Any], cl_opinions_list: List[Any] +) -> dict[int, int]: """Generate matching lists above threshold :param file_opinions_list: Opinions from file :param cl_opinions_list: CL opinions - :return: Matches if found or False + :return: Matches if found or empty dict """ # We import this here to avoid a circular import from cl.corpus_importer.management.commands.harvard_opinions import ( @@ -507,8 +507,16 @@ def run_columbia(start_id: int, end_id: int): if cl_cleaned_opinions and columbia_opinions: matches = match_text_lists( - [op.get("opinion") for op in columbia_opinions], - [op.get("opinion") for op in cl_cleaned_opinions], + [ + op.get("opinion") + for op in columbia_opinions + if op.get("opinion") + ], + [ + op.get("opinion") + for op in cl_cleaned_opinions + if op.get("opinion") + ], ) if matches: From 3ceff218c23c77201b3b78fd7bda838db09a2706 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 31 Aug 2023 14:30:00 -0600 Subject: [PATCH 15/50] feat(opinion_order): temporary read xml files from s3 it requires to change the AWS_STORAGE_BUCKET_NAME env variable to read files from private storage --- .../management/commands/update_opinions_order.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 480f2ef6d6..0b96a5dae1 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -134,6 +134,16 @@ def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: data = {} # type: dict + if "/home/mlissner" in str(xml_filepath): + # Temporary replace the path with the correct from S3, this way we read them + # directly from S3, we need the files in /sources/columbia/opinions/ in + # com-courtlistener-storage bucket + # TODO discuss this + xml_filepath.name = xml_filepath.name.replace( + "/home/mlissner", "/sources" + ) + + # print(f"Opening {xml_filepath.url}") with xml_filepath.open("r") as f: file_content = f.read() @@ -502,7 +512,9 @@ def run_columbia(start_id: int, end_id: int): try: columbia_opinions = 
get_opinions_columbia_xml(xml_path) except FileNotFoundError: - logger.warning(f"Xml file not found, cluster id: {cluster_id}") + logger.warning( + f"Xml file not found in {xml_path}, cluster id: {cluster_id}" + ) continue if cl_cleaned_opinions and columbia_opinions: From 0bd9b9ac3bf2a511633d93de0bdebc49da06ca5d Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 5 Sep 2023 15:52:59 -0600 Subject: [PATCH 16/50] feat(update_opinions_order): argument added to point to the mounted directory with xml files --- .../commands/update_opinions_order.py | 47 +++++++++++++------ 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 0b96a5dae1..f6c72811d8 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -1,3 +1,4 @@ +import os.path import re from typing import Any, List, Optional @@ -80,7 +81,6 @@ def get_opinion_content( # We store the field because we are using S3 for storage and that backend # doesn't support absolute paths xml_path = op.local_path - # print("url", op.local_path.url) content = None if len(op.html_with_citations) > 1: content = op.html_with_citations @@ -113,9 +113,10 @@ def get_opinion_content( return xml_path, cl_cleaned_opinions, start_position, combined_opinion -def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: +def get_opinions_columbia_xml(xml_filepath: FieldFile, xml_dir: str) -> list: """Convert xml data into dict :param xml_filepath: path of xml file + :param xml_dir: absolute path to the directory with columbia xml files :return: dict with data """ @@ -134,17 +135,17 @@ def get_opinions_columbia_xml(xml_filepath: FieldFile) -> list: data = {} # type: dict - if "/home/mlissner" in str(xml_filepath): - # Temporary replace the path with the correct from S3, this way we read them - # directly from S3, we need the files in /sources/columbia/opinions/ in - # com-courtlistener-storage bucket - # TODO discuss this - xml_filepath.name = xml_filepath.name.replace( - "/home/mlissner", "/sources" + if "/home/mlissner/columbia/opinions/" in str(xml_filepath): + filepath = str( + xml_filepath.name.replace("/home/mlissner/columbia/opinions/", "") ) + # fix file path temporarily + new_xml_filepath = os.path.join(xml_dir, filepath) + else: + logger.info(f"Can't fix xml file path: {xml_filepath}") + raise FileNotFoundError - # print(f"Opening {xml_filepath.url}") - with xml_filepath.open("r") as f: + with open(new_xml_filepath, "r", encoding="utf-8") as f: file_content = f.read() data["unpublished"] = False @@ -477,11 +478,12 @@ def run_harvard(start_id: int, end_id: int): logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") -def run_columbia(start_id: int, end_id: int): +def run_columbia(start_id: int, end_id: int, xml_dir: str): """ Update opinion order for columbia clusters :param start_id: skip any id lower than this value :param end_id: skip any id greater than this value + :param xml_dir: absolute path to the directory with columbia xml files """ # Get all columbia cluster ids with more than one opinion @@ -510,7 +512,9 @@ def run_columbia(start_id: int, end_id: int): columbia_opinions = None if xml_path: try: - columbia_opinions = get_opinions_columbia_xml(xml_path) + columbia_opinions = get_opinions_columbia_xml( + xml_path, xml_dir + ) except FileNotFoundError: logger.warning( f"Xml file not found in {xml_path}, 
cluster id: {cluster_id}" @@ -622,6 +626,12 @@ def add_arguments(self, parser): help="Fix columbia opinions order", ) + parser.add_argument( + "--xml-dir", + required=False, + help="The absolute path to the directory with columbia xml files", + ) + parser.add_argument( "--start-id", type=int, @@ -646,5 +656,12 @@ def handle(self, *args, **options): if options["process_harvard"]: run_harvard(options["start_id"], options["end_id"]) - if options["process_columbia"]: - run_columbia(options["start_id"], options["end_id"]) + if options["process_columbia"] and options["xml_dir"]: + run_columbia( + options["start_id"], options["end_id"], options["xml_dir"] + ) + + if options["process_columbia"] and not options["xml_dir"]: + print( + "Argument --xml-dir required to read xml files from mounted directory" + ) From 7b16b42d99c1f8b9076d47ddb0ba916df21b564a Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 5 Sep 2023 16:09:04 -0600 Subject: [PATCH 17/50] feat(update_opinions_order): fix mypy error --- .../management/commands/update_opinions_order.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index f6c72811d8..05a1bdb7f5 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -136,8 +136,8 @@ def get_opinions_columbia_xml(xml_filepath: FieldFile, xml_dir: str) -> list: data = {} # type: dict if "/home/mlissner/columbia/opinions/" in str(xml_filepath): - filepath = str( - xml_filepath.name.replace("/home/mlissner/columbia/opinions/", "") + filepath = str(xml_filepath).replace( + "/home/mlissner/columbia/opinions/", "" ) # fix file path temporarily new_xml_filepath = os.path.join(xml_dir, filepath) From d49708adfacecfa075dd3a298a8cdc867532c008 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 21 Sep 2023 13:27:05 -0600 Subject: [PATCH 18/50] fix(opinion_order): Update poetry.lock --- poetry.lock | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 90d12b08bf..a7f14d94cb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "amqp" @@ -1097,6 +1097,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." 
+optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -2577,6 +2588,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2721,7 +2742,7 @@ name = "ndg-httpsclient" version = "0.5.1" description = "Provides enhanced HTTPS support for httplib and urllib2 using PyOpenSSL" optional = false -python-versions = ">=2.7,<3.0.dev0 || >=3.4.dev0" +python-versions = ">=2.7,<3.0.0 || >=3.4.0" files = [ {file = "ndg_httpsclient-0.5.1-py2-none-any.whl", hash = "sha256:d2c7225f6a1c6cf698af4ebc962da70178a99bcde24ee6d1961c4f3338130d57"}, {file = "ndg_httpsclient-0.5.1-py3-none-any.whl", hash = "sha256:dd174c11d971b6244a891f7be2b32ca9853d3797a72edb34fa5d7b07d8fff7d4"}, @@ -5096,4 +5117,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = 
"2.0" python-versions = ">=3.11, <3.12" -content-hash = "96bb211d8a53b99b00d7d118fd7f90f35dcf27b9a940532d8ea814eecc5cbd6b" +content-hash = "6ce30a4f34302d7e0ca29bf1f9794ad2fc1759cef8312bcfebb5550a33cb0019" From 9ae8dc891f764a471729dc8131e0e071bd9f9e7c Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 29 Sep 2023 17:15:04 -0600 Subject: [PATCH 19/50] fix(opinion_order): Update poetry.lock --- poetry.lock | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index c9b8295b88..a7fe6b3511 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1097,6 +1097,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." +optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -2577,6 +2588,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -5096,4 +5117,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11, <3.12" -content-hash = "4b906615444a53e1a26780aa6a3742c0e7844c307c6a991b059ee4de0cb177a8" +content-hash = "6da7f3d3b926ac02caf9720eda2b6c81ae71fe04aafb6a0a35f83e52b4c412cc" From 7702a082063ed2b80b6f803a1a6afa7af6347887 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 29 Sep 2023 18:03:45 -0600 Subject: [PATCH 20/50] fix(opinion_order): Rename migrations --- .../{0020_order_opinions.py => 0022_order_opinions.py} | 2 +- .../{0020_order_opinions.sql => 0022_order_opinions.sql} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cl/search/migrations/{0020_order_opinions.py => 0022_order_opinions.py} (99%) rename cl/search/migrations/{0020_order_opinions.sql => 0022_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0020_order_opinions.py b/cl/search/migrations/0022_order_opinions.py similarity index 99% rename from cl/search/migrations/0020_order_opinions.py rename to cl/search/migrations/0022_order_opinions.py index f614156360..763c98e8fc 100644 --- a/cl/search/migrations/0020_order_opinions.py +++ b/cl/search/migrations/0022_order_opinions.py @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("search", "0019_add_docket_source_noop"), + ("search", "0021_add_pghistory_courthouse"), ] operations = [ diff --git a/cl/search/migrations/0020_order_opinions.sql b/cl/search/migrations/0022_order_opinions.sql similarity index 100% rename from cl/search/migrations/0020_order_opinions.sql rename to cl/search/migrations/0022_order_opinions.sql From 3f173fef6ac191d2c1a0b43f38de3917a9f9b9bf Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 29 Nov 2023 13:08:52 -0600 Subject: [PATCH 21/50] fix(opinions_order): rename migrations update poetry.lock --- ...der_opinions.py => 0024_order_opinions.py} | 2 +- ...r_opinions.sql => 0024_order_opinions.sql} | 0 poetry.lock | 30 ++++++++----------- 3 files changed, 14 insertions(+), 18 deletions(-) rename cl/search/migrations/{0022_order_opinions.py => 0024_order_opinions.py} (99%) rename cl/search/migrations/{0022_order_opinions.sql => 0024_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0022_order_opinions.py b/cl/search/migrations/0024_order_opinions.py similarity index 99% rename from cl/search/migrations/0022_order_opinions.py rename to cl/search/migrations/0024_order_opinions.py index 763c98e8fc..1abaed4d76 100644 --- a/cl/search/migrations/0022_order_opinions.py +++ b/cl/search/migrations/0024_order_opinions.py @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("search", "0021_add_pghistory_courthouse"), + ("search", "0023_add_docket_sources_noop"), ] operations = [ diff --git a/cl/search/migrations/0022_order_opinions.sql b/cl/search/migrations/0024_order_opinions.sql similarity index 100% rename from cl/search/migrations/0022_order_opinions.sql rename to cl/search/migrations/0024_order_opinions.sql diff --git a/poetry.lock b/poetry.lock index cdb46a7ef7..30080de3f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,6 +1101,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be 
ordered and provides a simple admin interface for reordering them." +optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -1493,18 +1504,6 @@ files = [ {file = "fast_diff_match_patch-2.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c4cb3aa60664bcafd070915cc0f148c63da3a20babeca29bdf24e6aee80ff481"}, {file = "fast_diff_match_patch-2.0.1-cp310-cp310-win32.whl", hash = "sha256:3423c373c168fcbc56fa488960248ce086dd686402817aa5d4d967537fff1203"}, {file = "fast_diff_match_patch-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:f8b5595277f99b4908ae9bab33548bfe7497a99a1f5dc5c277a4f36051dcf993"}, - {file = "fast_diff_match_patch-2.0.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a682a72b93e07902b9af3bc591fe365da4024888cceb308f04cdec59eeb3602d"}, - {file = "fast_diff_match_patch-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d30e7fb0de87e02db88cda54f6c57a9f7d789e4d0922cfed41f61a1d4415408b"}, - {file = "fast_diff_match_patch-2.0.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:58b273cecb941bef392bda622a534de03e6ea8d3186d4d07745375cce9db0833"}, - {file = "fast_diff_match_patch-2.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0e39bb9ca0b7632a15e85cb6b0c4c575010e6fb6e43e5714ee53c7cef1aa4135"}, - {file = "fast_diff_match_patch-2.0.1-cp311-cp311-win32.whl", hash = "sha256:b4d4e6aa5c6a4af0b6c66be593021579f4693c94b848084b89e6783180361db6"}, - {file = "fast_diff_match_patch-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:c1154830dbcb83d1c9ed24f43b1e8226cafc7ce46b6e0971e866bdf513ecc216"}, - {file = "fast_diff_match_patch-2.0.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6723cfba7bd9fb712e179acbc9c6cb526076612c0325ad4f1066f3bd176064a"}, - {file = "fast_diff_match_patch-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:378251cc37cd21d14802669a3453f026ed3aa07c07a8aa2daabeefd14a0e0a36"}, - {file = "fast_diff_match_patch-2.0.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a2e1ce344438b14400a91b65c79c39345b0ce70a0a8797e88b14485577b5fc0"}, - {file = "fast_diff_match_patch-2.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cc7285d9a1fbf8990361ce37728202fd6ebee6ddc6cfe6fb15a19905e562f304"}, - {file = "fast_diff_match_patch-2.0.1-cp312-cp312-win32.whl", hash = "sha256:3aaeb207fe586979ecb194ecc2c81ba979d351cd0bdaba8489ce4be0f55206dc"}, - {file = "fast_diff_match_patch-2.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:4d759ec2d79c638407f32c29dc348fcef6e6a1659927056527b0939a1ab31ca5"}, {file = "fast_diff_match_patch-2.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e5205e4f3b820f65138947e0d42959b6910fd959c8e5e8f4fc72472f6fec9d8b"}, {file = "fast_diff_match_patch-2.0.1-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fa1212d0200169e93392805957ca6ae351bfc51282c5119fb231f968c7e12fbc"}, {file = "fast_diff_match_patch-2.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d30a9db041dfee960a9c8a35fa99685b1f29530f52f69fef1e3cc02867f0b9"}, @@ -1545,9 +1544,6 @@ files = [ {file = "fast_diff_match_patch-2.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:58ada748637821445df3cfcb21df412136fb69b8e677ea364aa9ca7a8facb048"}, {file = "fast_diff_match_patch-2.0.1-cp39-cp39-win32.whl", hash = "sha256:b07808e98f0bfcd557281126135b24729a30ee10ccc2db4d3358fb2f18ac1879"}, {file = "fast_diff_match_patch-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:6f2202d1e9d225918ea3803f66ca9c99d080c8ba5094c438680eb2c8dfd2e48c"}, - {file = "fast_diff_match_patch-2.0.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ecff01b3d10d6bed965a1591e37597df118ab0bcc98a3f59a724a0d9bd63fb1"}, - {file = "fast_diff_match_patch-2.0.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a92ba0d543524234a17ea2da4892a9752273cfdfed528e581f0f76cbd78cf991"}, - {file = "fast_diff_match_patch-2.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd5b3b99bb7c14ce8ea5ab184afb2cc6796dac71439b2cfc6fb6227a6846aef3"}, {file = "fast_diff_match_patch-2.0.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:daa821a8dcbc1026f7f8cc177ca599bcfbaaddccdf90bc1ad1e44255b1c239e1"}, {file = "fast_diff_match_patch-2.0.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27de6dc97e7d6dc207585d778ace58e7cc364b8383e5412164224d52ad4099b5"}, {file = "fast_diff_match_patch-2.0.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec27f797b1ecee79c3d76c9a081a6c20fd89068b41ba3b84a6ebe48317c5c46c"}, @@ -2750,7 +2746,7 @@ name = "ndg-httpsclient" version = "0.5.1" description = "Provides enhanced HTTPS support for httplib and urllib2 using PyOpenSSL" optional = false -python-versions = ">=2.7,<3.0.dev0 || >=3.4.dev0" +python-versions = ">=2.7,<3.0.0 || >=3.4.0" files = [ {file = "ndg_httpsclient-0.5.1-py2-none-any.whl", hash = "sha256:d2c7225f6a1c6cf698af4ebc962da70178a99bcde24ee6d1961c4f3338130d57"}, {file = "ndg_httpsclient-0.5.1-py3-none-any.whl", hash = "sha256:dd174c11d971b6244a891f7be2b32ca9853d3797a72edb34fa5d7b07d8fff7d4"}, @@ -5212,4 +5208,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11, <3.12" -content-hash = "f3edde54a6877b5506669d8d8354b28d8b7c6dffbb08c4b0954079680cec63dc" +content-hash = "ce20135f86ae0bc9264359886c298076a90c74d5a30256f7db4541812ffb4f76" From 9dedd433ca589f7db5f4d71edd7318fbd34e3aa8 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 29 Nov 2023 19:14:00 -0600 Subject: [PATCH 22/50] fix(opinions_order): code refactored NOTE: functions found in columbia_utils.py and utils.py, were temporarily added in the command,when the necessary changes are combined we need to remove the functions and import them from the utils. 
--- .../commands/update_opinions_order.py | 833 ++++++++++-------- 1 file changed, 461 insertions(+), 372 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 05a1bdb7f5..ae931ba4b7 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -5,18 +5,356 @@ from bs4 import BeautifulSoup, NavigableString, Tag from django.core.management import BaseCommand from django.db.models import Count -from django.db.models.fields.files import FieldFile from cl.corpus_importer.utils import similarity_scores from cl.lib.command_utils import logger from cl.lib.string_diff import get_cosine_similarity -from cl.search.models import Opinion, OpinionCluster +from cl.search.models import SOURCES, Opinion, OpinionCluster + +VALID_COLUMBIA_SOURCES = [ + key + for key in dict(SOURCES.NAMES).keys() + if SOURCES.COLUMBIA_ARCHIVE in key +] + +VALID_HARVARD_SOURCES = [ + key for key in dict(SOURCES.NAMES).keys() if SOURCES.HARVARD_CASELAW in key +] + + +# TODO remove the funcitions below and import them from utils.py and columbia_utils.py when those changes get merged + + +SIMPLE_TAGS = [ + "attorneys", + "caption", + "citation", + "court", + "date", + "docket", + "hearing_date", + "panel", + "posture", + "reporter_caption", +] + + +class EmptyOpinionException(Exception): + """An exception for opinions that raise a ZeroDivisionError Exception due empty + opinion tag or empty opinion content in cl""" + + def __init__(self, message: str) -> None: + self.message = message + + +def read_xml_to_soup(filepath: str) -> BeautifulSoup: + """This function reads the xml file, fixes the bad tags in columbia xml + files and returns a BeautifulSoup object + + :param filepath: path to xml file + :return: BeautifulSoup object of parsed content + """ + with open(filepath, "r", encoding="utf-8") as f: + file_content = f.read() + # Sometimes opening and ending tag mismatch (e.g. 
ed7c6b39dcb29c9c.xml) + file_content = file_content.replace( + "", "" + ) + # Fix opinion with invalid attribute + if "" in file_content: + file_content = file_content.replace( + "", "" + ) + file_content = file_content.replace("", "").replace( + "", "" + ) + return BeautifulSoup(file_content, "lxml") + + +def add_floating_opinion( + opinions: list, floating_content: list, opinion_order: int +) -> list: + """We have found floating opinions in bs object, we keep the opinion + content as a new opinion + + :param opinions: a list with opinions found + :param floating_content: content that is not in known non-opinion tags + :param opinion_order: opinion position + :return: updated list of opinions + """ + op_type = "opinion" + if opinions: + if opinions[-1].get("type"): + # Use type of previous opinion if exists + op_type = opinions[-1].get("type") + + # Get rid of double spaces from floating content + opinion_content = re.sub( + " +", " ", "\n".join(floating_content) + ).strip() # type: str + if opinion_content: + opinions.append( + { + "opinion": opinion_content, + "order": opinion_order, + "byline": "", + "type": op_type, + } + ) + return opinions + + +def extract_columbia_opinions( + outer_opinion: BeautifulSoup, +) -> list[Optional[dict]]: + """We extract all possible opinions from BeautifulSoup, with and without + author, and we create new opinions if floating content exists(content that + is not explicitly defined within an opinion tag or doesn't have an author) + + :param outer_opinion: element containing all xml tags + :return: list of opinion dicts + """ + opinions: list = [] + floating_content = [] + order = 0 + + # We iterate all content to look for all possible opinions + for i, content in enumerate(outer_opinion): # type: int, Tag + if isinstance(content, NavigableString): + # We found a raw string, store it + floating_content.append(str(content)) + else: + if content.name in SIMPLE_TAGS + [ + "citation_line", + "opinion_byline", + "dissent_byline", + "concurrence_byline", + ]: + # Ignore these tags, it will be processed later + continue + elif content.name in [ + "opinion_text", + "dissent_text", + "concurrence_text", + ]: + if floating_content: + # We have found an opinion, but there is floating + # content, we create a dict with the opinion using the + # floating content with default type = "opinion" + opinions = add_floating_opinion( + opinions, floating_content, order + ) + floating_content = [] + + byline = content.find_previous_sibling() + opinion_author = "" + if byline and "_byline" in byline.name: + opinion_author = byline.get_text() + + opinion_content = re.sub( + " +", " ", content.decode_contents() + ).strip() + if opinion_content: + # Now we create a dict with current opinion + opinions.append( + { + "opinion": opinion_content, + "order": order, + "byline": opinion_author, + "type": content.name.replace("_text", ""), + } + ) + order = order + 1 + + else: + if content.name not in SIMPLE_TAGS + ["syllabus"]: + # We store content that is not inside _text tag and is + # not in one of the known non-opinion tags + floating_content.append(str(content)) + + # Combine the new content into another opinion. great. 
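+    # Example of an entry accumulated in `opinions` at this point (values
+    # are illustrative only):
+    #   {"opinion": "the cleaned text", "order": 2, "byline": "", "type": "dissent"}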
+ if floating_content: + # If we end to go through all the found opinions and if we still + # have floating content out there, we create a new opinion with the + # last type of opinion + opinions = add_floating_opinion(opinions, floating_content, order) + return opinions + + +def is_per_curiam_opinion( + content: Optional[str], byline: Optional[str] +) -> bool: + """Check if opinion author is per curiam + :param content: opinion content + :param byline: opinion text author + :return: True if opinion author is per curiam + """ + if byline and "per curiam" in byline[:1000].lower(): + return True + if content and "per curiam" in content[:1000].lower(): + return True + return False + + +def merge_opinions( + opinions: list, content: list, current_order: int +) -> tuple[list, int]: + """Merge last and previous opinion if are the same type or create a new + opinion if merge is not possible + + :param opinions: list of opinions that is being updated constantly + :param content: list of opinions without an author + :param current_order: opinion position + :return: updated list of opinions + """ + + # We check if the previous stored opinion matches the type of the + # content, and we store the opinion dict temporary + relevant_opinions = ( + [opinions[-1]] + if opinions and opinions[-1]["type"] == content[0].get("type") + else [] + ) + + if relevant_opinions: + relevant_opinions[-1]["opinion"] += "\n" + "\n".join( + [f.get("opinion") for f in content if f.get("opinion")] + ) + + else: + # No relevant opinions found, create a new opinion with the content + opinion_content = "\n".join( + [f.get("opinion") for f in content if f.get("opinion")] + ) + new_opinion = { + "byline": None, + "type": content[0].get("type"), + "opinion": opinion_content, + "order": current_order, + "per_curiam": is_per_curiam_opinion(opinion_content, None), + } + opinions.append(new_opinion) + current_order = current_order + 1 + + return opinions, current_order + + +def process_extracted_opinions(extracted_opinions: list) -> list: + """We read the extracted data in extract_opinions function to merge all + possible floating opinions (it is not explicitly defined within an opinion + tag or doesn't have an author) + + :param extracted_opinions: list of opinions obtained from xml file + :return: a list with extracted and processed opinions + """ + + opinions: list = [] + authorless_content = [] + order = 0 + + for i, found_content in enumerate(extracted_opinions, start=1): + byline = found_content.get("byline") + if not byline: + # Opinion has no byline, store opinion content + authorless_content.append(found_content) + + if byline: + # Opinion has byline, get opinion type and content + opinion_type = found_content.get("type") + opinion_content = found_content.get("opinion", "") + # Store content that doesn't match the current opinion type + alternative_authorless_content = [ + content + for content in authorless_content + if content.get("type") != opinion_type + ] + # Keep content that matches the current type + authorless_content = [ + op_content + for op_content in authorless_content + if op_content.get("type") == opinion_type + ] + + if alternative_authorless_content: + # Keep floating text that are not from the same type, + # we need to create a separate opinion for those, + # for example: in 2713f39c5a8e8684.xml we have an opinion + # without an author, and the next opinion with an author is + # a dissent opinion, we can't combine both + opinions, order = merge_opinions( + opinions, alternative_authorless_content, order + ) 
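+
+            # The same-type authorless entries kept above are prepended to
+            # this bylined opinion's text below, so floating text stays with
+            # the opinion that follows it.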
+ + opinion_content = ( + "\n".join( + [ + f.get("opinion") + for f in authorless_content + if f.get("type") == opinion_type + ] + ) + + "\n\n" + + opinion_content + ) + + # Add new opinion + new_opinion = { + "byline": byline, + "type": opinion_type, + "opinion": opinion_content, + "order": order, + "per_curiam": is_per_curiam_opinion(opinion_content, byline), + } + + opinions.append(new_opinion) + order = order + 1 + authorless_content = [] + + if len(extracted_opinions) == i and authorless_content: + # If is the last opinion, and we still have opinions without + # byline, create an opinion without an author and the contents + # that couldn't be merged + opinions, order = merge_opinions( + opinions, authorless_content, order + ) + + return opinions + + +def map_opinion_types(opinions=None) -> None: + """Map opinion type to model field choice + + :param opinions: a list that contains all opinions as dict elements + :return: None + """ + + if opinions is None: + opinions = [] + lead = False + for op in opinions: + op_type = op.get("type") + # Only first opinion with "opinion" type is a lead opinion, the next + # opinion with "opinion" type is an addendum + if not lead and op_type and op_type == "opinion": + lead = True + op["type"] = "020lead" + continue + elif lead and op_type and op_type == "opinion": + op["type"] = "050addendum" + elif op_type and op_type == "dissent": + op["type"] = "040dissent" + elif op_type and op_type == "concurrence": + op["type"] = "030concurrence" + + +# TODO ------------------------ remove until here ------------------------------- def match_text_lists( file_opinions_list: List[Any], cl_opinions_list: List[Any] ) -> dict[int, int]: """Generate matching lists above threshold + :param file_opinions_list: Opinions from file :param cl_opinions_list: CL opinions :return: Matches if found or empty dict @@ -50,10 +388,11 @@ def match_text_lists( return matches -def get_opinion_content( +def get_opinions_cleaned_content( cluster_id, -) -> tuple[Optional[FieldFile], list[dict], int, bool]: - """Get the opinions content for a cluster object +) -> tuple[Optional[str], list[dict], int, bool]: + """Get cleaned opinions content for a cluster object + :param cluster_id: Cluster ID for a set of opinions :return: (xml path, list of extracted opinions, start position, True if combined opinions exists in cluster) @@ -67,380 +406,108 @@ def get_opinion_content( type="010combined" ) xml_path = None - combined_opinion = False + cluster_has_combined_opinion = False if combined_opinions_cluster: # the combined opinion will be displayed at beginning start_position = combined_opinions_cluster.count() - combined_opinion = True + cluster_has_combined_opinion = True else: # we don't have combined opinions, we start ordering from 0 to n start_position = 0 for i, op in enumerate(opinions_from_cluster.exclude(type="010combined")): if op.local_path and not xml_path: - # We store the field because we are using S3 for storage and that backend - # doesn't support absolute paths - xml_path = op.local_path - content = None - if len(op.html_with_citations) > 1: - content = op.html_with_citations - elif len(op.html_columbia) > 1: - content = op.html_columbia - elif len(op.html_lawbox) > 1: - content = op.html_lawbox - elif len(op.plain_text) > 1: - content = op.plain_text - elif len(op.html) > 1: - content = op.html - elif len(op.xml_harvard) > 1: - content = op.xml_harvard - if content: - soup = BeautifulSoup(content, features="html.parser") - prep_text = re.sub( - r"[^a-zA-Z0-9 ]", "", 
soup.getText(separator=" ").lower() - ) - prep_text = re.sub(" +", " ", prep_text) - cl_cleaned_opinions.append( - { - "id": op.id, - "byline": op.author_str, - "type": op.type, - "opinion": prep_text, - "order": i, - } - ) - - return xml_path, cl_cleaned_opinions, start_position, combined_opinion - + xml_path = str(op.local_path) -def get_opinions_columbia_xml(xml_filepath: FieldFile, xml_dir: str) -> list: - """Convert xml data into dict - :param xml_filepath: path of xml file - :param xml_dir: absolute path to the directory with columbia xml files - :return: dict with data - """ - - SIMPLE_TAGS = [ - "attorneys", - "caption", - "citation", - "court", - "date", - "docket", - "hearing_date", - "panel", - "posture", - "reporter_caption", - ] - - data = {} # type: dict - - if "/home/mlissner/columbia/opinions/" in str(xml_filepath): - filepath = str(xml_filepath).replace( - "/home/mlissner/columbia/opinions/", "" - ) - # fix file path temporarily - new_xml_filepath = os.path.join(xml_dir, filepath) - else: - logger.info(f"Can't fix xml file path: {xml_filepath}") - raise FileNotFoundError - - with open(new_xml_filepath, "r", encoding="utf-8") as f: - file_content = f.read() + content = None - data["unpublished"] = False + # We can only use columbia's content to infer the ordering + if len(op.html_columbia) > 1: + content = op.html_columbia - if "" in file_content: - file_content = file_content.replace( - "", "" - ) - file_content = file_content.replace("", "").replace( - "", "" + if not content: + raise EmptyOpinionException( + "There is no content in html_columbia field" ) - data["unpublished"] = True + soup = BeautifulSoup(content, features="html.parser") + opinion_text = soup.getText(separator=" ", strip=True) + prep_text = re.sub( + " +", " ", " ".join(opinion_text.split("\n")) + ).strip() + prep_text = re.sub(r"[^a-zA-Z0-9 ]", "", prep_text.lower()) + + cl_cleaned_opinions.append( + { + "id": op.id, + "byline": op.author_str, + "type": op.type, + "opinion": prep_text, + "order": i, + } + ) - # Sometimes opening and ending tag mismatch (e.g. 
c6b39dcb29c9c.xml) - file_content = file_content.replace( - "", "" + return ( + xml_path, + cl_cleaned_opinions, + start_position, + cluster_has_combined_opinion, ) - soup = BeautifulSoup(file_content, "lxml") - - # Find the outer tag to have all elements inside - find_opinion = soup.find("opinion") - - step_one_opinions = [] # type: list - opinions = [] # type: list - order = 0 - - if find_opinion: - untagged_content = [] - # We iterate all content, with and without tags - # STEP 1: Extract all content in multiple dict elements - for i, content in enumerate(find_opinion): # type: int, Tag - if type(content) == NavigableString: - # We found a raw string, store it - untagged_content.append(str(content)) +def fix_filepath(filepath: str) -> str: + """Fix filepath from file field - else: - if content.name in SIMPLE_TAGS + [ - "citation_line", - "opinion_byline", - "dissent_byline", - "concurrence_byline", - ]: - # Ignore these tags, it will be processed later - continue - elif content.name in [ - "opinion_text", - "dissent_text", - "concurrence_text", - ]: - if untagged_content: - # We found something other than a navigable string that is - # not an opinion, but now we have found an opinion, - # let's create this content first - - # default type - op_type = "opinion" - if step_one_opinions: - if step_one_opinions[-1].get("type"): - # use type of previous opinion if exists - op_type = step_one_opinions[-1].get("type") - - # Get rid of double spaces - opinion_content = re.sub( - " +", " ", "\n".join(untagged_content) - ).strip() # type: str - if opinion_content: - step_one_opinions.append( - { - "opinion": opinion_content, - "order": order, - "byline": "", - "type": op_type, - } - ) - order = order + 1 - untagged_content = [] - - byline = content.find_previous_sibling() - opinion_author = "" - if byline and "_byline" in byline.name: - opinion_author = byline.get_text() - - opinion_content = re.sub( - " +", " ", content.decode_contents() - ).strip() - if opinion_content: - step_one_opinions.append( - { - "opinion": opinion_content, - "order": order, - "byline": opinion_author, - "type": content.name.replace("_text", ""), - } - ) - order = order + 1 - - else: - # Content not inside _text tag, we store it - untagged_content.append(str(content)) - - if untagged_content: - # default type - op_type = "opinion" - if step_one_opinions: - if step_one_opinions[-1].get("type"): - # use type of previous opinion if exists - op_type = step_one_opinions[-1].get("type") - - opinion_content = re.sub( - " +", " ", "\n".join(untagged_content) - ).strip() - if opinion_content: - step_one_opinions.append( - { - "opinion": opinion_content, - "order": order, - "byline": "", - "type": op_type, - } - ) + :param filepath: path from file field + :return: new file path + """ + if "/home/mlissner/columbia/opinions/" in filepath: + filepath = filepath.replace("/home/mlissner/columbia/opinions/", "") + return filepath - # Step 2: Merge found content in the xml file - new_order = 0 - authorless_content = [] - - for i, found_content in enumerate(step_one_opinions, start=1): - byline = found_content.get("byline") - if not byline: - # Opinion has no byline, store it - authorless_content.append(found_content) - - if byline: - # Opinion has byline - opinion_type = found_content.get("type") - opinion_content = found_content.get("opinion", "") - # Store content that doesn't match the current type - alternative_authorless_content = [ - z - for z in authorless_content - if z.get("type") != opinion_type - ] - # Keep content that matches 
the current type - authorless_content = [ - z - for z in authorless_content - if z.get("type") == opinion_type - ] - - if alternative_authorless_content: - # Keep floating text that are not from the same type, - # we need to create a separate opinion for those, - # for example: in 2713f39c5a8e8684.xml we have an opinion - # without an author, and the next opinion with an author is - # a dissent opinion, we can't combine both - - # We check if the previous stored opinion matches the type of the - # content - relevant_opinions = ( - [opinions[-1]] - if opinions - and opinions[-1]["type"] - == alternative_authorless_content[0].get("type") - else [] - ) - if relevant_opinions: - previous_opinion = relevant_opinions[-1] - if previous_opinion.get( - "type" - ) == alternative_authorless_content[0].get("type"): - # Merge last opinion with previous opinion, it probably - # belongs the same author - relevant_opinions[-1][ - "opinion" - ] += "\n" + "\n".join( - [ - f.get("opinion") - for f in alternative_authorless_content - if f.get("opinion") - ] - ) - authorless_content = [] +def get_opinions_columbia_file(xml_filepath: str) -> list: + """Get opinions from columbia xml file and convert it into dict - else: - # No relevant opinions found, create a new opinion - new_opinion = { - "byline": None, - "type": alternative_authorless_content[0].get( - "type" - ), - "opinion": "\n".join( - [ - f.get("opinion") - for f in alternative_authorless_content - if f.get("opinion") - ] - ), - "order": new_order, - } - new_order = new_order + 1 - opinions.append(new_opinion) - - # Add new opinion - new_opinion = { - "byline": byline, - "type": opinion_type, - "opinion": "\n".join( - [ - f.get("opinion") - for f in authorless_content - if f.get("type") == opinion_type - ] - ) - + "\n\n" - + opinion_content, - "order": new_order, - } - - opinions.append(new_opinion) - new_order = new_order + 1 - authorless_content = [] - - if len(step_one_opinions) == i and authorless_content: - # If is the last opinion, and we still have opinions without - # byline, create an opinion without an author and the contents - # that couldn't be merged - - # We check if the previous stored opinion matches the type of the - # content - relevant_opinions = ( - [opinions[-1]] - if opinions - and opinions[-1]["type"] - == authorless_content[0].get("type") - else [] - ) + :param xml_filepath: path of xml file + :return: dict with data + """ + soup = read_xml_to_soup(xml_filepath) - if relevant_opinions: - previous_opinion = relevant_opinions[-1] - if previous_opinion.get("type") == authorless_content[ - 0 - ].get("type"): - # Merge last opinion with previous opinion, it probably - # belongs the same author - relevant_opinions[-1]["opinion"] += "\n" + "\n".join( - [ - f.get("opinion") - for f in authorless_content - if f.get("opinion") - ] - ) + # Find the outer tag to have all elements inside + outer_opinion = soup.find("opinion") - else: - # Create last floating opinion - new_opinion = { - "byline": None, - "type": authorless_content[0].get("type"), - "opinion": "\n".join( - [ - f.get("opinion") - for f in authorless_content - if f.get("opinion") - ] - ), - "order": new_order, - } - opinions.append(new_opinion) + extracted_opinions = extract_columbia_opinions(outer_opinion) + opinions = process_extracted_opinions(extracted_opinions) + map_opinion_types(opinions) for op in opinions: opinion_content = op.get("opinion") - opinion_content = BeautifulSoup( - opinion_content, "html.parser" - ).getText() - opinion_content = re.sub(r"[^a-zA-Z0-9 ]", 
"", opinion_content.lower()) - op["opinion"] = opinion_content + soup = BeautifulSoup(opinion_content, "html.parser") + opinion_text = soup.getText(separator=" ", strip=True) + opinion_text = re.sub( + " +", " ", " ".join(opinion_text.split("\n")) + ).strip() + cleaned_opinion = re.sub(r"[^a-zA-Z0-9 ]", "", opinion_text.lower()) + op["opinion"] = cleaned_opinion return opinions -def run_harvard(start_id: int, end_id: int): - """ - We assume that harvard data is already ordered, we just need to fill the order +def sort_harvard_opinions(start_id: int, end_id: int) -> None: + """We assume that harvard data is already ordered, we just need to fill the order field in each opinion + :param start_id: skip any id lower than this value :param end_id: skip any id greater than this value + :return: None """ # Get all harvard clusters with more than one opinion clusters = ( OpinionCluster.objects.prefetch_related("sub_opinions") .annotate(opinions_count=Count("sub_opinions")) - .filter(opinions_count__gt=1, source="U") + .filter(opinions_count__gt=1, source__in=VALID_HARVARD_SOURCES) .order_by("id") ) @@ -478,18 +545,19 @@ def run_harvard(start_id: int, end_id: int): logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") -def run_columbia(start_id: int, end_id: int, xml_dir: str): - """ - Update opinion order for columbia clusters +def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: + """Update opinion ordering for columbia clusters + :param start_id: skip any id lower than this value :param end_id: skip any id greater than this value :param xml_dir: absolute path to the directory with columbia xml files + :return: None """ # Get all columbia cluster ids with more than one opinion clusters = ( OpinionCluster.objects.annotate(opinions_count=Count("sub_opinions")) - .filter(opinions_count__gt=1, source="Z") + .filter(opinions_count__gt=1, source__in=VALID_COLUMBIA_SOURCES) .order_by("id") .values_list("id", flat=True) ) @@ -502,37 +570,53 @@ def run_columbia(start_id: int, end_id: int, xml_dir: str): for cluster_id in clusters: logger.info(f"Processing cluster id: {cluster_id}") - ( - xml_path, - cl_cleaned_opinions, - start_position, - combined_opinion, - ) = get_opinion_content(cluster_id) - - columbia_opinions = None + + try: + ( + xml_path, + cl_cleaned_opinions, + start_position, + cluster_has_combined_opinion, + ) = get_opinions_cleaned_content(cluster_id) + except EmptyOpinionException: + logger.warning( + f"At least one of the opinions from cluster id: {cluster_id} is empty." 
+ ) + continue + + extracted_columbia_opinions = None if xml_path: - try: - columbia_opinions = get_opinions_columbia_xml( - xml_path, xml_dir - ) - except FileNotFoundError: + fixed_xml_filepath = os.path.join(xml_dir, fix_filepath(xml_path)) + + if not os.path.exists(fixed_xml_filepath): logger.warning( - f"Xml file not found in {xml_path}, cluster id: {cluster_id}" + f"Xml file not found in {fixed_xml_filepath}, cluster id: {cluster_id}" + ) + continue + + try: + extracted_columbia_opinions = get_opinions_columbia_file( + fixed_xml_filepath ) + except UnicodeDecodeError: + logger.warning(f"Cannot decode file: {fixed_xml_filepath}") continue - if cl_cleaned_opinions and columbia_opinions: + if cl_cleaned_opinions and extracted_columbia_opinions: + columbia_opinions_content = [ + op.get("opinion") + for op in extracted_columbia_opinions + if op.get("opinion") + ] + cl_opinions_content = [ + op.get("opinion") + for op in cl_cleaned_opinions + if op.get("opinion") + ] + matches = match_text_lists( - [ - op.get("opinion") - for op in columbia_opinions - if op.get("opinion") - ], - [ - op.get("opinion") - for op in cl_cleaned_opinions - if op.get("opinion") - ], + columbia_opinions_content, + cl_opinions_content, ) if matches: @@ -557,7 +641,7 @@ def run_columbia(start_id: int, end_id: int, xml_dir: str): failed = False for file_pos, cl_pos in matches.items(): # file_pos is the correct index to find the opinion id to update - file_opinion = columbia_opinions[file_pos] + file_opinion = extracted_columbia_opinions[file_pos] # the order was calculated using the xml file file_order = file_opinion.get("order") + start_position cl_opinion = cl_cleaned_opinions[cl_pos] @@ -587,7 +671,7 @@ def run_columbia(start_id: int, end_id: int, xml_dir: str): failed = True break - if combined_opinion and not failed: + if cluster_has_combined_opinion and not failed: combined_opinions_cluster = Opinion.objects.filter( cluster_id=cluster_id, type="010combined" ).order_by("id") @@ -628,6 +712,7 @@ def add_arguments(self, parser): parser.add_argument( "--xml-dir", + default="/opt/courtlistener/_columbia", required=False, help="The absolute path to the directory with columbia xml files", ) @@ -653,11 +738,15 @@ def handle(self, *args, **options): ) return + if not options["process_harvard"] and not options["process_columbia"]: + print("One option required: process-harvard or process-columbia") + return + if options["process_harvard"]: - run_harvard(options["start_id"], options["end_id"]) + sort_harvard_opinions(options["start_id"], options["end_id"]) if options["process_columbia"] and options["xml_dir"]: - run_columbia( + sort_columbia_opinions( options["start_id"], options["end_id"], options["xml_dir"] ) From f808b95b68487580b3d24be400afee91dcd4f938 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 30 Nov 2023 11:43:58 -0600 Subject: [PATCH 23/50] fix(opinions_order): code refactored NOTE: functions found in columbia_utils.py and utils.py, were temporarily added in the command,when the necessary changes are combined we need to remove the functions and import them from the utils. 
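
For reference, a minimal usage sketch of the renamed matcher introduced
below (the input lists are assumed to be already-cleaned opinion texts):

    matches = match_opinion_lists(file_texts, cl_texts)
    # e.g. {0: 1, 1: 2}: file opinion 0 matched CL opinion 1, and so on.
    # A candidate pair is now rejected only when both signals are weak,
    # i.e. cosine_sim < 0.60 and percent_match < 60.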
---
 .../commands/update_opinions_order.py         | 231 +++++++++++-------
 1 file changed, 147 insertions(+), 84 deletions(-)

diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py
index ae931ba4b7..5b86c98130 100644
--- a/cl/corpus_importer/management/commands/update_opinions_order.py
+++ b/cl/corpus_importer/management/commands/update_opinions_order.py
@@ -1,12 +1,13 @@
 import os.path
 import re
-from typing import Any, List, Optional
+from typing import Any, Optional
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 from django.core.management import BaseCommand
+from django.db import transaction
 from django.db.models import Count
 
-from cl.corpus_importer.utils import similarity_scores
+from cl.corpus_importer.utils import compare_documents, similarity_scores
 from cl.lib.command_utils import logger
 from cl.lib.string_diff import get_cosine_similarity
 from cl.search.models import SOURCES, Opinion, OpinionCluster
@@ -24,7 +25,6 @@
 # TODO remove the funcitions below and import them from utils.py and columbia_utils.py when those changes get merged
 
-
 SIMPLE_TAGS = [
     "attorneys",
     "caption",
@@ -347,47 +347,86 @@ def map_opinion_types(opinions=None) -> None:
             op["type"] = "030concurrence"
 
 
-# TODO ------------------------ remove until here -------------------------------
-
-
-def match_text_lists(
-    file_opinions_list: List[Any], cl_opinions_list: List[Any]
+def match_opinion_lists(
+    file_opinions_list: list[Any], cl_opinions_list: list[Any]
 ) -> dict[int, int]:
-    """Generate matching lists above threshold
+    """Try to match the opinions on two lists and generate a dict with position of
+    matching opinions
+
+    Remove non-alphanumeric and non-whitespace characters from lowercased text,
+    this tries to make both texts in equal conditions to prove if both are similar or
+    equal
+
+    get_cosine_similarity works great when both texts are almost the same with very
+    small variations
+
+    Sometimes cosine similarity fails when there are small variations in text,
+    such as parties, attorneys, case name, or court that are included in the content
+    of the opinion, compare_documents() checks the percentage of the file opinion
+    text that is in the courtlistener opinion, having a large percentage means that
+    almost all the file opinion is in the courtlistener opinion, but there is a
+    possibility that the courtlistener opinion contains some additional data in the
+    opinion content (such as case name, parties, etc.)
+
+    compare_documents works well when the opinion from the file is a subset of the
+    opinion in CL, the percentage represents how much of the opinion of the file is
+    in the opinion from cl (content in cl opinion can have other data in the body
+    like posture, attorneys, etc. e.g. in cluster id: 7643871 we have the posture and
+    the opinion text but in the xml file we only have the opinion text, cosine_sim:
+    0.1639075094124459 and percent_match: 73)
+
+    Sometimes one algorithm performs better than the other, this is due to some
+    additional text, such as editor's notes, or the author, page number or posture
+    added to the opinion
+
+    Key is opinion position from file, Value is opinion position from cl opinion e.g.
+ matches {0: 1, 1: 2} 0 is file opinion and 1 in cl opinion, 1 is file opinion and + 2 is cl opinion :param file_opinions_list: Opinions from file :param cl_opinions_list: CL opinions :return: Matches if found or empty dict """ - # We import this here to avoid a circular import - from cl.corpus_importer.management.commands.harvard_opinions import ( - compare_documents, - ) scores = similarity_scores(file_opinions_list, cl_opinions_list) matches = {} for i, row in enumerate(scores): j = row.argmax() # type: ignore - # Lower threshold for small opinions. - if ( - get_cosine_similarity(file_opinions_list[i], cl_opinions_list[j]) - < 0.60 - ): - continue - percent_match = compare_documents( - file_opinions_list[i], cl_opinions_list[j] + file_opinion = re.sub( + r"[^a-zA-Z0-9 ]", "", file_opinions_list[i].lower() ) - if percent_match < 60: + cl_opinion = re.sub(r"[^a-zA-Z0-9 ]", "", cl_opinions_list[j].lower()) + + cosine_sim = get_cosine_similarity(file_opinion, cl_opinion) + + percent_match = compare_documents(file_opinion, cl_opinion) + + if cosine_sim < 0.60 and percent_match < 60: continue + matches[i] = j - # Key is opinion position from file, Value is opinion position from cl opinion - # e.g. matches {0: 1, 1: 2} 0 is file opinion and 1 in cl opinion, 1 is file - # opinion and 2 is cl opinion return matches +def clean_opinion_content(text: str) -> str: + """Clean opinion content + + :param text: text to clean + :return: cleaned text + """ + + # Replace line breaks with spaces and get rid of double spaces + text = re.sub(" +", " ", " ".join(text.split("\n"))).strip() + + # Remove non-alphanumeric and non-whitespace characters from lowercased text + return re.sub(r"[^a-zA-Z0-9 ]", "", text.lower()) + + +# TODO ------------------------ remove until here ------------------------------- + + def get_opinions_cleaned_content( cluster_id, ) -> tuple[Optional[str], list[dict], int, bool]: @@ -432,10 +471,7 @@ def get_opinions_cleaned_content( soup = BeautifulSoup(content, features="html.parser") opinion_text = soup.getText(separator=" ", strip=True) - prep_text = re.sub( - " +", " ", " ".join(opinion_text.split("\n")) - ).strip() - prep_text = re.sub(r"[^a-zA-Z0-9 ]", "", prep_text.lower()) + prep_text = clean_opinion_content(opinion_text) cl_cleaned_opinions.append( { @@ -485,10 +521,7 @@ def get_opinions_columbia_file(xml_filepath: str) -> list: opinion_content = op.get("opinion") soup = BeautifulSoup(opinion_content, "html.parser") opinion_text = soup.getText(separator=" ", strip=True) - opinion_text = re.sub( - " +", " ", " ".join(opinion_text.split("\n")) - ).strip() - cleaned_opinion = re.sub(r"[^a-zA-Z0-9 ]", "", opinion_text.lower()) + cleaned_opinion = clean_opinion_content(opinion_text) op["opinion"] = cleaned_opinion return opinions @@ -545,6 +578,78 @@ def sort_harvard_opinions(start_id: int, end_id: int) -> None: logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") +def update_opinions( + cluster_id: int, + cl_opinions: list, + columbia_opinions: list, + matches: dict, + cluster_has_combined_opinion: bool, + start_position: int, +): + """Update opinions with correct order + + :param cluster_id: + :param cl_opinions: a list with cleaned opinions from cl + :param columbia_opinions: a ordered list with cleaned opinions from xml file + :param matches: a dict with the matches of each opinion of both lists + :param cluster_has_combined_opinion: True if the cluster has combined opinions + :param start_position: the number from where the order should begin for + non-combined 
opinions + :return: None + """ + update_failed = False + + with transaction.atomic(): + for file_pos, cl_pos in matches.items(): + # file_pos is the correct index to find the opinion id to update + file_opinion = columbia_opinions[file_pos] + # the order was calculated using the xml file + file_order = file_opinion.get("order") + start_position + cl_opinion = cl_opinions[cl_pos] + opinion_id_to_update = cl_opinion.get("id") + + if opinion_id_to_update: + try: + # Update opinion order + op = Opinion.objects.get(id=opinion_id_to_update) + op.order = file_order + op.save() + except Opinion.DoesNotExist: + # This should not happen, but it is better to be + # cautious + logger.warning( + f"We can't update opinion, opinion doesn't exist " + f"with id: {opinion_id_to_update}" + ) + update_failed = True + break + + if cluster_has_combined_opinion and not update_failed: + combined_opinions_cluster = Opinion.objects.filter( + cluster_id=cluster_id, type="010combined" + ).order_by("id") + + # Show combined opinions at beginning + for opinion_order, cluster_op in enumerate( + combined_opinions_cluster + ): + cluster_op.order = opinion_order + cluster_op.save() + + if update_failed: + # There was an error updating an opinion, rollback all changes for + # cluster's opinions + logger.warning( + f"There was an error updating the order of opinions of the " + f"cluster id: {cluster_id}" + ) + transaction.set_rollback(True) + else: + logger.info( + f"The order of opinions was updated, cluster id: {cluster_id}" + ) + + def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: """Update opinion ordering for columbia clusters @@ -614,7 +719,7 @@ def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: if op.get("opinion") ] - matches = match_text_lists( + matches = match_opinion_lists( columbia_opinions_content, cl_opinions_content, ) @@ -638,57 +743,15 @@ def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: # Go to next cluster id continue - failed = False - for file_pos, cl_pos in matches.items(): - # file_pos is the correct index to find the opinion id to update - file_opinion = extracted_columbia_opinions[file_pos] - # the order was calculated using the xml file - file_order = file_opinion.get("order") + start_position - cl_opinion = cl_cleaned_opinions[cl_pos] - opinion_id_to_update = cl_opinion.get("id") - - if opinion_id_to_update: - try: - # Save opinion - op = Opinion.objects.get(id=opinion_id_to_update) - op.order = file_order - op.save() - logger.info( - f"Cluster id processed: {cluster_id} Update opinion id: {opinion_id_to_update} with position: {file_order}" - ) - except Opinion.DoesNotExist: - logger.warning( - f"We can't update opinion, opinion doesn't exist with " - f"id: {opinion_id_to_update}" - ) - failed = True - break - else: - logger.warning( - f"We can't update opinion, empty opinion id " - f"from cluster: {cluster_id}" - ) - failed = True - break - - if cluster_has_combined_opinion and not failed: - combined_opinions_cluster = Opinion.objects.filter( - cluster_id=cluster_id, type="010combined" - ).order_by("id") - - # Show combined opinions at beginning - for opinion_order, cluster_op in enumerate( - combined_opinions_cluster - ): - cluster_op.order = opinion_order - cluster_op.save() - - else: - # No matches found - logger.warning( - f"Failed to match opinions from cluster id: {cluster_id}" + # Update all opinions order + update_opinions( + cluster_id, + cl_cleaned_opinions, + extracted_columbia_opinions, + matches, + 
cluster_has_combined_opinion, + start_position, ) - continue class Command(BaseCommand): From f928aa021fe9de812f9e82b64a044582b5ffda78 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 19 Feb 2024 18:13:46 -0600 Subject: [PATCH 24/50] fix(opinion_order): update poetry.lock and pyproject.toml --- poetry.lock | 13 ++++++++++++- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 24a1c45791..25db969843 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1062,6 +1062,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." +optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -5105,4 +5116,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <3.13" -content-hash = "d0cb9ebf26ba111318df8c00976f71ad6b18ffc1aafab1df3b506bfe5128611d" +content-hash = "a8dfd3edc2209cb2d357696b751508ebd0c249be0b1b408f2f7225884a5e7b2a" diff --git a/pyproject.toml b/pyproject.toml index 32afda8f5f..e8d88a61f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,6 +112,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"} django-model-utils = "^4.3.1" juriscraper = "*" django-permissions-policy = "^4.19.0" +django-ordered-model = "^3.7.4" [tool.poetry.group.dev.dependencies] From d46b42fd39b6abacf301ae3ce46ed090d5cb5446 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 19 Feb 2024 18:21:48 -0600 Subject: [PATCH 25/50] fix(opinion_order): rename migrations --- .../{0024_order_opinions.py => 0027_order_opinions.py} | 2 +- .../{0024_order_opinions.sql => 0027_order_opinions.sql} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cl/search/migrations/{0024_order_opinions.py => 0027_order_opinions.py} (98%) rename cl/search/migrations/{0024_order_opinions.sql => 0027_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0024_order_opinions.py b/cl/search/migrations/0027_order_opinions.py similarity index 98% rename from cl/search/migrations/0024_order_opinions.py rename to cl/search/migrations/0027_order_opinions.py index 1abaed4d76..e1c602e2e5 100644 --- a/cl/search/migrations/0024_order_opinions.py +++ b/cl/search/migrations/0027_order_opinions.py @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("search", "0023_add_docket_sources_noop"), + ("search", "0026_drop_docket_unique_together_and_more"), ] operations = [ diff --git a/cl/search/migrations/0024_order_opinions.sql b/cl/search/migrations/0027_order_opinions.sql similarity index 100% rename from cl/search/migrations/0024_order_opinions.sql rename to cl/search/migrations/0027_order_opinions.sql From cefb8482ed586e65526f59818901eca56ca26e7d Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 6 May 2024 18:12:54 -0600 Subject: [PATCH 26/50] feat(opinion_order): resolve merge conflict, rename migrations --- ..._order_opinions.py => 0031_order_opinions.py} | 2 +- ...rder_opinions.sql => 0031_order_opinions.sql} | 0 poetry.lock | 16 +++++++++++++--- 3 files changed, 14 insertions(+), 
4 deletions(-) rename cl/search/migrations/{0027_order_opinions.py => 0031_order_opinions.py} (98%) rename cl/search/migrations/{0027_order_opinions.sql => 0031_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0027_order_opinions.py b/cl/search/migrations/0031_order_opinions.py similarity index 98% rename from cl/search/migrations/0027_order_opinions.py rename to cl/search/migrations/0031_order_opinions.py index e1c602e2e5..9e7774203d 100644 --- a/cl/search/migrations/0027_order_opinions.py +++ b/cl/search/migrations/0031_order_opinions.py @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("search", "0026_drop_docket_unique_together_and_more"), + ("search", "0030_recapdocument_pacer_doc_id_idx"), ] operations = [ diff --git a/cl/search/migrations/0027_order_opinions.sql b/cl/search/migrations/0031_order_opinions.sql similarity index 100% rename from cl/search/migrations/0027_order_opinions.sql rename to cl/search/migrations/0031_order_opinions.sql diff --git a/poetry.lock b/poetry.lock index f22583b490..109cadc2d3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "amqp" @@ -1062,6 +1062,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." +optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -2467,7 +2478,6 @@ files = [ {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:9e2addd2d1866fe112bc6f80117bcc6bc25191c5ed1bfbcf9f1386a884252ae8"}, {file = "lxml-5.2.1-cp37-cp37m-win32.whl", hash = "sha256:f51969bac61441fd31f028d7b3b45962f3ecebf691a510495e5d2cd8c8092dbd"}, {file = "lxml-5.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:b0b58fbfa1bf7367dde8a557994e3b1637294be6cf2169810375caf8571a085c"}, - {file = "lxml-5.2.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3e183c6e3298a2ed5af9d7a356ea823bccaab4ec2349dc9ed83999fd289d14d5"}, {file = "lxml-5.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:804f74efe22b6a227306dd890eecc4f8c59ff25ca35f1f14e7482bbce96ef10b"}, {file = "lxml-5.2.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:08802f0c56ed150cc6885ae0788a321b73505d2263ee56dad84d200cab11c07a"}, {file = "lxml-5.2.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f8c09ed18ecb4ebf23e02b8e7a22a05d6411911e6fabef3a36e4f371f4f2585"}, @@ -5259,4 +5269,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <3.13" -content-hash = "994213014ffbb4387604c85fddd76e01112f4e3b66a1be6bc77f601b5b1de1b8" +content-hash = "c6a4dd1a9c6ecf961e254a3d6d0387f4d5e6f6fdb4181c33e2c55174e68d4454" From d1a1708f363764056e4c6f9e0159e460675ad3da Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 5 Jun 2024 12:58:05 -0600 Subject: [PATCH 27/50] fix(opinion_order): update poetry.lock to 
solve merge conflicts --- poetry.lock | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 6d7f85852a..cbc5ec2cc3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1058,6 +1058,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." +optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -5254,4 +5265,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <3.13" -content-hash = "814ca0b0dc8db689f83e391fc58b494de48f6321085872bfaa8e37b7a7fc0e99" +content-hash = "a64d61d094d3896cb204e882ff2471b4f3b69def7416a2b50cdcedc9acf6455e" From 754d71fda6d7a12d4dfc6dddf121399d6d0582c9 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 5 Jun 2024 13:06:55 -0600 Subject: [PATCH 28/50] fix(opinion_order): rename migration --- .../{0031_order_opinions.py => 0032_order_opinions.py} | 2 +- .../{0031_order_opinions.sql => 0032_order_opinions.sql} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cl/search/migrations/{0031_order_opinions.py => 0032_order_opinions.py} (98%) rename cl/search/migrations/{0031_order_opinions.sql => 0032_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0031_order_opinions.py b/cl/search/migrations/0032_order_opinions.py similarity index 98% rename from cl/search/migrations/0031_order_opinions.py rename to cl/search/migrations/0032_order_opinions.py index 9e7774203d..b34bb01d48 100644 --- a/cl/search/migrations/0031_order_opinions.py +++ b/cl/search/migrations/0032_order_opinions.py @@ -7,7 +7,7 @@ class Migration(migrations.Migration): dependencies = [ - ("search", "0030_recapdocument_pacer_doc_id_idx"), + ("search", "0031_alter_opinion_type_alter_opinioncluster_source_noop"), ] operations = [ diff --git a/cl/search/migrations/0031_order_opinions.sql b/cl/search/migrations/0032_order_opinions.sql similarity index 100% rename from cl/search/migrations/0031_order_opinions.sql rename to cl/search/migrations/0032_order_opinions.sql From d7132ec90bf778ae9f28855b31724b4a21bee33e Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 19 Jul 2024 10:57:20 -0600 Subject: [PATCH 29/50] fix(opinion_order): update poetry.lock --- poetry.lock | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index f7f6c67e40..4d48c0c2ed 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1116,6 +1116,17 @@ files = [ [package.dependencies] Django = ">=3.2" +[[package]] +name = "django-ordered-model" +version = "3.7.4" +description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." 
+optional = false +python-versions = "*" +files = [ + {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, + {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, +] + [[package]] name = "django-override-storage" version = "0.3.2" @@ -5461,4 +5472,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <3.13" -content-hash = "e6d34875888f1687912d03d33ea68038bba6c6d487037c6454d5b18449ec6d0c" +content-hash = "5334f16d006f7486a5f9b905906f2a9a68e7f524684c04af3d0994ebd0999384" From 2013633d8c5b87d6fc7dfc4dfc4c701ad0fb18c2 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 19 Jul 2024 12:22:47 -0600 Subject: [PATCH 30/50] refactor(update_opinions_order): refactor code --- .../commands/update_opinions_order.py | 425 +----------------- 1 file changed, 20 insertions(+), 405 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 5b86c98130..85ed93e0e2 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -1,15 +1,20 @@ import os.path import re -from typing import Any, Optional +from typing import Optional -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup from django.core.management import BaseCommand from django.db import transaction from django.db.models import Count -from cl.corpus_importer.utils import compare_documents, similarity_scores +from cl.corpus_importer.import_columbia.columbia_utils import ( + extract_columbia_opinions, + map_opinion_types, + process_extracted_opinions, + read_xml_to_soup, +) +from cl.corpus_importer.utils import EmptyOpinionException, match_opinion_lists from cl.lib.command_utils import logger -from cl.lib.string_diff import get_cosine_similarity from cl.search.models import SOURCES, Opinion, OpinionCluster VALID_COLUMBIA_SOURCES = [ @@ -23,393 +28,6 @@ ] -# TODO remove the funcitions below and import them from utils.py and columbia_utils.py when those changes get merged - -SIMPLE_TAGS = [ - "attorneys", - "caption", - "citation", - "court", - "date", - "docket", - "hearing_date", - "panel", - "posture", - "reporter_caption", -] - - -class EmptyOpinionException(Exception): - """An exception for opinions that raise a ZeroDivisionError Exception due empty - opinion tag or empty opinion content in cl""" - - def __init__(self, message: str) -> None: - self.message = message - - -def read_xml_to_soup(filepath: str) -> BeautifulSoup: - """This function reads the xml file, fixes the bad tags in columbia xml - files and returns a BeautifulSoup object - - :param filepath: path to xml file - :return: BeautifulSoup object of parsed content - """ - with open(filepath, "r", encoding="utf-8") as f: - file_content = f.read() - # Sometimes opening and ending tag mismatch (e.g. 
ed7c6b39dcb29c9c.xml) - file_content = file_content.replace( - "", "" - ) - # Fix opinion with invalid attribute - if "" in file_content: - file_content = file_content.replace( - "", "" - ) - file_content = file_content.replace("", "").replace( - "", "" - ) - return BeautifulSoup(file_content, "lxml") - - -def add_floating_opinion( - opinions: list, floating_content: list, opinion_order: int -) -> list: - """We have found floating opinions in bs object, we keep the opinion - content as a new opinion - - :param opinions: a list with opinions found - :param floating_content: content that is not in known non-opinion tags - :param opinion_order: opinion position - :return: updated list of opinions - """ - op_type = "opinion" - if opinions: - if opinions[-1].get("type"): - # Use type of previous opinion if exists - op_type = opinions[-1].get("type") - - # Get rid of double spaces from floating content - opinion_content = re.sub( - " +", " ", "\n".join(floating_content) - ).strip() # type: str - if opinion_content: - opinions.append( - { - "opinion": opinion_content, - "order": opinion_order, - "byline": "", - "type": op_type, - } - ) - return opinions - - -def extract_columbia_opinions( - outer_opinion: BeautifulSoup, -) -> list[Optional[dict]]: - """We extract all possible opinions from BeautifulSoup, with and without - author, and we create new opinions if floating content exists(content that - is not explicitly defined within an opinion tag or doesn't have an author) - - :param outer_opinion: element containing all xml tags - :return: list of opinion dicts - """ - opinions: list = [] - floating_content = [] - order = 0 - - # We iterate all content to look for all possible opinions - for i, content in enumerate(outer_opinion): # type: int, Tag - if isinstance(content, NavigableString): - # We found a raw string, store it - floating_content.append(str(content)) - else: - if content.name in SIMPLE_TAGS + [ - "citation_line", - "opinion_byline", - "dissent_byline", - "concurrence_byline", - ]: - # Ignore these tags, it will be processed later - continue - elif content.name in [ - "opinion_text", - "dissent_text", - "concurrence_text", - ]: - if floating_content: - # We have found an opinion, but there is floating - # content, we create a dict with the opinion using the - # floating content with default type = "opinion" - opinions = add_floating_opinion( - opinions, floating_content, order - ) - floating_content = [] - - byline = content.find_previous_sibling() - opinion_author = "" - if byline and "_byline" in byline.name: - opinion_author = byline.get_text() - - opinion_content = re.sub( - " +", " ", content.decode_contents() - ).strip() - if opinion_content: - # Now we create a dict with current opinion - opinions.append( - { - "opinion": opinion_content, - "order": order, - "byline": opinion_author, - "type": content.name.replace("_text", ""), - } - ) - order = order + 1 - - else: - if content.name not in SIMPLE_TAGS + ["syllabus"]: - # We store content that is not inside _text tag and is - # not in one of the known non-opinion tags - floating_content.append(str(content)) - - # Combine the new content into another opinion. great. 
- if floating_content: - # If we end to go through all the found opinions and if we still - # have floating content out there, we create a new opinion with the - # last type of opinion - opinions = add_floating_opinion(opinions, floating_content, order) - return opinions - - -def is_per_curiam_opinion( - content: Optional[str], byline: Optional[str] -) -> bool: - """Check if opinion author is per curiam - :param content: opinion content - :param byline: opinion text author - :return: True if opinion author is per curiam - """ - if byline and "per curiam" in byline[:1000].lower(): - return True - if content and "per curiam" in content[:1000].lower(): - return True - return False - - -def merge_opinions( - opinions: list, content: list, current_order: int -) -> tuple[list, int]: - """Merge last and previous opinion if are the same type or create a new - opinion if merge is not possible - - :param opinions: list of opinions that is being updated constantly - :param content: list of opinions without an author - :param current_order: opinion position - :return: updated list of opinions - """ - - # We check if the previous stored opinion matches the type of the - # content, and we store the opinion dict temporary - relevant_opinions = ( - [opinions[-1]] - if opinions and opinions[-1]["type"] == content[0].get("type") - else [] - ) - - if relevant_opinions: - relevant_opinions[-1]["opinion"] += "\n" + "\n".join( - [f.get("opinion") for f in content if f.get("opinion")] - ) - - else: - # No relevant opinions found, create a new opinion with the content - opinion_content = "\n".join( - [f.get("opinion") for f in content if f.get("opinion")] - ) - new_opinion = { - "byline": None, - "type": content[0].get("type"), - "opinion": opinion_content, - "order": current_order, - "per_curiam": is_per_curiam_opinion(opinion_content, None), - } - opinions.append(new_opinion) - current_order = current_order + 1 - - return opinions, current_order - - -def process_extracted_opinions(extracted_opinions: list) -> list: - """We read the extracted data in extract_opinions function to merge all - possible floating opinions (it is not explicitly defined within an opinion - tag or doesn't have an author) - - :param extracted_opinions: list of opinions obtained from xml file - :return: a list with extracted and processed opinions - """ - - opinions: list = [] - authorless_content = [] - order = 0 - - for i, found_content in enumerate(extracted_opinions, start=1): - byline = found_content.get("byline") - if not byline: - # Opinion has no byline, store opinion content - authorless_content.append(found_content) - - if byline: - # Opinion has byline, get opinion type and content - opinion_type = found_content.get("type") - opinion_content = found_content.get("opinion", "") - # Store content that doesn't match the current opinion type - alternative_authorless_content = [ - content - for content in authorless_content - if content.get("type") != opinion_type - ] - # Keep content that matches the current type - authorless_content = [ - op_content - for op_content in authorless_content - if op_content.get("type") == opinion_type - ] - - if alternative_authorless_content: - # Keep floating text that are not from the same type, - # we need to create a separate opinion for those, - # for example: in 2713f39c5a8e8684.xml we have an opinion - # without an author, and the next opinion with an author is - # a dissent opinion, we can't combine both - opinions, order = merge_opinions( - opinions, alternative_authorless_content, order - ) 
-
-            opinion_content = (
-                "\n".join(
-                    [
-                        f.get("opinion")
-                        for f in authorless_content
-                        if f.get("type") == opinion_type
-                    ]
-                )
-                + "\n\n"
-                + opinion_content
-            )
-
-            # Add new opinion
-            new_opinion = {
-                "byline": byline,
-                "type": opinion_type,
-                "opinion": opinion_content,
-                "order": order,
-                "per_curiam": is_per_curiam_opinion(opinion_content, byline),
-            }
-
-            opinions.append(new_opinion)
-            order = order + 1
-            authorless_content = []
-
-        if len(extracted_opinions) == i and authorless_content:
-            # If is the last opinion, and we still have opinions without
-            # byline, create an opinion without an author and the contents
-            # that couldn't be merged
-            opinions, order = merge_opinions(
-                opinions, authorless_content, order
-            )
-
-    return opinions
-
-
-def map_opinion_types(opinions=None) -> None:
-    """Map opinion type to model field choice
-
-    :param opinions: a list that contains all opinions as dict elements
-    :return: None
-    """
-
-    if opinions is None:
-        opinions = []
-    lead = False
-    for op in opinions:
-        op_type = op.get("type")
-        # Only first opinion with "opinion" type is a lead opinion, the next
-        # opinion with "opinion" type is an addendum
-        if not lead and op_type and op_type == "opinion":
-            lead = True
-            op["type"] = "020lead"
-            continue
-        elif lead and op_type and op_type == "opinion":
-            op["type"] = "050addendum"
-        elif op_type and op_type == "dissent":
-            op["type"] = "040dissent"
-        elif op_type and op_type == "concurrence":
-            op["type"] = "030concurrence"
-
-
-def match_opinion_lists(
-    file_opinions_list: list[Any], cl_opinions_list: list[Any]
-) -> dict[int, int]:
-    """Try to match the opinions on two lists and generate a dict with position of
-    matching opinions
-
-    Remove non-alphanumeric and non-whitespace characters from lowercased text,
-    this tries to make both texts in equal conditions to prove if both are similar or
-    equal
-
-    get_cosine_similarity works great when both texts are almost the same with very
-    small variations
-
-    Sometimes cosine similarity fails when there are small variations in text,
-    such as parties, attorneys, case name, or court that are included in the content
-    of the opinion, compare_documents() checks the percentage of the file opinion
-    text that is in the courtlistener opinion, having a large percentage means that
-    almost all the file opinion is in the courtlistener opinion, but there is a
-    possibility that the courtlistener opinion contains some additional data in the
-    opinion content (such as case name, parties, etc.)
-
-    compare_documents works well when the opinion from the file is a subset of the
-    opinion in CL, the percentage represents how much of the opinion of the file is
-    in the opinion from cl (content in cl opinion can have other data in the body
-    like posture, attorneys, etc. e.g. in cluster id: 7643871 we have the posture and
-    the opinion text but in the xml file we only have the opinion text, cosine_sim:
-    0.1639075094124459 and percent_match: 73)
-
-    Sometimes one algorithm performs better than the other, this is due to some
-    additional text, such as editor's notes, or the author, page number or posture
-    added to the opinion
-
-    Key is opinion position from file, Value is opinion position from cl opinion e.g.
- matches {0: 1, 1: 2} 0 is file opinion and 1 in cl opinion, 1 is file opinion and - 2 is cl opinion - - :param file_opinions_list: Opinions from file - :param cl_opinions_list: CL opinions - :return: Matches if found or empty dict - """ - - scores = similarity_scores(file_opinions_list, cl_opinions_list) - - matches = {} - for i, row in enumerate(scores): - j = row.argmax() # type: ignore - file_opinion = re.sub( - r"[^a-zA-Z0-9 ]", "", file_opinions_list[i].lower() - ) - cl_opinion = re.sub(r"[^a-zA-Z0-9 ]", "", cl_opinions_list[j].lower()) - - cosine_sim = get_cosine_similarity(file_opinion, cl_opinion) - - percent_match = compare_documents(file_opinion, cl_opinion) - - if cosine_sim < 0.60 and percent_match < 60: - continue - - matches[i] = j - - return matches - - def clean_opinion_content(text: str) -> str: """Clean opinion content @@ -424,9 +42,6 @@ def clean_opinion_content(text: str) -> str: return re.sub(r"[^a-zA-Z0-9 ]", "", text.lower()) -# TODO ------------------------ remove until here ------------------------------- - - def get_opinions_cleaned_content( cluster_id, ) -> tuple[Optional[str], list[dict], int, bool]: @@ -531,6 +146,8 @@ def sort_harvard_opinions(start_id: int, end_id: int) -> None: """We assume that harvard data is already ordered, we just need to fill the order field in each opinion + The harvard importer created the opinions in order of appearance in the file + :param start_id: skip any id lower than this value :param end_id: skip any id greater than this value :return: None @@ -795,25 +412,23 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): - if options["process_harvard"] and options["process_columbia"]: - print( - "You can only select one option process-harvard or process-columbia" + + if not options["process_harvard"] and not options["process_columbia"]: + logger.info( + "One option required: process-harvard or process-columbia" ) return - if not options["process_harvard"] and not options["process_columbia"]: - print("One option required: process-harvard or process-columbia") + if options["process_harvard"] and options["process_columbia"]: + logger.info( + "You can only select one option process-harvard or process-columbia" + ) return if options["process_harvard"]: sort_harvard_opinions(options["start_id"], options["end_id"]) - if options["process_columbia"] and options["xml_dir"]: + if options["process_columbia"]: sort_columbia_opinions( options["start_id"], options["end_id"], options["xml_dir"] ) - - if options["process_columbia"] and not options["xml_dir"]: - print( - "Argument --xml-dir required to read xml files from mounted directory" - ) From 1e47ff4b503459047caeda15f5fac4b03e77b59a Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 25 Jul 2024 12:56:00 -0600 Subject: [PATCH 31/50] feat(opinion_order): remove django-ordered-model add order field to opinion model add unique_together for cluster and order --- cl/search/migrations/0032_order_opinions.py | 26 ++++++-------- cl/search/migrations/0032_order_opinions.sql | 37 +++++++++----------- cl/search/models.py | 7 ++-- cl/settings/django.py | 1 - poetry.lock | 13 +------ pyproject.toml | 1 - 6 files changed, 31 insertions(+), 54 deletions(-) diff --git a/cl/search/migrations/0032_order_opinions.py b/cl/search/migrations/0032_order_opinions.py index b34bb01d48..dbbe4707d0 100644 --- a/cl/search/migrations/0032_order_opinions.py +++ b/cl/search/migrations/0032_order_opinions.py @@ -1,8 +1,8 @@ -# Generated by Django 4.2.1 on 2023-06-15 17:56 +# Generated by Django 5.0.7 
on 2024-07-25 17:13 -from django.db import migrations, models import pgtrigger.compiler import pgtrigger.migrations +from django.db import migrations, models class Migration(migrations.Migration): @@ -11,10 +11,6 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterModelOptions( - name="opinion", - options={"ordering": ("order",)}, - ), pgtrigger.migrations.RemoveTrigger( model_name="opinion", name="update_or_delete_snapshot_delete", @@ -26,27 +22,25 @@ class Migration(migrations.Migration): migrations.AddField( model_name="opinion", name="order", - field=models.PositiveIntegerField( - db_index=True, default=1, editable=False, verbose_name="order" - ), - preserve_default=False, + field=models.IntegerField(blank=True, null=True), ), migrations.AddField( model_name="opinionevent", name="order", - field=models.PositiveIntegerField( - default=1, editable=False, verbose_name="order" - ), - preserve_default=False, + field=models.IntegerField(blank=True, null=True), + ), + migrations.AlterUniqueTogether( + name="opinion", + unique_together={("cluster", "order")}, ), pgtrigger.migrations.AddTrigger( model_name="opinion", trigger=pgtrigger.compiler.Trigger( name="update_or_delete_snapshot_update", sql=pgtrigger.compiler.UpsertTriggerSql( - condition='WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."order" IS DISTINCT FROM (NEW."order") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr"))', + condition='WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR 
OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."order" IS DISTINCT FROM (NEW."order"))', func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', - hash="bcac41027f469bbd394e8671cb0b2fa33e7035f3", + hash="89fec08f03e567ec8ecc7cd1e8ec5f665abf9d3b", operation="UPDATE", pgid="pgtrigger_update_or_delete_snapshot_update_67ecd", table="search_opinion", diff --git a/cl/search/migrations/0032_order_opinions.sql b/cl/search/migrations/0032_order_opinions.sql index 3226cb510b..71161b2370 100644 --- a/cl/search/migrations/0032_order_opinions.sql +++ b/cl/search/migrations/0032_order_opinions.sql @@ -1,9 +1,5 @@ BEGIN; -- --- Change Meta options on opinion --- --- (no-op) --- -- Remove trigger update_or_delete_snapshot_delete from model opinion -- DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion"; @@ -14,13 +10,15 @@ DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_update_67ecd ON "sear -- -- Add field order to opinion -- -ALTER TABLE "search_opinion" ADD COLUMN "order" integer DEFAULT 1 NOT NULL CHECK ("order" >= 0); -ALTER TABLE "search_opinion" ALTER COLUMN "order" DROP DEFAULT; +ALTER TABLE "search_opinion" ADD COLUMN "order" integer NULL; -- -- Add field order to opinionevent -- -ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer DEFAULT 1 NOT NULL CHECK ("order" >= 0); -ALTER TABLE "search_opinionevent" ALTER COLUMN "order" DROP DEFAULT; +ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; +-- +-- Alter unique_together for opinion (1 constraint(s)) +-- +ALTER TABLE "search_opinion" ADD CONSTRAINT "search_opinion_cluster_id_order_8426d97d_uniq" UNIQUE ("cluster_id", "order"); -- -- Create trigger update_or_delete_snapshot_update on model opinion -- @@ -50,7 +48,7 @@ ALTER TABLE "search_opinionevent" ALTER COLUMN "order" DROP DEFAULT; CREATE OR REPLACE FUNCTION pgtrigger_update_or_delete_snapshot_update_67ecd() RETURNS TRIGGER AS $$ - + BEGIN IF ("public"._pgtrigger_should_ignore(TG_NAME) IS TRUE) THEN IF (TG_OP = 'DELETE') THEN @@ -66,13 +64,13 @@ ALTER TABLE "search_opinionevent" ALTER COLUMN "order" DROP DEFAULT; DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion"; CREATE TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd AFTER UPDATE ON "search_opinion" - - - FOR EACH ROW WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."order" IS DISTINCT FROM (NEW."order") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT 
FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr")) + + + FOR EACH ROW WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."order" IS DISTINCT FROM (NEW."order")) EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_update_67ecd(); - COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion" IS 'bcac41027f469bbd394e8671cb0b2fa33e7035f3'; - + COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion" IS '89fec08f03e567ec8ecc7cd1e8ec5f665abf9d3b'; + -- -- Create trigger update_or_delete_snapshot_delete on model opinion -- @@ -102,7 +100,7 @@ ALTER TABLE "search_opinionevent" ALTER COLUMN "order" DROP DEFAULT; CREATE OR REPLACE FUNCTION pgtrigger_update_or_delete_snapshot_delete_1f4fd() RETURNS TRIGGER AS $$ - + BEGIN IF ("public"._pgtrigger_should_ignore(TG_NAME) IS TRUE) THEN IF (TG_OP = 'DELETE') THEN @@ -118,12 +116,11 @@ ALTER TABLE "search_opinionevent" ALTER COLUMN "order" DROP DEFAULT; DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion"; CREATE TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd AFTER DELETE ON "search_opinion" - - - FOR EACH ROW + + + FOR EACH ROW EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_delete_1f4fd(); COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion" IS '79bebd7cda3c6ed3bc40f28799cf9c0f2638e2ad'; - -CREATE INDEX "search_opinion_order_d54dd126" ON "search_opinion" ("order"); + COMMIT; diff --git a/cl/search/models.py b/cl/search/models.py index 59ad525e88..9c04940e3e 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -22,7 +22,6 @@ from localflavor.us.models import 
USPostalCodeField, USZipCodeField from localflavor.us.us_states import OBSOLETE_STATES, USPS_CHOICES from model_utils import FieldTracker -from ordered_model.models import OrderedModel from cl.citations.utils import get_citation_depth_between_clusters from cl.custom_filters.templatetags.text_filters import best_case_name @@ -3149,7 +3148,7 @@ def sort_cites(c): @pghistory.track(AfterUpdateOrDeleteSnapshot()) -class Opinion(OrderedModel, AbstractDateTimeModel): +class Opinion(AbstractDateTimeModel): COMBINED = "010combined" UNANIMOUS = "015unamimous" LEAD = "020lead" @@ -3321,10 +3320,10 @@ class Opinion(OrderedModel, AbstractDateTimeModel): "sha1", ] ) - order_with_respect_to = "cluster" + order = models.IntegerField(null=True, blank=True) class Meta: - ordering = ("order",) + unique_together = ("cluster", "order") @property def siblings(self) -> QuerySet: diff --git a/cl/settings/django.py b/cl/settings/django.py index e6d74c3949..968323bcb3 100644 --- a/cl/settings/django.py +++ b/cl/settings/django.py @@ -159,7 +159,6 @@ "django_elasticsearch_dsl", "pghistory", "pgtrigger", - "ordered_model", # CourtListener Apps "cl.alerts", "cl.audio", diff --git a/poetry.lock b/poetry.lock index 65e8c26f25..a769a59f6a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1116,17 +1116,6 @@ files = [ [package.dependencies] Django = ">=3.2" -[[package]] -name = "django-ordered-model" -version = "3.7.4" -description = "Allows Django models to be ordered and provides a simple admin interface for reordering them." -optional = false -python-versions = "*" -files = [ - {file = "django-ordered-model-3.7.4.tar.gz", hash = "sha256:f258b9762525c00a53009e82f8b8bf2a3aa315e8b453e281e8fdbbfe2b8cb3ba"}, - {file = "django_ordered_model-3.7.4-py3-none-any.whl", hash = "sha256:dfcd3183fe0749dad1c9971cba1d6240ce7328742a30ddc92feca41107bb241d"}, -] - [[package]] name = "django-override-storage" version = "0.3.2" @@ -5472,4 +5461,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <3.13" -content-hash = "5334f16d006f7486a5f9b905906f2a9a68e7f524684c04af3d0994ebd0999384" +content-hash = "e6d34875888f1687912d03d33ea68038bba6c6d487037c6454d5b18449ec6d0c" diff --git a/pyproject.toml b/pyproject.toml index 44839b4a8d..33efc0846a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,6 @@ tiktoken = "^0.6.0" hyperscan = "^0.7.7" openai = "^1.31.1" seal-rookery = "^2.2.3" -django-ordered-model = "^3.7.4" [tool.poetry.group.dev.dependencies] From 32821a5914afd2a96c139047e336f7fb293b2e65 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 25 Jul 2024 17:32:05 -0600 Subject: [PATCH 32/50] feat(opinion_order): update tests update fixtures add unique constraint update migrations --- cl/search/fixtures/test_objects_search.json | 4 +- cl/search/migrations/0032_order_opinions.py | 16 ++++--- cl/search/migrations/0032_order_opinions.sql | 8 ++-- cl/search/models.py | 12 ++++- cl/search/tests/tests.py | 47 ++++++++++---------- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/cl/search/fixtures/test_objects_search.json b/cl/search/fixtures/test_objects_search.json index 9fddb84fca..7ae3da4163 100644 --- a/cl/search/fixtures/test_objects_search.json +++ b/cl/search/fixtures/test_objects_search.json @@ -332,7 +332,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "order": 2 }, "model": "search.opinion", "pk": 5 @@ -355,7 +355,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "order": 
3 }, "model": "search.opinion", "pk": 6 diff --git a/cl/search/migrations/0032_order_opinions.py b/cl/search/migrations/0032_order_opinions.py index dbbe4707d0..9c7f3fa5d3 100644 --- a/cl/search/migrations/0032_order_opinions.py +++ b/cl/search/migrations/0032_order_opinions.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-07-25 17:13 +# Generated by Django 5.0.7 on 2024-07-25 23:17 import pgtrigger.compiler import pgtrigger.migrations @@ -7,6 +7,10 @@ class Migration(migrations.Migration): dependencies = [ + ( + "people_db", + "0016_remove_abarating_update_or_delete_snapshot_update_and_more", + ), ("search", "0031_alter_opinion_type_alter_opinioncluster_source_noop"), ] @@ -29,10 +33,6 @@ class Migration(migrations.Migration): name="order", field=models.IntegerField(blank=True, null=True), ), - migrations.AlterUniqueTogether( - name="opinion", - unique_together={("cluster", "order")}, - ), pgtrigger.migrations.AddTrigger( model_name="opinion", trigger=pgtrigger.compiler.Trigger( @@ -62,4 +62,10 @@ class Migration(migrations.Migration): ), ), ), + migrations.AddConstraint( + model_name="opinion", + constraint=models.UniqueConstraint( + fields=("cluster_id", "order"), name="unique_opinion_order" + ), + ), ] diff --git a/cl/search/migrations/0032_order_opinions.sql b/cl/search/migrations/0032_order_opinions.sql index 71161b2370..01cac8adf7 100644 --- a/cl/search/migrations/0032_order_opinions.sql +++ b/cl/search/migrations/0032_order_opinions.sql @@ -16,10 +16,6 @@ ALTER TABLE "search_opinion" ADD COLUMN "order" integer NULL; -- ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; -- --- Alter unique_together for opinion (1 constraint(s)) --- -ALTER TABLE "search_opinion" ADD CONSTRAINT "search_opinion_cluster_id_order_8426d97d_uniq" UNIQUE ("cluster_id", "order"); --- -- Create trigger update_or_delete_snapshot_update on model opinion -- @@ -123,4 +119,8 @@ ALTER TABLE "search_opinion" ADD CONSTRAINT "search_opinion_cluster_id_order_842 COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion" IS '79bebd7cda3c6ed3bc40f28799cf9c0f2638e2ad'; +-- +-- Create constraint unique_opinion_order on model opinion +-- +ALTER TABLE "search_opinion" ADD CONSTRAINT "unique_opinion_order" UNIQUE ("cluster_id", "order"); COMMIT; diff --git a/cl/search/models.py b/cl/search/models.py index 9c04940e3e..d6c17ba4f8 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3323,7 +3323,11 @@ class Opinion(AbstractDateTimeModel): order = models.IntegerField(null=True, blank=True) class Meta: - unique_together = ("cluster", "order") + constraints = [ + models.UniqueConstraint( + fields=["cluster_id", "order"], name="unique_opinion_order" + ) + ] @property def siblings(self) -> QuerySet: @@ -3350,6 +3354,12 @@ def save( *args: List, **kwargs: Dict, ) -> None: + if self.pk is None and self.order is None: + # Add order in new opinions with no defined order value + last_position = Opinion.objects.filter( + cluster=self.cluster + ).aggregate(models.Max("order"))["order__max"] + self.order = (last_position or 0) + 1 super().save(*args, **kwargs) if index: from cl.search.tasks import add_items_to_solr diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 740bcb3156..5c57cf72bd 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -65,7 +65,6 @@ OpinionClusterFactory, OpinionClusterFactoryWithChildrenAndParents, OpinionFactory, - OpinionsCitedWithParentsFactory, OpinionWithChildrenFactory, OpinionWithParentsFactory, 
RECAPDocumentFactory, @@ -301,7 +300,7 @@ def test_custom_manager_chained_filter(self) -> None: self.assertEqual(cluster_count, expected_count) def test_opinions_order(self) -> None: - """Test django-ordered-model library""" + """Test opinions order""" # Create court court = CourtFactory(id="nyappdiv") @@ -336,32 +335,34 @@ def test_opinions_order(self) -> None: # Test that the value of the order field matches the order in which # they were created - self.assertEqual(op_1.order, 0) - self.assertEqual(op_2.order, 1) - self.assertEqual(op_3.order, 2) + self.assertEqual(op_1.order, 1) + self.assertEqual(op_2.order, 2) + self.assertEqual(op_3.order, 3) - # Use library method to move lead opinion to first position, we can - # use this function to easily reorder existing opinions - op_3.to(0) + # Can we update an opinion using an existing position? + with transaction.atomic(): + with self.assertRaises(IntegrityError): + op_3.order = 2 + op_3.save() - # The position of the elements was modified, we refresh the objects - op_1.refresh_from_db() - op_2.refresh_from_db() - op_3.refresh_from_db() + # Can we create an opinion using an existing position? + with transaction.atomic(): + with self.assertRaises(IntegrityError): + op_4 = OpinionFactory( + cluster=cluster, type="Lead Opinion", order=1 + ) - # Test new order - self.assertEqual(op_3.order, 0) - self.assertEqual(op_1.order, 1) - self.assertEqual(op_2.order, 2) + # Can we use negative positions? + op_4 = OpinionFactory(cluster=cluster, type="Lead Opinion", order=-1) + self.assertEqual(op_4.order, -1) - # Add new opinion to cluster - op_4 = OpinionFactory( - cluster=cluster, - type="Dissent", + # Can we order the opinions from a cluster using the field? + qs = ( + cluster.sub_opinions.all() + .order_by("order") + .values_list("order", flat=True) ) - - # Test that the new opinion is in last place - self.assertEqual(op_4.order, 3) + self.assertEqual(list(qs), [-1, 1, 2, 3]) class DocketValidationTest(TestCase): From 5a6764e1bf45b99eda5b5f265e713a1665784e72 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 25 Jul 2024 17:51:30 -0600 Subject: [PATCH 33/50] feat(opinion_order): update fixture --- cl/search/fixtures/test_objects_search.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cl/search/fixtures/test_objects_search.json b/cl/search/fixtures/test_objects_search.json index 7ae3da4163..e9a89f1ea5 100644 --- a/cl/search/fixtures/test_objects_search.json +++ b/cl/search/fixtures/test_objects_search.json @@ -309,7 +309,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "order": 2 }, "model": "search.opinion", "pk": 4 @@ -332,7 +332,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 2 + "order": 3 }, "model": "search.opinion", "pk": 5 @@ -355,7 +355,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 3 + "order": 4 }, "model": "search.opinion", "pk": 6 From 37eb6bc82109dca97ed55b90575ba9a8e892def1 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 25 Jul 2024 18:03:03 -0600 Subject: [PATCH 34/50] feat(opinion_order): update fixture test_objects_query_counts.json --- cl/search/fixtures/test_objects_query_counts.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cl/search/fixtures/test_objects_query_counts.json b/cl/search/fixtures/test_objects_query_counts.json index b51117602a..af8b7f3e54 100644 --- a/cl/search/fixtures/test_objects_query_counts.json +++ 
b/cl/search/fixtures/test_objects_query_counts.json @@ -375,7 +375,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 1 + "order": 2 }, "model":"search.opinion", "pk":4 @@ -400,7 +400,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 1 + "order": 3 }, "model":"search.opinion", "pk":5 @@ -424,7 +424,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 1 + "order": 4 }, "model":"search.opinion", "pk":6 From 4b4d97fdaaac94d69bf5815abd358bd080a7064c Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 25 Jul 2024 18:13:01 -0600 Subject: [PATCH 35/50] feat(opinion_order): update fixture opinions-issue-550.json and functest_opinions.json --- cl/search/fixtures/functest_opinions.json | 2 +- cl/search/fixtures/opinions-issue-550.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/search/fixtures/functest_opinions.json b/cl/search/fixtures/functest_opinions.json index 45f5f0b759..6bc9333003 100644 --- a/cl/search/fixtures/functest_opinions.json +++ b/cl/search/fixtures/functest_opinions.json @@ -187,7 +187,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "order": 2 }, "model": "search.opinion", "pk": 12 diff --git a/cl/search/fixtures/opinions-issue-550.json b/cl/search/fixtures/opinions-issue-550.json index 829a94c7d2..c5f07cea17 100644 --- a/cl/search/fixtures/opinions-issue-550.json +++ b/cl/search/fixtures/opinions-issue-550.json @@ -88,7 +88,7 @@ "html_lawbox": "", "per_curiam": false, "type": "020lead", - "order": 1 + "order": 2 }, "model": "search.opinion", "pk": 11 From 0050caa417378a0969c8310e7f66b02728eeb0da Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 30 Jul 2024 18:44:53 -0600 Subject: [PATCH 36/50] feat(opinion_order): rename order to ordering_key ignore combined opinions, don't add order number update fixtures update tests --- .../import_columbia/columbia_utils.py | 4 +- .../commands/update_opinions_order.py | 99 ++++++------------- cl/search/fixtures/functest_opinions.json | 8 +- cl/search/fixtures/opinions-issue-412.json | 4 +- cl/search/fixtures/opinions-issue-550.json | 4 +- .../fixtures/test_objects_query_counts.json | 12 +-- cl/search/fixtures/test_objects_search.json | 12 +-- cl/search/migrations/0032_order_opinions.py | 19 ++-- cl/search/migrations/0032_order_opinions.sql | 22 ++--- cl/search/models.py | 5 +- cl/search/tests/tests.py | 14 +-- .../fixtures/api_scotus_map_data.json | 4 +- .../fixtures/scotus_map_data.json | 34 +++---- 13 files changed, 102 insertions(+), 139 deletions(-) diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py index b1a62cfd6c..dec91fc1da 100644 --- a/cl/corpus_importer/import_columbia/columbia_utils.py +++ b/cl/corpus_importer/import_columbia/columbia_utils.py @@ -224,7 +224,7 @@ def extract_columbia_opinions( """ opinions: list = [] floating_content = [] - order = 0 + order = 1 # The opinion count starts from 1 # We iterate all content to look for all possible opinions for i, content in enumerate(outer_opinion): # type: int, Tag @@ -363,7 +363,7 @@ def process_extracted_opinions(extracted_opinions: list) -> list: opinions: list = [] authorless_content = [] - order = 0 + order = 1 # The opinion count starts from 1 for i, found_content in enumerate(extracted_opinions, start=1): byline = found_content.get("byline") diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py 
b/cl/corpus_importer/management/commands/update_opinions_order.py index 85ed93e0e2..5c91d0e4b1 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -38,38 +38,29 @@ def clean_opinion_content(text: str) -> str: # Replace line breaks with spaces and get rid of double spaces text = re.sub(" +", " ", " ".join(text.split("\n"))).strip() - # Remove non-alphanumeric and non-whitespace characters from lowercased text + # Remove non-alphanumeric and non-whitespace characters from lowercase text return re.sub(r"[^a-zA-Z0-9 ]", "", text.lower()) def get_opinions_cleaned_content( cluster_id, -) -> tuple[Optional[str], list[dict], int, bool]: +) -> tuple[Optional[str], list[dict]]: """Get cleaned opinions content for a cluster object :param cluster_id: Cluster ID for a set of opinions - :return: (xml path, list of extracted opinions, start position, True if combined - opinions exists in cluster) + :return: (xml path, list of extracted opinions) """ cl_cleaned_opinions = [] # by default the opinions are ordered by pk - opinions_from_cluster = Opinion.objects.filter( - cluster_id=cluster_id - ).order_by("id") - combined_opinions_cluster = opinions_from_cluster.filter( - type="010combined" + opinions_from_cluster = ( + Opinion.objects.filter(cluster_id=cluster_id) + .order_by("id") + .exclude(type="010combined") ) + xml_path = None - cluster_has_combined_opinion = False - if combined_opinions_cluster: - # the combined opinion will be displayed at beginning - start_position = combined_opinions_cluster.count() - cluster_has_combined_opinion = True - else: - # we don't have combined opinions, we start ordering from 0 to n - start_position = 0 - - for i, op in enumerate(opinions_from_cluster.exclude(type="010combined")): + + for i, op in enumerate(opinions_from_cluster): if op.local_path and not xml_path: xml_path = str(op.local_path) @@ -101,8 +92,6 @@ def get_opinions_cleaned_content( return ( xml_path, cl_cleaned_opinions, - start_position, - cluster_has_combined_opinion, ) @@ -170,26 +159,12 @@ def sort_harvard_opinions(start_id: int, end_id: int) -> None: # cluster_id: 4697264, the combined opinion will go to the last position for oc in clusters: logger.info(f"Processing cluster id: {oc}") - combined_opinions_cluster = oc.sub_opinions.filter( - type="010combined" - ).order_by("id") - if combined_opinions_cluster: - # the combined opinion will be displayed at first - start_position = combined_opinions_cluster.count() - else: - # we don't have combined opinions, we start ordering from 0 to n - start_position = 0 for opinion_order, cluster_op in enumerate( oc.sub_opinions.exclude(type="010combined").order_by("id"), - start=start_position, + start=1, ): - cluster_op.order = opinion_order - cluster_op.save() - - # Show combined opinions at beginning - for opinion_order, cluster_op in enumerate(combined_opinions_cluster): - cluster_op.order = opinion_order + cluster_op.ordering_key = opinion_order cluster_op.save() logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") @@ -200,18 +175,13 @@ def update_opinions( cl_opinions: list, columbia_opinions: list, matches: dict, - cluster_has_combined_opinion: bool, - start_position: int, ): """Update opinions with correct order :param cluster_id: :param cl_opinions: a list with cleaned opinions from cl - :param columbia_opinions: a ordered list with cleaned opinions from xml file + :param columbia_opinions: an ordered list with cleaned opinions from xml file :param 
matches: a dict with the matches of each opinion of both lists - :param cluster_has_combined_opinion: True if the cluster has combined opinions - :param start_position: the number from where the order should begin for - non-combined opinions :return: None """ update_failed = False @@ -221,7 +191,7 @@ def update_opinions( # file_pos is the correct index to find the opinion id to update file_opinion = columbia_opinions[file_pos] # the order was calculated using the xml file - file_order = file_opinion.get("order") + start_position + file_order = file_opinion.get("order") cl_opinion = cl_opinions[cl_pos] opinion_id_to_update = cl_opinion.get("id") @@ -229,11 +199,10 @@ def update_opinions( try: # Update opinion order op = Opinion.objects.get(id=opinion_id_to_update) - op.order = file_order + op.ordering_key = file_order op.save() except Opinion.DoesNotExist: - # This should not happen, but it is better to be - # cautious + # This should not happen, but it is better to be cautious logger.warning( f"We can't update opinion, opinion doesn't exist " f"with id: {opinion_id_to_update}" @@ -241,18 +210,6 @@ def update_opinions( update_failed = True break - if cluster_has_combined_opinion and not update_failed: - combined_opinions_cluster = Opinion.objects.filter( - cluster_id=cluster_id, type="010combined" - ).order_by("id") - - # Show combined opinions at beginning - for opinion_order, cluster_op in enumerate( - combined_opinions_cluster - ): - cluster_op.order = opinion_order - cluster_op.save() - if update_failed: # There was an error updating an opinion, rollback all changes for # cluster's opinions @@ -294,12 +251,9 @@ def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: logger.info(f"Processing cluster id: {cluster_id}") try: - ( - xml_path, - cl_cleaned_opinions, - start_position, - cluster_has_combined_opinion, - ) = get_opinions_cleaned_content(cluster_id) + xml_path, cl_cleaned_opinions = get_opinions_cleaned_content( + cluster_id + ) except EmptyOpinionException: logger.warning( f"At least one of the opinions from cluster id: {cluster_id} is empty." 
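For context, update_opinions() applies all of a cluster's ordering updates inside a single transaction and rolls the whole batch back if any one of them fails. A minimal sketch of that Django idiom, separate from the patch itself (the helper name and the dict shape are illustrative assumptions, not code from this series):

    from django.db import transaction

    from cl.search.models import Opinion

    def apply_ordering_all_or_nothing(ordering_updates: dict[int, int]) -> bool:
        """Apply every {opinion_id: ordering_key} update, or none of them."""
        with transaction.atomic():
            for opinion_id, new_key in ordering_updates.items():
                try:
                    op = Opinion.objects.get(id=opinion_id)
                except Opinion.DoesNotExist:
                    # Mark the enclosing atomic block for rollback so that
                    # no partial ordering is ever committed.
                    transaction.set_rollback(True)
                    return False
                op.ordering_key = new_key
                op.save()
        return True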
@@ -321,7 +275,9 @@ def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: fixed_xml_filepath ) except UnicodeDecodeError: - logger.warning(f"Cannot decode file: {fixed_xml_filepath}") + logger.warning( + f"Cannot decode file: {fixed_xml_filepath}, cluster id: {cluster_id}" + ) continue if cl_cleaned_opinions and extracted_columbia_opinions: @@ -336,6 +292,13 @@ def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: if op.get("opinion") ] + if len(columbia_opinions_content) != len(cl_opinions_content): + logger.warning( + f"The number of opinions in cl and the number of opinions in the xml is different, cluster id: {cluster_id}" + ) + continue + + # Try to match content between cl and xml matches = match_opinion_lists( columbia_opinions_content, cl_opinions_content, @@ -360,14 +323,12 @@ def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: # Go to next cluster id continue - # Update all opinions order + # All opinions matched, update all opinions order update_opinions( cluster_id, cl_cleaned_opinions, extracted_columbia_opinions, matches, - cluster_has_combined_opinion, - start_position, ) diff --git a/cl/search/fixtures/functest_opinions.json b/cl/search/fixtures/functest_opinions.json index 6bc9333003..2cc992a633 100644 --- a/cl/search/fixtures/functest_opinions.json +++ b/cl/search/fixtures/functest_opinions.json @@ -65,7 +65,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 10 @@ -136,7 +136,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 11 @@ -187,7 +187,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 2 + "ordering_key": 2 }, "model": "search.opinion", "pk": 12 @@ -258,7 +258,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 12 diff --git a/cl/search/fixtures/opinions-issue-412.json b/cl/search/fixtures/opinions-issue-412.json index 2e429ebecf..0e7fbdc7e6 100644 --- a/cl/search/fixtures/opinions-issue-412.json +++ b/cl/search/fixtures/opinions-issue-412.json @@ -65,7 +65,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 10 @@ -136,7 +136,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 11 diff --git a/cl/search/fixtures/opinions-issue-550.json b/cl/search/fixtures/opinions-issue-550.json index c5f07cea17..3e359b044d 100644 --- a/cl/search/fixtures/opinions-issue-550.json +++ b/cl/search/fixtures/opinions-issue-550.json @@ -65,7 +65,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 10 @@ -88,7 +88,7 @@ "html_lawbox": "", "per_curiam": false, "type": "020lead", - "order": 2 + "ordering_key": 2 }, "model": "search.opinion", "pk": 11 diff --git a/cl/search/fixtures/test_objects_query_counts.json b/cl/search/fixtures/test_objects_query_counts.json index af8b7f3e54..6a3f97da23 100644 --- a/cl/search/fixtures/test_objects_query_counts.json +++ b/cl/search/fixtures/test_objects_query_counts.json @@ -301,7 +301,7 @@ "html_lawbox":"", "per_curiam":false, "type":"020lead", - "order": 1 + "ordering_key": 1 }, "model":"search.opinion", "pk":1 @@ -326,7 +326,7 @@ "html_lawbox":"", 
"per_curiam":false, "type":"010combined", - "order": 1 + "ordering_key": 1 }, "model":"search.opinion", "pk":2 @@ -351,7 +351,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 1 + "ordering_key": 1 }, "model":"search.opinion", "pk":3 @@ -375,7 +375,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 2 + "ordering_key": 2 }, "model":"search.opinion", "pk":4 @@ -400,7 +400,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 3 + "ordering_key": 3 }, "model":"search.opinion", "pk":5 @@ -424,7 +424,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "order": 4 + "ordering_key": 4 }, "model":"search.opinion", "pk":6 diff --git a/cl/search/fixtures/test_objects_search.json b/cl/search/fixtures/test_objects_search.json index e9a89f1ea5..542d297d54 100644 --- a/cl/search/fixtures/test_objects_search.json +++ b/cl/search/fixtures/test_objects_search.json @@ -240,7 +240,7 @@ "html_lawbox": "", "per_curiam": false, "type": "020lead", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 1 @@ -263,7 +263,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 2 @@ -286,7 +286,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 3 @@ -309,7 +309,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 2 + "ordering_key": 2 }, "model": "search.opinion", "pk": 4 @@ -332,7 +332,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 3 + "ordering_key": 3 }, "model": "search.opinion", "pk": 5 @@ -355,7 +355,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 4 + "ordering_key": 4 }, "model": "search.opinion", "pk": 6 diff --git a/cl/search/migrations/0032_order_opinions.py b/cl/search/migrations/0032_order_opinions.py index 9c7f3fa5d3..9b4db9fbe7 100644 --- a/cl/search/migrations/0032_order_opinions.py +++ b/cl/search/migrations/0032_order_opinions.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-07-25 23:17 +# Generated by Django 5.0.7 on 2024-07-30 18:59 import pgtrigger.compiler import pgtrigger.migrations @@ -25,12 +25,12 @@ class Migration(migrations.Migration): ), migrations.AddField( model_name="opinion", - name="order", + name="ordering_key", field=models.IntegerField(blank=True, null=True), ), migrations.AddField( model_name="opinionevent", - name="order", + name="ordering_key", field=models.IntegerField(blank=True, null=True), ), pgtrigger.migrations.AddTrigger( @@ -38,9 +38,9 @@ class Migration(migrations.Migration): trigger=pgtrigger.compiler.Trigger( name="update_or_delete_snapshot_update", sql=pgtrigger.compiler.UpsertTriggerSql( - condition='WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS 
DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."order" IS DISTINCT FROM (NEW."order"))', - func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', - hash="89fec08f03e567ec8ecc7cd1e8ec5f665abf9d3b", + condition='WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."ordering_key" IS DISTINCT FROM (NEW."ordering_key"))', + func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "ordering_key", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."ordering_key", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', + 
hash="7137855274503cc2c50a17729f82e150d2b7d872", operation="UPDATE", pgid="pgtrigger_update_or_delete_snapshot_update_67ecd", table="search_opinion", @@ -53,8 +53,8 @@ class Migration(migrations.Migration): trigger=pgtrigger.compiler.Trigger( name="update_or_delete_snapshot_delete", sql=pgtrigger.compiler.UpsertTriggerSql( - func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', - hash="79bebd7cda3c6ed3bc40f28799cf9c0f2638e2ad", + func='INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "ordering_key", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."ordering_key", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), \'update_or_delete_snapshot\', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL;', + hash="98fb52aa60fd8e89a83f8f7ac77ba5892739fb37", operation="DELETE", pgid="pgtrigger_update_or_delete_snapshot_delete_1f4fd", table="search_opinion", @@ -65,7 +65,8 @@ class Migration(migrations.Migration): migrations.AddConstraint( model_name="opinion", constraint=models.UniqueConstraint( - fields=("cluster_id", "order"), name="unique_opinion_order" + fields=("cluster_id", "ordering_key"), + name="unique_opinion_ordering_key", ), ), ] diff --git a/cl/search/migrations/0032_order_opinions.sql b/cl/search/migrations/0032_order_opinions.sql index 01cac8adf7..e02c150f4d 100644 --- a/cl/search/migrations/0032_order_opinions.sql +++ b/cl/search/migrations/0032_order_opinions.sql @@ -8,13 +8,13 @@ DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "sear -- DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion"; -- --- Add field order to opinion +-- Add field ordering_key to opinion -- -ALTER TABLE "search_opinion" ADD COLUMN "order" integer NULL; +ALTER TABLE "search_opinion" ADD COLUMN "ordering_key" integer NULL; -- --- Add field order to opinionevent +-- Add field ordering_key to opinionevent -- -ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; +ALTER TABLE "search_opinionevent" ADD COLUMN "ordering_key" integer NULL; -- -- Create trigger 
update_or_delete_snapshot_update on model opinion -- @@ -53,7 +53,7 @@ ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; RETURN NEW; END IF; END IF; - INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; + INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "ordering_key", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."ordering_key", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; END; $$ LANGUAGE plpgsql; @@ -62,10 +62,10 @@ ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; AFTER UPDATE ON "search_opinion" - FOR EACH ROW WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."order" IS DISTINCT FROM (NEW."order")) + FOR EACH ROW WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR 
OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."ordering_key" IS DISTINCT FROM (NEW."ordering_key")) EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_update_67ecd(); - COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion" IS '89fec08f03e567ec8ecc7cd1e8ec5f665abf9d3b'; + COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion" IS '7137855274503cc2c50a17729f82e150d2b7d872'; -- -- Create trigger update_or_delete_snapshot_delete on model opinion @@ -105,7 +105,7 @@ ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; RETURN NEW; END IF; END IF; - INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "order", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."order", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; + INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "ordering_key", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."ordering_key", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; END; $$ LANGUAGE plpgsql; @@ -117,10 +117,10 @@ ALTER TABLE "search_opinionevent" ADD COLUMN "order" integer NULL; FOR EACH ROW EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_delete_1f4fd(); - COMMENT ON TRIGGER 
pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion" IS '79bebd7cda3c6ed3bc40f28799cf9c0f2638e2ad'; + COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion" IS '98fb52aa60fd8e89a83f8f7ac77ba5892739fb37'; -- --- Create constraint unique_opinion_order on model opinion +-- Create constraint unique_opinion_ordering_key on model opinion -- -ALTER TABLE "search_opinion" ADD CONSTRAINT "unique_opinion_order" UNIQUE ("cluster_id", "order"); +ALTER TABLE "search_opinion" ADD CONSTRAINT "unique_opinion_ordering_key" UNIQUE ("cluster_id", "ordering_key"); COMMIT; diff --git a/cl/search/models.py b/cl/search/models.py index d6c17ba4f8..a0c9fa7eef 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3320,12 +3320,13 @@ class Opinion(AbstractDateTimeModel): "sha1", ] ) - order = models.IntegerField(null=True, blank=True) + ordering_key = models.IntegerField(null=True, blank=True) class Meta: constraints = [ models.UniqueConstraint( - fields=["cluster_id", "order"], name="unique_opinion_order" + fields=["cluster_id", "ordering_key"], + name="unique_opinion_ordering_key", ) ] diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 5c57cf72bd..ca5c384651 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -335,14 +335,14 @@ def test_opinions_order(self) -> None: # Test that the value of the order field matches the order in which # they were created - self.assertEqual(op_1.order, 1) - self.assertEqual(op_2.order, 2) - self.assertEqual(op_3.order, 3) + self.assertEqual(op_1.ordering_key, 1) + self.assertEqual(op_2.ordering_key, 2) + self.assertEqual(op_3.ordering_key, 3) # Can we update an opinion using an existing position? with transaction.atomic(): with self.assertRaises(IntegrityError): - op_3.order = 2 + op_3.ordering_key = 2 op_3.save() # Can we create an opinion using an existing position? @@ -354,13 +354,13 @@ def test_opinions_order(self) -> None: # Can we use negative positions? op_4 = OpinionFactory(cluster=cluster, type="Lead Opinion", order=-1) - self.assertEqual(op_4.order, -1) + self.assertEqual(op_4.ordering_key, -1) # Can we order the opinions from a cluster using the field? 
qs = ( cluster.sub_opinions.all() - .order_by("order") - .values_list("order", flat=True) + .order_by("ordering_key") + .values_list("ordering_key", flat=True) ) self.assertEqual(list(qs), [-1, 1, 2, 3]) diff --git a/cl/visualizations/fixtures/api_scotus_map_data.json b/cl/visualizations/fixtures/api_scotus_map_data.json index 46dc2f9856..3a13c3e4e7 100644 --- a/cl/visualizations/fixtures/api_scotus_map_data.json +++ b/cl/visualizations/fixtures/api_scotus_map_data.json @@ -122,7 +122,7 @@ "html_lawbox": "", "per_curiam": false, "type": "020lead", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 1 @@ -145,7 +145,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 2 diff --git a/cl/visualizations/fixtures/scotus_map_data.json b/cl/visualizations/fixtures/scotus_map_data.json index a885e4df54..e0760f42bf 100644 --- a/cl/visualizations/fixtures/scotus_map_data.json +++ b/cl/visualizations/fixtures/scotus_map_data.json @@ -903,7 +903,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 111014 @@ -926,7 +926,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 111113 @@ -949,7 +949,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 111464 @@ -972,7 +972,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 111505 @@ -995,7 +995,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 111924 @@ -1018,7 +1018,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 112331 @@ -1041,7 +1041,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 112646 @@ -1064,7 +1064,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 112779 @@ -1087,7 +1087,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 112874 @@ -1110,7 +1110,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 117967 @@ -1133,7 +1133,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 118377 @@ -1156,7 +1156,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 121168 @@ -1179,7 +1179,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 136984 @@ -1202,7 +1202,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 142900 @@ -1225,7 +1225,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 799990 @@ -1248,7 +1248,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 
1 }, "model": "search.opinion", "pk": 799993 @@ -1271,7 +1271,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "order": 1 + "ordering_key": 1 }, "model": "search.opinion", "pk": 2674862 From 3eeaafe572121c230459f8f9b36637bd21c4392d Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 30 Jul 2024 18:56:56 -0600 Subject: [PATCH 37/50] feat(opinion_order): update model --- cl/search/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/search/models.py b/cl/search/models.py index a0c9fa7eef..2c73363836 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3355,12 +3355,12 @@ def save( *args: List, **kwargs: Dict, ) -> None: - if self.pk is None and self.order is None: + if self.pk is None and self.ordering_key is None: # Add order in new opinions with no defined order value last_position = Opinion.objects.filter( cluster=self.cluster ).aggregate(models.Max("order"))["order__max"] - self.order = (last_position or 0) + 1 + self.ordering_key = (last_position or 0) + 1 super().save(*args, **kwargs) if index: from cl.search.tasks import add_items_to_solr From ac98d938cef121df8fe7cac8fff12ae58a11a08f Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 30 Jul 2024 19:56:53 -0600 Subject: [PATCH 38/50] feat(opinion_order): update model --- cl/search/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/search/models.py b/cl/search/models.py index 2c73363836..5e755f5062 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3359,7 +3359,7 @@ def save( # Add order in new opinions with no defined order value last_position = Opinion.objects.filter( cluster=self.cluster - ).aggregate(models.Max("order"))["order__max"] + ).aggregate(models.Max("ordering_key"))["ordering_key__max"] self.ordering_key = (last_position or 0) + 1 super().save(*args, **kwargs) if index: From c22eb04c0de730962727f3b596498211274544a1 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 30 Jul 2024 19:57:40 -0600 Subject: [PATCH 39/50] feat(opinion_order): update model --- cl/search/models.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cl/search/models.py b/cl/search/models.py index 5e755f5062..a6b54b9819 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3355,12 +3355,6 @@ def save( *args: List, **kwargs: Dict, ) -> None: - if self.pk is None and self.ordering_key is None: - # Add order in new opinions with no defined order value - last_position = Opinion.objects.filter( - cluster=self.cluster - ).aggregate(models.Max("ordering_key"))["ordering_key__max"] - self.ordering_key = (last_position or 0) + 1 super().save(*args, **kwargs) if index: from cl.search.tasks import add_items_to_solr From 60744a19e7dca4757ccf692c29deded2cf32185a Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 30 Jul 2024 20:22:47 -0600 Subject: [PATCH 40/50] feat(opinion_order): update tests --- cl/search/tests/tests.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index ca5c384651..a32ffe8868 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -321,16 +321,19 @@ def test_opinions_order(self) -> None: op_1 = OpinionFactory( cluster=cluster, type="Concurrence Opinion", + ordering_key=1, ) op_2 = OpinionFactory( cluster=cluster, type="Dissent", + ordering_key=2, ) op_3 = OpinionFactory( cluster=cluster, type="Lead Opinion", + ordering_key=3, ) # Test that the value of the order field matches the order in which @@ -349,11 +352,13 
@@ def test_opinions_order(self) -> None: with transaction.atomic(): with self.assertRaises(IntegrityError): op_4 = OpinionFactory( - cluster=cluster, type="Lead Opinion", order=1 + cluster=cluster, type="Lead Opinion", ordering_key=1 ) # Can we use negative positions? - op_4 = OpinionFactory(cluster=cluster, type="Lead Opinion", order=-1) + op_4 = OpinionFactory( + cluster=cluster, type="Lead Opinion", ordering_key=-1 + ) self.assertEqual(op_4.ordering_key, -1) # Can we order the opinions from a cluster using the field? @@ -364,6 +369,10 @@ def test_opinions_order(self) -> None: ) self.assertEqual(list(qs), [-1, 1, 2, 3]) + # Order default value is null + op_5 = OpinionFactory(cluster=cluster, type="Lead Opinion") + self.assertEqual(op_5.ordering_key, None) + class DocketValidationTest(TestCase): @classmethod From e8a9c68169fd6d376154590e90c573069438385d Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 31 Jul 2024 19:14:20 -0600 Subject: [PATCH 41/50] feat(opinion_order): update code for harvard source --- .../commands/update_opinions_order.py | 66 ++++++++++++------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 5c91d0e4b1..dc00b24818 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup from django.core.management import BaseCommand from django.db import transaction -from django.db.models import Count +from django.db.models import Count, Q from cl.corpus_importer.import_columbia.columbia_utils import ( extract_columbia_opinions, @@ -142,28 +142,38 @@ def sort_harvard_opinions(start_id: int, end_id: int) -> None: :return: None """ + # The filepath_json_harvard field can only be filled by the harvard importer, + # this helps us confirm that it was imported from a Harvard json + base_filter = Q( + opinions_count__gt=1, source__in=VALID_HARVARD_SOURCES + ) & ~Q(filepath_json_harvard="") + + if start_id: + base_filter &= Q(pk__gte=start_id) + + if end_id: + base_filter &= Q(pk__lte=end_id) + # Get all harvard clusters with more than one opinion clusters = ( OpinionCluster.objects.prefetch_related("sub_opinions") .annotate(opinions_count=Count("sub_opinions")) - .filter(opinions_count__gt=1, source__in=VALID_HARVARD_SOURCES) + .filter(base_filter) .order_by("id") ) - if start_id: - clusters = clusters.filter(pk__gte=start_id) - - if end_id: - clusters = clusters.filter(pk__lte=end_id) - - # cluster_id: 4697264, the combined opinion will go to the last position for oc in clusters: logger.info(f"Processing cluster id: {oc}") - for opinion_order, cluster_op in enumerate( - oc.sub_opinions.exclude(type="010combined").order_by("id"), - start=1, - ): + cluster_opinions = oc.sub_opinions.exclude( + type="010combined" + ).order_by("id") + + if not cluster_opinions: + logger.info(f"No opinions left to order for cluster id: {oc}") + continue + + for opinion_order, cluster_op in enumerate(cluster_opinions, start=1): cluster_op.ordering_key = opinion_order cluster_op.save() @@ -344,27 +354,23 @@ def add_arguments(self, parser): action="store_true", help="Fix harvard opinions order", ) - parser.add_argument( "--process-columbia", action="store_true", help="Fix columbia opinions order", ) - parser.add_argument( "--xml-dir", default="/opt/courtlistener/_columbia", required=False, help="The absolute path to the 
directory with columbia xml files", ) - parser.add_argument( "--start-id", type=int, default=0, help="Start id for a range of clusters (inclusive)", ) - parser.add_argument( "--end-id", type=int, @@ -372,18 +378,32 @@ def add_arguments(self, parser): help="End id for a range of clusters (inclusive)", ) - def handle(self, *args, **options): + def validate_args(self, opts): + """Validate arguments passed to the command - if not options["process_harvard"] and not options["process_columbia"]: - logger.info( + :param opts: dictionary with arguments from the command + :return: True if validations are satisfied, else False + """ + if opts["end_id"] and opts["end_id"] < opts["start_id"]: + logger.error("end-id should be greater than or equal to start-id") + return False + + if not opts["process_harvard"] and not opts["process_columbia"]: + logger.error( "One option required: process-harvard or process-columbia" ) - return + return False - if options["process_harvard"] and options["process_columbia"]: - logger.info( + if opts["process_harvard"] and opts["process_columbia"]: + logger.error( "You can only select one option process-harvard or process-columbia" ) + return False + return True + + def handle(self, *args, **options): + + if not self.validate_args(options): return if options["process_harvard"]: From cb2a1d398d75bc353c153ce569b58f7c3496f2bc Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 2 Aug 2024 15:31:11 -0400 Subject: [PATCH 42/50] feat(search.models): Add validation for ordering key Don't allow negative or 0 as a key Add validation in save Make check explicit --- cl/search/models.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cl/search/models.py b/cl/search/models.py index a6b54b9819..1bde2ebad0 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3347,6 +3347,11 @@ def get_absolute_url(self) -> str: def clean(self) -> None: if self.type == "": raise ValidationError("'type' is a required field.") + if self.ordering_key is not None and self.ordering_key != "": + if self.ordering_key < 1: + raise ValidationError( + {"ordering_key": "Ordering key cannot be zero or negative"} + ) def save( self, @@ -3355,6 +3360,7 @@ def save( *args: List, **kwargs: Dict, ) -> None: + self.clean() super().save(*args, **kwargs) if index: from cl.search.tasks import add_items_to_solr From 18ba5421355682b1e9813b5322e86cf2061fd4f9 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 2 Aug 2024 15:32:36 -0400 Subject: [PATCH 43/50] refactor(update_opinion_order): Drop columbia Drop columbia from opinion ordering Refactor the argparse to be more CL-ish using skip-until and limit Update filtering commands --- .../commands/update_opinions_order.py | 427 +++--------------- cl/search/tests/tests.py | 40 +- 2 files changed, 80 insertions(+), 387 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index dc00b24818..ab445a1491 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -1,415 +1,96 @@ +import argparse -import os.path -import re -from typing import Optional - -from bs4 import BeautifulSoup -from django.core.management import BaseCommand -from django.db import transaction from django.db.models import Count, Q -from cl.corpus_importer.import_columbia.columbia_utils import ( extract_columbia_opinions, map_opinion_types, process_extracted_opinions, read_xml_to_soup, ) -from cl.corpus_importer.utils import
EmptyOpinionException, match_opinion_lists -from cl.lib.command_utils import logger +from cl.lib.command_utils import VerboseCommand, logger from cl.search.models import SOURCES, Opinion, OpinionCluster -VALID_COLUMBIA_SOURCES = [ - key - for key in dict(SOURCES.NAMES).keys() - if SOURCES.COLUMBIA_ARCHIVE in key -] - -VALID_HARVARD_SOURCES = [ - key for key in dict(SOURCES.NAMES).keys() if SOURCES.HARVARD_CASELAW in key -] - - -def clean_opinion_content(text: str) -> str: - """Clean opinion content - - :param text: text to clean - :return: cleaned text - """ - - # Replace line breaks with spaces and get rid of double spaces - text = re.sub(" +", " ", " ".join(text.split("\n"))).strip() - - # Remove non-alphanumeric and non-whitespace characters from lowercase text - return re.sub(r"[^a-zA-Z0-9 ]", "", text.lower()) - - -def get_opinions_cleaned_content( - cluster_id, -) -> tuple[Optional[str], list[dict]]: - """Get cleaned opinions content for a cluster object - - :param cluster_id: Cluster ID for a set of opinions - :return: (xml path, list of extracted opinions) - """ - cl_cleaned_opinions = [] - # by default the opinions are ordered by pk - opinions_from_cluster = ( - Opinion.objects.filter(cluster_id=cluster_id) - .order_by("id") - .exclude(type="010combined") - ) - - xml_path = None - for i, op in enumerate(opinions_from_cluster): - if op.local_path and not xml_path: - xml_path = str(op.local_path) +def sort_harvard_opinions(options) -> None: + """Sort harvard opinions - content = None - - # We can only use columbia's content to infer the ordering - if len(op.html_columbia) > 1: - content = op.html_columbia - - if not content: - raise EmptyOpinionException( - "There is no content in html_columbia field" - ) - - soup = BeautifulSoup(content, features="html.parser") - opinion_text = soup.getText(separator=" ", strip=True) - prep_text = clean_opinion_content(opinion_text) - - cl_cleaned_opinions.append( - { - "id": op.id, - "byline": op.author_str, - "type": op.type, - "opinion": prep_text, - "order": i, - } - ) - - return ( - xml_path, - cl_cleaned_opinions, - ) - - -def fix_filepath(filepath: str) -> str: - """Fix filepath from file field - - :param filepath: path from file field - :return: new file path - """ - if "/home/mlissner/columbia/opinions/" in filepath: - filepath = filepath.replace("/home/mlissner/columbia/opinions/", "") - return filepath - - -def get_opinions_columbia_file(xml_filepath: str) -> list: - """Get opinions from columbia xml file and convert it into dict - - :param xml_filepath: path of xml file - :return: dict with data - """ - soup = read_xml_to_soup(xml_filepath) - - # Find the outer tag to have all elements inside - outer_opinion = soup.find("opinion") - - extracted_opinions = extract_columbia_opinions(outer_opinion) - opinions = process_extracted_opinions(extracted_opinions) - map_opinion_types(opinions) - - for op in opinions: - opinion_content = op.get("opinion") - soup = BeautifulSoup(opinion_content, "html.parser") - opinion_text = soup.getText(separator=" ", strip=True) - cleaned_opinion = clean_opinion_content(opinion_text) - op["opinion"] = cleaned_opinion - - return opinions - - -def sort_harvard_opinions(start_id: int, end_id: int) -> None: - """We assume that harvard data is already ordered, we just need to fill the order - field in each opinion + We assume that harvard data is already ordered, we just need to fill + the order field in each opinion The harvard importer created the opinions in order of appearance in the file - :param start_id: skip 
any id lower than this value - :param end_id: skip any id greater than this value + :param options: dict of command options; may include skip_until and limit :return: None """ - # The filepath_json_harvard field can only be filled by the harvard importer, - # this helps us confirm that it was imported from a Harvard json - base_filter = Q( - opinions_count__gt=1, source__in=VALID_HARVARD_SOURCES - ) & ~Q(filepath_json_harvard="") - - if start_id: - base_filter &= Q(pk__gte=start_id) - - if end_id: - base_filter &= Q(pk__lte=end_id) + skip_until = options.get("skip_until", None) + limit = options.get("limit", None) - # Get all harvard clusters with more than one opinion - clusters = ( - OpinionCluster.objects.prefetch_related("sub_opinions") - .annotate(opinions_count=Count("sub_opinions")) - .filter(base_filter) - .order_by("id") - ) + # Keep this as a Q expression so skip_until can be combined in below and + # the whole filter passed to .filter() after the opinions_count annotation + base_filter = Q(opinions_count__gt=1) & ~Q(filepath_json_harvard="") - for oc in clusters: - logger.info(f"Processing cluster id: {oc}") - - cluster_opinions = oc.sub_opinions.exclude( - type="010combined" - ).order_by("id") - - if not cluster_opinions: - logger.info(f"No opinions left to order for cluster id: {oc}") - continue - - for opinion_order, cluster_op in enumerate(cluster_opinions, start=1): - cluster_op.ordering_key = opinion_order - cluster_op.save() - - logger.info(msg=f"Opinions reordered for cluster id: {oc.id}") - - -def update_opinions( - cluster_id: int, - cl_opinions: list, - columbia_opinions: list, - matches: dict, -): - """Update opinions with correct order + if skip_until: + base_filter &= Q(pk__gte=skip_until) - :param cluster_id: - :param cl_opinions: a list with cleaned opinions from cl - :param columbia_opinions: an ordered list with cleaned opinions from xml file - :param matches: a dict with the matches of each opinion of both lists - :return: None - """ - update_failed = False - - with transaction.atomic(): - for file_pos, cl_pos in matches.items(): - # file_pos is the correct index to find the opinion id to update - file_opinion = columbia_opinions[file_pos] - # the order was calculated using the xml file - file_order = file_opinion.get("order") - cl_opinion = cl_opinions[cl_pos] - opinion_id_to_update = cl_opinion.get("id") - - if opinion_id_to_update: - try: - # Update opinion order - op = Opinion.objects.get(id=opinion_id_to_update) - op.ordering_key = file_order - op.save() - except Opinion.DoesNotExist: - # This should not happen, but it is better to be cautious - logger.warning( - f"We can't update opinion, opinion doesn't exist " - f"with id: {opinion_id_to_update}" - ) - update_failed = True - break - - if update_failed: - # There was an error updating an opinion, rollback all changes for - # cluster's opinions - logger.warning( - f"There was an error updating the order of opinions of the " - f"cluster id: {cluster_id}" - ) - transaction.set_rollback(True) - else: - logger.info( - f"The order of opinions was updated, cluster id: {cluster_id}" - ) - - -def sort_columbia_opinions(start_id: int, end_id: int, xml_dir: str) -> None: - """Update opinion ordering for columbia clusters - - :param start_id: skip any id lower than this value - :param end_id: skip any id greater than this value - :param xml_dir: absolute path to the directory with columbia xml files - :return: None - """ - - # Get all columbia cluster ids with more than one opinion - clusters = ( + harvard_clusters = ( OpinionCluster.objects.annotate(opinions_count=Count("sub_opinions")) - .filter(opinions_count__gt=1,
source__in=VALID_COLUMBIA_SOURCES) + .filter(base_filter) .order_by("id") - .values_list("id", flat=True) ) + if limit: + harvard_clusters = harvard_clusters[:limit] - if start_id: - clusters = filter(lambda x: x >= start_id, clusters) - - if end_id: - clusters = filter(lambda x: x <= end_id, clusters) - - for cluster_id in clusters: - logger.info(f"Processing cluster id: {cluster_id}") - - try: - xml_path, cl_cleaned_opinions = get_opinions_cleaned_content( - cluster_id - ) - except EmptyOpinionException: - logger.warning( - f"At least one of the opinions from cluster id: {cluster_id} is empty." + for cluster in harvard_clusters: + logger.info(f"Processing cluster id: {cluster}") + sub_opinions = cluster.sub_opinions.exclude( + type=Opinion.COMBINED, + ).order_by("id") + if not sub_opinions: + logger.info( + f"No sub_opinions left to order for cluster id: {cluster}" ) continue + for opinion_order, cluster_op in enumerate(sub_opinions, start=1): + cluster_op.ordering_key = opinion_order + cluster_op.save() + logger.info(msg=f"Opinions reordered for cluster id: {cluster.id}") - extracted_columbia_opinions = None - if xml_path: - fixed_xml_filepath = os.path.join(xml_dir, fix_filepath(xml_path)) - - if not os.path.exists(fixed_xml_filepath): - logger.warning( - f"Xml file not found in {fixed_xml_filepath}, cluster id: {cluster_id}" - ) - continue - - try: - extracted_columbia_opinions = get_opinions_columbia_file( - fixed_xml_filepath - ) - except UnicodeDecodeError: - logger.warning( - f"Cannot decode file: {fixed_xml_filepath}, cluster id: {cluster_id}" - ) - continue - if cl_cleaned_opinions and extracted_columbia_opinions: - columbia_opinions_content = [ - op.get("opinion") - for op in extracted_columbia_opinions - if op.get("opinion") - ] - cl_opinions_content = [ - op.get("opinion") - for op in cl_cleaned_opinions - if op.get("opinion") - ] +class Command(VerboseCommand): + help = "Add ordering Key for sub opinions" - if len(columbia_opinions_content) != len(cl_opinions_content): - logger.warning( - f"The number of opinions in cl and the number of opinions in the xml is different, cluster id: {cluster_id}" - ) - continue + def __init__(self, *args, **kwargs): + super(Command, self).__init__(*args, **kwargs) - # Try to match content between cl and xml - matches = match_opinion_lists( - columbia_opinions_content, - cl_opinions_content, + def valid_actions(self, s): + if s.lower() not in self.VALID_ACTIONS: + raise argparse.ArgumentTypeError( + "Unable to parse action. 
Valid actions are: %s" % (", ".join(self.VALID_ACTIONS.keys())) ) return self.VALID_ACTIONS[s.lower()] def add_arguments(self, parser): parser.add_argument( - "--process-harvard", - action="store_true", - help="Fix harvard opinions order", - ) - parser.add_argument( - "--process-columbia", - action="store_true", - help="Fix columbia opinions order", - ) - parser.add_argument( - "--xml-dir", - default="/opt/courtlistener/_columbia", + "--skip-until", + help="Specific cluster id to skip until", + type=int, required=False, - help="The absolute path to the directory with columbia xml files", ) parser.add_argument( - "--start-id", + "--limit", type=int, - default=0, - help="Start id for a range of clusters (inclusive)", + help="Number of clusters to sort", + required=False, ) + parser.add_argument( - "--end-id", - type=int, - default=0, - help="End id for a range of clusters (inclusive)", + "--action", + type=self.valid_actions, + required=True, + help="The action you wish to take.
Valid choices are: %s" % (", ".join(self.VALID_ACTIONS.keys())), ) - def validate_args(self, opts): - """Validate arguments passed to the command - - :param opts: dictionary with arguments from the command - :return: true if validations are satisfied else false - """ - if opts["end_id"] > opts["start_id"]: - logger.error("end-id should be greater or equal than start-id") - return False - - if not opts["process_harvard"] and not opts["process_columbia"]: - logger.error( - "One option required: process-harvard or process-columbia" - ) - return False - - if opts["process_harvard"] and opts["process_columbia"]: - logger.error( - "You can only select one option process-harvard or process-columbia" - ) - return False - return True - def handle(self, *args, **options): + super().handle(*args, **options) + options["action"](options) - if not self.validate_args(options): - return - - if options["process_harvard"]: - sort_harvard_opinions(options["start_id"], options["end_id"]) - - if options["process_columbia"]: - sort_columbia_opinions( - options["start_id"], options["end_id"], options["xml_dir"] - ) + VALID_ACTIONS = {"sort-harvard": sort_harvard_opinions}
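For reference, the "--action" flag above leans on argparse's type= hook doing double duty: the callable both validates the raw string and converts it into the function to run, so handle() can simply call options["action"](options). A minimal standalone sketch of the pattern (illustrative names, not the actual command module):

    import argparse

    def sort_harvard_opinions(options: dict) -> None:
        print("sorting harvard opinions with", options)

    VALID_ACTIONS = {"sort-harvard": sort_harvard_opinions}

    def valid_action(s: str):
        # validate, then convert the flag value into a callable
        if s.lower() not in VALID_ACTIONS:
            raise argparse.ArgumentTypeError(
                "Unable to parse action. Valid actions are: %s"
                % ", ".join(VALID_ACTIONS)
            )
        return VALID_ACTIONS[s.lower()]

    parser = argparse.ArgumentParser()
    parser.add_argument("--action", type=valid_action, required=True)
    options = vars(parser.parse_args(["--action", "sort-harvard"]))
    options["action"](options)  # dispatches to sort_harvard_opinions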
diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index ff7ed177b7..b8f85f719d 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -321,19 +321,17 @@ def test_opinions_order(self) -> None: # Create three opinions op_1 = OpinionFactory( cluster=cluster, - type="Concurrence Opinion", + type=Opinion.LEAD, ordering_key=1, ) - op_2 = OpinionFactory( cluster=cluster, - type="Dissent", + type=Opinion.CONCURRENCE, ordering_key=2, ) - op_3 = OpinionFactory( cluster=cluster, - type="Lead Opinion", + type=Opinion.DISSENT, ordering_key=3, ) @@ -343,24 +341,38 @@ def test_opinions_order(self) -> None: self.assertEqual(op_2.ordering_key, 2) self.assertEqual(op_3.ordering_key, 3) + # Can we swap orders? + op_1.ordering_key = None + op_1.save() + + op_2.ordering_key = 1 + op_2.save() + + op_1.ordering_key = 2 + op_1.save() + # Can we update an opinion using an existing position? with transaction.atomic(): with self.assertRaises(IntegrityError): op_3.ordering_key = 2 op_3.save() - # Can we create an opinion using an existing position? + # Validate unique cluster/order with transaction.atomic(): with self.assertRaises(IntegrityError): - op_4 = OpinionFactory( - cluster=cluster, type="Lead Opinion", ordering_key=1 + op = OpinionFactory( + cluster=cluster, + type=Opinion.ADDENDUM, ) + op.ordering_key = 3 + op.save() - # Can we use negative positions? - op_4 = OpinionFactory( - cluster=cluster, type="Lead Opinion", ordering_key=-1 - ) - self.assertEqual(op_4.ordering_key, -1) + # Are negative positions rejected? + with transaction.atomic(): + with self.assertRaises(ValidationError): + op = OpinionFactory(cluster=cluster, type=Opinion.LEAD) + op.ordering_key = -1 + op.save() # Can we order the opinions from a cluster using the field? qs = ( @@ -368,7 +380,7 @@ def test_opinions_order(self) -> None: .order_by("ordering_key") .values_list("ordering_key", flat=True) ) - self.assertEqual(list(qs), [-1, 1, 2, 3]) + self.assertEqual(list(qs), [1, 2, 3, None]) # Order default value is null op_5 = OpinionFactory(cluster=cluster, type="Lead Opinion") From aae5840b7e43344fa1d6ce4f357f619805b537be Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 2 Aug 2024 15:45:59 -0400 Subject: [PATCH 44/50] tests(fixtures): Set fixture values to null None of the fixtures are ordered. I also removed the opinions-issue-550 fixture: I couldn't find where it is used or why it exists, so there is no reason to keep it. --- cl/search/fixtures/functest_opinions.json | 8 +- cl/search/fixtures/opinions-issue-412.json | 4 +- cl/search/fixtures/opinions-issue-550.json | 96 ------------------- .../fixtures/test_objects_query_counts.json | 12 +-- cl/search/fixtures/test_objects_search.json | 12 +-- .../fixtures/api_scotus_map_data.json | 4 +- .../fixtures/scotus_map_data.json | 34 +++---- 7 files changed, 37 insertions(+), 133 deletions(-) delete mode 100644 cl/search/fixtures/opinions-issue-550.json diff --git a/cl/search/fixtures/functest_opinions.json b/cl/search/fixtures/functest_opinions.json index 2cc992a633..f1e6f2da44 100644 --- a/cl/search/fixtures/functest_opinions.json +++ b/cl/search/fixtures/functest_opinions.json @@ -65,7 +65,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 10 @@ -136,7 +136,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 11 @@ -187,7 +187,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 2 + "ordering_key": null }, "model": "search.opinion", "pk": 12 @@ -258,7 +258,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 12 diff --git a/cl/search/fixtures/opinions-issue-412.json b/cl/search/fixtures/opinions-issue-412.json index 0e7fbdc7e6..fa7d716ccb 100644 --- a/cl/search/fixtures/opinions-issue-412.json +++ b/cl/search/fixtures/opinions-issue-412.json @@ -65,7 +65,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 10 @@ -136,7 +136,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 11 diff --git a/cl/search/fixtures/opinions-issue-550.json b/cl/search/fixtures/opinions-issue-550.json deleted file mode 100644 index 3e359b044d..0000000000 --- a/cl/search/fixtures/opinions-issue-550.json +++ /dev/null @@ -1,96 +0,0 @@ -[ - { - "fields": { - "date_blocked": null, - "date_reargument_denied": "2015-08-15", - "court": "ca1", - "date_reargued": "2015-08-15", - "case_name_full": "Voutila v. Bonvini", - "date_argued": "2015-08-15", - "date_modified": "2015-08-15T13:55:03.669Z", - "case_name": "case name docket 10", - "date_created": "2015-08-15T13:55:03.669Z", - "case_name_short": "short name for Voutila v. Bonvini", - "docket_number": "1337-np", - "slug": "case-name", - "source": 0, - "blocked": false - }, - "model": "search.docket", - "pk": 10 - }, - { - "fields": { - "date_blocked": null, - "case_name_full": "Reference to Voutila v. Bonvini", - "case_name_short": "Case name in short for Voutila v. 
Bonvini", - "blocked": false, - "syllabus": "some rando syllabus", - "date_filed": "2015-12-20", - "procedural_history": "some rando history", - "source": "C", - "panel": [], - "judges": "", - "case_name": "Voutila v. Bonvini", - "attorneys": "a bunch of crooks!", - "slug": "case-name-cluster", - "posture": "", - "date_modified": "2015-08-15T14:10:56.801Z", - "precedential_status": "Published", - "citation_count": 1, - "scdb_id": "", - "nature_of_suit": "", - "non_participating_judges": [], - "date_created": "2015-08-15T14:10:56.801Z", - "docket": 10 - }, - "model": "search.opinioncluster", - "pk": 10 - }, - { - "fields": { - "sha1": "asdfasdfasdfasdfasdfasddf", - "date_modified": "2015-12-20T14:20:00.801Z", - "extracted_by_ocr": false, - "author": null, - "plain_text": "This is a combined opinion.", - "html": "", - "download_url": null, - "cluster": 10, - "html_with_citations": "", - "local_path": "doc/2005/05/04/state_of_indiana_v._charles_barker.doc", - "html_columbia": "", - "joined_by": [], - "date_created": "2015-08-15T14:10:56.801Z", - "html_lawbox": "", - "per_curiam": false, - "type": "010combined", - "ordering_key": 1 - }, - "model": "search.opinion", - "pk": 10 - }, - { - "fields": { - "sha1": "asdfasdfasdfasdfasdfasddf", - "date_modified": "2015-12-20T14:20:00.801Z", - "extracted_by_ocr": false, - "author": null, - "plain_text": "This is a lead opinion too.", - "html": "", - "download_url": null, - "cluster": 10, - "html_with_citations": "", - "local_path": "txt/2015/12/28/opinion_text.txt", - "html_columbia": "", - "joined_by": [], - "date_created": "2015-08-15T14:10:56.801Z", - "html_lawbox": "", - "per_curiam": false, - "type": "020lead", - "ordering_key": 2 - }, - "model": "search.opinion", - "pk": 11 - } -] diff --git a/cl/search/fixtures/test_objects_query_counts.json b/cl/search/fixtures/test_objects_query_counts.json index 6a3f97da23..ca69a08ccc 100644 --- a/cl/search/fixtures/test_objects_query_counts.json +++ b/cl/search/fixtures/test_objects_query_counts.json @@ -301,7 +301,7 @@ "html_lawbox":"", "per_curiam":false, "type":"020lead", - "ordering_key": 1 + "ordering_key": null }, "model":"search.opinion", "pk":1 @@ -326,7 +326,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "ordering_key": 1 + "ordering_key": null }, "model":"search.opinion", "pk":2 @@ -351,7 +351,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "ordering_key": 1 + "ordering_key": null }, "model":"search.opinion", "pk":3 @@ -375,7 +375,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "ordering_key": 2 + "ordering_key": null }, "model":"search.opinion", "pk":4 @@ -400,7 +400,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "ordering_key": 3 + "ordering_key": null }, "model":"search.opinion", "pk":5 @@ -424,7 +424,7 @@ "html_lawbox":"", "per_curiam":false, "type":"010combined", - "ordering_key": 4 + "ordering_key": null }, "model":"search.opinion", "pk":6 diff --git a/cl/search/fixtures/test_objects_search.json b/cl/search/fixtures/test_objects_search.json index 542d297d54..66c9915581 100644 --- a/cl/search/fixtures/test_objects_search.json +++ b/cl/search/fixtures/test_objects_search.json @@ -240,7 +240,7 @@ "html_lawbox": "", "per_curiam": false, "type": "020lead", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 1 @@ -263,7 +263,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 2 @@ -286,7 +286,7 @@ 
"html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 3 @@ -309,7 +309,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 2 + "ordering_key": null }, "model": "search.opinion", "pk": 4 @@ -332,7 +332,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 3 + "ordering_key": null }, "model": "search.opinion", "pk": 5 @@ -355,7 +355,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 4 + "ordering_key": null }, "model": "search.opinion", "pk": 6 diff --git a/cl/visualizations/fixtures/api_scotus_map_data.json b/cl/visualizations/fixtures/api_scotus_map_data.json index 3a13c3e4e7..3bce46e664 100644 --- a/cl/visualizations/fixtures/api_scotus_map_data.json +++ b/cl/visualizations/fixtures/api_scotus_map_data.json @@ -122,7 +122,7 @@ "html_lawbox": "", "per_curiam": false, "type": "020lead", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 1 @@ -145,7 +145,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 2 diff --git a/cl/visualizations/fixtures/scotus_map_data.json b/cl/visualizations/fixtures/scotus_map_data.json index e0760f42bf..bf97605525 100644 --- a/cl/visualizations/fixtures/scotus_map_data.json +++ b/cl/visualizations/fixtures/scotus_map_data.json @@ -903,7 +903,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 111014 @@ -926,7 +926,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 111113 @@ -949,7 +949,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 111464 @@ -972,7 +972,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 111505 @@ -995,7 +995,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 111924 @@ -1018,7 +1018,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 112331 @@ -1041,7 +1041,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 112646 @@ -1064,7 +1064,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 112779 @@ -1087,7 +1087,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 112874 @@ -1110,7 +1110,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 117967 @@ -1133,7 +1133,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 118377 @@ -1156,7 +1156,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 121168 @@ -1179,7 +1179,7 @@ 
"html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 136984 @@ -1202,7 +1202,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 142900 @@ -1225,7 +1225,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 799990 @@ -1248,7 +1248,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 799993 @@ -1271,7 +1271,7 @@ "html_lawbox": "", "per_curiam": false, "type": "010combined", - "ordering_key": 1 + "ordering_key": null }, "model": "search.opinion", "pk": 2674862 From 611a174c61f741028efd511cbdb0f29c9c24d035 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 2 Aug 2024 15:49:06 -0400 Subject: [PATCH 45/50] refactor(columbia_utils): remove ordering from utils columbia Unwind the rest of the columbia order --- cl/corpus_importer/import_columbia/columbia_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py index dec91fc1da..57bac9a66d 100644 --- a/cl/corpus_importer/import_columbia/columbia_utils.py +++ b/cl/corpus_importer/import_columbia/columbia_utils.py @@ -224,7 +224,6 @@ def extract_columbia_opinions( """ opinions: list = [] floating_content = [] - order = 1 # The opinion count starts from 1 # We iterate all content to look for all possible opinions for i, content in enumerate(outer_opinion): # type: int, Tag @@ -363,7 +362,6 @@ def process_extracted_opinions(extracted_opinions: list) -> list: opinions: list = [] authorless_content = [] - order = 1 # The opinion count starts from 1 for i, found_content in enumerate(extracted_opinions, start=1): byline = found_content.get("byline") From 7d86408acd6338485fe2bec16ec5200155842bd1 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 2 Aug 2024 15:50:47 -0400 Subject: [PATCH 46/50] refactor(columbia_utils): Reset order utils - line --- cl/corpus_importer/import_columbia/columbia_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py index 57bac9a66d..b1a62cfd6c 100644 --- a/cl/corpus_importer/import_columbia/columbia_utils.py +++ b/cl/corpus_importer/import_columbia/columbia_utils.py @@ -224,6 +224,7 @@ def extract_columbia_opinions( """ opinions: list = [] floating_content = [] + order = 0 # We iterate all content to look for all possible opinions for i, content in enumerate(outer_opinion): # type: int, Tag @@ -362,6 +363,7 @@ def process_extracted_opinions(extracted_opinions: list) -> list: opinions: list = [] authorless_content = [] + order = 0 for i, found_content in enumerate(extracted_opinions, start=1): byline = found_content.get("byline") From 57182e80f4feb83be70cbdb5bdd90c1968634128 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 5 Aug 2024 14:07:40 -0600 Subject: [PATCH 47/50] feat(opinion_order): update clean method in Opinion model update command to order harvard opinions --- .../commands/update_opinions_order.py | 65 ++++++++++++------- cl/search/models.py | 9 ++- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py 
diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index ab445a1491..592848c3e4 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -1,7 +1,11 @@ -from django.db.models import Count, Q +import argparse +import time + +from django.db import transaction +from django.db.models import Count from cl.lib.command_utils import VerboseCommand, logger -from cl.search.models import SOURCES, Opinion, OpinionCluster +from cl.search.models import Opinion, OpinionCluster def sort_harvard_opinions(options) -> None: @@ -12,44 +16,51 @@ def sort_harvard_opinions(options) -> None: The harvard importer created the opinions in order of appearance in the file - :param options: dict of arguments skip until and limit if given + :param options: dict of arguments passed to the command :return: None """ skip_until = options.get("skip_until", None) limit = options.get("limit", None) - base_filter = ( + # The filepath_json_harvard field can only be filled by the Harvard importer, + # this helps us confirm that it was imported from a Harvard JSON + harvard_clusters = ( OpinionCluster.objects.exclude(filepath_json_harvard="") + .prefetch_related("sub_opinions") .annotate(opinions_count=Count("sub_opinions")) .filter(opinions_count__gt=1) + .order_by("id") ) - if skip_until: - base_filter &= Q(pk__gte=skip_until) + harvard_clusters = harvard_clusters.filter(pk__gte=skip_until) - harvard_clusters = ( - OpinionCluster.objects.annotate(opinions_count=Count("sub_opinions")) - .filter(base_filter) - .order_by("id") - ) if limit: harvard_clusters = harvard_clusters[:limit] for cluster in harvard_clusters: logger.info(f"Processing cluster id: {cluster.id}") - sub_opinions = cluster.sub_opinions.exclude( - type=Opinion.COMBINED, - ).order_by("id") - if not sub_opinions: - logger.info( - f"No sub_opinions left to order for cluster id: {cluster.id}" - ) - continue - for opinion_order, cluster_op in enumerate(sub_opinions, start=1): - cluster_op.ordering_key = opinion_order - cluster_op.save() - logger.info(msg=f"Opinions reordered for cluster id: {cluster.id}") + opinion_order = 1 + any_update = False + with transaction.atomic(): + # We need to make sure they are ordered by id + for cluster_op in cluster.sub_opinions.all().order_by("id"): + if cluster_op.type == Opinion.COMBINED: + continue + cluster_op.ordering_key = opinion_order + cluster_op.save() + opinion_order = opinion_order + 1 + any_update = True + if not any_update: + # We want to know if we found anything unexpected, for example + # a cluster with only combined opinions + logger.info( + f"No sub_opinions updated for cluster id: {cluster.id}" + ) + continue + logger.info(msg=f"Opinions reordered for cluster id: {cluster.id}") + # Wait between each processed cluster to avoid issues with Elasticsearch + time.sleep(options["delay"])
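Two details of this rewrite are worth calling out. Each cluster's updates run inside transaction.atomic(), so a cluster is either fully renumbered or left untouched if anything fails (relevant given the unique (cluster_id, ordering_key) constraint), and a configurable sleep throttles the backfill so downstream consumers such as the Elasticsearch indexer can keep up. The pattern reduced to a sketch (not the command itself; the real loop also skips combined opinions):

    import time

    from django.db import transaction

    def renumber(clusters, delay: float = 0.2) -> None:
        for cluster in clusters:
            with transaction.atomic():
                # all ordering_key writes for one cluster commit together
                for i, op in enumerate(
                    cluster.sub_opinions.all().order_by("id"), start=1
                ):
                    op.ordering_key = i
                    op.save()
            time.sleep(delay)  # throttle between clusters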
 class Command(VerboseCommand): @@ -80,7 +91,6 @@ def add_arguments(self, parser): help="Maximum number of clusters to sort", required=False, ) - parser.add_argument( "--action", type=self.valid_actions, required=True, help="The action you wish to take. Valid choices are: %s" % (", ".join(self.VALID_ACTIONS.keys())), ) + parser.add_argument( + "--delay", + type=float, + default=0.2, + help="How long to wait between processed clusters (in seconds; " + "floats are allowed).", + ) def handle(self, *args, **options): super().handle(*args, **options) options["action"](options) diff --git a/cl/search/models.py b/cl/search/models.py index 1bde2ebad0..e2cdedc905 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3347,11 +3347,10 @@ def get_absolute_url(self) -> str: def clean(self) -> None: if self.type == "": raise ValidationError("'type' is a required field.") - if self.ordering_key is not None and self.ordering_key != "": - if self.ordering_key < 1: - raise ValidationError( - {"ordering_key": "Ordering key cannot be zero or negative"} - ) + if isinstance(self.ordering_key, int) and self.ordering_key < 1: + raise ValidationError( + {"ordering_key": "Ordering key cannot be zero or negative"} + ) def save( self, From d1e2e004f24f326d96b0d0754965de72b28b8590 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 5 Aug 2024 14:20:41 -0600 Subject: [PATCH 48/50] feat(opinion_order): rename migrations --- .../{0032_order_opinions.py => 0033_order_opinions.py} | 4 ++-- .../{0032_order_opinions.sql => 0033_order_opinions.sql} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename cl/search/migrations/{0032_order_opinions.py => 0033_order_opinions.py} (97%) rename cl/search/migrations/{0032_order_opinions.sql => 0033_order_opinions.sql} (100%) diff --git a/cl/search/migrations/0032_order_opinions.py b/cl/search/migrations/0033_order_opinions.py similarity index 97% rename from cl/search/migrations/0032_order_opinions.py rename to cl/search/migrations/0033_order_opinions.py index 9b4db9fbe7..ce5ea91c13 100644 --- a/cl/search/migrations/0032_order_opinions.py +++ b/cl/search/migrations/0033_order_opinions.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-07-30 18:59 +# Generated by Django 5.0.7 on 2024-08-05 20:19 import pgtrigger.compiler import pgtrigger.migrations @@ -11,7 +11,7 @@ class Migration(migrations.Migration): "people_db", "0016_remove_abarating_update_or_delete_snapshot_update_and_more", ), - ("search", "0031_alter_opinion_type_alter_opinioncluster_source_noop"), + ("search", "0032_update_docket_numbering_fields"), ] operations = [ diff --git a/cl/search/migrations/0032_order_opinions.sql b/cl/search/migrations/0033_order_opinions.sql similarity index 100% rename from cl/search/migrations/0032_order_opinions.sql rename to cl/search/migrations/0033_order_opinions.sql From 5958f54bd5a23b21a5bbd28682e50b98984492ad Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 5 Aug 2024 15:03:17 -0600 Subject: [PATCH 49/50] feat(opinion_order): add customers sql; update replica sql --- cl/search/migrations/0033_order_opinions.sql | 112 ------------------- .../0033_order_opinions_customers.sql | 10 ++ 2 files changed, 10 insertions(+), 112 deletions(-) create mode 100644 cl/search/migrations/0033_order_opinions_customers.sql diff --git a/cl/search/migrations/0033_order_opinions.sql b/cl/search/migrations/0033_order_opinions.sql index e02c150f4d..e2e07aee39 100644 --- a/cl/search/migrations/0033_order_opinions.sql +++ b/cl/search/migrations/0033_order_opinions.sql @@ -1,13 +1,5 @@ BEGIN; -- --- Remove trigger update_or_delete_snapshot_delete from model opinion --- -DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion"; --- --- Remove trigger update_or_delete_snapshot_update from model opinion --- -DROP TRIGGER IF EXISTS 
pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion"; --- -- Add field ordering_key to opinion -- ALTER TABLE "search_opinion" ADD COLUMN "ordering_key" integer NULL; @@ -15,110 +7,6 @@ ALTER TABLE "search_opinion" ADD COLUMN "ordering_key" integer NULL; -- Add field ordering_key to opinionevent -- ALTER TABLE "search_opinionevent" ADD COLUMN "ordering_key" integer NULL; --- --- Create trigger update_or_delete_snapshot_update on model opinion --- - - CREATE OR REPLACE FUNCTION "public"._pgtrigger_should_ignore( - trigger_name NAME - ) - RETURNS BOOLEAN AS $$ - DECLARE - _pgtrigger_ignore TEXT[]; - _result BOOLEAN; - BEGIN - BEGIN - SELECT INTO _pgtrigger_ignore - CURRENT_SETTING('pgtrigger.ignore'); - EXCEPTION WHEN OTHERS THEN - END; - IF _pgtrigger_ignore IS NOT NULL THEN - SELECT trigger_name = ANY(_pgtrigger_ignore) - INTO _result; - RETURN _result; - ELSE - RETURN FALSE; - END IF; - END; - $$ LANGUAGE plpgsql; - - CREATE OR REPLACE FUNCTION pgtrigger_update_or_delete_snapshot_update_67ecd() - RETURNS TRIGGER AS $$ - - BEGIN - IF ("public"._pgtrigger_should_ignore(TG_NAME) IS TRUE) THEN - IF (TG_OP = 'DELETE') THEN - RETURN OLD; - ELSE - RETURN NEW; - END IF; - END IF; - INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "ordering_key", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."ordering_key", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; - END; - $$ LANGUAGE plpgsql; - - DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion"; - CREATE TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd - AFTER UPDATE ON "search_opinion" - - - FOR EACH ROW WHEN (OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."date_created" IS DISTINCT FROM (NEW."date_created") OR OLD."cluster_id" IS DISTINCT FROM (NEW."cluster_id") OR OLD."author_id" IS DISTINCT FROM (NEW."author_id") OR OLD."author_str" IS DISTINCT FROM (NEW."author_str") OR OLD."per_curiam" IS DISTINCT FROM (NEW."per_curiam") OR OLD."joined_by_str" IS DISTINCT FROM (NEW."joined_by_str") OR OLD."type" IS DISTINCT FROM (NEW."type") OR OLD."sha1" IS DISTINCT FROM (NEW."sha1") OR OLD."page_count" IS DISTINCT FROM (NEW."page_count") OR OLD."download_url" IS DISTINCT FROM (NEW."download_url") OR OLD."local_path" IS DISTINCT FROM (NEW."local_path") OR OLD."plain_text" IS DISTINCT FROM (NEW."plain_text") OR OLD."html" IS DISTINCT FROM (NEW."html") OR OLD."html_lawbox" IS DISTINCT FROM (NEW."html_lawbox") OR OLD."html_columbia" IS DISTINCT FROM (NEW."html_columbia") OR OLD."html_anon_2020" IS DISTINCT FROM (NEW."html_anon_2020") OR OLD."xml_harvard" IS DISTINCT FROM (NEW."xml_harvard") OR OLD."html_with_citations" IS DISTINCT FROM (NEW."html_with_citations") OR OLD."extracted_by_ocr" IS DISTINCT FROM (NEW."extracted_by_ocr") OR OLD."ordering_key" IS DISTINCT FROM 
(NEW."ordering_key")) - EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_update_67ecd(); - - COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_update_67ecd ON "search_opinion" IS '7137855274503cc2c50a17729f82e150d2b7d872'; - --- --- Create trigger update_or_delete_snapshot_delete on model opinion --- - - CREATE OR REPLACE FUNCTION "public"._pgtrigger_should_ignore( - trigger_name NAME - ) - RETURNS BOOLEAN AS $$ - DECLARE - _pgtrigger_ignore TEXT[]; - _result BOOLEAN; - BEGIN - BEGIN - SELECT INTO _pgtrigger_ignore - CURRENT_SETTING('pgtrigger.ignore'); - EXCEPTION WHEN OTHERS THEN - END; - IF _pgtrigger_ignore IS NOT NULL THEN - SELECT trigger_name = ANY(_pgtrigger_ignore) - INTO _result; - RETURN _result; - ELSE - RETURN FALSE; - END IF; - END; - $$ LANGUAGE plpgsql; - - CREATE OR REPLACE FUNCTION pgtrigger_update_or_delete_snapshot_delete_1f4fd() - RETURNS TRIGGER AS $$ - - BEGIN - IF ("public"._pgtrigger_should_ignore(TG_NAME) IS TRUE) THEN - IF (TG_OP = 'DELETE') THEN - RETURN OLD; - ELSE - RETURN NEW; - END IF; - END IF; - INSERT INTO "search_opinionevent" ("author_id", "author_str", "cluster_id", "date_created", "date_modified", "download_url", "extracted_by_ocr", "html", "html_anon_2020", "html_columbia", "html_lawbox", "html_with_citations", "id", "joined_by_str", "local_path", "ordering_key", "page_count", "per_curiam", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "plain_text", "sha1", "type", "xml_harvard") VALUES (OLD."author_id", OLD."author_str", OLD."cluster_id", OLD."date_created", OLD."date_modified", OLD."download_url", OLD."extracted_by_ocr", OLD."html", OLD."html_anon_2020", OLD."html_columbia", OLD."html_lawbox", OLD."html_with_citations", OLD."id", OLD."joined_by_str", OLD."local_path", OLD."ordering_key", OLD."page_count", OLD."per_curiam", _pgh_attach_context(), NOW(), 'update_or_delete_snapshot', OLD."id", OLD."plain_text", OLD."sha1", OLD."type", OLD."xml_harvard"); RETURN NULL; - END; - $$ LANGUAGE plpgsql; - - DROP TRIGGER IF EXISTS pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion"; - CREATE TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd - AFTER DELETE ON "search_opinion" - - - FOR EACH ROW - EXECUTE PROCEDURE pgtrigger_update_or_delete_snapshot_delete_1f4fd(); - - COMMENT ON TRIGGER pgtrigger_update_or_delete_snapshot_delete_1f4fd ON "search_opinion" IS '98fb52aa60fd8e89a83f8f7ac77ba5892739fb37'; - -- -- Create constraint unique_opinion_ordering_key on model opinion -- diff --git a/cl/search/migrations/0033_order_opinions_customers.sql b/cl/search/migrations/0033_order_opinions_customers.sql new file mode 100644 index 0000000000..e7158e3002 --- /dev/null +++ b/cl/search/migrations/0033_order_opinions_customers.sql @@ -0,0 +1,10 @@ +BEGIN; +-- +-- Add field ordering_key to opinion +-- +ALTER TABLE "search_opinion" ADD COLUMN "ordering_key" integer NULL; +-- +-- Create constraint unique_opinion_ordering_key on model opinion +-- +ALTER TABLE "search_opinion" ADD CONSTRAINT "unique_opinion_ordering_key" UNIQUE ("cluster_id", "ordering_key"); +COMMIT; From ed564f932b4a0170f374afccceeab869269e4bc9 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 6 Aug 2024 12:46:16 -0600 Subject: [PATCH 50/50] feat(opinion_order): exclude columbia from clusters --- .../management/commands/update_opinions_order.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_opinions_order.py b/cl/corpus_importer/management/commands/update_opinions_order.py index 
592848c3e4..1f1e5308e9 100644 --- a/cl/corpus_importer/management/commands/update_opinions_order.py +++ b/cl/corpus_importer/management/commands/update_opinions_order.py @@ -5,7 +5,7 @@ from django.db.models import Count from cl.lib.command_utils import VerboseCommand, logger -from cl.search.models import Opinion, OpinionCluster +from cl.search.models import SOURCES, Opinion, OpinionCluster def sort_harvard_opinions(options) -> None: @@ -24,12 +24,14 @@ def sort_harvard_opinions(options) -> None: limit = options.get("limit", None) # The filepath_json_harvard field can only be filled by the Harvard importer, - # this helps us confirm that it was imported from a Harvard JSON + # this helps us confirm that it was imported from a Harvard JSON. We exclude + # clusters merged with Columbia because those may need some extra verification harvard_clusters = ( OpinionCluster.objects.exclude(filepath_json_harvard="") .prefetch_related("sub_opinions") .annotate(opinions_count=Count("sub_opinions")) .filter(opinions_count__gt=1) + .exclude(source__contains=SOURCES.COLUMBIA_ARCHIVE) .order_by("id") ) if skip_until:
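A closing note on the new exclusion: OpinionCluster.source is a composite string of per-source codes, which is why the filter uses source__contains rather than equality, and __contains compiles to a SQL LIKE substring match. Any cluster whose source string includes the Columbia archive code is therefore skipped even when Harvard data is also present. A quick sanity check from a Django shell (output abbreviated; the actual code value comes from SOURCES):

    qs = OpinionCluster.objects.exclude(
        source__contains=SOURCES.COLUMBIA_ARCHIVE
    )
    print(qs.query)  # ... WHERE NOT (source LIKE %<columbia code>%) ...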