diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index b3f44f216e..7c4a3d9441 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -38,6 +38,7 @@ ) from cl.search.exception import ( BadProximityQuery, + DisallowedWildcardPattern, UnbalancedParenthesesQuery, UnbalancedQuotesQuery, ) @@ -455,6 +456,7 @@ def query_alerts( UnbalancedParenthesesQuery, UnbalancedQuotesQuery, BadProximityQuery, + DisallowedWildcardPattern, TransportError, ConnectionError, RequestError, diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index c39292e9aa..a33118310b 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -43,12 +43,14 @@ ) from cl.lib.utils import ( check_for_proximity_tokens, + check_query_for_disallowed_wildcards, check_unbalanced_parenthesis, check_unbalanced_quotes, cleanup_main_query, get_array_of_selected_fields, lookup_child_courts, map_to_docket_entry_sorting, + perform_special_character_replacements, ) from cl.people_db.models import Position from cl.search.constants import ( @@ -77,6 +79,7 @@ ) from cl.search.exception import ( BadProximityQuery, + DisallowedWildcardPattern, ElasticBadRequestError, QueryType, UnbalancedParenthesesQuery, @@ -488,7 +491,9 @@ def build_text_filter(field: str, value: str) -> List: if value: if isinstance(value, str): validate_query_syntax(value, QueryType.FILTER) - + if check_query_for_disallowed_wildcards(value): + raise DisallowedWildcardPattern(QueryType.FILTER) + value = perform_special_character_replacements(value) return [ Q( "query_string", @@ -3089,6 +3094,7 @@ def do_es_api_query( UnbalancedParenthesesQuery, UnbalancedQuotesQuery, BadProximityQuery, + DisallowedWildcardPattern, ) as e: raise ElasticBadRequestError(detail=e.message) diff --git a/cl/lib/utils.py b/cl/lib/utils.py index 047389acd6..15bd965e84 100644 --- 
a/cl/lib/utils.py +++ b/cl/lib/utils.py @@ -11,6 +11,7 @@ from cl.lib.crypto import sha256 from cl.lib.model_helpers import clean_docket_number, is_docket_number from cl.lib.types import CleanData +from cl.search.exception import DisallowedWildcardPattern, QueryType class _UNSPECIFIED: @@ -232,9 +233,79 @@ def modify_court_id_queries(query_str: str) -> str: return modified_query + +def check_query_for_disallowed_wildcards(query_string: str) -> bool: + """Check if the query_string contains disallowed wildcards that can be + really expensive. + https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-wildcard + + * at the beginning of a term + * in a term with less than 3 characters. + ! in a term with less than 3 characters. + + Like: + *ing + a* or !a + + :param query_string: The query string to be checked. + :return: A boolean indicating if the query string contains disallowed wildcards. + """ + + # Match any term that starts with * + wildcard_start = r"(?:^|\s)\*\w+" + + # Match any term with less than 3 chars that ends with * + wildcard_end = r"(?:^|\s)\w{1,2}\*(?=$|\s)" + + # Match any term with less than 3 chars that starts with ! + root_expander_short_term = r"(?:^|\s)\![^\s]{1,2}(?=$|\s)" + + if any( + re.search(pattern, query_string) + for pattern in [wildcard_start, wildcard_end, root_expander_short_term] + ): + return True + return False + + +def perform_special_character_replacements(query_string: str) -> str: + """Perform a series of special character replacements in the given query + string to clean it up and support the %, &, !, and * search connectors. + + :param query_string: The user query string. + :return: The transformed query string with the specified replacements applied. + """ + + # Replace smart quotes with standard double quotes for consistency. 
+ query_string = re.sub(r"[“”]", '"', query_string) + + # Replace % (but not) by NOT + query_string = re.sub(r" % ", " NOT ", query_string) + + # Replace & by AND + query_string = re.sub(r" & ", " AND ", query_string) + + # Replace ! (root expander) at the beginning of words with * at the end. + root_expander_pattern = r"(^|\s)!([a-zA-Z]+)" + root_expander_replacement = r"\1\2*" + query_string = re.sub( + root_expander_pattern, root_expander_replacement, query_string + ) + + # Replace * (universal character) that is not at the end of a word with ?. + universal_char_pattern = r"\*(?=\w)" + universal_char_replacement = "?" + query_string = re.sub( + universal_char_pattern, universal_char_replacement, query_string + ) + + return query_string + + def cleanup_main_query(query_string: str) -> str: """Enhance the query string with some simple fixes + - Check for expensive wildcards and throw an error if found. + - Perform special character replacements for search connectors. - Make any numerical queries into phrases (except dates) - Add hyphens to district docket numbers that lack them - Ignore tokens inside phrases @@ -249,8 +320,12 @@ """ inside_a_phrase = False cleaned_items = [] - # Replace smart quotes with standard double quotes for consistency. - query_string = re.sub(r"[“”]", '"', query_string) + + if check_query_for_disallowed_wildcards(query_string): + raise DisallowedWildcardPattern(QueryType.QUERY_STRING) + + query_string = perform_special_character_replacements(query_string) + for item in re.split(r'([^a-zA-Z0-9_\-^~":]+)', query_string): if not item: continue diff --git a/cl/search/exception.py b/cl/search/exception.py index f92e76c895..0d51d152d3 100644 --- a/cl/search/exception.py +++ b/cl/search/exception.py @@ -56,3 +56,9 @@ class ElasticBadRequestError(APIException): "Elasticsearch Bad request error. Please review your query." 
) default_code = "bad_request" + + +class DisallowedWildcardPattern(SyntaxQueryError): + """Query contains a disallowed wildcard pattern""" + + message = "The query contains a disallowed expensive wildcard pattern." diff --git a/cl/search/feeds.py b/cl/search/feeds.py index 435506cf5d..0e42d955ba 100644 --- a/cl/search/feeds.py +++ b/cl/search/feeds.py @@ -19,6 +19,7 @@ from cl.search.documents import ESRECAPDocument, OpinionClusterDocument from cl.search.exception import ( BadProximityQuery, + DisallowedWildcardPattern, UnbalancedParenthesesQuery, UnbalancedQuotesQuery, ) @@ -255,6 +256,7 @@ def wrapped_view(request, *args, **kwargs): UnbalancedParenthesesQuery, UnbalancedQuotesQuery, BadProximityQuery, + DisallowedWildcardPattern, ApiError, ) as e: logger.warning("Couldn't load the feed page. Error was: %s", e) diff --git a/cl/search/templates/includes/no_results.html b/cl/search/templates/includes/no_results.html index 5e25f1ec84..f42f7b300a 100644 --- a/cl/search/templates/includes/no_results.html +++ b/cl/search/templates/includes/no_results.html @@ -33,6 +33,8 @@

Did you forget to close one or more parentheses? {% elif error_message == "unbalanced_quotes" %} Did you forget to close one or more quotes? + {% elif error_message == "disallowed_wildcard_pattern" %} + The query contains a disallowed expensive wildcard pattern. {% endif %} {% else %} encountered an error. diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 82784ce5ea..9c06e7e6e0 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -422,36 +422,63 @@ def setUpTestData(cls): sub_opinions=RelatedFactory( OpinionWithChildrenFactory, factory_related_name="cluster", - html_columbia="

Code, § 1-815

", + html_columbia="

Code, § 1-815

", + plain_text="", ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, ) OpinionClusterFactoryWithChildrenAndParents( case_name="Strickland v. Lorem.", + case_name_full="Strickland v. Lorem.", docket=DocketFactory(court=cls.court, docket_number="123456"), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Motion", + ), ) OpinionClusterFactoryWithChildrenAndParents( case_name="America vs Bank", + case_name_full="America vs Bank", docket=DocketFactory( court=cls.child_court_1, docket_number="34-2535" ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Strickland Motion", + ), ) OpinionClusterFactoryWithChildrenAndParents( case_name="Johnson v. National", + case_name_full="Johnson v. National", docket=DocketFactory( court=cls.child_court_2_2, docket_number="36-2000" ), + judges="Computer point", precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Computer point", + ), ) OpinionClusterFactoryWithChildrenAndParents( case_name="California v. Nevada", + case_name_full="California v. 
Nevada", docket=DocketFactory( court=cls.child_gand_2, docket_number="38-1000" ), + judges="Composition plant", precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Composition plant", + ), ) call_command( "cl_index_parent_and_child_docs", @@ -838,6 +865,413 @@ def test_raise_forbidden_error_on_depth_pagination(self) -> None: ) self.assertEqual(r.status_code, HTTPStatus.FORBIDDEN) + def test_query_cleanup_function(self) -> None: + # Send string of search_query to the function and expect it + # to be encoded properly + q_a = ( + ( + "12-9238 happy Gilmore", + 'docketNumber:"12-9238"~1 happy Gilmore', + ), + ("“ping tree” leads", '"ping tree" leads'), + ('"this is” a “test"', '"this is" a "test"'), + ("1chicken NUGGET", '"1chicken" NUGGET'), + ( + "We can drive her home with 1headlight", + 'We can drive her home with "1headlight"', + ), + # Tildes are ignored even though they have numbers? + ('"net neutrality"~2', '"net neutrality"~2'), + # No changes to regular queries? + ("Look Ma, no numbers!", "Look Ma, no numbers!"), + # Docket numbers hyphenated into phrases? + ( + "12cv9834 Monkey Goose", + 'docketNumber:"12-cv-9834"~1 Monkey Goose', + ), + # Valid dates ignored? + ( + "2020-10-31T00:00:00Z Monkey Goose", + "2020-10-31T00:00:00Z Monkey Goose", + ), + # Simple range query? + ("[1 TO 4]", '["1" TO "4"]'), + # Dates ignored in ranges? + ( + "[* TO 2020-10-31T00:00:00Z] Monkey Goose", + "[* TO 2020-10-31T00:00:00Z] Monkey Goose", + ), + ("id:10", "id:10"), + ("id:[* TO 5] Monkey Goose", 'id:[* TO "5"] Monkey Goose'), + ( + "(Tempura AND 12cv3392) OR sushi", + '(Tempura AND docketNumber:"12-cv-3392"~1) OR sushi', + ), + # Phrase search with numbers (w/and w/o § mark)? 
+ ('"18 USC 242"', '"18 USC 242"'), + ('"18 USC §242"', '"18 USC §242"'), + ('"this is a test" asdf', '"this is a test" asdf'), + ('asdf "this is a test" asdf', 'asdf "this is a test" asdf'), + ( + '"this is a test" 22cv3332', + '"this is a test" docketNumber:"22-cv-3332"~1', + ), + ( + '"this is a test" ~2', + '"this is a test"~2', + ), + ( + '"this is a test" ~2 and "net neutrality" ~5 and 22cv3332', + '"this is a test"~2 and "net neutrality"~5 and docketNumber:"22-cv-3332"~1', + ), + ( + "Strickland % Lorem % America", + "Strickland NOT Lorem NOT America", + ), + ( + "Strickland% Lorem% America", + "Strickland% Lorem% America", + ), + ( + "Strickland & Motion & Lorem", + "Strickland AND Motion AND Lorem", + ), + ( + "!Strick !Mot", + "Strick* Mot*", + ), + ( + "!111 !444", + '!"111" !"444"', + ), + ( + "b*ra*e b*rav*", + "b?ra?e b?rav*", + ), + ) + for q, a in q_a: + print("Does {q} --> {a} ? ".format(**{"q": q, "a": a})) + self.assertEqual(cleanup_main_query(q), a) + + def test_built_in_search_connectors(self) -> None: + """Verify that built in ES search connectors return the expected results.""" + + tests = [ + { + "label": "NOT query", + "search_params": { + "q": "Strickland NOT Lorem NOT America", + }, + "expected_count": 1, + "expected_in_content": ["1:21-cv-1234"], + }, + { + "label": "AND connector test", + "search_params": { + "q": "Strickland AND Motion AND Lorem", + }, + "expected_count": 1, + "expected_in_content": ["123456"], + }, + { + "label": "Zero or more chars wildcard *", + "search_params": { + "q": "Comp*", + }, + "expected_count": 2, + "expected_in_content": ["36-2000", "38-1000"], + }, + { + "label": "Universal Character ?", + "search_params": { + "q": "p??nt", + }, + "expected_count": 2, + "expected_in_content": ["36-2000", "38-1000"], + }, + { + "label": "Combined operators", + "search_params": { + "q": "Strickland AND moti* AND ba?k NOT Lorem", + }, + "expected_count": 1, + "expected_in_content": ["34-2535"], + }, + ] + + for test_case in 
tests: + with self.subTest(label=test_case["label"]): + response = self.client.get( + reverse("show_results"), + test_case["search_params"], + ) + actual = self.get_article_count(response) + self.assertEqual( + actual, + test_case["expected_count"], + msg=f"Failed on: {test_case['label']}", + ) + decoded_content = response.content.decode() + for expected_str in test_case["expected_in_content"]: + self.assertIn( + expected_str, + decoded_content, + msg=f"Failed on: {test_case['label']} missing {expected_str}", + ) + + def test_support_search_connectors(self) -> None: + """Verify that new supported custom search connectors yield the + expected results. + """ + + tests = [ + { + "label": "But not %", + "search_params": { + "q": "Strickland % Lorem % America", + }, + "expected_count": 1, + "expected_in_content": ["1:21-cv-1234"], + }, + { + "label": "& connector test", + "search_params": { + "q": "Strickland & Motion & Lorem", + }, + "expected_count": 1, + "expected_in_content": ["123456"], + }, + { + "label": "! 
Root expander suffix", + "search_params": { + "q": "!Comp", + }, + "expected_count": 2, + "expected_in_content": ["36-2000", "38-1000"], + }, + { + "label": "Universal Character *", + "search_params": { + "q": "p**nt", + }, + "expected_count": 2, + "expected_in_content": ["36-2000", "38-1000"], + }, + { + "label": "Combined operators", + "search_params": { + "q": "Strickland & !moti & ba*k % Lorem", + }, + "expected_count": 1, + "expected_in_content": ["34-2535"], + }, + ] + + for test_case in tests: + with self.subTest(label=test_case["label"]): + # Frontend + response = self.client.get( + reverse("show_results"), + test_case["search_params"], + ) + actual = self.get_article_count(response) + self.assertEqual( + actual, + test_case["expected_count"], + msg=f"Failed on: {test_case['label']}", + ) + decoded_content = response.content.decode() + for expected_str in test_case["expected_in_content"]: + self.assertIn( + expected_str, + decoded_content, + msg=f"Failed on: {test_case['label']} missing {expected_str}", + ) + + # API + api_response = self.client.get( + reverse("search-list", kwargs={"version": "v4"}), + test_case["search_params"], + ) + self.assertEqual( + len(api_response.data["results"]), + test_case["expected_count"], + msg=f"Failed on API: {test_case['label']}", + ) + decoded_content = api_response.content.decode() + for expected_str in test_case["expected_in_content"]: + self.assertIn( + expected_str, + decoded_content, + msg=f"Failed on Frontend: {test_case['label']} missing {expected_str}", + ) + + def test_support_search_connectors_filters(self) -> None: + """Verify that new supported custom search connectors yield the + expected results. 
+ """ + + tests = [ + { + "label": "But not %", + "search_params": { + "case_name": "Strickland % Lorem % America", + }, + "expected_count": 1, + "expected_in_content": ["1:21-cv-1234"], + }, + { + "label": "& connector test", + "search_params": { + "case_name": "Strickland & Lorem", + }, + "expected_count": 1, + "expected_in_content": ["123456"], + }, + { + "label": "! Root expander suffix", + "search_params": { + "judge": "!Comp", + }, + "expected_count": 2, + "expected_in_content": ["36-2000", "38-1000"], + }, + { + "label": "Universal Character *", + "search_params": { + "judge": "p**nt", + }, + "expected_count": 2, + "expected_in_content": ["36-2000", "38-1000"], + }, + { + "label": "Combined operators", + "search_params": { + "case_name": "Calif*rnia & !Nev", + }, + "expected_count": 1, + "expected_in_content": ["38-1000"], + }, + ] + + for test_case in tests: + with self.subTest(label=test_case["label"]): + # Frontend + response = self.client.get( + reverse("show_results"), + test_case["search_params"], + ) + actual = self.get_article_count(response) + self.assertEqual( + actual, + test_case["expected_count"], + msg=f"Failed on: {test_case['label']}", + ) + decoded_content = response.content.decode() + for expected_str in test_case["expected_in_content"]: + self.assertIn( + expected_str, + decoded_content, + msg=f"Failed on Frontend: {test_case['label']} missing {expected_str}", + ) + + # API + api_response = self.client.get( + reverse("search-list", kwargs={"version": "v4"}), + test_case["search_params"], + ) + self.assertEqual( + len(api_response.data["results"]), + test_case["expected_count"], + msg=f"Failed on API: {test_case['label']}", + ) + decoded_content = api_response.content.decode() + for expected_str in test_case["expected_in_content"]: + self.assertIn( + expected_str, + decoded_content, + msg=f"Failed on Frontend: {test_case['label']} missing {expected_str}", + ) + + def test_disallowed_wildcard_pattern(self) -> None: + """Verify that expensive 
wildcard queries thrown an error.""" + + tests = [ + { + "label": "Disallowed ! in short queries.", + "search_params": { + "q": "!ap", + }, + }, + { + "label": "Disallowed * at the end in short queries.", + "search_params": { + "q": "ap*", + }, + }, + { + "label": "Disallowed * at the beginning.", + "search_params": { + "q": "*ing", + }, + }, + { + "label": "Disallowed ! in short queries - Filter.", + "search_params": { + "case_name": "!ap", + }, + }, + { + "label": "Disallowed * at the end in short queries - Filter.", + "search_params": { + "judge": "ap*", + }, + }, + { + "label": "Disallowed * at the beginning - Filter.", + "search_params": { + "case_name": "*ing", + }, + }, + ] + + for test_case in tests: + with self.subTest(label=test_case["label"]): + response = self.client.get( + reverse("show_results"), + test_case["search_params"], + ) + decoded_content = response.content.decode() + self.assertIn( + "The query contains a disallowed expensive wildcard pattern", + decoded_content, + msg=f"Failed on: {test_case['label']}, no disallowed wildcard pattern error.", + ) + + # API V4 + api_response = self.client.get( + reverse("search-list", kwargs={"version": "v4"}), + test_case["search_params"], + ) + self.assertEqual(api_response.status_code, 400) + self.assertEqual( + api_response.data["detail"], + "The query contains a disallowed expensive wildcard pattern.", + msg="Failed for V4", + ) + + # API V3 + api_response = self.client.get( + reverse("search-list", kwargs={"version": "v3"}), + test_case["search_params"], + ) + self.assertEqual(api_response.status_code, 400) + self.assertEqual( + api_response.data["detail"], + "The query contains a disallowed expensive wildcard pattern.", + msg="Failed for V3", + ) + class SearchAPIV4CommonTest(ESIndexTestCase, TestCase): """Common tests for the Search API V4 endpoints.""" @@ -932,70 +1366,6 @@ def _perform_wildcard_search(self): result_count = self.browser.find_element(By.ID, "result-count") self.assertIn("Opinions", 
result_count.text) - def test_query_cleanup_function(self) -> None: - # Send string of search_query to the function and expect it - # to be encoded properly - q_a = ( - ( - "12-9238 happy Gilmore", - 'docketNumber:"12-9238"~1 happy Gilmore', - ), - ("“ping tree” leads", '"ping tree" leads'), - ('"this is” a “test"', '"this is" a "test"'), - ("1chicken NUGGET", '"1chicken" NUGGET'), - ( - "We can drive her home with 1headlight", - 'We can drive her home with "1headlight"', - ), - # Tildes are ignored even though they have numbers? - ('"net neutrality"~2', '"net neutrality"~2'), - # No changes to regular queries? - ("Look Ma, no numbers!", "Look Ma, no numbers!"), - # Docket numbers hyphenated into phrases? - ( - "12cv9834 Monkey Goose", - 'docketNumber:"12-cv-9834"~1 Monkey Goose', - ), - # Valid dates ignored? - ( - "2020-10-31T00:00:00Z Monkey Goose", - "2020-10-31T00:00:00Z Monkey Goose", - ), - # Simple range query? - ("[1 TO 4]", '["1" TO "4"]'), - # Dates ignored in ranges? - ( - "[* TO 2020-10-31T00:00:00Z] Monkey Goose", - "[* TO 2020-10-31T00:00:00Z] Monkey Goose", - ), - ("id:10", "id:10"), - ("id:[* TO 5] Monkey Goose", 'id:[* TO "5"] Monkey Goose'), - ( - "(Tempura AND 12cv3392) OR sushi", - '(Tempura AND docketNumber:"12-cv-3392"~1) OR sushi', - ), - # Phrase search with numbers (w/and w/o § mark)? - ('"18 USC 242"', '"18 USC 242"'), - ('"18 USC §242"', '"18 USC §242"'), - ('"this is a test" asdf', '"this is a test" asdf'), - ('asdf "this is a test" asdf', 'asdf "this is a test" asdf'), - ( - '"this is a test" 22cv3332', - '"this is a test" docketNumber:"22-cv-3332"~1', - ), - ( - '"this is a test" ~2', - '"this is a test"~2', - ), - ( - '"this is a test" ~2 and "net neutrality" ~5 and 22cv3332', - '"this is a test"~2 and "net neutrality"~5 and docketNumber:"22-cv-3332"~1', - ), - ) - for q, a in q_a: - print("Does {q} --> {a} ? 
".format(**{"q": q, "a": a})) - self.assertEqual(cleanup_main_query(q), a) - def test_query_cleanup_integration(self) -> None: # Dora goes to CL and performs a Search using a numbered citation # (e.g. "12-9238" or "3:18-cv-2383") diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 358f7c2725..7014aaa8c8 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -1818,10 +1818,6 @@ async def test_can_use_negation_in_queries(self) -> None: r = await self._test_article_count(search_params, 0, "negation query") self.assertIn("had no results", r.content.decode()) - search_params["q"] = "Howard !Honda" - r = await self._test_article_count(search_params, 0, "negation query") - self.assertIn("had no results", r.content.decode()) - search_params["q"] = "Howard -Honda" r = await self._test_article_count(search_params, 0, "negation query") self.assertIn("had no results", r.content.decode()) diff --git a/cl/search/views.py b/cl/search/views.py index 597ab2644c..4c10e94659 100644 --- a/cl/search/views.py +++ b/cl/search/views.py @@ -66,6 +66,7 @@ ) from cl.search.exception import ( BadProximityQuery, + DisallowedWildcardPattern, UnbalancedParenthesesQuery, UnbalancedQuotesQuery, ) @@ -556,6 +557,9 @@ def do_es_search( suggested_query = "proximity_filter" if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: suggested_query = "proximity_query" + except DisallowedWildcardPattern: + error = True + error_message = "disallowed_wildcard_pattern" finally: # Make sure to always call the _clean_form method search_form = _clean_form(