From 5236cab992bcf89ad218f01a45ac441757e37c84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Wed, 29 Jan 2025 17:56:31 +0100 Subject: [PATCH] mf --- python-client/tests/format_check/__init__.py | 1 + .../tests/format_check/test_check_jsonl_format.py | 11 +++++++++++ .../resources/jsonl-valid-gz/predictions.jsonl.gz | Bin 0 -> 41 bytes python-client/tira/check_format.py | 6 ++++-- python-client/tira/pandas_integration.py | 1 + python-client/tira/third_party_integrations.py | 5 +++-- 6 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 python-client/tests/resources/jsonl-valid-gz/predictions.jsonl.gz diff --git a/python-client/tests/format_check/__init__.py b/python-client/tests/format_check/__init__.py index a482ee93..54d73773 100644 --- a/python-client/tests/format_check/__init__.py +++ b/python-client/tests/format_check/__init__.py @@ -16,4 +16,5 @@ TSV_OUTPUT_VALID = RESOURCES / "tsv-valid" IR_QUERY_OUTPUT = RESOURCES / "query-processing-outputs" / "query-segmentation" JSONL_OUTPUT_VALID = RESOURCES / "jsonl-valid" +JSONL_GZ_OUTPUT_VALID = RESOURCES / "jsonl-valid-gz" JSONL_OUTPUT_INVALID = RESOURCES / "jsonl-invalid" diff --git a/python-client/tests/format_check/test_check_jsonl_format.py b/python-client/tests/format_check/test_check_jsonl_format.py index d3dde92c..479fedee 100644 --- a/python-client/tests/format_check/test_check_jsonl_format.py +++ b/python-client/tests/format_check/test_check_jsonl_format.py @@ -7,6 +7,7 @@ _OK, EMPTY_OUTPUT, IR_QUERY_OUTPUT, + JSONL_GZ_OUTPUT_VALID, JSONL_OUTPUT_INVALID, JSONL_OUTPUT_VALID, TSV_OUTPUT_VALID, @@ -30,6 +31,16 @@ def test_valid_jsonl_output_file(self): actual = check_format(JSONL_OUTPUT_VALID / "predictions.jsonl", "*.jsonl") self.assertEqual(expected, actual) + def test_valid_jsonl_gz_output_directory(self): + expected = [_OK, "The jsonl file has the correct format."] + actual = check_format(JSONL_GZ_OUTPUT_VALID, "*.jsonl") + self.assertEqual(expected, actual) + + def test_valid_jsonl_gz_output_file(self): + expected = [_OK, "The jsonl file has the correct format."] + actual = check_format(JSONL_GZ_OUTPUT_VALID / "predictions.jsonl.gz", "*.jsonl") + self.assertEqual(expected, actual) + def test_invalid_jsonl_output_directory(self): actual = check_format(JSONL_OUTPUT_INVALID, "*.jsonl") self.assertEqual(actual[0], _ERROR) diff --git a/python-client/tests/resources/jsonl-valid-gz/predictions.jsonl.gz b/python-client/tests/resources/jsonl-valid-gz/predictions.jsonl.gz new file mode 100644 index 0000000000000000000000000000000000000000..60fb24bcc2d563422ad70f3f7c9769e80722ba37 GIT binary patch literal 41 tcmb2|=3oE==G9@Rd;&KG8LWA_s%=Bh1D#}x(@UmtGce7K)6oE`000km4OajF literal 0 HcmV?d00001 diff --git a/python-client/tira/check_format.py b/python-client/tira/check_format.py index 0641f1ac..b4b68780 100644 --- a/python-client/tira/check_format.py +++ b/python-client/tira/check_format.py @@ -109,10 +109,12 @@ def check_format(self, run_output: Path): return [_fmt.ERROR, str(e)] def all_lines(self, run_output): - if str(run_output).endswith(".jsonl") and run_output.is_file(): + if (str(run_output).endswith(".jsonl") or str(run_output).endswith(".jsonl.gz")) and run_output.is_file(): matches = [run_output] else: - matches = [run_output / i for i in os.listdir(run_output) if i.endswith(".jsonl")] + matches = [ + run_output / i for i in os.listdir(run_output) if i.endswith(".jsonl") or i.endswith(".jsonl.gz") + ] if len(matches) != 1: raise ValueError( diff --git a/python-client/tira/pandas_integration.py b/python-client/tira/pandas_integration.py index e6b7e674..74870268 100644 --- a/python-client/tira/pandas_integration.py +++ b/python-client/tira/pandas_integration.py @@ -43,6 +43,7 @@ def from_retriever_submission( pd.DataFrame: The run file parsed to a pandas DataFrame. """ import pandas as pd + from tira.ir_datasets_util import translate_irds_id_to_tirex task, team, software = approach.split("/") diff --git a/python-client/tira/third_party_integrations.py b/python-client/tira/third_party_integrations.py index e3702d12..21ca0533 100644 --- a/python-client/tira/third_party_integrations.py +++ b/python-client/tira/third_party_integrations.py @@ -150,7 +150,7 @@ def persist_and_normalize_run( if upload_to_tira and not in_tira_sandbox(): tira = _tira_client(tira_client) tmp = tira.get_dataset(upload_to_tira) - if not tmp or 'dataset_id' not in tmp: + if not tmp or "dataset_id" not in tmp: upload_to_tira = None else: upload_to_tira = None @@ -165,6 +165,7 @@ def persist_and_normalize_run( output_file = Path(output_file).parent upload_run_anonymous(output_file, tira, upload_to_tira) + def _tira_client(default_tira_client=None): if in_tira_sandbox(): return None @@ -177,7 +178,7 @@ def _tira_client(default_tira_client=None): return RestClient() -def upload_run_anonymous(directory: Path=None, tira_client=None, dataset_id=None): +def upload_run_anonymous(directory: Path = None, tira_client=None, dataset_id=None): tira = _tira_client(tira_client) if not tira: return