Skip to content

Commit

Permalink
allow to use md5-cached dirs
Browse files Browse the repository at this point in the history
  • Loading branch information
mam10eks committed Jan 27, 2025
1 parent 4ad80b1 commit 9f3132a
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -289,3 +289,4 @@ tira-web/tira-web/lib/

# TextMate
frontend/.editorconfig
python-client/tests/resources/local_cached_zip/extracted_datasets/
34 changes: 34 additions & 0 deletions python-client/tests/pd_load_data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,37 @@ def test_pd_load_truths_wows_re_ranking(self):
self.assertEqual(3, len(actual))
self.assertEqual("hubble telescope achievements", actual.iloc[0]["query"])
self.assertEqual("1", actual.iloc[0]["qid"])

def test_pd_load_truths_local_cached_zip(self):
    """Load truths with the ``local_cached_zip`` test resources as cache directory.

    Checks the number of rows (13) and the fields of the first and the last row.
    """
    client = Client(tira_cache_dir="tests/resources/local_cached_zip")
    truths = client.pd.truths("task-does-not-exist", "dataset-does-not-exist-20241201-training")
    self.assertEqual(13, len(truths))

    # Materialize both rows up front, then compare them field by field.
    observed_rows = [truths.iloc[position].to_dict() for position in (0, 12)]
    expected_rows = [
        ("5a8865b0-19d7-4b33-bbe3-2f64ad54557f", "1051399", 3376628, 0),
        ("449f69fa-df0e-4c9e-aead-61983ce9eaa8", "833860", 2830558, 0),
    ]

    for row, (row_id, query_id, doc_id, qrel) in zip(observed_rows, expected_rows):
        self.assertEqual(row_id, row["id"])
        self.assertEqual(query_id, row["query_id"])
        self.assertEqual(doc_id, row["unknown_doc_id"])
        self.assertEqual(qrel, row["qrel_unknown_doc"])

def test_pd_load_inputs_local_cached_zip(self):
    """Load inputs with the ``local_cached_zip`` test resources as cache directory.

    Checks the number of rows (13), the ids of the first and the last row, and
    that the columns asserted for the truths (``query_id``, ``unknown_doc_id``,
    ``qrel_unknown_doc``) are absent from the inputs.
    """
    tira = Client(tira_cache_dir="tests/resources/local_cached_zip")
    actual = tira.pd.inputs("task-does-not-exist", "dataset-does-not-exist-20241201-training")
    self.assertEqual(13, len(actual))
    first_line = actual.iloc[0].to_dict()
    last_line = actual.iloc[12].to_dict()

    # assertNotIn names the offending key and the row on failure, whereas
    # assertTrue(... not in ...) only reports "False is not true".
    self.assertEqual("5a8865b0-19d7-4b33-bbe3-2f64ad54557f", first_line["id"])
    self.assertNotIn("query_id", first_line)
    self.assertNotIn("unknown_doc_id", first_line)
    self.assertNotIn("qrel_unknown_doc", first_line)

    self.assertEqual("449f69fa-df0e-4c9e-aead-61983ce9eaa8", last_line["id"])
    self.assertNotIn("query_id", last_line)
    self.assertNotIn("unknown_doc_id", last_line)
    self.assertNotIn("qrel_unknown_doc", last_line)
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[
{
"id": "dataset-does-not-exist-20241201-training",
"dataset_id": "dataset-does-not-exist-20241201-training",
"default_task": "task-does-not-exist",
"default_task_name": "task-does-not-exist",
"display_name": "Display Name",
"is_confidential": false,
"is_deprecated": false,
"mirrors": {
"truths": {
"Zenodo": "URL does not exist"
},
"inputs": {
"Zenodo": "URL does not exist"
}
},
"dataset_extraction": {
"truths": {
"md5sum": "608b4b658c190d9c3bd840d43653f021",
"subdirectory": "pointwise/labels"
},
"inputs": {
"md5sum": "608b4b658c190d9c3bd840d43653f021",
"subdirectory": "pointwise/inputs"
}
}
}
]
44 changes: 42 additions & 2 deletions python-client/tira/rest_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import json
import logging
import os
import shutil
import tempfile
import time
import zipfile
from functools import lru_cache
Expand Down Expand Up @@ -533,6 +535,8 @@ def download_dataset(self, task, dataset, truth_dataset=False):
data_type = "training" if dataset.endswith("-training") else "test"
suffix = "inputs" if not truth_dataset else "truths"
url = None
expected_md5 = None
subdirectory = None
if (
not meta_data
or "mirrors" not in meta_data
Expand All @@ -543,6 +547,10 @@ def download_dataset(self, task, dataset, truth_dataset=False):
else:
url = list(meta_data["mirrors"][suffix].values())[0]

if "dataset_extraction" in meta_data and suffix in meta_data["dataset_extraction"]:
expected_md5 = meta_data["dataset_extraction"][suffix]["md5sum"]
subdirectory = meta_data["dataset_extraction"][suffix]["subdirectory"]

target_dir = f"{self.tira_cache_dir}/extracted_datasets/{task}/{dataset}/"
suffix = "input-data" if not truth_dataset else "truth-data"
if os.path.isdir(target_dir + suffix):
Expand All @@ -551,9 +559,12 @@ def download_dataset(self, task, dataset, truth_dataset=False):
if not url:
url = f'{self.base_url}/data-download/{data_type}/input-{("" if not truth_dataset else "truth")}/{dataset}.zip'

self.download_and_extract_zip(url, target_dir)
if expected_md5 and subdirectory:
self.download_and_extract_zip_with_md5(url, target_dir + suffix, expected_md5, subdirectory)
else:
self.download_and_extract_zip(url, target_dir)

os.rename(target_dir + f"/{dataset}", target_dir + suffix)
os.rename(target_dir + f"/{dataset}", target_dir + suffix)

return target_dir + suffix

Expand Down Expand Up @@ -626,6 +637,35 @@ def evaluate_run(self, team, dataset, run_id):

return ret

def download_and_extract_zip_with_md5(self, url, target_dir, expected_md5, subdirectory):
    """Materialize a dataset directory from a locally archived zip file.

    The archive is looked up at ``<tira_cache_dir>/.archived/<expected_md5>``.
    This method does not download anything and does not hash the archive;
    ``expected_md5`` only serves as the file name of the archive.

    Args:
        url: Not used by this method; kept so the signature stays unchanged
            for existing callers.
        target_dir: Path that should hold the extracted content afterwards.
            Missing parent directories are created.
        expected_md5: File name of the zip archive inside ``.archived``.
        subdirectory: Path inside the archive whose content is moved to
            ``target_dir``. If empty, all files of the archive are used.

    Raises:
        ValueError: If ``expected_md5`` is empty, if the archive does not
            exist, or if no file in the archive matches ``subdirectory``.
    """
    if not expected_md5:
        raise ValueError("No md5 checksum was passed, so the archived zip file can not be located.")

    archive = Path(self.tira_cache_dir) / ".archived" / expected_md5
    if not archive.exists():
        raise ValueError(f"The archived zip file '{archive}' does not exist.")

    # Match on a complete path component so that "a/b" does not also select
    # members of a sibling such as "a/b-old/". An exact match is still
    # accepted so that a subdirectory value naming a single member keeps working.
    exact = subdirectory.rstrip("/") if subdirectory else ""
    prefix = exact + "/" if exact else ""

    # The context manager closes the archive handle on success and on error.
    with zipfile.ZipFile(archive) as z:
        members_to_extract = [
            i
            for i in z.namelist()
            if i and not i.endswith("/") and (not prefix or i == exact or i.startswith(prefix))
        ]

        if not members_to_extract:
            raise ValueError("I found no files in the zip.")

        # Extract into a scratch directory first so that only the requested
        # subdirectory ends up at target_dir.
        with tempfile.TemporaryDirectory() as tmpdirname:
            for i in members_to_extract:
                z.extract(i, path=tmpdirname)

            src_dir = Path(tmpdirname)
            if subdirectory:
                src_dir = src_dir / subdirectory
            Path(target_dir).parent.mkdir(exist_ok=True, parents=True)
            # Plain strings avoid the pathlib handling issues of shutil.move
            # in Python versions before 3.9.
            shutil.move(src=str(src_dir), dst=str(target_dir))

def download_and_extract_zip(self, url, target_dir, extract=True):
url = redirects(url=url)["urls"][0]
if url.split("://")[1].startswith("files.webis.de"):
Expand Down

0 comments on commit 9f3132a

Please sign in to comment.