From 9f3132a7eb3ceb749d6f393abf946084986687d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Mon, 27 Jan 2025 20:19:39 +0100 Subject: [PATCH] allow to use md5-cached dirs --- .gitignore | 1 + python-client/tests/pd_load_data_test.py | 34 ++++++++++++++ .../608b4b658c190d9c3bd840d43653f021 | Bin 0 -> 10018 bytes .../.archived/v1/datasets/all | 29 ++++++++++++ python-client/tira/rest_api_client.py | 44 +++++++++++++++++- 5 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 python-client/tests/resources/local_cached_zip/.archived/608b4b658c190d9c3bd840d43653f021 create mode 100644 python-client/tests/resources/local_cached_zip/.archived/v1/datasets/all diff --git a/.gitignore b/.gitignore index 568f9df5..f11e9d80 100644 --- a/.gitignore +++ b/.gitignore @@ -289,3 +289,4 @@ tira-web/tira-web/lib/ # TextMate frontend/.editorconfig +python-client/tests/resources/local_cached_zip/extracted_datasets/ diff --git a/python-client/tests/pd_load_data_test.py b/python-client/tests/pd_load_data_test.py index 17bfd4d5..25c4f899 100644 --- a/python-client/tests/pd_load_data_test.py +++ b/python-client/tests/pd_load_data_test.py @@ -19,3 +19,37 @@ def test_pd_load_truths_wows_re_ranking(self): self.assertEqual(3, len(actual)) self.assertEqual("hubble telescope achievements", actual.iloc[0]["query"]) self.assertEqual("1", actual.iloc[0]["qid"]) + + def test_pd_load_truths_local_cached_zip(self): + tira = Client(tira_cache_dir="tests/resources/local_cached_zip") + actual = tira.pd.truths("task-does-not-exist", "dataset-does-not-exist-20241201-training") + self.assertEqual(13, len(actual)) + first_line = actual.iloc[0].to_dict() + last_line = actual.iloc[12].to_dict() + + self.assertEqual("5a8865b0-19d7-4b33-bbe3-2f64ad54557f", first_line["id"]) + self.assertEqual("1051399", first_line["query_id"]) + self.assertEqual(3376628, first_line["unknown_doc_id"]) + self.assertEqual(0, first_line["qrel_unknown_doc"]) + + self.assertEqual("449f69fa-df0e-4c9e-aead-61983ce9eaa8", last_line["id"]) + self.assertEqual("833860", last_line["query_id"]) + self.assertEqual(2830558, last_line["unknown_doc_id"]) + self.assertEqual(0, last_line["qrel_unknown_doc"]) + + def test_pd_load_inputs_local_cached_zip(self): + tira = Client(tira_cache_dir="tests/resources/local_cached_zip") + actual = tira.pd.inputs("task-does-not-exist", "dataset-does-not-exist-20241201-training") + self.assertEqual(13, len(actual)) + first_line = actual.iloc[0].to_dict() + last_line = actual.iloc[12].to_dict() + + self.assertEqual("5a8865b0-19d7-4b33-bbe3-2f64ad54557f", first_line["id"]) + self.assertTrue("query_id" not in first_line) + self.assertTrue("unknown_doc_id" not in first_line) + self.assertTrue("qrel_unknown_doc" not in first_line) + + self.assertEqual("449f69fa-df0e-4c9e-aead-61983ce9eaa8", last_line["id"]) + self.assertTrue("query_id" not in last_line) + self.assertTrue("unknown_doc_id" not in last_line) + self.assertTrue("qrel_unknown_doc" not in last_line) diff --git a/python-client/tests/resources/local_cached_zip/.archived/608b4b658c190d9c3bd840d43653f021 b/python-client/tests/resources/local_cached_zip/.archived/608b4b658c190d9c3bd840d43653f021 new file mode 100644 index 0000000000000000000000000000000000000000..27ed1429166cb8b0eaec4374e1401df06b5abc7e GIT binary patch literal 10018 zcmb7~by!^4vbP%xF2UX1-6aGG1PSgmH0~N)0>RxOKyZh~-Q6uXK?6a8TjP?CIdkqg znKSd9JNM~lTmSK{dTQ0K+O>X5Ne%`U8vpS~Apn59oC=4a1`i#sv^3ibyK0Pv3! z@~N5Gh7rJjPkmk)MaC~dJUEA(qqbPMmS0_rJ21ijtKAX=61g`sBgfQ4fUgZ?=VJE@ z12nx9ohz0`ASa4T_>`EYio#JsW--EgNE-&YbYx$FOH)g3eb?q6=AV-g1Z9C%D2wry zt&Crc4c=xymtbb*oV$g^jXA z#*;;mthTOC2g==9AM|g=`!Z8c=5^k1hzekBJ-L?c2T8nnjGib>T~xj_=|149Ue@1X zpayEUdLE0ztEZa|UfPlH9|Aw@HGh}eiDBa_`5@3J}40H!pPco%WF#>iOqy3Cj^b1bF@XS!>N(qjjk@>D)7lGyaa<8U7C3F zwyui^ucy*)DOR<;wOT71}kfNk*()eLbab(T-_}+q(UBKTFVg)vPiT$g+1SG4;i|Wjy`k zi-UxPO>|##Y!BX?MU9grm!(Rs6En4vA1-HR5be)R&h%D!(nIhaC+f!qpDPRCk^_&@ z*v+N7y#tKlc`y_V! zI6ohe-p_8?Q(J`G=6~C36>&)2JCn&H%F9DS!b2~}2i*)R=Oy@P! zZVs$mGPb)V+lIO`9me;g^F+ww<%+_?7L8uFJEsRFiuQ`fU{9Z$HA|Q?D$hnP8zL#= zo7oYmI_+J>PY+kKZNfpU$rTZ8$j!Hb3SwY>@Xh_u?spvYK(w{k$D7mU5XoAvCgc0m zaf3;&)vrfbeB&G5Mx$O6tsL;CcncOnAH;GWn9c)CD(&&}Bu3&6Y6M#GaJTs7Sm@U& zz{`xS2e0rCA||>7xUg5L#$g-q7}KQJ+6Lo)TF!(Vb{%kDWWG4*G#+G~Qg51+7Rp;> z>~c!c{x~FGAcNj)YD(8vn7j-58X>mf#04s`Y3XWTZP?YRxKcM3_l;=(EEG*7>cLzq z* zaIxnK;Qb{LN9RE@!HY79e?1EyIKs^Hi(AD<>-oF1GsLV2zq`ninZJw=C2)Lc-c zywh0IzAK{pOUV37Nvb&rI5v$m#_`iOevRYA&_K6wD2_@ghzCbA1Dzx3WoQcbl@SLH zPmLCZxOhWrLMlO3;y_$MJZVXia<$j#V{+QbdMnj_nUNkgWg$C%VCUW(VA zFx+G4rA$}qr(sR#3E=b%MtXJr0J(39ZWWi|sApnJ8$o(?Uq1yC5p_osewzP-?jboT zKX&G8in&F~pX%4Y5%SNK@mB=K zuLv8Ek*SUI@1&S#O3Po9ey_37*+M!9$=!#t&;WoPEZ~{*i!uKs{eEG!a<;d#`GA8yxFm)Ex3$wnWShdpr1a8 zpTP_mhIAqMHzG1SZj?(~B-E((tt8)DU4~mnFN5t%U3v-i>)orfAqfD~ zfDHH}zxm8AS}aJ=eU${}3ewiU?0yxBk|#U%jV12Vg0MA|5U$!>r{0&ME&RBqm#?s; z;I23)CP4Kp{q^@%@exNWky?`k=7b@Mg}z8b@y7A!7%C?u80(i02rX2$Uig9DZv(Pk zDohR}6u8-Zd0K>5ocRn7t8h5G58TP@<_jM?xHg#~*P_c8wyO&=1A>`dn;`jH12+Vy zdq`rwcfx~$l4QY9IORbOB!YqWTDeI@4kQq`Ww)r8`vMUTSX4lygoGI{3m$_C&XVw> z3dVD#OU6v#P(^mrq6X0o=HVKt-9s|#o3h$woJ71+AQ2q4_Z$`)>K;@0^FMm59y)rC z+BQIV$@Tyy4n)NDe&Z8nZ)u%KfplGQm&EMFR{?EiJrlbsu&n~fFHasqFtw8(D}6ch z6A*#3ra*OzS)2His(7e5&kpK|4I7&@;3rq z;fL3;^xv&@=c~BX zir1&IsMn^K&svF;RpoxYbZrK1x^`$yTd8zyX;PD__U7~hmLx0?1$SW}6)`Ue2zRZ% z;68|lm>UH&C`QQ#zo?_Q6m3hy?DLOI714I9njRximO84Z`SNkJB0AhV?v>Y8w)&1T zOB|gHeyPg{0g76Wv6bu83)1zw1YGob($IG?A~|lj3Q$O?Z)~rjS3b#T$Yg^Sx{LhG zn7>iwZ8^=5IEJ`ldk^TqX8>N=+~A?jmyQ!M;pteQpl#BWNqpC$gJCdAl#Rq_kFD=Z z5)0q2uS%JiKa+7!SWPOjSL7MoUlUJA6yqML27^yNx45=>ofjRcBwr?G! zS+jf7$gjQ;PPzIirWaA(?uF?d{?v8|$cWe#MNr&XdC3n-nLp*c#_W*ot6Yyzpgk2Q za$d7QopY~qs>_qdrg0;FV>FfI>pO7!xq;#Ci;IaV%SR%54B?%4aGT$9KHFE{Vy)#} z&R$gdHq64A2wpPHmNo|Fii!oMn|PGfEY>TaUGLY3W(MZUZ|%m&nBFchE=lxS%&a?w zRYE(aJQ1r#mx37ks4G9jiEb*cZBI{AO(5PJd_kAmANv)hT_ToZQdFQ@TM**dZbRlw zwVx_QZ~qXwEB}$Bk$%gp_Lg?<|DETvo_mz+KldsB4u99@?EGtS&WlPNgQwqe>}M-QOP`$A0!&=;oIIm#?aiAE8=A{Oc z&THB~?ohu0;nJ9fiA#v+kASu+`#UcoKjQnWMUWVP1X2=Iv*c@W7$ZiZG+`Oa-(><~ z-4_{+-++qe3v;B^yw13SuQx@Gc%8Ju$)a@1#IGKo?zl#+NJ?mRY*&=jl2hGt4qthl zy*o>|Qzewo<#nRYELjhY@njQ?Tsp*i9hDnEkFT3hoZS_C9A75gCQ;Sa7;!Ku#Y}44 za303B>B;{x<~uu57IU98uq#R!8QC_9cwbT56L#>ed*67CaO(l^*wdkI#n<}%QOafj zzg&-v`uU7y6F$W5+Y}i*W(&?qRit)!@cOJ-*?4G-RC&l$u@zww={GVK-;^xTFcOqL zuu;viYHLze5>LZTjb=UJH8H8Lsoaa5o?Ra8>k(nJPMdml#p84KTW-S+IYM`}EBLbv z?xE`T`=$5y)yw?oZZ+atUY2imV{?w^jIs{8nZh((XRqbXM&Lc4zna{wC{V-XOv=3Y z92B)f-^P$srfPd|0$m~@BSz(hh8|n}RCsj5jFaDc`~j4w%Tq+|#810qAIAN#c+036 zmbuAgwPTZ|NE01ZUzpB;bJdB=YzZ@=c2W2ZLRydE_{ju*X5gXLo9InLYim77qbkk$ zM@wrgiAZnLGfD_)vGUoVm{(iSIHedkW-<77rdlalN~4exS-2g3by2L9@#^CiA!1E2!!mP{xULy? zJ}jonydz*sjP72K~^X>ikBsD&EJU z#AH2-Tu!#+&6FHXysai_bo*{wX422usxEXo@!w5OpM=qu4}XNm2jC`w*}Q+uClD4u zQN_+rh^@3n`Gflv^hKC-+z`OLE^l19$vGYx)KD0^no4)KdvW#KDO{0EdL#Up*y_`o z6IHiioBD=J=@q9VZwU29aCpiXkpn368D$da5s|_#3~`1XXhv~uXoLbr3G_P9+*XnG zp8Cf?12N+!>f@Q^Ld~Y^GMH-}oeR!kXL@M^zI13~fnqu3H7y`J$Hfz@tWrf%Cm7Sw z^fyFP)<-;KeHv-XbE@ckqZouaARj0h*Ml(Ae-}mAAguKm zUbv={+#xsyLI(*>pEpU~^x&Jr+id3@fI=}pq4&+mmU?tLxd2YEwJLW7WqP_V>5kST z(jBmr5mYrzS7)q+wbDJ2N}!}kX^{n-$R+ezq)w%8q(Z5T|BRD z2?!6~YSiNcjW`yK!;zb`zlZ>lZlgMLR^-i;L=|+1s8eoR>d|iOcISuYaU(0^_j}L> z)WvefI?@I-<#OvvX(#gC639;&p1U+101eOE*ZcwGB{`Q|`AmoiwVs58vuFV9fTH+% zNZpo7+=T#_zj#a_Gf8WEZ3(4@!o6LkIk9RUBxm>F26`Xo3cv5AGtBnuF0u>rSiS`X z=AjGZzzGjxm3hHeLA|V7pZtwiL-)d5EwOR6u`^!z{UC(eR2#T0)q=C_N5I(8Rtmy~ zw{2&vVT?yq_K^1Dg7zkPj%a9aKVjUk>jif*Ri#N4mzW1*AC^{~a2GZ{LUsS3NhLXQ z7*M-49dn=VG|XG#Gd*WSXq>S$ePc+l?x#GcKw{6&%Bdmjsa}*77!L4ME1Z9Jr~YzRbT0!5-97_*7yuR$;e;z^HxD2~FmJer8)ZgyJb~)V@HsE_ z10J@(oOoKJC7_P+qhNSX@mLSSkTdf8=(mb5=E~P5wzFLkL7*i%&QJqqN_t?HoCmVz~To*1O$nj5NmDWqS_6sy<=)2w21-15+1&;Q$Df-?J`DY3CaXtPp9N>eu9YR2?@n@y+j5eBh6&h~s4N`mEjGT~yE(IZi@!F56vE zCDw`{0IC=@fkUSdKlfVUId!)|lqJf>)yz;9lO_Haht4LvMte%Auy`*$l=I%g6_^#5Xi2@kP#dBf9a0_m^njz*96c3u6~(`vY>}RIyQ0bYrLFXoTi6aZj0;$>}Bn}0@r0EiAOa!7x>?sK3WC( z+4mtI`5vWy0E9zv$osROy4mVSpM{?iekA32io9-BEsvyrt65YUlSgW$#$!39;SsCJ zV+io3D9#l|D+M#ou#e`dy9fJop?F*3yNBC0XY%_D-PQTYo7!2e(=R*H8=xJKewbjw zK!B?73p87(7e3xxW0>C^r}v@>DbuvCpl^=L;%{#_{rtI~t%BLJC@omCr0#pUc8g^| zIEW!r!I1qF-rJnHxu&08C_eX18?QV=7kBKvBI!sb_T;#ERSNWPZI)n0Jg$BRHb(UyoD{-tkLwCIFv zeBgU;zq=)QFO%6G{RP5n-3IiE(kL5&QYCjn{5XUusZ6%OIH79g$9|QS&u$0DVK>f3 znTILj9{b-H(o{G1(rAPe=GI`Ab+s?TD%{IZe;|d1V(qy&n#OSLSa`a7?s_&AT)th& znDR)a^>7dOBc7w6;fgF3=M1qW$X8c|Go4nc(lNCpr8Vu)S+(>1%ysv9>gP#hrWg0c zeUTf4YI!759p38i`b&rWkq9X&!> zQbNrT;f$1}dV^eaD)F-4BRSj+eDtoiK`d^=THY;61Y4Fl({i@@=8TkRwUCl3_s2wa zNG|02afNB(+Jk?I=4Nw?TKi4vwFc7~CB^BszA?I#s}d;I`^|Zfn(K!JjJ`s*a0dV^ zVW%Fe*!4r7{s<*v?cs+9Hlm+!f49H*wZHyb{lzBwN-`ND0D$@2a{RHs_&+w-y?Ir=otVOi5i!T6!D4#6eo%J?v62Jnq+3XncR@XrSB+L>(|%}<|`to%wG^N z$vG5o%M{f!1R(}^I{FXPtVr4mrc;Ho^TmwA%j2|`x=Yjt9b*NI=efmq5loI}gYw5i z@Mr-zz!+UNSJyV&nO8!XYtg!EJTVJ-B9Uz;-(`BPHbr>#&w*ly?aR7+!;Y94By>f~ z9@DN6qnb^z^JPBASRl$$#Mh8KUp663C$%Q~M!uGr8RZ`wHeVAC%9n?$smL0b4>I~m zfbgwlGus!T_jVRr>XwQ-@Ov2D9G_i#9SL2vKEkLVGDJeKJ_deW8w=D8B*{xybRsv_ z*sKrcaBdS^b@Hu`jqXFmK=w)ij3ikcafe>HJ7wy>WcJv!5b&@!vH>*LVOSwkDK*ga`|GX~2k&KxuQ7NFcB0%ov` z&}BaRA%5~67*Y1lhN&6bGfmI@M!4^vAA?uTw)+Vu)#{EiMRpxKaVxOLuEix2q7O_W zU=$i9)0k*%^Pp4^I>}sfoPx*0$pJ&zm}hJvwO|e7bZ|Oky&g7G(5~#W&zoAn-d`9( zjD2xYTr2Jt@IzLyw)bh9^aW#`UveH!`C$bWa9R7^bk0cr^=L?yroduL%aQeA-H~B* zPm3M7m;m>SljO9GH#~L0r*Tww9TLRcNc?H9IPJl}JFX_D%-}*w^?IqaS%!Mqj$vL| zg#*qK&Qy(>y|Nl+(!}5o&OEF#CuWsG($z#;nV@BxChE1tf^GwuqKC;%wIlDqt0mq3 zOV83A1r)}`-f{A%#uOnCxIOxa?P^+~B^~vPq-~~t8{auXF&a$xb$!BV+C?LAU`h^< zZVL?s&K^iGT;A{C(~xO5B(M3p0k@x1W`>NqF|GE(mH+xZO(uc;$x0jk?!$X}6cLK{ zYK=7{v3n60sJttDpIE>6fYTX|@S0q3x^Fq=DT1KXHteQmV!mn7G-c(eu#aJQelzXt zhUOiXdT`Bhylxm??}~l|`h0UNQnWK>=i~wR0g#01wbNH(4JdF1DpLAuj4>4Ad4e2v z+Rr*W1Y(N;O+PpJvhgE{!xd(P#;f|xZuYlO=N&%TpHpneAYGqC>>4D?X1^!n$Bbo+ z7vr$r7}+yi<>^|pn<7|A=cqCac#QHz^V}VV>{cxtFte$AnQoCcjbBiqbY1sFyEiFR-{J!gawdqJWvypXM{bF^yd2j1Pi+cPw^#yUj~=~{{D z5Gg8j2G2L^rKs!E_>5{5n8wvHl#B1E>(`~OVDO;#>?2=QSsQM$Sbcwch_Lz<6_=9g z@OtG|#Rip@)N`7O|2?O-R4L4newZ69$3~(%C>7ukx4b#J_LT&^U>NFfW)^z1=v8>L zP#p*ZPMFKKc-ZobmNa@DEUYq*g>5lTbpj(=5KxF1Ndn6IO9&#AANpvb|DHc|fsj|C z*+*)q>M1fVy5Xg4W9Z>Wt2Zob2Z-v!`H+D3fpG;xgxNx8LYyr`_yq=+on#z7p-V>1Wk%JDFgqow+ zBIw?x@e!A4Tw9oUwi?DqkcVe*&#_r_j5mZxx~|n$+7ey78ncod?rgoEStZmCW>cO@ zq>_trHT#%Eje)G#YP0DlH^ZVN zchLZt*y4$a3Ds4<{a->G)Ar0^x{`a14TX^XhHtSblq8( zjlpehHO7uCYvKeliL%@TK8Rza9|c-T=iCsci_WIPoJGb(mKd`$u26w{r3z`NCc5f4 z9;tR|HKoUjJ@67wQ>bwjUc8(9Ta656ct~3#kJaarDZd}*$kJy*$a<7S`j2YQscEi8bMnT(&&@C{^VE2 zboP4Pf=blc$1_w065Q%`R86nQ8X3R7#YRCCea+Hs<69Y~MDeY3&g$}K#~aQkcqKU~ zXi}Kp*LDBANe2I4E5`Z$zuo@tTV~*X-!k(ba^CNw{Qoq|{}}}Xa3sF)yg}y|1@jsA zn}YdYaDUb>f5j2P|APD1BKI@(H#PGwssCQ${-c)pI~D5L3Hkmee?k4@diV1Pf1zsr z>W2SrH_WrI{^f>$eLVj?yZOE|5=Op)!!W`&vyT!Og!^`Qzrh> z)4zwqpLK~}dC;i8@_wDb-@1su#Qu8%|EN*?!4ZFj{lCuOf4AI-;8)9kox;Ci`5(KV fUo9{HE9~E!L%8Qx3jjcV{_Q<)W22(_?dksk5x%VL literal 0 HcmV?d00001 diff --git a/python-client/tests/resources/local_cached_zip/.archived/v1/datasets/all b/python-client/tests/resources/local_cached_zip/.archived/v1/datasets/all new file mode 100644 index 00000000..4aec5ebe --- /dev/null +++ b/python-client/tests/resources/local_cached_zip/.archived/v1/datasets/all @@ -0,0 +1,29 @@ +[ +{ + "id": "dataset-does-not-exist-20241201-training", + "dataset_id": "dataset-does-not-exist-20241201-training", + "default_task": "task-does-not-exist", + "default_task_name": "task-does-not-exist", + "display_name": "Display Name", + "is_confidential": false, + "is_deprecated": false, + "mirrors": { + "truths": { + "Zenodo": "URL does not exist" + }, + "inputs": { + "Zenodo": "URL does not exist" + } + }, + "dataset_extraction": { + "truths": { + "md5sum": "608b4b658c190d9c3bd840d43653f021", + "subdirectory": "pointwise/labels" + }, + "inputs": { + "md5sum": "608b4b658c190d9c3bd840d43653f021", + "subdirectory": "pointwise/inputs" + } + } + } +] diff --git a/python-client/tira/rest_api_client.py b/python-client/tira/rest_api_client.py index 6f249525..606f27f3 100644 --- a/python-client/tira/rest_api_client.py +++ b/python-client/tira/rest_api_client.py @@ -3,6 +3,8 @@ import json import logging import os +import shutil +import tempfile import time import zipfile from functools import lru_cache @@ -533,6 +535,8 @@ def download_dataset(self, task, dataset, truth_dataset=False): data_type = "training" if dataset.endswith("-training") else "test" suffix = "inputs" if not truth_dataset else "truths" url = None + expected_md5 = None + subdirectory = None if ( not meta_data or "mirrors" not in meta_data @@ -543,6 +547,10 @@ def download_dataset(self, task, dataset, truth_dataset=False): else: url = list(meta_data["mirrors"][suffix].values())[0] + if "dataset_extraction" in meta_data and suffix in meta_data["dataset_extraction"]: + expected_md5 = meta_data["dataset_extraction"][suffix]["md5sum"] + subdirectory = meta_data["dataset_extraction"][suffix]["subdirectory"] + target_dir = f"{self.tira_cache_dir}/extracted_datasets/{task}/{dataset}/" suffix = "input-data" if not truth_dataset else "truth-data" if os.path.isdir(target_dir + suffix): @@ -551,9 +559,12 @@ def download_dataset(self, task, dataset, truth_dataset=False): if not url: url = f'{self.base_url}/data-download/{data_type}/input-{("" if not truth_dataset else "truth")}/{dataset}.zip' - self.download_and_extract_zip(url, target_dir) + if expected_md5 and subdirectory: + self.download_and_extract_zip_with_md5(url, target_dir + suffix, expected_md5, subdirectory) + else: + self.download_and_extract_zip(url, target_dir) - os.rename(target_dir + f"/{dataset}", target_dir + suffix) + os.rename(target_dir + f"/{dataset}", target_dir + suffix) return target_dir + suffix @@ -626,6 +637,35 @@ def evaluate_run(self, team, dataset, run_id): return ret + def download_and_extract_zip_with_md5(self, url, target_dir, expected_md5, subdirectory): + if expected_md5 is None or not expected_md5: + raise ValueError("foo") + + if not (Path(self.tira_cache_dir) / ".archived" / expected_md5).exists(): + raise ValueError("foo") + + z = zipfile.ZipFile((Path(self.tira_cache_dir) / ".archived" / expected_md5)) + + members_to_extract = [] + for i in z.namelist(): + if i and not i.endswith("/") and (not subdirectory or i.startswith(subdirectory)): + members_to_extract.append(i) + + if len(members_to_extract) == 0: + raise ValueError("I found no files in te zip.") + + with tempfile.TemporaryDirectory() as tmpdirname: + for i in members_to_extract: + z._extract_member(i, Path(tmpdirname), pwd=None) + + src_dir = Path(tmpdirname) + if subdirectory: + src_dir = src_dir / subdirectory + Path(target_dir).parent.mkdir(exist_ok=True, parents=True) + shutil.move(src=src_dir, dst=target_dir) + + return + def download_and_extract_zip(self, url, target_dir, extract=True): url = redirects(url=url)["urls"][0] if url.split("://")[1].startswith("files.webis.de"):