Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the company's business scope from the business registry (ARES) to the output #218

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 42 additions & 3 deletions data/ares/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from urllib.request import urlopen, urlretrieve

import lxml.etree
import re

BASE_URL = "https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz"

Expand All @@ -21,15 +22,40 @@ def attr(root, parts, nsmap):

return ret

def attri(root, parts, nsmap, fnc):
    """Merge selected child elements of *root* into one JSON object.

    For every tag name in *parts*, the matching ``are:``-namespaced children
    are passed to *fnc*, which yields JSON-encoded dicts.  Values that share
    a key are concatenated, each followed by a ';' separator.

    Returns a one-element list holding the merged dict as a JSON string
    (list form matches how callers extend their output row).
    """
    merged = {}
    for tag in parts:
        elements = root.findall("./are:%s" % tag, namespaces=nsmap)
        if not elements:
            continue
        for encoded in fnc(elements):
            for key, value in json.loads(encoded).items():
                # accumulate repeated keys as "v1;v2;…;" (trailing ';' kept)
                merged[key] = merged.get(key, "") + value + ";"
    return [json.dumps(merged, ensure_ascii=False)]

def _clean_text(text):
    """Normalise element text: collapse newlines/tabs/NBSP to spaces, drop quotes.

    Treats a missing text node (None) as an empty string instead of raising
    AttributeError on .strip().
    """
    if text is None:
        return ""
    return (
        text.strip()
        .replace("\n", " ")
        .replace("\t", " ")
        .replace('"', "")
        .replace("\xa0", " ")
    )


def _local_name(tag):
    """Strip the leading '{namespace-uri}' prefix from a qualified tag name."""
    return tag[tag.rindex("}") + 1 :]


# "{uri}Parent{uri}Child" -> captures ("Parent", "Child"); compiled once.
_TAG_PAIR = re.compile(r"\{.*\}(.+)\{.*\}(.+)")


def obj(root, multiple_same_tag=False):
    """Serialize the direct children of *root* into a JSON object string.

    Child texts are whitespace-normalised via ``_clean_text``.  With
    ``multiple_same_tag=False`` each child becomes one key, its local tag
    name (namespace stripped).  With ``multiple_same_tag=True`` children
    sharing a tag are grouped under a key built from the local names of the
    root tag and the child tag, and their texts are joined with ';'.

    Returns None when *root* is None, otherwise a JSON string
    (ensure_ascii=False, so non-ASCII text passes through unchanged).
    """
    if root is None:
        return None

    if multiple_same_tag:
        grouped = {}
        # keep root.tag in the key so the parent context survives grouping
        for child in root:
            grouped.setdefault(root.tag + child.tag, []).append(
                _clean_text(child.text)
            )
        els = {}
        for key, texts in grouped.items():
            match = _TAG_PAIR.match(key)
            els[match.group(1) + match.group(2)] = ";".join(texts)
    else:
        els = {_local_name(child.tag): _clean_text(child.text) for child in root}

    return json.dumps(els, ensure_ascii=False)

def list_obj(el):
    """Serialize every element in *el* with ``obj(..., multiple_same_tag=True)``.

    Returns a list of JSON object strings, one per element, preserving order.
    (Comprehension replaces the manual append loop — same behavior.)
    """
    return [obj(element, True) for element in el]

def organi(root, ico, nsmap):
nazev = root.find("./are:Nazev", namespaces=nsmap).text
Expand Down Expand Up @@ -104,6 +130,7 @@ def main(outdir: str, partial: bool = False):
"datum_zapisu",
"datum_vymazu",
"sidlo",
"cinnosti",
]
udc.writerow(cols)
foc.writerow(
Expand Down Expand Up @@ -190,6 +217,18 @@ def main(outdir: str, partial: bool = False):
dt.extend(zi)
dt.append(obj(zakl.find("./are:Sidlo", namespaces=et.nsmap)))

# zaznamy o predmetu cinnosti
cinn = zakl.find("./are:Cinnosti", namespaces=et.nsmap)
cinn_cols = [
"PredmetPodnikani",
"Ucel",
"DoplnkovaCinnost",
"PredmetCinnosti",
]

pr = attri(cinn, cinn_cols, et.nsmap, list_obj) if cinn is not None else ['{}']
dt.extend(pr)

# zapis dat do master tabulky
udc.writerow(dt)

Expand Down
1 change: 1 addition & 0 deletions data/ares/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Column("datum_zapisu", Date, nullable=False),
Column("datum_vymazu", Date, nullable=True),
Column("sidlo", JSON, nullable=True),
Column("cinnosti", JSON, nullable=True),
),
Table(
"fosoby",
Expand Down
9 changes: 9 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import csv
from functools import partial
import os
import shutil
from collections import defaultdict
from importlib import import_module
from datetime import datetime

from sqlalchemy import Boolean, create_engine

Expand Down Expand Up @@ -33,8 +35,10 @@
action="store_true",
help="procesuj jen cast vstupnich dat - vhodne pro testovani, CI apod.",
)
parser.add_argument("--timesubf", action="store_true", help="Ve vystupnim adresari vytvori podadresar s casovy razitkem do ktereho se teprve budou ukladat zpracovana data z jednoltivych modulu.")
parser.add_argument("--all", action="store_true", help="procesuj vsechny moduly")
parser.add_argument("modules", nargs="*", help="specify which datasets to include")

args = parser.parse_args()

if args.all and len(args.modules) > 0:
Expand All @@ -48,6 +52,11 @@
)

base_outdir = "csv"

if args.timesubf:
prefix_d = "full_" if not(args.partial) else "partial_"
base_outdir = os.path.join(base_outdir, prefix_d + datetime.now().strftime("%Y%m%d%H%M%S"))

os.makedirs(base_outdir, exist_ok=True)

engine = None
Expand Down