# Translation table applied to element text after stripping: newlines, tabs
# and no-break spaces become plain spaces, double quotes are removed.
_TEXT_CLEANUP = str.maketrans({"\n": " ", "\t": " ", "\xa0": " ", '"': None})


def _local_name(tag):
    """Return *tag* without its leading ``{namespace}`` part, if it has one."""
    return tag.rsplit("}", 1)[-1]


def _clean_text(text):
    """Strip *text* and normalise its whitespace/quotes; ``None`` stays ``None``."""
    if text is None:
        return None
    return text.strip().translate(_TEXT_CLEANUP)


def _child_elements(root):
    """Yield the direct child elements of *root*, skipping comments and PIs.

    Plain iteration is used instead of ``getchildren()``, which is deprecated
    in lxml and no longer exists in the standard-library ElementTree.
    """
    for child in root:
        # Comments / processing instructions expose a non-string ``tag``.
        if isinstance(child.tag, str):
            yield child


def attri(root, parts, nsmap, fnc):
    """Collect the sub-elements named in *parts* into one JSON object.

    For every name in *parts*, all ``./are:<name>`` children of *root* are
    passed to *fnc*, which must return an iterable of JSON-encoded objects
    with string values. Values that share a key are concatenated, each one
    followed by ``";"`` (so the merged value always ends with ``";"``).

    Args:
        root: element whose children are searched.
        parts: iterable of local tag names to look up under the ``are`` prefix.
        nsmap: prefix-to-namespace mapping that defines the ``are`` prefix.
        fnc: callable turning a list of elements into JSON strings.

    Returns:
        A one-item list holding the merged object serialised as JSON
        (``['{}']`` when nothing was found).
    """
    merged = {}
    for part in parts:
        found = root.findall("./are:%s" % part, namespaces=nsmap)
        if not found:
            continue
        for encoded in fnc(found):
            for key, value in json.loads(encoded).items():
                merged[key] = merged.get(key, "") + value + ";"
    return [json.dumps(merged, ensure_ascii=False)]


def obj(root, multiple_same_tag=False):
    """Serialise the direct children of *root* into a JSON object string.

    Args:
        root: element to serialise, or ``None``.
        multiple_same_tag: when false, each key is the child's local tag name
            and a repeated tag keeps only its last value. When true, each key
            is the local name of *root* followed by the local name of the
            child, and the texts of repeated tags are joined with ``";"``.

    Returns:
        ``None`` when *root* is ``None``; otherwise a JSON string. Child text
        is stripped and cleaned of newlines, tabs, no-break spaces and double
        quotes. A child without text is emitted as ``null`` in the default
        mode and as an empty string in ``multiple_same_tag`` mode.
    """
    if root is None:
        return None

    if multiple_same_tag:
        prefix = _local_name(root.tag)
        grouped = {}
        for child in _child_elements(root):
            key = prefix + _local_name(child.tag)
            # Joining requires strings, so a missing text counts as empty.
            grouped.setdefault(key, []).append(_clean_text(child.text) or "")
        els = {key: ";".join(values) for key, values in grouped.items()}
    else:
        els = {
            _local_name(child.tag): _clean_text(child.text)
            for child in _child_elements(root)
        }

    return json.dumps(els, ensure_ascii=False)


def list_obj(el):
    """Return ``obj(item, True)`` for every element in *el*."""
    return [obj(item, True) for item in el]
nsmap): nazev = root.find("./are:Nazev", namespaces=nsmap).text @@ -104,6 +130,7 @@ def main(outdir: str, partial: bool = False): "datum_zapisu", "datum_vymazu", "sidlo", + "cinnosti", ] udc.writerow(cols) foc.writerow( @@ -190,6 +217,18 @@ def main(outdir: str, partial: bool = False): dt.extend(zi) dt.append(obj(zakl.find("./are:Sidlo", namespaces=et.nsmap))) + # zaznamy o predmetu cinnosti + cinn = zakl.find("./are:Cinnosti", namespaces=et.nsmap) + cinn_cols = [ + "PredmetPodnikani", + "Ucel", + "DoplnkovaCinnost", + "PredmetCinnosti", + ] + + pr = attri(cinn, cinn_cols, et.nsmap, list_obj) if cinn is not None else ['{}'] + dt.extend(pr) + # zapis dat do master tabulky udc.writerow(dt) diff --git a/data/ares/schema.py b/data/ares/schema.py index f8c93de..0a3377b 100644 --- a/data/ares/schema.py +++ b/data/ares/schema.py @@ -18,6 +18,7 @@ Column("datum_zapisu", Date, nullable=False), Column("datum_vymazu", Date, nullable=True), Column("sidlo", JSON, nullable=True), + Column("cinnosti", JSON, nullable=True), ), Table( "fosoby", diff --git a/main.py b/main.py index 0e33999..1bca6a9 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,11 @@ import argparse import csv +from functools import partial import os import shutil from collections import defaultdict from importlib import import_module +from datetime import datetime from sqlalchemy import Boolean, create_engine @@ -33,8 +35,10 @@ action="store_true", help="procesuj jen cast vstupnich dat - vhodne pro testovani, CI apod.", ) + parser.add_argument("--timesubf", action="store_true", help="Ve vystupnim adresari vytvori podadresar s casovy razitkem do ktereho se teprve budou ukladat zpracovana data z jednoltivych modulu.") parser.add_argument("--all", action="store_true", help="procesuj vsechny moduly") parser.add_argument("modules", nargs="*", help="specify which datasets to include") + args = parser.parse_args() if args.all and len(args.modules) > 0: @@ -48,6 +52,11 @@ ) base_outdir = "csv" + + if args.timesubf: + 
prefix_d = "full_" if not(args.partial) else "partial_" + base_outdir = os.path.join(base_outdir, prefix_d + datetime.now().strftime("%Y%m%d%H%M%S")) + os.makedirs(base_outdir, exist_ok=True) engine = None