Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the company's business scope from the business registry (ARES) to the output #218

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 42 additions & 3 deletions data/ares/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from urllib.request import urlopen, urlretrieve

import lxml.etree
import re

BASE_URL = "https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz"

Expand All @@ -21,15 +22,40 @@ def attr(root, parts, nsmap):

return ret

def attri(root, parts, nsmap, fnc):
    """Merge selected child elements of *root* into one JSON object.

    For every tag name in *parts*, the matching ``are:``-namespaced children
    are passed to *fnc*, which yields JSON-encoded dicts.  Values that share
    a key are concatenated, each followed by a ';' separator.

    Returns a one-element list holding the merged dict as a JSON string
    (list form matches how callers extend their output row).
    """
    merged = {}
    for tag in parts:
        elements = root.findall("./are:%s" % tag, namespaces=nsmap)
        if not elements:
            continue
        for encoded in fnc(elements):
            for key, value in json.loads(encoded).items():
                # accumulate repeated keys as "v1;v2;…;" (trailing ';' kept)
                merged[key] = merged.get(key, "") + value + ";"
    return [json.dumps(merged, ensure_ascii=False)]

def _clean_text(text):
    """Normalise element text: collapse newlines/tabs/NBSP to spaces, drop quotes.

    Treats a missing text node (None) as an empty string instead of raising
    AttributeError on .strip().
    """
    if text is None:
        return ""
    return (
        text.strip()
        .replace("\n", " ")
        .replace("\t", " ")
        .replace('"', "")
        .replace("\xa0", " ")
    )


def _local_name(tag):
    """Strip the leading '{namespace-uri}' prefix from a qualified tag name."""
    return tag[tag.rindex("}") + 1 :]


# "{uri}Parent{uri}Child" -> captures ("Parent", "Child"); compiled once.
_TAG_PAIR = re.compile(r"\{.*\}(.+)\{.*\}(.+)")


def obj(root, multiple_same_tag=False):
    """Serialize the direct children of *root* into a JSON object string.

    Child texts are whitespace-normalised via ``_clean_text``.  With
    ``multiple_same_tag=False`` each child becomes one key, its local tag
    name (namespace stripped).  With ``multiple_same_tag=True`` children
    sharing a tag are grouped under a key built from the local names of the
    root tag and the child tag, and their texts are joined with ';'.

    Returns None when *root* is None, otherwise a JSON string
    (ensure_ascii=False, so non-ASCII text passes through unchanged).
    """
    if root is None:
        return None

    if multiple_same_tag:
        grouped = {}
        # keep root.tag in the key so the parent context survives grouping
        for child in root:
            grouped.setdefault(root.tag + child.tag, []).append(
                _clean_text(child.text)
            )
        els = {}
        for key, texts in grouped.items():
            match = _TAG_PAIR.match(key)
            els[match.group(1) + match.group(2)] = ";".join(texts)
    else:
        els = {_local_name(child.tag): _clean_text(child.text) for child in root}

    return json.dumps(els, ensure_ascii=False)

def list_obj(el):
    """Serialize every element in *el* with ``obj(..., multiple_same_tag=True)``.

    Returns a list of JSON object strings, one per element, preserving order.
    (Comprehension replaces the manual append loop — same behavior.)
    """
    return [obj(element, True) for element in el]

def organi(root, ico, nsmap):
nazev = root.find("./are:Nazev", namespaces=nsmap).text
Expand Down Expand Up @@ -104,6 +130,7 @@ def main(outdir: str, partial: bool = False):
"datum_zapisu",
"datum_vymazu",
"sidlo",
"cinnosti",
]
udc.writerow(cols)
foc.writerow(
Expand Down Expand Up @@ -190,6 +217,18 @@ def main(outdir: str, partial: bool = False):
dt.extend(zi)
dt.append(obj(zakl.find("./are:Sidlo", namespaces=et.nsmap)))

# zaznamy o predmetu cinnosti
cinn = zakl.find("./are:Cinnosti", namespaces=et.nsmap)
cinn_cols = [
"PredmetPodnikani",
"Ucel",
"DoplnkovaCinnost",
"PredmetCinnosti",
]

pr = attri(cinn, cinn_cols, et.nsmap, list_obj) if cinn is not None else ['{}']
dt.extend(pr)

# zapis dat do master tabulky
udc.writerow(dt)

Expand Down
1 change: 1 addition & 0 deletions data/ares/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Column("datum_zapisu", Date, nullable=False),
Column("datum_vymazu", Date, nullable=True),
Column("sidlo", JSON, nullable=True),
Column("cinnosti", JSON, nullable=True),
),
Table(
"fosoby",
Expand Down
9 changes: 9 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import argparse
import csv
from functools import partial
import os
import shutil
from collections import defaultdict
from importlib import import_module
from datetime import datetime

from sqlalchemy import Boolean, create_engine

Expand Down Expand Up @@ -33,8 +35,10 @@
action="store_true",
help="procesuj jen cast vstupnich dat - vhodne pro testovani, CI apod.",
)
parser.add_argument("--timesubf", action="store_true", help="Ve vystupnim adresari vytvori podadresar s casovy razitkem do ktereho se teprve budou ukladat zpracovana data z jednoltivych modulu.")
parser.add_argument("--all", action="store_true", help="procesuj vsechny moduly")
parser.add_argument("modules", nargs="*", help="specify which datasets to include")

args = parser.parse_args()

if args.all and len(args.modules) > 0:
Expand All @@ -48,6 +52,11 @@
)

base_outdir = "csv"

if args.timesubf:
prefix_d = "full_" if not(args.partial) else "partial_"
base_outdir = os.path.join(base_outdir, prefix_d + datetime.now().strftime("%Y%m%d%H%M%S"))

os.makedirs(base_outdir, exist_ok=True)

engine = None
Expand Down