diff --git a/.github/workflows/partial.yaml b/.github/workflows/partial.yaml deleted file mode 100644 index 0201834a..00000000 --- a/.github/workflows/partial.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: partial - -on: - push: - branches: - - main - pull_request: - workflow_dispatch: - schedule: - - cron: '0 0 * * *' - -jobs: - partial: - runs-on: ${{ matrix.os }} - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - os: ['ubuntu-latest', 'windows-latest'] - python-version: ['3.12', '3.8'] # na partial nam staci nejnovejsi a nejstarsi verze - module: ['datovky', 'dotinfo', 'eufondy', 'iissp', 'czechpoint', 'justice', 'psp', 'steno', 'smlouvy', 'szif', 'zakazky', 'volby', 'udhpsh', 'res', 'red', 'ruian', 'ares'] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - - name: Run partial processing - id: main - if: matrix.module != 'eufondy' && matrix.module != 'psp' && matrix.module != 'steno' - run: | - python3 main.py --connstring sqlite:///data.db --partial ${{ matrix.module }} - - name: Run partial processing (broken jobs) - # par jobu jsme preskocili, ted je pustime znovu, ale ocekavame, ze spadnou - # exity nejsou na windows, takze poustime jen na ubuntu - if: steps.main.conclusion == 'skipped' && matrix.os != 'windows-latest' - run: | - python3 main.py --connstring sqlite:///data.db --partial ${{ matrix.module }} || exit 0 - exit 1 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 274ca726..1e8540d2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -5,16 +5,18 @@ on: branches: - main pull_request: + workflow_dispatch: + schedule: + - cron: '0 0 * * *' jobs: - build: - - runs-on: ubuntu-latest - timeout-minutes: 5 + test: + runs-on: ${{ matrix.os }} + timeout-minutes: 10 strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] - # dataset: ["ares", "cedr", "cssz", "datovky", "dotinfo", "iissp", "psp.steno", "szif"] + os: ['ubuntu-latest', 'windows-latest'] + python-version: ['3.8', '3.12'] steps: - uses: actions/checkout@v4 @@ -26,12 +28,10 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pytest - - name: Test with pytest + pip install pytest pytest-xdist + - name: pytest (pipeliny) + run: | + python3 -m pytest -v data + - name: pytest (main) run: | - pytest -v data/steno/ - # TODO: doresit problem s importy - # TODO: vypnute, protoze vzdycky aspon jeden job spadnul kvuli random timeoutu - # - name: Try all pipelines - # run: | - # python3 main.py --partial ${{ matrix.dataset }} + python3 -m pytest -v -n 4 --durations=5 main_test.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/cssz/README.md b/data/cssz/README.md deleted file mode 100644 index 8da5769b..00000000 --- a/data/cssz/README.md +++ /dev/null @@ -1 +0,0 @@ -První zpracování dat z [ČSSZ](https://data.cssz.cz/). Ve schématech mají spoustu chyb, takže bez ručních úprav toto fungovat nebude. Vše je nahlášeno, tak se to snad časem zlepší. \ No newline at end of file diff --git a/data/cssz/main.py b/data/cssz/main.py deleted file mode 100644 index 5d96d000..00000000 --- a/data/cssz/main.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -from functools import lru_cache -from urllib.parse import urljoin -from urllib.request import urlretrieve - -import lxml.html -import requests -from tqdm import tqdm - - -@lru_cache(maxsize=None) -def req(url): - r = requests.get(url, timeout=60) - assert r.ok - return r - - -def najdi_textem(root, element, text: str): - els = [j for j in root.iterfind(f".//{element}") if j.text == text] - assert len(els) == 1, els - return els[0] - - -def main(outdir: str, partial: bool = False): - # TODO: odstranit generovani mapping.json, misto toho ho zaverzovat - # (a zkratit nazvy tabulek, jsou moc dlouhy pro pg) - burl = "https://data.cssz.cz/web/otevrena-data/katalog-otevrenych-dat" - r = req(burl) - ht = lxml.html.fromstring(r.text) - - ds = [] - for num, tr in tqdm(enumerate(ht.cssselect("tbody.table-data")[0].findall("tr"))): - if partial and num > 15: - break - a = tr.find("td").find("a") - link = a.attrib["href"] - assert link.startswith("http://") or link.startswith("https://"), link - dr = req(link) - - dht = lxml.html.fromstring(dr.text) - scha = najdi_textem(dht, "a", "Schéma (JSON)") - sch_url = urljoin(link, scha.attrib["href"]) - da = najdi_textem(dht, "a", "Data (CSV)") - - schema = req(sch_url).json() - - ds.append( - { - "nazev": a.text, - "nazev_ascii": link.rpartition("/")[-1].replace("-", "_"), - "url": { - "dataset": link, - "schema": sch_url, - "data": urljoin(link, da.attrib["href"]), - }, - "schema": schema, - } - ) - - # TODO: odstranit? - # cdir = os.path.dirname(os.path.abspath(__file__)) - # with open(os.path.join(cdir, 'mapping.json'), 'w') as fw: - # json.dump(ds, fw, ensure_ascii=False, indent=2) - - for dataset in ds: - tfn = os.path.join(outdir, dataset["nazev_ascii"] + ".csv") - urlretrieve(dataset["url"]["data"], tfn) - - -if __name__ == "__main__": - main(".") diff --git a/data/cssz/schema.py b/data/cssz/schema.py deleted file mode 100644 index 1e0980d6..00000000 --- a/data/cssz/schema.py +++ /dev/null @@ -1,822 +0,0 @@ -from sqlalchemy import Column, MetaData, Table -from sqlalchemy.sql.sqltypes import Date, Integer, Numeric, Text - -meta = MetaData() - -schema = [ - Table( - "ciselnik_datovych_typu", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_druhu_akci_v_tiskopise_onz", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - Column("poznamka", Text, primary_key=False), - ), - Table( - "ciselnik_druhu_pracovnich_cinnosti", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_druhu_zamestnani_pro_e_neschopenku", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("popis_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_duvodu_k_ukonceni_pracovni_neschopnosti", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("popis_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_okresu", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("popis_v_tiskopisech", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_pohlavi", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("zkraceny_nazev_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_specifikace_cizozemskeho_nositele_pojisteni", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - ), - Table( - "ciselnik_zdravotnich_pojistoven", - meta, - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("platnost_od", Date, primary_key=False), - Column("platnost_do", Date, primary_key=False), - Column("poznamka", Text, primary_key=False), - ), - Table( - "doba_rizeni_o_namitkach", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column( - "prumerna_delka_rizeni_s_posouzenim_zdravotniho_stavu", - Integer, - primary_key=False, - ), - Column( - "prumerna_delka_rizeni_bez_posouzeni_zdravotniho_stavu", - Integer, - primary_key=False, - ), - ), - Table( - # zkracen puvodni nazev (limit postgresy) - "kontroly_povinnosti_zamestnavatelu__socialniho_zabezpeceni", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("pocet_kontrol", Integer, primary_key=False), - ), - Table( - # zkracen puvodni nazev (limit postgresy) - "mesicni_vyse_duchodu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("kvantil_vyse_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("kvantil_vyse_duchodu", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("mesicni_vyse_duchodu", Text, primary_key=False), - ), - Table( - "minimalni_vymerovaci_zaklad_osvc", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("charakter_cinnosti_kod", Text, primary_key=True, autoincrement=False), - Column("charakter_cinnosti", Text, primary_key=False), - Column("minimalni_vymerovaci_zaklad_osvc", Integer, primary_key=False), - ), - Table( - "nejcastejsi_priciny_vzniku_invalidity", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("vekova_skupina", Text, primary_key=True, autoincrement=False), - Column( - "podskupina_diagnoz_dle_who_kod", - Text, - primary_key=True, - autoincrement=False, - ), - Column("podskupina_diagnoz_dle_who", Text, primary_key=False), - Column("pocet_vzniku_invalidity", Integer, primary_key=False), - ), - Table( - "pocet_duchodcu_s_exekucni_srazkou_podle_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pocet_duchodcu", Text, primary_key=False), - Column("prumerna_vyse_duchodu", Text, primary_key=False), - Column("prumerna_vyse_srazky", Text, primary_key=False), - Column("prumerny_vek", Text, primary_key=False), - ), - Table( - "pocet_duchodcu_s_exekucni_srazkou_v_cr", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pocet_duchodcu", Text, primary_key=False), - Column("prumerna_vyse_duchodu", Text, primary_key=False), - Column("prumerna_vyse_srazky", Text, primary_key=False), - Column("prumerny_vek", Text, primary_key=False), - ), - Table( - "pocet_duchodcu_s_vyplatou_do_ciziny_dle_statu", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("pocet_duchodcu_cizina", Integer, primary_key=False), - ), - Table( - "pocet_duchodu_s_vyplatou_do_ciziny_dle_statu", - meta, - Column("datum", Date, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pocet_duchodu_cizina", Integer, primary_key=False), - ), - Table( - "pocet_muzu_a_zen_v_cssz", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=True, autoincrement=False), - Column("ustredni_reditel", Integer, primary_key=False), - Column("sekcni_reditel", Integer, primary_key=False), - Column("reditel_pracoviste_cssz", Integer, primary_key=False), - Column("reditel_ossz", Integer, primary_key=False), - Column("reditel_odboru", Integer, primary_key=False), - Column("vedouci_oddeleni", Integer, primary_key=False), - Column("vedouci_referatu", Integer, primary_key=False), - Column("ostatni_zamestnanci", Integer, primary_key=False), - ), - Table( - "pocet_namitek", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("celkovy_pocet_podanych_namitek", Integer, primary_key=False), - Column("zdravotni_stav", Integer, primary_key=False), - Column("hodnoceni_dob", Integer, primary_key=False), - Column("vymerovaci_zaklad", Integer, primary_key=False), - Column("ostatni", Integer, primary_key=False), - Column("ukoncena_rizeni", Integer, primary_key=False), - Column("potvrzujici_rozhodnuti", Integer, primary_key=False), - ), - Table( - "nove_priznane_duchody_dle_vyse_duchodu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vyse_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("vyse_duchodu", Text, primary_key=False), - Column("pocet_nove_priznanych_duchodu", Text, primary_key=False), - ), - Table( - "nove_priznane_duchody_dle_osobniho_vymerovaciho_zakladu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column( - "vyse_osobniho_vymerovaciho_zakladu_kod", - Text, - primary_key=True, - autoincrement=False, - ), - Column("vyse_osobniho_vymerovaciho_zakladu", Text, primary_key=False), - Column("pocet_nove_priznanych_duchodu", Text, primary_key=False), - ), - Table( - "nove_priznane_duchody_dle_veku", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column("pocet_nove_priznanych_duchodu", Text, primary_key=False), - ), - Table( - "invalidita", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column( - "skupina_diagnoz_dle_who_kod", Text, primary_key=True, autoincrement=False - ), - Column("skupina_diagnoz_dle_who", Text, primary_key=False), - Column("pocet_nove_priznanych_duchodu", Text, primary_key=False), - ), - Table( - "pocet_podanych_eldp", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("zpusob_podani_eldp_kod", Text, primary_key=True, autoincrement=False), - Column("zpusob_podani_eldp", Text, primary_key=False), - Column("pocet_podanych_eldp", Integer, primary_key=False), - ), - Table( - "pocet_spravnich_exekuci", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("celkovy_pocet_odeslanych_vyzev", Integer, primary_key=False), - Column("celkovy_pocet_spravnich_exekuci", Integer, primary_key=False), - ), - Table( - "pocet_sto_a_viceletych_duchodcu_podle_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column("pocet_duchodcu", Text, primary_key=False), - ), - Table( - "pocet_sto_a_viceletych_duchodcu_v_cr", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column("pocet_duchodcu", Text, primary_key=False), - ), - Table( - "pocet_vyplacenych_dnp_podle_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("typ_dnp", Text, primary_key=True, autoincrement=False), - Column("pocet_dnp", Text, primary_key=False), - ), - Table( - "pocet_vyplacenych_dnp_podle_okresu", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("okres_kod", Text, primary_key=True, autoincrement=False), - Column("okres", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("typ_dnp", Text, primary_key=True, autoincrement=False), - Column("pocet_dnp", Text, primary_key=False), - ), - Table( - "pocet_vyplacenych_dnp_v_cr", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("typ_dnp", Text, primary_key=True, autoincrement=False), - Column("pocet_dnp", Text, primary_key=False), - ), - Table( - "vyplacene_duchody_dle_vyse", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vyse_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("vyse_duchodu", Text, primary_key=False), - Column("pocet_duchodu", Text, primary_key=False), - ), - Table( - "duchody_dle_veku", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column("pocet_duchodu", Text, primary_key=False), - ), - Table( - "pocet_vyplacenych_invalidnich_duchodu_v_ceske_republice", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column( - "skupina_diagnoz_dle_who_kod", Text, primary_key=True, autoincrement=False - ), - Column("skupina_diagnoz_dle_who", Text, primary_key=False), - Column("pocet_vyplacenych_duchodu", Text, primary_key=False), - ), - Table( - "pocet_vyplat_duchodu_s_exekucni_srazkou", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("pocet_vyplat", Integer, primary_key=False), - ), - Table( - "priplatky_a_prispevky_k_duchodum_dle_odskodnovacich_zakonu", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("priplatek_politickym_veznum", Integer, primary_key=False), - Column("priplatek_ucastnikum_odboje", Integer, primary_key=False), - Column("zvlastni_prispevek", Integer, primary_key=False), - ), - Table( - "pocet_vyrizenych_ioldp", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("pocet_ioldp", Integer, primary_key=False), - ), - # TODO: pridat do mappingu - # Table( - # "pocet_vystavenych_potvrzeni_o_prislusnosti_k_ceskym_pravnim_predpisum", - # meta, - # Column("rok", Integer, primary_key=True, autoincrement=False), - # Column("vyslani", Integer, primary_key=False), - # Column("soubehy", Integer, primary_key=False), - # Column("vyjimky", Integer, primary_key=False), - # Column("mezinarodni_smlouvy", Integer, primary_key=False), - # ), - Table( - "zanikle_duchody", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("duvod_zaniku_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("duvod_zaniku_duchodu", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("pocet_zaniklych_duchodu", Text, primary_key=False), - ), - Table( - "pocty_prihlasenych_pohledavek_cssz_do_insolvencniho_rizeni", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("celkovy_pocet_prihlasek_cssz", Integer, primary_key=False), - ), - Table( - "pocet_predcasnych_starobnich_duchodu_a_vydaje_v_cr", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("pocet_vyplacenych_st", Integer, primary_key=False), - Column("pocet_vyplacenych_sd", Integer, primary_key=False), - Column("vydaje_na_st", Numeric(12, 4), primary_key=False), - Column("vydaje_na_sd", Numeric(12, 4), primary_key=False), - ), - Table( - "pomocne_ciselniky", - meta, - Column("nazev_ciselniku", Text, primary_key=True, autoincrement=False), - Column("kod_polozky", Text, primary_key=True, autoincrement=False), - Column("nazev_polozky", Text, primary_key=False), - Column("popis_polozky", Text, primary_key=False), - ), - Table( - "posudky_provedene_lps", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("typ_posudku", Text, primary_key=True, autoincrement=False), - Column("pocet_posudku", Integer, primary_key=False), - ), - Table( - "prum_delka_pobirani_s_duchodu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("rok_zaniku_duchodu", Text, primary_key=True, autoincrement=False), - Column("prumerna_doba_pobirani_starobniho_duchodu", Text, primary_key=False), - ), - Table( - "prum_vyse_duchodu_u_nove_priznanych_duchodu_podle_druhu_duchodu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column( - "prumerna_vyse_duchodu_u_nove_priznanych_duchodu", Text, primary_key=False - ), - ), - # TODO: pridat do mappingu - # Table( - # "prum_vyse_osobniho_vymerovaciho_zakladu_u_nove_priznanych_duchodu_podle_druhu_duchodu", - # meta, - # Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - # Column("druh_duchodu", Text, primary_key=False), - # Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - # Column("pohlavi", Text, primary_key=False), - # Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - # Column("prumerna_vyse_ovz_u_nove_" - # "priznanych_duchodu", Text, primary_key=False), - # ), - Table( - "prumerny_plat_v_cssz", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("prumerny_plat_cssz", Integer, primary_key=False), - Column("median", Integer, primary_key=False), - ), - Table( - "prum_vek_u_nove_priznanych_duchodu_dle_druhu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("prumerny_vek_u_nove_priznanych_duchodu", Text, primary_key=False), - ), - Table( - "prehled_o_celkovem_poctu_osvc_podle_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=True, autoincrement=False), - Column("vykonavana_cinnost_hlavni", Text, primary_key=False), - Column("vykonavana_cinnost_vedlejsi", Text, primary_key=False), - Column("plati_zalohy_dp_hlavni", Text, primary_key=False), - Column("plati_zalohy_dp_vedlejsi", Text, primary_key=False), - Column("dobrovolne_dp", Text, primary_key=False), - Column("dobrovolne_np", Text, primary_key=False), - ), - Table( - "prehled_o_celkovem_poctu_osvc_podle_okresu", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("okres_kod", Text, primary_key=True, autoincrement=False), - Column("okres", Text, primary_key=True, autoincrement=False), - Column("vykonavana_cinnost_hlavni", Text, primary_key=False), - Column("vykonavana_cinnost_vedlejsi", Text, primary_key=False), - Column("plati_zalohy_dp_hlavni", Text, primary_key=False), - Column("plati_zalohy_dp_vedlejsi", Text, primary_key=False), - Column("dobrovolne_dp", Text, primary_key=False), - Column("dobrovolne_np", Text, primary_key=False), - ), - Table( - "prehled_o_celkovem_poctu_osvc_v_cr", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=True, autoincrement=False), - Column("vykonavana_cinnost_hlavni", Text, primary_key=False), - Column("vykonavana_cinnost_vedlejsi", Text, primary_key=False), - Column("plati_zalohy_dp_hlavni", Text, primary_key=False), - Column("plati_zalohy_dp_vedlejsi", Text, primary_key=False), - Column("dobrovolne_dp", Text, primary_key=False), - Column("dobrovolne_np", Text, primary_key=False), - ), - Table( - "duchodci_v_cr_krajich_okresech", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("referencni_oblast_kod", Text, primary_key=True, autoincrement=False), - Column("referencni_oblast", Text, primary_key=False), - Column("pocet_duchodcu", Text, primary_key=False), - Column("prumerny_vek", Text, primary_key=False), - Column("prumerna_vyse_duchodu", Text, primary_key=False), - ), - Table( - "zamestnavatele_pojistencu_a_pojistnych_vztahu_podle_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("pocet_zamestnavatelu", Text, primary_key=False), - Column("pocet_pojistencu", Text, primary_key=False), - Column("pocet_pojistnych_vztahu", Text, primary_key=False), - ), - Table( - "zamestnavatele_pojistencu_a_pojistnych_vztahu_podle_okresu", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("okres_kod", Text, primary_key=True, autoincrement=False), - Column("okres", Text, primary_key=False), - Column("pocet_zamestnavatelu", Text, primary_key=False), - Column("pocet_pojistencu", Text, primary_key=False), - Column("pocet_pojistnych_vztahu", Text, primary_key=False), - ), - Table( - "zamestnavatele_pojistencu_a_pojistnych_vztahu_v_cr", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("pocet_zamestnavatelu", Text, primary_key=False), - Column("pocet_pojistencu", Text, primary_key=False), - Column("pocet_pojistnych_vztahu", Text, primary_key=False), - ), - Table( - "rozhodovani_o_promijeni_penale", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("celkovy_pocet_zadosti", Integer, primary_key=False), - Column("pocet_vyhovenych_zadosti", Integer, primary_key=False), - Column("pocet_nevyhovenych_zadosti", Integer, primary_key=False), - ), - Table( - "sazby_pojisteni_v_cr", - meta, - Column("platnost_od", Text, primary_key=True, autoincrement=False), - Column("platnost_do", Text, primary_key=True, autoincrement=False), - Column("sazba_pojistneho", Text, primary_key=True, autoincrement=False), - Column("zamestnavatel", Numeric(12, 4), primary_key=False), - Column( - "zamestnavatel_s_mensim_poctem_zamestnancu", - Numeric(12, 4), - primary_key=False, - ), - Column("zamestnanec", Numeric(12, 4), primary_key=False), - Column("zamestnanec_ds", Numeric(12, 4), primary_key=False), - Column("osvc_ucastna_dp", Numeric(12, 4), primary_key=False), - Column("osvc_ucastna_dp_ds", Numeric(12, 4), primary_key=False), - Column("osvc_ucastna_np", Numeric(12, 4), primary_key=False), - Column("osoba_dobrovolne_ucastna_dp", Numeric(12, 4), primary_key=False), - Column("osoba_dobrovolne_ucastna_dp_ds", Numeric(12, 4), primary_key=False), - Column("zahranicni_zamestnanec", Numeric(12, 4), primary_key=False), - ), - Table( - "statistika_zadosti_dle_zak_106_1999_sb", - meta, - Column("referencni_obdobi", Integer, primary_key=True, autoincrement=False), - Column( - "celkovy_pocet_podanych_zadosti_o_informace", Integer, primary_key=False - ), - Column( - "pocet_zadosti_o_informace_podanych_fyzickymi_osobami", - Integer, - primary_key=False, - ), - Column( - "pocet_zadosti_o_informace_podanych_pravnickymi_osobami", - Integer, - primary_key=False, - ), - Column( - "pocet_zadosti_o_informace_podanych_elektronicky", - Integer, - primary_key=False, - ), - Column( - "pocet_zadosti_o_informace_podanych_pisemne", Integer, primary_key=False - ), - Column("pocet_zadosti_o_informace_podanych_osobne", Integer, primary_key=False), - Column( - "pocet_vydanych_rozhodnuti_o_odmitnuti_zadosti", Integer, primary_key=False - ), - Column("pocet_podanych_odvolani_proti_rozhodnuti", Integer, primary_key=False), - Column("soudni_rizeni", Integer, primary_key=False), - Column("poskytnute_vyhradni_licence", Integer, primary_key=False), - Column("pocet_stiznosti", Integer, primary_key=False), - Column("dalsi_informace", Integer, primary_key=False), - ), - Table( - "systemizovana_mista_a_fluktuace_zamestnancu_cssz", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("plan_sm", Integer, primary_key=False), - Column("ppep", Integer, primary_key=False), - Column("fluktuace", Numeric(12, 4), primary_key=False), - ), - Table( - "udaje_o_cinnosti_call_centra_pro_dp", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("telefonaty_celkem", Integer, primary_key=False), - Column("vyrizene_hovory", Integer, primary_key=False), - Column("duchodove_pojisteni", Integer, primary_key=False), - Column("vyplata_duchodu", Integer, primary_key=False), - Column("duchodove_pojisteni_v_zahranici", Integer, primary_key=False), - Column("agendy_bez_rozliseni", Integer, primary_key=False), - Column("exekuce", Integer, primary_key=False), - Column("dobrovolne_penzijni_sporeni", Integer, primary_key=False), - ), - Table( - "udaje_o_cinnosti_call_centra_pro_np", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("druh_dotazu", Text, primary_key=True, autoincrement=False), - Column("tema_davky", Text, primary_key=True, autoincrement=False), - Column("pocet_vyrizenych_pripadu", Text, primary_key=False), - ), - Table( - "udaje_o_plneni_podilu_osob_se_zdravotnim_postizenim", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("pocet_zamestnancu", Numeric(12, 4), primary_key=False), - Column("povinny_podil", Numeric(12, 4), primary_key=False), - Column("plneni_povinnosti", Numeric(12, 4), primary_key=False), - ), - Table( - "ukazatel_pracovni_neschopnosti_podle_delky_trvani_dpn_a_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("delka_trvani_dpn", Text, primary_key=True, autoincrement=False), - Column("pocet_ukoncenych_pripadu_dpn", Text, primary_key=False), - ), - Table( - "ukazatel_pracovni_neschopnosti_podle_delky_trvani_dpn_v_cr", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=True, autoincrement=False), - Column("stat", Text, primary_key=False), - Column("delka_trvani_dpn", Text, primary_key=True, autoincrement=False), - Column("pocet_ukoncenych_pripadu_dpn", Text, primary_key=False), - ), - Table( - "ukazatele_pracovni_neschopnosti_podle_kraju", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("kraj_kod", Text, primary_key=True, autoincrement=False), - Column("kraj", Text, primary_key=False), - Column("pocet_ukoncenych_pripadu_dpn", Text, primary_key=False), - Column("pocet_prostonanych_dnu", Text, primary_key=False), - Column("prumerna_delka_trvani_jednoho_pripadu_dpn", Text, primary_key=False), - ), - Table( - "ukazatele_pracovni_neschopnosti_podle_pohlavi_a_diagnozy", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("skupina_diagnoz", Text, primary_key=True, autoincrement=False), - Column("pocet_ukoncenych_pripadu_dpn", Text, primary_key=False), - Column("pocet_prostonanych_dnu", Text, primary_key=False), - Column("prumerna_delka_trvani_jednoho_pripadu_dpn", Text, primary_key=False), - ), - Table( - "ukazatele_pracovni_neschopnosti_podle_pohlavi_a_vekove_skupiny", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column("vekova_skupina_kod", Text, primary_key=True, autoincrement=False), - Column("vekova_skupina", Text, primary_key=False), - Column("pocet_ukoncenych_pripadu_dpn", Text, primary_key=False), - Column("pocet_prostonanych_dnu", Text, primary_key=False), - Column("prumerna_delka_trvani_jednoho_pripadu_dpn", Text, primary_key=False), - ), - Table( - "ukazatele_pracovni_neschopnosti", - meta, - Column("datum", Text, primary_key=True, autoincrement=False), - Column("stat_kod", Text, primary_key=False), - Column("stat", Text, primary_key=False), - Column("pocet_ukoncenych_pripadu_dpn", Integer, primary_key=False), - Column("pocet_prostonanych_dnu", Integer, primary_key=False), - Column( - "prumerna_doba_trvani_jednoho_pripadu_dpn", - Numeric(12, 4), - primary_key=False, - ), - Column("pocet_dpn_ukoncene_rozhodnutim", Integer, primary_key=False), - Column( - "pocet_pripadu_s_porusenim_lecebneho_rezimu", Integer, primary_key=False - ), - ), - Table( - "vydaje_na_duchody_v_cr", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("vydaje_na_duchody_opravene_o_zalohy_v_tis_kc", Text, primary_key=False), - ), - Table( - "zamestnanci_cssz_dle_veku", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("vek_kod", Text, primary_key=True, autoincrement=False), - Column("vek", Text, primary_key=False), - Column("pocet_zamestnancu_cssz", Integer, primary_key=False), - ), - Table( - "zamestnanci_cssz_dle_vzdelani", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("dosazene_vzdelani_kod", Text, primary_key=True, autoincrement=False), - Column("dosazene_vzdelani", Text, primary_key=False), - Column("pocet_zamestnancu_cssz", Integer, primary_key=False), - ), - Table( - "zmeny_mezi_stupni_invalidniho_duchodu", - meta, - Column("druh_duchodu_kod", Text, primary_key=True, autoincrement=False), - Column("druh_duchodu", Text, primary_key=False), - Column("pohlavi_kod", Text, primary_key=True, autoincrement=False), - Column("pohlavi", Text, primary_key=False), - Column( - "duvod_zmeny_stupne_invalidniho_duchodu_kod", - Text, - primary_key=True, - autoincrement=False, - ), - Column("duvod_zmeny_stupne_invalidniho_duchodu", Text, primary_key=False), - Column("referencni_obdobi", Text, primary_key=True, autoincrement=False), - Column("pocet_vyplacenych_duchodu", Text, primary_key=False), - ), - Table( - "zpusob_vyplat_duchodu_v_cr", - meta, - Column("rok", Integer, primary_key=True, autoincrement=False), - Column("hotovostni_vyplata_cp", Integer, primary_key=False), - Column("osobni_ucty", Integer, primary_key=False), - Column("vyplata_do_zss", Integer, primary_key=False), - ), -] - -if __name__ == "__main__": - from sqlalchemy import create_engine - from sqlalchemy.schema import CreateTable - - engine = create_engine("sqlite:///:memory:") - for table in schema: - print(f"-- {table.name} as created in SQLite") - print(CreateTable(table).compile(engine)) diff --git a/data/justice/main.py b/data/justice/main.py index 57ce744d..8a8ea636 100644 --- a/data/justice/main.py +++ b/data/justice/main.py @@ -78,7 +78,7 @@ def uprav_data(row, mapping): @contextlib.contextmanager -def cached_urlopen(url, timeout=HTTP_TIMEOUT): +def cached_urlopen(url, timeout=HTTP_TIMEOUT, partial=False): shasum = hashlib.sha256(url.encode("utf-8")).hexdigest() if url.endswith(".gz"): shasum += ".gz" @@ -93,7 +93,10 @@ def cached_urlopen(url, timeout=HTTP_TIMEOUT): # driv jsme cetli rovnou z webu, ale delalo to problemy, # tak to holt docasne ukladame a poustime to z disku with open(fn_tmp, "wb") as f: - shutil.copyfileobj(r, f) + if partial: + f.write(r.read(1000_000)) + else: + shutil.copyfileobj(r, f) # pri vypnuty cache se jen cte s tempfilu a pak se maze if not CACHE_ENABLED: @@ -107,15 +110,15 @@ def cached_urlopen(url, timeout=HTTP_TIMEOUT): yield r -def nahraj_ds(url): - with cached_urlopen(url, timeout=HTTP_TIMEOUT) as r: +def nahraj_ds(url, partial=False): + with cached_urlopen(url, timeout=HTTP_TIMEOUT, partial=partial) as r: with gzip.open(r, "rb") as f: et = lxml.etree.iterparse(f) yield from et def zpracuj_ds(url, schemas, outdir, partial, autogen, icos): - et = nahraj_ds(url) + et = nahraj_ds(url, partial) fs, csvs, schemasd = dict(), dict(), dict() ds = os.path.basename(urlparse(url).path).partition(".")[0] @@ -149,7 +152,7 @@ def zpracuj_ds(url, schemas, outdir, partial, autogen, icos): csvs[udaj] = cw for num, (action, el) in enumerate(et): - if partial and num > 1e5: + if partial and num > 1e4: break assert action == "end", action if el.tag != "Subjekt": @@ -289,7 +292,7 @@ def main(outdir: str, partial: bool = False): urls = [] for j, ds in enumerate(tqdm(dss, desc=f"{year} meta")): - if partial and len(urls) > 20: + if partial and len(urls) > 10: break url = f"https://dataor.justice.cz/api/3/action/package_show?id={ds}" with cached_urlopen(url, timeout=HTTP_TIMEOUT) as r: diff --git a/main.py b/main.py index 428a0a41..63cc6de4 100644 --- a/main.py +++ b/main.py @@ -18,6 +18,172 @@ def warninger(message, category, filename, lineno, line=None): warnings.formatwarning = warninger + +def main( + *, + module_name: str, + engine, + base_outdir: str, + partial: bool, + load_only: bool = False, + drop_first: bool = False, + preserve_csv: bool = False, + schema_prefix: str = "", +): + print(module_name) + print("=" * len(module_name)) + + module = import_module(f"data.{module_name}.main").main + schema = import_module(f"data.{module_name}.schema").schema + + outdir = os.path.join(base_outdir, module_name) + os.makedirs(outdir, exist_ok=True) + with tempfile.TemporaryDirectory(dir=base_outdir) as outdir_tmp: + if os.path.isdir(outdir) and not load_only: + shutil.rmtree(outdir) + + if not load_only: + module(outdir_tmp, partial=partial) + os.rename(outdir_tmp, outdir) + + if not engine: + return + + table_loads = defaultdict(list) + for table in schema: + fn_cand = os.path.join(outdir, table.name + ".csv") + dir_cand = os.path.join(outdir, table.name) + if os.path.isfile(fn_cand): + table_loads[(module_name, table.name)].append(fn_cand) + elif os.path.isdir(dir_cand): + for basename in os.listdir(dir_cand): + table_loads[(module_name, table.name)].append( + os.path.join(dir_cand, basename) + ) + else: + raise IOError(f"neexistujou data pro {module_name}.{table.name}") + + for table in schema: + t = time.time() + print(f"Nahravam {table.name} do {module_name}", end="") + files = table_loads[(module_name, table.name)] + fkeys = [j for j in table.constraints if isinstance(j, ForeignKeyConstraint)] + + # základní kontrola integrity (oflagovat?) + # .lower() na obou stranach pro case insensitive porovnani + db_column_names = [j.name.lower() for j in table.columns] + db_column_nullable = [j.nullable for j in table.columns] + data_nullable = [False for _ in table.columns] + for file in files: + with open(file, "rt", encoding="utf-8") as f: + cr = csv.reader(f) + header = [j.lower() for j in next(cr)] + if header != db_column_names: + errmap = dict( + (k, v) for k, v in zip(header, db_column_names) if k != v + ) + warnings.warn(f"databáze očekává jiné sloupce: {errmap}") + + for j, row in enumerate(cr): + if len(row) != len(db_column_names): + raise ValueError( + f"nečekaný počet sloupců, {len(row)} vs." + f" {len(db_column_names)} (řádka {j+2}" + ) + for k, val in enumerate(row): + if val == "": + data_nullable[k] = True + + if data_nullable != db_column_nullable: + for j, (dnull, dbnull) in enumerate(zip(data_nullable, db_column_nullable)): + if dnull == dbnull: + continue + warnings.warn( + f"NULL neshoda v {table.name} ({db_column_names[j]}):" + f" data ({dnull}) vs. DB ({dbnull})" + ) + + if engine.name == "postgresql": + table.schema = f"{schema_prefix}{module_name}" + with engine.begin() as conn: + conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {table.schema}")) + elif engine.name == "sqlite": + table.name = f"{schema_prefix}{module_name}_{table.name}" + + if drop_first: + table.drop(engine, checkfirst=True) + table.create(engine, checkfirst=True) + + # dropni fkeys pred nahravanim dat + # z nejakeho duvodu jsou v sqlite nepojmenovany klice + if engine.name == "postgresql": + dbtable = Table( + table.name, + table.metadata, + schema=table.schema, + autoload_with=engine, + ) + for fk in dbtable.constraints: + if isinstance(fk, ForeignKeyConstraint): + sql = DropConstraint(fk).compile() + with engine.begin() as conn: + conn.execute(text(sql.string)) + + if engine.name == "postgresql": + full_table_name = f"{table.schema}.{table.name}" + conn = engine.raw_connection() + cur = conn.cursor() + cur.execute(f"TRUNCATE {full_table_name} CASCADE") # TODO: cascade yolo + for filename in files: + with open(filename, "rt", encoding="utf-8") as f: + cur.copy_expert( + f"COPY {full_table_name} FROM stdin WITH CSV HEADER", f + ) + conn.commit() # TODO: proc nejde context manager? starej psycopg? + elif engine.name == "sqlite": + conn = engine.raw_connection() + conn.execute(f"DELETE FROM {table.name}") # truncate v sqlite neni + + ph = ", ".join(["?"] * len(table.columns)) + query = f"INSERT INTO {table.name} VALUES({ph})" + bools = [isinstance(j.type, Boolean) for j in table.columns] + for filename in files: + buffer = [] + with open(filename, "rt", encoding="utf-8") as f: + cr = csv.reader(f) + next(cr) # header + for row in cr: + row = [ + bool(el) if bools[j] and el != "" else el + for j, el in enumerate(row) + ] + row = [None if j == "" else j for j in row] + buffer.append(row) + if len(buffer) == 100: + conn.executemany(query, buffer) + buffer = [] + if len(buffer) > 0: + conn.executemany(query, buffer) + conn.commit() + else: + raise IOError(f"{engine.name} not supported yet") + + # constrainty jsme neumeli dropnout u sqlite... a nejdou ani pridat + if engine.name == "postgresql": + for fk in fkeys: + if not isinstance(fk, ForeignKeyConstraint): + continue + sql = AddConstraint(fk).compile() + with engine.begin() as conn: + conn.execute(text(sql.string)) + + print(f" ({time.time() - t:.2f}s)") + + # data nahrana do db, muzu mazat CSV + if not preserve_csv: + shutil.rmtree(outdir) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -59,9 +225,6 @@ def warninger(message, category, filename, lineno, line=None): if args.load_only and not args.connstring: raise ValueError("při --load-only je třeba specifikovat --connstring") - base_outdir = "csv" - os.makedirs(base_outdir, exist_ok=True) - engine = None if args.connstring: engine = create_engine(args.connstring) @@ -69,9 +232,6 @@ def warninger(message, category, filename, lineno, line=None): # TODO: nejak pridat `czechinvest` - je to ready, jen nefunguje stahovani souboru module_names = [ "red", - # TODO: docasne vypnuto, protoze jsem zkracoval nazvy tabulek, ktery - # ted nesedej mezi schematem a mappingem - # "cssz", "datovky", "dotinfo", "eufondy", @@ -91,164 +251,15 @@ def warninger(message, category, filename, lineno, line=None): ] if args.modules: module_names = args.modules - modules = {} - schemas = {} - - for module in module_names: - modules[module] = import_module(f"data.{module}.main").main - schemas[module] = import_module(f"data.{module}.schema").schema # TODO: multiprocessing - for module_name, module in modules.items(): - print(module_name) - print("=" * len(module_name)) - - outdir = os.path.join(base_outdir, module_name) - with tempfile.TemporaryDirectory(dir=base_outdir) as outdir_tmp: - if os.path.isdir(outdir) and not args.load_only: - shutil.rmtree(outdir) - - if not args.load_only: - module(outdir_tmp, partial=args.partial) - os.rename(outdir_tmp, outdir) - - if not engine: - continue - - table_loads = defaultdict(list) - for table in schemas[module_name]: - fn_cand = os.path.join(outdir, table.name + ".csv") - dir_cand = os.path.join(outdir, table.name) - if os.path.isfile(fn_cand): - table_loads[(module_name, table.name)].append(fn_cand) - elif os.path.isdir(dir_cand): - for basename in os.listdir(dir_cand): - table_loads[(module_name, table.name)].append( - os.path.join(dir_cand, basename) - ) - else: - raise IOError(f"neexistujou data pro {module_name}.{table.name}") - - for table in schemas[module_name]: - t = time.time() - print(f"Nahravam {table.name} do {module_name}", end="") - files = table_loads[(module_name, table.name)] - fkeys = [ - j for j in table.constraints if isinstance(j, ForeignKeyConstraint) - ] - - # základní kontrola integrity (oflagovat?) - # .lower() na obou stranach pro case insensitive porovnani - db_column_names = [j.name.lower() for j in table.columns] - db_column_nullable = [j.nullable for j in table.columns] - data_nullable = [False for _ in table.columns] - for file in files: - with open(file, "rt", encoding="utf-8") as f: - cr = csv.reader(f) - header = [j.lower() for j in next(cr)] - if header != db_column_names: - errmap = dict( - (k, v) for k, v in zip(header, db_column_names) if k != v - ) - warnings.warn(f"databáze očekává jiné sloupce: {errmap}") - - for j, row in enumerate(cr): - if len(row) != len(db_column_names): - raise ValueError( - f"nečekaný počet sloupců, {len(row)} vs." - f" {len(db_column_names)} (řádka {j+2}" - ) - for k, val in enumerate(row): - if val == "": - data_nullable[k] = True - - if data_nullable != db_column_nullable: - for j, (dnull, dbnull) in enumerate( - zip(data_nullable, db_column_nullable) - ): - if dnull == dbnull: - continue - warnings.warn( - f"NULL neshoda v {table.name} ({db_column_names[j]}):" - f" data ({dnull}) vs. DB ({dbnull})" - ) - - if engine.name == "postgresql": - table.schema = f"{args.schema_prefix}{module_name}" - with engine.begin() as conn: - conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {table.schema}")) - elif engine.name == "sqlite": - table.name = f"{args.schema_prefix}{module_name}_{table.name}" - - if args.drop_first: - table.drop(engine, checkfirst=True) - table.create(engine, checkfirst=True) - - # dropni fkeys pred nahravanim dat - # z nejakeho duvodu jsou v sqlite nepojmenovany klice - if engine.name == "postgresql": - dbtable = Table( - table.name, - table.metadata, - schema=table.schema, - autoload_with=engine, - ) - for fk in dbtable.constraints: - if isinstance(fk, ForeignKeyConstraint): - sql = DropConstraint(fk).compile() - with engine.begin() as conn: - conn.execute(text(sql.string)) - - if engine.name == "postgresql": - full_table_name = f"{table.schema}.{table.name}" - conn = engine.raw_connection() - cur = conn.cursor() - cur.execute(f"TRUNCATE {full_table_name} CASCADE") # TODO: cascade yolo - for filename in files: - with open(filename, "rt", encoding="utf-8") as f: - cur.copy_expert( - f"COPY {full_table_name} FROM stdin WITH CSV HEADER", f - ) - conn.commit() # TODO: proc nejde context manager? starej psycopg? - elif engine.name == "sqlite": - conn = engine.raw_connection() - conn.execute(f"DELETE FROM {table.name}") # truncate v sqlite neni - - ph = ", ".join(["?"] * len(table.columns)) - query = f"INSERT INTO {table.name} VALUES({ph})" - bools = [isinstance(j.type, Boolean) for j in table.columns] - for filename in files: - buffer = [] - with open(filename, "rt", encoding="utf-8") as f: - cr = csv.reader(f) - next(cr) # header - for row in cr: - row = [ - bool(el) if bools[j] and el != "" else el - for j, el in enumerate(row) - ] - row = [None if j == "" else j for j in row] - buffer.append(row) - if len(buffer) == 100: - conn.executemany(query, buffer) - buffer = [] - if len(buffer) > 0: - conn.executemany(query, buffer) - conn.commit() - else: - raise IOError(f"{engine.name} not supported yet") - - # constrainty jsme neumeli dropnout u sqlite... a nejdou ani pridat - if engine.name == "postgresql": - for fk in fkeys: - if not isinstance(fk, ForeignKeyConstraint): - continue - sql = AddConstraint(fk).compile() - with engine.begin() as conn: - conn.execute(text(sql.string)) - - print(f" ({time.time() - t:.2f}s)") - - # data nahrana do db, muzu mazat CSV - if not args.preserve_csv: - shutil.rmtree(outdir) + for module in module_names: + main( + module_name=module, + engine=engine, + base_outdir="csv", + load_only=args.load_only, + partial=args.partial, + drop_first=args.drop_first, + preserve_csv=args.preserve_csv, + ) diff --git a/main_test.py b/main_test.py new file mode 100644 index 00000000..93aff4d4 --- /dev/null +++ b/main_test.py @@ -0,0 +1,41 @@ +import pytest +from sqlalchemy import create_engine + + +@pytest.mark.parametrize( + ["module"], + [ + # TODO: zapnout postupne dalsi + ("ares",), + # czechinvest uplne zrusil svoje data asi - poptavam + # ("czechinvest",), # TODO + ("czechpoint",), + ("datovky",), + ("dotinfo",), + # ("eufondy",), + ("iissp",), + # ("justice",), + # TODO: psp ma problem s konektivitou + # ("psp",), + # ("steno",), + # ("red",), + ("res",), + ("ruian",), + ("smlouvy",), + ("szif",), + ("udhpsh",), + # ("volby",), + ("zakazky",), + ], +) +def test_partial(tmp_path, module): + from . import main + + engine = create_engine(f"sqlite:///{tmp_path / 'data.db'}") + + main.main( + base_outdir=tmp_path, + module_name=module, + partial=True, + engine=engine, + ) diff --git a/requirements.txt b/requirements.txt index d4f7cb34..385b9ede 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ lxml openpyxl psycopg2-binary -requests tqdm xlrd cssselect