# kramerius.py
###################
#
# Skript pro stažení obsahu dokumentu z Digitální knihovny, API v. 5.0
#
# Dokumentace API: https://github.com/ceskaexpedice/kramerius/wiki/ClientAPIDEV
#
# Stáhne obrázek v plné kvalitě, text OCR a ALTO
#
####################
import requests
import urllib.request
import json
from urllib.parse import urlparse
# Download every child item of one Digital Library document (client API v5.0).
#
# The script asks for the document's web address, takes the server host and
# the document identifier (last path segment) from it, lists the document's
# children through the API and stores up to three files per child in the
# current working directory:
#   NNNN.jpg - body of the IMG_FULL stream
#   NNNN.txt - body of the TEXT_OCR stream
#   NNNN.xml - body of the alto stream
# NNNN is the 1-based position of the child in the API response, zero-padded
# to four digits.

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 60


def _save_stream(stream_url, target, verify=True):
    """Download one stream and write its body to the file ``target``.

    Args:
        stream_url: Full URL of the stream to fetch.
        target: Name of the output file; it is opened in binary mode and
            overwritten if it already exists.
        verify: Passed to ``requests.get`` to control TLS certificate
            verification.

    Returns:
        True when the file was written, False when the server answered with
        an error status (nothing is written in that case, so an error page
        never ends up on disk under an image/text/XML file name).
    """
    response = requests.get(stream_url, verify=verify, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        print(f'{target}: stažení selhalo (HTTP {response.status_code})')
        return False
    with open(target, 'wb') as f:
        f.write(response.content)
    return True


url_ui = input("Zadej webový odkaz na dokument v Digitální knihovně: ")
url_ui_pars = urlparse(url_ui.strip())
dk = url_ui_pars.hostname
path = url_ui_pars.path
# Strip trailing slashes first so that ".../uuid:1234/" still yields the
# identifier instead of an empty last segment.
uuid = path.rstrip('/').split('/')[-1]
if not dk or not uuid:
    # hostname is None when the address has no scheme (e.g. "https://");
    # stop here instead of sending requests to "https://None/...".
    raise SystemExit(
        "Odkaz neobsahuje adresu serveru nebo identifikátor dokumentu."
    )

dokument = requests.get(
    f'https://{dk}/search/api/v5.0/item/{uuid}/children',
    timeout=REQUEST_TIMEOUT,
)
# Fail with the real HTTP error rather than a confusing JSON/TypeError later.
dokument.raise_for_status()
dokJSON = dokument.json()

# Each entry of the children listing is expected to carry a "pid" key.
for count, page in enumerate(dokJSON, start=1):
    fileCount = f'{count:04d}'
    print(count)
    streams = f'https://{dk}/search/api/v5.0/item/{page["pid"]}/streams'
    # NOTE(review): TLS certificate verification is switched off for the
    # IMG_FULL and alto requests only; the reason is not visible in this
    # file - confirm whether it is still required.
    _save_stream(f'{streams}/IMG_FULL', f'{fileCount}.jpg', verify=False)
    _save_stream(f'{streams}/TEXT_OCR', f'{fileCount}.txt')
    _save_stream(f'{streams}/alto', f'{fileCount}.xml', verify=False)