diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..d24eb04 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,32 @@ +name: Python Lint with Black + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.12" + - "3.11" + - "3.10" + - "3.9" + - "3.8" + - "3.7" # The oldest version supported by Github Actions + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install black + run: pipx install black + + - name: Lint with Black + run: black --check ebooklib/ \ No newline at end of file diff --git a/ebooklib/__init__.py b/ebooklib/__init__.py index 5f5abd0..956b500 100644 --- a/ebooklib/__init__.py +++ b/ebooklib/__init__.py @@ -33,14 +33,15 @@ ITEM_SMIL = 11 # EXTENSION MAPPER -EXTENSIONS = {ITEM_IMAGE: ['.jpg', '.jpeg', '.gif', '.tiff', '.tif', '.png'], - ITEM_STYLE: ['.css'], - ITEM_VECTOR: ['.svg'], - ITEM_FONT: ['.otf', '.woff', '.ttf'], - ITEM_SCRIPT: ['.js'], - ITEM_NAVIGATION: ['.ncx'], - ITEM_VIDEO: ['.mov', '.mp4', '.avi'], - ITEM_AUDIO: ['.mp3', '.ogg'], - ITEM_COVER: ['.jpg', '.jpeg', '.png'], - ITEM_SMIL: ['.smil'] - } +EXTENSIONS = { + ITEM_IMAGE: [".jpg", ".jpeg", ".gif", ".tiff", ".tif", ".png"], + ITEM_STYLE: [".css"], + ITEM_VECTOR: [".svg"], + ITEM_FONT: [".otf", ".woff", ".ttf"], + ITEM_SCRIPT: [".js"], + ITEM_NAVIGATION: [".ncx"], + ITEM_VIDEO: [".mov", ".mp4", ".avi"], + ITEM_AUDIO: [".mp3", ".ogg"], + ITEM_COVER: [".jpg", ".jpeg", ".png"], + ITEM_SMIL: [".smil"], +} diff --git a/ebooklib/epub.py b/ebooklib/epub.py index 62fea67..772b9c6 100644 --- a/ebooklib/epub.py +++ b/ebooklib/epub.py @@ -33,40 +33,54 @@ import ebooklib -from ebooklib.utils import parse_string, parse_html_string, guess_type, get_pages_for_items +from ebooklib.utils import ( + parse_string, + parse_html_string, + guess_type, + get_pages_for_items, +) # Version of EPUB library VERSION = (0, 18, 1) -NAMESPACES = {'XML': 'http://www.w3.org/XML/1998/namespace', - 'EPUB': 'http://www.idpf.org/2007/ops', - 'DAISY': 'http://www.daisy.org/z3986/2005/ncx/', - 'OPF': 'http://www.idpf.org/2007/opf', - 'CONTAINERNS': 'urn:oasis:names:tc:opendocument:xmlns:container', - 'DC': 'http://purl.org/dc/elements/1.1/', - 'XHTML': 'http://www.w3.org/1999/xhtml'} +NAMESPACES = { + "XML": "http://www.w3.org/XML/1998/namespace", + "EPUB": "http://www.idpf.org/2007/ops", + "DAISY": "http://www.daisy.org/z3986/2005/ncx/", + "OPF": "http://www.idpf.org/2007/opf", + "CONTAINERNS": "urn:oasis:names:tc:opendocument:xmlns:container", + "DC": "http://purl.org/dc/elements/1.1/", + "XHTML": "http://www.w3.org/1999/xhtml", +} # XML Templates -CONTAINER_PATH = 'META-INF/container.xml' +CONTAINER_PATH = "META-INF/container.xml" -CONTAINER_XML = ''' +CONTAINER_XML = """ -''' +""" -NCX_XML = six.b(''' -''') +NCX_XML = six.b( + """ +""" +) -NAV_XML = six.b('''''') +NAV_XML = six.b( + """""" +) -CHAPTER_XML = six.b('''''') +CHAPTER_XML = six.b( + """""" +) -COVER_XML = six.b(''' +COVER_XML = six.b( + """ @@ -78,17 +92,19 @@ -''') +""" +) -IMAGE_MEDIA_TYPES = ['image/jpeg', 'image/jpg', 'image/png', 'image/svg+xml'] +IMAGE_MEDIA_TYPES = ["image/jpeg", "image/jpg", "image/png", "image/svg+xml"] # TOC and navigation elements + class Section(object): - def __init__(self, title, href=''): + def __init__(self, title, href=""): self.title = title self.href = href @@ -100,6 +116,7 @@ def __init__(self, href, title, uid=None): self.title = title self.uid = uid + # Exceptions @@ -112,16 +129,18 @@ def __init__(self, code, msg): def __str__(self): return repr(self.msg) + # Items class EpubItem(object): - """ Base class for the items in a book. """ - def __init__(self, uid=None, file_name='', media_type='', content=six.b(''), manifest=True): + def __init__( + self, uid=None, file_name="", media_type="", content=six.b(""), manifest=True + ): """ :Args: - uid: Unique identifier for this item (optional) @@ -188,7 +207,7 @@ def get_type(self): return ebooklib.ITEM_UNKNOWN - def get_content(self, default=six.b('')): + def get_content(self, default=six.b("")): """ Returns content of the item. Content should be of type 'str' (Python 2) or 'bytes' (Python 3) @@ -210,45 +229,56 @@ def set_content(self, content): self.content = content def __str__(self): - return '' % self.id + return "" % self.id class EpubNcx(EpubItem): "Represents Navigation Control File (NCX) in the EPUB." - def __init__(self, uid='ncx', file_name='toc.ncx'): - super(EpubNcx, self).__init__(uid=uid, file_name=file_name, media_type='application/x-dtbncx+xml') + def __init__(self, uid="ncx", file_name="toc.ncx"): + super(EpubNcx, self).__init__( + uid=uid, file_name=file_name, media_type="application/x-dtbncx+xml" + ) def __str__(self): - return '' % self.id + return "" % self.id class EpubCover(EpubItem): - """ Represents Cover image in the EPUB file. """ - def __init__(self, uid='cover-img', file_name=''): + def __init__(self, uid="cover-img", file_name=""): super(EpubCover, self).__init__(uid=uid, file_name=file_name) def get_type(self): return ebooklib.ITEM_COVER def __str__(self): - return '' % (self.id, self.file_name) + return "" % (self.id, self.file_name) class EpubHtml(EpubItem): - """ Represents HTML document in the EPUB file. """ - _template_name = 'chapter' - def __init__(self, uid=None, file_name='', media_type='', content=None, title='', - lang=None, direction=None, media_overlay=None, media_duration=None): + _template_name = "chapter" + + def __init__( + self, + uid=None, + file_name="", + media_type="", + content=None, + title="", + lang=None, + direction=None, + media_overlay=None, + media_duration=None, + ): super(EpubHtml, self).__init__(uid, file_name, media_type, content) self.title = title @@ -305,9 +335,9 @@ def add_link(self, **kwgs): >>> add_link(href='styles.css', rel='stylesheet', type='text/css') """ self.links.append(kwgs) - if kwgs.get('type') == 'text/javascript': - if 'scripted' not in self.properties: - self.properties.append('scripted') + if kwgs.get("type") == "text/javascript": + if "scripted" not in self.properties: + self.properties.append("scripted") def get_links(self): """ @@ -325,7 +355,7 @@ def get_links_of_type(self, link_type): :Returns: As tuple returns list of links. """ - return (link for link in self.links if link.get('type', '') == link_type) + return (link for link in self.links if link.get("type", "") == link_type) def add_item(self, item): """ @@ -335,10 +365,10 @@ def add_item(self, item): - item: item we want to add defined as instance of EpubItem """ if item.get_type() == ebooklib.ITEM_STYLE: - self.add_link(href=item.get_name(), rel='stylesheet', type='text/css') + self.add_link(href=item.get_name(), rel="stylesheet", type="text/css") if item.get_type() == ebooklib.ITEM_SCRIPT: - self.add_link(src=item.get_name(), type='text/javascript') + self.add_link(src=item.get_name(), type="text/javascript") def get_body_content(self): """ @@ -352,24 +382,26 @@ def get_body_content(self): try: html_tree = parse_html_string(self.content) except: - return '' + return "" html_root = html_tree.getroottree() - if len(html_root.find('body')) != 0: - body = html_tree.find('body') + if len(html_root.find("body")) != 0: + body = html_tree.find("body") - tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False) + tree_str = etree.tostring( + body, pretty_print=True, encoding="utf-8", xml_declaration=False + ) # this is so stupid - if tree_str.startswith(six.b('')): - n = tree_str.rindex(six.b('')) + if tree_str.startswith(six.b("")): + n = tree_str.rindex(six.b("")) return tree_str[6:n] return tree_str - return '' + return "" def get_content(self, default=None): """ @@ -386,8 +418,10 @@ def get_content(self, default=None): tree = parse_string(self.book.get_template(self._template_name)) tree_root = tree.getroot() - tree_root.set('lang', self.lang or self.book.language) - tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language + tree_root.set("lang", self.lang or self.book.language) + tree_root.attrib["{%s}lang" % NAMESPACES["XML"]] = ( + self.lang or self.book.language + ) # add to the head also # @@ -395,25 +429,25 @@ def get_content(self, default=None): try: html_tree = parse_html_string(self.content) except: - return '' + return "" html_root = html_tree.getroottree() # create and populate head - _head = etree.SubElement(tree_root, 'head') + _head = etree.SubElement(tree_root, "head") - if self.title != '': - _title = etree.SubElement(_head, 'title') + if self.title != "": + _title = etree.SubElement(_head, "title") _title.text = self.title for lnk in self.links: - if lnk.get('type') == 'text/javascript': - _lnk = etree.SubElement(_head, 'script', lnk) + if lnk.get("type") == "text/javascript": + _lnk = etree.SubElement(_head, "script", lnk) # force - _lnk.text = '' + _lnk.text = "" else: - _lnk = etree.SubElement(_head, 'link', lnk) + _lnk = etree.SubElement(_head, "link", lnk) # this should not be like this # head = html_root.find('head') @@ -425,31 +459,34 @@ def get_content(self, default=None): # create and populate body - _body = etree.SubElement(tree_root, 'body') + _body = etree.SubElement(tree_root, "body") if self.direction: - _body.set('dir', self.direction) - tree_root.set('dir', self.direction) + _body.set("dir", self.direction) + tree_root.set("dir", self.direction) - body = html_tree.find('body') + body = html_tree.find("body") if body is not None: for i in body.getchildren(): _body.append(i) - tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True) + tree_str = etree.tostring( + tree, pretty_print=True, encoding="utf-8", xml_declaration=True + ) return tree_str def __str__(self): - return '' % (self.id, self.file_name) + return "" % (self.id, self.file_name) class EpubCoverHtml(EpubHtml): - """ Represents Cover page in the EPUB file. """ - def __init__(self, uid='cover', file_name='cover.xhtml', image_name='', title='Cover'): + def __init__( + self, uid="cover", file_name="cover.xhtml", image_name="", title="Cover" + ): super(EpubCoverHtml, self).__init__(uid=uid, file_name=file_name, title=title) self.image_name = image_name @@ -473,32 +510,48 @@ def get_content(self): Returns content of this document. """ - self.content = self.book.get_template('cover') + self.content = self.book.get_template("cover") tree = parse_string(super(EpubCoverHtml, self).get_content()) tree_root = tree.getroot() - images = tree_root.xpath('//xhtml:img', namespaces={'xhtml': NAMESPACES['XHTML']}) + images = tree_root.xpath( + "//xhtml:img", namespaces={"xhtml": NAMESPACES["XHTML"]} + ) - images[0].set('src', self.image_name) - images[0].set('alt', self.title) + images[0].set("src", self.image_name) + images[0].set("alt", self.title) - tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True) + tree_str = etree.tostring( + tree, pretty_print=True, encoding="utf-8", xml_declaration=True + ) return tree_str def __str__(self): - return '' % (self.id, self.file_name) + return "" % (self.id, self.file_name) class EpubNav(EpubHtml): - """ Represents Navigation Document in the EPUB file. """ - def __init__(self, uid='nav', file_name='nav.xhtml', media_type='application/xhtml+xml', title='', direction=None): - super(EpubNav, self).__init__(uid=uid, file_name=file_name, media_type=media_type, title=title, direction=direction) + def __init__( + self, + uid="nav", + file_name="nav.xhtml", + media_type="application/xhtml+xml", + title="", + direction=None, + ): + super(EpubNav, self).__init__( + uid=uid, + file_name=file_name, + media_type=media_type, + title=title, + direction=direction, + ) def is_chapter(self): """ @@ -511,11 +564,10 @@ def is_chapter(self): return False def __str__(self): - return '' % (self.id, self.file_name) + return "" % (self.id, self.file_name) class EpubImage(EpubItem): - """ Represents Image in the EPUB file. """ @@ -527,23 +579,29 @@ def get_type(self): return ebooklib.ITEM_IMAGE def __str__(self): - return '' % (self.id, self.file_name) + return "" % (self.id, self.file_name) class EpubSMIL(EpubItem): - def __init__(self, uid=None, file_name='', content=None): - super(EpubSMIL, self).__init__(uid=uid, file_name=file_name, media_type='application/smil+xml', content=content) + def __init__(self, uid=None, file_name="", content=None): + super(EpubSMIL, self).__init__( + uid=uid, + file_name=file_name, + media_type="application/smil+xml", + content=content, + ) def get_type(self): return ebooklib.ITEM_SMIL def __str__(self): - return '' % (self.id, self.file_name) + return "" % (self.id, self.file_name) # EpubBook + class EpubBook(object): def __init__(self): @@ -564,27 +622,33 @@ def reset(self): self.toc = [] self.bindings = [] - self.IDENTIFIER_ID = 'id' - self.FOLDER_NAME = 'EPUB' + self.IDENTIFIER_ID = "id" + self.FOLDER_NAME = "EPUB" self._id_html = 0 self._id_image = 0 self._id_static = 0 - self.title = '' - self.language = 'en' + self.title = "" + self.language = "en" self.direction = None self.templates = { - 'ncx': NCX_XML, - 'nav': NAV_XML, - 'chapter': CHAPTER_XML, - 'cover': COVER_XML + "ncx": NCX_XML, + "nav": NAV_XML, + "chapter": CHAPTER_XML, + "cover": COVER_XML, } - self.add_metadata('OPF', 'generator', '', { - 'name': 'generator', 'content': 'Ebook-lib %s' % '.'.join([str(s) for s in VERSION]) - }) + self.add_metadata( + "OPF", + "generator", + "", + { + "name": "generator", + "content": "Ebook-lib %s" % ".".join([str(s) for s in VERSION]), + }, + ) # default to using a randomly-unique identifier if one is not specified manually self.set_identifier(str(uuid.uuid4())) @@ -603,7 +667,9 @@ def set_identifier(self, uid): self.uid = uid - self.set_unique_metadata('DC', 'identifier', self.uid, {'id': self.IDENTIFIER_ID}) + self.set_unique_metadata( + "DC", "identifier", self.uid, {"id": self.IDENTIFIER_ID} + ) def set_title(self, title): """ @@ -615,7 +681,7 @@ def set_title(self, title): self.title = title - self.add_metadata('DC', 'title', self.title) + self.add_metadata("DC", "title", self.title) def set_language(self, lang): """ @@ -628,7 +694,7 @@ def set_language(self, lang): self.language = lang - self.add_metadata('DC', 'language', lang) + self.add_metadata("DC", "language", lang) def set_direction(self, direction): """ @@ -657,21 +723,33 @@ def set_cover(self, file_name, content, create_page=True): c1 = EpubCoverHtml(image_name=file_name) self.add_item(c1) - self.add_metadata(None, 'meta', '', OrderedDict([('name', 'cover'), ('content', 'cover-img')])) + self.add_metadata( + None, "meta", "", OrderedDict([("name", "cover"), ("content", "cover-img")]) + ) - def add_author(self, author, file_as=None, role=None, uid='creator'): + def add_author(self, author, file_as=None, role=None, uid="creator"): "Add author for this document" - self.add_metadata('DC', 'creator', author, {'id': uid}) + self.add_metadata("DC", "creator", author, {"id": uid}) if file_as: - self.add_metadata(None, 'meta', file_as, {'refines': '#' + uid, - 'property': 'file-as', - 'scheme': 'marc:relators'}) + self.add_metadata( + None, + "meta", + file_as, + { + "refines": "#" + uid, + "property": "file-as", + "scheme": "marc:relators", + }, + ) if role: - self.add_metadata(None, 'meta', role, {'refines': '#' + uid, - 'property': 'role', - 'scheme': 'marc:relators'}) + self.add_metadata( + None, + "meta", + role, + {"refines": "#" + uid, "property": "role", "scheme": "marc:relators"}, + ) def add_metadata(self, namespace, name, value, others=None): "Add metadata" @@ -714,7 +792,7 @@ def add_item(self, item): :Args: - item: Item instance """ - if item.media_type == '': + if item.media_type == "": (has_guessed, media_type) = guess_type(item.get_name().lower()) if has_guessed: @@ -723,20 +801,20 @@ def add_item(self, item): else: item.media_type = has_guessed else: - item.media_type = 'application/octet-stream' + item.media_type = "application/octet-stream" if not item.get_id(): # make chapter_, image_ and static_ configurable if isinstance(item, EpubHtml): - item.id = 'chapter_%d' % self._id_html + item.id = "chapter_%d" % self._id_html self._id_html += 1 # If there's a page list, append it to the book's page list self.pages += item.pages elif isinstance(item, EpubImage): - item.id = 'image_%d' % self._id_image + item.id = "image_%d" % self._id_image self._id_image += 1 else: - item.id = 'static_%d' % self._id_static + item.id = "static_%d" % self._id_static self._id_static += 1 item.book = self @@ -856,24 +934,21 @@ def add_prefix(self, name, uri): - uri: URI for the namespace """ - self.prefixes.append('%s: %s' % (name, uri)) + self.prefixes.append("%s: %s" % (name, uri)) class EpubWriter(object): DEFAULT_OPTIONS = { - 'epub2_guide': True, - 'epub3_landmark': True, - 'epub3_pages': True, - 'landmark_title': 'Guide', - 'pages_title': 'Pages', - 'spine_direction': True, - 'package_direction': False, - 'play_order': { - 'enabled': False, - 'start_from': 1 - }, - 'raise_exceptions': False, - 'compresslevel': 6 + "epub2_guide": True, + "epub3_landmark": True, + "epub3_pages": True, + "landmark_title": "Guide", + "pages_title": "Pages", + "spine_direction": True, + "package_direction": False, + "play_order": {"enabled": False, "start_from": 1}, + "raise_exceptions": False, + "compresslevel": 6, } def __init__(self, name, book, options=None): @@ -887,31 +962,28 @@ def __init__(self, name, book, options=None): self._init_play_order() def _init_play_order(self): - self._play_order = { - 'enabled': False, - 'start_from': 1 - } + self._play_order = {"enabled": False, "start_from": 1} try: - self._play_order['enabled'] = self.options['play_order']['enabled'] - self._play_order['start_from'] = self.options['play_order']['start_from'] + self._play_order["enabled"] = self.options["play_order"]["enabled"] + self._play_order["start_from"] = self.options["play_order"]["start_from"] except KeyError: pass def process(self): # should cache this html parsing so we don't do it for every plugin - for plg in self.options.get('plugins', []): - if hasattr(plg, 'before_write'): + for plg in self.options.get("plugins", []): + if hasattr(plg, "before_write"): plg.before_write(self.book) for item in self.book.get_items(): if isinstance(item, EpubHtml): - for plg in self.options.get('plugins', []): - if hasattr(plg, 'html_before_write'): + for plg in self.options.get("plugins", []): + if hasattr(plg, "html_before_write"): plg.html_before_write(self.book, item) def _write_container(self): - container_xml = CONTAINER_XML % {'folder_name': self.book.FOLDER_NAME} + container_xml = CONTAINER_XML % {"folder_name": self.book.FOLDER_NAME} self.out.writestr(CONTAINER_PATH, container_xml) def _write_opf_metadata(self, root): @@ -923,46 +995,52 @@ def _write_opf_metadata(self, root): # if ns_name == ns_url: # nsmap[n_id.lower()] = NAMESPACES[n_id] - nsmap = {'dc': NAMESPACES['DC'], 'opf': NAMESPACES['OPF']} + nsmap = {"dc": NAMESPACES["DC"], "opf": NAMESPACES["OPF"]} nsmap.update(self.book.namespaces) - metadata = etree.SubElement(root, 'metadata', nsmap=nsmap) + metadata = etree.SubElement(root, "metadata", nsmap=nsmap) - el = etree.SubElement(metadata, 'meta', {'property': 'dcterms:modified'}) - if 'mtime' in self.options: - mtime = self.options['mtime'] + el = etree.SubElement(metadata, "meta", {"property": "dcterms:modified"}) + if "mtime" in self.options: + mtime = self.options["mtime"] else: import datetime + mtime = datetime.datetime.now() - el.text = mtime.strftime('%Y-%m-%dT%H:%M:%SZ') + el.text = mtime.strftime("%Y-%m-%dT%H:%M:%SZ") for ns_name, values in six.iteritems(self.book.metadata): - if ns_name == NAMESPACES['OPF']: + if ns_name == NAMESPACES["OPF"]: for values in values.values(): for v in values: - if 'property' in v[1] and v[1]['property'] == 'dcterms:modified': + if ( + "property" in v[1] + and v[1]["property"] == "dcterms:modified" + ): continue try: - el = etree.SubElement(metadata, 'meta', v[1]) + el = etree.SubElement(metadata, "meta", v[1]) if v[0]: el.text = v[0] except ValueError: - logging.error('Could not create metadata.') + logging.error("Could not create metadata.") else: for name, values in six.iteritems(values): for v in values: try: if ns_name: - el = etree.SubElement(metadata, '{%s}%s' % (ns_name, name), v[1]) + el = etree.SubElement( + metadata, "{%s}%s" % (ns_name, name), v[1] + ) else: - el = etree.SubElement(metadata, '%s' % name, v[1]) + el = etree.SubElement(metadata, "%s" % name, v[1]) el.text = v[0] except ValueError: logging.info('Could not create metadata "{}".'.format(name)) def _write_opf_manifest(self, root): - manifest = etree.SubElement(root, 'manifest') + manifest = etree.SubElement(root, "manifest") _ncx_id = None # mathml, scripted, svg, remote-resources, and switch @@ -974,45 +1052,65 @@ def _write_opf_manifest(self, root): continue if isinstance(item, EpubNav): - etree.SubElement(manifest, 'item', {'href': item.get_name(), - 'id': item.id, - 'media-type': item.media_type, - 'properties': 'nav'}) + etree.SubElement( + manifest, + "item", + { + "href": item.get_name(), + "id": item.id, + "media-type": item.media_type, + "properties": "nav", + }, + ) elif isinstance(item, EpubNcx): _ncx_id = item.id - etree.SubElement(manifest, 'item', {'href': item.file_name, - 'id': item.id, - 'media-type': item.media_type}) + etree.SubElement( + manifest, + "item", + { + "href": item.file_name, + "id": item.id, + "media-type": item.media_type, + }, + ) elif isinstance(item, EpubCover): - etree.SubElement(manifest, 'item', {'href': item.file_name, - 'id': item.id, - 'media-type': item.media_type, - 'properties': 'cover-image'}) + etree.SubElement( + manifest, + "item", + { + "href": item.file_name, + "id": item.id, + "media-type": item.media_type, + "properties": "cover-image", + }, + ) else: - opts = {'href': item.file_name, - 'id': item.id, - 'media-type': item.media_type} + opts = { + "href": item.file_name, + "id": item.id, + "media-type": item.media_type, + } - if hasattr(item, 'properties') and len(item.properties) > 0: - opts['properties'] = ' '.join(item.properties) + if hasattr(item, "properties") and len(item.properties) > 0: + opts["properties"] = " ".join(item.properties) - if hasattr(item, 'media_overlay') and item.media_overlay is not None: - opts['media-overlay'] = item.media_overlay + if hasattr(item, "media_overlay") and item.media_overlay is not None: + opts["media-overlay"] = item.media_overlay - if hasattr(item, 'media_duration') and item.media_duration is not None: - opts['duration'] = item.media_duration + if hasattr(item, "media_duration") and item.media_duration is not None: + opts["duration"] = item.media_duration - etree.SubElement(manifest, 'item', opts) + etree.SubElement(manifest, "item", opts) return _ncx_id def _write_opf_spine(self, root, ncx_id): - spine_attributes = {'toc': ncx_id or 'ncx'} - if self.book.direction and self.options['spine_direction']: - spine_attributes['page-progression-direction'] = self.book.direction + spine_attributes = {"toc": ncx_id or "ncx"} + if self.book.direction and self.options["spine_direction"]: + spine_attributes["page-progression-direction"] = self.book.direction - spine = etree.SubElement(root, 'spine', spine_attributes) + spine = etree.SubElement(root, "spine", spine_attributes) for _item in self.book.spine: # this is for now @@ -1024,78 +1122,86 @@ def _write_opf_spine(self, root, ncx_id): item = _item[0] if len(_item) > 1: - if _item[1] == 'no': + if _item[1] == "no": is_linear = False else: item = _item if isinstance(item, EpubHtml): - opts = {'idref': item.get_id()} + opts = {"idref": item.get_id()} if not item.is_linear or not is_linear: - opts['linear'] = 'no' + opts["linear"] = "no" elif isinstance(item, EpubItem): - opts = {'idref': item.get_id()} + opts = {"idref": item.get_id()} if not item.is_linear or not is_linear: - opts['linear'] = 'no' + opts["linear"] = "no" else: - opts = {'idref': item} + opts = {"idref": item} try: itm = self.book.get_item_with_id(item) if not itm.is_linear or not is_linear: - opts['linear'] = 'no' + opts["linear"] = "no" except: pass - etree.SubElement(spine, 'itemref', opts) + etree.SubElement(spine, "itemref", opts) def _write_opf_guide(self, root): # - http://www.idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.6 - if len(self.book.guide) > 0 and self.options.get('epub2_guide'): - guide = etree.SubElement(root, 'guide', {}) + if len(self.book.guide) > 0 and self.options.get("epub2_guide"): + guide = etree.SubElement(root, "guide", {}) for item in self.book.guide: - if 'item' in item: - chap = item.get('item') + if "item" in item: + chap = item.get("item") if chap: _href = chap.file_name _title = chap.title else: - _href = item.get('href', '') - _title = item.get('title', '') + _href = item.get("href", "") + _title = item.get("title", "") if _title is None: - _title = '' - ref = etree.SubElement(guide, 'reference', {'type': item.get('type', ''), - 'title': _title, - 'href': _href}) + _title = "" + ref = etree.SubElement( + guide, + "reference", + {"type": item.get("type", ""), "title": _title, "href": _href}, + ) def _write_opf_bindings(self, root): if len(self.book.bindings) > 0: - bindings = etree.SubElement(root, 'bindings', {}) + bindings = etree.SubElement(root, "bindings", {}) for item in self.book.bindings: - etree.SubElement(bindings, 'mediaType', item) + etree.SubElement(bindings, "mediaType", item) def _write_opf_file(self, root): - tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True) + tree_str = etree.tostring( + root, pretty_print=True, encoding="utf-8", xml_declaration=True + ) - self.out.writestr('%s/content.opf' % self.book.FOLDER_NAME, tree_str) + self.out.writestr("%s/content.opf" % self.book.FOLDER_NAME, tree_str) def _write_opf(self): - package_attributes = {'xmlns': NAMESPACES['OPF'], - 'unique-identifier': self.book.IDENTIFIER_ID, - 'version': '3.0'} - if self.book.direction and self.options['package_direction']: - package_attributes['dir'] = self.book.direction + package_attributes = { + "xmlns": NAMESPACES["OPF"], + "unique-identifier": self.book.IDENTIFIER_ID, + "version": "3.0", + } + if self.book.direction and self.options["package_direction"]: + package_attributes["dir"] = self.book.direction - root = etree.Element('package', package_attributes) + root = etree.Element("package", package_attributes) - prefixes = ['rendition: http://www.idpf.org/vocab/rendition/#'] + self.book.prefixes - root.attrib['prefix'] = ' '.join(prefixes) + prefixes = [ + "rendition: http://www.idpf.org/vocab/rendition/#" + ] + self.book.prefixes + root.attrib["prefix"] = " ".join(prefixes) # METADATA self._write_opf_metadata(root) @@ -1117,60 +1223,88 @@ def _write_opf(self): def _get_nav(self, item): # just a basic navigation for now - nav_xml = parse_string(self.book.get_template('nav')) + nav_xml = parse_string(self.book.get_template("nav")) root = nav_xml.getroot() - root.set('lang', self.book.language) - root.attrib['{%s}lang' % NAMESPACES['XML']] = self.book.language + root.set("lang", self.book.language) + root.attrib["{%s}lang" % NAMESPACES["XML"]] = self.book.language nav_dir_name = os.path.dirname(item.file_name) - head = etree.SubElement(root, 'head') - title = etree.SubElement(head, 'title') + head = etree.SubElement(root, "head") + title = etree.SubElement(head, "title") title.text = item.title or self.book.title # for now this just handles css files and ignores others for _link in item.links: - _lnk = etree.SubElement(head, 'link', { - 'href': _link.get('href', ''), 'rel': 'stylesheet', 'type': 'text/css' - }) + _lnk = etree.SubElement( + head, + "link", + { + "href": _link.get("href", ""), + "rel": "stylesheet", + "type": "text/css", + }, + ) - body = etree.SubElement(root, 'body') + body = etree.SubElement(root, "body") if item.direction: - body.set('dir', item.direction) - nav = etree.SubElement(body, 'nav', { - '{%s}type' % NAMESPACES['EPUB']: 'toc', - 'id': 'id', - 'role': 'doc-toc', - }) - - content_title = etree.SubElement(nav, 'h2') + body.set("dir", item.direction) + nav = etree.SubElement( + body, + "nav", + { + "{%s}type" % NAMESPACES["EPUB"]: "toc", + "id": "id", + "role": "doc-toc", + }, + ) + + content_title = etree.SubElement(nav, "h2") content_title.text = item.title or self.book.title def _create_section(itm, items): - ol = etree.SubElement(itm, 'ol') + ol = etree.SubElement(itm, "ol") for item in items: if isinstance(item, tuple) or isinstance(item, list): - li = etree.SubElement(ol, 'li') + li = etree.SubElement(ol, "li") if isinstance(item[0], EpubHtml): - a = etree.SubElement(li, 'a', {'href': zip_path.relpath(item[0].file_name, nav_dir_name)}) - elif isinstance(item[0], Section) and item[0].href != '': - a = etree.SubElement(li, 'a', {'href': zip_path.relpath(item[0].href, nav_dir_name)}) + a = etree.SubElement( + li, + "a", + {"href": zip_path.relpath(item[0].file_name, nav_dir_name)}, + ) + elif isinstance(item[0], Section) and item[0].href != "": + a = etree.SubElement( + li, + "a", + {"href": zip_path.relpath(item[0].href, nav_dir_name)}, + ) elif isinstance(item[0], Link): - a = etree.SubElement(li, 'a', {'href': zip_path.relpath(item[0].href, nav_dir_name)}) + a = etree.SubElement( + li, + "a", + {"href": zip_path.relpath(item[0].href, nav_dir_name)}, + ) else: - a = etree.SubElement(li, 'span') + a = etree.SubElement(li, "span") a.text = item[0].title _create_section(li, item[1]) elif isinstance(item, Link): - li = etree.SubElement(ol, 'li') - a = etree.SubElement(li, 'a', {'href': zip_path.relpath(item.href, nav_dir_name)}) + li = etree.SubElement(ol, "li") + a = etree.SubElement( + li, "a", {"href": zip_path.relpath(item.href, nav_dir_name)} + ) a.text = item.title elif isinstance(item, EpubHtml): - li = etree.SubElement(ol, 'li') - a = etree.SubElement(li, 'a', {'href': zip_path.relpath(item.file_name, nav_dir_name)}) + li = etree.SubElement(ol, "li") + a = etree.SubElement( + li, + "a", + {"href": zip_path.relpath(item.file_name, nav_dir_name)}, + ) a.text = item.title _create_section(nav, self.book.toc) @@ -1178,196 +1312,240 @@ def _create_section(itm, items): # LANDMARKS / GUIDE # - http://www.idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def-types-landmarks - if len(self.book.guide) > 0 and self.options.get('epub3_landmark'): + if len(self.book.guide) > 0 and self.options.get("epub3_landmark"): # Epub2 guide types do not map completely to epub3 landmark types. - guide_to_landscape_map = { - 'notes': 'rearnotes', - 'text': 'bodymatter' - } + guide_to_landscape_map = {"notes": "rearnotes", "text": "bodymatter"} - guide_nav = etree.SubElement(body, 'nav', {'{%s}type' % NAMESPACES['EPUB']: 'landmarks'}) + guide_nav = etree.SubElement( + body, "nav", {"{%s}type" % NAMESPACES["EPUB"]: "landmarks"} + ) - guide_content_title = etree.SubElement(guide_nav, 'h2') - guide_content_title.text = self.options.get('landmark_title', 'Guide') + guide_content_title = etree.SubElement(guide_nav, "h2") + guide_content_title.text = self.options.get("landmark_title", "Guide") - guild_ol = etree.SubElement(guide_nav, 'ol') + guild_ol = etree.SubElement(guide_nav, "ol") for elem in self.book.guide: - li_item = etree.SubElement(guild_ol, 'li') + li_item = etree.SubElement(guild_ol, "li") - if 'item' in elem: - chap = elem.get('item', None) + if "item" in elem: + chap = elem.get("item", None) if chap: _href = chap.file_name _title = chap.title else: - _href = elem.get('href', '') - _title = elem.get('title', '') - - guide_type = elem.get('type', '') - a_item = etree.SubElement(li_item, 'a', { - '{%s}type' % NAMESPACES['EPUB']: guide_to_landscape_map.get(guide_type, guide_type), - 'href': zip_path.relpath(_href, nav_dir_name) - }) + _href = elem.get("href", "") + _title = elem.get("title", "") + + guide_type = elem.get("type", "") + a_item = etree.SubElement( + li_item, + "a", + { + "{%s}type" + % NAMESPACES["EPUB"]: guide_to_landscape_map.get( + guide_type, guide_type + ), + "href": zip_path.relpath(_href, nav_dir_name), + }, + ) a_item.text = _title # PAGE-LIST - if self.options.get('epub3_pages'): - inserted_pages = get_pages_for_items([item for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT) \ - if not isinstance(item, EpubNav)]) + if self.options.get("epub3_pages"): + inserted_pages = get_pages_for_items( + [ + item + for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT) + if not isinstance(item, EpubNav) + ] + ) if len(inserted_pages) > 0: pagelist_nav = etree.SubElement( body, - 'nav', + "nav", { - '{%s}type' % NAMESPACES['EPUB']: 'page-list', - 'id': 'pages', - 'hidden': 'hidden', - } + "{%s}type" % NAMESPACES["EPUB"]: "page-list", + "id": "pages", + "hidden": "hidden", + }, ) - pagelist_content_title = etree.SubElement(pagelist_nav, 'h2') - pagelist_content_title.text = self.options.get( - 'pages_title', 'Pages' - ) - - pages_ol = etree.SubElement(pagelist_nav, 'ol') - + pagelist_content_title = etree.SubElement(pagelist_nav, "h2") + pagelist_content_title.text = self.options.get("pages_title", "Pages") + pages_ol = etree.SubElement(pagelist_nav, "ol") for filename, pageref, label in inserted_pages: - li_item = etree.SubElement(pages_ol, 'li') + li_item = etree.SubElement(pages_ol, "li") - _href = u'{}#{}'.format(filename, pageref) + _href = "{}#{}".format(filename, pageref) _title = label - a_item = etree.SubElement(li_item, 'a', { - 'href': zip_path.relpath(_href, nav_dir_name), - }) + a_item = etree.SubElement( + li_item, + "a", + { + "href": zip_path.relpath(_href, nav_dir_name), + }, + ) a_item.text = _title - tree_str = etree.tostring(nav_xml, pretty_print=True, encoding='utf-8', xml_declaration=True) + tree_str = etree.tostring( + nav_xml, pretty_print=True, encoding="utf-8", xml_declaration=True + ) return tree_str def _get_ncx(self): # we should be able to setup language for NCX as also - ncx = parse_string(self.book.get_template('ncx')) + ncx = parse_string(self.book.get_template("ncx")) root = ncx.getroot() - head = etree.SubElement(root, 'head') + head = etree.SubElement(root, "head") # get this id - uid = etree.SubElement(head, 'meta', {'content': self.book.uid, 'name': 'dtb:uid'}) - uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:depth'}) - uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:totalPageCount'}) - uid = etree.SubElement(head, 'meta', {'content': '0', 'name': 'dtb:maxPageNumber'}) - - doc_title = etree.SubElement(root, 'docTitle') - title = etree.SubElement(doc_title, 'text') + uid = etree.SubElement( + head, "meta", {"content": self.book.uid, "name": "dtb:uid"} + ) + uid = etree.SubElement(head, "meta", {"content": "0", "name": "dtb:depth"}) + uid = etree.SubElement( + head, "meta", {"content": "0", "name": "dtb:totalPageCount"} + ) + uid = etree.SubElement( + head, "meta", {"content": "0", "name": "dtb:maxPageNumber"} + ) + + doc_title = etree.SubElement(root, "docTitle") + title = etree.SubElement(doc_title, "text") title.text = self.book.title -# doc_author = etree.SubElement(root, 'docAuthor') -# author = etree.SubElement(doc_author, 'text') -# author.text = 'Name of the person' + # doc_author = etree.SubElement(root, 'docAuthor') + # author = etree.SubElement(doc_author, 'text') + # author.text = 'Name of the person' # For now just make a very simple navMap - nav_map = etree.SubElement(root, 'navMap') + nav_map = etree.SubElement(root, "navMap") def _add_play_order(nav_point): - nav_point.set('playOrder', str(self._play_order['start_from'])) - self._play_order['start_from'] += 1 + nav_point.set("playOrder", str(self._play_order["start_from"])) + self._play_order["start_from"] += 1 def _create_section(itm, items, uid): for item in items: if isinstance(item, tuple) or isinstance(item, list): section, subsection = item[0], item[1] - np = etree.SubElement(itm, 'navPoint', { - 'id': section.get_id() if isinstance(section, EpubHtml) else 'sep_%d' % uid - }) - - if self._play_order['enabled']: + np = etree.SubElement( + itm, + "navPoint", + { + "id": ( + section.get_id() + if isinstance(section, EpubHtml) + else "sep_%d" % uid + ) + }, + ) + + if self._play_order["enabled"]: _add_play_order(np) - nl = etree.SubElement(np, 'navLabel') - nt = etree.SubElement(nl, 'text') + nl = etree.SubElement(np, "navLabel") + nt = etree.SubElement(nl, "text") nt.text = section.title # CAN NOT HAVE EMPTY SRC HERE - href = '' + href = "" if isinstance(section, EpubHtml): href = section.file_name - elif isinstance(section, Section) and section.href != '': + elif isinstance(section, Section) and section.href != "": href = section.href elif isinstance(section, Link): href = section.href - nc = etree.SubElement(np, 'content', {'src': href}) + nc = etree.SubElement(np, "content", {"src": href}) uid = _create_section(np, subsection, uid + 1) elif isinstance(item, Link): _parent = itm - _content = _parent.find('content') + _content = _parent.find("content") if _content is not None: - if _content.get('src') == '': - _content.set('src', item.href) + if _content.get("src") == "": + _content.set("src", item.href) - np = etree.SubElement(itm, 'navPoint', {'id': item.uid}) + np = etree.SubElement(itm, "navPoint", {"id": item.uid}) - if self._play_order['enabled']: + if self._play_order["enabled"]: _add_play_order(np) - nl = etree.SubElement(np, 'navLabel') - nt = etree.SubElement(nl, 'text') + nl = etree.SubElement(np, "navLabel") + nt = etree.SubElement(nl, "text") nt.text = item.title - nc = etree.SubElement(np, 'content', {'src': item.href}) + nc = etree.SubElement(np, "content", {"src": item.href}) elif isinstance(item, EpubHtml): _parent = itm - _content = _parent.find('content') + _content = _parent.find("content") if _content is not None: - if _content.get('src') == '': - _content.set('src', item.file_name) + if _content.get("src") == "": + _content.set("src", item.file_name) - np = etree.SubElement(itm, 'navPoint', {'id': item.get_id()}) + np = etree.SubElement(itm, "navPoint", {"id": item.get_id()}) - if self._play_order['enabled']: + if self._play_order["enabled"]: _add_play_order(np) - nl = etree.SubElement(np, 'navLabel') - nt = etree.SubElement(nl, 'text') + nl = etree.SubElement(np, "navLabel") + nt = etree.SubElement(nl, "text") nt.text = item.title - nc = etree.SubElement(np, 'content', {'src': item.file_name}) + nc = etree.SubElement(np, "content", {"src": item.file_name}) return uid _create_section(nav_map, self.book.toc, 0) - tree_str = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True) + tree_str = etree.tostring( + root, pretty_print=True, encoding="utf-8", xml_declaration=True + ) return tree_str def _write_items(self): for item in self.book.get_items(): if isinstance(item, EpubNcx): - self.out.writestr('%s/%s' % (self.book.FOLDER_NAME, item.file_name), self._get_ncx()) + self.out.writestr( + "%s/%s" % (self.book.FOLDER_NAME, item.file_name), self._get_ncx() + ) elif isinstance(item, EpubNav): - self.out.writestr('%s/%s' % (self.book.FOLDER_NAME, item.file_name), self._get_nav(item)) + self.out.writestr( + "%s/%s" % (self.book.FOLDER_NAME, item.file_name), + self._get_nav(item), + ) elif item.manifest: - self.out.writestr('%s/%s' % (self.book.FOLDER_NAME, item.file_name), item.get_content()) + self.out.writestr( + "%s/%s" % (self.book.FOLDER_NAME, item.file_name), + item.get_content(), + ) else: - self.out.writestr('%s' % item.file_name, item.get_content()) + self.out.writestr("%s" % item.file_name, item.get_content()) def write(self): # check for the option allowZip64 - self.out = zipfile.ZipFile(self.file_name, 'w', zipfile.ZIP_DEFLATED, compresslevel=self.options['compresslevel']) - self.out.writestr('mimetype', 'application/epub+zip', compress_type=zipfile.ZIP_STORED) + self.out = zipfile.ZipFile( + self.file_name, + "w", + zipfile.ZIP_DEFLATED, + compresslevel=self.options["compresslevel"], + ) + self.out.writestr( + "mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED + ) self._write_container() self._write_opf() @@ -1377,17 +1555,15 @@ def write(self): class EpubReader(object): - DEFAULT_OPTIONS = { - 'ignore_ncx': False - } + DEFAULT_OPTIONS = {"ignore_ncx": False} def __init__(self, epub_file_name, options=None): self.file_name = epub_file_name self.book = EpubBook() self.zf = None - self.opf_file = '' - self.opf_dir = '' + self.opf_file = "" + self.opf_dir = "" self.options = dict(self.DEFAULT_OPTIONS) if options: @@ -1396,19 +1572,21 @@ def __init__(self, epub_file_name, options=None): self._check_deprecated() def _check_deprecated(self): - if self.options.get('ignore_ncx') is None: - warnings.warn('In the future version we will turn default option ignore_ncx to True.') + if self.options.get("ignore_ncx") is None: + warnings.warn( + "In the future version we will turn default option ignore_ncx to True." + ) def process(self): # should cache this html parsing so we don't do it for every plugin - for plg in self.options.get('plugins', []): - if hasattr(plg, 'after_read'): + for plg in self.options.get("plugins", []): + if hasattr(plg, "after_read"): plg.after_read(self.book) for item in self.book.get_items(): if isinstance(item, EpubHtml): - for plg in self.options.get('plugins', []): - if hasattr(plg, 'html_after_read'): + for plg in self.options.get("plugins", []): + if hasattr(plg, "html_after_read"): plg.html_after_read(self.book, item) def load(self): @@ -1422,31 +1600,34 @@ def read_file(self, name): return self.zf.read(name) def _load_container(self): - meta_inf = self.read_file('META-INF/container.xml') + meta_inf = self.read_file("META-INF/container.xml") tree = parse_string(meta_inf) - for root_file in tree.findall('.//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}): - if root_file.get('media-type') == 'application/oebps-package+xml': - self.opf_file = root_file.get('full-path') + for root_file in tree.findall( + ".//xmlns:rootfile[@media-type]", + namespaces={"xmlns": NAMESPACES["CONTAINERNS"]}, + ): + if root_file.get("media-type") == "application/oebps-package+xml": + self.opf_file = root_file.get("full-path") self.opf_dir = zip_path.dirname(self.opf_file) def _load_metadata(self): container_root = self.container.getroot() # get epub version - self.book.version = container_root.get('version', None) + self.book.version = container_root.get("version", None) # get unique-identifier - if container_root.get('unique-identifier', None): - self.book.IDENTIFIER_ID = container_root.get('unique-identifier') + if container_root.get("unique-identifier", None): + self.book.IDENTIFIER_ID = container_root.get("unique-identifier") # get xml:lang # get metadata - metadata = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'metadata')) + metadata = self.container.find("{%s}%s" % (NAMESPACES["OPF"], "metadata")) nsmap = metadata.nsmap - nstags = dict((k, '{%s}' % v) for k, v in six.iteritems(nsmap)) - default_ns = nstags.get(None, '') + nstags = dict((k, "{%s}" % v) for k, v in six.iteritems(nsmap)) + default_ns = nstags.get(None, "") nsdict = dict((v, {}) for v in nsmap.values()) @@ -1460,21 +1641,21 @@ def add_item(ns, tag, value, extra): for t in metadata: if not etree.iselement(t) or t.tag is etree.Comment: continue - if t.tag == default_ns + 'meta': - name = t.get('name') + if t.tag == default_ns + "meta": + name = t.get("name") others = dict((k, v) for k, v in t.items()) - if name and ':' in name: - prefix, name = name.split(':', 1) + if name and ":" in name: + prefix, name = name.split(":", 1) else: prefix = None add_item(t.nsmap.get(prefix, prefix), name, t.text, others) else: - tag = t.tag[t.tag.rfind('}') + 1:] + tag = t.tag[t.tag.rfind("}") + 1 :] - if (t.prefix and t.prefix.lower() == 'dc') and tag == 'identifier': - _id = t.get('id', None) + if (t.prefix and t.prefix.lower() == "dc") and tag == "identifier": + _id = t.get("id", None) if _id: self.book.IDENTIFIER_ID = _id @@ -1484,77 +1665,87 @@ def add_item(ns, tag, value, extra): self.book.metadata = nsdict - titles = self.book.get_metadata('DC', 'title') + titles = self.book.get_metadata("DC", "title") if len(titles) > 0: self.book.title = titles[0][0] - for value, others in self.book.get_metadata('DC', 'identifier'): - if others.get('id') == self.book.IDENTIFIER_ID: + for value, others in self.book.get_metadata("DC", "identifier"): + if others.get("id") == self.book.IDENTIFIER_ID: self.book.uid = value def _load_manifest(self): - for r in self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'manifest')): - if r is not None and r.tag != '{%s}item' % NAMESPACES['OPF']: + for r in self.container.find("{%s}%s" % (NAMESPACES["OPF"], "manifest")): + if r is not None and r.tag != "{%s}item" % NAMESPACES["OPF"]: continue - media_type = r.get('media-type') - _properties = r.get('properties', '') + media_type = r.get("media-type") + _properties = r.get("properties", "") if _properties: - properties = _properties.split(' ') + properties = _properties.split(" ") else: properties = [] # people use wrong content types - if media_type == 'image/jpg': - media_type = 'image/jpeg' + if media_type == "image/jpg": + media_type = "image/jpeg" - if media_type == 'application/x-dtbncx+xml': - ei = EpubNcx(uid=r.get('id'), file_name=unquote(r.get('href'))) + if media_type == "application/x-dtbncx+xml": + ei = EpubNcx(uid=r.get("id"), file_name=unquote(r.get("href"))) ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name)) - elif media_type == 'application/smil+xml': - ei = EpubSMIL(uid=r.get('id'), file_name=unquote(r.get('href'))) + elif media_type == "application/smil+xml": + ei = EpubSMIL(uid=r.get("id"), file_name=unquote(r.get("href"))) ei.content = self.read_file(zip_path.join(self.opf_dir, ei.file_name)) - elif media_type == 'application/xhtml+xml': - if 'nav' in properties: - ei = EpubNav(uid=r.get('id'), file_name=unquote(r.get('href'))) - - ei.content = self.read_file(zip_path.join(self.opf_dir, r.get('href'))) - elif 'cover' in properties: + elif media_type == "application/xhtml+xml": + if "nav" in properties: + ei = EpubNav(uid=r.get("id"), file_name=unquote(r.get("href"))) + + ei.content = self.read_file( + zip_path.join(self.opf_dir, r.get("href")) + ) + elif "cover" in properties: ei = EpubCoverHtml() - ei.content = self.read_file(zip_path.join(self.opf_dir, unquote(r.get('href')))) + ei.content = self.read_file( + zip_path.join(self.opf_dir, unquote(r.get("href"))) + ) else: ei = EpubHtml() - ei.id = r.get('id') - ei.file_name = unquote(r.get('href')) + ei.id = r.get("id") + ei.file_name = unquote(r.get("href")) ei.media_type = media_type - ei.media_overlay = r.get('media-overlay', None) - ei.media_duration = r.get('duration', None) - ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + ei.media_overlay = r.get("media-overlay", None) + ei.media_duration = r.get("duration", None) + ei.content = self.read_file( + zip_path.join(self.opf_dir, ei.get_name()) + ) ei.properties = properties elif media_type in IMAGE_MEDIA_TYPES: - if 'cover-image' in properties: - ei = EpubCover(uid=r.get('id'), file_name=unquote(r.get('href'))) + if "cover-image" in properties: + ei = EpubCover(uid=r.get("id"), file_name=unquote(r.get("href"))) ei.media_type = media_type - ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + ei.content = self.read_file( + zip_path.join(self.opf_dir, ei.get_name()) + ) else: ei = EpubImage() - ei.id = r.get('id') - ei.file_name = unquote(r.get('href')) + ei.id = r.get("id") + ei.file_name = unquote(r.get("href")) ei.media_type = media_type - ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) + ei.content = self.read_file( + zip_path.join(self.opf_dir, ei.get_name()) + ) else: # different types ei = EpubItem() - ei.id = r.get('id') - ei.file_name = unquote(r.get('href')) + ei.id = r.get("id") + ei.file_name = unquote(r.get("href")) ei.media_type = media_type ei.content = self.read_file(zip_path.join(self.opf_dir, ei.get_name())) @@ -1565,34 +1756,33 @@ def _parse_ncx(self, data): tree = parse_string(data) tree_root = tree.getroot() - nav_map = tree_root.find('{%s}navMap' % NAMESPACES['DAISY']) + nav_map = tree_root.find("{%s}navMap" % NAMESPACES["DAISY"]) def _get_children(elems, n, nid): - label, content = '', '' + label, content = "", "" children = [] for a in elems.getchildren(): - if a.tag == '{%s}navLabel' % NAMESPACES['DAISY']: + if a.tag == "{%s}navLabel" % NAMESPACES["DAISY"]: label = a.getchildren()[0].text - if a.tag == '{%s}content' % NAMESPACES['DAISY']: - content = a.get('src', '') - if a.tag == '{%s}navPoint' % NAMESPACES['DAISY']: - children.append(_get_children(a, n + 1, a.get('id', ''))) + if a.tag == "{%s}content" % NAMESPACES["DAISY"]: + content = a.get("src", "") + if a.tag == "{%s}navPoint" % NAMESPACES["DAISY"]: + children.append(_get_children(a, n + 1, a.get("id", ""))) if len(children) > 0: if n == 0: return children - return (Section(label, href=content), - children) + return (Section(label, href=content), children) else: return Link(content, label, nid) - self.book.toc = _get_children(nav_map, 0, '') + self.book.toc = _get_children(nav_map, 0, "") - def _parse_nav(self, data, base_path, navtype='toc'): + def _parse_nav(self, data, base_path, navtype="toc"): html_node = parse_html_string(data) - if navtype == 'toc': + if navtype == "toc": # parsing the table of contents nav_node = html_node.xpath("//nav[@*='toc']")[0] else: @@ -1605,33 +1795,37 @@ def _parse_nav(self, data, base_path, navtype='toc'): def parse_list(list_node): items = [] - for item_node in list_node.findall('li'): + for item_node in list_node.findall("li"): - sublist_node = item_node.find('ol') - link_node = item_node.find('a') + sublist_node = item_node.find("ol") + link_node = item_node.find("a") if sublist_node is not None: title = item_node[0].text_content() children = parse_list(sublist_node) if link_node is not None: - href = zip_path.normpath(zip_path.join(base_path, link_node.get('href'))) + href = zip_path.normpath( + zip_path.join(base_path, link_node.get("href")) + ) items.append((Section(title, href=href), children)) else: items.append((Section(title), children)) elif link_node is not None: title = link_node.text_content() - href = zip_path.normpath(zip_path.join(base_path, link_node.get('href'))) + href = zip_path.normpath( + zip_path.join(base_path, link_node.get("href")) + ) items.append(Link(href, title)) return items - if navtype == 'toc': - self.book.toc = parse_list(nav_node.find('ol')) + if navtype == "toc": + self.book.toc = parse_list(nav_node.find("ol")) elif nav_node is not None: # generate the pages list if there is one - self.book.pages = parse_list(nav_node.find('ol')) + self.book.pages = parse_list(nav_node.find("ol")) # generate the per-file pages lists # because of the order of parsing the files, this can't be done @@ -1642,41 +1836,50 @@ def parse_list(list_node): htmlfiles[htmlfile.file_name] = htmlfile for page in self.book.pages: try: - (filename, idref) = page.href.split('#') + (filename, idref) = page.href.split("#") except ValueError: filename = page.href if filename in htmlfiles: htmlfiles[filename].pages.append(page) def _load_spine(self): - spine = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'spine')) + spine = self.container.find("{%s}%s" % (NAMESPACES["OPF"], "spine")) - self.book.spine = [(t.get('idref'), t.get('linear', 'yes')) for t in spine] + self.book.spine = [(t.get("idref"), t.get("linear", "yes")) for t in spine] - toc = spine.get('toc', '') - self.book.set_direction(spine.get('page-progression-direction', None)) + toc = spine.get("toc", "") + self.book.set_direction(spine.get("page-progression-direction", None)) # should read ncx or nav file - nav_item = next((item for item in self.book.items if isinstance(item, EpubNav)), None) + nav_item = next( + (item for item in self.book.items if isinstance(item, EpubNav)), None + ) if toc: - if not self.options.get('ignore_ncx') or not nav_item: + if not self.options.get("ignore_ncx") or not nav_item: try: - ncxFile = self.read_file(zip_path.join(self.opf_dir, self.book.get_item_with_id(toc).get_name())) + ncxFile = self.read_file( + zip_path.join( + self.opf_dir, self.book.get_item_with_id(toc).get_name() + ) + ) except KeyError: - raise EpubException(-1, 'Can not find ncx file.') + raise EpubException(-1, "Can not find ncx file.") self._parse_ncx(ncxFile) def _load_guide(self): - guide = self.container.find('{%s}%s' % (NAMESPACES['OPF'], 'guide')) + guide = self.container.find("{%s}%s" % (NAMESPACES["OPF"], "guide")) if guide is not None: - self.book.guide = [{'href': t.get('href'), 'title': t.get('title'), 'type': t.get('type')} for t in guide] + self.book.guide = [ + {"href": t.get("href"), "title": t.get("title"), "type": t.get("type")} + for t in guide + ] def _load_opf_file(self): try: s = self.read_file(self.opf_file) except KeyError: - raise EpubException(-1, 'Can not find container file') + raise EpubException(-1, "Can not find container file") self.container = parse_string(s) @@ -1687,18 +1890,18 @@ def _load_opf_file(self): # read nav file if found # - nav_item = next((item for item in self.book.items if isinstance(item, EpubNav)), None) + nav_item = next( + (item for item in self.book.items if isinstance(item, EpubNav)), None + ) if nav_item: - if self.options.get('ignore_ncx') or not self.book.toc: + if self.options.get("ignore_ncx") or not self.book.toc: self._parse_nav( nav_item.content, zip_path.dirname(nav_item.file_name), - navtype='toc' + navtype="toc", ) self._parse_nav( - nav_item.content, - zip_path.dirname(nav_item.file_name), - navtype='pages' + nav_item.content, zip_path.dirname(nav_item.file_name), navtype="pages" ) def _load(self): @@ -1707,7 +1910,7 @@ def _load(self): class Directory: def read(self, subname): - with open(os.path.join(file_name, subname), 'rb') as fp: + with open(os.path.join(file_name, subname), "rb") as fp: return fp.read() def close(self): @@ -1716,11 +1919,16 @@ def close(self): self.zf = Directory() else: try: - self.zf = zipfile.ZipFile(self.file_name, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True) + self.zf = zipfile.ZipFile( + self.file_name, + "r", + compression=zipfile.ZIP_DEFLATED, + allowZip64=True, + ) except zipfile.BadZipfile as bz: - raise EpubException(0, 'Bad Zip file') + raise EpubException(0, "Bad Zip file") except zipfile.LargeZipFile as bz: - raise EpubException(1, 'Large Zip file') + raise EpubException(1, "Large Zip file") # 1st check metadata self._load_container() @@ -1729,9 +1937,9 @@ def close(self): self.zf.close() - # WRITE + def write_epub(name, book, options=None): """ Creates epub file with the content defined in EpubBook. @@ -1750,15 +1958,18 @@ def write_epub(name, book, options=None): try: epub.write() except IOError: - warnings.warn('In the future throwing exceptions while writing will be default behavior.') + warnings.warn( + "In the future throwing exceptions while writing will be default behavior." + ) t, v, tb = sys.exc_info() - if options and options.get('raise_exceptions'): + if options and options.get("raise_exceptions"): six.reraise(t, v, tb) else: return False return True + # READ diff --git a/ebooklib/plugins/booktype.py b/ebooklib/plugins/booktype.py index 9842538..1b0033d 100644 --- a/ebooklib/plugins/booktype.py +++ b/ebooklib/plugins/booktype.py @@ -17,14 +17,15 @@ from ebooklib.plugins.base import BasePlugin from ebooklib.utils import parse_html_string + class BooktypeLinks(BasePlugin): - NAME = 'Booktype Links' + NAME = "Booktype Links" def __init__(self, booktype_book): self.booktype_book = booktype_book def html_before_write(self, book, chapter): - from lxml import etree + from lxml import etree try: from urlparse import urlparse, urljoin @@ -38,44 +39,44 @@ def html_before_write(self, book, chapter): root = tree.getroottree() - if len(root.find('body')) != 0: - body = tree.find('body') + if len(root.find("body")) != 0: + body = tree.find("body") # should also be aware to handle # ../chapter/ # ../chapter/#reference # ../chapter#reference - for _link in body.xpath('//a'): + for _link in body.xpath("//a"): # This is just temporary for the footnotes - if _link.get('href', '').find('InsertNoteID') != -1: - _ln = _link.get('href', '') - i = _ln.find('#') - _link.set('href', _ln[i:]) + if _link.get("href", "").find("InsertNoteID") != -1: + _ln = _link.get("href", "") + i = _ln.find("#") + _link.set("href", _ln[i:]) continue - _u = urlparse(_link.get('href', '')) + _u = urlparse(_link.get("href", "")) # Let us care only for internal links at the moment - if _u.scheme == '': - if _u.path != '': - _link.set('href', '%s.xhtml' % _u.path) - - if _u.fragment != '': - _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment)) - - if _link.get('name') != None: - _link.set('id', _link.get('name')) - etree.strip_attributes(_link, 'name') + if _u.scheme == "": + if _u.path != "": + _link.set("href", "%s.xhtml" % _u.path) - chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8') + if _u.fragment != "": + _link.set( + "href", urljoin(_link.get("href"), "#%s" % _u.fragment) + ) + if _link.get("name") != None: + _link.set("id", _link.get("name")) + etree.strip_attributes(_link, "name") + chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8") class BooktypeFootnotes(BasePlugin): - NAME = 'Booktype Footnotes' + NAME = "Booktype Footnotes" def __init__(self, booktype_book): self.booktype_book = booktype_book @@ -92,8 +93,8 @@ def html_before_write(self, book, chapter): root = tree.getroottree() - if len(root.find('body')) != 0: - body = tree.find('body') + if len(root.find("body")) != 0: + body = tree.find("body") # 1 #
  1. prvi footnote ^
  2. @@ -101,19 +102,19 @@ def html_before_write(self, book, chapter): # 1

    # for footnote in body.xpath('//span[@class="InsertNoteMarker"]'): - footnote_id = footnote.get('id')[:-8] + footnote_id = footnote.get("id")[:-8] a = footnote.getchildren()[0].getchildren()[0] footnote_text = body.xpath('//li[@id="%s"]' % footnote_id)[0] - a.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'noteref' - ftn = etree.SubElement(body, 'aside', {'id': footnote_id}) - ftn.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'footnote' - ftn_p = etree.SubElement(ftn, 'p') + a.attrib["{%s}type" % epub.NAMESPACES["EPUB"]] = "noteref" + ftn = etree.SubElement(body, "aside", {"id": footnote_id}) + ftn.attrib["{%s}type" % epub.NAMESPACES["EPUB"]] = "footnote" + ftn_p = etree.SubElement(ftn, "p") ftn_p.text = footnote_text.text old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]') if len(old_footnote) > 0: body.remove(old_footnote[0]) - chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8') + chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8") diff --git a/ebooklib/plugins/sourcecode.py b/ebooklib/plugins/sourcecode.py index 4f973a2..a279737 100644 --- a/ebooklib/plugins/sourcecode.py +++ b/ebooklib/plugins/sourcecode.py @@ -17,7 +17,8 @@ from ebooklib.plugins.base import BasePlugin from ebooklib.utils import parse_html_string -class SourceHighlighter(BasePlugin): + +class SourceHighlighter(BasePlugin): def __init__(self): pass @@ -38,24 +39,26 @@ def html_before_write(self, book, chapter): had_source = False - if len(root.find('body')) != 0: - body = tree.find('body') + if len(root.find("body")) != 0: + body = tree.find("body") # check for embeded source for source in body.xpath('//pre[contains(@class,"source-")]'): - css_class = source.get('class') + css_class = source.get("class") - source_text = (source.text or '') + ''.join([html.tostring(child) for child in source.iterchildren()]) + source_text = (source.text or "") + "".join( + [html.tostring(child) for child in source.iterchildren()] + ) - if 'source-python' in css_class: + if "source-python" in css_class: from pygments.lexers import PythonLexer -# _text = highlight(source_text, PythonLexer(), HtmlFormatter(linenos="inline")) - _text = highlight(source_text, PythonLexer(), HtmlFormatter()) + # _text = highlight(source_text, PythonLexer(), HtmlFormatter(linenos="inline")) + _text = highlight(source_text, PythonLexer(), HtmlFormatter()) - if 'source-css' in css_class: + if "source-css" in css_class: from pygments.lexers import CssLexer - _text = highlight(source_text, CssLexer(), HtmlFormatter()) + _text = highlight(source_text, CssLexer(), HtmlFormatter()) _parent = source.getparent() _parent.replace(source, etree.XML(_text)) @@ -64,5 +67,4 @@ def html_before_write(self, book, chapter): if had_source: chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css") - chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8') - + chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8") diff --git a/ebooklib/plugins/standard.py b/ebooklib/plugins/standard.py index 61576f9..94ebdef 100644 --- a/ebooklib/plugins/standard.py +++ b/ebooklib/plugins/standard.py @@ -23,14 +23,48 @@ # - should also look for the _required_ elements # http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element -ATTRIBUTES_GLOBAL = ['accesskey', 'class', 'contenteditable', 'contextmenu', 'dir', 'draggable', - 'dropzone', 'hidden', 'id', 'inert', 'itemid', 'itemprop', 'itemref', - 'itemscope', 'itemtype', 'lang', 'spellcheck', 'style', 'tabindex', - 'title', 'translate', 'epub:type'] +ATTRIBUTES_GLOBAL = [ + "accesskey", + "class", + "contenteditable", + "contextmenu", + "dir", + "draggable", + "dropzone", + "hidden", + "id", + "inert", + "itemid", + "itemprop", + "itemref", + "itemscope", + "itemtype", + "lang", + "spellcheck", + "style", + "tabindex", + "title", + "translate", + "epub:type", +] # Remove for now from here -DEPRECATED_TAGS = ['acronym', 'applet', 'basefont', 'big', 'center', 'dir', 'font', 'frame', - 'frameset', 'isindex', 'noframes', 's', 'strike', 'tt'] +DEPRECATED_TAGS = [ + "acronym", + "applet", + "basefont", + "big", + "center", + "dir", + "font", + "frame", + "frameset", + "isindex", + "noframes", + "s", + "strike", + "tt", +] def leave_only(item, tag_list): @@ -40,7 +74,7 @@ def leave_only(item, tag_list): class SyntaxPlugin(BasePlugin): - NAME = 'Check HTML syntax' + NAME = "Check HTML syntax" def html_before_write(self, book, chapter): from lxml import etree @@ -57,130 +91,307 @@ def html_before_write(self, book, chapter): for tag in DEPRECATED_TAGS: etree.strip_tags(root, tag) - head = tree.find('head') - + head = tree.find("head") + if head is not None and len(head) != 0: - + for _item in head: - if _item.tag == 'base': - leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target']) - elif _item.tag == 'link': - leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes']) - elif _item.tag == 'title': - if _item.text == '': + if _item.tag == "base": + leave_only(_item, ATTRIBUTES_GLOBAL + ["href", "target"]) + elif _item.tag == "link": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + [ + "href", + "crossorigin", + "rel", + "media", + "hreflang", + "type", + "sizes", + ], + ) + elif _item.tag == "title": + if _item.text == "": head.remove(_item) - elif _item.tag == 'meta': - leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'http-equiv', 'content', 'charset']) + elif _item.tag == "meta": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + ["name", "http-equiv", "content", "charset"], + ) # just remove for now, but really should not be like this - head.remove(_item) - elif _item.tag == 'script': - leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'charset', 'async', 'defer', 'crossorigin']) - elif _item.tag == 'source': - leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'media']) - elif _item.tag == 'style': - leave_only(_item, ATTRIBUTES_GLOBAL + ['media', 'type', 'scoped']) + head.remove(_item) + elif _item.tag == "script": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + ["src", "type", "charset", "async", "defer", "crossorigin"], + ) + elif _item.tag == "source": + leave_only(_item, ATTRIBUTES_GLOBAL + ["src", "type", "media"]) + elif _item.tag == "style": + leave_only(_item, ATTRIBUTES_GLOBAL + ["media", "type", "scoped"]) else: leave_only(_item, ATTRIBUTES_GLOBAL) - - if len(root.find('body')) != 0: - body = tree.find('body') + if len(root.find("body")) != 0: + body = tree.find("body") for _item in body.iter(): # it is not # - - if _item.tag == 'a': - leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target', 'download', 'rel', 'hreflang', 'type']) - elif _item.tag == 'area': - leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type']) - elif _item.tag == 'audio': - leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls']) - elif _item.tag == 'blockquote': - leave_only(_item, ATTRIBUTES_GLOBAL + ['cite']) - elif _item.tag == 'button': - leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate', - 'formtarget', 'name', 'type', 'value', 'menu']) - elif _item.tag == 'canvas': - leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height']) - elif _item.tag == 'canvas': - leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height']) - elif _item.tag == 'del': - leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime']) - elif _item.tag == 'details': - leave_only(_item, ATTRIBUTES_GLOBAL + ['open']) - elif _item.tag == 'embed': - leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'width', 'height']) - elif _item.tag == 'fieldset': - leave_only(_item, ATTRIBUTES_GLOBAL + ['disable', 'form', 'name']) - elif _item.tag == 'details': - leave_only(_item, ATTRIBUTES_GLOBAL + ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target']) - elif _item.tag == 'iframe': - leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height']) - elif _item.tag == 'img': - _src = _item.get('src', '').lower() - if _src.startswith('http://') or _src.startswith('https://'): - if 'remote-resources' not in chapter.properties: - chapter.properties.append('remote-resources') + + if _item.tag == "a": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + ["href", "target", "download", "rel", "hreflang", "type"], + ) + elif _item.tag == "area": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + [ + "alt", + "coords", + "shape", + "href", + "target", + "download", + "rel", + "hreflang", + "type", + ], + ) + elif _item.tag == "audio": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + [ + "src", + "crossorigin", + "preload", + "autoplay", + "mediagroup", + "loop", + "muted", + "controls", + ], + ) + elif _item.tag == "blockquote": + leave_only(_item, ATTRIBUTES_GLOBAL + ["cite"]) + elif _item.tag == "button": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + [ + "autofocus", + "disabled", + "form", + "formaction", + "formenctype", + "formmethod", + "formnovalidate", + "formtarget", + "name", + "type", + "value", + "menu", + ], + ) + elif _item.tag == "canvas": + leave_only(_item, ATTRIBUTES_GLOBAL + ["width", "height"]) + elif _item.tag == "canvas": + leave_only(_item, ATTRIBUTES_GLOBAL + ["width", "height"]) + elif _item.tag == "del": + leave_only(_item, ATTRIBUTES_GLOBAL + ["cite", "datetime"]) + elif _item.tag == "details": + leave_only(_item, ATTRIBUTES_GLOBAL + ["open"]) + elif _item.tag == "embed": + leave_only( + _item, ATTRIBUTES_GLOBAL + ["src", "type", "width", "height"] + ) + elif _item.tag == "fieldset": + leave_only(_item, ATTRIBUTES_GLOBAL + ["disable", "form", "name"]) + elif _item.tag == "details": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + [ + "accept-charset", + "action", + "autocomplete", + "enctype", + "method", + "name", + "novalidate", + "target", + ], + ) + elif _item.tag == "iframe": + leave_only( + _item, + ATTRIBUTES_GLOBAL + + [ + "src", + "srcdoc", + "name", + "sandbox", + "seamless", + "allowfullscreen", + "width", + "height", + ], + ) + elif _item.tag == "img": + _src = _item.get("src", "").lower() + if _src.startswith("http://") or _src.startswith("https://"): + if "remote-resources" not in chapter.properties: + chapter.properties.append("remote-resources") # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES # THAT MEANS I SHOULD ALSO CATCH 0: text = headers[0].text_content().strip() @@ -97,20 +99,20 @@ def get_pages(item): pages = [] for elem in body.iter(): - if 'epub:type' in elem.attrib: - if elem.get('id') is not None: + if "epub:type" in elem.attrib: + if elem.get("id") is not None: _text = None - - if elem.text is not None and elem.text.strip() != '': + + if elem.text is not None and elem.text.strip() != "": _text = elem.text.strip() if _text is None: - _text = elem.get('aria-label') + _text = elem.get("aria-label") if _text is None: _text = get_headers(elem) - pages.append((item.get_name(), elem.get('id'), _text or elem.get('id'))) + pages.append((item.get_name(), elem.get("id"), _text or elem.get("id"))) return pages diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..db272a8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +lxml +six