Skip to content

Commit

Permalink
pdf_helper: multi-attachments
Browse files Browse the repository at this point in the history
Replace code by odoo.tools.pdf
  • Loading branch information
jbaudoux authored and bosd committed Jul 12, 2024
1 parent 6386e42 commit 5066863
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 54 deletions.
20 changes: 10 additions & 10 deletions pdf_helper/models/helper.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Copyright 2022 Camptocamp SA
# @author: Simone Orsi <[email protected]>
# Copyright 2023 Jacques-Etienne Baudoux (BCIM) <[email protected]>
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
import logging

from PyPDF2.utils import PdfReadError
import logging

from odoo import models
from odoo import api, models

from ..utils import PDFParser

Expand All @@ -18,16 +18,16 @@ class PDFHelper(models.AbstractModel):

_PDF_PARSER_KLASS = PDFParser

@api.model
def pdf_get_xml_files(self, pdf_file):
"""Extract XML attachments from pdf
:param pdf_file: binary PDF file content
:returns: a dict like {$filename: $parsed_xml_file_obj}.
"""
parser = self._PDF_PARSER_KLASS(pdf_file)
try:
return parser.get_xml_files()
except self._pdf_get_xml_files_swallable_exceptions() as err:
# TODO: can't we catch specific exceptions?
# This try/except block was added to reflect what was done
# in base_business_document_import until now.
except parser.get_xml_files_swallable_exceptions() as err:
_logger.error("PDF file parsing failed: %s", str(err))
return {}

def _pdf_get_xml_files_swallable_exceptions(self):
    """Return the exception classes tolerated during XML extraction.

    :returns: a tuple of exception classes, usable directly as the
        target of an ``except`` clause.
    """
    swallowable = (KeyError, PdfReadError)
    return swallowable
1 change: 1 addition & 0 deletions pdf_helper/readme/CONTRIBUTORS.rst
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
* Simone Orsi <[email protected]>
* Alexis de Lattre <[email protected]>
* Jacques-Etienne Baudoux (BCIM) <[email protected]>
4 changes: 2 additions & 2 deletions pdf_helper/tests/test_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ def test_parse_xml(self):


class TestPDFHelper(TransactionCase):
def test_parse_xml(self):
def test_get_xml(self):
pdf_content = read_test_file("pdf_with_xml_test.pdf", mode="rb")
res = self.env["pdf.helper"].pdf_get_xml_files(pdf_content)
fname, xml_root = tuple(res.items())[0]
self.assertEqual(fname, "factur-x.xml")
self.assertTrue(isinstance(xml_root, etree._Element))

def test_parse_xml_fail(self):
def test_get_xml_fail(self):
with self.assertLogs(
"odoo.addons.pdf_helper.models.helper", level="ERROR"
) as log_catcher:
Expand Down
62 changes: 20 additions & 42 deletions pdf_helper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).

import logging
import mimetypes
from io import BytesIO
from struct import error as StructError

from lxml import etree

_logger = logging.getLogger(__name__)

try:
import PyPDF2
from PyPDF2.errors import PdfReadError
except ImportError:
_logger.debug("Cannot import PyPDF2")
from PyPDF2.utils import PdfReadError

from odoo.tools.pdf import OdooPdfFileReader

_logger = logging.getLogger(__name__)


class PDFParser:
Expand All @@ -29,42 +31,18 @@ def get_xml_files(self):
:returns: a dict like {$filename: $parsed_xml_file_obj}.
"""
res = {}
with BytesIO(self.pdf_file) as fd:
xmlfiles = self._extract_xml_files(fd)
for filename, xml_obj in xmlfiles.items():
root = self._extract_xml_root(xml_obj)
if root is None or not len(root):
continue
res[filename] = root
if res:
_logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
with BytesIO(self.pdf_file) as buffer:
pdf_reader = OdooPdfFileReader(buffer, strict=False)

# Process embedded files.
for xml_name, content in pdf_reader.getAttachments():
try:
res[xml_name] = etree.fromstring(content)
except Exception:
_logger.debug("Non XML file found in PDF")
if res:
_logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
return res

def _extract_xml_files(self, fd):
    """Collect the XML files embedded in a PDF.

    :param fd: binary file-like object holding the PDF content.
    :returns: a dict like {$filename: $pdf_file_spec_obj} restricted to
        entries whose filename is guessed to be XML.
    :raises KeyError: when the PDF catalog has no
        ``/Names -> /EmbeddedFiles -> /Names`` entry.
    """
    pdf = PyPDF2.PdfFileReader(fd)
    _logger.debug("pdf.trailer=%s", pdf.trailer)
    pdf_root = pdf.trailer["/Root"]
    _logger.debug("pdf_root=%s", pdf_root)
    # TODO add support for /Kids
    embeddedfiles = pdf_root["/Names"]["/EmbeddedFiles"]["/Names"]
    xml_mimetypes = ("application/xml", "text/xml")
    xmlfiles = {}  # key = filename, value = PDF obj
    # The array alternates [name1, spec1, name2, spec2, ...]: walk it pair
    # by pair so that only the names are ever handed to mimetypes. Visiting
    # every element would treat file specs as filenames as soon as the PDF
    # carries more than one attachment.
    for filename, file_spec in zip(embeddedfiles[::2], embeddedfiles[1::2]):
        if mimetypes.guess_type(filename)[0] in xml_mimetypes:
            xmlfiles[filename] = file_spec
    _logger.debug("xmlfiles=%s", xmlfiles)
    return xmlfiles

def _extract_xml_root(self, xml_obj):
    """Parse the XML payload carried by an embedded PDF object.

    :param xml_obj: PDF object exposing ``getObject()``; the resolved
        mapping is read at ``["/EF"]["/F"]`` to fetch the raw data.
    :returns: the parsed XML root element, or ``None`` when anything
        goes wrong while resolving or parsing the payload.
    """
    try:
        file_spec = xml_obj.getObject()
        _logger.debug("xml_file_dict=%s", file_spec)
        payload = file_spec["/EF"]["/F"].getData()
        return etree.fromstring(payload)
    except Exception as err:
        # TODO: can't we catch specific exceptions?
        # Best-effort on purpose: a broken attachment is only logged.
        _logger.debug("_pdf_extract_xml_root failed: %s", str(err))
    return None
def get_xml_files_swallable_exceptions(self):
    """Return the exception classes callers may swallow when parsing fails.

    :returns: a tuple of exception classes, usable directly as the
        target of an ``except`` clause.
    """
    tolerated = (NotImplementedError, StructError, PdfReadError)
    return tolerated

0 comments on commit 5066863

Please sign in to comment.