diff --git a/pdf_helper/models/helper.py b/pdf_helper/models/helper.py index 52b30355b4..cc34487d90 100644 --- a/pdf_helper/models/helper.py +++ b/pdf_helper/models/helper.py @@ -1,11 +1,11 @@ # Copyright 2022 Camptocamp SA # @author: Simone Orsi +# Copyright 2023 Jacques-Etienne Baudoux (BCIM) # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl). -import logging -from PyPDF2.utils import PdfReadError +import logging -from odoo import models +from odoo import api, models from ..utils import PDFParser @@ -18,16 +18,16 @@ class PDFHelper(models.AbstractModel): _PDF_PARSER_KLASS = PDFParser + @api.model def pdf_get_xml_files(self, pdf_file): + """Extract XML attachments from pdf + + :param pdf_file: binary PDF file content + :returns: a dict like {$filename: $parsed_xml_file_obj}. + """ parser = self._PDF_PARSER_KLASS(pdf_file) try: return parser.get_xml_files() - except self._pdf_get_xml_files_swallable_exceptions() as err: - # TODO: can't we catch specific exceptions? - # This try/except block was added to reflect what done - # in base_business_document_import till now. + except parser.get_xml_files_swallable_exceptions() as err: _logger.error("PDF file parsing failed: %s", str(err)) return {} - - def _pdf_get_xml_files_swallable_exceptions(self): - return (KeyError, PdfReadError) diff --git a/pdf_helper/readme/CONTRIBUTORS.rst b/pdf_helper/readme/CONTRIBUTORS.rst index fe493ea973..ad925fe476 100644 --- a/pdf_helper/readme/CONTRIBUTORS.rst +++ b/pdf_helper/readme/CONTRIBUTORS.rst @@ -1,2 +1,3 @@ * Simone Orsi * Alexis de Lattre +* Jacques-Etienne Baudoux (BCIM) diff --git a/pdf_helper/tests/test_helper.py b/pdf_helper/tests/test_helper.py index 9b956e2e3a..72ca44c900 100644 --- a/pdf_helper/tests/test_helper.py +++ b/pdf_helper/tests/test_helper.py @@ -28,14 +28,14 @@ def test_parse_xml(self): class TestPDFHelper(TransactionCase): - def test_parse_xml(self): + def test_get_xml(self): pdf_content = read_test_file("pdf_with_xml_test.pdf", mode="rb") res = self.env["pdf.helper"].pdf_get_xml_files(pdf_content) fname, xml_root = tuple(res.items())[0] self.assertEqual(fname, "factur-x.xml") self.assertTrue(isinstance(xml_root, etree._Element)) - def test_parse_xml_fail(self): + def test_get_xml_fail(self): with self.assertLogs( "odoo.addons.pdf_helper.models.helper", level="ERROR" ) as log_catcher: diff --git a/pdf_helper/utils.py b/pdf_helper/utils.py index eceb939b5d..23ad16c5f0 100644 --- a/pdf_helper/utils.py +++ b/pdf_helper/utils.py @@ -5,17 +5,19 @@ # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl). import logging -import mimetypes from io import BytesIO +from struct import error as StructError from lxml import etree -_logger = logging.getLogger(__name__) - try: - import PyPDF2 + from PyPDF2.errors import PdfReadError except ImportError: - _logger.debug("Cannot import PyPDF2") + from PyPDF2.utils import PdfReadError + +from odoo.tools.pdf import OdooPdfFileReader + +_logger = logging.getLogger(__name__) class PDFParser: @@ -29,42 +31,18 @@ def get_xml_files(self): :returns: a dict like {$filename: $parsed_xml_file_obj}. """ res = {} - with BytesIO(self.pdf_file) as fd: - xmlfiles = self._extract_xml_files(fd) - for filename, xml_obj in xmlfiles.items(): - root = self._extract_xml_root(xml_obj) - if root is None or not len(root): - continue - res[filename] = root - if res: - _logger.debug("Valid XML files found in PDF: %s", list(res.keys())) + with BytesIO(self.pdf_file) as buffer: + pdf_reader = OdooPdfFileReader(buffer, strict=False) + + # Process embedded files. + for xml_name, content in pdf_reader.getAttachments(): + try: + res[xml_name] = etree.fromstring(content) + except Exception: + _logger.debug("Non XML file found in PDF") + if res: + _logger.debug("Valid XML files found in PDF: %s", list(res.keys())) return res - def _extract_xml_files(self, fd): - pdf = PyPDF2.PdfFileReader(fd) - _logger.debug("pdf.trailer=%s", pdf.trailer) - pdf_root = pdf.trailer["/Root"] - _logger.debug("pdf_root=%s", pdf_root) - # TODO add support for /Kids - embeddedfiles = pdf_root["/Names"]["/EmbeddedFiles"]["/Names"] - i = 0 - xmlfiles = {} # key = filename, value = PDF obj - for embeddedfile in embeddedfiles[:-1]: - mime_res = mimetypes.guess_type(embeddedfile) - if mime_res and mime_res[0] in ["application/xml", "text/xml"]: - xmlfiles[embeddedfile] = embeddedfiles[i + 1] - i += 1 - _logger.debug("xmlfiles=%s", xmlfiles) - return xmlfiles - - def _extract_xml_root(self, xml_obj): - xml_root = None - try: - xml_file_dict = xml_obj.getObject() - _logger.debug("xml_file_dict=%s", xml_file_dict) - xml_string = xml_file_dict["/EF"]["/F"].getData() - xml_root = etree.fromstring(xml_string) - except Exception as err: - # TODO: can't we catch specific exceptions? - _logger.debug("_pdf_extract_xml_root failed: %s", str(err)) - return xml_root + def get_xml_files_swallable_exceptions(self): + return (NotImplementedError, StructError, PdfReadError)