From 89423e048d8de43727d74a585c7fa6eb9f6b4c8f Mon Sep 17 00:00:00 2001 From: Alexis de Lattre Date: Mon, 13 May 2024 15:59:44 +0200 Subject: [PATCH] [FIX] pdf_helper: crash in PDF parsing Switch from PyPDF2 (deprecated) to pypdf (actively maintained) --- pdf_helper/__manifest__.py | 1 + pdf_helper/models/helper.py | 2 +- pdf_helper/static/description/index.html | 1 - pdf_helper/utils.py | 55 ++++++++++-------------- requirements.txt | 1 + 5 files changed, 25 insertions(+), 35 deletions(-) diff --git a/pdf_helper/__manifest__.py b/pdf_helper/__manifest__.py index 5259262e7b..53fbe8c800 100644 --- a/pdf_helper/__manifest__.py +++ b/pdf_helper/__manifest__.py @@ -15,4 +15,5 @@ "depends": [ "base", ], + "external_dependencies": {"python": ["pypdf"]}, } diff --git a/pdf_helper/models/helper.py b/pdf_helper/models/helper.py index 52b30355b4..dc6ffb9e9f 100644 --- a/pdf_helper/models/helper.py +++ b/pdf_helper/models/helper.py @@ -3,7 +3,7 @@ # License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl). import logging -from PyPDF2.utils import PdfReadError +from pypdf.errors import PdfReadError from odoo import models diff --git a/pdf_helper/static/description/index.html b/pdf_helper/static/description/index.html index fa2438ebcf..21a07f3c14 100644 --- a/pdf_helper/static/description/index.html +++ b/pdf_helper/static/description/index.html @@ -1,4 +1,3 @@ - diff --git a/pdf_helper/utils.py b/pdf_helper/utils.py index eceb939b5d..2f764b1155 100644 --- a/pdf_helper/utils.py +++ b/pdf_helper/utils.py @@ -13,9 +13,9 @@ _logger = logging.getLogger(__name__) try: - import PyPDF2 + import pypdf except ImportError: - _logger.debug("Cannot import PyPDF2") + _logger.debug("Cannot import pypdf") class PDFParser: @@ -30,41 +30,30 @@ def get_xml_files(self): """ res = {} with BytesIO(self.pdf_file) as fd: - xmlfiles = self._extract_xml_files(fd) - for filename, xml_obj in xmlfiles.items(): - root = self._extract_xml_root(xml_obj) - if root is None or not len(root): - continue - res[filename] = root + res = self._extract_xml_files(fd) if res: _logger.debug("Valid XML files found in PDF: %s", list(res.keys())) return res def _extract_xml_files(self, fd): - pdf = PyPDF2.PdfFileReader(fd) - _logger.debug("pdf.trailer=%s", pdf.trailer) - pdf_root = pdf.trailer["/Root"] - _logger.debug("pdf_root=%s", pdf_root) - # TODO add support for /Kids - embeddedfiles = pdf_root["/Names"]["/EmbeddedFiles"]["/Names"] - i = 0 - xmlfiles = {} # key = filename, value = PDF obj - for embeddedfile in embeddedfiles[:-1]: - mime_res = mimetypes.guess_type(embeddedfile) + reader = pypdf.PdfReader(fd) + # attachment parsing via pypdf doesn't support /Kids + # cf my bug report https://github.com/py-pdf/pypdf/issues/2087 + xmlfiles = {} + for filename, content_list in reader.attachments.items(): + _logger.debug("Attachment %s found in PDF", filename) + mime_res = mimetypes.guess_type(filename) if mime_res and mime_res[0] in ["application/xml", "text/xml"]: - xmlfiles[embeddedfile] = embeddedfiles[i + 1] - i += 1 - _logger.debug("xmlfiles=%s", xmlfiles) + try: + _logger.debug("Trying to parse XML attachment %s", filename) + xml_root = etree.fromstring(content_list[0]) + if len(xml_root) > 0: + _logger.info("Valid XML file %s found in attachments", filename) + xmlfiles[filename] = xml_root + else: + _logger.warning("XML file %s is empty", filename) + except Exception as err: + _logger.warning( + "Failed to parse XML file %s. Error: %s", filename, str(err) + ) return xmlfiles - - def _extract_xml_root(self, xml_obj): - xml_root = None - try: - xml_file_dict = xml_obj.getObject() - _logger.debug("xml_file_dict=%s", xml_file_dict) - xml_string = xml_file_dict["/EF"]["/F"].getData() - xml_root = etree.fromstring(xml_string) - except Exception as err: - # TODO: can't we catch specific exceptions? - _logger.debug("_pdf_extract_xml_root failed: %s", str(err)) - return xml_root diff --git a/requirements.txt b/requirements.txt index 90190d3c7a..d72b73d55c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ factur-x invoice2data ovh phonenumbers +pypdf pypdf>=3.1.0 pyyaml regex