Skip to content

Commit

Permalink
[FIX] pdf_helper: crash in PDF parsing
Browse files: browse the repository at this point in the history
Switch from PyPDF2 (deprecated) to pypdf (actively maintained)
  • Loading branch information
alexis-via committed May 23, 2024
1 parent fac3327 commit 89423e0
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 35 deletions.
1 change: 1 addition & 0 deletions pdf_helper/__manifest__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
"depends": [
"base",
],
"external_dependencies": {"python": ["pypdf"]},
}
2 changes: 1 addition & 1 deletion pdf_helper/models/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
import logging

from PyPDF2.utils import PdfReadError
from pypdf.errors import PdfReadError

from odoo import models

Expand Down
1 change: 0 additions & 1 deletion pdf_helper/static/description/index.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
Expand Down
55 changes: 22 additions & 33 deletions pdf_helper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
_logger = logging.getLogger(__name__)

try:
import PyPDF2
import pypdf
except ImportError:
_logger.debug("Cannot import PyPDF2")
_logger.debug("Cannot import pypdf")

Check warning on line 18 in pdf_helper/utils.py

View check run for this annotation

Codecov / codecov/patch

pdf_helper/utils.py#L18

Added line #L18 was not covered by tests


class PDFParser:
Expand All @@ -30,41 +30,30 @@ def get_xml_files(self):
"""
res = {}
with BytesIO(self.pdf_file) as fd:
xmlfiles = self._extract_xml_files(fd)
for filename, xml_obj in xmlfiles.items():
root = self._extract_xml_root(xml_obj)
if root is None or not len(root):
continue
res[filename] = root
res = self._extract_xml_files(fd)
if res:
_logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
return res

def _extract_xml_files(self, fd):
pdf = PyPDF2.PdfFileReader(fd)
_logger.debug("pdf.trailer=%s", pdf.trailer)
pdf_root = pdf.trailer["/Root"]
_logger.debug("pdf_root=%s", pdf_root)
# TODO add support for /Kids
embeddedfiles = pdf_root["/Names"]["/EmbeddedFiles"]["/Names"]
i = 0
xmlfiles = {} # key = filename, value = PDF obj
for embeddedfile in embeddedfiles[:-1]:
mime_res = mimetypes.guess_type(embeddedfile)
reader = pypdf.PdfReader(fd)
# attachment parsing via pypdf doesn't support /Kids
# cf my bug report https://github.com/py-pdf/pypdf/issues/2087
xmlfiles = {}
for filename, content_list in reader.attachments.items():
_logger.debug("Attachment %s found in PDF", filename)
mime_res = mimetypes.guess_type(filename)
if mime_res and mime_res[0] in ["application/xml", "text/xml"]:
xmlfiles[embeddedfile] = embeddedfiles[i + 1]
i += 1
_logger.debug("xmlfiles=%s", xmlfiles)
try:
_logger.debug("Trying to parse XML attachment %s", filename)
xml_root = etree.fromstring(content_list[0])
if len(xml_root) > 0:
_logger.info("Valid XML file %s found in attachments", filename)
xmlfiles[filename] = xml_root
else:
_logger.warning("XML file %s is empty", filename)
except Exception as err:
_logger.warning(

Check warning on line 56 in pdf_helper/utils.py

View check run for this annotation

Codecov / codecov/patch

pdf_helper/utils.py#L54-L56

Added lines #L54 - L56 were not covered by tests
"Failed to parse XML file %s. Error: %s", filename, str(err)
)
return xmlfiles

def _extract_xml_root(self, xml_obj):
xml_root = None
try:
xml_file_dict = xml_obj.getObject()
_logger.debug("xml_file_dict=%s", xml_file_dict)
xml_string = xml_file_dict["/EF"]["/F"].getData()
xml_root = etree.fromstring(xml_string)
except Exception as err:
# TODO: can't we catch specific exceptions?
_logger.debug("_pdf_extract_xml_root failed: %s", str(err))
return xml_root
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ factur-x
invoice2data
ovh
phonenumbers
pypdf
pypdf>=3.1.0
pyyaml
regex
Expand Down

0 comments on commit 89423e0

Please sign in to comment.