From 1bddcdf774184afa16fb15e247b1dfe3ec238989 Mon Sep 17 00:00:00 2001 From: Alexis de Lattre Date: Thu, 1 Aug 2024 22:15:29 +0200 Subject: [PATCH 1/2] [FIX] account_invoice_import_simple_pdf: version extraction for fitz/PyMuPDF Use with statement for NamedTemporaryFile() --- .../wizard/account_invoice_import.py | 71 +++++++++++-------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py index 62ccdb852a..fd0f904e06 100644 --- a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py +++ b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py @@ -46,6 +46,7 @@ def fallback_parse_pdf_invoice(self, file_data): @api.model def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info): res = False + version = None try: pages = [] doc = fitz.open(fileobj.name) @@ -55,8 +56,14 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info): "all": "\n\n".join(pages), "first": pages and pages[0] or "", } - logger.info("Text extraction made with PyMuPDF %s", fitz.__version__) - test_info["text_extraction"] = "pymupdf %s" % fitz.__version__ + # For PyMuPDF, we used to get the version via __version__ + # but it is not possible with newer version of the lib + if hasattr(fitz, "__version__"): + version = fitz.__version__ + elif hasattr(fitz, "version") and isinstance(fitz.version, tuple): + version = fitz.version[0] + logger.info("Text extraction made with PyMuPDF %s", version) + test_info["text_extraction"] = "pymupdf %s" % version except Exception as e: logger.warning("Text extraction with PyMuPDF failed. Error: %s", e) return res @@ -164,12 +171,6 @@ def _simple_pdf_text_extraction_specific_tool( @api.model def simple_pdf_text_extraction(self, file_data, test_info): - fileobj = NamedTemporaryFile("wb", prefix="odoo-simple-pdf-", suffix=".pdf") - fileobj.write(file_data) - # Extract text from PDF - # Very interesting reading: - # https://dida.do/blog/how-to-extract-text-from-pdf - # https://github.com/erfelipe/PDFtextExtraction specific_tool = ( self.env["ir.config_parameter"] .sudo() @@ -178,27 +179,40 @@ def simple_pdf_text_extraction(self, file_data, test_info): if specific_tool: specific_tool = specific_tool.strip().lower() test_info["text_extraction_config"] = specific_tool - if specific_tool: - res = self._simple_pdf_text_extraction_specific_tool( - specific_tool, fileobj, test_info - ) - else: - # From best tool to worst - res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info) - if not res: - res = self._simple_pdf_text_extraction_pdftotext_lib(fileobj, test_info) - if not res: - res = self._simple_pdf_text_extraction_pdftotext_cmd(fileobj, test_info) - if not res: - res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info) - if not res: - raise UserError( - _( - "Odoo could not extract the text from the PDF invoice. " - "Refer to the Odoo server logs for more technical information " - "about the cause of the failure." - ) + + with NamedTemporaryFile( + "wb", prefix="odoo-simple-pdf-", suffix=".pdf" + ) as fileobj: + fileobj.write(file_data) + # Extract text from PDF + # Very interesting reading: + # https://dida.do/blog/how-to-extract-text-from-pdf + # https://github.com/erfelipe/PDFtextExtraction + if specific_tool: + res = self._simple_pdf_text_extraction_specific_tool( + specific_tool, fileobj, test_info ) + else: + # From best tool to worst + res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info) + if not res: + res = self._simple_pdf_text_extraction_pdftotext_lib( + fileobj, test_info + ) + if not res: + res = self._simple_pdf_text_extraction_pdftotext_cmd( + fileobj, test_info + ) + if not res: + res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info) + if not res: + raise UserError( + _( + "Odoo could not extract the text from the PDF invoice. " + "Refer to the Odoo server logs for more technical information " + "about the cause of the failure." + ) + ) for key, text in res.items(): if text: # Remove lonely accents @@ -213,7 +227,6 @@ def simple_pdf_text_extraction(self, file_data, test_info): res["first_no_space"] = regex.sub( "%s+" % test_info["space_pattern"], "", res["first"] ) - fileobj.close() return res @api.model From a5c733c09200b69cd2952fe0dd57ccb3566acaf0 Mon Sep 17 00:00:00 2001 From: Alexis de Lattre Date: Thu, 1 Aug 2024 21:43:09 +0000 Subject: [PATCH 2/2] [FIX] account_invoice_import_simple_pdf: add seek(0) to avoid error "empty file" Add try/except on pypdf text extraction --- .../wizard/account_invoice_import.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py index fd0f904e06..8d103e2cfa 100644 --- a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py +++ b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py @@ -71,16 +71,19 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info): @api.model def _simple_pdf_text_extraction_pypdf(self, fileobj, test_info): res = False - reader = pypdf.PdfReader(fileobj.name) - pages = [] - for pdf_page in reader.pages: - pages.append(pdf_page.extract_text()) - res = { - "all": "\n\n".join(pages), - "first": pages and pages[0] or "", - } - test_info["text_extraction"] = "pypdf %s" % pypdf.__version__ - logger.info("Text extraction made with pypdf %s", pypdf.__version__) + try: + reader = pypdf.PdfReader(fileobj.name) + pages = [] + for pdf_page in reader.pages: + pages.append(pdf_page.extract_text()) + res = { + "all": "\n\n".join(pages), + "first": pages and pages[0] or "", + } + test_info["text_extraction"] = "pypdf %s" % pypdf.__version__ + logger.info("Text extraction made with pypdf %s", pypdf.__version__) + except Exception as e: + logger.warning("Text extraction with pypdf failed. Error: %s", e) return res @api.model @@ -184,6 +187,7 @@ def simple_pdf_text_extraction(self, file_data, test_info): "wb", prefix="odoo-simple-pdf-", suffix=".pdf" ) as fileobj: fileobj.write(file_data) + fileobj.seek(0) # Extract text from PDF # Very interesting reading: # https://dida.do/blog/how-to-extract-text-from-pdf