From 1bddcdf774184afa16fb15e247b1dfe3ec238989 Mon Sep 17 00:00:00 2001
From: Alexis de Lattre <alexis.delattre@akretion.com>
Date: Thu, 1 Aug 2024 22:15:29 +0200
Subject: [PATCH 1/2] [FIX] account_invoice_import_simple_pdf: version
 extraction for fitz/PyMuPDF

Use with statement for NamedTemporaryFile()
---
 .../wizard/account_invoice_import.py          | 71 +++++++++++--------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
index 62ccdb852a..fd0f904e06 100644
--- a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
+++ b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
@@ -46,6 +46,7 @@ def fallback_parse_pdf_invoice(self, file_data):
     @api.model
     def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
         res = False
+        version = None
         try:
             pages = []
             doc = fitz.open(fileobj.name)
@@ -55,8 +56,14 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
                 "all": "\n\n".join(pages),
                 "first": pages and pages[0] or "",
             }
-            logger.info("Text extraction made with PyMuPDF %s", fitz.__version__)
-            test_info["text_extraction"] = "pymupdf %s" % fitz.__version__
+            # For PyMuPDF, we used to get the version via __version__,
+            # but that is no longer possible with newer versions of the lib
+            if hasattr(fitz, "__version__"):
+                version = fitz.__version__
+            elif hasattr(fitz, "version") and isinstance(fitz.version, tuple):
+                version = fitz.version[0]
+            logger.info("Text extraction made with PyMuPDF %s", version)
+            test_info["text_extraction"] = "pymupdf %s" % version
         except Exception as e:
             logger.warning("Text extraction with PyMuPDF failed. Error: %s", e)
         return res
@@ -164,12 +171,6 @@ def _simple_pdf_text_extraction_specific_tool(
 
     @api.model
     def simple_pdf_text_extraction(self, file_data, test_info):
-        fileobj = NamedTemporaryFile("wb", prefix="odoo-simple-pdf-", suffix=".pdf")
-        fileobj.write(file_data)
-        # Extract text from PDF
-        # Very interesting reading:
-        # https://dida.do/blog/how-to-extract-text-from-pdf
-        # https://github.com/erfelipe/PDFtextExtraction
         specific_tool = (
             self.env["ir.config_parameter"]
             .sudo()
@@ -178,27 +179,40 @@ def simple_pdf_text_extraction(self, file_data, test_info):
         if specific_tool:
             specific_tool = specific_tool.strip().lower()
         test_info["text_extraction_config"] = specific_tool
-        if specific_tool:
-            res = self._simple_pdf_text_extraction_specific_tool(
-                specific_tool, fileobj, test_info
-            )
-        else:
-            # From best tool to worst
-            res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info)
-            if not res:
-                res = self._simple_pdf_text_extraction_pdftotext_lib(fileobj, test_info)
-            if not res:
-                res = self._simple_pdf_text_extraction_pdftotext_cmd(fileobj, test_info)
-            if not res:
-                res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info)
-            if not res:
-                raise UserError(
-                    _(
-                        "Odoo could not extract the text from the PDF invoice. "
-                        "Refer to the Odoo server logs for more technical information "
-                        "about the cause of the failure."
-                    )
+
+        with NamedTemporaryFile(
+            "wb", prefix="odoo-simple-pdf-", suffix=".pdf"
+        ) as fileobj:
+            fileobj.write(file_data)
+            # Extract text from PDF
+            # Very interesting reading:
+            # https://dida.do/blog/how-to-extract-text-from-pdf
+            # https://github.com/erfelipe/PDFtextExtraction
+            if specific_tool:
+                res = self._simple_pdf_text_extraction_specific_tool(
+                    specific_tool, fileobj, test_info
                 )
+            else:
+                # From best tool to worst
+                res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info)
+                if not res:
+                    res = self._simple_pdf_text_extraction_pdftotext_lib(
+                        fileobj, test_info
+                    )
+                if not res:
+                    res = self._simple_pdf_text_extraction_pdftotext_cmd(
+                        fileobj, test_info
+                    )
+                if not res:
+                    res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info)
+                if not res:
+                    raise UserError(
+                        _(
+                            "Odoo could not extract the text from the PDF invoice. "
+                            "Refer to the Odoo server logs for more technical information "
+                            "about the cause of the failure."
+                        )
+                    )
         for key, text in res.items():
             if text:
                 # Remove lonely accents
@@ -213,7 +227,6 @@ def simple_pdf_text_extraction(self, file_data, test_info):
         res["first_no_space"] = regex.sub(
             "%s+" % test_info["space_pattern"], "", res["first"]
         )
-        fileobj.close()
         return res
 
     @api.model

From a5c733c09200b69cd2952fe0dd57ccb3566acaf0 Mon Sep 17 00:00:00 2001
From: Alexis de Lattre <alexis.delattre@akretion.com>
Date: Thu, 1 Aug 2024 21:43:09 +0000
Subject: [PATCH 2/2] [FIX] account_invoice_import_simple_pdf: add seek(0) to
 avoid error "empty file"

Add try/except on pypdf text extraction
---
 .../wizard/account_invoice_import.py          | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
index fd0f904e06..8d103e2cfa 100644
--- a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
+++ b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
@@ -71,16 +71,19 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
     @api.model
     def _simple_pdf_text_extraction_pypdf(self, fileobj, test_info):
         res = False
-        reader = pypdf.PdfReader(fileobj.name)
-        pages = []
-        for pdf_page in reader.pages:
-            pages.append(pdf_page.extract_text())
-            res = {
-                "all": "\n\n".join(pages),
-                "first": pages and pages[0] or "",
-            }
-        test_info["text_extraction"] = "pypdf %s" % pypdf.__version__
-        logger.info("Text extraction made with pypdf %s", pypdf.__version__)
+        try:
+            reader = pypdf.PdfReader(fileobj.name)
+            pages = []
+            for pdf_page in reader.pages:
+                pages.append(pdf_page.extract_text())
+                res = {
+                    "all": "\n\n".join(pages),
+                    "first": pages and pages[0] or "",
+                }
+            test_info["text_extraction"] = "pypdf %s" % pypdf.__version__
+            logger.info("Text extraction made with pypdf %s", pypdf.__version__)
+        except Exception as e:
+            logger.warning("Text extraction with pypdf failed. Error: %s", e)
         return res
 
     @api.model
@@ -184,6 +187,7 @@ def simple_pdf_text_extraction(self, file_data, test_info):
             "wb", prefix="odoo-simple-pdf-", suffix=".pdf"
         ) as fileobj:
             fileobj.write(file_data)
+            fileobj.seek(0)
             # Extract text from PDF
             # Very interesting reading:
             # https://dida.do/blog/how-to-extract-text-from-pdf