Merge pull request #13 from gowthamshankar99/doc-support

[Enhance] Support for doc and docx files
awslabs · Sep 30, 2024 · 26d96d5
2 parents cfb4bc1 + ee673b4
commit 26d96d5
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 2 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -23,4 +23,4 @@ rpds-py==0.18.0 ; python_version >= "3.10"
 s3transfer==0.10.1 ; python_version >= "3.10"
 six==1.16.0 ; python_version >= "3.10"
 typing-extensions==4.10.0 ; python_version >= "3.10"
-urllib3==2.2.1 ; python_version >= "3.10"
+urllib3==2.2.1 ; python_version >= "3.10"
diff --git a/src/rhubarb/file_converter/file_converter.py b/src/rhubarb/file_converter/file_converter.py
@@ -9,8 +9,14 @@
 
 import boto3
 import pdfplumber
-from PIL import Image
+from PIL import Image, ImageDraw
 
+try:
+    from docx import Document
+
+    DOCX_AVAILABLE = True
+except ImportError:
+    DOCX_AVAILABLE = False
 from .image_validator import ImageValidator
 
 logger = logging.getLogger(__name__)
@@ -144,6 +150,38 @@ def convert_to_base64(self) -> List[Dict[str, Union[int, str]]]:
                         base64_string = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
                         base64_strings.append({"page": i + 1, "base64string": base64_string})
                 return base64_strings
+            elif self.mime_type in [
+                "application/msword",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            ]:
+                if not DOCX_AVAILABLE:
+                    raise ImportError(
+                        "The 'python-docx' library is not installed. Please install it to process .docx files."
+                    )
+                document = Document(
+                    BytesIO(self.file_bytes)
+                    if self.file_path.startswith("s3://")
+                    else self.file_path
+                )
+                base64_strings = []
+                page_count = len(document.paragraphs)  # Assuming paragraphs as a proxy for pages
+                if self.pages == [0]:
+                    page_nums = range(min(20, page_count))
+                else:
+                    page_nums = [p - 1 for p in self.pages if p <= page_count and p > 0]
+
+                for page_num in page_nums:
+                    paragraph = document.paragraphs[page_num].text
+                    img = Image.new(
+                        "RGB", (800, 600), color=(255, 255, 255)
+                    )  # Placeholder image for paragraph
+                    d = ImageDraw.Draw(img)
+                    d.text((10, 10), paragraph, fill=(0, 0, 0))
+                    img_bytes = BytesIO()
+                    img.save(img_bytes, format="PNG")
+                    base64_string = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+                    base64_strings.append({"page": page_num + 1, "base64string": base64_string})
+                return base64_strings
             else:
                 logger.error("Unsupported file type")
                 raise ValueError("Unsupported file type")