Merge pull request #47 from amosproj/feat/text_to_.json_chunks

Feat/text to .json chunks
amosproj · May 19, 2024 · 8a66fc8 · 8a66fc8
2 parents 9669638 + f0cf07a
commit 8a66fc8
Show file tree

Hide file tree

Showing 7 changed files with 68 additions and 2 deletions.
diff --git a/Project/backend/codebase/graph_creator/__init__.py b/Project/backend/codebase/graph_creator/__init__.py
diff --git a/Project/backend/codebase/graph_creator/pdf_handler.py b/Project/backend/codebase/graph_creator/pdf_handler.py
@@ -0,0 +1,36 @@
+import os
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+
+
+def process_pdf_into_chunks(filename, *, chunk_size=1000, chunk_overlap=200):
+    """
+    Load a pdf file and split its text into overlapping chunks.
+
+    Chunks are not of equal length: ``chunk_size`` is the upper bound passed
+    to the splitter, and consecutive chunks may share up to ``chunk_overlap``
+    characters.
+
+    Parameters
+    ----------
+    filename : str or os.PathLike
+        Path of the pdf file to be processed. The ".pdf" extension is
+        matched case-insensitively.
+    chunk_size : int, optional
+        Maximum chunk size handed to the text splitter (default 1000).
+    chunk_overlap : int, optional
+        Overlap between neighbouring chunks handed to the text splitter
+        (default 200).
+
+    Returns
+    -------
+    list
+        The chunk documents produced by the text splitter (not plain
+        strings). Each chunk keeps the metadata of the page it came from,
+        so a chunk can be mapped back to its pdf page via
+        ``chunk.metadata['page']``.
+
+    Raises
+    ------
+    ValueError
+        If the path is not an existing file, does not end in ".pdf", or the
+        loader returns no documents.
+    """
+
+    # validate the path before handing it to the loader
+    if not os.path.isfile(filename):
+        raise ValueError("Invalid PDF file path.")
+    # normalise path-like objects to a plain string for the checks below
+    pdf_path = os.fspath(filename)
+    if not pdf_path.lower().endswith(".pdf"):
+        raise ValueError("File is not a PDF.")
+
+    # load pdf
+    loader = PyPDFLoader(pdf_path)
+    docs = loader.load()
+
+    if not docs:
+        raise ValueError("Failed to load PDF documents.")
+
+    # splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    splits = text_splitter.split_documents(docs)
+
+    return splits
diff --git a/Project/backend/codebase/migrations/env.py b/Project/backend/codebase/migrations/env.py
@@ -1,6 +1,5 @@
 import os
 from logging.config import fileConfig
-from sys import modules
 
 from sqlalchemy import engine_from_config
 from sqlalchemy import pool

diff --git a/Project/backend/codebase/requirements.txt b/Project/backend/codebase/requirements.txt
@@ -34,7 +34,7 @@ multidict==6.0.5
 mypy-extensions==1.0.0
 nest-asyncio==1.6.0
 orjson==3.10.3
-packaging==24.0
+packaging==23.2
 pathspec==0.12.1
 platformdirs==4.2.1
 pluggy==1.5.0
@@ -61,3 +61,21 @@ uvloop==0.19.0
 watchfiles==0.21.0
 websockets==12.0
 yarl==1.9.4
+charset-normalizer==3.3.2
+dataclasses-json==0.6.6
+exceptiongroup==1.2.1
+jsonpatch==1.33
+jsonpointer==2.4
+langchain==0.1.20
+langchain-community==0.0.38
+langchain-core==0.1.52
+langchain-text-splitters==0.0.1
+langsmith==0.1.56
+marshmallow==3.21.2
+numpy==1.26.4
+pypdf==4.2.0
+requests==2.31.0
+tenacity==8.3.0
+tomli==2.0.1
+typing-inspect==0.9.0
+urllib3==2.2.1
diff --git a/Project/backend/codebase/tests/__init__.py b/Project/backend/codebase/tests/__init__.py
diff --git a/Project/backend/codebase/tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf b/Project/backend/codebase/tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf
diff --git a/Project/backend/codebase/tests/test_pdfHandler.py b/Project/backend/codebase/tests/test_pdfHandler.py
@@ -0,0 +1,13 @@
+from graph_creator import pdf_handler
+
+
+def test_chunking():
+    """
+    Tests if the text chunk extraction from a test pdf is successful
+
+    The fixture path is resolved relative to this test file, so the test
+    does not depend on the directory pytest is started from.
+    """
+    import os
+
+    # Arrange
+    testfile = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        "data",
+        "Automotive-SPICE-PAM-v40_p1-3.pdf",
+    )
+    # Act
+    chunks = pdf_handler.process_pdf_into_chunks(testfile)
+    # Assert
+    # the handler never returns None (it raises or returns a list), so check
+    # that chunks were actually produced and carry the page mapping
+    assert chunks
+    assert all("page" in chunk.metadata for chunk in chunks)