def process_pdf_into_chunks(filename, *, chunk_size=1000, chunk_overlap=200):
    """
    Takes a pdf file and splits its text into overlapping chunks

    Parameters
    ----------
    filename : str or os.PathLike
        The path of the pdf file to be processed
    chunk_size : int, optional
        Upper bound on the size of a single chunk, forwarded to the
        text splitter (default 1000)
    chunk_overlap : int, optional
        Overlap between consecutive chunks, forwarded to the text
        splitter (default 200)

    Returns
    -------
    list
        the chunks produced by the text splitter; every chunk carries the
        metadata of the page it was cut from (e.g. chunks[0].metadata['page'])

    Raises
    ------
    ValueError
        If the path does not point to an existing file, if the file name
        does not end in ".pdf" (any letter case), or if the loader
        returns no documents
    """
    # normalise str / pathlib.Path input to one plain path value so the
    # extension check and the loader both work on the same object
    path = os.fspath(filename)

    if not os.path.isfile(path):
        raise ValueError("Invalid PDF file path.")
    # compare case-insensitively so that e.g. "REPORT.PDF" is accepted
    if not path.lower().endswith(".pdf"):
        raise ValueError("File is not a PDF.")

    # load pdf
    docs = PyPDFLoader(path).load()
    if not docs:
        raise ValueError("Failed to load PDF documents.")

    # split the text into chunks; the splitter keeps the metadata of each
    # source document, which allows mapping a chunk back to its pdf page
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)
def test_chunking():
    """
    Tests if the text chunk extraction from a test pdf is successful

    The fixture is located relative to this test module, so the test does
    not depend on the directory pytest is started from.
    """
    from pathlib import Path

    # Arrange
    testfile = (
        Path(__file__).resolve().parent
        / "data"
        / "Automotive-SPICE-PAM-v40_p1-3.pdf"
    )
    # Act
    # pass a plain string: the handler checks the extension with str methods
    chunks = pdf_handler.process_pdf_into_chunks(str(testfile))
    # Assert
    # "is not None" could never fail; require at least one extracted chunk
    assert isinstance(chunks, list)
    assert chunks, "expected at least one text chunk from the test pdf"