Skip to content

Commit

Permalink
Merge pull request #47 from amosproj/feat/text_to_.json_chunks
Browse files Browse the repository at this point in the history
Feat/text to .json chunks
  • Loading branch information
kristikotini authored May 19, 2024
2 parents 9669638 + f0cf07a commit 8a66fc8
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 2 deletions.
Empty file.
36 changes: 36 additions & 0 deletions Project/backend/codebase/graph_creator/pdf_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


def process_pdf_into_chunks(filename, chunk_size=1000, chunk_overlap=200):
    """
    Load a PDF file and split its text into overlapping chunks.

    Parameters
    ----------
    filename : str or os.PathLike
        Path of the PDF file to be processed. The ".pdf" extension is
        matched case-insensitively.
    chunk_size : int, optional
        ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``
        (default 1000).
    chunk_overlap : int, optional
        ``chunk_overlap`` handed to ``RecursiveCharacterTextSplitter``
        (default 200).

    Returns
    -------
    list
        The chunks produced by ``split_documents``. Each chunk keeps the
        metadata of the document it was cut from, so a chunk can be mapped
        back to its PDF page via ``chunk.metadata['page']``.

    Raises
    ------
    ValueError
        If the path is not an existing file, if it does not end in ".pdf",
        or if the loader returns no documents.
    """
    # Normalise str / bytes / os.PathLike to a plain str so that the
    # extension check below works for pathlib.Path arguments as well.
    path = os.fsdecode(filename)

    # Validate before loading so callers get a clear ValueError.
    if not os.path.isfile(path):
        raise ValueError("Invalid PDF file path.")
    # Lower-case first: "REPORT.PDF" is just as valid as "report.pdf".
    if not path.lower().endswith(".pdf"):
        raise ValueError("File is not a PDF.")

    docs = PyPDFLoader(path).load()
    if not docs:
        raise ValueError("Failed to load PDF documents.")

    # split_documents (rather than splitting raw text) keeps the per-page
    # metadata attached to every chunk.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)
1 change: 0 additions & 1 deletion Project/backend/codebase/migrations/env.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
from logging.config import fileConfig
from sys import modules

from sqlalchemy import engine_from_config
from sqlalchemy import pool
Expand Down
20 changes: 19 additions & 1 deletion Project/backend/codebase/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
orjson==3.10.3
packaging==24.0
packaging==23.2
pathspec==0.12.1
platformdirs==4.2.1
pluggy==1.5.0
Expand All @@ -61,3 +61,21 @@ uvloop==0.19.0
watchfiles==0.21.0
websockets==12.0
yarl==1.9.4
charset-normalizer==3.3.2
dataclasses-json==0.6.6
exceptiongroup==1.2.1
jsonpatch==1.33
jsonpointer==2.4
langchain==0.1.20
langchain-community==0.0.38
langchain-core==0.1.52
langchain-text-splitters==0.0.1
langsmith==0.1.56
marshmallow==3.21.2
numpy==1.26.4
pypdf==4.2.0
requests==2.31.0
tenacity==8.3.0
tomli==2.0.1
typing-inspect==0.9.0
urllib3==2.2.1
Empty file.
Binary file not shown.
13 changes: 13 additions & 0 deletions Project/backend/codebase/tests/test_pdfHandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from graph_creator import pdf_handler


def test_chunking():
    """
    Tests if the text chunk extraction from a test pdf is successful
    """
    # Local import keeps this test self-contained; the module itself only
    # imports pdf_handler.
    import os

    # Arrange
    # Resolve the fixture relative to this test file so the test does not
    # depend on the directory pytest is started from. Kept as a plain str.
    testfile = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "data",
        "Automotive-SPICE-PAM-v40_p1-3.pdf",
    )
    # Act
    chunks = pdf_handler.process_pdf_into_chunks(testfile)
    # Assert
    # The handler returns a list or raises, so "is not None" could never
    # fail; require actual content instead.
    assert chunks, "expected at least one chunk from the test pdf"
    # Every chunk must be traceable back to its source page.
    assert all("page" in chunk.metadata for chunk in chunks)

0 comments on commit 8a66fc8

Please sign in to comment.