generated from amosproj/amos202Xss0Y-projname
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #47 from amosproj/feat/text_to_.json_chunks
Feat/text to .json chunks
- Loading branch information
Showing
7 changed files
with
68 additions
and
2 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
from langchain_text_splitters import RecursiveCharacterTextSplitter | ||
from langchain_community.document_loaders import PyPDFLoader | ||
|
||
|
||
def process_pdf_into_chunks(filename, chunk_size=1000, chunk_overlap=200):
    """
    Loads a pdf file and splits its text into overlapping chunks

    Parameters
    ----------
    filename : str or os.PathLike
        The path of the pdf file to be processed
    chunk_size : int, optional
        Maximum size of a chunk, passed to RecursiveCharacterTextSplitter
    chunk_overlap : int, optional
        Overlap between consecutive chunks, passed to
        RecursiveCharacterTextSplitter

    Returns
    -------
    list
        the chunks produced by RecursiveCharacterTextSplitter.split_documents;
        each chunk keeps the metadata of the page it came from
        (e.g. splits[0].metadata['page'])

    Raises
    ------
    ValueError
        If the path is not an existing file, does not have a .pdf extension,
        or the loader returns no documents
    """

    # validate the path before handing it to the loader
    if not os.path.isfile(filename):
        raise ValueError("Invalid PDF file path.")
    # os.fspath lets pathlib.Path objects through; the lower() makes the
    # extension check accept ".PDF" as well as ".pdf"
    path = os.fspath(filename)
    if not path.lower().endswith(".pdf"):
        raise ValueError("File is not a PDF.")

    # load pdf
    loader = PyPDFLoader(path)
    docs = loader.load()

    if not docs:
        raise ValueError("Failed to load PDF documents.")

    # splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents(docs)

    return splits
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from graph_creator import pdf_handler | ||
|
||
|
||
def test_chunking():
    """
    Checks that extracting text chunks from the sample pdf yields a result
    """
    # Arrange: sample document shipped with the test data
    sample_pdf = "tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf"
    # Act: run the pdf through the chunking pipeline
    result = pdf_handler.process_pdf_into_chunks(sample_pdf)
    # Assert: the handler handed back something
    assert result is not None