Feat/text to .json chunks #47

Merged: 6 commits, May 19, 2024
Changes from 5 commits

2 changes: 1 addition & 1 deletion Project/backend/codebase/common/dependencies.py
@@ -16,4 +16,4 @@ async def get_db_session(request: Request) -> AsyncGenerator[AsyncSession, None]:
        yield session
    finally:
        await session.commit()
        await session.close()
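
For context, a minimal sketch of how this dependency is typically consumed in a route (the route path and query are illustrative, not from this PR):

from fastapi import APIRouter, Depends
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from common.dependencies import get_db_session  # import path assumed from this diff

router = APIRouter()


@router.get("/db-ping")
async def db_ping(session: AsyncSession = Depends(get_db_session)) -> dict:
    # The dependency yields the session here; commit() and close() run
    # in its finally block once the response has been produced.
    result = await session.execute(text("SELECT 1"))
    return {"db": result.scalar_one()}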
Empty file.
36 changes: 36 additions & 0 deletions Project/backend/codebase/graphCreator/pdfHandler.py
@@ -0,0 +1,36 @@
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


def proccess(filename):
    """
    Takes a PDF file and converts it into text chunks of roughly equal length.

    Parameters
    ----------
    filename : str
        The name of the PDF file to be processed

    Returns
    -------
    list
        a list of Document chunks (page text plus metadata) extracted from the PDF
    """

    # load pdf
    if not os.path.isfile(filename):
        raise ValueError("Invalid PDF file path.")
    if not filename.endswith(".pdf"):
        raise ValueError("File is not a PDF.")
    loader = PyPDFLoader(filename)
    docs = loader.load()

    if not docs:
        raise ValueError("Failed to load PDF documents.")

    # split the text into chunks, keeping metadata that maps each chunk back
    # to its PDF page (splits[0].metadata["page"])
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    return splits
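
A short usage sketch for proccess (the PDF path is a placeholder), showing how the page metadata mentioned in the comment can be read back from each chunk:

from graphCreator import pdfHandler

# "example.pdf" is a placeholder; any local PDF path works.
chunks = pdfHandler.proccess("example.pdf")

for chunk in chunks[:3]:
    # Each chunk is a langchain Document: page text plus metadata,
    # including the zero-based PDF page it was extracted from.
    print(chunk.metadata["page"], chunk.page_content[:80])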
4 changes: 2 additions & 2 deletions Project/backend/codebase/lifetime.py
@@ -26,7 +26,7 @@ def _setup_db(app: FastAPI) -> None:  # pragma: no cover


def register_startup_event(
    app: FastAPI,
) -> Callable[[], Awaitable[None]]:  # pragma: no cover
    """
    Actions to run on application startup.
@@ -49,7 +49,7 @@ async def _startup() -> None:  # noqa: WPS430


def register_shutdown_event(
    app: FastAPI,
) -> Callable[[], Awaitable[None]]:  # pragma: no cover
    """
    Actions to run on application's shutdown.
6 changes: 2 additions & 4 deletions Project/backend/codebase/migrations/env.py
@@ -1,6 +1,5 @@
import os
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool
@@ -31,6 +30,7 @@
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def get_url():
    user = os.getenv("POSTGRES_USER", "amos")
    password = os.getenv("POSTGRES_PASSWORD", "password")
@@ -82,9 +82,7 @@ def run_migrations_online() -> None:
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()
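
For reference, a sketch of the connection URL that get_url() presumably assembles. The host, port, and database fallbacks here are assumptions, since those lines fall outside this hunk:

import os

# Mirrors the get_url() pattern above; every default beyond user/password
# is an assumption, as the rest of the function is not shown in the hunk.
user = os.getenv("POSTGRES_USER", "amos")
password = os.getenv("POSTGRES_PASSWORD", "password")
host = os.getenv("POSTGRES_HOST", "localhost")
port = os.getenv("POSTGRES_PORT", "5432")
db = os.getenv("POSTGRES_DB", "amos")
print(f"postgresql://{user}:{password}@{host}:{port}/{db}")  # driver prefix may differ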
@@ -5,31 +5,38 @@
Create Date: 2024-05-12 23:49:26.779256

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = "ce5e8cc6632d"
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "healthcheck",
        sa.Column("id", sa.Uuid(), nullable=False),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=True,
        ),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True),
        sa.PrimaryKeyConstraint("id"),
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("healthcheck")
    # ### end Alembic commands ###
4 changes: 2 additions & 2 deletions Project/backend/codebase/monitoring/dao/healthcheck_dao.py
@@ -32,8 +32,8 @@ async def get_all_healthchecks(self, limit: int, offset: int) -> List[HealthCheck]:
        return list(raw_checks.scalars().fetchall())

    async def get(
        self,
        obj_id: uuid.UUID,
    ) -> HealthCheck:
        """
        Get specific healthcheck model.
2 changes: 1 addition & 1 deletion Project/backend/codebase/monitoring/models/monitoring.py
@@ -3,4 +3,4 @@


class HealthCheck(Base, TrackedModel):
    __tablename__ = "healthcheck"
    __table_args__ = {"extend_existing": True}
6 changes: 3 additions & 3 deletions Project/backend/codebase/monitoring/router.py
@@ -31,9 +31,9 @@ async def create_check(check_dao: HealthCheckDAO = Depends()) -> {}:


@router.get("/list-checks", response_model=List[HealthCheckResponse])
async def get_dummy_models(
    limit: int = 10,
    offset: int = 0,
    check_dao: HealthCheckDAO = Depends(),
) -> List[HealthCheckResponse]:
    """
    Retrieve all health-check objects from the database.
20 changes: 19 additions & 1 deletion Project/backend/codebase/requirements.txt
@@ -34,7 +34,7 @@ multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
orjson==3.10.3
packaging==23.2
pathspec==0.12.1
platformdirs==4.2.1
pluggy==1.5.0
@@ -61,3 +61,21 @@ uvloop==0.19.0
watchfiles==0.21.0
websockets==12.0
yarl==1.9.4
charset-normalizer==3.3.2
dataclasses-json==0.6.6
exceptiongroup==1.2.1
jsonpatch==1.33
jsonpointer==2.4
langchain==0.1.20
langchain-community==0.0.38
langchain-core==0.1.52
langchain-text-splitters==0.0.1
langsmith==0.1.56
marshmallow==3.21.2
numpy==1.26.4
pypdf==4.2.0
requests==2.31.0
tenacity==8.3.0
tomli==2.0.1
typing-inspect==0.9.0
urllib3==2.2.1
Empty file.
Binary file not shown.
13 changes: 13 additions & 0 deletions Project/backend/codebase/tests/test_pdfHandler.py
@@ -0,0 +1,13 @@
from graphCreator import pdfHandler


def test_chunking():
    """
    Tests that text chunk extraction from a test PDF succeeds
    """
    # Arrange
    testfile = "tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf"
    # Act
    chunks = pdfHandler.proccess(testfile)
    # Assert
    assert chunks is not None
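
The test above only asserts a non-None result. A stricter variant (a sketch, assuming the langchain Document interface that pdfHandler returns) could also check the chunk contract:

def test_chunking_metadata():
    # Sketch of a stronger check: every chunk should carry text and a
    # page mapping, per the splitter configuration in pdfHandler.
    chunks = pdfHandler.proccess("tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf")
    assert len(chunks) > 0
    for chunk in chunks:
        assert chunk.page_content.strip()  # non-empty text
        assert "page" in chunk.metadata  # maps the chunk back to its PDF page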