Feat/text to .json chunks #47

Merged: 6 commits, May 19, 2024
Changes from 5 commits

2 changes: 1 addition & 1 deletion Project/backend/codebase/common/dependencies.py
@@ -16,4 +16,4 @@ async def get_db_session(request: Request) -> AsyncGenerator[AsyncSession, None]:
        yield session
    finally:
        await session.commit()
        await session.close()
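
For context, a minimal sketch of how this dependency is typically consumed in a route (the route path and query are illustrative, not from this PR):

from fastapi import APIRouter, Depends
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from common.dependencies import get_db_session  # import path assumed from this diff

router = APIRouter()


@router.get("/db-ping")
async def db_ping(session: AsyncSession = Depends(get_db_session)) -> dict:
    # The dependency yields the session here; commit() and close() run
    # in its finally block once the response has been produced.
    result = await session.execute(text("SELECT 1"))
    return {"db": result.scalar_one()}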
Empty file.
36 changes: 36 additions & 0 deletions Project/backend/codebase/graphCreator/pdfHandler.py
@@ -0,0 +1,36 @@
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


def proccess(filename):
    """
    Takes a PDF file and converts it into text chunks of roughly equal length.

    Parameters
    ----------
    filename : str
        The name of the PDF file to be processed

    Returns
    -------
    list
        a list of Document chunks (page text plus metadata) extracted from the PDF
    """

    # load pdf
    if not os.path.isfile(filename):
        raise ValueError("Invalid PDF file path.")
    if not filename.endswith(".pdf"):
        raise ValueError("File is not a PDF.")
    loader = PyPDFLoader(filename)
    docs = loader.load()

    if not docs:
        raise ValueError("Failed to load PDF documents.")

    # split the text into chunks, keeping metadata that maps each chunk back
    # to its PDF page (splits[0].metadata["page"])
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    return splits
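
A short usage sketch for proccess (the PDF path is a placeholder), showing how the page metadata mentioned in the comment can be read back from each chunk:

from graphCreator import pdfHandler

# "example.pdf" is a placeholder; any local PDF path works.
chunks = pdfHandler.proccess("example.pdf")

for chunk in chunks[:3]:
    # Each chunk is a langchain Document: page text plus metadata,
    # including the zero-based PDF page it was extracted from.
    print(chunk.metadata["page"], chunk.page_content[:80])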
4 changes: 2 additions & 2 deletions Project/backend/codebase/lifetime.py
@@ -26,7 +26,7 @@ def _setup_db(app: FastAPI) -> None:  # pragma: no cover


def register_startup_event(
    app: FastAPI,
) -> Callable[[], Awaitable[None]]:  # pragma: no cover
    """
    Actions to run on application startup.
@@ -49,7 +49,7 @@ async def _startup() -> None:  # noqa: WPS430


def register_shutdown_event(
    app: FastAPI,
) -> Callable[[], Awaitable[None]]:  # pragma: no cover
    """
    Actions to run on application's shutdown.
6 changes: 2 additions & 4 deletions Project/backend/codebase/migrations/env.py
@@ -1,6 +1,5 @@
import os
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool
@@ -31,6 +30,7 @@
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def get_url():
    user = os.getenv("POSTGRES_USER", "amos")
    password = os.getenv("POSTGRES_PASSWORD", "password")
@@ -82,9 +82,7 @@ def run_migrations_online() -> None:
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()
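
For reference, a sketch of the connection URL that get_url() presumably assembles. The host, port, and database fallbacks here are assumptions, since those lines fall outside this hunk:

import os

# Mirrors the get_url() pattern above; every default beyond user/password
# is an assumption, as the rest of the function is not shown in the hunk.
user = os.getenv("POSTGRES_USER", "amos")
password = os.getenv("POSTGRES_PASSWORD", "password")
host = os.getenv("POSTGRES_HOST", "localhost")
port = os.getenv("POSTGRES_PORT", "5432")
db = os.getenv("POSTGRES_DB", "amos")
print(f"postgresql://{user}:{password}@{host}:{port}/{db}")  # driver prefix may differ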
@@ -5,31 +5,38 @@
Create Date: 2024-05-12 23:49:26.779256

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = "ce5e8cc6632d"
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "healthcheck",
        sa.Column("id", sa.Uuid(), nullable=False),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=True,
        ),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True),
        sa.PrimaryKeyConstraint("id"),
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("healthcheck")
    # ### end Alembic commands ###
4 changes: 2 additions & 2 deletions Project/backend/codebase/monitoring/dao/healthcheck_dao.py
@@ -32,8 +32,8 @@ async def get_all_healthchecks(self, limit: int, offset: int) -> List[HealthCheck]:
        return list(raw_checks.scalars().fetchall())

    async def get(
        self,
        obj_id: uuid.UUID,
    ) -> HealthCheck:
        """
        Get specific healthcheck model.
2 changes: 1 addition & 1 deletion Project/backend/codebase/monitoring/models/monitoring.py
@@ -3,4 +3,4 @@


class HealthCheck(Base, TrackedModel):
    __tablename__ = "healthcheck"
    __table_args__ = {"extend_existing": True}
6 changes: 3 additions & 3 deletions Project/backend/codebase/monitoring/router.py
@@ -31,9 +31,9 @@ async def create_check(check_dao: HealthCheckDAO = Depends()) -> {}:


@router.get("/list-checks", response_model=List[HealthCheckResponse])
async def get_dummy_models(
    limit: int = 10,
    offset: int = 0,
    check_dao: HealthCheckDAO = Depends(),
) -> List[HealthCheckResponse]:
    """
    Retrieve all health-check objects from the database.
20 changes: 19 additions & 1 deletion Project/backend/codebase/requirements.txt
@@ -34,7 +34,7 @@ multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
orjson==3.10.3
packaging==23.2
pathspec==0.12.1
platformdirs==4.2.1
pluggy==1.5.0
@@ -61,3 +61,21 @@ uvloop==0.19.0
watchfiles==0.21.0
websockets==12.0
yarl==1.9.4
charset-normalizer==3.3.2
dataclasses-json==0.6.6
exceptiongroup==1.2.1
jsonpatch==1.33
jsonpointer==2.4
langchain==0.1.20
langchain-community==0.0.38
langchain-core==0.1.52
langchain-text-splitters==0.0.1
langsmith==0.1.56
marshmallow==3.21.2
numpy==1.26.4
pypdf==4.2.0
requests==2.31.0
tenacity==8.3.0
tomli==2.0.1
typing-inspect==0.9.0
urllib3==2.2.1
Empty file.
Binary file not shown.
13 changes: 13 additions & 0 deletions Project/backend/codebase/tests/test_pdfHandler.py
@@ -0,0 +1,13 @@
from graphCreator import pdfHandler


def test_chunking():
    """
    Tests that text chunk extraction from a test PDF succeeds
    """
    # Arrange
    testfile = "tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf"
    # Act
    chunks = pdfHandler.proccess(testfile)
    # Assert
    assert chunks is not None
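
The test above only asserts a non-None result. A stricter variant (a sketch, assuming the langchain Document interface that pdfHandler returns) could also check the chunk contract:

def test_chunking_metadata():
    # Sketch of a stronger check: every chunk should carry text and a
    # page mapping, per the splitter configuration in pdfHandler.
    chunks = pdfHandler.proccess("tests/data/Automotive-SPICE-PAM-v40_p1-3.pdf")
    assert len(chunks) > 0
    for chunk in chunks:
        assert chunk.page_content.strip()  # non-empty text
        assert "page" in chunk.metadata  # maps the chunk back to its PDF page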