From 59c72e9097224bf000b07be0c87a21d48ad151a2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 17:12:32 +0100 Subject: [PATCH 001/127] added last_scheduled datetime --- .../1be37720e832_add_last_scheduled_time.py | 30 +++++++++++++++++++ .../models/comp_runs.py | 7 +++++ 2 files changed, 37 insertions(+) create mode 100644 packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py new file mode 100644 index 00000000000..eaf4d5f116c --- /dev/null +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py @@ -0,0 +1,30 @@ +"""add last scheduled time + +Revision ID: 1be37720e832 +Revises: 8e1f83486be7 +Create Date: 2024-11-15 16:12:08.825985+00:00 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "1be37720e832" +down_revision = "8e1f83486be7" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "comp_runs", + sa.Column("last_scheduled", sa.DateTime(timezone=True), nullable=True), + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("comp_runs", "last_scheduled") + # ### end Alembic commands ### diff --git a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py index 3975cb91eee..3cdf2a9b3bb 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py @@ -1,6 +1,7 @@ """ Computational Runs Table """ + import sqlalchemy as sa from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.sql import func @@ -106,6 +107,12 @@ nullable=True, doc="If filled, when cancellation was requested", ), + sa.Column( + "last_scheduled", + sa.DateTime(timezone=True), + nullable=True, + doc="last time the pipeline was scheduled", + ), sa.Column("metadata", JSONB, nullable=True, doc="the run optional metadata"), sa.Column( "use_on_demand_clusters", From 1828bf5bb1b62ec56dbae98b3efb015010cbe956 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:21:50 +0100 Subject: [PATCH 002/127] ongoing new scheduler --- .../core/settings.py | 13 ++++++- .../comp_scheduler/_distributed_scheduler.py | 31 +++++++++++++++ .../modules/db/__init__.py | 12 +++++- .../modules/db/repositories/comp_runs.py | 39 +++++++++++-------- 4 files changed, 76 insertions(+), 19 deletions(-) create mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py diff --git a/services/director-v2/src/simcore_service_director_v2/core/settings.py b/services/director-v2/src/simcore_service_director_v2/core/settings.py index 0ccdce64de1..e64074c6d99 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/core/settings.py @@ -4,10 +4,15 @@ import datetime from functools import cached_property -from typing import Annotated +from typing import Annotated, cast from common_library.pydantic_validators import validate_numeric_string_as_timedelta -from models_library.basic_types import LogLevel, PortInt, VersionTag +from fastapi import FastAPI +from models_library.basic_types import ( + LogLevel, + PortInt, + VersionTag, +) from models_library.clusters import ( DEFAULT_CLUSTER_ID, Cluster, @@ -263,3 +268,7 @@ def _validate_loglevel(cls, value: str) -> str: _validate_service_tracking_heartbeat = validate_numeric_string_as_timedelta( "SERVICE_TRACKING_HEARTBEAT" ) + + +def get_application_settings(app: FastAPI) -> AppSettings: + return cast(AppSettings, app.state.settings) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py new file mode 100644 index 00000000000..cb2dfc089e2 --- /dev/null +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -0,0 +1,31 @@ +import datetime +from typing import Final + +from fastapi import FastAPI +from servicelib.redis import RedisClientSDK +from servicelib.redis_utils import exclusive +from settings_library.redis import RedisDatabase + +from ...core.settings import get_application_settings +from ...utils.comp_scheduler import SCHEDULED_STATES +from ..db import get_db_engine +from ..db.repositories.comp_runs import CompRunsRepository +from ..redis import get_redis_client_manager + +_SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) + + +def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: + assert kwargs # nosec + app = args[0] + assert isinstance(app, FastAPI) # nosec + return get_redis_client_manager(app).client(RedisDatabase.LOCKS) + + +@exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") +async def schedule_pipelines(app: FastAPI) -> None: + app_settings = get_application_settings(app) + db_engine = get_db_engine(app) + runs_to_schedule = CompRunsRepository.instance(db_engine).list( + filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL + ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/db/__init__.py index 7a5826d1aa3..a112ae63d46 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/__init__.py @@ -1,3 +1,6 @@ +from typing import cast + +from aiopg.sa import Engine from fastapi import FastAPI from settings_library.postgres import PostgresSettings @@ -22,4 +25,11 @@ async def on_shutdown() -> None: app.add_event_handler("shutdown", on_shutdown) -__all__: tuple[str, ...] = ("get_asyncpg_engine",) +def get_db_engine(app: FastAPI) -> Engine: + return cast(Engine, app.state.engine) + + +__all__: tuple[str, ...] = ( + "get_asyncpg_engine", + "get_db_engine", +) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 9ce28bcda8d..b012f39264e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -1,6 +1,5 @@ import datetime import logging -from collections import deque from typing import Any import arrow @@ -55,24 +54,32 @@ async def get( return CompRunsAtDB.model_validate(row) async def list( - self, filter_by_state: set[RunningState] | None = None + self, + filter_by_state: set[RunningState] | None = None, + scheduled_since: datetime.timedelta | None = None, ) -> list[CompRunsAtDB]: - if not filter_by_state: - filter_by_state = set() - runs_in_db: deque[CompRunsAtDB] = deque() + + conditions = [] + if filter_by_state: + conditions.append( + or_( + *[ + comp_runs.c.result == RUNNING_STATE_TO_DB[s] + for s in filter_by_state + ] + ) + ) + if scheduled_since is not None: + scheduled_cutoff = arrow.utcnow().datetime - scheduled_since + conditions.append(comp_runs.c.last_scheduled >= scheduled_cutoff) + async with self.db_engine.acquire() as conn: - async for row in conn.execute( - sa.select(comp_runs).where( - or_( - *[ - comp_runs.c.result == RUNNING_STATE_TO_DB[s] - for s in filter_by_state - ] - ) + return [ + CompRunsAtDB.model_validate(row) + async for row in conn.execute( + sa.select(comp_runs).where(sa.and_(*conditions)) ) - ): - runs_in_db.append(CompRunsAtDB.model_validate(row)) - return list(runs_in_db) + ] async def create( self, From 2b12dcdd212a90b9c8ecc581230f45dbe563f98c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:23:10 +0100 Subject: [PATCH 003/127] ongoing new scheduler --- .../modules/db/repositories/comp_runs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index b012f39264e..aff2a1894d2 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -71,7 +71,12 @@ async def list( ) if scheduled_since is not None: scheduled_cutoff = arrow.utcnow().datetime - scheduled_since - conditions.append(comp_runs.c.last_scheduled >= scheduled_cutoff) + conditions.append( + or_( + comp_runs.c.last_scheduled.is_(None), + comp_runs.c.last_scheduled >= scheduled_cutoff, + ) + ) async with self.db_engine.acquire() as conn: return [ From 70c65732fbc8048fe99e3b0ca18e3e8574b71ee9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:26:09 +0100 Subject: [PATCH 004/127] ongoing new scheduler --- .../modules/comp_scheduler/_distributed_scheduler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index cb2dfc089e2..a19f2fa8b54 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -26,6 +26,10 @@ def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: async def schedule_pipelines(app: FastAPI) -> None: app_settings = get_application_settings(app) db_engine = get_db_engine(app) - runs_to_schedule = CompRunsRepository.instance(db_engine).list( + runs_to_schedule = await CompRunsRepository.instance(db_engine).list( filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL ) + + for run in runs_to_schedule: + # await rpc_request_schedule_pipeline(run.user_id, run.project_uuid, run.iteration) + pass From b1fb6c0002b0fefe9b2456cc23fa384ee63e391a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 22:42:00 +0100 Subject: [PATCH 005/127] initial test --- .../comp_scheduler/_distributed_scheduler.py | 2 +- services/director-v2/tests/unit/conftest.py | 5 ++ .../test_distributed_scheduler.py | 51 +++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index a19f2fa8b54..5101ab9f692 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -16,7 +16,7 @@ def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: - assert kwargs # nosec + assert kwargs is not None # nosec app = args[0] assert isinstance(app, FastAPI) # nosec return get_redis_client_manager(app).client(RedisDatabase.LOCKS) diff --git a/services/director-v2/tests/unit/conftest.py b/services/director-v2/tests/unit/conftest.py index 1375795f0cb..3e1764c4e55 100644 --- a/services/director-v2/tests/unit/conftest.py +++ b/services/director-v2/tests/unit/conftest.py @@ -184,6 +184,11 @@ def fake_s3_settings(faker: Faker) -> S3Settings: ) +@pytest.fixture +def fake_s3_envs(fake_s3_settings: S3Settings) -> EnvVarsDict: + return fake_s3_settings.dict() + + @pytest.fixture def mocked_storage_service_api( fake_s3_settings: S3Settings, diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py new file mode 100644 index 00000000000..df54e19624e --- /dev/null +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -0,0 +1,51 @@ +# pylint:disable=unused-variable +# pylint:disable=unused-argument +# pylint:disable=redefined-outer-name +# pylint:disable=no-value-for-parameter +# pylint:disable=protected-access +# pylint:disable=too-many-arguments +# pylint:disable=no-name-in-module +# pylint: disable=too-many-statements + + +import pytest +import sqlalchemy as sa +from fastapi import FastAPI +from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict +from pytest_simcore.helpers.typing_env import EnvVarsDict +from settings_library.rabbit import RabbitSettings +from settings_library.redis import RedisSettings +from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( + schedule_pipelines, +) + +pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] +pytest_simcore_ops_services_selection = [ + "adminer", +] + + +@pytest.fixture +def mock_env( + mock_env: EnvVarsDict, + monkeypatch: pytest.MonkeyPatch, + fake_s3_envs: EnvVarsDict, + postgres_db: sa.engine.Engine, + postgres_host_config: dict[str, str], + rabbit_service: RabbitSettings, + redis_service: RedisSettings, +) -> EnvVarsDict: + return mock_env | setenvs_from_dict( + monkeypatch, + {k: f"{v}" for k, v in fake_s3_envs.items()} + | { + "COMPUTATIONAL_BACKEND_ENABLED": True, + "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": True, + }, + ) + + +async def test_schedule_pipelines( + initialized_app: FastAPI, +): + await schedule_pipelines(initialized_app) From 9250d0603ebe1f2909b292d0e8d3d1c8b9d591e3 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 22:45:07 +0100 Subject: [PATCH 006/127] initial test --- .../comp_scheduler/test_distributed_scheduler.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index df54e19624e..c95700cce7f 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -8,11 +8,14 @@ # pylint: disable=too-many-statements +import asyncio + import pytest import sqlalchemy as sa from fastapi import FastAPI from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict from pytest_simcore.helpers.typing_env import EnvVarsDict +from servicelib.redis import CouldNotAcquireLockError from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( @@ -49,3 +52,13 @@ async def test_schedule_pipelines( initialized_app: FastAPI, ): await schedule_pipelines(initialized_app) + + +async def test_schedule_pipelines_concurently_raises( + initialized_app: FastAPI, +): + with pytest.raises( + CouldNotAcquireLockError, + match=".+ computational-distributed-scheduler", + ): + await asyncio.gather(*(schedule_pipelines(initialized_app) for _ in range(2))) From 0810978854e66c360e86fce936f625ae27e41d20 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 22:48:29 +0100 Subject: [PATCH 007/127] initial test --- .../comp_scheduler/test_distributed_scheduler.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index c95700cce7f..60260d6adc8 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -54,11 +54,21 @@ async def test_schedule_pipelines( await schedule_pipelines(initialized_app) -async def test_schedule_pipelines_concurently_raises( +async def test_schedule_pipelines_concurently_raises_and_only_one_runs( initialized_app: FastAPI, ): + CONCURRENCY = 5 with pytest.raises( CouldNotAcquireLockError, match=".+ computational-distributed-scheduler", ): - await asyncio.gather(*(schedule_pipelines(initialized_app) for _ in range(2))) + await asyncio.gather( + *(schedule_pipelines(initialized_app) for _ in range(CONCURRENCY)) + ) + + results = await asyncio.gather( + *(schedule_pipelines(initialized_app) for _ in range(CONCURRENCY)), + return_exceptions=True, + ) + + assert results.count(None) == 1, "Only one task should have run" From 94caa2abb071bb6db42388bcb99ad97b3af4cb91 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 23:09:55 +0100 Subject: [PATCH 008/127] added rabbit message exchange --- .../comp_scheduler/_distributed_scheduler.py | 21 ++++++++++++++++--- .../modules/comp_scheduler/_models.py | 19 +++++++++++++++++ .../modules/db/repositories/comp_runs.py | 10 +++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index 5101ab9f692..c42ed74ff86 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -5,6 +5,10 @@ from servicelib.redis import RedisClientSDK from servicelib.redis_utils import exclusive from settings_library.redis import RedisDatabase +from simcore_service_director_v2.modules.comp_scheduler._models import ( + SchedulePipelineRabbitMessage, +) +from simcore_service_director_v2.modules.rabbitmq import get_rabbitmq_client from ...core.settings import get_application_settings from ...utils.comp_scheduler import SCHEDULED_STATES @@ -29,7 +33,18 @@ async def schedule_pipelines(app: FastAPI) -> None: runs_to_schedule = await CompRunsRepository.instance(db_engine).list( filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL ) - + rabbitmq_client = get_rabbitmq_client(app) for run in runs_to_schedule: - # await rpc_request_schedule_pipeline(run.user_id, run.project_uuid, run.iteration) - pass + # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency + # async with transaction_context(get_asyncpg_engine(app)) as connection: + await rabbitmq_client.publish( + SchedulePipelineRabbitMessage.channel_name, + SchedulePipelineRabbitMessage( + user_id=run.user_id, + project_id=run.project_uuid, + iteration=run.iteration, + ), + ) + await CompRunsRepository.instance(db_engine).mark_as_scheduled( + user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration + ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py new file mode 100644 index 00000000000..f98a18b2851 --- /dev/null +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py @@ -0,0 +1,19 @@ +from typing import Literal + +from models_library.projects import ProjectID +from models_library.rabbitmq_messages import RabbitMessageBase +from models_library.users import UserID + +from ...utils.comp_scheduler import Iteration + + +class SchedulePipelineRabbitMessage(RabbitMessageBase): + channel_name: Literal[ + "simcore.services.director-v2.scheduling" + ] = "simcore.services.director-v2.scheduling" + user_id: UserID + project_id: ProjectID + iteration: Iteration + + def routing_key(self) -> str | None: + return None diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index aff2a1894d2..12bed67d3eb 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -176,3 +176,13 @@ async def mark_for_cancellation( iteration, cancelled=arrow.utcnow().datetime, ) + + async def mark_as_scheduled( + self, *, user_id: UserID, project_id: ProjectID, iteration: PositiveInt + ) -> CompRunsAtDB | None: + return await self.update( + user_id, + project_id, + iteration, + last_scheduled=arrow.utcnow().datetime, + ) From 655d51474bee16c7d90162f3ee058722ccad6aa5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 23:30:21 +0100 Subject: [PATCH 009/127] skeleton for distributed scheduler --- .../comp_scheduler/_distributed_scheduler.py | 123 +++++++++++++++--- 1 file changed, 107 insertions(+), 16 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index c42ed74ff86..3d6b6d86e3a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -1,22 +1,33 @@ import datetime +import logging from typing import Final +import networkx as nx +from aiopg.sa import Engine from fastapi import FastAPI +from models_library.clusters import ClusterID +from models_library.projects import ProjectID +from models_library.users import UserID +from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import RedisClientSDK from servicelib.redis_utils import exclusive +from servicelib.utils import limited_gather from settings_library.redis import RedisDatabase -from simcore_service_director_v2.modules.comp_scheduler._models import ( - SchedulePipelineRabbitMessage, -) -from simcore_service_director_v2.modules.rabbitmq import get_rabbitmq_client +from simcore_service_director_v2.utils.rabbitmq import publish_project_log from ...core.settings import get_application_settings +from ...models.comp_runs import CompRunsAtDB, RunMetadataDict from ...utils.comp_scheduler import SCHEDULED_STATES +from ..comp_scheduler._models import SchedulePipelineRabbitMessage from ..db import get_db_engine +from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository +from ..rabbitmq import get_rabbitmq_client from ..redis import get_redis_client_manager +_logger = logging.getLogger(__name__) _SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) +_MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: @@ -26,6 +37,30 @@ def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: return get_redis_client_manager(app).client(RedisDatabase.LOCKS) +async def _distribute_pipeline( + run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine +) -> None: + # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency + # async with transaction_context(get_asyncpg_engine(app)) as connection: + await rabbitmq_client.publish( + SchedulePipelineRabbitMessage.channel_name, + SchedulePipelineRabbitMessage( + user_id=run.user_id, + project_id=run.project_uuid, + iteration=run.iteration, + ), + ) + await CompRunsRepository.instance(db_engine).mark_as_scheduled( + user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration + ) + + +async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGraph: + comp_pipeline_repo = CompPipelinesRepository.instance(db_engine) + pipeline_at_db = await comp_pipeline_repo.get_pipeline(project_id) + return pipeline_at_db.get_graph() + + @exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") async def schedule_pipelines(app: FastAPI) -> None: app_settings = get_application_settings(app) @@ -34,17 +69,73 @@ async def schedule_pipelines(app: FastAPI) -> None: filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL ) rabbitmq_client = get_rabbitmq_client(app) - for run in runs_to_schedule: - # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency - # async with transaction_context(get_asyncpg_engine(app)) as connection: - await rabbitmq_client.publish( - SchedulePipelineRabbitMessage.channel_name, - SchedulePipelineRabbitMessage( - user_id=run.user_id, - project_id=run.project_uuid, - iteration=run.iteration, - ), + await limited_gather( + *( + _distribute_pipeline(run, rabbitmq_client, db_engine) + for run in runs_to_schedule + ), + limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, + ) + + +async def run_new_pipeline( + app: FastAPI, + *, + user_id: UserID, + project_id: ProjectID, + cluster_id: ClusterID, + run_metadata: RunMetadataDict, + use_on_demand_clusters: bool, +) -> None: + """Sets a new pipeline to be scheduled on the computational resources. + Passing cluster_id=0 will use the default cluster. Passing an existing ID will instruct + the scheduler to run the tasks on the defined cluster""" + # ensure the pipeline exists and is populated with something + db_engine = get_db_engine(app) + dag = await _get_pipeline_dag(project_id, db_engine) + if not dag: + _logger.warning( + "project %s has no computational dag defined. not scheduled for a run.", + f"{project_id=}", ) - await CompRunsRepository.instance(db_engine).mark_as_scheduled( - user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration + return + + new_run = await CompRunsRepository.instance(db_engine).create( + user_id=user_id, + project_id=project_id, + cluster_id=cluster_id, + metadata=run_metadata, + use_on_demand_clusters=use_on_demand_clusters, + ) + await _distribute_pipeline(new_run, get_rabbitmq_client(app), db_engine) + await publish_project_log( + get_rabbitmq_client(app), + user_id, + project_id, + log=f"Project pipeline scheduled using {'on-demand clusters' if use_on_demand_clusters else 'pre-defined clusters'}, starting soon...", + log_level=logging.INFO, + ) + + +async def stop_pipeline( + app: FastAPI, + *, + user_id: UserID, + project_id: ProjectID, + iteration: int | None = None, +) -> None: + comp_run = await CompRunsRepository.instance(get_db_engine(app)).get( + user_id, project_id, iteration + ) + + # mark the scheduled pipeline for stopping + updated_comp_run = await CompRunsRepository.instance( + get_db_engine(app) + ).mark_for_cancellation( + user_id=user_id, project_id=project_id, iteration=comp_run.iteration + ) + if updated_comp_run: + # ensure the scheduler starts right away + await _distribute_pipeline( + updated_comp_run, get_rabbitmq_client(app), get_db_engine(app) ) From 365444436c60b0b772545b6ad9e5fcf15703f967 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 15 Nov 2024 23:33:22 +0100 Subject: [PATCH 010/127] skeleton for distributed scheduler --- .../comp_scheduler/_distributed_scheduler.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index 3d6b6d86e3a..3d45574f5e5 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -13,11 +13,10 @@ from servicelib.redis_utils import exclusive from servicelib.utils import limited_gather from settings_library.redis import RedisDatabase -from simcore_service_director_v2.utils.rabbitmq import publish_project_log -from ...core.settings import get_application_settings from ...models.comp_runs import CompRunsAtDB, RunMetadataDict from ...utils.comp_scheduler import SCHEDULED_STATES +from ...utils.rabbitmq import publish_project_log from ..comp_scheduler._models import SchedulePipelineRabbitMessage from ..db import get_db_engine from ..db.repositories.comp_pipelines import CompPipelinesRepository @@ -63,7 +62,6 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr @exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") async def schedule_pipelines(app: FastAPI) -> None: - app_settings = get_application_settings(app) db_engine = get_db_engine(app) runs_to_schedule = await CompRunsRepository.instance(db_engine).list( filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL @@ -107,9 +105,11 @@ async def run_new_pipeline( metadata=run_metadata, use_on_demand_clusters=use_on_demand_clusters, ) - await _distribute_pipeline(new_run, get_rabbitmq_client(app), db_engine) + + rabbitmq_client = get_rabbitmq_client(app) + await _distribute_pipeline(new_run, rabbitmq_client, db_engine) await publish_project_log( - get_rabbitmq_client(app), + rabbitmq_client, user_id, project_id, log=f"Project pipeline scheduled using {'on-demand clusters' if use_on_demand_clusters else 'pre-defined clusters'}, starting soon...", @@ -124,18 +124,18 @@ async def stop_pipeline( project_id: ProjectID, iteration: int | None = None, ) -> None: - comp_run = await CompRunsRepository.instance(get_db_engine(app)).get( + db_engine = get_db_engine(app) + comp_run = await CompRunsRepository.instance(db_engine).get( user_id, project_id, iteration ) # mark the scheduled pipeline for stopping updated_comp_run = await CompRunsRepository.instance( - get_db_engine(app) + db_engine ).mark_for_cancellation( user_id=user_id, project_id=project_id, iteration=comp_run.iteration ) if updated_comp_run: # ensure the scheduler starts right away - await _distribute_pipeline( - updated_comp_run, get_rabbitmq_client(app), get_db_engine(app) - ) + rabbitmq_client = get_rabbitmq_client(app) + await _distribute_pipeline(updated_comp_run, rabbitmq_client, db_engine) From 47710717408ac23ffaa8909249796d9ca663ea2b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 17 Nov 2024 20:38:28 +0100 Subject: [PATCH 011/127] checking rabbit mq --- .../comp_scheduler/_distributed_scheduler.py | 94 +++++++++---------- .../test_distributed_scheduler.py | 36 +++++++ 2 files changed, 83 insertions(+), 47 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index 3d45574f5e5..2f7e723fd40 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -29,53 +29,6 @@ _MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 -def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: - assert kwargs is not None # nosec - app = args[0] - assert isinstance(app, FastAPI) # nosec - return get_redis_client_manager(app).client(RedisDatabase.LOCKS) - - -async def _distribute_pipeline( - run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine -) -> None: - # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency - # async with transaction_context(get_asyncpg_engine(app)) as connection: - await rabbitmq_client.publish( - SchedulePipelineRabbitMessage.channel_name, - SchedulePipelineRabbitMessage( - user_id=run.user_id, - project_id=run.project_uuid, - iteration=run.iteration, - ), - ) - await CompRunsRepository.instance(db_engine).mark_as_scheduled( - user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration - ) - - -async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGraph: - comp_pipeline_repo = CompPipelinesRepository.instance(db_engine) - pipeline_at_db = await comp_pipeline_repo.get_pipeline(project_id) - return pipeline_at_db.get_graph() - - -@exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") -async def schedule_pipelines(app: FastAPI) -> None: - db_engine = get_db_engine(app) - runs_to_schedule = await CompRunsRepository.instance(db_engine).list( - filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL - ) - rabbitmq_client = get_rabbitmq_client(app) - await limited_gather( - *( - _distribute_pipeline(run, rabbitmq_client, db_engine) - for run in runs_to_schedule - ), - limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, - ) - - async def run_new_pipeline( app: FastAPI, *, @@ -139,3 +92,50 @@ async def stop_pipeline( # ensure the scheduler starts right away rabbitmq_client = get_rabbitmq_client(app) await _distribute_pipeline(updated_comp_run, rabbitmq_client, db_engine) + + +def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: + assert kwargs is not None # nosec + app = args[0] + assert isinstance(app, FastAPI) # nosec + return get_redis_client_manager(app).client(RedisDatabase.LOCKS) + + +async def _distribute_pipeline( + run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine +) -> None: + # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency + # async with transaction_context(get_asyncpg_engine(app)) as connection: + await rabbitmq_client.publish( + SchedulePipelineRabbitMessage.channel_name, + SchedulePipelineRabbitMessage( + user_id=run.user_id, + project_id=run.project_uuid, + iteration=run.iteration, + ), + ) + await CompRunsRepository.instance(db_engine).mark_as_scheduled( + user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration + ) + + +async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGraph: + comp_pipeline_repo = CompPipelinesRepository.instance(db_engine) + pipeline_at_db = await comp_pipeline_repo.get_pipeline(project_id) + return pipeline_at_db.get_graph() + + +@exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") +async def schedule_pipelines(app: FastAPI) -> None: + db_engine = get_db_engine(app) + runs_to_schedule = await CompRunsRepository.instance(db_engine).list( + filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL + ) + rabbitmq_client = get_rabbitmq_client(app) + await limited_gather( + *( + _distribute_pipeline(run, rabbitmq_client, db_engine) + for run in runs_to_schedule + ), + limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, + ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 60260d6adc8..18be49aaefa 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -9,18 +9,29 @@ import asyncio +from typing import Any, AsyncIterator, Awaitable, Callable +from unittest import mock import pytest import sqlalchemy as sa from fastapi import FastAPI +from models_library.projects import ProjectAtDB +from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict from pytest_simcore.helpers.typing_env import EnvVarsDict +from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import CouldNotAcquireLockError from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings +from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB +from simcore_service_director_v2.models.comp_runs import CompRunsAtDB +from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( schedule_pipelines, ) +from simcore_service_director_v2.modules.comp_scheduler._models import ( + SchedulePipelineRabbitMessage, +) pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] pytest_simcore_ops_services_selection = [ @@ -48,10 +59,25 @@ def mock_env( ) +@pytest.fixture +async def scheduler_rabbit_client_parser( + create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture +) -> AsyncIterator[mock.AsyncMock]: + client = create_rabbitmq_client("scheduling_pytest_consumer") + mock = mocker.AsyncMock(return_value=True) + queue_name = await client.subscribe( + SchedulePipelineRabbitMessage.get_channel_name(), mock + ) + yield mock + await client.unsubscribe(queue_name) + + async def test_schedule_pipelines( initialized_app: FastAPI, + scheduler_rabbit_client_parser: mock.AsyncMock, ): await schedule_pipelines(initialized_app) + scheduler_rabbit_client_parser.assert_not_called() async def test_schedule_pipelines_concurently_raises_and_only_one_runs( @@ -72,3 +98,13 @@ async def test_schedule_pipelines_concurently_raises_and_only_one_runs( ) assert results.count(None) == 1, "Only one task should have run" + + +async def test_schedule_pipelines_with_runs( + registered_user: Callable[..., dict[str, Any]], + project: Callable[..., Awaitable[ProjectAtDB]], + pipeline: Callable[..., CompPipelineAtDB], + tasks: Callable[..., list[CompTaskAtDB]], + runs: Callable[..., CompRunsAtDB], +): + ... From adb66895553a6f5af1143d27d70e34f704ec8a6c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 17 Nov 2024 20:43:38 +0100 Subject: [PATCH 012/127] checking comp_runs table --- .../test_distributed_scheduler.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 18be49aaefa..1b324f57121 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -12,6 +12,7 @@ from typing import Any, AsyncIterator, Awaitable, Callable from unittest import mock +import aiopg.sa import pytest import sqlalchemy as sa from fastapi import FastAPI @@ -23,6 +24,7 @@ from servicelib.redis import CouldNotAcquireLockError from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings +from simcore_postgres_database.models.comp_runs import comp_runs from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB @@ -75,10 +77,25 @@ async def scheduler_rabbit_client_parser( async def test_schedule_pipelines( initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, + aiopg_engine: aiopg.sa.engine.Engine, ): + async with aiopg_engine.acquire() as conn: + # check comp_runs is empty + total_number_of_items = await conn.scalar( + sa.select(sa.func.count()).select_from(comp_runs) + ) + assert total_number_of_items == 0 + await schedule_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_not_called() + async with aiopg_engine.acquire() as conn: + # check comp_runs is empty + total_number_of_items = await conn.scalar( + sa.select(sa.func.count()).select_from(comp_runs) + ) + assert total_number_of_items == 0 + async def test_schedule_pipelines_concurently_raises_and_only_one_runs( initialized_app: FastAPI, From 081d7e1dc17118855e4a7754ed92c0f115be7cae Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 17 Nov 2024 21:31:26 +0100 Subject: [PATCH 013/127] checking comp_runs table --- .../test_distributed_scheduler.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 1b324f57121..ac778fdc51f 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -12,7 +12,6 @@ from typing import Any, AsyncIterator, Awaitable, Callable from unittest import mock -import aiopg.sa import pytest import sqlalchemy as sa from fastapi import FastAPI @@ -34,6 +33,7 @@ from simcore_service_director_v2.modules.comp_scheduler._models import ( SchedulePipelineRabbitMessage, ) +from sqlalchemy.ext.asyncio import AsyncEngine pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] pytest_simcore_ops_services_selection = [ @@ -74,23 +74,25 @@ async def scheduler_rabbit_client_parser( await client.unsubscribe(queue_name) -async def test_schedule_pipelines( +async def test_schedule_pipelines_empty_db( initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, ): - async with aiopg_engine.acquire() as conn: - # check comp_runs is empty + # check comp_runs is empty + async with sqlalchemy_async_engine.connect() as conn: total_number_of_items = await conn.scalar( sa.select(sa.func.count()).select_from(comp_runs) ) assert total_number_of_items == 0 await schedule_pipelines(initialized_app) + + # check nothing was distributed scheduler_rabbit_client_parser.assert_not_called() - async with aiopg_engine.acquire() as conn: - # check comp_runs is empty + # check comp_runs is still empty + async with sqlalchemy_async_engine.connect() as conn: total_number_of_items = await conn.scalar( sa.select(sa.func.count()).select_from(comp_runs) ) @@ -101,6 +103,7 @@ async def test_schedule_pipelines_concurently_raises_and_only_one_runs( initialized_app: FastAPI, ): CONCURRENCY = 5 + # TODO: this can be flaky as an empty scheduling is very short with pytest.raises( CouldNotAcquireLockError, match=".+ computational-distributed-scheduler", @@ -124,4 +127,11 @@ async def test_schedule_pipelines_with_runs( tasks: Callable[..., list[CompTaskAtDB]], runs: Callable[..., CompRunsAtDB], ): - ... + user = registered_user() + proj = await project(user, workbench=fake_workbench_without_outputs) + pipeline( + project_id=proj.uuid, + dag_adjacency_list=fake_workbench_adjacency, + ) + comp_tasks = tasks(user=user, project=proj, state=StateType.PUBLISHED, progress=0) + comp_runs = runs(user=user, project=proj, result=StateType.PUBLISHED) From dd5cce77cff336eef29decfa8bab38592f12f039 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 08:53:23 +0100 Subject: [PATCH 014/127] ongoing new scheduler tests --- .../test_distributed_scheduler.py | 70 +++++++++++-------- .../tests/unit/with_dbs/conftest.py | 33 ++++++--- 2 files changed, 63 insertions(+), 40 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index ac778fdc51f..80f33bf4211 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -9,13 +9,14 @@ import asyncio -from typing import Any, AsyncIterator, Awaitable, Callable +from collections.abc import AsyncIterator, Callable from unittest import mock import pytest import sqlalchemy as sa +from _helpers import PublishedProject from fastapi import FastAPI -from models_library.projects import ProjectAtDB +from models_library.clusters import DEFAULT_CLUSTER_ID from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict from pytest_simcore.helpers.typing_env import EnvVarsDict @@ -24,10 +25,9 @@ from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings from simcore_postgres_database.models.comp_runs import comp_runs -from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB -from simcore_service_director_v2.models.comp_runs import CompRunsAtDB -from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB +from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( + run_new_pipeline, schedule_pipelines, ) from simcore_service_director_v2.modules.comp_scheduler._models import ( @@ -74,17 +74,28 @@ async def scheduler_rabbit_client_parser( await client.unsubscribe(queue_name) +async def _assert_comp_runs( + sqlalchemy_async_engine: AsyncEngine, *, expected_total: int +) -> list[CompRunsAtDB]: + async with sqlalchemy_async_engine.connect() as conn: + list_of_comp_runs = [ + CompRunsAtDB.from_orm(row) + async for row in await conn.stream(sa.select().select_from(comp_runs)) + ] + assert len(list_of_comp_runs) == expected_total + return list_of_comp_runs + + +async def _assert_comp_runs_empty(sqlalchemy_async_engine: AsyncEngine) -> None: + await _assert_comp_runs(sqlalchemy_async_engine, expected_total=0) + + async def test_schedule_pipelines_empty_db( initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, sqlalchemy_async_engine: AsyncEngine, ): - # check comp_runs is empty - async with sqlalchemy_async_engine.connect() as conn: - total_number_of_items = await conn.scalar( - sa.select(sa.func.count()).select_from(comp_runs) - ) - assert total_number_of_items == 0 + await _assert_comp_runs_empty(sqlalchemy_async_engine) await schedule_pipelines(initialized_app) @@ -92,11 +103,7 @@ async def test_schedule_pipelines_empty_db( scheduler_rabbit_client_parser.assert_not_called() # check comp_runs is still empty - async with sqlalchemy_async_engine.connect() as conn: - total_number_of_items = await conn.scalar( - sa.select(sa.func.count()).select_from(comp_runs) - ) - assert total_number_of_items == 0 + await _assert_comp_runs_empty(sqlalchemy_async_engine) async def test_schedule_pipelines_concurently_raises_and_only_one_runs( @@ -120,18 +127,23 @@ async def test_schedule_pipelines_concurently_raises_and_only_one_runs( assert results.count(None) == 1, "Only one task should have run" -async def test_schedule_pipelines_with_runs( - registered_user: Callable[..., dict[str, Any]], - project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], - runs: Callable[..., CompRunsAtDB], +async def test_schedule_pipelines_with_non_scheduled_runs( + initialized_app: FastAPI, + published_project: PublishedProject, + sqlalchemy_async_engine: AsyncEngine, + run_metadata: RunMetadataDict, + scheduler_rabbit_client_parser: mock.AsyncMock, ): - user = registered_user() - proj = await project(user, workbench=fake_workbench_without_outputs) - pipeline( - project_id=proj.uuid, - dag_adjacency_list=fake_workbench_adjacency, + await _assert_comp_runs_empty(sqlalchemy_async_engine) + # now we schedule a pipeline + assert published_project.project.prj_owner + await run_new_pipeline( + initialized_app, + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, ) - comp_tasks = tasks(user=user, project=proj, state=StateType.PUBLISHED, progress=0) - comp_runs = runs(user=user, project=proj, result=StateType.PUBLISHED) + scheduler_rabbit_client_parser.assert_called_once() + comp_run = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1)[0] diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index fdb3b7d5a64..ab728c921b0 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -295,24 +295,35 @@ def creator(user: dict[str, Any], **cluster_kwargs) -> Cluster: @pytest.fixture -async def published_project( +async def publish_project( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], tasks: Callable[..., list[CompTaskAtDB]], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], -) -> PublishedProject: +) -> Callable[[], Awaitable[PublishedProject]]: user = registered_user() - created_project = await project(user, workbench=fake_workbench_without_outputs) - return PublishedProject( - project=created_project, - pipeline=pipeline( - project_id=f"{created_project.uuid}", - dag_adjacency_list=fake_workbench_adjacency, - ), - tasks=tasks(user=user, project=created_project, state=StateType.PUBLISHED), - ) + + async def _() -> PublishedProject: + created_project = await project(user, workbench=fake_workbench_without_outputs) + return PublishedProject( + project=created_project, + pipeline=pipeline( + project_id=f"{created_project.uuid}", + dag_adjacency_list=fake_workbench_adjacency, + ), + tasks=tasks(user=user, project=created_project, state=StateType.PUBLISHED), + ) + + return _ + + +@pytest.fixture +async def published_project( + publish_project: Callable[[], Awaitable[PublishedProject]] +) -> PublishedProject: + return await publish_project() @pytest.fixture From 39a463351ffb5f670a54689d21a657080e65f57b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 08:54:35 +0100 Subject: [PATCH 015/127] ongoing new scheduler tests --- .../with_dbs/comp_scheduler/test_distributed_scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 80f33bf4211..f358e260c0b 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -145,5 +145,5 @@ async def test_schedule_pipelines_with_non_scheduled_runs( run_metadata=run_metadata, use_on_demand_clusters=False, ) - scheduler_rabbit_client_parser.assert_called_once() - comp_run = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1)[0] + scheduler_rabbit_client_parser.assert_called_once_with() + comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) From 01cf898c31d9f469c5ab9f0f38b1aa910ffaeed5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:09:58 +0100 Subject: [PATCH 016/127] fixed tests --- .../comp_scheduler/_distributed_scheduler.py | 2 +- .../test_distributed_scheduler.py | 48 ++++++++++++------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index 2f7e723fd40..4f9cfdc639e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -107,7 +107,7 @@ async def _distribute_pipeline( # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency # async with transaction_context(get_asyncpg_engine(app)) as connection: await rabbitmq_client.publish( - SchedulePipelineRabbitMessage.channel_name, + SchedulePipelineRabbitMessage.get_channel_name(), SchedulePipelineRabbitMessage( user_id=run.user_id, project_id=run.project_uuid, diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index f358e260c0b..f06996ac492 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -22,6 +22,7 @@ from pytest_simcore.helpers.typing_env import EnvVarsDict from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import CouldNotAcquireLockError +from servicelib.utils import limited_gather from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings from simcore_postgres_database.models.comp_runs import comp_runs @@ -36,9 +37,7 @@ from sqlalchemy.ext.asyncio import AsyncEngine pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] -pytest_simcore_ops_services_selection = [ - "adminer", -] +pytest_simcore_ops_services_selection = ["adminer", "redis-commander"] @pytest.fixture @@ -80,7 +79,7 @@ async def _assert_comp_runs( async with sqlalchemy_async_engine.connect() as conn: list_of_comp_runs = [ CompRunsAtDB.from_orm(row) - async for row in await conn.stream(sa.select().select_from(comp_runs)) + for row in await conn.execute(sa.select(comp_runs)) ] assert len(list_of_comp_runs) == expected_total return list_of_comp_runs @@ -106,25 +105,34 @@ async def test_schedule_pipelines_empty_db( await _assert_comp_runs_empty(sqlalchemy_async_engine) -async def test_schedule_pipelines_concurently_raises_and_only_one_runs( - initialized_app: FastAPI, +async def test_schedule_pipelines_concurently_runs_exclusively_and_raises( + initialized_app: FastAPI, mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch ): CONCURRENCY = 5 - # TODO: this can be flaky as an empty scheduling is very short - with pytest.raises( - CouldNotAcquireLockError, - match=".+ computational-distributed-scheduler", - ): - await asyncio.gather( - *(schedule_pipelines(initialized_app) for _ in range(CONCURRENCY)) - ) + # NOTE: this ensure no flakyness as empty scheduling is very fast + original_function = limited_gather + + async def slow_limited_gather(*args, **kwargs): + result = await original_function(*args, **kwargs) + await asyncio.sleep(3) # to ensure flakyness does not occur + return result + + mock_function = mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler.limited_gather", + autospec=True, + side_effect=slow_limited_gather, + ) results = await asyncio.gather( *(schedule_pipelines(initialized_app) for _ in range(CONCURRENCY)), return_exceptions=True, ) - assert results.count(None) == 1, "Only one task should have run" + assert results.count(None) == 1, f"Only one task should have run: {results}" + for r in results: + if r: + assert isinstance(r, CouldNotAcquireLockError) + mock_function.assert_called_once() async def test_schedule_pipelines_with_non_scheduled_runs( @@ -135,8 +143,8 @@ async def test_schedule_pipelines_with_non_scheduled_runs( scheduler_rabbit_client_parser: mock.AsyncMock, ): await _assert_comp_runs_empty(sqlalchemy_async_engine) - # now we schedule a pipeline assert published_project.project.prj_owner + # now we schedule a pipeline await run_new_pipeline( initialized_app, user_id=published_project.project.prj_owner, @@ -145,5 +153,11 @@ async def test_schedule_pipelines_with_non_scheduled_runs( run_metadata=run_metadata, use_on_demand_clusters=False, ) - scheduler_rabbit_client_parser.assert_called_once_with() + scheduler_rabbit_client_parser.assert_called_once_with( + SchedulePipelineRabbitMessage( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + ).body() + ) comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) From dad78fb1189eed9d96836904357256f5c8a14e03 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:57:06 +0100 Subject: [PATCH 017/127] 100% tested --- .../models/comp_runs.py | 5 + .../modules/db/repositories/comp_runs.py | 2 +- .../test_distributed_scheduler.py | 120 +++++++++++++++++- 3 files changed, 125 insertions(+), 2 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py index 62270380293..65b9468686a 100644 --- a/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py @@ -51,6 +51,7 @@ class CompRunsAtDB(BaseModel): cancelled: datetime.datetime | None metadata: RunMetadataDict = RunMetadataDict() use_on_demand_clusters: bool + last_scheduled: datetime.datetime | None @field_validator("result", mode="before") @classmethod @@ -103,6 +104,7 @@ def convert_null_to_empty_metadata(cls, v): "modified": "2021-03-01T13:07:34.191610", "cancelled": None, "use_on_demand_clusters": False, + "last_scheduled": None, }, { "run_id": 432, @@ -117,6 +119,7 @@ def convert_null_to_empty_metadata(cls, v): "modified": "2021-03-01T13:07:34.191610", "cancelled": None, "use_on_demand_clusters": False, + "last_scheduled": None, }, { "run_id": 43243, @@ -138,6 +141,7 @@ def convert_null_to_empty_metadata(cls, v): "some-other-metadata-which-is-an-array": [1, 3, 4], }, "use_on_demand_clusters": False, + "last_scheduled": None, }, { "run_id": 43243, @@ -153,6 +157,7 @@ def convert_null_to_empty_metadata(cls, v): "cancelled": None, "metadata": None, "use_on_demand_clusters": False, + "last_scheduled": None, }, ] }, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 12bed67d3eb..ed4fac45081 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -74,7 +74,7 @@ async def list( conditions.append( or_( comp_runs.c.last_scheduled.is_(None), - comp_runs.c.last_scheduled >= scheduled_cutoff, + comp_runs.c.last_scheduled <= scheduled_cutoff, ) ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index f06996ac492..4f7213b9dcd 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -9,7 +9,9 @@ import asyncio +import logging from collections.abc import AsyncIterator, Callable +from typing import Any, Awaitable from unittest import mock import pytest @@ -17,6 +19,8 @@ from _helpers import PublishedProject from fastapi import FastAPI from models_library.clusters import DEFAULT_CLUSTER_ID +from models_library.projects import ProjectAtDB +from models_library.projects_state import RunningState from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict from pytest_simcore.helpers.typing_env import EnvVarsDict @@ -26,10 +30,14 @@ from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings from simcore_postgres_database.models.comp_runs import comp_runs +from simcore_service_director_v2.core.errors import PipelineNotFoundError +from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( + _SCHEDULER_INTERVAL, run_new_pipeline, schedule_pipelines, + stop_pipeline, ) from simcore_service_director_v2.modules.comp_scheduler._models import ( SchedulePipelineRabbitMessage, @@ -135,7 +143,7 @@ async def slow_limited_gather(*args, **kwargs): mock_function.assert_called_once() -async def test_schedule_pipelines_with_non_scheduled_runs( +async def test_schedule_pipelines( initialized_app: FastAPI, published_project: PublishedProject, sqlalchemy_async_engine: AsyncEngine, @@ -153,6 +161,7 @@ async def test_schedule_pipelines_with_non_scheduled_runs( run_metadata=run_metadata, use_on_demand_clusters=False, ) + # this directly schedule a new pipeline scheduler_rabbit_client_parser.assert_called_once_with( SchedulePipelineRabbitMessage( user_id=published_project.project.prj_owner, @@ -160,4 +169,113 @@ async def test_schedule_pipelines_with_non_scheduled_runs( iteration=1, ).body() ) + scheduler_rabbit_client_parser.reset_mock() comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_run = comp_runs[0] + assert comp_run.project_uuid == published_project.project.uuid + assert comp_run.user_id == published_project.project.prj_owner + assert comp_run.iteration == 1 + assert comp_run.cancelled is None + assert comp_run.cluster_id == DEFAULT_CLUSTER_ID + assert comp_run.metadata == run_metadata + assert comp_run.result is RunningState.PUBLISHED + assert comp_run.last_scheduled is not None + start_schedule_time = comp_run.last_scheduled + start_modified_time = comp_run.modified + + await asyncio.sleep(_SCHEDULER_INTERVAL.total_seconds() - 1) + + # this will now not schedule the pipeline since it was last scheduled + await schedule_pipelines(initialized_app) + scheduler_rabbit_client_parser.assert_not_called() + comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_run = comp_runs[0] + assert comp_run.last_scheduled == start_schedule_time, "scheduled time changed!" + assert comp_run.cancelled is None + assert comp_run.modified == start_modified_time + + await asyncio.sleep(_SCHEDULER_INTERVAL.total_seconds() + 1) + # this will now schedule the pipeline since the time passed + await schedule_pipelines(initialized_app) + scheduler_rabbit_client_parser.assert_called_once_with( + SchedulePipelineRabbitMessage( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + ).body() + ) + scheduler_rabbit_client_parser.reset_mock() + comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_run = comp_runs[0] + assert comp_run.last_scheduled is not None + assert comp_run.last_scheduled > start_schedule_time + last_schedule_time = comp_run.last_scheduled + assert comp_run.cancelled is None + assert comp_run.modified > start_modified_time + + # now we stop the pipeline, which should instantly trigger a schedule + await stop_pipeline( + initialized_app, + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + ) + await schedule_pipelines(initialized_app) + scheduler_rabbit_client_parser.assert_called_once_with( + SchedulePipelineRabbitMessage( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + ).body() + ) + scheduler_rabbit_client_parser.reset_mock() + comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_run = comp_runs[0] + assert comp_run.last_scheduled is not None + assert comp_run.last_scheduled > last_schedule_time + assert comp_run.cancelled is not None + + +async def test_empty_pipeline_is_not_scheduled( + initialized_app: FastAPI, + registered_user: Callable[..., dict[str, Any]], + project: Callable[..., Awaitable[ProjectAtDB]], + pipeline: Callable[..., CompPipelineAtDB], + run_metadata: RunMetadataDict, + sqlalchemy_async_engine: AsyncEngine, + scheduler_rabbit_client_parser: mock.AsyncMock, + caplog: pytest.LogCaptureFixture, +): + await _assert_comp_runs_empty(sqlalchemy_async_engine) + user = registered_user() + empty_project = await project(user) + + # the project is not in the comp_pipeline, therefore scheduling it should fail + with pytest.raises(PipelineNotFoundError): + await run_new_pipeline( + initialized_app, + user_id=user["id"], + project_id=empty_project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + await _assert_comp_runs_empty(sqlalchemy_async_engine) + scheduler_rabbit_client_parser.assert_not_called() + + # create the empty pipeline now + pipeline(project_id=f"{empty_project.uuid}") + + # creating a run with an empty pipeline is useless, check the scheduler is not kicking in + with caplog.at_level(logging.WARNING): + await run_new_pipeline( + initialized_app, + user_id=user["id"], + project_id=empty_project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + assert len(caplog.records) == 1 + assert "no computational dag defined" in caplog.records[0].message + await _assert_comp_runs_empty(sqlalchemy_async_engine) + scheduler_rabbit_client_parser.assert_not_called() From acceed4d1cd5cf54fb13e9aeddcb5af780712f27 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:21:21 +0100 Subject: [PATCH 018/127] setup distributed scheduler --- .../modules/comp_scheduler/__init__.py | 30 ++++++++++++------- .../comp_scheduler/_distributed_scheduler.py | 4 +-- .../test_distributed_scheduler.py | 6 ++-- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index 2b29acf16c9..d6100d638de 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -1,12 +1,17 @@ import logging from collections.abc import Callable, Coroutine -from typing import Any, cast +from typing import Any from fastapi import FastAPI +from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context -from . import _scheduler_factory -from ._base_scheduler import BaseCompScheduler +from ._distributed_scheduler import ( + SCHEDULER_INTERVAL, + run_new_pipeline, + schedule_pipelines, + stop_pipeline, +) _logger = logging.getLogger(__name__) @@ -16,22 +21,25 @@ async def start_scheduler() -> None: with log_context( _logger, level=logging.INFO, msg="starting computational scheduler" ): - app.state.scheduler = await _scheduler_factory.create_from_db(app) + app.state.scheduler = start_periodic_task( + schedule_pipelines, + interval=SCHEDULER_INTERVAL, + task_name="computational-distributed-scheduler", + ) return start_scheduler def on_app_shutdown(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def stop_scheduler() -> None: - await get_scheduler(app).shutdown() + with log_context( + _logger, level=logging.INFO, msg="stopping computational scheduler" + ): + await stop_periodic_task(app.state.scheduler) return stop_scheduler -def get_scheduler(app: FastAPI) -> BaseCompScheduler: - return cast(BaseCompScheduler, app.state.scheduler) - - def setup(app: FastAPI): app.add_event_handler("startup", on_app_startup(app)) app.add_event_handler("shutdown", on_app_shutdown(app)) @@ -39,6 +47,6 @@ def setup(app: FastAPI): __all__: tuple[str, ...] = ( "setup", - "BaseCompScheduler", - "get_scheduler", + "run_new_pipeline", + "stop_pipeline", ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index 4f9cfdc639e..595cfce3666 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -25,7 +25,7 @@ from ..redis import get_redis_client_manager _logger = logging.getLogger(__name__) -_SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) +SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) _MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 @@ -129,7 +129,7 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr async def schedule_pipelines(app: FastAPI) -> None: db_engine = get_db_engine(app) runs_to_schedule = await CompRunsRepository.instance(db_engine).list( - filter_by_state=SCHEDULED_STATES, scheduled_since=_SCHEDULER_INTERVAL + filter_by_state=SCHEDULED_STATES, scheduled_since=SCHEDULER_INTERVAL ) rabbitmq_client = get_rabbitmq_client(app) await limited_gather( diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 4f7213b9dcd..c7cf0522aa9 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -34,7 +34,7 @@ from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( - _SCHEDULER_INTERVAL, + SCHEDULER_INTERVAL, run_new_pipeline, schedule_pipelines, stop_pipeline, @@ -183,7 +183,7 @@ async def test_schedule_pipelines( start_schedule_time = comp_run.last_scheduled start_modified_time = comp_run.modified - await asyncio.sleep(_SCHEDULER_INTERVAL.total_seconds() - 1) + await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() - 1) # this will now not schedule the pipeline since it was last scheduled await schedule_pipelines(initialized_app) @@ -194,7 +194,7 @@ async def test_schedule_pipelines( assert comp_run.cancelled is None assert comp_run.modified == start_modified_time - await asyncio.sleep(_SCHEDULER_INTERVAL.total_seconds() + 1) + await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() + 1) # this will now schedule the pipeline since the time passed await schedule_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_called_once_with( From a6ddc56ad8e292d334cdbd0522fdfec915fdc692 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:46:58 +0100 Subject: [PATCH 019/127] first connection manager/worker --- .../modules/comp_scheduler/__init__.py | 41 +++- .../modules/comp_scheduler/_base_scheduler.py | 230 ++---------------- .../comp_scheduler/_scheduler_factory.py | 16 +- 3 files changed, 65 insertions(+), 222 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index d6100d638de..7d282c18e4c 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -1,3 +1,4 @@ +import functools import logging from collections.abc import Callable, Coroutine from typing import Any @@ -5,23 +6,53 @@ from fastapi import FastAPI from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context +from simcore_service_director_v2.modules.comp_scheduler._base_scheduler import ( + BaseCompScheduler, +) +from ..rabbitmq import get_rabbitmq_client from ._distributed_scheduler import ( SCHEDULER_INTERVAL, run_new_pipeline, schedule_pipelines, stop_pipeline, ) +from ._models import SchedulePipelineRabbitMessage +from ._scheduler_factory import create_scheduler _logger = logging.getLogger(__name__) +def _empty_wake_up_callack() -> None: + return + + +async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: + to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) + await _get_scheduler_worker(app)._schedule_pipeline( + user_id=to_schedule_pipeline.user_id, + project_id=to_schedule_pipeline.project_id, + iteration=to_schedule_pipeline.iteration, + wake_up_callback=_empty_wake_up_callack, + ) + return True + + def on_app_startup(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def start_scheduler() -> None: with log_context( _logger, level=logging.INFO, msg="starting computational scheduler" ): - app.state.scheduler = start_periodic_task( + rabbitmq_client = get_rabbitmq_client(app) + await rabbitmq_client.subscribe( + SchedulePipelineRabbitMessage.get_channel_name(), + functools.partial(_handle_distributed_pipeline, app), + exclusive_queue=False, + ) + + app.state.scheduler_worker = create_scheduler(app) + + app.state.scheduler_manager = start_periodic_task( schedule_pipelines, interval=SCHEDULER_INTERVAL, task_name="computational-distributed-scheduler", @@ -35,11 +66,17 @@ async def stop_scheduler() -> None: with log_context( _logger, level=logging.INFO, msg="stopping computational scheduler" ): - await stop_periodic_task(app.state.scheduler) + await stop_periodic_task(app.state.scheduler_manager) + + # TODO: we might want to stop anything running in the worker too return stop_scheduler +def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: + return app.state.scheduler_worker + + def setup(app: FastAPI): app.add_event_handler("startup", on_app_startup(app)) app.add_event_handler("shutdown", on_app_shutdown(app)) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index 2d663aec9a1..d0b4f253cd7 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -12,19 +12,16 @@ """ import asyncio -import contextlib import datetime -import functools import logging from abc import ABC, abstractmethod from collections.abc import Callable -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Final, TypeAlias import arrow import networkx as nx from aiopg.sa.engine import Engine -from models_library.clusters import ClusterID from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID, NodeIDStr from models_library.projects_state import RunningState @@ -32,12 +29,10 @@ from models_library.users import UserID from networkx.classes.reportviews import InDegreeView from pydantic import PositiveInt -from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE from servicelib.logging_utils import log_context from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient -from servicelib.redis import CouldNotAcquireLockError, RedisClientSDK -from servicelib.redis_utils import exclusive +from servicelib.redis import RedisClientSDK from ...constants import UNDEFINED_STR_METADATA from ...core.errors import ( @@ -59,7 +54,6 @@ COMPLETED_STATES, PROCESSING_STATES, RUNNING_STATES, - SCHEDULED_STATES, TASK_TO_START_STATES, WAITING_FOR_START_STATES, Iteration, @@ -84,9 +78,9 @@ _Current = CompTaskAtDB _MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 _SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) -_TASK_NAME_TEMPLATE: Final[ - str -] = "computational-scheduler-{user_id}:{project_id}:{iteration}" +_TASK_NAME_TEMPLATE: Final[str] = ( + "computational-scheduler-{user_id}:{project_id}:{iteration}" +) PipelineSchedulingTask: TypeAlias = asyncio.Task PipelineSchedulingWakeUpEvent: TypeAlias = asyncio.Event @@ -159,169 +153,6 @@ class BaseCompScheduler(ABC): service_runtime_heartbeat_interval: datetime.timedelta redis_client: RedisClientSDK - # NOTE: this is a trick to be able to inheritate from the class - _scheduled_pipelines: dict[ - tuple[UserID, ProjectID, Iteration], ScheduledPipelineParams - ] = field(default_factory=dict, init=False) - - def __post_init__(self) -> None: - self._scheduled_pipelines = {} - - async def restore_scheduling_from_db(self) -> None: - # get currently scheduled runs - comp_runs = await CompRunsRepository.instance(self.db_engine).list( - filter_by_state=SCHEDULED_STATES - ) - - for run in comp_runs: - task, wake_up_event = self._start_scheduling( - run.user_id, run.project_uuid, run.iteration - ) - self._scheduled_pipelines |= { - ( - run.user_id, - run.project_uuid, - run.iteration, - ): ScheduledPipelineParams( - scheduler_task=task, scheduler_waker=wake_up_event - ) - } - - async def run_new_pipeline( - self, - user_id: UserID, - project_id: ProjectID, - cluster_id: ClusterID, - run_metadata: RunMetadataDict, - *, - use_on_demand_clusters: bool, - ) -> None: - """Sets a new pipeline to be scheduled on the computational resources. - Passing cluster_id=0 will use the default cluster. Passing an existing ID will instruct - the scheduler to run the tasks on the defined cluster""" - # ensure the pipeline exists and is populated with something - dag = await self._get_pipeline_dag(project_id) - if not dag: - _logger.warning( - "project %s has no computational dag defined. not scheduled for a run.", - f"{project_id=}", - ) - return - - runs_repo = CompRunsRepository.instance(self.db_engine) - new_run = await runs_repo.create( - user_id=user_id, - project_id=project_id, - cluster_id=cluster_id, - metadata=run_metadata, - use_on_demand_clusters=use_on_demand_clusters, - ) - task, wake_up_event = self._start_scheduling( - user_id, project_id, new_run.iteration - ) - self._scheduled_pipelines[ - (user_id, project_id, new_run.iteration) - ] = ScheduledPipelineParams(scheduler_task=task, scheduler_waker=wake_up_event) - await publish_project_log( - self.rabbitmq_client, - user_id, - project_id, - log=f"Project pipeline scheduled using {'on-demand clusters' if use_on_demand_clusters else 'pre-defined clusters'}, starting soon...", - log_level=logging.INFO, - ) - - async def stop_pipeline( - self, user_id: UserID, project_id: ProjectID, iteration: int | None = None - ) -> None: - if iteration is None: - # if no iteration given find the latest one in the list - possible_iterations = { - it - for u_id, p_id, it in self._scheduled_pipelines - if u_id == user_id and p_id == project_id - } - if not possible_iterations: - msg = f"There are no pipeline scheduled for {user_id}:{project_id}" - raise ComputationalSchedulerError(msg=msg) - current_max_iteration = max(possible_iterations) - selected_iteration = current_max_iteration - else: - selected_iteration = iteration - - # mark the scheduled pipeline for stopping - updated_comp_run = await CompRunsRepository.instance( - self.db_engine - ).mark_for_cancellation( - user_id=user_id, project_id=project_id, iteration=selected_iteration - ) - if updated_comp_run: - assert updated_comp_run.cancelled is not None # nosec - # ensure the scheduler starts right away - self._scheduled_pipelines[ - (user_id, project_id, selected_iteration) - ].wake_up() - - async def shutdown(self) -> None: - # cancel all current scheduling processes - await asyncio.gather( - *( - stop_periodic_task(p.scheduler_task, timeout=3) - for p in self._scheduled_pipelines.values() - if p.scheduler_task - ), - return_exceptions=True, - ) - - def _get_last_iteration(self, user_id: UserID, project_id: ProjectID) -> Iteration: - # if no iteration given find the latest one in the list - possible_iterations = { - it - for u_id, p_id, it in self._scheduled_pipelines - if u_id == user_id and p_id == project_id - } - if not possible_iterations: - msg = f"There are no pipeline scheduled for {user_id}:{project_id}" - raise ComputationalSchedulerError(msg=msg) - return max(possible_iterations) - - def _start_scheduling( - self, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - ) -> tuple[PipelineSchedulingTask, PipelineSchedulingWakeUpEvent]: - async def _exclusive_safe_schedule_pipeline( - *, - user_id: UserID, - project_id: ProjectID, - iteration: Iteration, - wake_up_callback: Callable[[], None], - ) -> None: - with contextlib.suppress(CouldNotAcquireLockError): - await self._schedule_pipeline( - user_id=user_id, - project_id=project_id, - iteration=iteration, - wake_up_callback=wake_up_callback, - ) - - pipeline_wake_up_event = asyncio.Event() - pipeline_task = start_periodic_task( - functools.partial( - _exclusive_safe_schedule_pipeline, - user_id=user_id, - project_id=project_id, - iteration=iteration, - wake_up_callback=pipeline_wake_up_event.set, - ), - interval=_SCHEDULER_INTERVAL, - task_name=_TASK_NAME_TEMPLATE.format( - user_id=user_id, project_id=project_id, iteration=iteration - ), - early_wake_up_event=pipeline_wake_up_event, - ) - return pipeline_task, pipeline_wake_up_event - async def _get_pipeline_dag(self, project_id: ProjectID) -> nx.DiGraph: comp_pipeline_repo = CompPipelinesRepository.instance(self.db_engine) pipeline_at_db: CompPipelineAtDB = await comp_pipeline_repo.get_pipeline( @@ -653,20 +484,17 @@ async def _start_tasks( scheduled_tasks: dict[NodeID, CompTaskAtDB], comp_run: CompRunsAtDB, wake_up_callback: Callable[[], None], - ) -> None: - ... + ) -> None: ... @abstractmethod async def _get_tasks_status( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> list[RunningState]: - ... + ) -> list[RunningState]: ... @abstractmethod async def _stop_tasks( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> None: - ... + ) -> None: ... @abstractmethod async def _process_completed_tasks( @@ -675,25 +503,8 @@ async def _process_completed_tasks( tasks: list[CompTaskAtDB], iteration: Iteration, comp_run: CompRunsAtDB, - ) -> None: - ... - - @staticmethod - def _build_exclusive_lock_key(*args, **kwargs) -> str: - assert args # nosec - return f"{kwargs['user_id']}:{kwargs['project_id']}:{kwargs['iteration']}" - - @staticmethod - def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: - assert kwargs # nosec - zelf = args[0] - assert isinstance(zelf, BaseCompScheduler) # nosec - return zelf.redis_client - - @exclusive( - redis=_redis_client_getter, - lock_key=_build_exclusive_lock_key, - ) + ) -> None: ... + async def _schedule_pipeline( self, *, @@ -753,17 +564,11 @@ async def _schedule_pipeline( # 7. Are we done scheduling that pipeline? if not dag.nodes() or pipeline_result in COMPLETED_STATES: # there is nothing left, the run is completed, we're done here - self._scheduled_pipelines.pop( - (user_id, project_id, iteration), None - ) _logger.info( "pipeline %s scheduling completed with result %s", f"{project_id=}", f"{pipeline_result=}", ) - current_task = asyncio.current_task() - assert current_task is not None # nosec - current_task.cancel() except PipelineNotFoundError: _logger.warning( "pipeline %s does not exist in comp_pipeline table, it will be removed from scheduler", @@ -772,7 +577,6 @@ async def _schedule_pipeline( await self._set_run_result( user_id, project_id, iteration, RunningState.ABORTED ) - self._scheduled_pipelines.pop((user_id, project_id, iteration), None) except InvalidPipelineError as exc: _logger.warning( "pipeline %s appears to be misconfigured, it will be removed from scheduler. Please check pipeline:\n%s", @@ -782,7 +586,6 @@ async def _schedule_pipeline( await self._set_run_result( user_id, project_id, iteration, RunningState.ABORTED ) - self._scheduled_pipelines.pop((user_id, project_id, iteration), None) except (DaskClientAcquisisitonError, ClustersKeeperNotAvailableError): _logger.exception( "Unexpected error while connecting with computational backend, aborting pipeline" @@ -799,7 +602,6 @@ async def _schedule_pipeline( await self._set_run_result( user_id, project_id, iteration, RunningState.FAILED ) - self._scheduled_pipelines.pop((user_id, project_id, iteration), None) except ComputationalBackendNotConnectedError: _logger.exception("Computational backend is not connected!") @@ -879,9 +681,9 @@ async def _schedule_tasks_to_start( # noqa: C901 RunningState.WAITING_FOR_CLUSTER, ) for task in tasks_ready_to_start: - comp_tasks[ - NodeIDStr(f"{task}") - ].state = RunningState.WAITING_FOR_CLUSTER + comp_tasks[NodeIDStr(f"{task}")].state = ( + RunningState.WAITING_FOR_CLUSTER + ) except ComputationalBackendOnDemandNotReadyError as exc: _logger.info( @@ -903,9 +705,9 @@ async def _schedule_tasks_to_start( # noqa: C901 RunningState.WAITING_FOR_CLUSTER, ) for task in tasks_ready_to_start: - comp_tasks[ - NodeIDStr(f"{task}") - ].state = RunningState.WAITING_FOR_CLUSTER + comp_tasks[NodeIDStr(f"{task}")].state = ( + RunningState.WAITING_FOR_CLUSTER + ) except ClustersKeeperNotAvailableError: _logger.exception("Unexpected error while starting tasks:") await publish_project_log( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py index 524dfc7e8ad..0f474398c70 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py @@ -7,6 +7,7 @@ from ...core.errors import ConfigurationError from ...core.settings import AppSettings from ..dask_clients_pool import DaskClientsPool +from ..db import get_db_engine from ..rabbitmq import get_rabbitmq_client, get_rabbitmq_rpc_client from ..redis import get_redis_client_manager from ._base_scheduler import BaseCompScheduler @@ -16,23 +17,26 @@ async def create_from_db(app: FastAPI) -> BaseCompScheduler: + scheduler = create_scheduler(app) + await scheduler.restore_scheduling_from_db() + return scheduler + + +def create_scheduler(app: FastAPI) -> BaseCompScheduler: if not hasattr(app.state, "engine"): msg = "Database connection is missing. Please check application configuration." - raise ConfigurationError(msg=msg) - db_engine = app.state.engine + raise ConfigurationError(msg) with log_context( _logger, logging.INFO, msg="Creating Dask-based computational scheduler" ): app_settings: AppSettings = app.state.settings - scheduler = DaskScheduler( + return DaskScheduler( settings=app_settings.DIRECTOR_V2_COMPUTATIONAL_BACKEND, dask_clients_pool=DaskClientsPool.instance(app), rabbitmq_client=get_rabbitmq_client(app), rabbitmq_rpc_client=get_rabbitmq_rpc_client(app), redis_client=get_redis_client_manager(app).client(RedisDatabase.LOCKS), - db_engine=db_engine, + db_engine=get_db_engine(app), service_runtime_heartbeat_interval=app_settings.SERVICE_TRACKING_HEARTBEAT, ) - await scheduler.restore_scheduling_from_db() - return scheduler From 353e18079fccc7c1c3188046643bf3e929cca473 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:54:57 +0100 Subject: [PATCH 020/127] updated syntax in tests --- .../api/dependencies/scheduler.py | 8 +--- .../api/routes/computations.py | 38 ++++++++++--------- .../modules/comp_scheduler/__init__.py | 4 +- .../comp_scheduler/_scheduler_factory.py | 6 --- ...t_modules_comp_scheduler_dask_scheduler.py | 6 +-- 5 files changed, 26 insertions(+), 36 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py b/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py index aa01af1f34b..e480d204d3b 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py @@ -1,17 +1,11 @@ from typing import Annotated -from fastapi import Depends, FastAPI, Request +from fastapi import Depends, FastAPI from ...core.settings import ComputationalBackendSettings -from ...modules.comp_scheduler import BaseCompScheduler from . import get_app -def get_scheduler(request: Request) -> BaseCompScheduler: - scheduler: BaseCompScheduler = request.app.state.scheduler - return scheduler - - def get_scheduler_settings( app: Annotated[FastAPI, Depends(get_app)] ) -> ComputationalBackendSettings: diff --git a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py index 251e35fa638..f25fdf32ece 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py +++ b/services/director-v2/src/simcore_service_director_v2/api/routes/computations.py @@ -21,7 +21,7 @@ from typing import Annotated, Any, Final import networkx as nx -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, FastAPI, HTTPException from models_library.api_schemas_directorv2.comp_tasks import ( ComputationCreate, ComputationDelete, @@ -63,7 +63,7 @@ from ...models.comp_runs import CompRunsAtDB, ProjectMetadataDict, RunMetadataDict from ...models.comp_tasks import CompTaskAtDB from ...modules.catalog import CatalogClient -from ...modules.comp_scheduler import BaseCompScheduler +from ...modules.comp_scheduler import run_new_pipeline, stop_pipeline from ...modules.db.repositories.clusters import ClustersRepository from ...modules.db.repositories.comp_pipelines import CompPipelinesRepository from ...modules.db.repositories.comp_runs import CompRunsRepository @@ -89,7 +89,6 @@ from ..dependencies.director_v0 import get_director_v0_client from ..dependencies.rabbitmq import rabbitmq_rpc_client from ..dependencies.rut_client import get_rut_client -from ..dependencies.scheduler import get_scheduler from .computations_tasks import analyze_pipeline _PIPELINE_ABORT_TIMEOUT_S: Final[int] = 10 @@ -212,12 +211,12 @@ async def _get_project_node_names( async def _try_start_pipeline( + app: FastAPI, *, project_repo: ProjectsRepository, computation: ComputationCreate, complete_dag: nx.DiGraph, minimal_dag: nx.DiGraph, - scheduler: BaseCompScheduler, project: ProjectAtDB, users_repo: UsersRepository, projects_metadata_repo: ProjectsMetadataRepository, @@ -242,11 +241,12 @@ async def _try_start_pipeline( wallet_id = computation.wallet_info.wallet_id wallet_name = computation.wallet_info.wallet_name - await scheduler.run_new_pipeline( - computation.user_id, - computation.project_id, - computation.cluster_id or DEFAULT_CLUSTER_ID, - RunMetadataDict( + await run_new_pipeline( + app, + user_id=computation.user_id, + project_id=computation.project_id, + cluster_id=computation.cluster_id or DEFAULT_CLUSTER_ID, + run_metadata=RunMetadataDict( node_id_names_map={ NodeID(node_idstr): node_data.label for node_idstr, node_data in project.workbench.items() @@ -313,7 +313,6 @@ async def create_computation( # noqa: PLR0913 # pylint: disable=too-many-positi ProjectsMetadataRepository, Depends(get_repository(ProjectsMetadataRepository)) ], director_client: Annotated[DirectorV0Client, Depends(get_director_v0_client)], - scheduler: Annotated[BaseCompScheduler, Depends(get_scheduler)], catalog_client: Annotated[CatalogClient, Depends(get_catalog_client)], rut_client: Annotated[ResourceUsageTrackerClient, Depends(get_rut_client)], rpc_client: Annotated[RabbitMQRPCClient, Depends(rabbitmq_rpc_client)], @@ -370,11 +369,11 @@ async def create_computation( # noqa: PLR0913 # pylint: disable=too-many-positi if computation.start_pipeline: await _try_start_pipeline( + request.app, project_repo=project_repo, computation=computation, complete_dag=complete_dag, minimal_dag=minimal_computational_dag, - scheduler=scheduler, project=project, users_repo=users_repo, projects_metadata_repo=projects_metadata_repo, @@ -549,7 +548,6 @@ async def stop_computation( comp_runs_repo: Annotated[ CompRunsRepository, Depends(get_repository(CompRunsRepository)) ], - scheduler: Annotated[BaseCompScheduler, Depends(get_scheduler)], ) -> ComputationGet: _logger.debug( "User %s stopping computation for project %s", @@ -575,7 +573,9 @@ async def stop_computation( pipeline_state = utils.get_pipeline_state_from_task_states(filtered_tasks) if utils.is_pipeline_running(pipeline_state): - await scheduler.stop_pipeline(computation_stop.user_id, project_id) + await stop_pipeline( + request.app, user_id=computation_stop.user_id, project_id=project_id + ) # get run details if any last_run: CompRunsAtDB | None = None @@ -615,6 +615,7 @@ async def stop_computation( async def delete_computation( computation_stop: ComputationDelete, project_id: ProjectID, + request: Request, project_repo: Annotated[ ProjectsRepository, Depends(get_repository(ProjectsRepository)) ], @@ -624,7 +625,6 @@ async def delete_computation( comp_tasks_repo: Annotated[ CompTasksRepository, Depends(get_repository(CompTasksRepository)) ], - scheduler: Annotated[BaseCompScheduler, Depends(get_scheduler)], ) -> None: try: # get the project @@ -642,7 +642,9 @@ async def delete_computation( ) # abort the pipeline first try: - await scheduler.stop_pipeline(computation_stop.user_id, project_id) + await stop_pipeline( + request.app, user_id=computation_stop.user_id, project_id=project_id + ) except ComputationalSchedulerError as e: _logger.warning( "Project %s could not be stopped properly.\n reason: %s", @@ -663,9 +665,9 @@ def return_last_value(retry_state: Any) -> Any: before_sleep=before_sleep_log(_logger, logging.INFO), ) async def check_pipeline_stopped() -> bool: - comp_tasks: list[ - CompTaskAtDB - ] = await comp_tasks_repo.list_computational_tasks(project_id) + comp_tasks: list[CompTaskAtDB] = ( + await comp_tasks_repo.list_computational_tasks(project_id) + ) pipeline_state = utils.get_pipeline_state_from_task_states( comp_tasks, ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index 7d282c18e4c..efcacc4ad08 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -1,7 +1,7 @@ import functools import logging from collections.abc import Callable, Coroutine -from typing import Any +from typing import Any, cast from fastapi import FastAPI from servicelib.background_task import start_periodic_task, stop_periodic_task @@ -74,7 +74,7 @@ async def stop_scheduler() -> None: def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: - return app.state.scheduler_worker + return cast(BaseCompScheduler, app.state.scheduler_worker) def setup(app: FastAPI): diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py index 0f474398c70..f9f5a61306c 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py @@ -16,12 +16,6 @@ _logger = logging.getLogger(__name__) -async def create_from_db(app: FastAPI) -> BaseCompScheduler: - scheduler = create_scheduler(app) - await scheduler.restore_scheduling_from_db() - return scheduler - - def create_scheduler(app: FastAPI) -> BaseCompScheduler: if not hasattr(app.state, "engine"): msg = "Database connection is missing. Please check application configuration." diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index 6f016f297c0..f9cdc86367b 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -69,7 +69,7 @@ from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState from simcore_service_director_v2.modules.comp_scheduler import ( BaseCompScheduler, - get_scheduler, + _get_scheduler_worker, ) from simcore_service_director_v2.modules.comp_scheduler._dask_scheduler import ( DaskScheduler, @@ -219,7 +219,7 @@ def scheduler( aiopg_engine: aiopg.sa.engine.Engine, minimal_app: FastAPI, ) -> BaseCompScheduler: - scheduler = get_scheduler(minimal_app) + scheduler = _get_scheduler_worker(minimal_app) assert scheduler is not None return scheduler @@ -296,7 +296,7 @@ async def test_scheduler_gracefully_starts_and_stops( minimal_app: FastAPI, ): # check it started correctly - assert get_scheduler(minimal_app) is not None + assert _get_scheduler_worker(minimal_app) is not None @pytest.mark.parametrize( From dfbf38f7a8f1f76e0e385d62872f29ddcf6f43e9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:27:08 +0100 Subject: [PATCH 021/127] splitted code --- .../modules/comp_scheduler/__init__.py | 38 ++---------------- .../modules/comp_scheduler/_base_scheduler.py | 2 +- .../comp_scheduler/_distributed_worker.py | 39 +++++++++++++++++++ .../test_distributed_scheduler.py | 35 ++++++++++++++--- ...t_modules_comp_scheduler_dask_scheduler.py | 2 +- 5 files changed, 75 insertions(+), 41 deletions(-) create mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index efcacc4ad08..dc4fe5ac73d 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -1,61 +1,35 @@ -import functools import logging from collections.abc import Callable, Coroutine -from typing import Any, cast +from typing import Any from fastapi import FastAPI from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context -from simcore_service_director_v2.modules.comp_scheduler._base_scheduler import ( - BaseCompScheduler, -) -from ..rabbitmq import get_rabbitmq_client from ._distributed_scheduler import ( SCHEDULER_INTERVAL, run_new_pipeline, schedule_pipelines, stop_pipeline, ) -from ._models import SchedulePipelineRabbitMessage -from ._scheduler_factory import create_scheduler +from ._distributed_worker import setup_worker _logger = logging.getLogger(__name__) -def _empty_wake_up_callack() -> None: - return - - -async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: - to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) - await _get_scheduler_worker(app)._schedule_pipeline( - user_id=to_schedule_pipeline.user_id, - project_id=to_schedule_pipeline.project_id, - iteration=to_schedule_pipeline.iteration, - wake_up_callback=_empty_wake_up_callack, - ) - return True - - def on_app_startup(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def start_scheduler() -> None: with log_context( _logger, level=logging.INFO, msg="starting computational scheduler" ): - rabbitmq_client = get_rabbitmq_client(app) - await rabbitmq_client.subscribe( - SchedulePipelineRabbitMessage.get_channel_name(), - functools.partial(_handle_distributed_pipeline, app), - exclusive_queue=False, - ) - app.state.scheduler_worker = create_scheduler(app) + await setup_worker(app) app.state.scheduler_manager = start_periodic_task( schedule_pipelines, interval=SCHEDULER_INTERVAL, task_name="computational-distributed-scheduler", + app=app, ) return start_scheduler @@ -73,10 +47,6 @@ async def stop_scheduler() -> None: return stop_scheduler -def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: - return cast(BaseCompScheduler, app.state.scheduler_worker) - - def setup(app: FastAPI): app.add_event_handler("startup", on_app_startup(app)) app.add_event_handler("shutdown", on_app_shutdown(app)) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index d0b4f253cd7..a1fe4da7939 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -505,7 +505,7 @@ async def _process_completed_tasks( comp_run: CompRunsAtDB, ) -> None: ... - async def _schedule_pipeline( + async def schedule_pipeline( self, *, user_id: UserID, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py new file mode 100644 index 00000000000..1a13c41cff9 --- /dev/null +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py @@ -0,0 +1,39 @@ +import functools +from typing import cast + +from fastapi import FastAPI + +from ..rabbitmq import get_rabbitmq_client +from ._base_scheduler import BaseCompScheduler +from ._models import SchedulePipelineRabbitMessage +from ._scheduler_factory import create_scheduler + + +def _empty_wake_up_callack() -> None: + return + + +def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: + return cast(BaseCompScheduler, app.state.scheduler_worker) + + +async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: + to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) + await _get_scheduler_worker(app).schedule_pipeline( + user_id=to_schedule_pipeline.user_id, + project_id=to_schedule_pipeline.project_id, + iteration=to_schedule_pipeline.iteration, + wake_up_callback=_empty_wake_up_callack, + ) + return True + + +async def setup_worker(app: FastAPI) -> None: + rabbitmq_client = get_rabbitmq_client(app) + await rabbitmq_client.subscribe( + SchedulePipelineRabbitMessage.get_channel_name(), + functools.partial(_handle_distributed_pipeline, app), + exclusive_queue=False, + ) + + app.state.scheduler_worker = create_scheduler(app) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index c7cf0522aa9..26bd17b20ec 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -68,6 +68,24 @@ def mock_env( ) +@pytest.fixture +def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: + mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.stop_periodic_task", + ) + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.start_periodic_task", + ) + + +@pytest.fixture +def with_disabled_scheduler_worker(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.setup_worker", + autospec=True, + ) + + @pytest.fixture async def scheduler_rabbit_client_parser( create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture @@ -75,7 +93,7 @@ async def scheduler_rabbit_client_parser( client = create_rabbitmq_client("scheduling_pytest_consumer") mock = mocker.AsyncMock(return_value=True) queue_name = await client.subscribe( - SchedulePipelineRabbitMessage.get_channel_name(), mock + SchedulePipelineRabbitMessage.get_channel_name(), mock, exclusive_queue=False ) yield mock await client.unsubscribe(queue_name) @@ -98,10 +116,12 @@ async def _assert_comp_runs_empty(sqlalchemy_async_engine: AsyncEngine) -> None: async def test_schedule_pipelines_empty_db( + with_disabled_auto_scheduling: mock.Mock, initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, sqlalchemy_async_engine: AsyncEngine, ): + with_disabled_auto_scheduling.assert_called_once() await _assert_comp_runs_empty(sqlalchemy_async_engine) await schedule_pipelines(initialized_app) @@ -114,7 +134,10 @@ async def test_schedule_pipelines_empty_db( async def test_schedule_pipelines_concurently_runs_exclusively_and_raises( - initialized_app: FastAPI, mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch + with_disabled_auto_scheduling: mock.Mock, + initialized_app: FastAPI, + mocker: MockerFixture, + monkeypatch: pytest.MonkeyPatch, ): CONCURRENCY = 5 # NOTE: this ensure no flakyness as empty scheduling is very fast @@ -144,6 +167,8 @@ async def slow_limited_gather(*args, **kwargs): async def test_schedule_pipelines( + with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, published_project: PublishedProject, sqlalchemy_async_engine: AsyncEngine, @@ -183,8 +208,6 @@ async def test_schedule_pipelines( start_schedule_time = comp_run.last_scheduled start_modified_time = comp_run.modified - await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() - 1) - # this will now not schedule the pipeline since it was last scheduled await schedule_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_not_called() @@ -194,8 +217,8 @@ async def test_schedule_pipelines( assert comp_run.cancelled is None assert comp_run.modified == start_modified_time - await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() + 1) # this will now schedule the pipeline since the time passed + await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() + 1) await schedule_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_called_once_with( SchedulePipelineRabbitMessage( @@ -236,6 +259,8 @@ async def test_schedule_pipelines( async def test_empty_pipeline_is_not_scheduled( + with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index f9cdc86367b..c53fe0f9861 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -165,7 +165,7 @@ async def schedule_all_pipelines(scheduler: BaseCompScheduler) -> None: local_pipelines = deepcopy(scheduler._scheduled_pipelines) # noqa: SLF001 results = await asyncio.gather( *( - scheduler._schedule_pipeline( # noqa: SLF001 + scheduler.schedule_pipeline( user_id=user_id, project_id=project_id, iteration=iteration, From 8dc49d6d2001f94be7d09ee26c4f28c7d66120e1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:44:46 +0100 Subject: [PATCH 022/127] refactored --- .../modules/comp_scheduler/__init__.py | 14 +++----- .../comp_scheduler/_distributed_scheduler.py | 14 ++++++++ .../comp_scheduler/_distributed_worker.py | 6 ++++ .../test_distributed_scheduler.py | 35 +++++++++++++++++-- 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index dc4fe5ac73d..c09268302ee 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -7,9 +7,9 @@ from servicelib.logging_utils import log_context from ._distributed_scheduler import ( - SCHEDULER_INTERVAL, run_new_pipeline, - schedule_pipelines, + setup_manager, + shutdown_manager, stop_pipeline, ) from ._distributed_worker import setup_worker @@ -24,13 +24,7 @@ async def start_scheduler() -> None: ): await setup_worker(app) - - app.state.scheduler_manager = start_periodic_task( - schedule_pipelines, - interval=SCHEDULER_INTERVAL, - task_name="computational-distributed-scheduler", - app=app, - ) + await setup_manager(app) return start_scheduler @@ -40,7 +34,7 @@ async def stop_scheduler() -> None: with log_context( _logger, level=logging.INFO, msg="stopping computational scheduler" ): - await stop_periodic_task(app.state.scheduler_manager) + await shutdown_manager(app) # TODO: we might want to stop anything running in the worker too diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index 595cfce3666..b20885ce8d7 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -8,6 +8,7 @@ from models_library.clusters import ClusterID from models_library.projects import ProjectID from models_library.users import UserID +from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import RedisClientSDK from servicelib.redis_utils import exclusive @@ -139,3 +140,16 @@ async def schedule_pipelines(app: FastAPI) -> None: ), limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, ) + + +async def setup_manager(app: FastAPI) -> None: + app.state.scheduler_manager = start_periodic_task( + schedule_pipelines, + interval=SCHEDULER_INTERVAL, + task_name="computational-distributed-scheduler", + app=app, + ) + + +async def shutdown_manager(app: FastAPI) -> None: + await stop_periodic_task(app.state.scheduler_manager) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py index 1a13c41cff9..d3825459d17 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py @@ -37,3 +37,9 @@ async def setup_worker(app: FastAPI) -> None: ) app.state.scheduler_worker = create_scheduler(app) + + +async def shutdown_worker(app: FastAPI) -> None: + assert app.state.scheduler_worker # nosec + # TODO: we might need to cancel stuff here. not sure yet what + # unsubscribing is maybe not a good idea if we want to keep the data in the queue diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 26bd17b20ec..97eb6a9abd3 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -9,6 +9,7 @@ import asyncio +import datetime import logging from collections.abc import AsyncIterator, Callable from typing import Any, Awaitable @@ -71,10 +72,10 @@ def mock_env( @pytest.fixture def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler.stop_periodic_task", + "simcore_service_director_v2.modules.comp_scheduler.shutdown_manager", ) return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler.start_periodic_task", + "simcore_service_director_v2.modules.comp_scheduler.setup_manager", ) @@ -304,3 +305,33 @@ async def test_empty_pipeline_is_not_scheduled( assert "no computational dag defined" in caplog.records[0].message await _assert_comp_runs_empty(sqlalchemy_async_engine) scheduler_rabbit_client_parser.assert_not_called() + + +@pytest.fixture +def with_fast_scheduling(mocker: MockerFixture) -> None: + from simcore_service_director_v2.modules.comp_scheduler import ( + _distributed_scheduler, + ) + + mocker.patch.object( + _distributed_scheduler, "SCHEDULER_INTERVAL", datetime.timedelta(seconds=0.01) + ) + + +@pytest.fixture +def mocked_schedule_pipelines(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler.schedule_pipelines", + autospec=True, + ) + + +async def test_auto_scheduling( + with_fast_scheduling: None, + with_disabled_scheduler_worker: mock.Mock, + mocked_schedule_pipelines: mock.Mock, + initialized_app: FastAPI, + sqlalchemy_async_engine: AsyncEngine, +): + await _assert_comp_runs_empty(sqlalchemy_async_engine) + mocked_schedule_pipelines.assert_called() From fd50743322635241d618225df7626541fccc3bb0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:47:51 +0100 Subject: [PATCH 023/127] refactored --- .../unit/with_dbs/comp_scheduler/conftest.py | 57 +++++++++++++++++++ .../test_distributed_scheduler.py | 46 +-------------- 2 files changed, 59 insertions(+), 44 deletions(-) create mode 100644 services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py new file mode 100644 index 00000000000..c7abf8cdd8f --- /dev/null +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py @@ -0,0 +1,57 @@ +# pylint:disable=unused-variable +# pylint:disable=unused-argument +# pylint:disable=redefined-outer-name +# pylint:disable=no-value-for-parameter +# pylint:disable=protected-access +# pylint:disable=too-many-arguments +# pylint:disable=no-name-in-module +# pylint: disable=too-many-statements + + +from unittest import mock + +import pytest +import sqlalchemy as sa +from pytest_mock.plugin import MockerFixture +from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict +from pytest_simcore.helpers.typing_env import EnvVarsDict +from settings_library.rabbit import RabbitSettings +from settings_library.redis import RedisSettings + + +@pytest.fixture +def mock_env( + mock_env: EnvVarsDict, + monkeypatch: pytest.MonkeyPatch, + fake_s3_envs: EnvVarsDict, + postgres_db: sa.engine.Engine, + postgres_host_config: dict[str, str], + rabbit_service: RabbitSettings, + redis_service: RedisSettings, +) -> EnvVarsDict: + return mock_env | setenvs_from_dict( + monkeypatch, + {k: f"{v}" for k, v in fake_s3_envs.items()} + | { + "COMPUTATIONAL_BACKEND_ENABLED": True, + "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": True, + }, + ) + + +@pytest.fixture +def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: + mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.shutdown_manager", + ) + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.setup_manager", + ) + + +@pytest.fixture +def with_disabled_scheduler_worker(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.setup_worker", + autospec=True, + ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py index 97eb6a9abd3..24f276f246b 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py @@ -11,8 +11,8 @@ import asyncio import datetime import logging -from collections.abc import AsyncIterator, Callable -from typing import Any, Awaitable +from collections.abc import AsyncIterator, Awaitable, Callable +from typing import Any from unittest import mock import pytest @@ -23,13 +23,9 @@ from models_library.projects import ProjectAtDB from models_library.projects_state import RunningState from pytest_mock.plugin import MockerFixture -from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict -from pytest_simcore.helpers.typing_env import EnvVarsDict from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import CouldNotAcquireLockError from servicelib.utils import limited_gather -from settings_library.rabbit import RabbitSettings -from settings_library.redis import RedisSettings from simcore_postgres_database.models.comp_runs import comp_runs from simcore_service_director_v2.core.errors import PipelineNotFoundError from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB @@ -49,44 +45,6 @@ pytest_simcore_ops_services_selection = ["adminer", "redis-commander"] -@pytest.fixture -def mock_env( - mock_env: EnvVarsDict, - monkeypatch: pytest.MonkeyPatch, - fake_s3_envs: EnvVarsDict, - postgres_db: sa.engine.Engine, - postgres_host_config: dict[str, str], - rabbit_service: RabbitSettings, - redis_service: RedisSettings, -) -> EnvVarsDict: - return mock_env | setenvs_from_dict( - monkeypatch, - {k: f"{v}" for k, v in fake_s3_envs.items()} - | { - "COMPUTATIONAL_BACKEND_ENABLED": True, - "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": True, - }, - ) - - -@pytest.fixture -def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: - mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler.shutdown_manager", - ) - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler.setup_manager", - ) - - -@pytest.fixture -def with_disabled_scheduler_worker(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler.setup_worker", - autospec=True, - ) - - @pytest.fixture async def scheduler_rabbit_client_parser( create_rabbitmq_client: Callable[[str], RabbitMQClient], mocker: MockerFixture From 96a2424c9b3b119c26035310d93c1b3e01d71df1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:52:49 +0100 Subject: [PATCH 024/127] cleanup --- .../modules/comp_scheduler/_base_scheduler.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index a1fe4da7939..cd60b824959 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -17,7 +17,7 @@ from abc import ABC, abstractmethod from collections.abc import Callable from dataclasses import dataclass -from typing import Final, TypeAlias +from typing import Final import arrow import networkx as nx @@ -77,13 +77,6 @@ _Previous = CompTaskAtDB _Current = CompTaskAtDB _MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 -_SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) -_TASK_NAME_TEMPLATE: Final[str] = ( - "computational-scheduler-{user_id}:{project_id}:{iteration}" -) - -PipelineSchedulingTask: TypeAlias = asyncio.Task -PipelineSchedulingWakeUpEvent: TypeAlias = asyncio.Event @dataclass(frozen=True, slots=True) @@ -135,15 +128,6 @@ async def _triage_changed_tasks( ) -@dataclass(kw_only=True) -class ScheduledPipelineParams: - scheduler_task: asyncio.Task - scheduler_waker: asyncio.Event - - def wake_up(self) -> None: - self.scheduler_waker.set() - - @dataclass class BaseCompScheduler(ABC): db_engine: Engine From 4dbfc09fd553c338e7dfc911f1f6f1ea12c57957 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 17:45:28 +0100 Subject: [PATCH 025/127] almost there --- .../comp_scheduler/_distributed_scheduler.py | 35 ++++++++++++------- .../comp_scheduler/_distributed_worker.py | 22 +++++++----- .../comp_scheduler/test_distributed_worker.py | 0 3 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_worker.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index b20885ce8d7..aebd249effc 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -9,6 +9,7 @@ from models_library.projects import ProjectID from models_library.users import UserID from servicelib.background_task import start_periodic_task, stop_periodic_task +from servicelib.logging_utils import log_context from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import RedisClientSDK from servicelib.redis_utils import exclusive @@ -97,7 +98,11 @@ async def stop_pipeline( def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: assert kwargs is not None # nosec - app = args[0] + if args: + app = args[0] + else: + assert "app" in kwargs # nosec + app = kwargs["app"] assert isinstance(app, FastAPI) # nosec return get_redis_client_manager(app).client(RedisDatabase.LOCKS) @@ -128,18 +133,22 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr @exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") async def schedule_pipelines(app: FastAPI) -> None: - db_engine = get_db_engine(app) - runs_to_schedule = await CompRunsRepository.instance(db_engine).list( - filter_by_state=SCHEDULED_STATES, scheduled_since=SCHEDULER_INTERVAL - ) - rabbitmq_client = get_rabbitmq_client(app) - await limited_gather( - *( - _distribute_pipeline(run, rabbitmq_client, db_engine) - for run in runs_to_schedule - ), - limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, - ) + with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): + db_engine = get_db_engine(app) + runs_to_schedule = await CompRunsRepository.instance(db_engine).list( + filter_by_state=SCHEDULED_STATES, scheduled_since=SCHEDULER_INTERVAL + ) + + rabbitmq_client = get_rabbitmq_client(app) + await limited_gather( + *( + _distribute_pipeline(run, rabbitmq_client, db_engine) + for run in runs_to_schedule + ), + limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, + ) + if runs_to_schedule: + _logger.debug("distributed %d pipelines", len(runs_to_schedule)) async def setup_manager(app: FastAPI) -> None: diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py index d3825459d17..896e8dbb917 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py @@ -1,15 +1,20 @@ import functools +import logging from typing import cast from fastapi import FastAPI +from servicelib.logging_utils import log_context from ..rabbitmq import get_rabbitmq_client from ._base_scheduler import BaseCompScheduler from ._models import SchedulePipelineRabbitMessage from ._scheduler_factory import create_scheduler +_logger = logging.getLogger(__name__) + def _empty_wake_up_callack() -> None: + # TODO: need to re-publish here? return @@ -18,14 +23,15 @@ def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: - to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) - await _get_scheduler_worker(app).schedule_pipeline( - user_id=to_schedule_pipeline.user_id, - project_id=to_schedule_pipeline.project_id, - iteration=to_schedule_pipeline.iteration, - wake_up_callback=_empty_wake_up_callack, - ) - return True + with log_context(_logger, logging.DEBUG, msg="handling scheduling"): + to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) + await _get_scheduler_worker(app).schedule_pipeline( + user_id=to_schedule_pipeline.user_id, + project_id=to_schedule_pipeline.project_id, + iteration=to_schedule_pipeline.iteration, + wake_up_callback=_empty_wake_up_callack, + ) + return True async def setup_worker(app: FastAPI) -> None: diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_worker.py new file mode 100644 index 00000000000..e69de29bb2d From 1add0670797f94e3b7d66fe68f6530a6d6e62a58 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:28:16 +0100 Subject: [PATCH 026/127] ensure naming --- .../modules/comp_scheduler/__init__.py | 10 +++---- .../modules/comp_scheduler/_constants.py | 6 +++++ .../comp_scheduler/_distributed_scheduler.py | 27 +++++++++++++------ .../comp_scheduler/_distributed_worker.py | 5 ++++ 4 files changed, 33 insertions(+), 15 deletions(-) create mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index c09268302ee..cb2e911cbd5 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -6,6 +6,7 @@ from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context +from ._constants import MODULE_NAME from ._distributed_scheduler import ( run_new_pipeline, setup_manager, @@ -19,10 +20,7 @@ def on_app_startup(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def start_scheduler() -> None: - with log_context( - _logger, level=logging.INFO, msg="starting computational scheduler" - ): - + with log_context(_logger, level=logging.INFO, msg=f"starting {MODULE_NAME}"): await setup_worker(app) await setup_manager(app) @@ -31,9 +29,7 @@ async def start_scheduler() -> None: def on_app_shutdown(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def stop_scheduler() -> None: - with log_context( - _logger, level=logging.INFO, msg="stopping computational scheduler" - ): + with log_context(_logger, level=logging.INFO, msg=f"stopping {MODULE_NAME}"): await shutdown_manager(app) # TODO: we might want to stop anything running in the worker too diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py new file mode 100644 index 00000000000..1be1cbb7cb6 --- /dev/null +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py @@ -0,0 +1,6 @@ +import datetime +from typing import Final + +MODULE_NAME: Final[str] = "computational-distributed-scheduler" +SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) +MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py index aebd249effc..b7cebdf98a4 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py @@ -1,6 +1,4 @@ -import datetime import logging -from typing import Final import networkx as nx from aiopg.sa import Engine @@ -25,10 +23,13 @@ from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client from ..redis import get_redis_client_manager +from ._constants import ( + MAX_CONCURRENT_PIPELINE_SCHEDULING, + MODULE_NAME, + SCHEDULER_INTERVAL, +) _logger = logging.getLogger(__name__) -SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) -_MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 async def run_new_pipeline( @@ -96,7 +97,7 @@ async def stop_pipeline( await _distribute_pipeline(updated_comp_run, rabbitmq_client, db_engine) -def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: +def _get_app_from_args(*args, **kwargs) -> FastAPI: assert kwargs is not None # nosec if args: app = args[0] @@ -104,9 +105,19 @@ def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: assert "app" in kwargs # nosec app = kwargs["app"] assert isinstance(app, FastAPI) # nosec + return app + + +def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: + app = _get_app_from_args(*args, **kwargs) return get_redis_client_manager(app).client(RedisDatabase.LOCKS) +def _redis_lock_key_builder(*args, **kwargs) -> str: + app = _get_app_from_args(*args, **kwargs) + return f"{app.title}_{MODULE_NAME}" + + async def _distribute_pipeline( run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine ) -> None: @@ -131,7 +142,7 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr return pipeline_at_db.get_graph() -@exclusive(_redis_client_getter, lock_key="computational-distributed-scheduler") +@exclusive(_redis_client_getter, lock_key=_redis_lock_key_builder) async def schedule_pipelines(app: FastAPI) -> None: with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): db_engine = get_db_engine(app) @@ -145,7 +156,7 @@ async def schedule_pipelines(app: FastAPI) -> None: _distribute_pipeline(run, rabbitmq_client, db_engine) for run in runs_to_schedule ), - limit=_MAX_CONCURRENT_PIPELINE_SCHEDULING, + limit=MAX_CONCURRENT_PIPELINE_SCHEDULING, ) if runs_to_schedule: _logger.debug("distributed %d pipelines", len(runs_to_schedule)) @@ -155,7 +166,7 @@ async def setup_manager(app: FastAPI) -> None: app.state.scheduler_manager = start_periodic_task( schedule_pipelines, interval=SCHEDULER_INTERVAL, - task_name="computational-distributed-scheduler", + task_name=MODULE_NAME, app=app, ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py index 896e8dbb917..4feec77f375 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py @@ -23,8 +23,13 @@ def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: + with log_context(_logger, logging.DEBUG, msg="handling scheduling"): to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) + get_rabbitmq_client(app).publish( + SchedulePipelineRabbitMessage.get_channel_name(), + to_schedule_pipeline, + ) await _get_scheduler_worker(app).schedule_pipeline( user_id=to_schedule_pipeline.user_id, project_id=to_schedule_pipeline.project_id, From 083ca8664ab83fcfc15bb1209cf684cfcc83324c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:37:18 +0100 Subject: [PATCH 027/127] renaming --- .../modules/comp_scheduler/__init__.py | 9 ++------- .../{_distributed_scheduler.py => _manager.py} | 2 +- .../{_distributed_worker.py => _worker.py} | 0 ...test_distributed_scheduler.py => test_manager.py} | 12 +++++------- .../{test_distributed_worker.py => test_worker.py} | 0 5 files changed, 8 insertions(+), 15 deletions(-) rename services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/{_distributed_scheduler.py => _manager.py} (98%) rename services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/{_distributed_worker.py => _worker.py} (100%) rename services/director-v2/tests/unit/with_dbs/comp_scheduler/{test_distributed_scheduler.py => test_manager.py} (96%) rename services/director-v2/tests/unit/with_dbs/comp_scheduler/{test_distributed_worker.py => test_worker.py} (100%) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index cb2e911cbd5..c681a86d317 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -7,13 +7,8 @@ from servicelib.logging_utils import log_context from ._constants import MODULE_NAME -from ._distributed_scheduler import ( - run_new_pipeline, - setup_manager, - shutdown_manager, - stop_pipeline, -) -from ._distributed_worker import setup_worker +from ._manager import run_new_pipeline, setup_manager, shutdown_manager, stop_pipeline +from ._worker import setup_worker _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py similarity index 98% rename from services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py rename to services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index b7cebdf98a4..3c55e1d30be 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -17,7 +17,6 @@ from ...models.comp_runs import CompRunsAtDB, RunMetadataDict from ...utils.comp_scheduler import SCHEDULED_STATES from ...utils.rabbitmq import publish_project_log -from ..comp_scheduler._models import SchedulePipelineRabbitMessage from ..db import get_db_engine from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository @@ -28,6 +27,7 @@ MODULE_NAME, SCHEDULER_INTERVAL, ) +from ._models import SchedulePipelineRabbitMessage _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py similarity index 100% rename from services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_distributed_worker.py rename to services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py similarity index 96% rename from services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py rename to services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index 24f276f246b..c205d3d786b 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -30,7 +30,7 @@ from simcore_service_director_v2.core.errors import PipelineNotFoundError from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict -from simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler import ( +from simcore_service_director_v2.modules.comp_scheduler._manager import ( SCHEDULER_INTERVAL, run_new_pipeline, schedule_pipelines, @@ -108,7 +108,7 @@ async def slow_limited_gather(*args, **kwargs): return result mock_function = mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler.limited_gather", + "simcore_service_director_v2.modules.comp_scheduler._manager.limited_gather", autospec=True, side_effect=slow_limited_gather, ) @@ -267,19 +267,17 @@ async def test_empty_pipeline_is_not_scheduled( @pytest.fixture def with_fast_scheduling(mocker: MockerFixture) -> None: - from simcore_service_director_v2.modules.comp_scheduler import ( - _distributed_scheduler, - ) + from simcore_service_director_v2.modules.comp_scheduler import _manager mocker.patch.object( - _distributed_scheduler, "SCHEDULER_INTERVAL", datetime.timedelta(seconds=0.01) + _manager, "SCHEDULER_INTERVAL", datetime.timedelta(seconds=0.01) ) @pytest.fixture def mocked_schedule_pipelines(mocker: MockerFixture) -> mock.Mock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._distributed_scheduler.schedule_pipelines", + "simcore_service_director_v2.modules.comp_scheduler._manager.schedule_pipelines", autospec=True, ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py similarity index 100% rename from services/director-v2/tests/unit/with_dbs/comp_scheduler/test_distributed_worker.py rename to services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py From 68cb686c72d85797840bffffbab3e5c292055917 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:39:11 +0100 Subject: [PATCH 028/127] rename method --- .../modules/comp_scheduler/_manager.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index 3c55e1d30be..f7d93f1b013 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -63,7 +63,7 @@ async def run_new_pipeline( ) rabbitmq_client = get_rabbitmq_client(app) - await _distribute_pipeline(new_run, rabbitmq_client, db_engine) + await _request_pipeline_scheduling(new_run, rabbitmq_client, db_engine) await publish_project_log( rabbitmq_client, user_id, @@ -94,7 +94,7 @@ async def stop_pipeline( if updated_comp_run: # ensure the scheduler starts right away rabbitmq_client = get_rabbitmq_client(app) - await _distribute_pipeline(updated_comp_run, rabbitmq_client, db_engine) + await _request_pipeline_scheduling(updated_comp_run, rabbitmq_client, db_engine) def _get_app_from_args(*args, **kwargs) -> FastAPI: @@ -118,7 +118,7 @@ def _redis_lock_key_builder(*args, **kwargs) -> str: return f"{app.title}_{MODULE_NAME}" -async def _distribute_pipeline( +async def _request_pipeline_scheduling( run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine ) -> None: # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency @@ -153,7 +153,7 @@ async def schedule_pipelines(app: FastAPI) -> None: rabbitmq_client = get_rabbitmq_client(app) await limited_gather( *( - _distribute_pipeline(run, rabbitmq_client, db_engine) + _request_pipeline_scheduling(run, rabbitmq_client, db_engine) for run in runs_to_schedule ), limit=MAX_CONCURRENT_PIPELINE_SCHEDULING, From 864591e2c0dc5e3400668ed5dbdd36b8ff07d47b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:43:41 +0100 Subject: [PATCH 029/127] moved method --- .../modules/comp_scheduler/_manager.py | 29 ++++--------------- .../modules/comp_scheduler/_publisher.py | 23 +++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) create mode 100644 services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index f7d93f1b013..d7837e0e6f5 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -8,13 +8,12 @@ from models_library.users import UserID from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context -from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import RedisClientSDK from servicelib.redis_utils import exclusive from servicelib.utils import limited_gather from settings_library.redis import RedisDatabase -from ...models.comp_runs import CompRunsAtDB, RunMetadataDict +from ...models.comp_runs import RunMetadataDict from ...utils.comp_scheduler import SCHEDULED_STATES from ...utils.rabbitmq import publish_project_log from ..db import get_db_engine @@ -27,7 +26,7 @@ MODULE_NAME, SCHEDULER_INTERVAL, ) -from ._models import SchedulePipelineRabbitMessage +from ._publisher import request_pipeline_scheduling _logger = logging.getLogger(__name__) @@ -63,7 +62,7 @@ async def run_new_pipeline( ) rabbitmq_client = get_rabbitmq_client(app) - await _request_pipeline_scheduling(new_run, rabbitmq_client, db_engine) + await request_pipeline_scheduling(new_run, rabbitmq_client, db_engine) await publish_project_log( rabbitmq_client, user_id, @@ -94,7 +93,7 @@ async def stop_pipeline( if updated_comp_run: # ensure the scheduler starts right away rabbitmq_client = get_rabbitmq_client(app) - await _request_pipeline_scheduling(updated_comp_run, rabbitmq_client, db_engine) + await request_pipeline_scheduling(updated_comp_run, rabbitmq_client, db_engine) def _get_app_from_args(*args, **kwargs) -> FastAPI: @@ -118,24 +117,6 @@ def _redis_lock_key_builder(*args, **kwargs) -> str: return f"{app.title}_{MODULE_NAME}" -async def _request_pipeline_scheduling( - run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine -) -> None: - # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency - # async with transaction_context(get_asyncpg_engine(app)) as connection: - await rabbitmq_client.publish( - SchedulePipelineRabbitMessage.get_channel_name(), - SchedulePipelineRabbitMessage( - user_id=run.user_id, - project_id=run.project_uuid, - iteration=run.iteration, - ), - ) - await CompRunsRepository.instance(db_engine).mark_as_scheduled( - user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration - ) - - async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGraph: comp_pipeline_repo = CompPipelinesRepository.instance(db_engine) pipeline_at_db = await comp_pipeline_repo.get_pipeline(project_id) @@ -153,7 +134,7 @@ async def schedule_pipelines(app: FastAPI) -> None: rabbitmq_client = get_rabbitmq_client(app) await limited_gather( *( - _request_pipeline_scheduling(run, rabbitmq_client, db_engine) + request_pipeline_scheduling(run, rabbitmq_client, db_engine) for run in runs_to_schedule ), limit=MAX_CONCURRENT_PIPELINE_SCHEDULING, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py new file mode 100644 index 00000000000..36310b826d2 --- /dev/null +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py @@ -0,0 +1,23 @@ +from servicelib.rabbitmq import RabbitMQClient + +from ...models.comp_runs import CompRunsAtDB +from ..db.repositories.comp_runs import CompRunsRepository +from ._models import SchedulePipelineRabbitMessage + + +async def request_pipeline_scheduling( + run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine +) -> None: + # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency + # async with transaction_context(get_asyncpg_engine(app)) as connection: + await rabbitmq_client.publish( + SchedulePipelineRabbitMessage.get_channel_name(), + SchedulePipelineRabbitMessage( + user_id=run.user_id, + project_id=run.project_uuid, + iteration=run.iteration, + ), + ) + await CompRunsRepository.instance(db_engine).mark_as_scheduled( + user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration + ) From 5aac1ead4200e4c46b39803eee817e6c787a1b64 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:44:44 +0100 Subject: [PATCH 030/127] cleanup --- .../modules/comp_scheduler/_scheduler_factory.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py index f9f5a61306c..70e63650921 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py @@ -4,7 +4,6 @@ from servicelib.logging_utils import log_context from settings_library.redis import RedisDatabase -from ...core.errors import ConfigurationError from ...core.settings import AppSettings from ..dask_clients_pool import DaskClientsPool from ..db import get_db_engine @@ -17,10 +16,6 @@ def create_scheduler(app: FastAPI) -> BaseCompScheduler: - if not hasattr(app.state, "engine"): - msg = "Database connection is missing. Please check application configuration." - raise ConfigurationError(msg) - with log_context( _logger, logging.INFO, msg="Creating Dask-based computational scheduler" ): From 10bbcb4288e09859ac095b93f6a4fe228848871f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:46:44 +0100 Subject: [PATCH 031/127] renaming --- .../modules/comp_scheduler/_dask_scheduler.py | 2 +- .../comp_scheduler/{_base_scheduler.py => _scheduler_base.py} | 0 .../modules/comp_scheduler/_scheduler_factory.py | 2 +- .../modules/comp_scheduler/_worker.py | 2 +- .../unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/{_base_scheduler.py => _scheduler_base.py} (100%) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py index 2fdf7acd2e9..51e6694b11a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py @@ -50,7 +50,7 @@ from ..db.repositories.clusters import ClustersRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository -from ._base_scheduler import BaseCompScheduler +from ._scheduler_base import BaseCompScheduler _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py similarity index 100% rename from services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py rename to services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py index 70e63650921..56a90e12713 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py @@ -9,8 +9,8 @@ from ..db import get_db_engine from ..rabbitmq import get_rabbitmq_client, get_rabbitmq_rpc_client from ..redis import get_redis_client_manager -from ._base_scheduler import BaseCompScheduler from ._dask_scheduler import DaskScheduler +from ._scheduler_base import BaseCompScheduler _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 4feec77f375..789cdd6c17d 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -6,8 +6,8 @@ from servicelib.logging_utils import log_context from ..rabbitmq import get_rabbitmq_client -from ._base_scheduler import BaseCompScheduler from ._models import SchedulePipelineRabbitMessage +from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler _logger = logging.getLogger(__name__) diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index c53fe0f9861..90af7606d25 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -265,7 +265,7 @@ def _fake_starter( return scheduler_task, scheduler_task_wake_up_event return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._base_scheduler.BaseCompScheduler._start_scheduling", + "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler._start_scheduling", autospec=True, side_effect=_fake_starter, ) From 8d8cb669742e96f746ca9eb5c5306f51867b2f68 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:48:03 +0100 Subject: [PATCH 032/127] renaming --- ...{_dask_scheduler.py => _scheduler_dask.py} | 3 +-- .../comp_scheduler/_scheduler_factory.py | 2 +- ...t_modules_comp_scheduler_dask_scheduler.py | 22 +++++++++---------- 3 files changed, 13 insertions(+), 14 deletions(-) rename services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/{_dask_scheduler.py => _scheduler_dask.py} (99%) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py similarity index 99% rename from services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py rename to services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py index 51e6694b11a..08a99b3a2cd 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py @@ -1,10 +1,9 @@ import asyncio import contextlib import logging -from collections.abc import AsyncIterator +from collections.abc import AsyncIterator, Callable from contextlib import asynccontextmanager from dataclasses import dataclass -from typing import Callable import arrow from dask_task_models_library.container_tasks.errors import TaskCancelledError diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py index 56a90e12713..edda456f303 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py @@ -9,8 +9,8 @@ from ..db import get_db_engine from ..rabbitmq import get_rabbitmq_client, get_rabbitmq_rpc_client from ..redis import get_redis_client_manager -from ._dask_scheduler import DaskScheduler from ._scheduler_base import BaseCompScheduler +from ._scheduler_dask import DaskScheduler _logger = logging.getLogger(__name__) diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index 90af7606d25..d43aeb3fd00 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -71,7 +71,7 @@ BaseCompScheduler, _get_scheduler_worker, ) -from simcore_service_director_v2.modules.comp_scheduler._dask_scheduler import ( +from simcore_service_director_v2.modules.comp_scheduler._scheduler_dask import ( DaskScheduler, ) from simcore_service_director_v2.modules.dask_client import ( @@ -192,7 +192,7 @@ async def schedule_all_pipelines(scheduler: BaseCompScheduler) -> None: @pytest.fixture -def minimal_dask_scheduler_config( +def minimal_scheduler_dask_config( mock_env: EnvVarsDict, postgres_host_config: dict[str, str], monkeypatch: pytest.MonkeyPatch, @@ -215,7 +215,7 @@ def minimal_dask_scheduler_config( @pytest.fixture def scheduler( - minimal_dask_scheduler_config: None, + minimal_scheduler_dask_config: None, aiopg_engine: aiopg.sa.engine.Engine, minimal_app: FastAPI, ) -> BaseCompScheduler: @@ -237,7 +237,7 @@ def mocked_dask_client(mocker: MockerFixture) -> mock.MagicMock: @pytest.fixture def mocked_parse_output_data_fct(mocker: MockerFixture) -> mock.Mock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._dask_scheduler.parse_output_data", + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.parse_output_data", autospec=True, ) @@ -245,7 +245,7 @@ def mocked_parse_output_data_fct(mocker: MockerFixture) -> mock.Mock: @pytest.fixture def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.MagicMock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._dask_scheduler.clean_task_output_and_log_files_if_invalid", + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", return_value=None, autospec=True, ) @@ -284,13 +284,13 @@ async def minimal_app(async_client: httpx.AsyncClient) -> FastAPI: @pytest.fixture def mocked_clean_task_output_and_log_files_if_invalid(mocker: MockerFixture) -> None: mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._dask_scheduler.clean_task_output_and_log_files_if_invalid", + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", autospec=True, ) async def test_scheduler_gracefully_starts_and_stops( - minimal_dask_scheduler_config: None, + minimal_scheduler_dask_config: None, aiopg_engine: aiopg.sa.engine.Engine, dask_spec_local_cluster: SpecCluster, minimal_app: FastAPI, @@ -306,7 +306,7 @@ async def test_scheduler_gracefully_starts_and_stops( ], ) def test_scheduler_raises_exception_for_missing_dependencies( - minimal_dask_scheduler_config: None, + minimal_scheduler_dask_config: None, aiopg_engine: aiopg.sa.engine.Engine, dask_spec_local_cluster: SpecCluster, monkeypatch: pytest.MonkeyPatch, @@ -1100,7 +1100,7 @@ async def test_task_progress_triggers( ), ], ) -async def test_handling_of_disconnected_dask_scheduler( +async def test_handling_of_disconnected_scheduler_dask( with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, @@ -1112,7 +1112,7 @@ async def test_handling_of_disconnected_dask_scheduler( ): # this will create a non connected backend issue that will trigger re-connection mocked_dask_client_send_task = mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._dask_scheduler.DaskClient.send_computation_tasks", + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.DaskClient.send_computation_tasks", side_effect=backend_error, ) assert mocked_dask_client_send_task @@ -1506,7 +1506,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta @pytest.fixture async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._dask_scheduler.get_or_create_on_demand_cluster", + "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.get_or_create_on_demand_cluster", autospec=True, ) From 1381706f53808ad46e6287650f237c761e680436 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:56:03 +0100 Subject: [PATCH 033/127] ongoing --- .../modules/comp_scheduler/_worker.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 789cdd6c17d..f3a8e65a3c2 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -3,19 +3,32 @@ from typing import cast from fastapi import FastAPI +from models_library.projects import ProjectID +from models_library.users import UserID from servicelib.logging_utils import log_context +from ...utils.comp_scheduler import Iteration +from ..db import get_db_engine +from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client from ._models import SchedulePipelineRabbitMessage +from ._publisher import request_pipeline_scheduling from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler _logger = logging.getLogger(__name__) -def _empty_wake_up_callack() -> None: - # TODO: need to re-publish here? - return +def _empty_wake_up_callack( + app: FastAPI, user_id: UserID, project_id: ProjectID, iteration: Iteration +) -> None: + async def _async_cb(): + db_engine = get_db_engine(app) + rabbit_mq_client = get_rabbitmq_client(app) + comp_run = await CompRunsRepository.instance(db_engine).get( + user_id=user_id, project_id=project_id, iteration=iteration + ) + await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: From c4b13ceb119469ddac2f61f77225c91ac85abda6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 10:41:05 +0100 Subject: [PATCH 034/127] missing import --- .../modules/comp_scheduler/_publisher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py index 36310b826d2..d11b1a65704 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py @@ -1,3 +1,4 @@ +from aiopg.sa import Engine from servicelib.rabbitmq import RabbitMQClient from ...models.comp_runs import CompRunsAtDB From 22c27b667fca16749b1684da7e5d382ab73b680b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:27:52 +0100 Subject: [PATCH 035/127] moved utils --- .../modules/comp_scheduler/_manager.py | 2 +- .../modules/comp_scheduler/_models.py | 2 +- .../modules/comp_scheduler/_scheduler_base.py | 20 ++++++------- .../modules/comp_scheduler/_scheduler_dask.py | 2 +- .../comp_scheduler/_utils.py} | 2 +- .../modules/comp_scheduler/_worker.py | 2 +- .../db/repositories/comp_tasks/_utils.py | 30 +++++++++---------- .../tests/unit/test_utils_comp_scheduler.py | 2 +- ...t_modules_comp_scheduler_dask_scheduler.py | 2 +- 9 files changed, 32 insertions(+), 32 deletions(-) rename services/director-v2/src/simcore_service_director_v2/{utils/comp_scheduler.py => modules/comp_scheduler/_utils.py} (97%) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index d7837e0e6f5..f4def349566 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -14,7 +14,6 @@ from settings_library.redis import RedisDatabase from ...models.comp_runs import RunMetadataDict -from ...utils.comp_scheduler import SCHEDULED_STATES from ...utils.rabbitmq import publish_project_log from ..db import get_db_engine from ..db.repositories.comp_pipelines import CompPipelinesRepository @@ -27,6 +26,7 @@ SCHEDULER_INTERVAL, ) from ._publisher import request_pipeline_scheduling +from ._utils import SCHEDULED_STATES _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py index f98a18b2851..5bc65e23d64 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py @@ -4,7 +4,7 @@ from models_library.rabbitmq_messages import RabbitMessageBase from models_library.users import UserID -from ...utils.comp_scheduler import Iteration +from ._utils import Iteration class SchedulePipelineRabbitMessage(RabbitMessageBase): diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index cd60b824959..daffb0b19a0 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -50,16 +50,6 @@ from ...models.comp_pipelines import CompPipelineAtDB from ...models.comp_runs import CompRunsAtDB, RunMetadataDict from ...models.comp_tasks import CompTaskAtDB -from ...utils.comp_scheduler import ( - COMPLETED_STATES, - PROCESSING_STATES, - RUNNING_STATES, - TASK_TO_START_STATES, - WAITING_FOR_START_STATES, - Iteration, - create_service_resources_from_task, - get_resource_tracking_run_id, -) from ...utils.computations import get_pipeline_state_from_task_states from ...utils.rabbitmq import ( publish_project_log, @@ -70,6 +60,16 @@ from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository +from ._utils import ( + COMPLETED_STATES, + PROCESSING_STATES, + RUNNING_STATES, + TASK_TO_START_STATES, + WAITING_FOR_START_STATES, + Iteration, + create_service_resources_from_task, + get_resource_tracking_run_id, +) _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py index 08a99b3a2cd..e97ae55cc86 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py @@ -30,7 +30,6 @@ from ...models.comp_runs import CompRunsAtDB, RunMetadataDict from ...models.comp_tasks import CompTaskAtDB from ...models.dask_subsystem import DaskClientTaskState -from ...utils.comp_scheduler import Iteration, get_resource_tracking_run_id from ...utils.dask import ( clean_task_output_and_log_files_if_invalid, parse_dask_job_id, @@ -50,6 +49,7 @@ from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository from ._scheduler_base import BaseCompScheduler +from ._utils import Iteration, get_resource_tracking_run_id _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/utils/comp_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py similarity index 97% rename from services/director-v2/src/simcore_service_director_v2/utils/comp_scheduler.py rename to services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py index 15f3481da10..7f8767188c3 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/comp_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py @@ -12,7 +12,7 @@ from models_library.users import UserID from pydantic import PositiveInt -from ..models.comp_tasks import CompTaskAtDB +from ...models.comp_tasks import CompTaskAtDB SCHEDULED_STATES: set[RunningState] = { RunningState.PUBLISHED, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index f3a8e65a3c2..cc6c3769cec 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -7,7 +7,6 @@ from models_library.users import UserID from servicelib.logging_utils import log_context -from ...utils.comp_scheduler import Iteration from ..db import get_db_engine from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client @@ -15,6 +14,7 @@ from ._publisher import request_pipeline_scheduling from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler +from ._utils import Iteration _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py index 637e0c7faf6..51082b698f1 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.py @@ -55,9 +55,9 @@ from .....models.comp_tasks import CompTaskAtDB, Image, NodeSchema from .....models.pricing import PricingInfo from .....modules.resource_usage_tracker_client import ResourceUsageTrackerClient -from .....utils.comp_scheduler import COMPLETED_STATES from .....utils.computations import to_node_class from ....catalog import CatalogClient +from ....comp_scheduler._utils import COMPLETED_STATES from ....director_v0 import DirectorV0Client from ...tables import NodeClass @@ -146,12 +146,12 @@ async def _get_node_infos( None, ) - result: tuple[ - ServiceMetaDataPublished, ServiceExtras, SimcoreServiceLabels - ] = await asyncio.gather( - _get_service_details(catalog_client, user_id, product_name, node), - director_client.get_service_extras(node.key, node.version), - director_client.get_service_labels(node), + result: tuple[ServiceMetaDataPublished, ServiceExtras, SimcoreServiceLabels] = ( + await asyncio.gather( + _get_service_details(catalog_client, user_id, product_name, node), + director_client.get_service_extras(node.key, node.version), + director_client.get_service_labels(node), + ) ) return result @@ -247,9 +247,9 @@ async def _get_pricing_and_hardware_infos( return pricing_info, hardware_info -_RAM_SAFE_MARGIN_RATIO: Final[ - float -] = 0.1 # NOTE: machines always have less available RAM than advertised +_RAM_SAFE_MARGIN_RATIO: Final[float] = ( + 0.1 # NOTE: machines always have less available RAM than advertised +) _CPUS_SAFE_MARGIN: Final[float] = 0.1 @@ -267,11 +267,11 @@ async def _update_project_node_resources_from_hardware_info( if not hardware_info.aws_ec2_instances: return try: - unordered_list_ec2_instance_types: list[ - EC2InstanceTypeGet - ] = await get_instance_type_details( - rabbitmq_rpc_client, - instance_type_names=set(hardware_info.aws_ec2_instances), + unordered_list_ec2_instance_types: list[EC2InstanceTypeGet] = ( + await get_instance_type_details( + rabbitmq_rpc_client, + instance_type_names=set(hardware_info.aws_ec2_instances), + ) ) assert unordered_list_ec2_instance_types # nosec diff --git a/services/director-v2/tests/unit/test_utils_comp_scheduler.py b/services/director-v2/tests/unit/test_utils_comp_scheduler.py index dfb7c0326b1..05c899a5e40 100644 --- a/services/director-v2/tests/unit/test_utils_comp_scheduler.py +++ b/services/director-v2/tests/unit/test_utils_comp_scheduler.py @@ -10,7 +10,7 @@ from models_library.projects_state import RunningState from models_library.users import UserID from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB -from simcore_service_director_v2.utils.comp_scheduler import ( +from simcore_service_director_v2.modules.comp_scheduler._utils import ( COMPLETED_STATES, SCHEDULED_STATES, TASK_TO_START_STATES, diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index d43aeb3fd00..7885a547caf 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -74,11 +74,11 @@ from simcore_service_director_v2.modules.comp_scheduler._scheduler_dask import ( DaskScheduler, ) +from simcore_service_director_v2.modules.comp_scheduler._utils import COMPLETED_STATES from simcore_service_director_v2.modules.dask_client import ( DaskJobID, PublishedComputationTask, ) -from simcore_service_director_v2.utils.comp_scheduler import COMPLETED_STATES from simcore_service_director_v2.utils.dask_client_utils import TaskHandlers from starlette.testclient import TestClient from tenacity.asyncio import AsyncRetrying From 8843b2e1a9fb67927b687437cbb37e23e3adfed2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:31:44 +0100 Subject: [PATCH 036/127] moved iteartion to models --- .../modules/comp_scheduler/_models.py | 5 +++-- .../modules/comp_scheduler/_scheduler_base.py | 9 ++++----- .../modules/comp_scheduler/_scheduler_dask.py | 3 ++- .../modules/comp_scheduler/_utils.py | 7 +------ 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py index 5bc65e23d64..2af28102a30 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py @@ -1,10 +1,11 @@ -from typing import Literal +from typing import Literal, TypeAlias from models_library.projects import ProjectID from models_library.rabbitmq_messages import RabbitMessageBase from models_library.users import UserID +from pydantic import PositiveInt -from ._utils import Iteration +Iteration: TypeAlias = PositiveInt class SchedulePipelineRabbitMessage(RabbitMessageBase): diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index daffb0b19a0..a6e58f0bc0f 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -28,7 +28,6 @@ from models_library.services import ServiceKey, ServiceType, ServiceVersion from models_library.users import UserID from networkx.classes.reportviews import InDegreeView -from pydantic import PositiveInt from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE from servicelib.logging_utils import log_context from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient @@ -60,13 +59,13 @@ from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository +from ._models import Iteration from ._utils import ( COMPLETED_STATES, PROCESSING_STATES, RUNNING_STATES, TASK_TO_START_STATES, WAITING_FOR_START_STATES, - Iteration, create_service_resources_from_task, get_resource_tracking_run_id, ) @@ -167,7 +166,7 @@ async def _update_run_result_from_tasks( self, user_id: UserID, project_id: ProjectID, - iteration: PositiveInt, + iteration: Iteration, pipeline_tasks: dict[NodeIDStr, CompTaskAtDB], ) -> RunningState: pipeline_state_from_tasks: RunningState = get_pipeline_state_from_task_states( @@ -187,7 +186,7 @@ async def _set_run_result( self, user_id: UserID, project_id: ProjectID, - iteration: PositiveInt, + iteration: Iteration, run_result: RunningState, ) -> None: comp_runs_repo = CompRunsRepository.instance(self.db_engine) @@ -494,7 +493,7 @@ async def schedule_pipeline( *, user_id: UserID, project_id: ProjectID, - iteration: PositiveInt, + iteration: Iteration, wake_up_callback: Callable[[], None], ) -> None: with log_context( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py index e97ae55cc86..d95227ed809 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py @@ -48,8 +48,9 @@ from ..db.repositories.clusters import ClustersRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository +from ._models import Iteration from ._scheduler_base import BaseCompScheduler -from ._utils import Iteration, get_resource_tracking_run_id +from ._utils import get_resource_tracking_run_id _logger = logging.getLogger(__name__) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py index 7f8767188c3..86f78637bf4 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py @@ -1,5 +1,3 @@ -from typing import TypeAlias - from models_library.docker import DockerGenericTag from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID @@ -10,9 +8,9 @@ ServiceResourcesDictHelpers, ) from models_library.users import UserID -from pydantic import PositiveInt from ...models.comp_tasks import CompTaskAtDB +from ._models import Iteration SCHEDULED_STATES: set[RunningState] = { RunningState.PUBLISHED, @@ -51,9 +49,6 @@ } -Iteration: TypeAlias = PositiveInt - - def get_resource_tracking_run_id( user_id: UserID, project_id: ProjectID, node_id: NodeID, iteration: Iteration ) -> str: From d037813e684eb7369b0f90bccf32d1205d20e362 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:35:14 +0100 Subject: [PATCH 037/127] moved iteartion to models --- .../src/simcore_service_director_v2/models/comp_runs.py | 6 +++++- .../modules/comp_scheduler/_models.py | 5 ++--- .../modules/comp_scheduler/_scheduler_base.py | 3 +-- .../modules/comp_scheduler/_scheduler_dask.py | 3 +-- .../modules/comp_scheduler/_utils.py | 2 +- .../modules/comp_scheduler/_worker.py | 2 +- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py index 65b9468686a..f5cd1165cc7 100644 --- a/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py @@ -1,5 +1,6 @@ import datetime from contextlib import suppress +from typing import TypeAlias from models_library.clusters import DEFAULT_CLUSTER_ID, ClusterID from models_library.projects import ProjectID @@ -37,12 +38,15 @@ class RunMetadataDict(TypedDict, total=False): project_metadata: ProjectMetadataDict +Iteration: TypeAlias = PositiveInt + + class CompRunsAtDB(BaseModel): run_id: PositiveInt project_uuid: ProjectID user_id: UserID cluster_id: ClusterID | None - iteration: PositiveInt + iteration: Iteration result: RunningState created: datetime.datetime modified: datetime.datetime diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py index 2af28102a30..7ec438bc589 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py @@ -1,11 +1,10 @@ -from typing import Literal, TypeAlias +from typing import Literal from models_library.projects import ProjectID from models_library.rabbitmq_messages import RabbitMessageBase from models_library.users import UserID -from pydantic import PositiveInt -Iteration: TypeAlias = PositiveInt +from ...models.comp_runs import Iteration class SchedulePipelineRabbitMessage(RabbitMessageBase): diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index a6e58f0bc0f..97f9a921c24 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -47,7 +47,7 @@ ) from ...core.settings import ComputationalBackendSettings from ...models.comp_pipelines import CompPipelineAtDB -from ...models.comp_runs import CompRunsAtDB, RunMetadataDict +from ...models.comp_runs import CompRunsAtDB, Iteration, RunMetadataDict from ...models.comp_tasks import CompTaskAtDB from ...utils.computations import get_pipeline_state_from_task_states from ...utils.rabbitmq import ( @@ -59,7 +59,6 @@ from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository -from ._models import Iteration from ._utils import ( COMPLETED_STATES, PROCESSING_STATES, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py index d95227ed809..adc67853686 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_dask.py @@ -27,7 +27,7 @@ ComputationalBackendOnDemandNotReadyError, TaskSchedulingError, ) -from ...models.comp_runs import CompRunsAtDB, RunMetadataDict +from ...models.comp_runs import CompRunsAtDB, Iteration, RunMetadataDict from ...models.comp_tasks import CompTaskAtDB from ...models.dask_subsystem import DaskClientTaskState from ...utils.dask import ( @@ -48,7 +48,6 @@ from ..db.repositories.clusters import ClustersRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository -from ._models import Iteration from ._scheduler_base import BaseCompScheduler from ._utils import get_resource_tracking_run_id diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py index 86f78637bf4..8ebda030bed 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py @@ -9,8 +9,8 @@ ) from models_library.users import UserID +from ...models.comp_runs import Iteration from ...models.comp_tasks import CompTaskAtDB -from ._models import Iteration SCHEDULED_STATES: set[RunningState] = { RunningState.PUBLISHED, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index cc6c3769cec..2b12232746e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -7,6 +7,7 @@ from models_library.users import UserID from servicelib.logging_utils import log_context +from ...models.comp_runs import Iteration from ..db import get_db_engine from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client @@ -14,7 +15,6 @@ from ._publisher import request_pipeline_scheduling from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler -from ._utils import Iteration _logger = logging.getLogger(__name__) From fe64bf8ba9dbf7bdc461cebaaa3a02724925ada9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:46:19 +0100 Subject: [PATCH 038/127] move test to context --- .../test_modules_comp_scheduler_dask_scheduler.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename services/director-v2/tests/unit/with_dbs/{ => comp_scheduler}/test_modules_comp_scheduler_dask_scheduler.py (100%) diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py similarity index 100% rename from services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py rename to services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py From 63612b6cdc9c585c9d0a37c26343749fcd36167e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:50:07 +0100 Subject: [PATCH 039/127] checking tests --- .../test_modules_comp_scheduler_dask_scheduler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py index 7885a547caf..68b16c7be5a 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py @@ -67,9 +67,8 @@ from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState -from simcore_service_director_v2.modules.comp_scheduler import ( +from simcore_service_director_v2.modules.comp_scheduler._scheduler_base import ( BaseCompScheduler, - _get_scheduler_worker, ) from simcore_service_director_v2.modules.comp_scheduler._scheduler_dask import ( DaskScheduler, From 01406fb56decb976471fda69e5f90e238adb7dee Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:07:51 +0100 Subject: [PATCH 040/127] base test for worker init/shutdown --- .../tests/unit/with_dbs/comp_scheduler/test_worker.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index e69de29bb2d..51ae202cfba 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -0,0 +1,8 @@ +import FastAPI +from simcore_service_director_v2.modules.comp_scheduler._worker import ( + _get_scheduler_worker, +) + + +async def test_worker_starts_and_stops(initialized_app: FastAPI): + assert _get_scheduler_worker(initialized_app) is not None From 86df8bfb5bf8ea8052de6c953c06ac0e12279617 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:08:02 +0100 Subject: [PATCH 041/127] re-order --- .../with_dbs/comp_scheduler/test_manager.py | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index c205d3d786b..130bad7a2e9 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -74,13 +74,41 @@ async def _assert_comp_runs_empty(sqlalchemy_async_engine: AsyncEngine) -> None: await _assert_comp_runs(sqlalchemy_async_engine, expected_total=0) +@pytest.fixture +def with_fast_scheduling(mocker: MockerFixture) -> None: + from simcore_service_director_v2.modules.comp_scheduler import _manager + + mocker.patch.object( + _manager, "SCHEDULER_INTERVAL", datetime.timedelta(seconds=0.01) + ) + + +@pytest.fixture +def mocked_schedule_pipelines(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._manager.schedule_pipelines", + autospec=True, + ) + + +async def test_manager_starts_and_auto_schedules_pipelines( + with_fast_scheduling: None, + with_disabled_scheduler_worker: mock.Mock, + mocked_schedule_pipelines: mock.Mock, + initialized_app: FastAPI, + sqlalchemy_async_engine: AsyncEngine, +): + await _assert_comp_runs_empty(sqlalchemy_async_engine) + mocked_schedule_pipelines.assert_called() + + async def test_schedule_pipelines_empty_db( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, sqlalchemy_async_engine: AsyncEngine, ): - with_disabled_auto_scheduling.assert_called_once() await _assert_comp_runs_empty(sqlalchemy_async_engine) await schedule_pipelines(initialized_app) @@ -96,10 +124,10 @@ async def test_schedule_pipelines_concurently_runs_exclusively_and_raises( with_disabled_auto_scheduling: mock.Mock, initialized_app: FastAPI, mocker: MockerFixture, - monkeypatch: pytest.MonkeyPatch, ): CONCURRENCY = 5 # NOTE: this ensure no flakyness as empty scheduling is very fast + # so we slow down the limited_gather function original_function = limited_gather async def slow_limited_gather(*args, **kwargs): @@ -263,31 +291,3 @@ async def test_empty_pipeline_is_not_scheduled( assert "no computational dag defined" in caplog.records[0].message await _assert_comp_runs_empty(sqlalchemy_async_engine) scheduler_rabbit_client_parser.assert_not_called() - - -@pytest.fixture -def with_fast_scheduling(mocker: MockerFixture) -> None: - from simcore_service_director_v2.modules.comp_scheduler import _manager - - mocker.patch.object( - _manager, "SCHEDULER_INTERVAL", datetime.timedelta(seconds=0.01) - ) - - -@pytest.fixture -def mocked_schedule_pipelines(mocker: MockerFixture) -> mock.Mock: - return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._manager.schedule_pipelines", - autospec=True, - ) - - -async def test_auto_scheduling( - with_fast_scheduling: None, - with_disabled_scheduler_worker: mock.Mock, - mocked_schedule_pipelines: mock.Mock, - initialized_app: FastAPI, - sqlalchemy_async_engine: AsyncEngine, -): - await _assert_comp_runs_empty(sqlalchemy_async_engine) - mocked_schedule_pipelines.assert_called() From f736f54a3faaaef5efc4d9bb6f0452bd2c554e14 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:08:15 +0100 Subject: [PATCH 042/127] rename --- ...sk_scheduler.py => test_scheduler_dask.py} | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) rename services/director-v2/tests/unit/with_dbs/comp_scheduler/{test_modules_comp_scheduler_dask_scheduler.py => test_scheduler_dask.py} (99%) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py similarity index 99% rename from services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py rename to services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 68b16c7be5a..dc1d855f8dd 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -18,7 +18,6 @@ import aiopg import aiopg.sa -import httpx import pytest from _helpers import PublishedProject, RunningProject from dask.distributed import SpecCluster @@ -216,9 +215,9 @@ def minimal_scheduler_dask_config( def scheduler( minimal_scheduler_dask_config: None, aiopg_engine: aiopg.sa.engine.Engine, - minimal_app: FastAPI, + initialized_app: FastAPI, ) -> BaseCompScheduler: - scheduler = _get_scheduler_worker(minimal_app) + scheduler = _get_scheduler_worker(initialized_app) assert scheduler is not None return scheduler @@ -270,16 +269,6 @@ def _fake_starter( ) -@pytest.fixture -async def minimal_app(async_client: httpx.AsyncClient) -> FastAPI: - # must use the minimal app from from the `async_client`` - # the`client` uses starlette's TestClient which spawns - # a new thread on which it creates a new loop - # causing issues downstream with coroutines not - # being created on the same loop - return async_client._transport.app # type: ignore # noqa: SLF001 - - @pytest.fixture def mocked_clean_task_output_and_log_files_if_invalid(mocker: MockerFixture) -> None: mocker.patch( @@ -292,10 +281,10 @@ async def test_scheduler_gracefully_starts_and_stops( minimal_scheduler_dask_config: None, aiopg_engine: aiopg.sa.engine.Engine, dask_spec_local_cluster: SpecCluster, - minimal_app: FastAPI, + initialized_app: FastAPI, ): # check it started correctly - assert _get_scheduler_worker(minimal_app) is not None + assert _get_scheduler_worker(initialized_app) is not None @pytest.mark.parametrize( From c37f1b58349da891169d5faa7f8e15f10acd418c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:21:14 +0100 Subject: [PATCH 043/127] put it back in working state --- .../modules/comp_scheduler/_worker.py | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 2b12232746e..92d532e6872 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -3,16 +3,10 @@ from typing import cast from fastapi import FastAPI -from models_library.projects import ProjectID -from models_library.users import UserID from servicelib.logging_utils import log_context -from ...models.comp_runs import Iteration -from ..db import get_db_engine -from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client from ._models import SchedulePipelineRabbitMessage -from ._publisher import request_pipeline_scheduling from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler @@ -20,15 +14,16 @@ def _empty_wake_up_callack( - app: FastAPI, user_id: UserID, project_id: ProjectID, iteration: Iteration + # app: FastAPI, user_id: UserID, project_id: ProjectID, iteration: Iteration ) -> None: - async def _async_cb(): - db_engine = get_db_engine(app) - rabbit_mq_client = get_rabbitmq_client(app) - comp_run = await CompRunsRepository.instance(db_engine).get( - user_id=user_id, project_id=project_id, iteration=iteration - ) - await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) + # async def _async_cb(): + # db_engine = get_db_engine(app) + # rabbit_mq_client = get_rabbitmq_client(app) + # comp_run = await CompRunsRepository.instance(db_engine).get( + # user_id=user_id, project_id=project_id, iteration=iteration + # ) + # await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) + ... def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: @@ -39,10 +34,10 @@ async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: with log_context(_logger, logging.DEBUG, msg="handling scheduling"): to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) - get_rabbitmq_client(app).publish( - SchedulePipelineRabbitMessage.get_channel_name(), - to_schedule_pipeline, - ) + # get_rabbitmq_client(app).publish( + # SchedulePipelineRabbitMessage.get_channel_name(), + # to_schedule_pipeline, + # ) await _get_scheduler_worker(app).schedule_pipeline( user_id=to_schedule_pipeline.user_id, project_id=to_schedule_pipeline.project_id, From 96402cec72e76d5d275d23bd49d766310b6d953c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:21:26 +0100 Subject: [PATCH 044/127] cleaning --- .../comp_scheduler/test_scheduler_dask.py | 192 ++---------------- 1 file changed, 13 insertions(+), 179 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index dc1d855f8dd..18c0b4bdf6c 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -20,7 +20,6 @@ import aiopg.sa import pytest from _helpers import PublishedProject, RunningProject -from dask.distributed import SpecCluster from dask_task_models_library.container_tasks.errors import TaskCancelledError from dask_task_models_library.container_tasks.events import TaskProgressEvent from dask_task_models_library.container_tasks.io import TaskOutputData @@ -42,14 +41,9 @@ from models_library.users import UserID from pydantic import TypeAdapter from pytest_mock.plugin import MockerFixture -from pytest_simcore.helpers.typing_env import EnvVarsDict from servicelib.rabbitmq import RabbitMQClient -from servicelib.redis import CouldNotAcquireLockError -from settings_library.rabbit import RabbitSettings -from settings_library.redis import RedisSettings from simcore_postgres_database.models.comp_runs import comp_runs from simcore_postgres_database.models.comp_tasks import NodeClass, comp_tasks -from simcore_service_director_v2.core.application import init_app from simcore_service_director_v2.core.errors import ( ClustersKeeperNotAvailableError, ComputationalBackendNotConnectedError, @@ -58,10 +52,7 @@ ComputationalBackendTaskResultsNotReadyError, ComputationalSchedulerChangedError, ComputationalSchedulerError, - ConfigurationError, - PipelineNotFoundError, ) -from simcore_service_director_v2.core.settings import AppSettings from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image @@ -73,12 +64,14 @@ DaskScheduler, ) from simcore_service_director_v2.modules.comp_scheduler._utils import COMPLETED_STATES +from simcore_service_director_v2.modules.comp_scheduler._worker import ( + _get_scheduler_worker, +) from simcore_service_director_v2.modules.dask_client import ( DaskJobID, PublishedComputationTask, ) from simcore_service_director_v2.utils.dask_client_utils import TaskHandlers -from starlette.testclient import TestClient from tenacity.asyncio import AsyncRetrying from tenacity.retry import retry_if_exception_type from tenacity.stop import stop_after_delay @@ -155,75 +148,8 @@ async def _assert_comp_tasks_db( ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" -async def schedule_all_pipelines(scheduler: BaseCompScheduler) -> None: - # NOTE: we take a copy of the pipelines, as this could change quickly if there are - # misconfigured pipelines that would be removed from the scheduler - # NOTE: we simulate multiple dv-2 replicas by running several times - # the same pipeline scheduling - local_pipelines = deepcopy(scheduler._scheduled_pipelines) # noqa: SLF001 - results = await asyncio.gather( - *( - scheduler.schedule_pipeline( - user_id=user_id, - project_id=project_id, - iteration=iteration, - wake_up_callback=params.scheduler_waker.set, - ) - for _ in range(3) - for ( - user_id, - project_id, - iteration, - ), params in local_pipelines.items() - ), - return_exceptions=True, - ) - # we should have exceptions 2/3 of the time - could_not_acquire_lock_count = sum( - isinstance(r, CouldNotAcquireLockError) for r in results - ) - total_results_count = len(results) - - # Check if 2/3 of the results are CouldNotAcquireLockError - # checks that scheduling is done exclusively - assert could_not_acquire_lock_count == (2 / 3) * total_results_count - - -@pytest.fixture -def minimal_scheduler_dask_config( - mock_env: EnvVarsDict, - postgres_host_config: dict[str, str], - monkeypatch: pytest.MonkeyPatch, - rabbit_service: RabbitSettings, - redis_service: RedisSettings, - faker: Faker, -) -> None: - """set a minimal configuration for testing the dask connection only""" - monkeypatch.setenv("DIRECTOR_V2_DYNAMIC_SIDECAR_ENABLED", "false") - monkeypatch.setenv("DIRECTOR_V0_ENABLED", "0") - monkeypatch.setenv("COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", "1") - monkeypatch.setenv("COMPUTATIONAL_BACKEND_ENABLED", "1") - monkeypatch.setenv("R_CLONE_PROVIDER", "MINIO") - monkeypatch.setenv("S3_ENDPOINT", faker.url()) - monkeypatch.setenv("S3_ACCESS_KEY", faker.pystr()) - monkeypatch.setenv("S3_REGION", faker.pystr()) - monkeypatch.setenv("S3_SECRET_KEY", faker.pystr()) - monkeypatch.setenv("S3_BUCKET_NAME", faker.pystr()) - - -@pytest.fixture -def scheduler( - minimal_scheduler_dask_config: None, - aiopg_engine: aiopg.sa.engine.Engine, - initialized_app: FastAPI, -) -> BaseCompScheduler: - scheduler = _get_scheduler_worker(initialized_app) - assert scheduler is not None - return scheduler - - @pytest.fixture -def mocked_dask_client(mocker: MockerFixture) -> mock.MagicMock: +def mocked_dask_client(mocker: MockerFixture) -> mock.Mock: mocked_dask_client = mocker.patch( "simcore_service_director_v2.modules.dask_clients_pool.DaskClient", autospec=True, @@ -241,7 +167,7 @@ def mocked_parse_output_data_fct(mocker: MockerFixture) -> mock.Mock: @pytest.fixture -def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.MagicMock: +def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.Mock: return mocker.patch( "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", return_value=None, @@ -250,114 +176,22 @@ def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.MagicMock: @pytest.fixture -def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.MagicMock: - """disables the scheduler task, note that it needs to be triggered manu>ally then""" - - def _fake_starter( - self: BaseCompScheduler, - *args, - **kwargs, - ): - scheduler_task = mocker.MagicMock() - scheduler_task_wake_up_event = mocker.MagicMock() - return scheduler_task, scheduler_task_wake_up_event - +def mocked_clean_task_output_and_log_files_if_invalid( + mocker: MockerFixture, +) -> mock.Mock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler._start_scheduling", - autospec=True, - side_effect=_fake_starter, - ) - - -@pytest.fixture -def mocked_clean_task_output_and_log_files_if_invalid(mocker: MockerFixture) -> None: - mocker.patch( "simcore_service_director_v2.modules.comp_scheduler._scheduler_dask.clean_task_output_and_log_files_if_invalid", autospec=True, ) -async def test_scheduler_gracefully_starts_and_stops( - minimal_scheduler_dask_config: None, - aiopg_engine: aiopg.sa.engine.Engine, - dask_spec_local_cluster: SpecCluster, - initialized_app: FastAPI, -): - # check it started correctly - assert _get_scheduler_worker(initialized_app) is not None - - -@pytest.mark.parametrize( - "missing_dependency", - [ - "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", - ], -) -def test_scheduler_raises_exception_for_missing_dependencies( - minimal_scheduler_dask_config: None, - aiopg_engine: aiopg.sa.engine.Engine, - dask_spec_local_cluster: SpecCluster, - monkeypatch: pytest.MonkeyPatch, - missing_dependency: str, -): - # disable the dependency - monkeypatch.setenv(missing_dependency, "0") - # create the client - settings = AppSettings.create_from_envs() - app = init_app(settings) - - with pytest.raises(ConfigurationError), TestClient( - app, raise_server_exceptions=True - ) as _: - pass - - -async def test_empty_pipeline_is_not_scheduled( - with_disabled_auto_scheduling: None, - scheduler: BaseCompScheduler, - registered_user: Callable[..., dict[str, Any]], - project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - aiopg_engine: aiopg.sa.engine.Engine, - run_metadata: RunMetadataDict, -): - user = registered_user() - empty_project = await project(user) - - # the project is not in the comp_pipeline, therefore scheduling it should fail - with pytest.raises(PipelineNotFoundError): - await scheduler.run_new_pipeline( - user_id=user["id"], - project_id=empty_project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - # create the empty pipeline now - pipeline(project_id=f"{empty_project.uuid}") - - # creating a run with an empty pipeline is useless, check the scheduler is not kicking in - await scheduler.run_new_pipeline( - user_id=user["id"], - project_id=empty_project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - assert len(scheduler._scheduled_pipelines) == 0 # noqa: SLF001 - # check the database is empty - async with aiopg_engine.acquire() as conn: - result = await conn.scalar( - comp_runs.select().where( - (comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{empty_project.uuid}") - ) # there is only one entry - ) - assert result is None +@pytest.fixture +def scheduler(initialized_app: FastAPI) -> BaseCompScheduler: + return _get_scheduler_worker(initialized_app) async def test_misconfigured_pipeline_is_not_scheduled( - with_disabled_auto_scheduling: None, + with_disabled_auto_scheduling: mock.Mock, scheduler: BaseCompScheduler, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], @@ -639,7 +473,7 @@ async def _trigger_progress_event( ) -@pytest.mark.acceptance_test() +@pytest.mark.acceptance_test async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, From 748cbb96c7bbd5091222b37bf6a70504b1b492e8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:22:31 +0100 Subject: [PATCH 045/127] rename --- .../unit/with_dbs/comp_scheduler/conftest.py | 2 +- .../with_dbs/comp_scheduler/test_manager.py | 8 ++++---- .../comp_scheduler/test_scheduler_dask.py | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py index c7abf8cdd8f..2380f3bf956 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py @@ -40,7 +40,7 @@ def mock_env( @pytest.fixture -def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: +def with_disabled_scheduler_manager(mocker: MockerFixture) -> mock.Mock: mocker.patch( "simcore_service_director_v2.modules.comp_scheduler.shutdown_manager", ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index 130bad7a2e9..1d25daff456 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -103,7 +103,7 @@ async def test_manager_starts_and_auto_schedules_pipelines( async def test_schedule_pipelines_empty_db( - with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_manager: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, @@ -121,7 +121,7 @@ async def test_schedule_pipelines_empty_db( async def test_schedule_pipelines_concurently_runs_exclusively_and_raises( - with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_manager: mock.Mock, initialized_app: FastAPI, mocker: MockerFixture, ): @@ -154,7 +154,7 @@ async def slow_limited_gather(*args, **kwargs): async def test_schedule_pipelines( - with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_manager: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, published_project: PublishedProject, @@ -246,7 +246,7 @@ async def test_schedule_pipelines( async def test_empty_pipeline_is_not_scheduled( - with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_manager: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, registered_user: Callable[..., dict[str, Any]], diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 18c0b4bdf6c..9ab9a0fce9c 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -191,7 +191,7 @@ def scheduler(initialized_app: FastAPI) -> BaseCompScheduler: async def test_misconfigured_pipeline_is_not_scheduled( - with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_manager: mock.Mock, scheduler: BaseCompScheduler, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], @@ -475,7 +475,7 @@ async def _trigger_progress_event( @pytest.mark.acceptance_test async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -860,7 +860,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta async def test_task_progress_triggers( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -923,7 +923,7 @@ async def test_task_progress_triggers( ], ) async def test_handling_of_disconnected_scheduler_dask( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -1068,7 +1068,7 @@ class RebootState: ], ) async def test_handling_scheduling_after_reboot( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, mocked_dask_client: mock.MagicMock, aiopg_engine: aiopg.sa.engine.Engine, running_project: RunningProject, @@ -1150,7 +1150,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: async def test_handling_cancellation_of_jobs_after_reboot( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, mocked_dask_client: mock.MagicMock, aiopg_engine: aiopg.sa.engine.Engine, running_project_mark_for_cancellation: RunningProject, @@ -1244,7 +1244,7 @@ def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: async def test_running_pipeline_triggers_heartbeat( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, with_fast_service_heartbeat_s: int, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, @@ -1334,7 +1334,7 @@ async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, published_project: PublishedProject, @@ -1408,7 +1408,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( [ClustersKeeperNotAvailableError], ) async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( - with_disabled_auto_scheduling: None, + with_disabled_scheduler_manager: mock.Mock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, published_project: PublishedProject, From 1a5157ed8cf83791e4a7d7c8e336be657157e8b5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 18:02:38 +0100 Subject: [PATCH 046/127] refactor and fixes --- .../modules/comp_scheduler/__init__.py | 1 - services/director-v2/tests/unit/_helpers.py | 36 ++++-- .../unit/with_dbs/comp_scheduler/conftest.py | 10 +- .../with_dbs/comp_scheduler/test_manager.py | 52 +++------ .../comp_scheduler/test_scheduler_dask.py | 103 ++++++++++-------- .../with_dbs/comp_scheduler/test_worker.py | 2 +- 6 files changed, 111 insertions(+), 93 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index c681a86d317..929178cdf8a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -3,7 +3,6 @@ from typing import Any from fastapi import FastAPI -from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context from ._constants import MODULE_NAME diff --git a/services/director-v2/tests/unit/_helpers.py b/services/director-v2/tests/unit/_helpers.py index 779d6cdd117..c0cee3df0a3 100644 --- a/services/director-v2/tests/unit/_helpers.py +++ b/services/director-v2/tests/unit/_helpers.py @@ -3,13 +3,15 @@ import aiopg import aiopg.sa +import sqlalchemy as sa from models_library.projects import ProjectAtDB from models_library.projects_nodes_io import NodeID -from simcore_postgres_database.models.comp_pipeline import StateType +from simcore_postgres_database.models.comp_runs import comp_runs from simcore_postgres_database.models.comp_tasks import comp_tasks from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB +from sqlalchemy.ext.asyncio import AsyncEngine @dataclass @@ -24,17 +26,6 @@ class RunningProject(PublishedProject): runs: CompRunsAtDB -async def set_comp_task_state( - aiopg_engine: aiopg.sa.engine.Engine, node_id: str, state: StateType -) -> None: - async with aiopg_engine.acquire() as conn: - await conn.execute( - comp_tasks.update() - .where(comp_tasks.c.node_id == node_id) - .values(state=state) - ) - - async def set_comp_task_outputs( aiopg_engine: aiopg.sa.engine.Engine, node_id: NodeID, @@ -61,3 +52,24 @@ async def set_comp_task_inputs( .where(comp_tasks.c.node_id == f"{node_id}") .values(inputs=inputs, schema={"outputs": {}, "inputs": inputs_schema}) ) + + +async def assert_comp_runs( + sqlalchemy_async_engine: AsyncEngine, + *, + expected_total: int, + where_statement: Any | None = None, +) -> list[CompRunsAtDB]: + async with sqlalchemy_async_engine.connect() as conn: + query = sa.select(comp_runs) + if where_statement is not None: + query = query.where(where_statement) + list_of_comp_runs = [ + CompRunsAtDB.from_orm(row) for row in await conn.execute(query) + ] + assert len(list_of_comp_runs) == expected_total + return list_of_comp_runs + + +async def assert_comp_runs_empty(sqlalchemy_async_engine: AsyncEngine) -> None: + await assert_comp_runs(sqlalchemy_async_engine, expected_total=0) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py index 2380f3bf956..ed4738f3d68 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py @@ -40,7 +40,7 @@ def mock_env( @pytest.fixture -def with_disabled_scheduler_manager(mocker: MockerFixture) -> mock.Mock: +def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: mocker.patch( "simcore_service_director_v2.modules.comp_scheduler.shutdown_manager", ) @@ -55,3 +55,11 @@ def with_disabled_scheduler_worker(mocker: MockerFixture) -> mock.Mock: "simcore_service_director_v2.modules.comp_scheduler.setup_worker", autospec=True, ) + + +@pytest.fixture +def with_disabled_scheduler_publisher(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._manager.request_pipeline_scheduling", + autospec=True, + ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index 1d25daff456..ed1d0a31a6f 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -16,8 +16,7 @@ from unittest import mock import pytest -import sqlalchemy as sa -from _helpers import PublishedProject +from _helpers import PublishedProject, assert_comp_runs, assert_comp_runs_empty from fastapi import FastAPI from models_library.clusters import DEFAULT_CLUSTER_ID from models_library.projects import ProjectAtDB @@ -26,10 +25,9 @@ from servicelib.rabbitmq._client import RabbitMQClient from servicelib.redis import CouldNotAcquireLockError from servicelib.utils import limited_gather -from simcore_postgres_database.models.comp_runs import comp_runs from simcore_service_director_v2.core.errors import PipelineNotFoundError from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB -from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict +from simcore_service_director_v2.models.comp_runs import RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._manager import ( SCHEDULER_INTERVAL, run_new_pipeline, @@ -58,22 +56,6 @@ async def scheduler_rabbit_client_parser( await client.unsubscribe(queue_name) -async def _assert_comp_runs( - sqlalchemy_async_engine: AsyncEngine, *, expected_total: int -) -> list[CompRunsAtDB]: - async with sqlalchemy_async_engine.connect() as conn: - list_of_comp_runs = [ - CompRunsAtDB.from_orm(row) - for row in await conn.execute(sa.select(comp_runs)) - ] - assert len(list_of_comp_runs) == expected_total - return list_of_comp_runs - - -async def _assert_comp_runs_empty(sqlalchemy_async_engine: AsyncEngine) -> None: - await _assert_comp_runs(sqlalchemy_async_engine, expected_total=0) - - @pytest.fixture def with_fast_scheduling(mocker: MockerFixture) -> None: from simcore_service_director_v2.modules.comp_scheduler import _manager @@ -98,18 +80,18 @@ async def test_manager_starts_and_auto_schedules_pipelines( initialized_app: FastAPI, sqlalchemy_async_engine: AsyncEngine, ): - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) mocked_schedule_pipelines.assert_called() async def test_schedule_pipelines_empty_db( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, scheduler_rabbit_client_parser: mock.AsyncMock, sqlalchemy_async_engine: AsyncEngine, ): - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) await schedule_pipelines(initialized_app) @@ -117,11 +99,11 @@ async def test_schedule_pipelines_empty_db( scheduler_rabbit_client_parser.assert_not_called() # check comp_runs is still empty - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) async def test_schedule_pipelines_concurently_runs_exclusively_and_raises( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, initialized_app: FastAPI, mocker: MockerFixture, ): @@ -154,7 +136,7 @@ async def slow_limited_gather(*args, **kwargs): async def test_schedule_pipelines( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, published_project: PublishedProject, @@ -162,7 +144,7 @@ async def test_schedule_pipelines( run_metadata: RunMetadataDict, scheduler_rabbit_client_parser: mock.AsyncMock, ): - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) assert published_project.project.prj_owner # now we schedule a pipeline await run_new_pipeline( @@ -182,7 +164,7 @@ async def test_schedule_pipelines( ).body() ) scheduler_rabbit_client_parser.reset_mock() - comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] assert comp_run.project_uuid == published_project.project.uuid assert comp_run.user_id == published_project.project.prj_owner @@ -198,7 +180,7 @@ async def test_schedule_pipelines( # this will now not schedule the pipeline since it was last scheduled await schedule_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_not_called() - comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] assert comp_run.last_scheduled == start_schedule_time, "scheduled time changed!" assert comp_run.cancelled is None @@ -215,7 +197,7 @@ async def test_schedule_pipelines( ).body() ) scheduler_rabbit_client_parser.reset_mock() - comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] assert comp_run.last_scheduled is not None assert comp_run.last_scheduled > start_schedule_time @@ -238,7 +220,7 @@ async def test_schedule_pipelines( ).body() ) scheduler_rabbit_client_parser.reset_mock() - comp_runs = await _assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] assert comp_run.last_scheduled is not None assert comp_run.last_scheduled > last_schedule_time @@ -246,7 +228,7 @@ async def test_schedule_pipelines( async def test_empty_pipeline_is_not_scheduled( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, registered_user: Callable[..., dict[str, Any]], @@ -257,7 +239,7 @@ async def test_empty_pipeline_is_not_scheduled( scheduler_rabbit_client_parser: mock.AsyncMock, caplog: pytest.LogCaptureFixture, ): - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) user = registered_user() empty_project = await project(user) @@ -271,7 +253,7 @@ async def test_empty_pipeline_is_not_scheduled( run_metadata=run_metadata, use_on_demand_clusters=False, ) - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) scheduler_rabbit_client_parser.assert_not_called() # create the empty pipeline now @@ -289,5 +271,5 @@ async def test_empty_pipeline_is_not_scheduled( ) assert len(caplog.records) == 1 assert "no computational dag defined" in caplog.records[0].message - await _assert_comp_runs_empty(sqlalchemy_async_engine) + await assert_comp_runs_empty(sqlalchemy_async_engine) scheduler_rabbit_client_parser.assert_not_called() diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 9ab9a0fce9c..4f0c1fcfe30 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -19,7 +19,12 @@ import aiopg import aiopg.sa import pytest -from _helpers import PublishedProject, RunningProject +from _helpers import ( + PublishedProject, + RunningProject, + assert_comp_runs, + assert_comp_runs_empty, +) from dask_task_models_library.container_tasks.errors import TaskCancelledError from dask_task_models_library.container_tasks.events import TaskProgressEvent from dask_task_models_library.container_tasks.io import TaskOutputData @@ -57,6 +62,7 @@ from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState +from simcore_service_director_v2.modules.comp_scheduler._manager import run_new_pipeline from simcore_service_director_v2.modules.comp_scheduler._scheduler_base import ( BaseCompScheduler, ) @@ -72,6 +78,7 @@ PublishedComputationTask, ) from simcore_service_director_v2.utils.dask_client_utils import TaskHandlers +from sqlalchemy.ext.asyncio import AsyncEngine from tenacity.asyncio import AsyncRetrying from tenacity.retry import retry_if_exception_type from tenacity.stop import stop_after_delay @@ -185,21 +192,29 @@ def mocked_clean_task_output_and_log_files_if_invalid( ) +@pytest.fixture +def mocked_wake_up_callback(mocker: MockerFixture) -> mock.Mock: + return mock.Mock() + + @pytest.fixture def scheduler(initialized_app: FastAPI) -> BaseCompScheduler: return _get_scheduler_worker(initialized_app) -async def test_misconfigured_pipeline_is_not_scheduled( - with_disabled_scheduler_manager: mock.Mock, +async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( + with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, + initialized_app: FastAPI, scheduler: BaseCompScheduler, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, run_metadata: RunMetadataDict, + mocked_wake_up_callback: mock.Mock, ): """A pipeline which comp_tasks are missing should not be scheduled. It shall be aborted and shown as such in the comp_runs db""" @@ -209,46 +224,48 @@ async def test_misconfigured_pipeline_is_not_scheduled( project_id=f"{sleepers_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ) - # check the pipeline is correctly added to the scheduled pipelines - await scheduler.run_new_pipeline( + await assert_comp_runs_empty(sqlalchemy_async_engine) + + # since the publisher is disabled, it will not automatically trigger a scheduling + # this is done to verify the scheduler internal state + await run_new_pipeline( + initialized_app, user_id=user["id"], project_id=sleepers_project.uuid, cluster_id=DEFAULT_CLUSTER_ID, run_metadata=run_metadata, use_on_demand_clusters=False, ) - assert len(scheduler._scheduled_pipelines) == 1 # noqa: SLF001 - for ( - u_id, - p_id, - it, - ) in scheduler._scheduled_pipelines: # noqa: SLF001 - assert u_id == user["id"] - assert p_id == sleepers_project.uuid - assert it > 0 + with_disabled_scheduler_publisher.assert_called_once() # check the database was properly updated - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_runs.select().where( - (comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}") - ) # there is only one entry - ) - run_entry = CompRunsAtDB.model_validate(await result.first()) + runs = await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + where_statement=(comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), + ) + run_entry = runs[0] assert run_entry.result == RunningState.PUBLISHED - # let the scheduler kick in - await schedule_all_pipelines(scheduler) - # check the scheduled pipelines is again empty since it's misconfigured - assert len(scheduler._scheduled_pipelines) == 0 # noqa: SLF001 + assert run_entry.metadata == run_metadata + + # run now the scheduler, this will abort the run directly + await scheduler.schedule_pipeline( + user_id=run_entry.user_id, + project_id=run_entry.project_uuid, + iteration=run_entry.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + # the pipeline is misconfigured, so the callback will NOT be called since nothing ran + mocked_wake_up_callback.assert_not_called() + # check the database entry is correctly updated - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_runs.select().where( - (comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}") - ) # there is only one entry - ) - run_entry = CompRunsAtDB.model_validate(await result.first()) + runs = await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + where_statement=(comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), + ) + run_entry = runs[0] assert run_entry.result == RunningState.ABORTED assert run_entry.metadata == run_metadata @@ -475,7 +492,7 @@ async def _trigger_progress_event( @pytest.mark.acceptance_test async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -860,7 +877,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta async def test_task_progress_triggers( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -923,7 +940,7 @@ async def test_task_progress_triggers( ], ) async def test_handling_of_disconnected_scheduler_dask( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -1068,7 +1085,7 @@ class RebootState: ], ) async def test_handling_scheduling_after_reboot( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, aiopg_engine: aiopg.sa.engine.Engine, running_project: RunningProject, @@ -1150,7 +1167,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: async def test_handling_cancellation_of_jobs_after_reboot( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, aiopg_engine: aiopg.sa.engine.Engine, running_project_mark_for_cancellation: RunningProject, @@ -1244,7 +1261,7 @@ def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: async def test_running_pipeline_triggers_heartbeat( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, with_fast_service_heartbeat_s: int, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, @@ -1334,7 +1351,7 @@ async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, published_project: PublishedProject, @@ -1408,7 +1425,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( [ClustersKeeperNotAvailableError], ) async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( - with_disabled_scheduler_manager: mock.Mock, + with_disabled_auto_scheduling: mock.Mock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, published_project: PublishedProject, diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index 51ae202cfba..a73b6887f39 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -1,4 +1,4 @@ -import FastAPI +from fastapi import FastAPI from simcore_service_director_v2.modules.comp_scheduler._worker import ( _get_scheduler_worker, ) From 2eacd17bb3483bc32f57989dff0e4d0cbc6f5ad7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 19 Nov 2024 19:23:43 +0100 Subject: [PATCH 047/127] refactoring before testing --- .../modules/comp_scheduler/_scheduler_base.py | 4 +- services/director-v2/tests/unit/_helpers.py | 34 +- .../comp_scheduler/test_scheduler_dask.py | 804 ++++++++++++------ 3 files changed, 568 insertions(+), 274 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 97f9a921c24..30f5a55878f 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -493,7 +493,9 @@ async def schedule_pipeline( user_id: UserID, project_id: ProjectID, iteration: Iteration, - wake_up_callback: Callable[[], None], + wake_up_callback: Callable[ + [], None + ], # TODO: this should not be in the interface ) -> None: with log_context( _logger, diff --git a/services/director-v2/tests/unit/_helpers.py b/services/director-v2/tests/unit/_helpers.py index c0cee3df0a3..6fa68330683 100644 --- a/services/director-v2/tests/unit/_helpers.py +++ b/services/director-v2/tests/unit/_helpers.py @@ -4,8 +4,10 @@ import aiopg import aiopg.sa import sqlalchemy as sa -from models_library.projects import ProjectAtDB +from models_library.projects import ProjectAtDB, ProjectID from models_library.projects_nodes_io import NodeID +from models_library.projects_state import RunningState +from pydantic import parse_obj_as from simcore_postgres_database.models.comp_runs import comp_runs from simcore_postgres_database.models.comp_tasks import comp_tasks from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB @@ -58,6 +60,7 @@ async def assert_comp_runs( sqlalchemy_async_engine: AsyncEngine, *, expected_total: int, + expected_state: RunningState | None = None, where_statement: Any | None = None, ) -> list[CompRunsAtDB]: async with sqlalchemy_async_engine.connect() as conn: @@ -68,8 +71,37 @@ async def assert_comp_runs( CompRunsAtDB.from_orm(row) for row in await conn.execute(query) ] assert len(list_of_comp_runs) == expected_total + if list_of_comp_runs and expected_state: + assert all( + r.result is expected_state for r in list_of_comp_runs + ), f"expected state '{expected_state}', got {[r.result for r in list_of_comp_runs]}" return list_of_comp_runs async def assert_comp_runs_empty(sqlalchemy_async_engine: AsyncEngine) -> None: await assert_comp_runs(sqlalchemy_async_engine, expected_total=0) + + +async def assert_comp_tasks( + sqlalchemy_async_engine: AsyncEngine, + *, + project_uuid: ProjectID, + task_ids: list[NodeID], + expected_state: RunningState, + expected_progress: float | None, +) -> None: + # check the database is correctly updated, the run is published + async with sqlalchemy_async_engine.connect() as conn: + result = await conn.execute( + comp_tasks.select().where( + (comp_tasks.c.project_id == f"{project_uuid}") + & (comp_tasks.c.node_id.in_([f"{n}" for n in task_ids])) + ) # there is only one entry + ) + tasks = parse_obj_as(list[CompTaskAtDB], result.fetchall()) + assert all( + t.state == expected_state for t in tasks + ), f"expected state: {expected_state}, found: {[t.state for t in tasks]}" + assert all( + t.progress == expected_progress for t in tasks + ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 4f0c1fcfe30..5c239c77dc2 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -16,14 +16,13 @@ from typing import Any, cast from unittest import mock -import aiopg -import aiopg.sa import pytest from _helpers import ( PublishedProject, RunningProject, assert_comp_runs, assert_comp_runs_empty, + assert_comp_tasks, ) from dask_task_models_library.container_tasks.errors import TaskCancelledError from dask_task_models_library.container_tasks.events import TaskProgressEvent @@ -48,7 +47,7 @@ from pytest_mock.plugin import MockerFixture from servicelib.rabbitmq import RabbitMQClient from simcore_postgres_database.models.comp_runs import comp_runs -from simcore_postgres_database.models.comp_tasks import NodeClass, comp_tasks +from simcore_postgres_database.models.comp_tasks import NodeClass from simcore_service_director_v2.core.errors import ( ClustersKeeperNotAvailableError, ComputationalBackendNotConnectedError, @@ -62,7 +61,10 @@ from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState -from simcore_service_director_v2.modules.comp_scheduler._manager import run_new_pipeline +from simcore_service_director_v2.modules.comp_scheduler._manager import ( + run_new_pipeline, + stop_pipeline, +) from simcore_service_director_v2.modules.comp_scheduler._scheduler_base import ( BaseCompScheduler, ) @@ -78,6 +80,7 @@ PublishedComputationTask, ) from simcore_service_director_v2.utils.dask_client_utils import TaskHandlers +from sqlalchemy import and_ from sqlalchemy.ext.asyncio import AsyncEngine from tenacity.asyncio import AsyncRetrying from tenacity.retry import retry_if_exception_type @@ -111,50 +114,6 @@ def _assert_dask_client_correctly_initialized( ) -async def _assert_comp_run_db( - aiopg_engine: aiopg.sa.engine.Engine, - pub_project: PublishedProject, - expected_state: RunningState, -) -> None: - # check the database is correctly updated, the run is published - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_runs.select().where( - (comp_runs.c.user_id == pub_project.project.prj_owner) - & (comp_runs.c.project_uuid == f"{pub_project.project.uuid}") - ) # there is only one entry - ) - run_entry = CompRunsAtDB.model_validate(await result.first()) - assert ( - run_entry.result == expected_state - ), f"comp_runs: expected state '{expected_state}, found '{run_entry.result}'" - - -async def _assert_comp_tasks_db( - aiopg_engine: aiopg.sa.engine.Engine, - project_uuid: ProjectID, - task_ids: list[NodeID], - *, - expected_state: RunningState, - expected_progress: float | None, -) -> None: - # check the database is correctly updated, the run is published - async with aiopg_engine.acquire() as conn: - result = await conn.execute( - comp_tasks.select().where( - (comp_tasks.c.project_id == f"{project_uuid}") - & (comp_tasks.c.node_id.in_([f"{n}" for n in task_ids])) - ) # there is only one entry - ) - tasks = TypeAdapter(list[CompTaskAtDB]).validate_python(await result.fetchall()) - assert all( - t.state == expected_state for t in tasks - ), f"expected state: {expected_state}, found: {[t.state for t in tasks]}" - assert all( - t.progress == expected_progress for t in tasks - ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" - - @pytest.fixture def mocked_dask_client(mocker: MockerFixture) -> mock.Mock: mocked_dask_client = mocker.patch( @@ -241,12 +200,11 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( runs = await assert_comp_runs( sqlalchemy_async_engine, expected_total=1, + expected_state=RunningState.PUBLISHED, where_statement=(comp_runs.c.user_id == user["id"]) & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), ) run_entry = runs[0] - assert run_entry.result == RunningState.PUBLISHED - assert run_entry.metadata == run_metadata # run now the scheduler, this will abort the run directly await scheduler.schedule_pipeline( @@ -259,62 +217,60 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( mocked_wake_up_callback.assert_not_called() # check the database entry is correctly updated - runs = await assert_comp_runs( + await assert_comp_runs( sqlalchemy_async_engine, expected_total=1, + expected_state=RunningState.ABORTED, where_statement=(comp_runs.c.user_id == user["id"]) & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), ) - run_entry = runs[0] - assert run_entry.result == RunningState.ABORTED - assert run_entry.metadata == run_metadata async def _assert_start_pipeline( - aiopg_engine, + app: FastAPI, + *, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, - scheduler: BaseCompScheduler, run_metadata: RunMetadataDict, -) -> list[CompTaskAtDB]: +) -> tuple[CompRunsAtDB, list[CompTaskAtDB]]: exp_published_tasks = deepcopy(published_project.tasks) assert published_project.project.prj_owner - await scheduler.run_new_pipeline( + await run_new_pipeline( + app, user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, cluster_id=DEFAULT_CLUSTER_ID, run_metadata=run_metadata, use_on_demand_clusters=False, ) - assert ( - len(scheduler._scheduled_pipelines) == 1 # noqa: SLF001 - ), "the pipeline is not scheduled!" - for ( - u_id, - p_id, - it, - ) in scheduler._scheduled_pipelines: # noqa: SLF001 - assert u_id == published_project.project.prj_owner - assert p_id == published_project.project.uuid - assert it > 0 # check the database is correctly updated, the run is published - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in exp_published_tasks], + runs = await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PUBLISHED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in exp_published_tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, ) - return exp_published_tasks + return runs[0], exp_published_tasks async def _assert_schedule_pipeline_PENDING( # noqa: N802 - aiopg_engine, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, published_tasks: list[CompTaskAtDB], mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, + wake_up_callback: Callable[[], None], ) -> list[CompTaskAtDB]: expected_pending_tasks = [ published_tasks[1], @@ -327,21 +283,35 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] return [DaskClientTaskState.PENDING for job_id in job_ids] mocked_dask_client.get_tasks_status.side_effect = _return_tasks_pending - await schedule_all_pipelines(scheduler) + assert published_project.project.prj_owner + await scheduler.schedule_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + wake_up_callback=wake_up_callback, + ) _assert_dask_client_correctly_initialized(mocked_dask_client, scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PUBLISHED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) # the other tasks are still waiting in published state - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in published_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in published_tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, # since we bypass the API entrypoint this is correct ) @@ -366,19 +336,30 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] mocked_dask_client.get_tasks_status.assert_not_called() mocked_dask_client.get_task_result.assert_not_called() # there is a second run of the scheduler to move comp_runs to pending, the rest does not change - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], + await scheduler.schedule_pipeline( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + wake_up_callback=wake_up_callback, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PENDING, + where_statement=(comp_runs.c.user_id == published_project.project.prj_owner) + & (comp_runs.c.project_uuid == f"{published_project.project.uuid}"), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in published_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in published_tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, ) @@ -493,30 +474,37 @@ async def _trigger_progress_event( @pytest.mark.acceptance_test async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, + initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, mocked_parse_output_data_fct: mock.Mock, mocked_clean_task_output_and_log_files_if_invalid: None, instrumentation_rabbit_client_parser: mock.AsyncMock, resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, + mocked_wake_up_callback: mock.Mock, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) - expected_published_tasks = await _assert_start_pipeline( - aiopg_engine, published_project, scheduler, run_metadata + run_in_db, expected_published_tasks = await _assert_start_pipeline( + initialized_app, + sqlalchemy_async_engine=sqlalchemy_async_engine, + published_project=published_project, + run_metadata=run_metadata, ) # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the worker can take them expected_pending_tasks = await _assert_schedule_pipeline_PENDING( - aiopg_engine, + sqlalchemy_async_engine, published_project, expected_published_tasks, mocked_dask_client, scheduler, + mocked_wake_up_callback, ) # ------------------------------------------------------------------------------- @@ -536,28 +524,46 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta ] mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - - await schedule_all_pipelines(scheduler) - - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await asyncio.sleep(0) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PENDING, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], expected_state=RunningState.PENDING, expected_progress=None, ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_published_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_published_tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, # since we bypass the API entrypoint this is correct ) @@ -583,27 +589,40 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta node_id=exp_started_task.node_id, ) - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) # comp_run, the comp_task switch to STARTED - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], expected_state=RunningState.STARTED, expected_progress=0, ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_published_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_published_tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, ) @@ -649,12 +668,25 @@ async def _return_random_task_result(job_id) -> TaskOutputData: return TaskOutputData.model_validate({"out_1": None, "out_2": 45}) mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], expected_state=RunningState.SUCCESS, expected_progress=1, ) @@ -674,17 +706,17 @@ async def _return_random_task_result(job_id) -> TaskOutputData: completed_tasks = [exp_started_task] next_pending_task = published_project.tasks[2] expected_pending_tasks.append(next_pending_task) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [p.node_id for p in expected_pending_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [ + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[ p.node_id for p in published_project.tasks if p not in expected_pending_tasks + completed_tasks @@ -746,12 +778,25 @@ async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskSta project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], expected_state=RunningState.STARTED, expected_progress=0, ) @@ -790,12 +835,25 @@ async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_failed mocked_dask_client.get_task_result.side_effect = None - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], expected_state=RunningState.FAILED, expected_progress=1, ) @@ -839,13 +897,26 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta mocked_dask_client.get_task_result.side_effect = _return_random_task_result # trigger the scheduler, it should switch to FAILED, as we are done - await schedule_all_pipelines(scheduler) - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.FAILED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [exp_started_task.node_id], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], expected_state=RunningState.SUCCESS, expected_progress=1, ) @@ -872,32 +943,35 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta assert isinstance(messages[0], RabbitResourceTrackingStartedMessage) assert isinstance(messages[1], RabbitResourceTrackingStoppedMessage) - # the scheduled pipeline shall be removed - assert scheduler._scheduled_pipelines == {} # noqa: SLF001 - async def test_task_progress_triggers( with_disabled_auto_scheduling: mock.Mock, + initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, mocked_parse_output_data_fct: None, mocked_clean_task_output_and_log_files_if_invalid: None, run_metadata: RunMetadataDict, + mocked_wake_up_callback: mock.Mock, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) - expected_published_tasks = await _assert_start_pipeline( - aiopg_engine, published_project, scheduler, run_metadata + _run_in_db, expected_published_tasks = await _assert_start_pipeline( + initialized_app, + sqlalchemy_async_engine=sqlalchemy_async_engine, + published_project=published_project, + run_metadata=run_metadata, ) # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the worker can take them expected_pending_tasks = await _assert_schedule_pipeline_PENDING( - aiopg_engine, + sqlalchemy_async_engine, published_project, expected_published_tasks, mocked_dask_client, scheduler, + mocked_wake_up_callback, ) # send some progress @@ -920,10 +994,10 @@ async def test_task_progress_triggers( DaskScheduler, scheduler )._task_progress_change_handler(progress_event.model_dump_json()) # NOTE: not sure whether it should switch to STARTED.. it would make sense - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [started_task.node_id], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[started_task.node_id], expected_state=RunningState.STARTED, expected_progress=min(max(0, progress), 1), ) @@ -941,13 +1015,15 @@ async def test_task_progress_triggers( ) async def test_handling_of_disconnected_scheduler_dask( with_disabled_auto_scheduling: mock.Mock, + initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, mocker: MockerFixture, published_project: PublishedProject, backend_error: ComputationalSchedulerError, run_metadata: RunMetadataDict, + mocked_wake_up_callback: mock.Mock, ): # this will create a non connected backend issue that will trigger re-connection mocked_dask_client_send_task = mocker.patch( @@ -958,7 +1034,8 @@ async def test_handling_of_disconnected_scheduler_dask( # running the pipeline will now raise and the tasks are set back to PUBLISHED assert published_project.project.prj_owner - await scheduler.run_new_pipeline( + await run_new_pipeline( + initialized_app, user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, cluster_id=DEFAULT_CLUSTER_ID, @@ -968,28 +1045,43 @@ async def test_handling_of_disconnected_scheduler_dask( # since there is no cluster, there is no dask-scheduler, # the tasks shall all still be in PUBLISHED state now - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) + runs_in_db = await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PUBLISHED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + run_in_db = runs_in_db[0] - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in published_project.tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in published_project.tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, ) # on the next iteration of the pipeline it will try to re-connect # now try to abort the tasks since we are wondering what is happening, this should auto-trigger the scheduler - await scheduler.stop_pipeline( + await stop_pipeline( + initialized_app, user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, ) # we ensure the scheduler was run - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) # after this step the tasks are marked as ABORTED - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [ + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[ t.node_id for t in published_project.tasks if t.node_class == NodeClass.COMPUTATIONAL @@ -998,9 +1090,22 @@ async def test_handling_of_disconnected_scheduler_dask( expected_progress=1, ) # then we have another scheduler run - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) # now the run should be ABORTED - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.ABORTED) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.ABORTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) @dataclass(frozen=True, kw_only=True) @@ -1087,12 +1192,13 @@ class RebootState: async def test_handling_scheduling_after_reboot( with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, running_project: RunningProject, scheduler: BaseCompScheduler, mocked_parse_output_data_fct: mock.MagicMock, mocked_clean_task_output_fct: mock.MagicMock, reboot_state: RebootState, + mocked_wake_up_callback: mock.Mock, ): """After the dask client is rebooted, or that the director-v2 reboots the dv-2 internal scheduler shall continue scheduling correctly. Even though the task might have continued to run @@ -1109,8 +1215,13 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: return reboot_state.task_result mocked_dask_client.get_task_result.side_effect = mocked_get_task_result - - await schedule_all_pipelines(scheduler) + assert running_project.project.prj_owner + await scheduler.schedule_pipeline( + user_id=running_project.project.prj_owner, + project_id=running_project.project.uuid, + iteration=1, + wake_up_callback=mocked_wake_up_callback, + ) # the status will be called once for all RUNNING tasks mocked_dask_client.get_tasks_status.assert_called_once() if reboot_state.expected_run_state in COMPLETED_STATES: @@ -1142,10 +1253,10 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: else: mocked_clean_task_output_fct.assert_not_called() - await _assert_comp_tasks_db( - aiopg_engine, - running_project.project.uuid, - [ + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=running_project.project.uuid, + task_ids=[ running_project.tasks[1].node_id, running_project.tasks[2].node_id, running_project.tasks[3].node_id, @@ -1153,40 +1264,58 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: expected_state=reboot_state.expected_task_state_group1, expected_progress=reboot_state.expected_task_progress_group1, ) - await _assert_comp_tasks_db( - aiopg_engine, - running_project.project.uuid, - [running_project.tasks[4].node_id], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=running_project.project.uuid, + task_ids=[running_project.tasks[4].node_id], expected_state=reboot_state.expected_task_state_group2, expected_progress=reboot_state.expected_task_progress_group2, ) assert running_project.project.prj_owner - await _assert_comp_run_db( - aiopg_engine, running_project, reboot_state.expected_run_state + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=reboot_state.expected_run_state, + where_statement=and_( + comp_runs.c.user_id == running_project.project.prj_owner, + comp_runs.c.project_uuid == f"{running_project.project.uuid}", + ), ) async def test_handling_cancellation_of_jobs_after_reboot( with_disabled_auto_scheduling: mock.Mock, mocked_dask_client: mock.MagicMock, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, running_project_mark_for_cancellation: RunningProject, scheduler: BaseCompScheduler, mocked_parse_output_data_fct: mock.MagicMock, mocked_clean_task_output_fct: mock.MagicMock, + mocked_wake_up_callback: mock.Mock, ): """A running pipeline was cancelled by a user and the DV-2 was restarted BEFORE It could actually cancel the task. On reboot the DV-2 shall recover and actually cancel the pipeline properly""" # check initial status - await _assert_comp_run_db( - aiopg_engine, running_project_mark_for_cancellation, RunningState.STARTED - ) - await _assert_comp_tasks_db( - aiopg_engine, - running_project_mark_for_cancellation.project.uuid, - [t.node_id for t in running_project_mark_for_cancellation.tasks], + run_in_db = ( + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id + == running_project_mark_for_cancellation.project.prj_owner, + comp_runs.c.project_uuid + == f"{running_project_mark_for_cancellation.project.uuid}", + ), + ) + )[0] + + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=running_project_mark_for_cancellation.project.uuid, + task_ids=[t.node_id for t in running_project_mark_for_cancellation.tasks], expected_state=RunningState.STARTED, expected_progress=0, ) @@ -1197,7 +1326,12 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status # Running the scheduler, should actually cancel the run now - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) mocked_dask_client.abort_computation_task.assert_called() assert mocked_dask_client.abort_computation_task.call_count == len( [ @@ -1207,10 +1341,10 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat ] ) # in the DB they are still running, they will be stopped in the next iteration - await _assert_comp_tasks_db( - aiopg_engine, - running_project_mark_for_cancellation.project.uuid, - [ + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=running_project_mark_for_cancellation.project.uuid, + task_ids=[ t.node_id for t in running_project_mark_for_cancellation.tasks if t.node_class == NodeClass.COMPUTATIONAL @@ -1218,8 +1352,16 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat expected_state=RunningState.STARTED, expected_progress=0, ) - await _assert_comp_run_db( - aiopg_engine, running_project_mark_for_cancellation, RunningState.STARTED + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id + == running_project_mark_for_cancellation.project.prj_owner, + comp_runs.c.project_uuid + == f"{running_project_mark_for_cancellation.project.uuid}", + ), ) # the backend shall now report the tasks as aborted @@ -1234,12 +1376,17 @@ async def _return_random_task_result(job_id) -> TaskOutputData: raise TaskCancelledError mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) # now should be stopped - await _assert_comp_tasks_db( - aiopg_engine, - running_project_mark_for_cancellation.project.uuid, - [ + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=running_project_mark_for_cancellation.project.uuid, + task_ids=[ t.node_id for t in running_project_mark_for_cancellation.tasks if t.node_class == NodeClass.COMPUTATIONAL @@ -1247,8 +1394,16 @@ async def _return_random_task_result(job_id) -> TaskOutputData: expected_state=RunningState.ABORTED, expected_progress=1, ) - await _assert_comp_run_db( - aiopg_engine, running_project_mark_for_cancellation, RunningState.ABORTED + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.ABORTED, + where_statement=and_( + comp_runs.c.user_id + == running_project_mark_for_cancellation.project.prj_owner, + comp_runs.c.project_uuid + == f"{running_project_mark_for_cancellation.project.uuid}", + ), ) mocked_clean_task_output_fct.assert_called() @@ -1263,25 +1418,31 @@ def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: async def test_running_pipeline_triggers_heartbeat( with_disabled_auto_scheduling: mock.Mock, with_fast_service_heartbeat_s: int, + initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, + mocked_wake_up_callback: mock.Mock, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) - expected_published_tasks = await _assert_start_pipeline( - aiopg_engine, published_project, scheduler, run_metadata + run_in_db, expected_published_tasks = await _assert_start_pipeline( + initialized_app, + sqlalchemy_async_engine=sqlalchemy_async_engine, + published_project=published_project, + run_metadata=run_metadata, ) # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the worker can take them expected_pending_tasks = await _assert_schedule_pipeline_PENDING( - aiopg_engine, + sqlalchemy_async_engine, published_project, expected_published_tasks, mocked_dask_client, scheduler, + mocked_wake_up_callback, ) # ------------------------------------------------------------------------------- # 2. the "worker" starts processing a task @@ -1308,7 +1469,12 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, @@ -1320,8 +1486,18 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 3. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await schedule_all_pipelines(scheduler) - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, 1, @@ -1332,8 +1508,18 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 4. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await schedule_all_pipelines(scheduler) - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, 1, @@ -1352,12 +1538,14 @@ async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( with_disabled_auto_scheduling: mock.Mock, + initialized_app: FastAPI, scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, run_metadata: RunMetadataDict, mocked_get_or_create_cluster: mock.Mock, faker: Faker, + mocked_wake_up_callback: mock.Mock, ): mocked_get_or_create_cluster.side_effect = ( ComputationalBackendOnDemandNotReadyError( @@ -1366,7 +1554,8 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( ) # running the pipeline will trigger a call to the clusters-keeper assert published_project.project.prj_owner - await scheduler.run_new_pipeline( + await run_new_pipeline( + initialized_app, user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, cluster_id=DEFAULT_CLUSTER_ID, @@ -1375,11 +1564,21 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( ) # we ask to use an on-demand cluster, therefore the tasks are published first - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in published_project.tasks], + run_in_db = ( + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PUBLISHED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + )[0] + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in published_project.tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, ) @@ -1389,32 +1588,54 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( published_project.tasks[1], published_project.tasks[3], ] - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 mocked_get_or_create_cluster.reset_mock() - await _assert_comp_run_db( - aiopg_engine, published_project, RunningState.WAITING_FOR_CLUSTER + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.WAITING_FOR_CLUSTER, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_waiting_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in expected_waiting_tasks], expected_state=RunningState.WAITING_FOR_CLUSTER, expected_progress=None, ) # again will trigger the same response - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 mocked_get_or_create_cluster.reset_mock() - await _assert_comp_run_db( - aiopg_engine, published_project, RunningState.WAITING_FOR_CLUSTER + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.WAITING_FOR_CLUSTER, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), ) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_waiting_tasks], + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in expected_waiting_tasks], expected_state=RunningState.WAITING_FOR_CLUSTER, expected_progress=None, ) @@ -1426,17 +1647,20 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( ) async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( with_disabled_auto_scheduling: mock.Mock, + initialized_app: FastAPI, scheduler: BaseCompScheduler, - aiopg_engine: aiopg.sa.engine.Engine, + sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, run_metadata: RunMetadataDict, mocked_get_or_create_cluster: mock.Mock, get_or_create_exception: Exception, + mocked_wake_up_callback: mock.Mock, ): mocked_get_or_create_cluster.side_effect = get_or_create_exception # running the pipeline will trigger a call to the clusters-keeper assert published_project.project.prj_owner - await scheduler.run_new_pipeline( + await run_new_pipeline( + initialized_app, user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, cluster_id=DEFAULT_CLUSTER_ID, @@ -1445,11 +1669,21 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( ) # we ask to use an on-demand cluster, therefore the tasks are published first - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in published_project.tasks], + run_in_db = ( + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PUBLISHED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + )[0] + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in published_project.tasks], expected_state=RunningState.PUBLISHED, expected_progress=None, ) @@ -1458,26 +1692,52 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( published_project.tasks[1], published_project.tasks[3], ] - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 mocked_get_or_create_cluster.reset_mock() - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_failed_tasks], + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.FAILED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in expected_failed_tasks], expected_state=RunningState.FAILED, expected_progress=1.0, ) # again will not re-trigger the call to clusters-keeper - await schedule_all_pipelines(scheduler) + await scheduler.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + wake_up_callback=mocked_wake_up_callback, + ) mocked_get_or_create_cluster.assert_not_called() - await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) - await _assert_comp_tasks_db( - aiopg_engine, - published_project.project.uuid, - [t.node_id for t in expected_failed_tasks], + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.FAILED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[t.node_id for t in expected_failed_tasks], expected_state=RunningState.FAILED, expected_progress=1.0, ) From b3e7ac20336fc1e22344ec72280c5dc40fa2e5bc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:29:13 +0100 Subject: [PATCH 048/127] test if fixed --- .../with_dbs/comp_scheduler/test_scheduler_dask.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 5c239c77dc2..483ce2770d9 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -530,13 +530,6 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta iteration=run_in_db.iteration, wake_up_callback=mocked_wake_up_callback, ) - await asyncio.sleep(0) - await scheduler.schedule_pipeline( - user_id=run_in_db.user_id, - project_id=run_in_db.project_uuid, - iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, - ) await assert_comp_runs( sqlalchemy_async_engine, expected_total=1, @@ -946,6 +939,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta async def test_task_progress_triggers( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, @@ -1015,6 +1009,7 @@ async def test_task_progress_triggers( ) async def test_handling_of_disconnected_scheduler_dask( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, @@ -1191,6 +1186,7 @@ class RebootState: ) async def test_handling_scheduling_after_reboot( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, mocked_dask_client: mock.MagicMock, sqlalchemy_async_engine: AsyncEngine, running_project: RunningProject, @@ -1285,6 +1281,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: async def test_handling_cancellation_of_jobs_after_reboot( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, mocked_dask_client: mock.MagicMock, sqlalchemy_async_engine: AsyncEngine, running_project_mark_for_cancellation: RunningProject, @@ -1417,6 +1414,7 @@ def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: async def test_running_pipeline_triggers_heartbeat( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, with_fast_service_heartbeat_s: int, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, @@ -1538,6 +1536,7 @@ async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, scheduler: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, @@ -1647,6 +1646,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( ) async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, scheduler: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, From 68f9f56694f3728f5f0135092c92053f9a13391d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:31:47 +0100 Subject: [PATCH 049/127] missing service dependencies --- .../tests/unit/with_dbs/comp_scheduler/test_worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index a73b6887f39..db6901ca192 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -3,6 +3,9 @@ _get_scheduler_worker, ) +pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] +pytest_simcore_ops_services_selection = ["adminer"] + async def test_worker_starts_and_stops(initialized_app: FastAPI): assert _get_scheduler_worker(initialized_app) is not None From 7e418b3d27923728a392502418c19b169a43cd44 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:35:58 +0100 Subject: [PATCH 050/127] added basic test --- .../with_dbs/comp_scheduler/test_worker.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index db6901ca192..298cf93e4dd 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -1,5 +1,23 @@ +# pylint:disable=unused-variable +# pylint:disable=unused-argument +# pylint:disable=redefined-outer-name +# pylint:disable=no-value-for-parameter +# pylint:disable=protected-access +# pylint:disable=too-many-arguments +# pylint:disable=no-name-in-module +# pylint: disable=too-many-statements + +from unittest import mock + +import pytest +from _helpers import PublishedProject from fastapi import FastAPI +from models_library.clusters import DEFAULT_CLUSTER_ID +from pytest_mock import MockerFixture +from simcore_service_director_v2.models.comp_runs import RunMetadataDict +from simcore_service_director_v2.modules.comp_scheduler._manager import run_new_pipeline from simcore_service_director_v2.modules.comp_scheduler._worker import ( + _empty_wake_up_callack, _get_scheduler_worker, ) @@ -9,3 +27,46 @@ async def test_worker_starts_and_stops(initialized_app: FastAPI): assert _get_scheduler_worker(initialized_app) is not None + + +@pytest.fixture +def mock_schedule_pipeline(mocker: MockerFixture) -> mock.Mock: + mock_scheduler_worker = mock.Mock() + mock_scheduler_worker.schedule_pipeline = mocker.AsyncMock(return_value=True) + return mock_scheduler_worker + + +@pytest.fixture +def mocked_get_scheduler_worker( + mocker: MockerFixture, + mock_schedule_pipeline: mock.Mock, +) -> mock.Mock: + # Mock `_get_scheduler_worker` to return our mock scheduler + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._worker._get_scheduler_worker", + return_value=mock_schedule_pipeline, + ) + + +async def test_worker_properly_calls_scheduler_api( + initialized_app: FastAPI, + mocked_get_scheduler_worker: mock.Mock, + published_project: PublishedProject, + run_metadata: RunMetadataDict, +): + assert published_project.project.prj_owner + await run_new_pipeline( + initialized_app, + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + mocked_get_scheduler_worker.assert_called_once_with(initialized_app) + mocked_get_scheduler_worker.return_value.schedule_pipeline.assert_called_once_with( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + wake_up_callback=_empty_wake_up_callack, + ) From f9ff4e574b429ce87551d8f8e9d9d7ea2391a294 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:37:35 +0100 Subject: [PATCH 051/127] ensure we call shutdown on the worker as well --- .../modules/comp_scheduler/__init__.py | 3 ++- .../tests/unit/with_dbs/comp_scheduler/conftest.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index 929178cdf8a..5dc7b020e6c 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -7,7 +7,7 @@ from ._constants import MODULE_NAME from ._manager import run_new_pipeline, setup_manager, shutdown_manager, stop_pipeline -from ._worker import setup_worker +from ._worker import setup_worker, shutdown_worker _logger = logging.getLogger(__name__) @@ -25,6 +25,7 @@ def on_app_shutdown(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def stop_scheduler() -> None: with log_context(_logger, level=logging.INFO, msg=f"stopping {MODULE_NAME}"): await shutdown_manager(app) + await shutdown_worker(app) # TODO: we might want to stop anything running in the worker too diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py index ed4738f3d68..8f1c2898222 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/conftest.py @@ -51,6 +51,10 @@ def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.Mock: @pytest.fixture def with_disabled_scheduler_worker(mocker: MockerFixture) -> mock.Mock: + mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler.shutdown_worker", + autospec=True, + ) return mocker.patch( "simcore_service_director_v2.modules.comp_scheduler.setup_worker", autospec=True, From 962b5dab35a1242bad92c351f6818a696ca832ae Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:58:58 +0100 Subject: [PATCH 052/127] the callback is run in a separate thread --- .../modules/comp_scheduler/_worker.py | 26 ++++++++++++------- .../with_dbs/comp_scheduler/test_worker.py | 3 +-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 92d532e6872..4b43981a9c9 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -1,12 +1,14 @@ import functools import logging -from typing import cast +from typing import Callable, cast from fastapi import FastAPI +from models_library.projects import ProjectID +from models_library.users import UserID from servicelib.logging_utils import log_context from ..rabbitmq import get_rabbitmq_client -from ._models import SchedulePipelineRabbitMessage +from ._models import Iteration, SchedulePipelineRabbitMessage from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler @@ -14,8 +16,11 @@ def _empty_wake_up_callack( - # app: FastAPI, user_id: UserID, project_id: ProjectID, iteration: Iteration -) -> None: + app: FastAPI, user_id: UserID, project_id: ProjectID, iteration: Iteration +) -> Callable[[], None]: + def _cb() -> None: + ... + # async def _async_cb(): # db_engine = get_db_engine(app) # rabbit_mq_client = get_rabbitmq_client(app) @@ -23,7 +28,7 @@ def _empty_wake_up_callack( # user_id=user_id, project_id=project_id, iteration=iteration # ) # await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) - ... + return _cb def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: @@ -34,15 +39,16 @@ async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: with log_context(_logger, logging.DEBUG, msg="handling scheduling"): to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) - # get_rabbitmq_client(app).publish( - # SchedulePipelineRabbitMessage.get_channel_name(), - # to_schedule_pipeline, - # ) await _get_scheduler_worker(app).schedule_pipeline( user_id=to_schedule_pipeline.user_id, project_id=to_schedule_pipeline.project_id, iteration=to_schedule_pipeline.iteration, - wake_up_callback=_empty_wake_up_callack, + wake_up_callback=_empty_wake_up_callack( + app, + to_schedule_pipeline.user_id, + to_schedule_pipeline.project_id, + to_schedule_pipeline.iteration, + ), ) return True diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index 298cf93e4dd..66d1b4ff2d7 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -17,7 +17,6 @@ from simcore_service_director_v2.models.comp_runs import RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._manager import run_new_pipeline from simcore_service_director_v2.modules.comp_scheduler._worker import ( - _empty_wake_up_callack, _get_scheduler_worker, ) @@ -68,5 +67,5 @@ async def test_worker_properly_calls_scheduler_api( user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, - wake_up_callback=_empty_wake_up_callack, + wake_up_callback=mock.ANY, ) From 6aba458df042e71c6c3b851be0fa762a876c6d98 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 23:00:32 +0100 Subject: [PATCH 053/127] add documentation --- .../modules/comp_scheduler/_scheduler_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 30f5a55878f..7cab4c42f0e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -497,6 +497,11 @@ async def schedule_pipeline( [], None ], # TODO: this should not be in the interface ) -> None: + """schedules a pipeline for a given user, project and iteration. + + Arguments: + wake_up_callback -- a callback function that is called in a separate thread everytime a pipeline node is completed + """ with log_context( _logger, level=logging.INFO, From dac33f36b59d5a00192d3c2e964cff8ca8006a11 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 23:05:46 +0100 Subject: [PATCH 054/127] removed wake_up callback from api interface --- .../modules/comp_scheduler/_scheduler_base.py | 23 +++++++++-- .../modules/comp_scheduler/_worker.py | 28 +------------- .../comp_scheduler/test_scheduler_dask.py | 38 ------------------- .../with_dbs/comp_scheduler/test_worker.py | 1 - 4 files changed, 21 insertions(+), 69 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 7cab4c42f0e..05b36d9c9a4 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -77,6 +77,22 @@ _MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 +def _temporary_empty_wake_up_callack( + user_id: UserID, project_id: ProjectID, iteration: Iteration +) -> Callable[[], None]: + def _cb() -> None: + ... + + # async def _async_cb(): + # db_engine = get_db_engine(app) + # rabbit_mq_client = get_rabbitmq_client(app) + # comp_run = await CompRunsRepository.instance(db_engine).get( + # user_id=user_id, project_id=project_id, iteration=iteration + # ) + # await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) + return _cb + + @dataclass(frozen=True, slots=True) class SortedTasks: started: list[CompTaskAtDB] @@ -493,9 +509,6 @@ async def schedule_pipeline( user_id: UserID, project_id: ProjectID, iteration: Iteration, - wake_up_callback: Callable[ - [], None - ], # TODO: this should not be in the interface ) -> None: """schedules a pipeline for a given user, project and iteration. @@ -534,7 +547,9 @@ async def schedule_pipeline( comp_tasks=comp_tasks, dag=dag, comp_run=comp_run, - wake_up_callback=wake_up_callback, + wake_up_callback=_temporary_empty_wake_up_callack( + user_id, project_id, iteration + ), ) # 4. timeout if waiting for cluster has been there for more than X minutes comp_tasks = await self._timeout_if_waiting_for_cluster_too_long( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 4b43981a9c9..5a5acfbc36c 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -1,36 +1,18 @@ import functools import logging -from typing import Callable, cast +from typing import cast from fastapi import FastAPI -from models_library.projects import ProjectID -from models_library.users import UserID from servicelib.logging_utils import log_context from ..rabbitmq import get_rabbitmq_client -from ._models import Iteration, SchedulePipelineRabbitMessage +from ._models import SchedulePipelineRabbitMessage from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler _logger = logging.getLogger(__name__) -def _empty_wake_up_callack( - app: FastAPI, user_id: UserID, project_id: ProjectID, iteration: Iteration -) -> Callable[[], None]: - def _cb() -> None: - ... - - # async def _async_cb(): - # db_engine = get_db_engine(app) - # rabbit_mq_client = get_rabbitmq_client(app) - # comp_run = await CompRunsRepository.instance(db_engine).get( - # user_id=user_id, project_id=project_id, iteration=iteration - # ) - # await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) - return _cb - - def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: return cast(BaseCompScheduler, app.state.scheduler_worker) @@ -43,12 +25,6 @@ async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: user_id=to_schedule_pipeline.user_id, project_id=to_schedule_pipeline.project_id, iteration=to_schedule_pipeline.iteration, - wake_up_callback=_empty_wake_up_callack( - app, - to_schedule_pipeline.user_id, - to_schedule_pipeline.project_id, - to_schedule_pipeline.iteration, - ), ) return True diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 483ce2770d9..8ace69b2a77 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -173,7 +173,6 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( fake_workbench_adjacency: dict[str, Any], sqlalchemy_async_engine: AsyncEngine, run_metadata: RunMetadataDict, - mocked_wake_up_callback: mock.Mock, ): """A pipeline which comp_tasks are missing should not be scheduled. It shall be aborted and shown as such in the comp_runs db""" @@ -211,10 +210,7 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( user_id=run_entry.user_id, project_id=run_entry.project_uuid, iteration=run_entry.iteration, - wake_up_callback=mocked_wake_up_callback, ) - # the pipeline is misconfigured, so the callback will NOT be called since nothing ran - mocked_wake_up_callback.assert_not_called() # check the database entry is correctly updated await assert_comp_runs( @@ -270,7 +266,6 @@ async def _assert_schedule_pipeline_PENDING( # noqa: N802 published_tasks: list[CompTaskAtDB], mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, - wake_up_callback: Callable[[], None], ) -> list[CompTaskAtDB]: expected_pending_tasks = [ published_tasks[1], @@ -288,7 +283,6 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, - wake_up_callback=wake_up_callback, ) _assert_dask_client_correctly_initialized(mocked_dask_client, scheduler) await assert_comp_runs( @@ -340,7 +334,6 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, - wake_up_callback=wake_up_callback, ) await assert_comp_runs( sqlalchemy_async_engine, @@ -485,7 +478,6 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 instrumentation_rabbit_client_parser: mock.AsyncMock, resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, - mocked_wake_up_callback: mock.Mock, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) @@ -504,7 +496,6 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 expected_published_tasks, mocked_dask_client, scheduler, - mocked_wake_up_callback, ) # ------------------------------------------------------------------------------- @@ -528,7 +519,6 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await assert_comp_runs( sqlalchemy_async_engine, @@ -586,7 +576,6 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) # comp_run, the comp_task switch to STARTED await assert_comp_runs( @@ -665,7 +654,6 @@ async def _return_random_task_result(job_id) -> TaskOutputData: user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await assert_comp_runs( sqlalchemy_async_engine, @@ -775,7 +763,6 @@ async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await assert_comp_runs( sqlalchemy_async_engine, @@ -832,7 +819,6 @@ async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskStat user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await assert_comp_runs( sqlalchemy_async_engine, @@ -894,7 +880,6 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await assert_comp_runs( sqlalchemy_async_engine, @@ -948,7 +933,6 @@ async def test_task_progress_triggers( mocked_parse_output_data_fct: None, mocked_clean_task_output_and_log_files_if_invalid: None, run_metadata: RunMetadataDict, - mocked_wake_up_callback: mock.Mock, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) _run_in_db, expected_published_tasks = await _assert_start_pipeline( @@ -965,7 +949,6 @@ async def test_task_progress_triggers( expected_published_tasks, mocked_dask_client, scheduler, - mocked_wake_up_callback, ) # send some progress @@ -1018,7 +1001,6 @@ async def test_handling_of_disconnected_scheduler_dask( published_project: PublishedProject, backend_error: ComputationalSchedulerError, run_metadata: RunMetadataDict, - mocked_wake_up_callback: mock.Mock, ): # this will create a non connected backend issue that will trigger re-connection mocked_dask_client_send_task = mocker.patch( @@ -1070,7 +1052,6 @@ async def test_handling_of_disconnected_scheduler_dask( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) # after this step the tasks are marked as ABORTED await assert_comp_tasks( @@ -1089,7 +1070,6 @@ async def test_handling_of_disconnected_scheduler_dask( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) # now the run should be ABORTED await assert_comp_runs( @@ -1194,7 +1174,6 @@ async def test_handling_scheduling_after_reboot( mocked_parse_output_data_fct: mock.MagicMock, mocked_clean_task_output_fct: mock.MagicMock, reboot_state: RebootState, - mocked_wake_up_callback: mock.Mock, ): """After the dask client is rebooted, or that the director-v2 reboots the dv-2 internal scheduler shall continue scheduling correctly. Even though the task might have continued to run @@ -1216,7 +1195,6 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: user_id=running_project.project.prj_owner, project_id=running_project.project.uuid, iteration=1, - wake_up_callback=mocked_wake_up_callback, ) # the status will be called once for all RUNNING tasks mocked_dask_client.get_tasks_status.assert_called_once() @@ -1288,7 +1266,6 @@ async def test_handling_cancellation_of_jobs_after_reboot( scheduler: BaseCompScheduler, mocked_parse_output_data_fct: mock.MagicMock, mocked_clean_task_output_fct: mock.MagicMock, - mocked_wake_up_callback: mock.Mock, ): """A running pipeline was cancelled by a user and the DV-2 was restarted BEFORE It could actually cancel the task. On reboot the DV-2 shall recover @@ -1327,7 +1304,6 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) mocked_dask_client.abort_computation_task.assert_called() assert mocked_dask_client.abort_computation_task.call_count == len( @@ -1377,7 +1353,6 @@ async def _return_random_task_result(job_id) -> TaskOutputData: user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) # now should be stopped await assert_comp_tasks( @@ -1423,7 +1398,6 @@ async def test_running_pipeline_triggers_heartbeat( published_project: PublishedProject, resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, - mocked_wake_up_callback: mock.Mock, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) run_in_db, expected_published_tasks = await _assert_start_pipeline( @@ -1440,7 +1414,6 @@ async def test_running_pipeline_triggers_heartbeat( expected_published_tasks, mocked_dask_client, scheduler, - mocked_wake_up_callback, ) # ------------------------------------------------------------------------------- # 2. the "worker" starts processing a task @@ -1471,7 +1444,6 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) messages = await _assert_message_received( @@ -1488,13 +1460,11 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await scheduler.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, @@ -1510,13 +1480,11 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) await scheduler.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, @@ -1544,7 +1512,6 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( run_metadata: RunMetadataDict, mocked_get_or_create_cluster: mock.Mock, faker: Faker, - mocked_wake_up_callback: mock.Mock, ): mocked_get_or_create_cluster.side_effect = ( ComputationalBackendOnDemandNotReadyError( @@ -1591,7 +1558,6 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 @@ -1617,7 +1583,6 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 @@ -1654,7 +1619,6 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( run_metadata: RunMetadataDict, mocked_get_or_create_cluster: mock.Mock, get_or_create_exception: Exception, - mocked_wake_up_callback: mock.Mock, ): mocked_get_or_create_cluster.side_effect = get_or_create_exception # running the pipeline will trigger a call to the clusters-keeper @@ -1696,7 +1660,6 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 @@ -1722,7 +1685,6 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, - wake_up_callback=mocked_wake_up_callback, ) mocked_get_or_create_cluster.assert_not_called() await assert_comp_runs( diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index 66d1b4ff2d7..4872a5c001f 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -67,5 +67,4 @@ async def test_worker_properly_calls_scheduler_api( user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, - wake_up_callback=mock.ANY, ) From a6ee7c7f0b8d2536477bd903c08959ecc3fb1258 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 23:21:25 +0100 Subject: [PATCH 055/127] add no cover for abstract methods --- .../modules/comp_scheduler/_scheduler_base.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 05b36d9c9a4..83170cf6eb0 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -80,8 +80,7 @@ def _temporary_empty_wake_up_callack( user_id: UserID, project_id: ProjectID, iteration: Iteration ) -> Callable[[], None]: - def _cb() -> None: - ... + def _cb() -> None: ... # async def _async_cb(): # db_engine = get_db_engine(app) @@ -482,17 +481,17 @@ async def _start_tasks( scheduled_tasks: dict[NodeID, CompTaskAtDB], comp_run: CompRunsAtDB, wake_up_callback: Callable[[], None], - ) -> None: ... + ) -> None: ... # pragma: no cover @abstractmethod async def _get_tasks_status( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> list[RunningState]: ... + ) -> list[RunningState]: ... # pragma: no cover @abstractmethod async def _stop_tasks( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> None: ... + ) -> None: ... # pragma: no cover @abstractmethod async def _process_completed_tasks( @@ -501,7 +500,7 @@ async def _process_completed_tasks( tasks: list[CompTaskAtDB], iteration: Iteration, comp_run: CompRunsAtDB, - ) -> None: ... + ) -> None: ... # pragma: no cover async def schedule_pipeline( self, From 7ab4dc0ff3203db427edff43db42a8ff81f1db69 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 20 Nov 2024 23:22:40 +0100 Subject: [PATCH 056/127] revert --- .../modules/comp_scheduler/_scheduler_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 83170cf6eb0..3e92549b24b 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -481,17 +481,17 @@ async def _start_tasks( scheduled_tasks: dict[NodeID, CompTaskAtDB], comp_run: CompRunsAtDB, wake_up_callback: Callable[[], None], - ) -> None: ... # pragma: no cover + ) -> None: ... @abstractmethod async def _get_tasks_status( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> list[RunningState]: ... # pragma: no cover + ) -> list[RunningState]: ... @abstractmethod async def _stop_tasks( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> None: ... # pragma: no cover + ) -> None: ... @abstractmethod async def _process_completed_tasks( @@ -500,7 +500,7 @@ async def _process_completed_tasks( tasks: list[CompTaskAtDB], iteration: Iteration, comp_run: CompRunsAtDB, - ) -> None: ... # pragma: no cover + ) -> None: ... async def schedule_pipeline( self, From 71e0079184a2823a0399d4eec8ea0d76811fd5c5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:40:24 +0100 Subject: [PATCH 057/127] use new style --- .coveragerc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.coveragerc b/.coveragerc index fb3d7c12624..ebf1465b0fb 100644 --- a/.coveragerc +++ b/.coveragerc @@ -7,25 +7,20 @@ parallel = True [report] # Regexes for lines to exclude from consideration -exclude_lines = - # Have to re-enable the standard pragma - pragma: no cover - +exclude_also = # Don't complain about missing debug-only code: def __repr__ if self\.debug - # Don't complain if tests don't hit defensive assertion code: raise AssertionError raise NotImplementedError - # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: if __name__ == __main__.: + class .*\bProtocol\): # Don't complain about abstract methods, they aren't run: @(abc\.)?abstract(((class|static)?method)|property) - # Don't complain about type checking if TYPE_CHECKING: From d7c49a01d14ba2dee89fcd96254feb4f2f7f7e64 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:40:52 +0100 Subject: [PATCH 058/127] use docstrings so that coverage is correctly computed --- .../modules/comp_scheduler/_scheduler_base.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 3e92549b24b..00fd1f8efab 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -481,17 +481,20 @@ async def _start_tasks( scheduled_tasks: dict[NodeID, CompTaskAtDB], comp_run: CompRunsAtDB, wake_up_callback: Callable[[], None], - ) -> None: ... + ) -> None: + """start tasks in the 3rd party backend""" @abstractmethod async def _get_tasks_status( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> list[RunningState]: ... + ) -> list[RunningState]: + """returns tasks status from the 3rd party backend""" @abstractmethod async def _stop_tasks( self, user_id: UserID, tasks: list[CompTaskAtDB], comp_run: CompRunsAtDB - ) -> None: ... + ) -> None: + """stop tasks in the 3rd party backend""" @abstractmethod async def _process_completed_tasks( @@ -500,7 +503,8 @@ async def _process_completed_tasks( tasks: list[CompTaskAtDB], iteration: Iteration, comp_run: CompRunsAtDB, - ) -> None: ... + ) -> None: + """process tasks from the 3rd party backend""" async def schedule_pipeline( self, From 62f34c9a2d5417676fc234c5e0065cd25da2f187 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:32:17 +0100 Subject: [PATCH 059/127] pyv2 --- services/director-v2/tests/unit/_helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/director-v2/tests/unit/_helpers.py b/services/director-v2/tests/unit/_helpers.py index 6fa68330683..4e89ce7b79c 100644 --- a/services/director-v2/tests/unit/_helpers.py +++ b/services/director-v2/tests/unit/_helpers.py @@ -7,7 +7,7 @@ from models_library.projects import ProjectAtDB, ProjectID from models_library.projects_nodes_io import NodeID from models_library.projects_state import RunningState -from pydantic import parse_obj_as +from pydantic import TypeAdapter from simcore_postgres_database.models.comp_runs import comp_runs from simcore_postgres_database.models.comp_tasks import comp_tasks from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB @@ -68,7 +68,7 @@ async def assert_comp_runs( if where_statement is not None: query = query.where(where_statement) list_of_comp_runs = [ - CompRunsAtDB.from_orm(row) for row in await conn.execute(query) + CompRunsAtDB.model_validate(row) for row in await conn.execute(query) ] assert len(list_of_comp_runs) == expected_total if list_of_comp_runs and expected_state: @@ -98,7 +98,7 @@ async def assert_comp_tasks( & (comp_tasks.c.node_id.in_([f"{n}" for n in task_ids])) ) # there is only one entry ) - tasks = parse_obj_as(list[CompTaskAtDB], result.fetchall()) + tasks = TypeAdapter(list[CompTaskAtDB]).validate_python(result.fetchall()) assert all( t.state == expected_state for t in tasks ), f"expected state: {expected_state}, found: {[t.state for t in tasks]}" From 77014303f885e822ef90d09efbb7b37f22f32a43 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:32:33 +0100 Subject: [PATCH 060/127] unskip test --- .../tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 8ace69b2a77..9cc99551f38 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -1094,9 +1094,6 @@ class RebootState: expected_run_state: RunningState -@pytest.mark.skip( - reason="awaiting refactor in https://github.com/ITISFoundation/osparc-simcore/pull/6736" -) @pytest.mark.parametrize( "reboot_state", [ From 2fa420a4f82b6aef84455451aedf4308d8e03cde Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:37:49 +0100 Subject: [PATCH 061/127] improve name --- .../tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 9cc99551f38..e39b901782a 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -1161,7 +1161,7 @@ class RebootState: ), ], ) -async def test_handling_scheduling_after_reboot( +async def test_handling_scheduled_tasks_after_director_reboots( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_publisher: mock.Mock, mocked_dask_client: mock.MagicMock, From a79df13364e4674a3c96b4fd478f94171fa2d32f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:44:02 +0100 Subject: [PATCH 062/127] add docs --- .../tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index e39b901782a..e0c4288605e 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -1617,6 +1617,8 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( mocked_get_or_create_cluster: mock.Mock, get_or_create_exception: Exception, ): + # needs to change: https://github.com/ITISFoundation/osparc-simcore/issues/6817 + mocked_get_or_create_cluster.side_effect = get_or_create_exception # running the pipeline will trigger a call to the clusters-keeper assert published_project.project.prj_owner From a758376843d93c8b10059b603e48dc4862435bd4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:50:34 +0100 Subject: [PATCH 063/127] merge --- ...uled_time.py => 4885182f6206_add_last_scheduled_time.py} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename packages/postgres-database/src/simcore_postgres_database/migration/versions/{1be37720e832_add_last_scheduled_time.py => 4885182f6206_add_last_scheduled_time.py} (86%) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/4885182f6206_add_last_scheduled_time.py similarity index 86% rename from packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py rename to packages/postgres-database/src/simcore_postgres_database/migration/versions/4885182f6206_add_last_scheduled_time.py index eaf4d5f116c..e153106401a 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/1be37720e832_add_last_scheduled_time.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/4885182f6206_add_last_scheduled_time.py @@ -1,15 +1,15 @@ """add last scheduled time -Revision ID: 1be37720e832 +Revision ID: 4885182f6206 Revises: 8e1f83486be7 -Create Date: 2024-11-15 16:12:08.825985+00:00 +Create Date: 2024-11-22 15:50:17.798131+00:00 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. -revision = "1be37720e832" +revision = "4885182f6206" down_revision = "8e1f83486be7" branch_labels = None depends_on = None From 75b7983cf002966688e6f1a451b649f20a89197a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:49:23 +0100 Subject: [PATCH 064/127] add doc for next PR --- .../modules/comp_scheduler/_publisher.py | 3 ++- .../modules/comp_scheduler/_scheduler_base.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py index d11b1a65704..bd4f9af8c51 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py @@ -9,7 +9,8 @@ async def request_pipeline_scheduling( run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine ) -> None: - # TODO: we should use the transaction and the asyncpg engine here to ensure 100% consistency + # NOTE: we should use the transaction and the asyncpg engine here to ensure 100% consistency + # https://github.com/ITISFoundation/osparc-simcore/issues/6818 # async with transaction_context(get_asyncpg_engine(app)) as connection: await rabbitmq_client.publish( SchedulePipelineRabbitMessage.get_channel_name(), diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 00fd1f8efab..b2c9734b031 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -39,7 +39,6 @@ ComputationalBackendNotConnectedError, ComputationalBackendOnDemandNotReadyError, ComputationalSchedulerChangedError, - ComputationalSchedulerError, DaskClientAcquisisitonError, InvalidPipelineError, PipelineNotFoundError, @@ -80,7 +79,8 @@ def _temporary_empty_wake_up_callack( user_id: UserID, project_id: ProjectID, iteration: Iteration ) -> Callable[[], None]: - def _cb() -> None: ... + def _cb() -> None: + ... # async def _async_cb(): # db_engine = get_db_engine(app) @@ -688,9 +688,9 @@ async def _schedule_tasks_to_start( # noqa: C901 RunningState.WAITING_FOR_CLUSTER, ) for task in tasks_ready_to_start: - comp_tasks[NodeIDStr(f"{task}")].state = ( - RunningState.WAITING_FOR_CLUSTER - ) + comp_tasks[ + NodeIDStr(f"{task}") + ].state = RunningState.WAITING_FOR_CLUSTER except ComputationalBackendOnDemandNotReadyError as exc: _logger.info( @@ -712,9 +712,9 @@ async def _schedule_tasks_to_start( # noqa: C901 RunningState.WAITING_FOR_CLUSTER, ) for task in tasks_ready_to_start: - comp_tasks[NodeIDStr(f"{task}")].state = ( - RunningState.WAITING_FOR_CLUSTER - ) + comp_tasks[ + NodeIDStr(f"{task}") + ].state = RunningState.WAITING_FOR_CLUSTER except ClustersKeeperNotAvailableError: _logger.exception("Unexpected error while starting tasks:") await publish_project_log( From 328e518768cfb40177c2a26d3ef91c0709542ef7 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:59:25 +0100 Subject: [PATCH 065/127] add doc for next PR --- .../comp_scheduler/test_scheduler_dask.py | 204 +++++++++--------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index e0c4288605e..392d9414600 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -152,76 +152,10 @@ def mocked_clean_task_output_and_log_files_if_invalid( @pytest.fixture -def mocked_wake_up_callback(mocker: MockerFixture) -> mock.Mock: - return mock.Mock() - - -@pytest.fixture -def scheduler(initialized_app: FastAPI) -> BaseCompScheduler: +def scheduler_api(initialized_app: FastAPI) -> BaseCompScheduler: return _get_scheduler_worker(initialized_app) -async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( - with_disabled_auto_scheduling: mock.Mock, - with_disabled_scheduler_publisher: mock.Mock, - initialized_app: FastAPI, - scheduler: BaseCompScheduler, - registered_user: Callable[..., dict[str, Any]], - project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - fake_workbench_without_outputs: dict[str, Any], - fake_workbench_adjacency: dict[str, Any], - sqlalchemy_async_engine: AsyncEngine, - run_metadata: RunMetadataDict, -): - """A pipeline which comp_tasks are missing should not be scheduled. - It shall be aborted and shown as such in the comp_runs db""" - user = registered_user() - sleepers_project = await project(user, workbench=fake_workbench_without_outputs) - pipeline( - project_id=f"{sleepers_project.uuid}", - dag_adjacency_list=fake_workbench_adjacency, - ) - await assert_comp_runs_empty(sqlalchemy_async_engine) - - # since the publisher is disabled, it will not automatically trigger a scheduling - # this is done to verify the scheduler internal state - await run_new_pipeline( - initialized_app, - user_id=user["id"], - project_id=sleepers_project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, - ) - with_disabled_scheduler_publisher.assert_called_once() - # check the database was properly updated - runs = await assert_comp_runs( - sqlalchemy_async_engine, - expected_total=1, - expected_state=RunningState.PUBLISHED, - where_statement=(comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), - ) - run_entry = runs[0] - - # run now the scheduler, this will abort the run directly - await scheduler.schedule_pipeline( - user_id=run_entry.user_id, - project_id=run_entry.project_uuid, - iteration=run_entry.iteration, - ) - - # check the database entry is correctly updated - await assert_comp_runs( - sqlalchemy_async_engine, - expected_total=1, - expected_state=RunningState.ABORTED, - where_statement=(comp_runs.c.user_id == user["id"]) - & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), - ) - - async def _assert_start_pipeline( app: FastAPI, *, @@ -470,7 +404,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, mocked_parse_output_data_fct: mock.Mock, @@ -481,6 +415,9 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + # + # Initiate new pipeline run + # run_in_db, expected_published_tasks = await _assert_start_pipeline( initialized_app, sqlalchemy_async_engine=sqlalchemy_async_engine, @@ -495,7 +432,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 published_project, expected_published_tasks, mocked_dask_client, - scheduler, + scheduler_api, ) # ------------------------------------------------------------------------------- @@ -515,7 +452,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta ] mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -565,14 +502,14 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta assert exp_started_task.node_id assert published_project.project.prj_owner await _trigger_progress_event( - scheduler, + scheduler_api, job_id=exp_started_task.job_id, user_id=published_project.project.prj_owner, project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -650,7 +587,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: return TaskOutputData.model_validate({"out_1": None, "out_2": 45}) mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -753,13 +690,13 @@ async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # trigger the scheduler, run state should keep to STARTED, task should be as well assert exp_started_task.job_id await _trigger_progress_event( - scheduler, + scheduler_api, job_id=exp_started_task.job_id, user_id=published_project.project.prj_owner, project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -815,7 +752,7 @@ async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_failed mocked_dask_client.get_task_result.side_effect = None - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -876,7 +813,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta mocked_dask_client.get_task_result.side_effect = _return_random_task_result # trigger the scheduler, it should switch to FAILED, as we are done - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -922,12 +859,75 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta assert isinstance(messages[1], RabbitResourceTrackingStoppedMessage) +async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( + with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, + initialized_app: FastAPI, + scheduler_api: BaseCompScheduler, + registered_user: Callable[..., dict[str, Any]], + project: Callable[..., Awaitable[ProjectAtDB]], + pipeline: Callable[..., CompPipelineAtDB], + fake_workbench_without_outputs: dict[str, Any], + fake_workbench_adjacency: dict[str, Any], + sqlalchemy_async_engine: AsyncEngine, + run_metadata: RunMetadataDict, +): + """A pipeline which comp_tasks are missing should not be scheduled. + It shall be aborted and shown as such in the comp_runs db""" + user = registered_user() + sleepers_project = await project(user, workbench=fake_workbench_without_outputs) + pipeline( + project_id=f"{sleepers_project.uuid}", + dag_adjacency_list=fake_workbench_adjacency, + ) + await assert_comp_runs_empty(sqlalchemy_async_engine) + + # + # Initiate new pipeline scheduling + # + await run_new_pipeline( + initialized_app, + user_id=user["id"], + project_id=sleepers_project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + with_disabled_scheduler_publisher.assert_called_once() + # we shall have a a new comp_runs row with the new pipeline job + run_entry = ( + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PUBLISHED, + where_statement=(comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), + ) + )[0] + + # + # Trigger scheduling manually. since the pipeline is broken, it shall be aborted + # + await scheduler_api.schedule_pipeline( + user_id=run_entry.user_id, + project_id=run_entry.project_uuid, + iteration=run_entry.iteration, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.ABORTED, + where_statement=(comp_runs.c.user_id == user["id"]) + & (comp_runs.c.project_uuid == f"{sleepers_project.uuid}"), + ) + + async def test_task_progress_triggers( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, mocked_parse_output_data_fct: None, @@ -948,7 +948,7 @@ async def test_task_progress_triggers( published_project, expected_published_tasks, mocked_dask_client, - scheduler, + scheduler_api, ) # send some progress @@ -968,7 +968,7 @@ async def test_task_progress_triggers( ), ) await cast( # noqa: SLF001 - DaskScheduler, scheduler + DaskScheduler, scheduler_api )._task_progress_change_handler(progress_event.model_dump_json()) # NOTE: not sure whether it should switch to STARTED.. it would make sense await assert_comp_tasks( @@ -995,7 +995,7 @@ async def test_handling_of_disconnected_scheduler_dask( with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, mocker: MockerFixture, published_project: PublishedProject, @@ -1048,7 +1048,7 @@ async def test_handling_of_disconnected_scheduler_dask( project_id=published_project.project.uuid, ) # we ensure the scheduler was run - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1066,7 +1066,7 @@ async def test_handling_of_disconnected_scheduler_dask( expected_progress=1, ) # then we have another scheduler run - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1167,7 +1167,7 @@ async def test_handling_scheduled_tasks_after_director_reboots( mocked_dask_client: mock.MagicMock, sqlalchemy_async_engine: AsyncEngine, running_project: RunningProject, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, mocked_parse_output_data_fct: mock.MagicMock, mocked_clean_task_output_fct: mock.MagicMock, reboot_state: RebootState, @@ -1188,7 +1188,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: mocked_dask_client.get_task_result.side_effect = mocked_get_task_result assert running_project.project.prj_owner - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=running_project.project.prj_owner, project_id=running_project.project.uuid, iteration=1, @@ -1260,7 +1260,7 @@ async def test_handling_cancellation_of_jobs_after_reboot( mocked_dask_client: mock.MagicMock, sqlalchemy_async_engine: AsyncEngine, running_project_mark_for_cancellation: RunningProject, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, mocked_parse_output_data_fct: mock.MagicMock, mocked_clean_task_output_fct: mock.MagicMock, ): @@ -1297,7 +1297,7 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status # Running the scheduler, should actually cancel the run now - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1346,7 +1346,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: raise TaskCancelledError mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1390,7 +1390,7 @@ async def test_running_pipeline_triggers_heartbeat( with_fast_service_heartbeat_s: int, initialized_app: FastAPI, mocked_dask_client: mock.MagicMock, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, resource_tracking_rabbit_client_parser: mock.AsyncMock, @@ -1410,7 +1410,7 @@ async def test_running_pipeline_triggers_heartbeat( published_project, expected_published_tasks, mocked_dask_client, - scheduler, + scheduler_api, ) # ------------------------------------------------------------------------------- # 2. the "worker" starts processing a task @@ -1431,13 +1431,13 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta assert exp_started_task.job_id assert published_project.project.prj_owner await _trigger_progress_event( - scheduler, + scheduler_api, job_id=exp_started_task.job_id, user_id=published_project.project.prj_owner, project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1453,12 +1453,12 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 3. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, ) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1473,12 +1473,12 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 4. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, ) - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1503,7 +1503,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, run_metadata: RunMetadataDict, @@ -1551,7 +1551,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( published_project.tasks[1], published_project.tasks[3], ] - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1576,7 +1576,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( expected_progress=None, ) # again will trigger the same response - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1610,7 +1610,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_publisher: mock.Mock, initialized_app: FastAPI, - scheduler: BaseCompScheduler, + scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, run_metadata: RunMetadataDict, @@ -1655,7 +1655,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( published_project.tasks[1], published_project.tasks[3], ] - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1680,7 +1680,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( expected_progress=1.0, ) # again will not re-trigger the call to clusters-keeper - await scheduler.schedule_pipeline( + await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, From dec4b4be088650a70f0184f51b748f4e3b250aee Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:59:38 +0100 Subject: [PATCH 066/127] ruff --- .../tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 392d9414600..4866dee8588 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -398,7 +398,7 @@ async def _trigger_progress_event( ) -@pytest.mark.acceptance_test +@pytest.mark.acceptance_test() async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_publisher: mock.Mock, From 34def3731d8d1567e4fde566f2aa03a1c1c5d7e3 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 18:11:55 +0100 Subject: [PATCH 067/127] fix after merge --- ..._time.py => da1700f9eceb_added_last_scheduled.py} | 12 ++++++------ .../simcore_postgres_database/models/comp_runs.py | 2 +- .../with_dbs/comp_scheduler/test_scheduler_dask.py | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) rename packages/postgres-database/src/simcore_postgres_database/migration/versions/{4885182f6206_add_last_scheduled_time.py => da1700f9eceb_added_last_scheduled.py} (75%) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/4885182f6206_add_last_scheduled_time.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/da1700f9eceb_added_last_scheduled.py similarity index 75% rename from packages/postgres-database/src/simcore_postgres_database/migration/versions/4885182f6206_add_last_scheduled_time.py rename to packages/postgres-database/src/simcore_postgres_database/migration/versions/da1700f9eceb_added_last_scheduled.py index e153106401a..7ce24024d00 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/4885182f6206_add_last_scheduled_time.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/da1700f9eceb_added_last_scheduled.py @@ -1,16 +1,16 @@ -"""add last scheduled time +"""added_last_scheduled -Revision ID: 4885182f6206 -Revises: 8e1f83486be7 -Create Date: 2024-11-22 15:50:17.798131+00:00 +Revision ID: da1700f9eceb +Revises: c9db8bf5091e +Create Date: 2024-11-24 17:11:30.519365+00:00 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. -revision = "4885182f6206" -down_revision = "8e1f83486be7" +revision = "da1700f9eceb" +down_revision = "c9db8bf5091e" branch_labels = None depends_on = None diff --git a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py index 3cdf2a9b3bb..06d8ea97252 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py @@ -111,7 +111,7 @@ "last_scheduled", sa.DateTime(timezone=True), nullable=True, - doc="last time the pipeline was scheduled", + doc="last time the pipeline was scheduled to be processed", ), sa.Column("metadata", JSONB, nullable=True, doc="the run optional metadata"), sa.Column( diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 4866dee8588..f87b1aa1e84 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -424,6 +424,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 published_project=published_project, run_metadata=run_metadata, ) + with_disabled_scheduler_publisher.assert_called() # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the worker can take them From 819b71fa83a06814dbd634aaafa33c2760d1c4e5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 18:19:00 +0100 Subject: [PATCH 068/127] cleanup --- services/director-v2/tests/unit/conftest.py | 2 +- .../comp_scheduler/test_scheduler_dask.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/services/director-v2/tests/unit/conftest.py b/services/director-v2/tests/unit/conftest.py index 3e1764c4e55..cdf0751fab4 100644 --- a/services/director-v2/tests/unit/conftest.py +++ b/services/director-v2/tests/unit/conftest.py @@ -186,7 +186,7 @@ def fake_s3_settings(faker: Faker) -> S3Settings: @pytest.fixture def fake_s3_envs(fake_s3_settings: S3Settings) -> EnvVarsDict: - return fake_s3_settings.dict() + return fake_s3_settings.model_dump() @pytest.fixture diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index f87b1aa1e84..c4444ed9a7e 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -408,7 +408,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, mocked_parse_output_data_fct: mock.Mock, - mocked_clean_task_output_and_log_files_if_invalid: None, + mocked_clean_task_output_and_log_files_if_invalid: mock.Mock, instrumentation_rabbit_client_parser: mock.AsyncMock, resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, @@ -758,6 +758,9 @@ async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskStat project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, ) + mocked_clean_task_output_and_log_files_if_invalid.assert_called_once() + mocked_clean_task_output_and_log_files_if_invalid.reset_mock() + await assert_comp_runs( sqlalchemy_async_engine, expected_total=1, @@ -819,6 +822,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, ) + mocked_clean_task_output_and_log_files_if_invalid.assert_not_called() await assert_comp_runs( sqlalchemy_async_engine, expected_total=1, @@ -931,8 +935,8 @@ async def test_task_progress_triggers( scheduler_api: BaseCompScheduler, sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, - mocked_parse_output_data_fct: None, - mocked_clean_task_output_and_log_files_if_invalid: None, + mocked_parse_output_data_fct: mock.Mock, + mocked_clean_task_output_and_log_files_if_invalid: mock.Mock, run_metadata: RunMetadataDict, ): _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) @@ -1169,8 +1173,8 @@ async def test_handling_scheduled_tasks_after_director_reboots( sqlalchemy_async_engine: AsyncEngine, running_project: RunningProject, scheduler_api: BaseCompScheduler, - mocked_parse_output_data_fct: mock.MagicMock, - mocked_clean_task_output_fct: mock.MagicMock, + mocked_parse_output_data_fct: mock.Mock, + mocked_clean_task_output_fct: mock.Mock, reboot_state: RebootState, ): """After the dask client is rebooted, or that the director-v2 reboots the dv-2 internal scheduler @@ -1262,8 +1266,8 @@ async def test_handling_cancellation_of_jobs_after_reboot( sqlalchemy_async_engine: AsyncEngine, running_project_mark_for_cancellation: RunningProject, scheduler_api: BaseCompScheduler, - mocked_parse_output_data_fct: mock.MagicMock, - mocked_clean_task_output_fct: mock.MagicMock, + mocked_parse_output_data_fct: mock.Mock, + mocked_clean_task_output_fct: mock.Mock, ): """A running pipeline was cancelled by a user and the DV-2 was restarted BEFORE It could actually cancel the task. On reboot the DV-2 shall recover From a4bca6071cd45def5c872f74ebaef3b0d6f9fd9b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 18:29:54 +0100 Subject: [PATCH 069/127] cleanup --- .../comp_scheduler/test_scheduler_dask.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index c4444ed9a7e..07fc32050b7 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -413,6 +413,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, ): + with_disabled_auto_scheduling.assert_called_once() _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) # @@ -427,7 +428,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 with_disabled_scheduler_publisher.assert_called() # ------------------------------------------------------------------------------- - # 1. first run will move comp_tasks to PENDING so the worker can take them + # 1. first run will move comp_tasks to PENDING so the dask-worker can take them expected_pending_tasks = await _assert_schedule_pipeline_PENDING( sqlalchemy_async_engine, published_project, @@ -437,7 +438,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 ) # ------------------------------------------------------------------------------- - # 2.1. the worker might be taking the task, until we get a progress we do not know + # 2.1. the dask-worker might be taking the task, until we get a progress we do not know # whether it effectively started or it is still queued in the worker process exp_started_task = expected_pending_tasks[0] expected_pending_tasks.remove(exp_started_task) @@ -470,14 +471,8 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta await assert_comp_tasks( sqlalchemy_async_engine, project_uuid=published_project.project.uuid, - task_ids=[exp_started_task.node_id], - expected_state=RunningState.PENDING, - expected_progress=None, - ) - await assert_comp_tasks( - sqlalchemy_async_engine, - project_uuid=published_project.project.uuid, - task_ids=[p.node_id for p in expected_pending_tasks], + task_ids=[exp_started_task.node_id] + + [p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) @@ -496,8 +491,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta mocked_dask_client.get_task_result.assert_not_called() # ------------------------------------------------------------------------------- - # 3. the "worker" starts processing a task - # here we trigger a progress from the worker + # 3. the dask-worker starts processing a task here we simulate a progress event assert exp_started_task.job_id assert exp_started_task.project_id assert exp_started_task.node_id @@ -552,6 +546,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta ) mocked_dask_client.get_tasks_status.reset_mock() mocked_dask_client.get_task_result.assert_not_called() + # check the metrics are properly published messages = await _assert_message_received( instrumentation_rabbit_client_parser, 1, @@ -560,9 +555,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta assert messages[0].metrics == "service_started" assert messages[0].service_uuid == exp_started_task.node_id - def _parser(x) -> RabbitResourceTrackingMessages: - return TypeAdapter(RabbitResourceTrackingMessages).validate_json(x) - + # check the RUT messages are properly published messages = await _assert_message_received( resource_tracking_rabbit_client_parser, 1, @@ -571,7 +564,7 @@ def _parser(x) -> RabbitResourceTrackingMessages: assert messages[0].node_id == exp_started_task.node_id # ------------------------------------------------------------------------------- - # 4. the "worker" completed the task successfully + # 4. the dask-worker completed the task successfully async def _return_1st_task_success(job_ids: list[str]) -> list[DaskClientTaskState]: return [ ( @@ -609,6 +602,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: expected_state=RunningState.SUCCESS, expected_progress=1, ) + # check metrics are published messages = await _assert_message_received( instrumentation_rabbit_client_parser, 1, @@ -616,6 +610,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: ) assert messages[0].metrics == "service_stopped" assert messages[0].service_uuid == exp_started_task.node_id + # check RUT messages are published messages = await _assert_message_received( resource_tracking_rabbit_client_parser, 1, @@ -674,7 +669,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: mocked_parse_output_data_fct.reset_mock() # ------------------------------------------------------------------------------- - # 6. the "worker" starts processing a task + # 6. the dask-worker starts processing a task exp_started_task = next_pending_task async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: @@ -850,7 +845,11 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta 2, InstrumentationRabbitMessage.model_validate_json, ) + # NOTE: the service was fast and went directly to success + def _parser(x) -> RabbitResourceTrackingMessages: + return TypeAdapter(RabbitResourceTrackingMessages).validate_json(x) + assert messages[0].metrics == "service_started" assert messages[0].service_uuid == exp_started_task.node_id assert messages[1].metrics == "service_stopped" From c0d749fa173ec1b4b680fb0c71a161fbb779856c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 22:35:13 +0100 Subject: [PATCH 070/127] change signature --- .../modules/comp_scheduler/_manager.py | 8 +++++++- .../modules/comp_scheduler/_publisher.py | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index f4def349566..9befa0a9d97 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -134,7 +134,13 @@ async def schedule_pipelines(app: FastAPI) -> None: rabbitmq_client = get_rabbitmq_client(app) await limited_gather( *( - request_pipeline_scheduling(run, rabbitmq_client, db_engine) + request_pipeline_scheduling( + rabbitmq_client, + db_engine, + user_id=run.user_id, + project_id=run.project_uuid, + iteration=run.iteration, + ) for run in runs_to_schedule ), limit=MAX_CONCURRENT_PIPELINE_SCHEDULING, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py index bd4f9af8c51..5a8bcbc2027 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py @@ -1,13 +1,20 @@ from aiopg.sa import Engine +from models_library.projects import ProjectID +from models_library.users import UserID from servicelib.rabbitmq import RabbitMQClient -from ...models.comp_runs import CompRunsAtDB +from ...models.comp_runs import Iteration from ..db.repositories.comp_runs import CompRunsRepository from ._models import SchedulePipelineRabbitMessage async def request_pipeline_scheduling( - run: CompRunsAtDB, rabbitmq_client: RabbitMQClient, db_engine: Engine + rabbitmq_client: RabbitMQClient, + db_engine: Engine, + *, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration ) -> None: # NOTE: we should use the transaction and the asyncpg engine here to ensure 100% consistency # https://github.com/ITISFoundation/osparc-simcore/issues/6818 @@ -15,11 +22,11 @@ async def request_pipeline_scheduling( await rabbitmq_client.publish( SchedulePipelineRabbitMessage.get_channel_name(), SchedulePipelineRabbitMessage( - user_id=run.user_id, - project_id=run.project_uuid, - iteration=run.iteration, + user_id=user_id, + project_id=project_id, + iteration=iteration, ), ) await CompRunsRepository.instance(db_engine).mark_as_scheduled( - user_id=run.user_id, project_id=run.project_uuid, iteration=run.iteration + user_id=user_id, project_id=project_id, iteration=iteration ) From dc2e7ce7c3251f7fdbd0c5c49e8bc777e44cc3d8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 22:47:34 +0100 Subject: [PATCH 071/127] changed syntax --- .../modules/comp_scheduler/_manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index 9befa0a9d97..c1fcc93cf9d 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -62,7 +62,13 @@ async def run_new_pipeline( ) rabbitmq_client = get_rabbitmq_client(app) - await request_pipeline_scheduling(new_run, rabbitmq_client, db_engine) + await request_pipeline_scheduling( + rabbitmq_client, + db_engine, + user_id=new_run.user_id, + project_id=new_run.project_uuid, + iteration=new_run.iteration, + ) await publish_project_log( rabbitmq_client, user_id, From f36adc478df46c43d7e008919398bd480757513d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 23:01:20 +0100 Subject: [PATCH 072/127] initial implementation --- .../modules/comp_scheduler/_scheduler_base.py | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index b2c9734b031..962a88b180d 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -29,7 +29,7 @@ from models_library.users import UserID from networkx.classes.reportviews import InDegreeView from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE -from servicelib.logging_utils import log_context +from servicelib.logging_utils import log_catch, log_context from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient from servicelib.redis import RedisClientSDK @@ -58,6 +58,7 @@ from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository from ..db.repositories.comp_tasks import CompTasksRepository +from ._publisher import request_pipeline_scheduling from ._utils import ( COMPLETED_STATES, PROCESSING_STATES, @@ -76,19 +77,30 @@ _MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 -def _temporary_empty_wake_up_callack( - user_id: UserID, project_id: ProjectID, iteration: Iteration +def _auto_schedule_callback( + db_engine: Engine, + rabbit_mq_client: RabbitMQClient, + *, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, ) -> Callable[[], None]: def _cb() -> None: - ... - - # async def _async_cb(): - # db_engine = get_db_engine(app) - # rabbit_mq_client = get_rabbitmq_client(app) - # comp_run = await CompRunsRepository.instance(db_engine).get( - # user_id=user_id, project_id=project_id, iteration=iteration - # ) - # await request_pipeline_scheduling(comp_run, rabbit_mq_client, db_engine) + async def _async_cb(): + await request_pipeline_scheduling( + rabbit_mq_client, + db_engine, + user_id=user_id, + project_id=project_id, + iteration=iteration, + ) + + future = asyncio.run_coroutine_threadsafe( + _async_cb(), asyncio.get_running_loop() + ) + with log_catch(_logger, reraise=False): + future.result(timeout=10) + return _cb @@ -550,8 +562,12 @@ async def schedule_pipeline( comp_tasks=comp_tasks, dag=dag, comp_run=comp_run, - wake_up_callback=_temporary_empty_wake_up_callack( - user_id, project_id, iteration + wake_up_callback=_auto_schedule_callback( + self.db_engine, + self.rabbitmq_client, + user_id=user_id, + project_id=project_id, + iteration=iteration, ), ) # 4. timeout if waiting for cluster has been there for more than X minutes From d5dcf7f4840a8a55fb9c2ed7d3e5e078804177b4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 23:06:02 +0100 Subject: [PATCH 073/127] maybe --- .../comp_scheduler/test_scheduler_dask.py | 209 +++++++++++++++++- 1 file changed, 202 insertions(+), 7 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 07fc32050b7..9e32a61310a 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -11,6 +11,7 @@ import asyncio import datetime from collections.abc import AsyncIterator, Awaitable, Callable +from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from dataclasses import dataclass from typing import Any, cast @@ -245,6 +246,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] ) # tasks were send to the backend assert published_project.project.prj_owner is not None + assert isinstance(mocked_dask_client.send_computation_tasks, mock.Mock) mocked_dask_client.send_computation_tasks.assert_has_calls( calls=[ mock.call( @@ -353,9 +355,9 @@ async def _assert_message_received( return parsed_messages -def _mock_send_computation_tasks( +def _with_mock_send_computation_tasks( tasks: list[CompTaskAtDB], mocked_dask_client: mock.MagicMock -) -> None: +) -> mock.Mock: node_id_to_job_id_map = {task.node_id: task.job_id for task in tasks} async def _send_computation_tasks( @@ -372,6 +374,7 @@ async def _send_computation_tasks( ] # type: ignore mocked_dask_client.send_computation_tasks.side_effect = _send_computation_tasks + return mocked_dask_client.send_computation_tasks async def _trigger_progress_event( @@ -414,7 +417,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 run_metadata: RunMetadataDict, ): with_disabled_auto_scheduling.assert_called_once() - _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + _with_mock_send_computation_tasks(published_project.tasks, mocked_dask_client) # # Initiate new pipeline run @@ -863,6 +866,198 @@ def _parser(x) -> RabbitResourceTrackingMessages: assert isinstance(messages[1], RabbitResourceTrackingStoppedMessage) +@pytest.fixture +async def with_started_project( + with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_publisher: mock.Mock, + initialized_app: FastAPI, + sqlalchemy_async_engine: AsyncEngine, + publish_project: Callable[[], Awaitable[PublishedProject]], + mocked_dask_client: mock.Mock, + run_metadata: RunMetadataDict, + scheduler_api: BaseCompScheduler, + instrumentation_rabbit_client_parser: mock.AsyncMock, + resource_tracking_rabbit_client_parser: mock.AsyncMock, +) -> RunningProject: + with_disabled_auto_scheduling.assert_called_once() + published_project = await publish_project() + # + # 1. Initiate new pipeline run + # + run_in_db, expected_published_tasks = await _assert_start_pipeline( + initialized_app, + sqlalchemy_async_engine=sqlalchemy_async_engine, + published_project=published_project, + run_metadata=run_metadata, + ) + with_disabled_scheduler_publisher.assert_called_once() + + # + # 2. This runs the scheduler until the project is started scheduled in the back-end + # + expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + sqlalchemy_async_engine, + published_project, + expected_published_tasks, + mocked_dask_client, + scheduler_api, + ) + + # + # The dask-worker can take a job when it is PENDING, but the dask scheduler makes + # no difference between PENDING and STARTED + # + exp_started_task = expected_pending_tasks[0] + expected_pending_tasks.remove(exp_started_task) + + async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskState]: + return [ + ( + DaskClientTaskState.PENDING_OR_STARTED + if job_id == exp_started_task.job_id + else DaskClientTaskState.PENDING + ) + for job_id in job_ids + ] + + mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running + await scheduler_api.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + ) + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.PENDING, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id] + + [p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, # since we bypass the API entrypoint this is correct + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in (exp_started_task, *expected_pending_tasks)], + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_not_called() + + # ------------------------------------------------------------------------------- + # 4. the dask-worker starts processing a task here we simulate a progress event + assert exp_started_task.job_id + assert exp_started_task.project_id + assert exp_started_task.node_id + assert published_project.project.prj_owner + await _trigger_progress_event( + scheduler_api, + job_id=exp_started_task.job_id, + user_id=published_project.project.prj_owner, + project_id=exp_started_task.project_id, + node_id=exp_started_task.node_id, + ) + + await scheduler_api.schedule_pipeline( + user_id=run_in_db.user_id, + project_id=run_in_db.project_uuid, + iteration=run_in_db.iteration, + ) + # comp_run, the comp_task switch to STARTED + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[exp_started_task.node_id], + expected_state=RunningState.STARTED, + expected_progress=0, + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_pending_tasks], + expected_state=RunningState.PENDING, + expected_progress=None, + ) + await assert_comp_tasks( + sqlalchemy_async_engine, + project_uuid=published_project.project.uuid, + task_ids=[p.node_id for p in expected_published_tasks], + expected_state=RunningState.PUBLISHED, + expected_progress=None, + ) + mocked_dask_client.send_computation_tasks.assert_not_called() + mocked_dask_client.get_tasks_status.assert_called_once_with( + [p.job_id for p in (exp_started_task, *expected_pending_tasks)], + ) + mocked_dask_client.get_tasks_status.reset_mock() + mocked_dask_client.get_task_result.assert_not_called() + # check the metrics are properly published + messages = await _assert_message_received( + instrumentation_rabbit_client_parser, + 1, + InstrumentationRabbitMessage.model_validate_json, + ) + assert messages[0].metrics == "service_started" + assert messages[0].service_uuid == exp_started_task.node_id + + # check the RUT messages are properly published + messages = await _assert_message_received( + resource_tracking_rabbit_client_parser, + 1, + RabbitResourceTrackingStartedMessage.model_validate_json, + ) + assert messages[0].node_id == exp_started_task.node_id + + return RunningProject( + published_project.project, + published_project.pipeline, + published_project.tasks, + runs=run_in_db, + ) + + +async def test_completed_task_triggers_new_scheduling_task( + with_disabled_scheduler_publisher: mock.Mock, + with_started_project: RunningProject, + initialized_app: FastAPI, + mocked_dask_client: mock.MagicMock, + scheduler_api: BaseCompScheduler, + sqlalchemy_async_engine: AsyncEngine, + mocker: MockerFixture, +): + """When a pipeline job completes, the Dask backend provides a callback + that runs in a separate thread. We use that callback to ask the + director-v2 computational scheduler manager to ask for a new schedule + After fiddling in distributed source code, here is a similar way to trigger that callback + """ + with ThreadPoolExecutor( + max_workers=1, thread_name_prefix="pytest-callback-thread" + ) as executor: + ... + + async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_publisher: mock.Mock, @@ -938,7 +1133,7 @@ async def test_task_progress_triggers( mocked_clean_task_output_and_log_files_if_invalid: mock.Mock, run_metadata: RunMetadataDict, ): - _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + _with_mock_send_computation_tasks(published_project.tasks, mocked_dask_client) _run_in_db, expected_published_tasks = await _assert_start_pipeline( initialized_app, sqlalchemy_async_engine=sqlalchemy_async_engine, @@ -946,7 +1141,7 @@ async def test_task_progress_triggers( run_metadata=run_metadata, ) # ------------------------------------------------------------------------------- - # 1. first run will move comp_tasks to PENDING so the worker can take them + # 1. first run will move comp_tasks to PENDING so the dask-worker can take them expected_pending_tasks = await _assert_schedule_pipeline_PENDING( sqlalchemy_async_engine, published_project, @@ -1400,7 +1595,7 @@ async def test_running_pipeline_triggers_heartbeat( resource_tracking_rabbit_client_parser: mock.AsyncMock, run_metadata: RunMetadataDict, ): - _mock_send_computation_tasks(published_project.tasks, mocked_dask_client) + _with_mock_send_computation_tasks(published_project.tasks, mocked_dask_client) run_in_db, expected_published_tasks = await _assert_start_pipeline( initialized_app, sqlalchemy_async_engine=sqlalchemy_async_engine, @@ -1408,7 +1603,7 @@ async def test_running_pipeline_triggers_heartbeat( run_metadata=run_metadata, ) # ------------------------------------------------------------------------------- - # 1. first run will move comp_tasks to PENDING so the worker can take them + # 1. first run will move comp_tasks to PENDING so the dask-worker can take them expected_pending_tasks = await _assert_schedule_pipeline_PENDING( sqlalchemy_async_engine, published_project, From 3c964ad5a300b930aee98675424281ef71b69a14 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sun, 24 Nov 2024 23:13:41 +0100 Subject: [PATCH 074/127] maybe --- .../modules/comp_scheduler/_scheduler_base.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 962a88b180d..c081a01d2ac 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -12,6 +12,8 @@ """ import asyncio +import concurrent +import concurrent.futures import datetime import logging from abc import ABC, abstractmethod @@ -78,6 +80,7 @@ def _auto_schedule_callback( + loop: asyncio.AbstractEventLoop, db_engine: Engine, rabbit_mq_client: RabbitMQClient, *, @@ -95,11 +98,13 @@ async def _async_cb(): iteration=iteration, ) - future = asyncio.run_coroutine_threadsafe( - _async_cb(), asyncio.get_running_loop() - ) - with log_catch(_logger, reraise=False): - future.result(timeout=10) + future = asyncio.run_coroutine_threadsafe(_async_cb(), loop) + + def handle_future_result(fut: concurrent.futures.Future) -> None: + with log_catch(_logger, reraise=False): + fut.result(timeout=10) + + future.add_done_callback(handle_future_result) return _cb From cb65ccb57c4254cc6f2070f5b9f5cc8563309ea2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:32:56 +0100 Subject: [PATCH 075/127] refactor --- .../modules/comp_scheduler/_constants.py | 1 + .../modules/comp_scheduler/_scheduler_base.py | 13 ++---- .../modules/comp_scheduler/_utils.py | 33 ++++++++++++++ .../modules/comp_scheduler/_worker.py | 44 ++++++++++++++++--- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py index 1be1cbb7cb6..fd41e2436b5 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py @@ -2,5 +2,6 @@ from typing import Final MODULE_NAME: Final[str] = "computational-distributed-scheduler" +MODULE_NAME_WORKER: Final[str] = "computational-distributed-worker" SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index c081a01d2ac..10b6d11cc01 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -12,8 +12,6 @@ """ import asyncio -import concurrent -import concurrent.futures import datetime import logging from abc import ABC, abstractmethod @@ -89,7 +87,7 @@ def _auto_schedule_callback( iteration: Iteration, ) -> Callable[[], None]: def _cb() -> None: - async def _async_cb(): + async def _async_cb() -> None: await request_pipeline_scheduling( rabbit_mq_client, db_engine, @@ -99,12 +97,8 @@ async def _async_cb(): ) future = asyncio.run_coroutine_threadsafe(_async_cb(), loop) - - def handle_future_result(fut: concurrent.futures.Future) -> None: - with log_catch(_logger, reraise=False): - fut.result(timeout=10) - - future.add_done_callback(handle_future_result) + with log_catch(_logger, reraise=False): + future.result(timeout=10) return _cb @@ -568,6 +562,7 @@ async def schedule_pipeline( dag=dag, comp_run=comp_run, wake_up_callback=_auto_schedule_callback( + asyncio.get_running_loop(), self.db_engine, self.rabbitmq_client, user_id=user_id, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py index 8ebda030bed..8d810b7e8f1 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py @@ -1,3 +1,6 @@ +from typing import Callable + +from fastapi import FastAPI from models_library.docker import DockerGenericTag from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID @@ -8,6 +11,9 @@ ServiceResourcesDictHelpers, ) from models_library.users import UserID +from servicelib.redis import RedisClientSDK +from settings_library.redis import RedisDatabase +from simcore_service_director_v2.modules.redis import get_redis_client_manager from ...models.comp_runs import Iteration from ...models.comp_tasks import CompTaskAtDB @@ -68,3 +74,30 @@ def create_service_resources_from_task(task: CompTaskAtDB) -> ServiceResourcesDi }, [task.image.boot_mode], ) + + +def _get_app_from_args(*args, **kwargs) -> FastAPI: + assert kwargs is not None # nosec + if args: + app = args[0] + else: + assert "app" in kwargs # nosec + app = kwargs["app"] + assert isinstance(app, FastAPI) # nosec + return app + + +def get_redis_client_from_app(*args, **kwargs) -> RedisClientSDK: + app = _get_app_from_args(*args, **kwargs) + return get_redis_client_manager(app).client(RedisDatabase.LOCKS) + + +def get_redis_lock_key( + suffix: str, *, unique_lock_key_builder: Callable[..., str] +) -> Callable[..., str]: + def _(*args, **kwargs) -> str: + app = _get_app_from_args(*args, **kwargs) + unique_lock_part = unique_lock_key_builder(*args, **kwargs) + return f"{app.title}-{suffix}-{unique_lock_part}" + + return _ diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 5a5acfbc36c..6d458dd1c12 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -1,14 +1,22 @@ +import contextlib import functools import logging from typing import cast from fastapi import FastAPI +from models_library.projects import ProjectID +from models_library.users import UserID from servicelib.logging_utils import log_context +from servicelib.redis import CouldNotAcquireLockError +from servicelib.redis_utils import exclusive +from ...models.comp_runs import Iteration from ..rabbitmq import get_rabbitmq_client +from ._constants import MODULE_NAME_WORKER from ._models import SchedulePipelineRabbitMessage from ._scheduler_base import BaseCompScheduler from ._scheduler_factory import create_scheduler +from ._utils import get_redis_client_from_app, get_redis_lock_key _logger = logging.getLogger(__name__) @@ -17,15 +25,39 @@ def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: return cast(BaseCompScheduler, app.state.scheduler_worker) +def _unique_key_builder( + user_id: UserID, project_id: ProjectID, iteration: Iteration +) -> str: + return f"{user_id}:{project_id}:{iteration}" + + +@exclusive( + redis=get_redis_client_from_app, + lock_key=get_redis_lock_key( + MODULE_NAME_WORKER, unique_lock_key_builder=_unique_key_builder + ), +) +async def _exclusively_schedule_pipeline( + app: FastAPI, *, user_id: UserID, project_id: ProjectID, iteration: Iteration +) -> None: + await _get_scheduler_worker(app).schedule_pipeline( + user_id=user_id, + project_id=project_id, + iteration=iteration, + ) + + async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: with log_context(_logger, logging.DEBUG, msg="handling scheduling"): - to_schedule_pipeline = SchedulePipelineRabbitMessage.parse_raw(data) - await _get_scheduler_worker(app).schedule_pipeline( - user_id=to_schedule_pipeline.user_id, - project_id=to_schedule_pipeline.project_id, - iteration=to_schedule_pipeline.iteration, - ) + to_schedule_pipeline = SchedulePipelineRabbitMessage.model_validate(data) + with contextlib.suppress(CouldNotAcquireLockError): + await _exclusively_schedule_pipeline( + app, + user_id=to_schedule_pipeline.user_id, + project_id=to_schedule_pipeline.project_id, + iteration=to_schedule_pipeline.iteration, + ) return True From cc2a002affb3b96f4411a7056a02ef7a866979cf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:40:46 +0100 Subject: [PATCH 076/127] refactor --- .../modules/comp_scheduler/__init__.py | 10 +++-- .../modules/comp_scheduler/_constants.py | 2 +- .../modules/comp_scheduler/_manager.py | 45 +++++++------------ 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index 5dc7b020e6c..3b5b310ddec 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -5,7 +5,7 @@ from fastapi import FastAPI from servicelib.logging_utils import log_context -from ._constants import MODULE_NAME +from ._constants import MODULE_NAME_SCHEDULER from ._manager import run_new_pipeline, setup_manager, shutdown_manager, stop_pipeline from ._worker import setup_worker, shutdown_worker @@ -14,7 +14,9 @@ def on_app_startup(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def start_scheduler() -> None: - with log_context(_logger, level=logging.INFO, msg=f"starting {MODULE_NAME}"): + with log_context( + _logger, level=logging.INFO, msg=f"starting {MODULE_NAME_SCHEDULER}" + ): await setup_worker(app) await setup_manager(app) @@ -23,7 +25,9 @@ async def start_scheduler() -> None: def on_app_shutdown(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: async def stop_scheduler() -> None: - with log_context(_logger, level=logging.INFO, msg=f"stopping {MODULE_NAME}"): + with log_context( + _logger, level=logging.INFO, msg=f"stopping {MODULE_NAME_SCHEDULER}" + ): await shutdown_manager(app) await shutdown_worker(app) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py index fd41e2436b5..45efe93f0b0 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_constants.py @@ -1,7 +1,7 @@ import datetime from typing import Final -MODULE_NAME: Final[str] = "computational-distributed-scheduler" +MODULE_NAME_SCHEDULER: Final[str] = "computational-distributed-scheduler" MODULE_NAME_WORKER: Final[str] = "computational-distributed-worker" SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) MAX_CONCURRENT_PIPELINE_SCHEDULING: Final[int] = 10 diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index c1fcc93cf9d..334ad7cb622 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -8,10 +8,8 @@ from models_library.users import UserID from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.logging_utils import log_context -from servicelib.redis import RedisClientSDK from servicelib.redis_utils import exclusive from servicelib.utils import limited_gather -from settings_library.redis import RedisDatabase from ...models.comp_runs import RunMetadataDict from ...utils.rabbitmq import publish_project_log @@ -19,14 +17,13 @@ from ..db.repositories.comp_pipelines import CompPipelinesRepository from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client -from ..redis import get_redis_client_manager from ._constants import ( MAX_CONCURRENT_PIPELINE_SCHEDULING, - MODULE_NAME, + MODULE_NAME_SCHEDULER, SCHEDULER_INTERVAL, ) from ._publisher import request_pipeline_scheduling -from ._utils import SCHEDULED_STATES +from ._utils import SCHEDULED_STATES, get_redis_client_from_app, get_redis_lock_key _logger = logging.getLogger(__name__) @@ -99,28 +96,13 @@ async def stop_pipeline( if updated_comp_run: # ensure the scheduler starts right away rabbitmq_client = get_rabbitmq_client(app) - await request_pipeline_scheduling(updated_comp_run, rabbitmq_client, db_engine) - - -def _get_app_from_args(*args, **kwargs) -> FastAPI: - assert kwargs is not None # nosec - if args: - app = args[0] - else: - assert "app" in kwargs # nosec - app = kwargs["app"] - assert isinstance(app, FastAPI) # nosec - return app - - -def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: - app = _get_app_from_args(*args, **kwargs) - return get_redis_client_manager(app).client(RedisDatabase.LOCKS) - - -def _redis_lock_key_builder(*args, **kwargs) -> str: - app = _get_app_from_args(*args, **kwargs) - return f"{app.title}_{MODULE_NAME}" + await request_pipeline_scheduling( + rabbitmq_client, + db_engine, + user_id=updated_comp_run.user_id, + project_id=updated_comp_run.project_uuid, + iteration=updated_comp_run.iteration, + ) async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGraph: @@ -129,7 +111,12 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr return pipeline_at_db.get_graph() -@exclusive(_redis_client_getter, lock_key=_redis_lock_key_builder) +@exclusive( + get_redis_client_from_app, + lock_key=get_redis_lock_key( + MODULE_NAME_SCHEDULER, unique_lock_key_builder=lambda: "" + ), +) async def schedule_pipelines(app: FastAPI) -> None: with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): db_engine = get_db_engine(app) @@ -159,7 +146,7 @@ async def setup_manager(app: FastAPI) -> None: app.state.scheduler_manager = start_periodic_task( schedule_pipelines, interval=SCHEDULER_INTERVAL, - task_name=MODULE_NAME, + task_name=MODULE_NAME_SCHEDULER, app=app, ) From 5ddab65af82f719510af8e1210ae06b6bb62f2b1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:42:41 +0100 Subject: [PATCH 077/127] docs --- .../modules/comp_scheduler/_scheduler_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 10b6d11cc01..989eccbc5ee 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -86,6 +86,10 @@ def _auto_schedule_callback( project_id: ProjectID, iteration: Iteration, ) -> Callable[[], None]: + """this function is called via Dask-backend from a separate thread. + Therefore the need to use run_coroutine_threadsafe to request a new + pipeline scheduling""" + def _cb() -> None: async def _async_cb() -> None: await request_pipeline_scheduling( From 57b770e2e06c4dd5068212b66efb67ab7709da22 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:49:26 +0100 Subject: [PATCH 078/127] ruff --- .../src/models_library/projects_nodes_io.py | 2 +- .../modules/comp_scheduler/_scheduler_base.py | 30 ++++++++----------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/packages/models-library/src/models_library/projects_nodes_io.py b/packages/models-library/src/models_library/projects_nodes_io.py index 3a79b6acf00..4d4637ac362 100644 --- a/packages/models-library/src/models_library/projects_nodes_io.py +++ b/packages/models-library/src/models_library/projects_nodes_io.py @@ -34,7 +34,7 @@ UUIDStr: TypeAlias = Annotated[str, StringConstraints(pattern=UUID_RE)] -NodeIDStr = UUIDStr +NodeIDStr: TypeAlias = UUIDStr LocationID = int LocationName = str diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 989eccbc5ee..dd217655316 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -25,7 +25,7 @@ from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID, NodeIDStr from models_library.projects_state import RunningState -from models_library.services import ServiceKey, ServiceType, ServiceVersion +from models_library.services import ServiceType from models_library.users import UserID from networkx.classes.reportviews import InDegreeView from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE @@ -179,7 +179,7 @@ async def _get_pipeline_tasks( ) -> dict[NodeIDStr, CompTaskAtDB]: comp_tasks_repo = CompTasksRepository.instance(self.db_engine) pipeline_comp_tasks: dict[NodeIDStr, CompTaskAtDB] = { - NodeIDStr(f"{t.node_id}"): t + f"{t.node_id}": t for t in await comp_tasks_repo.list_computational_tasks(project_id) if (f"{t.node_id}" in list(pipeline_dag.nodes())) } @@ -237,9 +237,9 @@ async def _set_states_following_failed_to_aborted( for task in tasks.values(): if task.state == RunningState.FAILED: node_ids_to_set_as_aborted.update(nx.bfs_tree(dag, f"{task.node_id}")) - node_ids_to_set_as_aborted.remove(NodeIDStr(f"{task.node_id}")) + node_ids_to_set_as_aborted.remove(f"{task.node_id}") for node_id in node_ids_to_set_as_aborted: - tasks[NodeIDStr(f"{node_id}")].state = RunningState.ABORTED + tasks[f"{node_id}"].state = RunningState.ABORTED if node_ids_to_set_as_aborted: # update the current states back in DB comp_tasks_repo = CompTasksRepository.instance(self.db_engine) @@ -387,8 +387,8 @@ async def _process_started_tasks( root_parent_node_id=run_metadata.get("project_metadata", {}).get( "root_parent_node_id" ), - service_key=ServiceKey(t.image.name), - service_version=ServiceVersion(t.image.tag), + service_key=t.image.name, + service_version=t.image.tag, service_type=ServiceType.COMPUTATIONAL, service_resources=create_service_resources_from_task(t), service_additional_metadata={}, @@ -675,9 +675,9 @@ async def _schedule_tasks_to_start( # noqa: C901 # get the tasks to start tasks_ready_to_start: dict[NodeID, CompTaskAtDB] = { - node_id: comp_tasks[NodeIDStr(f"{node_id}")] + node_id: comp_tasks[f"{node_id}"] for node_id in next_task_node_ids - if comp_tasks[NodeIDStr(f"{node_id}")].state in TASK_TO_START_STATES + if comp_tasks[f"{node_id}"].state in TASK_TO_START_STATES } if not tasks_ready_to_start: @@ -708,9 +708,7 @@ async def _schedule_tasks_to_start( # noqa: C901 RunningState.WAITING_FOR_CLUSTER, ) for task in tasks_ready_to_start: - comp_tasks[ - NodeIDStr(f"{task}") - ].state = RunningState.WAITING_FOR_CLUSTER + comp_tasks[f"{task}"].state = RunningState.WAITING_FOR_CLUSTER except ComputationalBackendOnDemandNotReadyError as exc: _logger.info( @@ -732,9 +730,7 @@ async def _schedule_tasks_to_start( # noqa: C901 RunningState.WAITING_FOR_CLUSTER, ) for task in tasks_ready_to_start: - comp_tasks[ - NodeIDStr(f"{task}") - ].state = RunningState.WAITING_FOR_CLUSTER + comp_tasks[f"{task}"].state = RunningState.WAITING_FOR_CLUSTER except ClustersKeeperNotAvailableError: _logger.exception("Unexpected error while starting tasks:") await publish_project_log( @@ -755,7 +751,7 @@ async def _schedule_tasks_to_start( # noqa: C901 optional_stopped=arrow.utcnow().datetime, ) for task in tasks_ready_to_start: - comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED + comp_tasks[f"{task}"].state = RunningState.FAILED raise except TaskSchedulingError as exc: _logger.exception( @@ -773,7 +769,7 @@ async def _schedule_tasks_to_start( # noqa: C901 optional_progress=1.0, optional_stopped=arrow.utcnow().datetime, ) - comp_tasks[NodeIDStr(f"{exc.node_id}")].state = RunningState.FAILED + comp_tasks[f"{exc.node_id}"].state = RunningState.FAILED except Exception: _logger.exception( "Unexpected error for %s with %s on %s happened when scheduling %s:", @@ -792,7 +788,7 @@ async def _schedule_tasks_to_start( # noqa: C901 optional_stopped=arrow.utcnow().datetime, ) for task in tasks_ready_to_start: - comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED + comp_tasks[f"{task}"].state = RunningState.FAILED raise return comp_tasks From 5a5b2dbbf64b561c80c4af43922683ec62d77671 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:35:23 +0100 Subject: [PATCH 079/127] some fine tuning for tests --- services/director-v2/tests/unit/_helpers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/services/director-v2/tests/unit/_helpers.py b/services/director-v2/tests/unit/_helpers.py index 4e89ce7b79c..1e02a79e5c3 100644 --- a/services/director-v2/tests/unit/_helpers.py +++ b/services/director-v2/tests/unit/_helpers.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any +from typing import Any, Callable import aiopg import aiopg.sa @@ -23,9 +23,10 @@ class PublishedProject: tasks: list[CompTaskAtDB] -@dataclass +@dataclass(kw_only=True) class RunningProject(PublishedProject): runs: CompRunsAtDB + task_to_callback_mapping: dict[NodeID, Callable[[], None]] async def set_comp_task_outputs( @@ -89,7 +90,7 @@ async def assert_comp_tasks( task_ids: list[NodeID], expected_state: RunningState, expected_progress: float | None, -) -> None: +) -> list[CompTaskAtDB]: # check the database is correctly updated, the run is published async with sqlalchemy_async_engine.connect() as conn: result = await conn.execute( @@ -105,3 +106,4 @@ async def assert_comp_tasks( assert all( t.progress == expected_progress for t in tasks ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" + return tasks From b45f33f95aa0b541ee6b76fb0bddadb0ab809d1c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:35:41 +0100 Subject: [PATCH 080/127] use the correct v2 method --- .../modules/comp_scheduler/_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 6d458dd1c12..aab84160968 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -50,7 +50,7 @@ async def _exclusively_schedule_pipeline( async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: with log_context(_logger, logging.DEBUG, msg="handling scheduling"): - to_schedule_pipeline = SchedulePipelineRabbitMessage.model_validate(data) + to_schedule_pipeline = SchedulePipelineRabbitMessage.model_validate_json(data) with contextlib.suppress(CouldNotAcquireLockError): await _exclusively_schedule_pipeline( app, From 1fcbf124210d0e2647c879a75e109ed176948b00 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:35:54 +0100 Subject: [PATCH 081/127] changed syntax --- services/director-v2/tests/unit/with_dbs/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index ab728c921b0..3b2ac6c8a53 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -358,6 +358,7 @@ async def running_project( started=now_time, result=StateType.RUNNING, ), + task_to_callback_mapping={}, ) @@ -394,6 +395,7 @@ async def running_project_mark_for_cancellation( started=now_time, cancelled=now_time + datetime.timedelta(seconds=5), ), + task_to_callback_mapping={}, ) From 3281ffeab2696d78328799e59102b6013abcdf12 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:36:20 +0100 Subject: [PATCH 082/127] test the callback mechanism --- .../comp_scheduler/test_scheduler_dask.py | 88 ++++++++++++------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 9e32a61310a..2db60c05a64 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -11,7 +11,6 @@ import asyncio import datetime from collections.abc import AsyncIterator, Awaitable, Callable -from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from dataclasses import dataclass from typing import Any, cast @@ -195,13 +194,13 @@ async def _assert_start_pipeline( return runs[0], exp_published_tasks -async def _assert_schedule_pipeline_PENDING( # noqa: N802 +async def _assert_publish_in_dask_backend( sqlalchemy_async_engine: AsyncEngine, published_project: PublishedProject, published_tasks: list[CompTaskAtDB], mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, -) -> list[CompTaskAtDB]: +) -> tuple[list[CompTaskAtDB], dict[NodeID, Callable[[], None]]]: expected_pending_tasks = [ published_tasks[1], published_tasks[3], @@ -247,6 +246,8 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] # tasks were send to the backend assert published_project.project.prj_owner is not None assert isinstance(mocked_dask_client.send_computation_tasks, mock.Mock) + assert isinstance(mocked_dask_client.get_tasks_status, mock.Mock) + assert isinstance(mocked_dask_client.get_task_result, mock.Mock) mocked_dask_client.send_computation_tasks.assert_has_calls( calls=[ mock.call( @@ -262,6 +263,12 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] ], any_order=True, ) + task_to_callback_mapping = { + task.node_id: mocked_dask_client.send_computation_tasks.call_args_list[ + i + ].kwargs["callback"] + for i, task in enumerate(expected_pending_tasks) + } mocked_dask_client.send_computation_tasks.reset_mock() mocked_dask_client.get_tasks_status.assert_not_called() mocked_dask_client.get_task_result.assert_not_called() @@ -298,7 +305,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] ) mocked_dask_client.get_tasks_status.reset_mock() mocked_dask_client.get_task_result.assert_not_called() - return expected_pending_tasks + return expected_pending_tasks, task_to_callback_mapping @pytest.fixture @@ -432,7 +439,7 @@ async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the dask-worker can take them - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + expected_pending_tasks, _ = await _assert_publish_in_dask_backend( sqlalchemy_async_engine, published_project, expected_published_tasks, @@ -895,7 +902,10 @@ async def with_started_project( # # 2. This runs the scheduler until the project is started scheduled in the back-end # - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + ( + expected_pending_tasks, + task_to_callback_mapping, + ) = await _assert_publish_in_dask_backend( sqlalchemy_async_engine, published_project, expected_published_tasks, @@ -920,6 +930,9 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta for job_id in job_ids ] + assert isinstance(mocked_dask_client.get_tasks_status, mock.Mock) + assert isinstance(mocked_dask_client.send_computation_tasks, mock.Mock) + assert isinstance(mocked_dask_client.get_task_result, mock.Mock) mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running await scheduler_api.schedule_pipeline( user_id=run_in_db.user_id, @@ -977,30 +990,32 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta iteration=run_in_db.iteration, ) # comp_run, the comp_task switch to STARTED - await assert_comp_runs( - sqlalchemy_async_engine, - expected_total=1, - expected_state=RunningState.STARTED, - where_statement=and_( - comp_runs.c.user_id == published_project.project.prj_owner, - comp_runs.c.project_uuid == f"{published_project.project.uuid}", - ), - ) - await assert_comp_tasks( + run_in_db = ( + await assert_comp_runs( + sqlalchemy_async_engine, + expected_total=1, + expected_state=RunningState.STARTED, + where_statement=and_( + comp_runs.c.user_id == published_project.project.prj_owner, + comp_runs.c.project_uuid == f"{published_project.project.uuid}", + ), + ) + )[0] + tasks_in_db = await assert_comp_tasks( sqlalchemy_async_engine, project_uuid=published_project.project.uuid, task_ids=[exp_started_task.node_id], expected_state=RunningState.STARTED, expected_progress=0, ) - await assert_comp_tasks( + tasks_in_db += await assert_comp_tasks( sqlalchemy_async_engine, project_uuid=published_project.project.uuid, task_ids=[p.node_id for p in expected_pending_tasks], expected_state=RunningState.PENDING, expected_progress=None, ) - await assert_comp_tasks( + tasks_in_db += await assert_comp_tasks( sqlalchemy_async_engine, project_uuid=published_project.project.uuid, task_ids=[p.node_id for p in expected_published_tasks], @@ -1033,29 +1048,40 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta return RunningProject( published_project.project, published_project.pipeline, - published_project.tasks, + tasks_in_db, runs=run_in_db, + task_to_callback_mapping=task_to_callback_mapping, + ) + + +@pytest.fixture +def mocked_worker_publisher(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.request_pipeline_scheduling", + autospec=True, ) async def test_completed_task_triggers_new_scheduling_task( - with_disabled_scheduler_publisher: mock.Mock, + mocked_worker_publisher: mock.Mock, with_started_project: RunningProject, - initialized_app: FastAPI, - mocked_dask_client: mock.MagicMock, - scheduler_api: BaseCompScheduler, - sqlalchemy_async_engine: AsyncEngine, - mocker: MockerFixture, ): """When a pipeline job completes, the Dask backend provides a callback that runs in a separate thread. We use that callback to ask the director-v2 computational scheduler manager to ask for a new schedule After fiddling in distributed source code, here is a similar way to trigger that callback """ - with ThreadPoolExecutor( - max_workers=1, thread_name_prefix="pytest-callback-thread" - ) as executor: - ... + completed_node_id = with_started_project.tasks[0].node_id + callback = with_started_project.task_to_callback_mapping[completed_node_id] + await asyncio.to_thread(callback) + + mocked_worker_publisher.assert_called_once_with( + mock.ANY, + mock.ANY, + user_id=with_started_project.runs.user_id, + project_id=with_started_project.runs.project_uuid, + iteration=with_started_project.runs.iteration, + ) async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( @@ -1142,7 +1168,7 @@ async def test_task_progress_triggers( ) # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the dask-worker can take them - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + expected_pending_tasks, _ = await _assert_publish_in_dask_backend( sqlalchemy_async_engine, published_project, expected_published_tasks, @@ -1604,7 +1630,7 @@ async def test_running_pipeline_triggers_heartbeat( ) # ------------------------------------------------------------------------------- # 1. first run will move comp_tasks to PENDING so the dask-worker can take them - expected_pending_tasks = await _assert_schedule_pipeline_PENDING( + expected_pending_tasks, _ = await _assert_publish_in_dask_backend( sqlalchemy_async_engine, published_project, expected_published_tasks, From db50afbef1076a3dd4a572a32df87721b99dd663 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:55:10 +0100 Subject: [PATCH 083/127] test callback mechanism --- .../modules/comp_scheduler/_manager.py | 4 +--- .../modules/comp_scheduler/_utils.py | 10 +++++++--- .../modules/comp_scheduler/_worker.py | 2 +- .../tests/unit/with_dbs/comp_scheduler/test_worker.py | 1 + 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index 334ad7cb622..a9cb6f5e103 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -113,9 +113,7 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr @exclusive( get_redis_client_from_app, - lock_key=get_redis_lock_key( - MODULE_NAME_SCHEDULER, unique_lock_key_builder=lambda: "" - ), + lock_key=get_redis_lock_key(MODULE_NAME_SCHEDULER, unique_lock_key_builder=None), ) async def schedule_pipelines(app: FastAPI) -> None: with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py index 8d810b7e8f1..0458b159811 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_utils.py @@ -93,11 +93,15 @@ def get_redis_client_from_app(*args, **kwargs) -> RedisClientSDK: def get_redis_lock_key( - suffix: str, *, unique_lock_key_builder: Callable[..., str] + suffix: str, *, unique_lock_key_builder: Callable[..., str] | None ) -> Callable[..., str]: def _(*args, **kwargs) -> str: app = _get_app_from_args(*args, **kwargs) - unique_lock_part = unique_lock_key_builder(*args, **kwargs) - return f"{app.title}-{suffix}-{unique_lock_part}" + unique_lock_part = ( + unique_lock_key_builder(*args, **kwargs) if unique_lock_key_builder else "" + ) + if unique_lock_part: + unique_lock_part = f"-{unique_lock_part}" + return f"{app.title}-{suffix}{unique_lock_part}" return _ diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index aab84160968..7075a57e744 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -26,7 +26,7 @@ def _get_scheduler_worker(app: FastAPI) -> BaseCompScheduler: def _unique_key_builder( - user_id: UserID, project_id: ProjectID, iteration: Iteration + _app, user_id: UserID, project_id: ProjectID, iteration: Iteration ) -> str: return f"{user_id}:{project_id}:{iteration}" diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index 4872a5c001f..c52b842516d 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -48,6 +48,7 @@ def mocked_get_scheduler_worker( async def test_worker_properly_calls_scheduler_api( + with_disabled_auto_scheduling: mock.Mock, initialized_app: FastAPI, mocked_get_scheduler_worker: mock.Mock, published_project: PublishedProject, From 4c80279f007a66314cc387b20003afc2805eba1e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:56:21 +0100 Subject: [PATCH 084/127] renaming --- .../modules/comp_scheduler/_manager.py | 4 +-- .../with_dbs/comp_scheduler/test_manager.py | 26 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index a9cb6f5e103..87558d14296 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -115,7 +115,7 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr get_redis_client_from_app, lock_key=get_redis_lock_key(MODULE_NAME_SCHEDULER, unique_lock_key_builder=None), ) -async def schedule_pipelines(app: FastAPI) -> None: +async def schedule_all_pipelines(app: FastAPI) -> None: with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): db_engine = get_db_engine(app) runs_to_schedule = await CompRunsRepository.instance(db_engine).list( @@ -142,7 +142,7 @@ async def schedule_pipelines(app: FastAPI) -> None: async def setup_manager(app: FastAPI) -> None: app.state.scheduler_manager = start_periodic_task( - schedule_pipelines, + schedule_all_pipelines, interval=SCHEDULER_INTERVAL, task_name=MODULE_NAME_SCHEDULER, app=app, diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index ed1d0a31a6f..2880e91c022 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -31,7 +31,7 @@ from simcore_service_director_v2.modules.comp_scheduler._manager import ( SCHEDULER_INTERVAL, run_new_pipeline, - schedule_pipelines, + schedule_all_pipelines, stop_pipeline, ) from simcore_service_director_v2.modules.comp_scheduler._models import ( @@ -66,9 +66,9 @@ def with_fast_scheduling(mocker: MockerFixture) -> None: @pytest.fixture -def mocked_schedule_pipelines(mocker: MockerFixture) -> mock.Mock: +def mocked_schedule_all_pipelines(mocker: MockerFixture) -> mock.Mock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._manager.schedule_pipelines", + "simcore_service_director_v2.modules.comp_scheduler._manager.schedule_all_pipelines", autospec=True, ) @@ -76,15 +76,15 @@ def mocked_schedule_pipelines(mocker: MockerFixture) -> mock.Mock: async def test_manager_starts_and_auto_schedules_pipelines( with_fast_scheduling: None, with_disabled_scheduler_worker: mock.Mock, - mocked_schedule_pipelines: mock.Mock, + mocked_schedule_all_pipelines: mock.Mock, initialized_app: FastAPI, sqlalchemy_async_engine: AsyncEngine, ): await assert_comp_runs_empty(sqlalchemy_async_engine) - mocked_schedule_pipelines.assert_called() + mocked_schedule_all_pipelines.assert_called() -async def test_schedule_pipelines_empty_db( +async def test_schedule_all_pipelines_empty_db( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, @@ -93,7 +93,7 @@ async def test_schedule_pipelines_empty_db( ): await assert_comp_runs_empty(sqlalchemy_async_engine) - await schedule_pipelines(initialized_app) + await schedule_all_pipelines(initialized_app) # check nothing was distributed scheduler_rabbit_client_parser.assert_not_called() @@ -102,7 +102,7 @@ async def test_schedule_pipelines_empty_db( await assert_comp_runs_empty(sqlalchemy_async_engine) -async def test_schedule_pipelines_concurently_runs_exclusively_and_raises( +async def test_schedule_all_pipelines_concurently_runs_exclusively_and_raises( with_disabled_auto_scheduling: mock.Mock, initialized_app: FastAPI, mocker: MockerFixture, @@ -124,7 +124,7 @@ async def slow_limited_gather(*args, **kwargs): ) results = await asyncio.gather( - *(schedule_pipelines(initialized_app) for _ in range(CONCURRENCY)), + *(schedule_all_pipelines(initialized_app) for _ in range(CONCURRENCY)), return_exceptions=True, ) @@ -135,7 +135,7 @@ async def slow_limited_gather(*args, **kwargs): mock_function.assert_called_once() -async def test_schedule_pipelines( +async def test_schedule_all_pipelines( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_worker: mock.Mock, initialized_app: FastAPI, @@ -178,7 +178,7 @@ async def test_schedule_pipelines( start_modified_time = comp_run.modified # this will now not schedule the pipeline since it was last scheduled - await schedule_pipelines(initialized_app) + await schedule_all_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_not_called() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] @@ -188,7 +188,7 @@ async def test_schedule_pipelines( # this will now schedule the pipeline since the time passed await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() + 1) - await schedule_pipelines(initialized_app) + await schedule_all_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_called_once_with( SchedulePipelineRabbitMessage( user_id=published_project.project.prj_owner, @@ -211,7 +211,7 @@ async def test_schedule_pipelines( user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, ) - await schedule_pipelines(initialized_app) + await schedule_all_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_called_once_with( SchedulePipelineRabbitMessage( user_id=published_project.project.prj_owner, From c54337fc0f21930dbb83845bb5acc0410b847093 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:10:19 +0100 Subject: [PATCH 085/127] creation of parallel test --- .../with_dbs/comp_scheduler/test_worker.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index c52b842516d..e4c56b3b88a 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -7,6 +7,8 @@ # pylint:disable=no-name-in-module # pylint: disable=too-many-statements +import asyncio +from typing import Awaitable, Callable from unittest import mock import pytest @@ -47,7 +49,7 @@ def mocked_get_scheduler_worker( ) -async def test_worker_properly_calls_scheduler_api( +async def test_worker_properly_autocalls_scheduler_api( with_disabled_auto_scheduling: mock.Mock, initialized_app: FastAPI, mocked_get_scheduler_worker: mock.Mock, @@ -69,3 +71,34 @@ async def test_worker_properly_calls_scheduler_api( project_id=published_project.project.uuid, iteration=1, ) + + +@pytest.fixture +async def mocked_scheduler_api(mocker: MockerFixture) -> mock.Mock: + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler.schedule_pipeline" + ) + + +async def test_worker_scheduling_parallelism( + with_disabled_auto_scheduling: mock.Mock, + mocked_scheduler_api: mock.Mock, + initialized_app: FastAPI, + publish_project: Callable[[], Awaitable[PublishedProject]], + run_metadata: RunMetadataDict, +): + with_disabled_auto_scheduling.assert_called_once() + + mocked_scheduler_api.side_effect = asyncio.sleep(10) + + published_project = await publish_project() + assert published_project.project.prj_owner + await run_new_pipeline( + initialized_app, + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + mocked_scheduler_api.assert_called_once() From 8df0116853ebd3d3e7efa068dbb6b1a0b3b01bc6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:26:50 +0100 Subject: [PATCH 086/127] renaming --- .../modules/comp_scheduler/_scheduler_base.py | 2 +- .../modules/comp_scheduler/_worker.py | 6 +-- .../comp_scheduler/test_scheduler_dask.py | 50 +++++++++---------- .../with_dbs/comp_scheduler/test_worker.py | 36 ++++++++----- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index dd217655316..45ad74ae64b 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -521,7 +521,7 @@ async def _process_completed_tasks( ) -> None: """process tasks from the 3rd party backend""" - async def schedule_pipeline( + async def apply( self, *, user_id: UserID, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 7075a57e744..38b57575142 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -40,14 +40,14 @@ def _unique_key_builder( async def _exclusively_schedule_pipeline( app: FastAPI, *, user_id: UserID, project_id: ProjectID, iteration: Iteration ) -> None: - await _get_scheduler_worker(app).schedule_pipeline( + await _get_scheduler_worker(app).apply( user_id=user_id, project_id=project_id, iteration=iteration, ) -async def _handle_distributed_pipeline(app: FastAPI, data: bytes) -> bool: +async def _handle_apply_distributed_schedule(app: FastAPI, data: bytes) -> bool: with log_context(_logger, logging.DEBUG, msg="handling scheduling"): to_schedule_pipeline = SchedulePipelineRabbitMessage.model_validate_json(data) @@ -65,7 +65,7 @@ async def setup_worker(app: FastAPI) -> None: rabbitmq_client = get_rabbitmq_client(app) await rabbitmq_client.subscribe( SchedulePipelineRabbitMessage.get_channel_name(), - functools.partial(_handle_distributed_pipeline, app), + functools.partial(_handle_apply_distributed_schedule, app), exclusive_queue=False, ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 2db60c05a64..3b64d938288 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -213,7 +213,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] mocked_dask_client.get_tasks_status.side_effect = _return_tasks_pending assert published_project.project.prj_owner - await scheduler.schedule_pipeline( + await scheduler.apply( user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, @@ -273,7 +273,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] mocked_dask_client.get_tasks_status.assert_not_called() mocked_dask_client.get_task_result.assert_not_called() # there is a second run of the scheduler to move comp_runs to pending, the rest does not change - await scheduler.schedule_pipeline( + await scheduler.apply( user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, @@ -464,7 +464,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta ] mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -514,7 +514,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta node_id=exp_started_task.node_id, ) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -591,7 +591,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: return TaskOutputData.model_validate({"out_1": None, "out_2": 45}) mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -702,7 +702,7 @@ async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskSta project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -758,7 +758,7 @@ async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_failed mocked_dask_client.get_task_result.side_effect = None - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -822,7 +822,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta mocked_dask_client.get_task_result.side_effect = _return_random_task_result # trigger the scheduler, it should switch to FAILED, as we are done - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -934,7 +934,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta assert isinstance(mocked_dask_client.send_computation_tasks, mock.Mock) assert isinstance(mocked_dask_client.get_task_result, mock.Mock) mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -984,7 +984,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta node_id=exp_started_task.node_id, ) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1133,7 +1133,7 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( # # Trigger scheduling manually. since the pipeline is broken, it shall be aborted # - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_entry.user_id, project_id=run_entry.project_uuid, iteration=run_entry.iteration, @@ -1273,7 +1273,7 @@ async def test_handling_of_disconnected_scheduler_dask( project_id=published_project.project.uuid, ) # we ensure the scheduler was run - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1291,7 +1291,7 @@ async def test_handling_of_disconnected_scheduler_dask( expected_progress=1, ) # then we have another scheduler run - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1413,7 +1413,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: mocked_dask_client.get_task_result.side_effect = mocked_get_task_result assert running_project.project.prj_owner - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=running_project.project.prj_owner, project_id=running_project.project.uuid, iteration=1, @@ -1522,7 +1522,7 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status # Running the scheduler, should actually cancel the run now - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1571,7 +1571,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: raise TaskCancelledError mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1662,7 +1662,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1678,12 +1678,12 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 3. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, ) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1698,12 +1698,12 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 4. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, ) - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1776,7 +1776,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( published_project.tasks[1], published_project.tasks[3], ] - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1801,7 +1801,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( expected_progress=None, ) # again will trigger the same response - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1880,7 +1880,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( published_project.tasks[1], published_project.tasks[3], ] - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, @@ -1905,7 +1905,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( expected_progress=1.0, ) # again will not re-trigger the call to clusters-keeper - await scheduler_api.schedule_pipeline( + await scheduler_api.apply( user_id=run_in_db.user_id, project_id=run_in_db.project_uuid, iteration=run_in_db.iteration, diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index e4c56b3b88a..9e56af3d4a0 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -76,7 +76,7 @@ async def test_worker_properly_autocalls_scheduler_api( @pytest.fixture async def mocked_scheduler_api(mocker: MockerFixture) -> mock.Mock: return mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler.schedule_pipeline" + "simcore_service_director_v2.modules.comp_scheduler._scheduler_base.BaseCompScheduler.apply" ) @@ -89,16 +89,26 @@ async def test_worker_scheduling_parallelism( ): with_disabled_auto_scheduling.assert_called_once() - mocked_scheduler_api.side_effect = asyncio.sleep(10) - - published_project = await publish_project() - assert published_project.project.prj_owner - await run_new_pipeline( - initialized_app, - user_id=published_project.project.prj_owner, - project_id=published_project.project.uuid, - cluster_id=DEFAULT_CLUSTER_ID, - run_metadata=run_metadata, - use_on_demand_clusters=False, + async def _side_effect(*args, **kwargs): + await asyncio.sleep(10) + + mocked_scheduler_api.side_effect = _side_effect + + async def _project_pipeline_creation_workflow(): + published_project = await publish_project() + assert published_project.project.prj_owner + await run_new_pipeline( + initialized_app, + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + + num_concurrent_calls = 10 + await asyncio.gather( + *(_project_pipeline_creation_workflow() for _ in range(num_concurrent_calls)) ) - mocked_scheduler_api.assert_called_once() + mocked_scheduler_api.assert_called() + assert mocked_scheduler_api.call_count == num_concurrent_calls From 1e781456112a787ce92881791c15d5a4469490fc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:22:39 +0100 Subject: [PATCH 087/127] added rabbitmq queue purger --- .../src/pytest_simcore/rabbit_service.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/rabbit_service.py b/packages/pytest-simcore/src/pytest_simcore/rabbit_service.py index d8dc38feb09..91873a69d08 100644 --- a/packages/pytest-simcore/src/pytest_simcore/rabbit_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/rabbit_service.py @@ -6,11 +6,12 @@ import asyncio import logging from collections.abc import AsyncIterator, Awaitable, Callable +from contextlib import suppress import aio_pika import pytest import tenacity -from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient +from servicelib.rabbitmq import QueueName, RabbitMQClient, RabbitMQRPCClient from settings_library.rabbit import RabbitSettings from tenacity.before_sleep import before_sleep_log from tenacity.stop import stop_after_attempt @@ -131,3 +132,23 @@ async def _creator(client_name: str, *, heartbeat: int = 60) -> RabbitMQRPCClien yield _creator # cleanup, properly close the clients await asyncio.gather(*(client.close() for client in created_clients)) + + +@pytest.fixture +async def ensure_parametrized_queue_is_empty( + create_rabbitmq_client: Callable[[str], RabbitMQClient], queue_name: QueueName +) -> AsyncIterator[None]: + rabbitmq_client = create_rabbitmq_client("pytest-purger") + + async def _queue_messages_purger() -> None: + with suppress(aio_pika.exceptions.ChannelClosed): + assert rabbitmq_client._channel_pool # noqa: SLF001 + async with rabbitmq_client._channel_pool.acquire() as channel: # noqa: SLF001 + assert isinstance(channel, aio_pika.RobustChannel) + queue = await channel.get_queue(queue_name) + await queue.purge() + + await _queue_messages_purger() + yield + # cleanup + await _queue_messages_purger() From b2596bbaf14001ec84ea363380f4cc9a5906d754 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:23:14 +0100 Subject: [PATCH 088/127] cleanup --- .../modules/comp_scheduler/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index 3b5b310ddec..cf3370f4da8 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -31,8 +31,6 @@ async def stop_scheduler() -> None: await shutdown_manager(app) await shutdown_worker(app) - # TODO: we might want to stop anything running in the worker too - return stop_scheduler From efc46f18b5a43600992bfd354c2d66bd34590df4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:23:42 +0100 Subject: [PATCH 089/127] test parallelism --- .../tests/unit/with_dbs/comp_scheduler/test_worker.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index 9e56af3d4a0..9c21d769809 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -18,6 +18,9 @@ from pytest_mock import MockerFixture from simcore_service_director_v2.models.comp_runs import RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._manager import run_new_pipeline +from simcore_service_director_v2.modules.comp_scheduler._models import ( + SchedulePipelineRabbitMessage, +) from simcore_service_director_v2.modules.comp_scheduler._worker import ( _get_scheduler_worker, ) @@ -80,12 +83,16 @@ async def mocked_scheduler_api(mocker: MockerFixture) -> mock.Mock: ) +@pytest.mark.parametrize( + "queue_name", [SchedulePipelineRabbitMessage.get_channel_name()] +) async def test_worker_scheduling_parallelism( with_disabled_auto_scheduling: mock.Mock, mocked_scheduler_api: mock.Mock, initialized_app: FastAPI, publish_project: Callable[[], Awaitable[PublishedProject]], run_metadata: RunMetadataDict, + ensure_parametrized_queue_is_empty: None, ): with_disabled_auto_scheduling.assert_called_once() @@ -94,7 +101,7 @@ async def _side_effect(*args, **kwargs): mocked_scheduler_api.side_effect = _side_effect - async def _project_pipeline_creation_workflow(): + async def _project_pipeline_creation_workflow() -> None: published_project = await publish_project() assert published_project.project.prj_owner await run_new_pipeline( From d90312eb86040273730d9fcef39ba7c7be670fd6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:37:35 +0100 Subject: [PATCH 090/127] added setting to control scheduling concurrency --- .../core/settings.py | 11 ++++---- .../modules/comp_scheduler/_worker.py | 28 +++++++++++++++---- .../with_dbs/comp_scheduler/test_worker.py | 22 ++++++++++++--- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/settings.py b/services/director-v2/src/simcore_service_director_v2/core/settings.py index e64074c6d99..fe0af49fc5c 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/core/settings.py @@ -8,11 +8,7 @@ from common_library.pydantic_validators import validate_numeric_string_as_timedelta from fastapi import FastAPI -from models_library.basic_types import ( - LogLevel, - PortInt, - VersionTag, -) +from models_library.basic_types import LogLevel, PortInt, VersionTag from models_library.clusters import ( DEFAULT_CLUSTER_ID, Cluster, @@ -26,6 +22,7 @@ AnyUrl, Field, NonNegativeInt, + PositiveInt, field_validator, ) from servicelib.logging_utils_filtering import LoggerName, MessageSubstring @@ -77,6 +74,10 @@ class ComputationalBackendSettings(BaseCustomSettings): COMPUTATIONAL_BACKEND_ENABLED: bool = Field( default=True, ) + COMPUTATIONAL_BACKEND_SCHEDULING_CONCURRENCY: PositiveInt = Field( + default=50, + description="defines how many pipelines the application can schedule concurrently", + ) COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED: bool = Field( default=True, ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py index 38b57575142..397b68db0c9 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_worker.py @@ -1,3 +1,4 @@ +import asyncio import contextlib import functools import logging @@ -10,6 +11,7 @@ from servicelib.redis import CouldNotAcquireLockError from servicelib.redis_utils import exclusive +from ...core.settings import get_application_settings from ...models.comp_runs import Iteration from ..rabbitmq import get_rabbitmq_client from ._constants import MODULE_NAME_WORKER @@ -62,11 +64,19 @@ async def _handle_apply_distributed_schedule(app: FastAPI, data: bytes) -> bool: async def setup_worker(app: FastAPI) -> None: + app_settings = get_application_settings(app) rabbitmq_client = get_rabbitmq_client(app) - await rabbitmq_client.subscribe( - SchedulePipelineRabbitMessage.get_channel_name(), - functools.partial(_handle_apply_distributed_schedule, app), - exclusive_queue=False, + app.state.scheduler_worker_consumers = await asyncio.gather( + *( + rabbitmq_client.subscribe( + SchedulePipelineRabbitMessage.get_channel_name(), + functools.partial(_handle_apply_distributed_schedule, app), + exclusive_queue=False, + ) + for _ in range( + app_settings.DIRECTOR_V2_COMPUTATIONAL_BACKEND.COMPUTATIONAL_BACKEND_SCHEDULING_CONCURRENCY + ) + ) ) app.state.scheduler_worker = create_scheduler(app) @@ -74,5 +84,11 @@ async def setup_worker(app: FastAPI) -> None: async def shutdown_worker(app: FastAPI) -> None: assert app.state.scheduler_worker # nosec - # TODO: we might need to cancel stuff here. not sure yet what - # unsubscribing is maybe not a good idea if we want to keep the data in the queue + rabbitmq_client = get_rabbitmq_client(app) + await asyncio.gather( + *( + rabbitmq_client.unsubscribe_consumer(*consumer) + for consumer in app.state.scheduler_worker_consumers + ), + return_exceptions=False, + ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index 9c21d769809..e9bc9e831a8 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -8,7 +8,7 @@ # pylint: disable=too-many-statements import asyncio -from typing import Awaitable, Callable +from collections.abc import Awaitable, Callable from unittest import mock import pytest @@ -16,6 +16,8 @@ from fastapi import FastAPI from models_library.clusters import DEFAULT_CLUSTER_ID from pytest_mock import MockerFixture +from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict +from pytest_simcore.helpers.typing_env import EnvVarsDict from simcore_service_director_v2.models.comp_runs import RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._manager import run_new_pipeline from simcore_service_director_v2.modules.comp_scheduler._models import ( @@ -83,10 +85,23 @@ async def mocked_scheduler_api(mocker: MockerFixture) -> mock.Mock: ) +@pytest.fixture +def with_scheduling_concurrency( + mock_env: EnvVarsDict, monkeypatch: pytest.MonkeyPatch, scheduling_concurrency: int +) -> EnvVarsDict: + return mock_env | setenvs_from_dict( + monkeypatch, + {"COMPUTATIONAL_BACKEND_SCHEDULING_CONCURRENCY": f"{scheduling_concurrency}"}, + ) + + +@pytest.mark.parametrize("scheduling_concurrency", [1, 50, 100]) @pytest.mark.parametrize( "queue_name", [SchedulePipelineRabbitMessage.get_channel_name()] ) async def test_worker_scheduling_parallelism( + scheduling_concurrency: int, + with_scheduling_concurrency: EnvVarsDict, with_disabled_auto_scheduling: mock.Mock, mocked_scheduler_api: mock.Mock, initialized_app: FastAPI, @@ -113,9 +128,8 @@ async def _project_pipeline_creation_workflow() -> None: use_on_demand_clusters=False, ) - num_concurrent_calls = 10 await asyncio.gather( - *(_project_pipeline_creation_workflow() for _ in range(num_concurrent_calls)) + *(_project_pipeline_creation_workflow() for _ in range(scheduling_concurrency)) ) mocked_scheduler_api.assert_called() - assert mocked_scheduler_api.call_count == num_concurrent_calls + assert mocked_scheduler_api.call_count == scheduling_concurrency From a45cfbebe50e7bd90aa62d3932283d9d54e9fa85 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:55:46 +0100 Subject: [PATCH 091/127] ensure unsubscribe consumer is only unsubscribing the right consumer and allow multiple consumers --- packages/service-library/src/servicelib/rabbitmq/_models.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/packages/service-library/src/servicelib/rabbitmq/_models.py b/packages/service-library/src/servicelib/rabbitmq/_models.py index d713edfdc1d..cd674e526ff 100644 --- a/packages/service-library/src/servicelib/rabbitmq/_models.py +++ b/packages/service-library/src/servicelib/rabbitmq/_models.py @@ -18,11 +18,9 @@ class RabbitMessage(Protocol): - def body(self) -> bytes: - ... + def body(self) -> bytes: ... - def routing_key(self) -> str | None: - ... + def routing_key(self) -> str | None: ... class RPCNamespacedMethodName(ConstrainedStr): From 1c4ae483c4bb5aa48f524d93a6152c8d191dbed2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:20:01 +0100 Subject: [PATCH 092/127] fix test after renaming --- .../tests/unit/with_dbs/comp_scheduler/test_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py index e9bc9e831a8..9eb301e0910 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_worker.py @@ -71,7 +71,7 @@ async def test_worker_properly_autocalls_scheduler_api( use_on_demand_clusters=False, ) mocked_get_scheduler_worker.assert_called_once_with(initialized_app) - mocked_get_scheduler_worker.return_value.schedule_pipeline.assert_called_once_with( + mocked_get_scheduler_worker.return_value.apply.assert_called_once_with( user_id=published_project.project.prj_owner, project_id=published_project.project.uuid, iteration=1, From 986cd42df818c966e56c4932291379d8e4fb0354 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:51:25 +0100 Subject: [PATCH 093/127] fixed after new syntax --- .../tests/unit/with_dbs/comp_scheduler/test_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index 2880e91c022..f70070a325f 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -49,7 +49,7 @@ async def scheduler_rabbit_client_parser( ) -> AsyncIterator[mock.AsyncMock]: client = create_rabbitmq_client("scheduling_pytest_consumer") mock = mocker.AsyncMock(return_value=True) - queue_name = await client.subscribe( + queue_name, _ = await client.subscribe( SchedulePipelineRabbitMessage.get_channel_name(), mock, exclusive_queue=False ) yield mock From 7dc96afc41f278885267b99e74d68aa0e199aea2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:53:00 +0100 Subject: [PATCH 094/127] ensure worker marks the scheduling as done --- .../modules/comp_scheduler/_publisher.py | 2 +- .../modules/comp_scheduler/_scheduler_base.py | 14 ++++++++++++++ .../modules/db/repositories/comp_runs.py | 12 +++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py index 5a8bcbc2027..1c7ea23ac43 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_publisher.py @@ -27,6 +27,6 @@ async def request_pipeline_scheduling( iteration=iteration, ), ) - await CompRunsRepository.instance(db_engine).mark_as_scheduled( + await CompRunsRepository.instance(db_engine).mark_for_scheduling( user_id=user_id, project_id=project_id, iteration=iteration ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 45ad74ae64b..8c07bb3ccbb 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -227,6 +227,18 @@ async def _set_run_result( final_state=(run_result in COMPLETED_STATES), ) + async def _set_schedule_done( + self, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + ) -> None: + await CompRunsRepository.instance(self.db_engine).mark_as_scheduled_done( + user_id=user_id, + project_id=project_id, + iteration=iteration, + ) + async def _set_states_following_failed_to_aborted( self, project_id: ProjectID, dag: nx.DiGraph ) -> dict[NodeIDStr, CompTaskAtDB]: @@ -631,6 +643,8 @@ async def apply( ) except ComputationalBackendNotConnectedError: _logger.exception("Computational backend is not connected!") + finally: + await self._set_schedule_done(user_id, project_id, iteration) async def _schedule_tasks_to_stop( self, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index ed4fac45081..bd5132e6658 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -177,7 +177,7 @@ async def mark_for_cancellation( cancelled=arrow.utcnow().datetime, ) - async def mark_as_scheduled( + async def mark_for_scheduling( self, *, user_id: UserID, project_id: ProjectID, iteration: PositiveInt ) -> CompRunsAtDB | None: return await self.update( @@ -186,3 +186,13 @@ async def mark_as_scheduled( iteration, last_scheduled=arrow.utcnow().datetime, ) + + async def mark_as_scheduled_done( + self, *, user_id: UserID, project_id: ProjectID, iteration: PositiveInt + ) -> CompRunsAtDB | None: + return await self.update( + user_id, + project_id, + iteration, + last_scheduled=None, + ) From 34fa851547fb2ac972d306f6bec3db9b9c1d158c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:37:38 +0100 Subject: [PATCH 095/127] manager checks for properly scheduled tasks and lost ones --- .../modules/comp_scheduler/_manager.py | 44 +++++--- .../modules/db/repositories/comp_runs.py | 15 ++- .../with_dbs/comp_scheduler/test_manager.py | 102 +++++++++++++++++- 3 files changed, 137 insertions(+), 24 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index 87558d14296..10549e3f2a3 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -1,4 +1,5 @@ import logging +from typing import Final import networkx as nx from aiopg.sa import Engine @@ -111,6 +112,9 @@ async def _get_pipeline_dag(project_id: ProjectID, db_engine: Engine) -> nx.DiGr return pipeline_at_db.get_graph() +_LOST_TASKS_FACTOR: Final[int] = 10 + + @exclusive( get_redis_client_from_app, lock_key=get_redis_lock_key(MODULE_NAME_SCHEDULER, unique_lock_key_builder=None), @@ -119,23 +123,35 @@ async def schedule_all_pipelines(app: FastAPI) -> None: with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): db_engine = get_db_engine(app) runs_to_schedule = await CompRunsRepository.instance(db_engine).list( - filter_by_state=SCHEDULED_STATES, scheduled_since=SCHEDULER_INTERVAL + filter_by_state=SCHEDULED_STATES, need_scheduling=True ) + possibly_lost_scheduled_pipelines = await CompRunsRepository.instance( + db_engine + ).list( + filter_by_state=SCHEDULED_STATES, + scheduled_since=SCHEDULER_INTERVAL * _LOST_TASKS_FACTOR, + ) + if possibly_lost_scheduled_pipelines: + _logger.error( + "found %d lost pipelines, they will be re-scheduled now", + len(possibly_lost_scheduled_pipelines), + ) rabbitmq_client = get_rabbitmq_client(app) - await limited_gather( - *( - request_pipeline_scheduling( - rabbitmq_client, - db_engine, - user_id=run.user_id, - project_id=run.project_uuid, - iteration=run.iteration, - ) - for run in runs_to_schedule - ), - limit=MAX_CONCURRENT_PIPELINE_SCHEDULING, - ) + with log_context(_logger, logging.DEBUG, msg="distributing pipelines"): + await limited_gather( + *( + request_pipeline_scheduling( + rabbitmq_client, + db_engine, + user_id=run.user_id, + project_id=run.project_uuid, + iteration=run.iteration, + ) + for run in runs_to_schedule + possibly_lost_scheduled_pipelines + ), + limit=MAX_CONCURRENT_PIPELINE_SCHEDULING, + ) if runs_to_schedule: _logger.debug("distributed %d pipelines", len(runs_to_schedule)) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index bd5132e6658..5dce589e3ee 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -56,6 +56,7 @@ async def get( async def list( self, filter_by_state: set[RunningState] | None = None, + need_scheduling: bool | None = None, scheduled_since: datetime.timedelta | None = None, ) -> list[CompRunsAtDB]: @@ -69,15 +70,19 @@ async def list( ] ) ) + + scheduling_or_conditions = [] + if need_scheduling is not None: + scheduling_or_conditions.append(comp_runs.c.last_scheduled.is_(None)) if scheduled_since is not None: scheduled_cutoff = arrow.utcnow().datetime - scheduled_since - conditions.append( - or_( - comp_runs.c.last_scheduled.is_(None), - comp_runs.c.last_scheduled <= scheduled_cutoff, - ) + scheduling_or_conditions.append( + comp_runs.c.last_scheduled <= scheduled_cutoff ) + if scheduling_or_conditions: + conditions.append(sa.or_(*scheduling_or_conditions)) + async with self.db_engine.acquire() as conn: return [ CompRunsAtDB.model_validate(row) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index f70070a325f..e83b6e77687 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -29,6 +29,7 @@ from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import RunMetadataDict from simcore_service_director_v2.modules.comp_scheduler._manager import ( + _LOST_TASKS_FACTOR, SCHEDULER_INTERVAL, run_new_pipeline, schedule_all_pipelines, @@ -37,6 +38,9 @@ from simcore_service_director_v2.modules.comp_scheduler._models import ( SchedulePipelineRabbitMessage, ) +from simcore_service_director_v2.modules.db.repositories.comp_runs import ( + CompRunsRepository, +) from sqlalchemy.ext.asyncio import AsyncEngine pytest_simcore_core_services_selection = ["postgres", "rabbit", "redis"] @@ -141,6 +145,7 @@ async def test_schedule_all_pipelines( initialized_app: FastAPI, published_project: PublishedProject, sqlalchemy_async_engine: AsyncEngine, + aiopg_engine, run_metadata: RunMetadataDict, scheduler_rabbit_client_parser: mock.AsyncMock, ): @@ -164,8 +169,7 @@ async def test_schedule_all_pipelines( ).body() ) scheduler_rabbit_client_parser.reset_mock() - comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) - comp_run = comp_runs[0] + comp_run = (await assert_comp_runs(sqlalchemy_async_engine, expected_total=1))[0] assert comp_run.project_uuid == published_project.project.uuid assert comp_run.user_id == published_project.project.prj_owner assert comp_run.iteration == 1 @@ -177,7 +181,7 @@ async def test_schedule_all_pipelines( start_schedule_time = comp_run.last_scheduled start_modified_time = comp_run.modified - # this will now not schedule the pipeline since it was last scheduled + # this will now not schedule the pipeline since it was already scheduled await schedule_all_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_not_called() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) @@ -186,8 +190,15 @@ async def test_schedule_all_pipelines( assert comp_run.cancelled is None assert comp_run.modified == start_modified_time - # this will now schedule the pipeline since the time passed - await asyncio.sleep(SCHEDULER_INTERVAL.total_seconds() + 1) + # once the worker is done, the schedule time is set back to None + await CompRunsRepository(aiopg_engine).mark_as_scheduled_done( + user_id=comp_run.user_id, + project_id=comp_run.project_uuid, + iteration=comp_run.iteration, + ) + + # now we schedule a pipeline again, but we wait for the scheduler interval to pass + # this will trigger a new schedule await schedule_all_pipelines(initialized_app) scheduler_rabbit_client_parser.assert_called_once_with( SchedulePipelineRabbitMessage( @@ -227,6 +238,87 @@ async def test_schedule_all_pipelines( assert comp_run.cancelled is not None +async def test_schedule_all_pipelines_logs_error_if_it_find_old_pipelines( + with_disabled_auto_scheduling: mock.Mock, + with_disabled_scheduler_worker: mock.Mock, + initialized_app: FastAPI, + published_project: PublishedProject, + sqlalchemy_async_engine: AsyncEngine, + aiopg_engine, + run_metadata: RunMetadataDict, + scheduler_rabbit_client_parser: mock.AsyncMock, + caplog: pytest.LogCaptureFixture, +): + await assert_comp_runs_empty(sqlalchemy_async_engine) + assert published_project.project.prj_owner + # now we schedule a pipeline + await run_new_pipeline( + initialized_app, + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + run_metadata=run_metadata, + use_on_demand_clusters=False, + ) + # this directly schedule a new pipeline + scheduler_rabbit_client_parser.assert_called_once_with( + SchedulePipelineRabbitMessage( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + ).body() + ) + scheduler_rabbit_client_parser.reset_mock() + comp_run = (await assert_comp_runs(sqlalchemy_async_engine, expected_total=1))[0] + assert comp_run.project_uuid == published_project.project.uuid + assert comp_run.user_id == published_project.project.prj_owner + assert comp_run.iteration == 1 + assert comp_run.cancelled is None + assert comp_run.cluster_id == DEFAULT_CLUSTER_ID + assert comp_run.metadata == run_metadata + assert comp_run.result is RunningState.PUBLISHED + assert comp_run.last_scheduled is not None + start_schedule_time = comp_run.last_scheduled + start_modified_time = comp_run.modified + + # this will now not schedule the pipeline since it was already scheduled + await schedule_all_pipelines(initialized_app) + scheduler_rabbit_client_parser.assert_not_called() + comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_run = comp_runs[0] + assert comp_run.last_scheduled == start_schedule_time, "scheduled time changed!" + assert comp_run.cancelled is None + assert comp_run.modified == start_modified_time + + # now we artificially set the last_schedule time well in the past + await CompRunsRepository(aiopg_engine).update( + comp_run.user_id, + comp_run.project_uuid, + comp_run.iteration, + last_scheduled=datetime.datetime.now(tz=datetime.UTC) + - SCHEDULER_INTERVAL * (_LOST_TASKS_FACTOR + 1), + ) + with caplog.at_level(logging.ERROR): + await schedule_all_pipelines(initialized_app) + assert ( + "found 1 lost pipelines, they will be re-scheduled now" in caplog.messages + ) + scheduler_rabbit_client_parser.assert_called_once_with( + SchedulePipelineRabbitMessage( + user_id=published_project.project.prj_owner, + project_id=published_project.project.uuid, + iteration=1, + ).body() + ) + scheduler_rabbit_client_parser.reset_mock() + comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) + comp_run = comp_runs[0] + assert comp_run.last_scheduled is not None + assert comp_run.last_scheduled > start_schedule_time + assert comp_run.cancelled is None + assert comp_run.modified > start_modified_time + + async def test_empty_pipeline_is_not_scheduled( with_disabled_auto_scheduling: mock.Mock, with_disabled_scheduler_worker: mock.Mock, From 7ed3b923549f8d93314e7f8fe7b6e3691fe0f698 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:24:07 +0100 Subject: [PATCH 096/127] changed column names and add processed column --- ...f6d8aa2_added_distributed_comp_scheduler.py} | 17 ++++++++++------- .../models/comp_runs.py | 8 +++++++- .../models/comp_runs.py | 15 ++++++++++----- 3 files changed, 27 insertions(+), 13 deletions(-) rename packages/postgres-database/src/simcore_postgres_database/migration/versions/{da1700f9eceb_added_last_scheduled.py => b7f23f6d8aa2_added_distributed_comp_scheduler.py} (51%) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/da1700f9eceb_added_last_scheduled.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/b7f23f6d8aa2_added_distributed_comp_scheduler.py similarity index 51% rename from packages/postgres-database/src/simcore_postgres_database/migration/versions/da1700f9eceb_added_last_scheduled.py rename to packages/postgres-database/src/simcore_postgres_database/migration/versions/b7f23f6d8aa2_added_distributed_comp_scheduler.py index 7ce24024d00..b1e5bc9f30c 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/da1700f9eceb_added_last_scheduled.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/b7f23f6d8aa2_added_distributed_comp_scheduler.py @@ -1,15 +1,15 @@ -"""added_last_scheduled +"""added_distributed_comp_scheduler -Revision ID: da1700f9eceb +Revision ID: b7f23f6d8aa2 Revises: c9db8bf5091e -Create Date: 2024-11-24 17:11:30.519365+00:00 +Create Date: 2024-11-26 17:06:27.053774+00:00 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. -revision = "da1700f9eceb" +revision = "b7f23f6d8aa2" down_revision = "c9db8bf5091e" branch_labels = None depends_on = None @@ -18,13 +18,16 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### op.add_column( - "comp_runs", - sa.Column("last_scheduled", sa.DateTime(timezone=True), nullable=True), + "comp_runs", sa.Column("scheduled", sa.DateTime(timezone=True), nullable=True) + ) + op.add_column( + "comp_runs", sa.Column("processed", sa.DateTime(timezone=True), nullable=True) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.drop_column("comp_runs", "last_scheduled") + op.drop_column("comp_runs", "processed") + op.drop_column("comp_runs", "scheduled") # ### end Alembic commands ### diff --git a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py index 06d8ea97252..cb657c20801 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py @@ -108,11 +108,17 @@ doc="If filled, when cancellation was requested", ), sa.Column( - "last_scheduled", + "scheduled", sa.DateTime(timezone=True), nullable=True, doc="last time the pipeline was scheduled to be processed", ), + sa.Column( + "processed", + sa.DateTime(timezone=True), + nullable=True, + doc="last time the pipeline was actually processed", + ), sa.Column("metadata", JSONB, nullable=True, doc="the run optional metadata"), sa.Column( "use_on_demand_clusters", diff --git a/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py index f5cd1165cc7..f3fedc6a9f9 100644 --- a/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/models/comp_runs.py @@ -55,7 +55,8 @@ class CompRunsAtDB(BaseModel): cancelled: datetime.datetime | None metadata: RunMetadataDict = RunMetadataDict() use_on_demand_clusters: bool - last_scheduled: datetime.datetime | None + scheduled: datetime.datetime | None + processed: datetime.datetime | None @field_validator("result", mode="before") @classmethod @@ -108,7 +109,8 @@ def convert_null_to_empty_metadata(cls, v): "modified": "2021-03-01T13:07:34.191610", "cancelled": None, "use_on_demand_clusters": False, - "last_scheduled": None, + "scheduled": None, + "processed": None, }, { "run_id": 432, @@ -123,7 +125,8 @@ def convert_null_to_empty_metadata(cls, v): "modified": "2021-03-01T13:07:34.191610", "cancelled": None, "use_on_demand_clusters": False, - "last_scheduled": None, + "scheduled": None, + "processed": None, }, { "run_id": 43243, @@ -145,7 +148,8 @@ def convert_null_to_empty_metadata(cls, v): "some-other-metadata-which-is-an-array": [1, 3, 4], }, "use_on_demand_clusters": False, - "last_scheduled": None, + "scheduled": None, + "processed": None, }, { "run_id": 43243, @@ -161,7 +165,8 @@ def convert_null_to_empty_metadata(cls, v): "cancelled": None, "metadata": None, "use_on_demand_clusters": False, - "last_scheduled": None, + "scheduled": None, + "processed": None, }, ] }, From 94c1ed245d1e28103a1c8856836b02d43f3f676d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:39:40 +0100 Subject: [PATCH 097/127] createing tests for repository --- .../modules/comp_scheduler/_manager.py | 7 ++- .../modules/comp_scheduler/_scheduler_base.py | 2 +- .../modules/db/repositories/comp_runs.py | 29 ++++++----- .../test_db_repositories_comp_runs.py | 51 +++++++++++++++++++ .../with_dbs/comp_scheduler/test_manager.py | 30 +++++------ 5 files changed, 88 insertions(+), 31 deletions(-) create mode 100644 services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index 10549e3f2a3..f134da556f7 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -1,6 +1,7 @@ import logging from typing import Final +import arrow import networkx as nx from aiopg.sa import Engine from fastapi import FastAPI @@ -123,13 +124,15 @@ async def schedule_all_pipelines(app: FastAPI) -> None: with log_context(_logger, logging.DEBUG, msg="scheduling pipelines"): db_engine = get_db_engine(app) runs_to_schedule = await CompRunsRepository.instance(db_engine).list( - filter_by_state=SCHEDULED_STATES, need_scheduling=True + filter_by_state=SCHEDULED_STATES, + never_scheduled=True, + processed_before=arrow.utcnow().datetime - SCHEDULER_INTERVAL, ) possibly_lost_scheduled_pipelines = await CompRunsRepository.instance( db_engine ).list( filter_by_state=SCHEDULED_STATES, - scheduled_since=SCHEDULER_INTERVAL * _LOST_TASKS_FACTOR, + scheduled_after=SCHEDULER_INTERVAL * _LOST_TASKS_FACTOR, ) if possibly_lost_scheduled_pipelines: _logger.error( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index 8c07bb3ccbb..c7e732579d9 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -233,7 +233,7 @@ async def _set_schedule_done( project_id: ProjectID, iteration: Iteration, ) -> None: - await CompRunsRepository.instance(self.db_engine).mark_as_scheduled_done( + await CompRunsRepository.instance(self.db_engine).mark_scheduling_done( user_id=user_id, project_id=project_id, iteration=iteration, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 5dce589e3ee..9bd922ae29a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -55,11 +55,12 @@ async def get( async def list( self, + *, filter_by_state: set[RunningState] | None = None, - need_scheduling: bool | None = None, - scheduled_since: datetime.timedelta | None = None, + never_scheduled: bool = False, + processed_before: datetime.datetime | None = None, + scheduled_after: datetime.timedelta | None = None, ) -> list[CompRunsAtDB]: - conditions = [] if filter_by_state: conditions.append( @@ -72,13 +73,14 @@ async def list( ) scheduling_or_conditions = [] - if need_scheduling is not None: - scheduling_or_conditions.append(comp_runs.c.last_scheduled.is_(None)) - if scheduled_since is not None: - scheduled_cutoff = arrow.utcnow().datetime - scheduled_since - scheduling_or_conditions.append( - comp_runs.c.last_scheduled <= scheduled_cutoff - ) + if never_scheduled: + scheduling_or_conditions.append(comp_runs.c.scheduled.is_(None)) + if scheduled_after is not None: + scheduled_cutoff = arrow.utcnow().datetime - scheduled_after + scheduling_or_conditions.append(comp_runs.c.scheduled <= scheduled_cutoff) + + if processed_before is not None: + scheduling_or_conditions.append(comp_runs.c.processed <= processed_before) if scheduling_or_conditions: conditions.append(sa.or_(*scheduling_or_conditions)) @@ -189,15 +191,16 @@ async def mark_for_scheduling( user_id, project_id, iteration, - last_scheduled=arrow.utcnow().datetime, + scheduled=arrow.utcnow().datetime, + processed=None, ) - async def mark_as_scheduled_done( + async def mark_scheduling_done( self, *, user_id: UserID, project_id: ProjectID, iteration: PositiveInt ) -> CompRunsAtDB | None: return await self.update( user_id, project_id, iteration, - last_scheduled=None, + processed=arrow.utcnow().datetime, ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py new file mode 100644 index 00000000000..c90bcf5a568 --- /dev/null +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -0,0 +1,51 @@ +import pytest +from models_library.projects import ProjectID +from models_library.users import UserID +from simcore_service_director_v2.core.errors import ComputationalRunNotFoundError +from simcore_service_director_v2.modules.db.repositories.comp_runs import ( + CompRunsRepository, +) + +pytest_simcore_core_services_selection = [ + "postgres", +] +pytest_simcore_ops_services_selection = [ + "adminer", +] + + +async def test_get(aiopg_engine, user_id: UserID, project_id: ProjectID): + with pytest.raises(ComputationalRunNotFoundError): + await CompRunsRepository(aiopg_engine).get(user_id, project_id) + + +async def test_list(): + ... + + +async def test_create(): + ... + + +async def test_update(): + ... + + +async def test_delete(): + ... + + +async def test_set_run_result(): + ... + + +async def test_mark_for_cancellation(): + ... + + +async def test_mark_for_scheduling(): + ... + + +async def test_mark_scheduling_done(): + ... diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index e83b6e77687..169989023a2 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -177,8 +177,8 @@ async def test_schedule_all_pipelines( assert comp_run.cluster_id == DEFAULT_CLUSTER_ID assert comp_run.metadata == run_metadata assert comp_run.result is RunningState.PUBLISHED - assert comp_run.last_scheduled is not None - start_schedule_time = comp_run.last_scheduled + assert comp_run.scheduled is not None + start_schedule_time = comp_run.scheduled start_modified_time = comp_run.modified # this will now not schedule the pipeline since it was already scheduled @@ -186,12 +186,12 @@ async def test_schedule_all_pipelines( scheduler_rabbit_client_parser.assert_not_called() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] - assert comp_run.last_scheduled == start_schedule_time, "scheduled time changed!" + assert comp_run.scheduled == start_schedule_time, "scheduled time changed!" assert comp_run.cancelled is None assert comp_run.modified == start_modified_time # once the worker is done, the schedule time is set back to None - await CompRunsRepository(aiopg_engine).mark_as_scheduled_done( + await CompRunsRepository(aiopg_engine).mark_scheduling_done( user_id=comp_run.user_id, project_id=comp_run.project_uuid, iteration=comp_run.iteration, @@ -210,9 +210,9 @@ async def test_schedule_all_pipelines( scheduler_rabbit_client_parser.reset_mock() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] - assert comp_run.last_scheduled is not None - assert comp_run.last_scheduled > start_schedule_time - last_schedule_time = comp_run.last_scheduled + assert comp_run.scheduled is not None + assert comp_run.scheduled > start_schedule_time + last_schedule_time = comp_run.scheduled assert comp_run.cancelled is None assert comp_run.modified > start_modified_time @@ -233,8 +233,8 @@ async def test_schedule_all_pipelines( scheduler_rabbit_client_parser.reset_mock() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] - assert comp_run.last_scheduled is not None - assert comp_run.last_scheduled > last_schedule_time + assert comp_run.scheduled is not None + assert comp_run.scheduled > last_schedule_time assert comp_run.cancelled is not None @@ -277,8 +277,8 @@ async def test_schedule_all_pipelines_logs_error_if_it_find_old_pipelines( assert comp_run.cluster_id == DEFAULT_CLUSTER_ID assert comp_run.metadata == run_metadata assert comp_run.result is RunningState.PUBLISHED - assert comp_run.last_scheduled is not None - start_schedule_time = comp_run.last_scheduled + assert comp_run.scheduled is not None + start_schedule_time = comp_run.scheduled start_modified_time = comp_run.modified # this will now not schedule the pipeline since it was already scheduled @@ -286,7 +286,7 @@ async def test_schedule_all_pipelines_logs_error_if_it_find_old_pipelines( scheduler_rabbit_client_parser.assert_not_called() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] - assert comp_run.last_scheduled == start_schedule_time, "scheduled time changed!" + assert comp_run.scheduled == start_schedule_time, "scheduled time changed!" assert comp_run.cancelled is None assert comp_run.modified == start_modified_time @@ -295,7 +295,7 @@ async def test_schedule_all_pipelines_logs_error_if_it_find_old_pipelines( comp_run.user_id, comp_run.project_uuid, comp_run.iteration, - last_scheduled=datetime.datetime.now(tz=datetime.UTC) + scheduled=datetime.datetime.now(tz=datetime.UTC) - SCHEDULER_INTERVAL * (_LOST_TASKS_FACTOR + 1), ) with caplog.at_level(logging.ERROR): @@ -313,8 +313,8 @@ async def test_schedule_all_pipelines_logs_error_if_it_find_old_pipelines( scheduler_rabbit_client_parser.reset_mock() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] - assert comp_run.last_scheduled is not None - assert comp_run.last_scheduled > start_schedule_time + assert comp_run.scheduled is not None + assert comp_run.scheduled > start_schedule_time assert comp_run.cancelled is None assert comp_run.modified > start_modified_time From 47c4afff6507568af007f3b2257fd7b92ac8cb24 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:50:22 +0100 Subject: [PATCH 098/127] moving to asyncengine --- .../tests/unit/with_dbs/conftest.py | 61 ++++++++++--------- .../with_dbs/test_api_route_computations.py | 16 +++-- .../test_api_route_computations_tasks.py | 6 +- 3 files changed, 46 insertions(+), 37 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index 3b2ac6c8a53..1eab33c31e2 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -7,7 +7,7 @@ import datetime from collections.abc import Awaitable, Callable, Iterator -from typing import Any, cast +from typing import Any, AsyncIterator, cast from uuid import uuid4 import arrow @@ -36,6 +36,7 @@ from simcore_service_director_v2.utils.dask import generate_dask_job_id from simcore_service_director_v2.utils.db import to_clusters_db from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import AsyncEngine @pytest.fixture @@ -75,12 +76,12 @@ def creator(**pipeline_kwargs) -> CompPipelineAtDB: @pytest.fixture -def tasks( - postgres_db: sa.engine.Engine, -) -> Iterator[Callable[..., list[CompTaskAtDB]]]: +async def create_tasks( + sqlalchemy_async_engine: AsyncEngine, +) -> AsyncIterator[Callable[..., Awaitable[list[CompTaskAtDB]]]]: created_task_ids: list[int] = [] - def creator( + async def creator( user: dict[str, Any], project: ProjectAtDB, **overrides_kwargs ) -> list[CompTaskAtDB]: created_tasks: list[CompTaskAtDB] = [] @@ -132,8 +133,8 @@ def creator( ), } task_config.update(**overrides_kwargs) - with postgres_db.connect() as conn: - result = conn.execute( + async with sqlalchemy_async_engine.connect() as conn: + result = await conn.execute( comp_tasks.insert() .values(**task_config) .returning(sa.literal_column("*")) @@ -146,8 +147,8 @@ def creator( yield creator # cleanup - with postgres_db.connect() as conn: - conn.execute( + async with sqlalchemy_async_engine.connect() as conn: + await conn.execute( comp_tasks.delete().where(comp_tasks.c.task_id.in_(created_task_ids)) ) @@ -186,12 +187,12 @@ def run_metadata( @pytest.fixture -def runs( - postgres_db: sa.engine.Engine, run_metadata: RunMetadataDict -) -> Iterator[Callable[..., CompRunsAtDB]]: +async def create_comp_run( + sqlalchemy_async_engine: AsyncEngine, run_metadata: RunMetadataDict +) -> AsyncIterator[Callable[..., Awaitable[CompRunsAtDB]]]: created_run_ids: list[int] = [] - def creator( + async def _( user: dict[str, Any], project: ProjectAtDB, **run_kwargs ) -> CompRunsAtDB: run_config = { @@ -203,8 +204,8 @@ def creator( "use_on_demand_clusters": False, } run_config.update(**run_kwargs) - with postgres_db.connect() as conn: - result = conn.execute( + async with sqlalchemy_async_engine.connect() as conn: + result = await conn.execute( comp_runs.insert() .values(**jsonable_encoder(run_config)) .returning(sa.literal_column("*")) @@ -213,11 +214,13 @@ def creator( created_run_ids.append(new_run.run_id) return new_run - yield creator + yield _ # cleanup - with postgres_db.connect() as conn: - conn.execute(comp_runs.delete().where(comp_runs.c.run_id.in_(created_run_ids))) + async with sqlalchemy_async_engine.connect() as conn: + await conn.execute( + comp_runs.delete().where(comp_runs.c.run_id.in_(created_run_ids)) + ) @pytest.fixture @@ -299,7 +302,7 @@ async def publish_project( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], + create_tasks: Callable[..., list[CompTaskAtDB]], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], ) -> Callable[[], Awaitable[PublishedProject]]: @@ -313,7 +316,9 @@ async def _() -> PublishedProject: project_id=f"{created_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ), - tasks=tasks(user=user, project=created_project, state=StateType.PUBLISHED), + tasks=create_tasks( + user=user, project=created_project, state=StateType.PUBLISHED + ), ) return _ @@ -331,8 +336,8 @@ async def running_project( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], - runs: Callable[..., CompRunsAtDB], + create_tasks: Callable[..., list[CompTaskAtDB]], + create_comp_run: Callable[..., CompRunsAtDB], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], ) -> RunningProject: @@ -345,14 +350,14 @@ async def running_project( project_id=f"{created_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ), - tasks=tasks( + tasks=create_tasks( user=user, project=created_project, state=StateType.RUNNING, progress=0.0, start=now_time, ), - runs=runs( + runs=create_comp_run( user=user, project=created_project, started=now_time, @@ -367,8 +372,8 @@ async def running_project_mark_for_cancellation( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], - runs: Callable[..., CompRunsAtDB], + create_tasks: Callable[..., list[CompTaskAtDB]], + create_comp_run: Callable[..., CompRunsAtDB], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], ) -> RunningProject: @@ -381,14 +386,14 @@ async def running_project_mark_for_cancellation( project_id=f"{created_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ), - tasks=tasks( + tasks=create_tasks( user=user, project=created_project, state=StateType.RUNNING, progress=0.0, start=now_time, ), - runs=runs( + runs=create_comp_run( user=user, project=created_project, result=StateType.RUNNING, diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py index add9c4d77d3..92a142f1cce 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py @@ -918,7 +918,7 @@ async def test_get_computation_from_not_started_computation_task( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], async_client: httpx.AsyncClient, ): user = registered_user() @@ -935,7 +935,7 @@ async def test_get_computation_from_not_started_computation_task( assert response.status_code == status.HTTP_409_CONFLICT, response.text # now create the expected tasks and the state is good again - comp_tasks = tasks(user=user, project=proj) + comp_tasks = await create_tasks(user=user, project=proj) response = await async_client.get(get_computation_url) assert response.status_code == status.HTTP_200_OK, response.text returned_computation = ComputationGet.model_validate(response.json()) @@ -990,8 +990,8 @@ async def test_get_computation_from_published_computation_task( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], - runs: Callable[..., CompRunsAtDB], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], + create_comp_run: Callable[..., Awaitable[CompRunsAtDB]], async_client: httpx.AsyncClient, ): user = registered_user() @@ -1000,8 +1000,12 @@ async def test_get_computation_from_published_computation_task( project_id=proj.uuid, dag_adjacency_list=fake_workbench_adjacency, ) - comp_tasks = tasks(user=user, project=proj, state=StateType.PUBLISHED, progress=0) - comp_runs = runs(user=user, project=proj, result=StateType.PUBLISHED) + comp_tasks = await create_tasks( + user=user, project=proj, state=StateType.PUBLISHED, progress=0 + ) + comp_runs = await create_comp_run( + user=user, project=proj, result=StateType.PUBLISHED + ) assert comp_runs get_computation_url = httpx.URL( f"/v2/computations/{proj.uuid}?user_id={user['id']}" diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py index 10bd1ba3a2f..7816c024ffc 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py @@ -9,7 +9,6 @@ from uuid import uuid4 import httpx -from pydantic import TypeAdapter import pytest from faker import Faker from fastapi import FastAPI, status @@ -22,6 +21,7 @@ from models_library.projects import ProjectAtDB, ProjectID from models_library.projects_nodes_io import NodeID from models_library.users import UserID +from pydantic import TypeAdapter from pytest_simcore.helpers.monkeypatch_envs import setenvs_from_dict from pytest_simcore.helpers.typing_env import EnvVarsDict from simcore_service_director_v2.core.settings import AppSettings @@ -117,7 +117,7 @@ async def project_id( user: dict[str, Any], project: Callable[..., Awaitable[ProjectAtDB]], pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], ): """project uuid of a saved project (w/ tasks up-to-date)""" @@ -130,7 +130,7 @@ async def project_id( dag_adjacency_list=fake_workbench_adjacency, ) # insert tasks -> comp_tasks - comp_tasks = tasks(user=user, project=proj) + comp_tasks = await create_tasks(user=user, project=proj) return proj.uuid From c9b85af7d4b82bd6fa8ee8cce04de4df19bf8c94 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 21:54:51 +0100 Subject: [PATCH 099/127] moving to asyncengine --- .../with_dbs/comp_scheduler/test_manager.py | 4 +- .../comp_scheduler/test_scheduler_dask.py | 4 +- .../tests/unit/with_dbs/conftest.py | 95 ++++++++++--------- .../unit/with_dbs/test_api_route_clusters.py | 40 ++++---- .../test_api_route_clusters_details.py | 6 +- .../with_dbs/test_api_route_computations.py | 18 ++-- .../test_api_route_computations_tasks.py | 6 +- .../unit/with_dbs/test_utils_rabbitmq.py | 10 +- 8 files changed, 95 insertions(+), 88 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index 169989023a2..d3ae2726157 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -325,7 +325,7 @@ async def test_empty_pipeline_is_not_scheduled( initialized_app: FastAPI, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], run_metadata: RunMetadataDict, sqlalchemy_async_engine: AsyncEngine, scheduler_rabbit_client_parser: mock.AsyncMock, @@ -349,7 +349,7 @@ async def test_empty_pipeline_is_not_scheduled( scheduler_rabbit_client_parser.assert_not_called() # create the empty pipeline now - pipeline(project_id=f"{empty_project.uuid}") + await create_pipeline(project_id=f"{empty_project.uuid}") # creating a run with an empty pipeline is useless, check the scheduler is not kicking in with caplog.at_level(logging.WARNING): diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index 3b64d938288..f453da7ee28 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -1091,7 +1091,7 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( scheduler_api: BaseCompScheduler, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], sqlalchemy_async_engine: AsyncEngine, @@ -1101,7 +1101,7 @@ async def test_broken_pipeline_configuration_is_not_scheduled_and_aborted( It shall be aborted and shown as such in the comp_runs db""" user = registered_user() sleepers_project = await project(user, workbench=fake_workbench_without_outputs) - pipeline( + await create_pipeline( project_id=f"{sleepers_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ) diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index 1eab33c31e2..76e900c5131 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -6,8 +6,8 @@ import datetime -from collections.abc import Awaitable, Callable, Iterator -from typing import Any, AsyncIterator, cast +from collections.abc import AsyncIterator, Awaitable, Callable +from typing import Any, cast from uuid import uuid4 import arrow @@ -40,20 +40,20 @@ @pytest.fixture -def pipeline( - postgres_db: sa.engine.Engine, -) -> Iterator[Callable[..., CompPipelineAtDB]]: +async def create_pipeline( + sqlalchemy_async_engine: AsyncEngine, +) -> AsyncIterator[Callable[..., Awaitable[CompPipelineAtDB]]]: created_pipeline_ids: list[str] = [] - def creator(**pipeline_kwargs) -> CompPipelineAtDB: + async def _(**pipeline_kwargs) -> CompPipelineAtDB: pipeline_config = { "project_id": f"{uuid4()}", "dag_adjacency_list": {}, "state": StateType.NOT_STARTED, } pipeline_config.update(**pipeline_kwargs) - with postgres_db.begin() as conn: - result = conn.execute( + async with sqlalchemy_async_engine.begin() as conn: + result = await conn.execute( comp_pipeline.insert() .values(**pipeline_config) .returning(sa.literal_column("*")) @@ -64,11 +64,11 @@ def creator(**pipeline_kwargs) -> CompPipelineAtDB: created_pipeline_ids.append(f"{new_pipeline.project_id}") return new_pipeline - yield creator + yield _ # cleanup - with postgres_db.connect() as conn: - conn.execute( + async with sqlalchemy_async_engine.connect() as conn: + await conn.execute( comp_pipeline.delete().where( comp_pipeline.c.project_id.in_(created_pipeline_ids) ) @@ -81,7 +81,7 @@ async def create_tasks( ) -> AsyncIterator[Callable[..., Awaitable[list[CompTaskAtDB]]]]: created_task_ids: list[int] = [] - async def creator( + async def _( user: dict[str, Any], project: ProjectAtDB, **overrides_kwargs ) -> list[CompTaskAtDB]: created_tasks: list[CompTaskAtDB] = [] @@ -144,7 +144,7 @@ async def creator( created_task_ids.extend([t.task_id for t in created_tasks if t.task_id]) return created_tasks - yield creator + yield _ # cleanup async with sqlalchemy_async_engine.connect() as conn: @@ -224,29 +224,37 @@ async def _( @pytest.fixture -def cluster( - postgres_db: sa.engine.Engine, -) -> Iterator[Callable[..., Cluster]]: +async def create_cluster( + sqlalchemy_async_engine: AsyncEngine, +) -> AsyncIterator[Callable[..., Awaitable[Cluster]]]: created_cluster_ids: list[str] = [] - def creator(user: dict[str, Any], **cluster_kwargs) -> Cluster: + async def _(user: dict[str, Any], **cluster_kwargs) -> Cluster: + assert "json_schema_extra" in Cluster.model_config + assert isinstance(Cluster.model_config["json_schema_extra"], dict) + assert isinstance(Cluster.model_config["json_schema_extra"]["examples"], list) + assert isinstance( + Cluster.model_config["json_schema_extra"]["examples"][1], dict + ) cluster_config = Cluster.model_config["json_schema_extra"]["examples"][1] cluster_config["owner"] = user["primary_gid"] cluster_config.update(**cluster_kwargs) new_cluster = Cluster.model_validate(cluster_config) assert new_cluster - with postgres_db.connect() as conn: + async with sqlalchemy_async_engine.connect() as conn: # insert basic cluster - created_cluster = conn.execute( - sa.insert(clusters) - .values(to_clusters_db(new_cluster, only_update=False)) - .returning(sa.literal_column("*")) + created_cluster = ( + await conn.execute( + sa.insert(clusters) + .values(to_clusters_db(new_cluster, only_update=False)) + .returning(sa.literal_column("*")) + ) ).one() created_cluster_ids.append(created_cluster.id) if "access_rights" in cluster_kwargs: for gid, rights in cluster_kwargs["access_rights"].items(): - conn.execute( + await conn.execute( pg_insert(cluster_to_groups) .values( cluster_id=created_cluster.id, @@ -259,7 +267,7 @@ def creator(user: dict[str, Any], **cluster_kwargs) -> Cluster: ) ) access_rights_in_db = {} - for row in conn.execute( + for row in await conn.execute( sa.select( cluster_to_groups.c.gid, cluster_to_groups.c.read, @@ -287,12 +295,11 @@ def creator(user: dict[str, Any], **cluster_kwargs) -> Cluster: thumbnail=None, ) - yield creator + yield _ # cleanup - with postgres_db.connect() as conn: - conn.execute( - # pylint: disable=no-value-for-parameter + async with sqlalchemy_async_engine.connect() as conn: + await conn.execute( clusters.delete().where(clusters.c.id.in_(created_cluster_ids)) ) @@ -301,8 +308,8 @@ def creator(user: dict[str, Any], **cluster_kwargs) -> Cluster: async def publish_project( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - create_tasks: Callable[..., list[CompTaskAtDB]], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], ) -> Callable[[], Awaitable[PublishedProject]]: @@ -312,11 +319,11 @@ async def _() -> PublishedProject: created_project = await project(user, workbench=fake_workbench_without_outputs) return PublishedProject( project=created_project, - pipeline=pipeline( + pipeline=await create_pipeline( project_id=f"{created_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ), - tasks=create_tasks( + tasks=await create_tasks( user=user, project=created_project, state=StateType.PUBLISHED ), ) @@ -335,9 +342,9 @@ async def published_project( async def running_project( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - create_tasks: Callable[..., list[CompTaskAtDB]], - create_comp_run: Callable[..., CompRunsAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], + create_comp_run: Callable[..., Awaitable[CompRunsAtDB]], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], ) -> RunningProject: @@ -346,18 +353,18 @@ async def running_project( now_time = arrow.utcnow().datetime return RunningProject( project=created_project, - pipeline=pipeline( + pipeline=await create_pipeline( project_id=f"{created_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ), - tasks=create_tasks( + tasks=await create_tasks( user=user, project=created_project, state=StateType.RUNNING, progress=0.0, start=now_time, ), - runs=create_comp_run( + runs=await create_comp_run( user=user, project=created_project, started=now_time, @@ -371,9 +378,9 @@ async def running_project( async def running_project_mark_for_cancellation( registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], - create_tasks: Callable[..., list[CompTaskAtDB]], - create_comp_run: Callable[..., CompRunsAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], + create_comp_run: Callable[..., Awaitable[CompRunsAtDB]], fake_workbench_without_outputs: dict[str, Any], fake_workbench_adjacency: dict[str, Any], ) -> RunningProject: @@ -382,18 +389,18 @@ async def running_project_mark_for_cancellation( now_time = arrow.utcnow().datetime return RunningProject( project=created_project, - pipeline=pipeline( + pipeline=await create_pipeline( project_id=f"{created_project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ), - tasks=create_tasks( + tasks=await create_tasks( user=user, project=created_project, state=StateType.RUNNING, progress=0.0, start=now_time, ), - runs=create_comp_run( + runs=await create_comp_run( user=user, project=created_project, result=StateType.RUNNING, diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_clusters.py b/services/director-v2/tests/unit/with_dbs/test_api_route_clusters.py index 19ab0ea2df3..9f55e71f935 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_clusters.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_clusters.py @@ -4,7 +4,7 @@ import random from collections.abc import Callable, Iterator -from typing import Any +from typing import Any, Awaitable import httpx import pytest @@ -85,7 +85,7 @@ def clusters_cleaner(postgres_db: sa.engine.Engine) -> Iterator: async def test_list_clusters( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], async_client: httpx.AsyncClient, ): user_1 = registered_user() @@ -106,7 +106,7 @@ async def test_list_clusters( # let's create some clusters NUM_CLUSTERS = 111 for n in range(NUM_CLUSTERS): - cluster(user_1, name=f"pytest cluster{n:04}") + await create_cluster(user_1, name=f"pytest cluster{n:04}") response = await async_client.get(list_clusters_url) assert response.status_code == status.HTTP_200_OK @@ -141,7 +141,7 @@ async def test_list_clusters( (CLUSTER_MANAGER_RIGHTS, "manager rights"), (CLUSTER_ADMIN_RIGHTS, "admin rights"), ]: - cluster( + await create_cluster( user_1, # cluster is owned by user_1 name=f"cluster with {name}", access_rights={ @@ -172,7 +172,7 @@ async def test_list_clusters( async def test_get_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], async_client: httpx.AsyncClient, ): user_1 = registered_user() @@ -183,7 +183,7 @@ async def test_get_cluster( assert response.status_code == status.HTTP_404_NOT_FOUND # let's create some clusters a_bunch_of_clusters = [ - cluster(user_1, name=f"pytest cluster{n:04}") for n in range(111) + await create_cluster(user_1, name=f"pytest cluster{n:04}") for n in range(111) ] the_cluster = random.choice(a_bunch_of_clusters) @@ -213,7 +213,7 @@ async def test_get_cluster( (CLUSTER_MANAGER_RIGHTS, True), (CLUSTER_ADMIN_RIGHTS, True), ]: - a_cluster = cluster( + a_cluster = await create_cluster( user_2, # cluster is owned by user_2 access_rights={ user_2["primary_gid"]: CLUSTER_ADMIN_RIGHTS, @@ -243,7 +243,7 @@ async def test_get_cluster( async def test_get_another_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], async_client: httpx.AsyncClient, cluster_sharing_rights: ClusterAccessRights, can_use: bool, @@ -252,7 +252,7 @@ async def test_get_another_cluster( user_2 = registered_user() # let's create some clusters a_bunch_of_clusters = [ - cluster( + await create_cluster( user_1, name=f"pytest cluster{n:04}", access_rights={ @@ -349,7 +349,7 @@ async def test_create_cluster( async def test_update_own_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], cluster_simple_authentication: Callable, async_client: httpx.AsyncClient, faker: Faker, @@ -366,7 +366,7 @@ async def test_update_own_cluster( assert response.status_code == status.HTTP_404_NOT_FOUND # let's create some clusters a_bunch_of_clusters = [ - cluster(user_1, name=f"pytest cluster{n:04}") for n in range(111) + await create_cluster(user_1, name=f"pytest cluster{n:04}") for n in range(111) ] the_cluster = random.choice(a_bunch_of_clusters) # get the original one @@ -471,7 +471,7 @@ async def test_update_own_cluster( async def test_update_default_cluster_fails( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], cluster_simple_authentication: Callable, async_client: httpx.AsyncClient, faker: Faker, @@ -506,7 +506,7 @@ async def test_update_default_cluster_fails( async def test_update_another_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], cluster_simple_authentication: Callable, async_client: httpx.AsyncClient, faker: Faker, @@ -522,7 +522,7 @@ async def test_update_another_cluster( user_2 = registered_user() # let's create some clusters a_bunch_of_clusters = [ - cluster( + await create_cluster( user_1, name=f"pytest cluster{n:04}", access_rights={ @@ -603,13 +603,13 @@ async def test_update_another_cluster( async def test_delete_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], async_client: httpx.AsyncClient, ): user_1 = registered_user() # let's create some clusters a_bunch_of_clusters = [ - cluster( + await create_cluster( user_1, name=f"pytest cluster{n:04}", access_rights={ @@ -647,7 +647,7 @@ async def test_delete_cluster( async def test_delete_another_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], cluster_simple_authentication: Callable, async_client: httpx.AsyncClient, faker: Faker, @@ -658,7 +658,7 @@ async def test_delete_another_cluster( user_2 = registered_user() # let's create some clusters a_bunch_of_clusters = [ - cluster( + await create_cluster( user_1, name=f"pytest cluster{n:04}", access_rights={ @@ -754,7 +754,7 @@ async def test_ping_cluster( async def test_ping_specific_cluster( clusters_config: None, registered_user: Callable[..., dict], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], async_client: httpx.AsyncClient, local_dask_gateway_server: DaskGatewayServer, ): @@ -767,7 +767,7 @@ async def test_ping_specific_cluster( # let's create some clusters and ping one a_bunch_of_clusters = [ - cluster( + await create_cluster( user_1, name=f"pytest cluster{n:04}", endpoint=local_dask_gateway_server.address, diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_clusters_details.py b/services/director-v2/tests/unit/with_dbs/test_api_route_clusters_details.py index 5dd1abaa594..357f3b7647a 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_clusters_details.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_clusters_details.py @@ -4,7 +4,7 @@ import json from collections.abc import Callable -from typing import Any +from typing import Any, Awaitable import httpx import pytest @@ -142,14 +142,14 @@ async def test_get_cluster_details( registered_user: Callable[..., dict[str, Any]], async_client: httpx.AsyncClient, local_dask_gateway_server: DaskGatewayServer, - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], dask_gateway_cluster: GatewayCluster, dask_gateway_cluster_client: DaskClient, gateway_username: str, ): user_1 = registered_user() # define the cluster in the DB - some_cluster = cluster( + some_cluster = await create_cluster( user_1, endpoint=local_dask_gateway_server.address, authentication=SimpleAuthentication( diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py index 92a142f1cce..3c7076b0aec 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py @@ -789,12 +789,12 @@ async def test_start_computation_with_deprecated_services_raises_406( @pytest.fixture -def unusable_cluster( +async def unusable_cluster( registered_user: Callable[..., dict[str, Any]], - cluster: Callable[..., Cluster], + create_cluster: Callable[..., Awaitable[Cluster]], ) -> ClusterID: user = registered_user() - created_cluster = cluster(user) + created_cluster = await create_cluster(user) return created_cluster.id @@ -865,7 +865,7 @@ async def test_get_computation_from_empty_project( fake_workbench_adjacency: dict[str, Any], registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], faker: Faker, async_client: httpx.AsyncClient, ): @@ -884,7 +884,7 @@ async def test_get_computation_from_empty_project( response = await async_client.get(get_computation_url) assert response.status_code == status.HTTP_404_NOT_FOUND, response.text # create an empty pipeline - pipeline( + await create_pipeline( project_id=proj.uuid, ) response = await async_client.get(get_computation_url) @@ -917,7 +917,7 @@ async def test_get_computation_from_not_started_computation_task( fake_workbench_adjacency: dict[str, Any], registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], async_client: httpx.AsyncClient, ): @@ -926,7 +926,7 @@ async def test_get_computation_from_not_started_computation_task( get_computation_url = httpx.URL( f"/v2/computations/{proj.uuid}?user_id={user['id']}" ) - pipeline( + await create_pipeline( project_id=proj.uuid, dag_adjacency_list=fake_workbench_adjacency, ) @@ -989,14 +989,14 @@ async def test_get_computation_from_published_computation_task( fake_workbench_adjacency: dict[str, Any], registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], create_comp_run: Callable[..., Awaitable[CompRunsAtDB]], async_client: httpx.AsyncClient, ): user = registered_user() proj = await project(user, workbench=fake_workbench_without_outputs) - pipeline( + await create_pipeline( project_id=proj.uuid, dag_adjacency_list=fake_workbench_adjacency, ) diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py index 7816c024ffc..80f329658b5 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py @@ -116,16 +116,16 @@ async def project_id( fake_workbench_adjacency: dict[str, Any], user: dict[str, Any], project: Callable[..., Awaitable[ProjectAtDB]], - pipeline: Callable[..., CompPipelineAtDB], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], -): +) -> ProjectID: """project uuid of a saved project (w/ tasks up-to-date)""" # insert project -> db proj = await project(user, workbench=fake_workbench_without_outputs) # insert pipeline -> comp_pipeline - pipeline( + await create_pipeline( project_id=proj.uuid, dag_adjacency_list=fake_workbench_adjacency, ) diff --git a/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py b/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py index a041f70ecc7..32bfe4d52ff 100644 --- a/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py +++ b/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py @@ -93,18 +93,18 @@ async def project( @pytest.fixture -def tasks( +async def tasks( user: dict[str, Any], project: ProjectAtDB, fake_workbench_adjacency: dict[str, Any], - pipeline: Callable[..., CompPipelineAtDB], - tasks: Callable[..., list[CompTaskAtDB]], + create_pipeline: Callable[..., Awaitable[CompPipelineAtDB]], + create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], ) -> list[CompTaskAtDB]: - pipeline( + await create_pipeline( project_id=project.uuid, dag_adjacency_list=fake_workbench_adjacency, ) - comp_tasks = tasks(user, project) + comp_tasks = await create_tasks(user, project) assert len(comp_tasks) > 0 return comp_tasks From 28cb4b9bf1f466cbcca04fddb20ed0c6ad184f8d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:35:34 +0100 Subject: [PATCH 100/127] use begin --- services/director-v2/tests/unit/_helpers.py | 1 + .../test_db_repositories_comp_runs.py | 45 ++++++++++++++++++- .../comp_scheduler/test_scheduler_dask.py | 1 + .../tests/unit/with_dbs/conftest.py | 21 +++++---- 4 files changed, 57 insertions(+), 11 deletions(-) diff --git a/services/director-v2/tests/unit/_helpers.py b/services/director-v2/tests/unit/_helpers.py index 1e02a79e5c3..45632d0454a 100644 --- a/services/director-v2/tests/unit/_helpers.py +++ b/services/director-v2/tests/unit/_helpers.py @@ -18,6 +18,7 @@ @dataclass class PublishedProject: + user: dict[str, Any] project: ProjectAtDB pipeline: CompPipelineAtDB tasks: list[CompTaskAtDB] diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index c90bcf5a568..3a08d5e6d88 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -1,7 +1,19 @@ +# pylint: disable=no-value-for-parameter +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=too-many-arguments +# pylint: disable=unused-argument +# pylint: disable=unused-variable + +from typing import Awaitable, Callable + import pytest +from _helpers import PublishedProject +from faker import Faker from models_library.projects import ProjectID from models_library.users import UserID from simcore_service_director_v2.core.errors import ComputationalRunNotFoundError +from simcore_service_director_v2.models.comp_runs import CompRunsAtDB from simcore_service_director_v2.modules.db.repositories.comp_runs import ( CompRunsRepository, ) @@ -14,9 +26,38 @@ ] -async def test_get(aiopg_engine, user_id: UserID, project_id: ProjectID): +@pytest.fixture +def fake_user_id(faker: Faker) -> UserID: + return faker.pyint(min_value=1) + + +@pytest.fixture +def fake_project_id(faker: Faker) -> ProjectID: + return ProjectID(f"{faker.uuid4(cast_to=None)}") + + +async def test_get( + aiopg_engine, + fake_user_id: UserID, + fake_project_id: ProjectID, + publish_project: Callable[[], Awaitable[PublishedProject]], + create_comp_run: Callable[..., Awaitable[CompRunsAtDB]], +): + with pytest.raises(ComputationalRunNotFoundError): + await CompRunsRepository(aiopg_engine).get(fake_user_id, fake_project_id) + + published_project = await publish_project() + assert published_project.project.prj_owner + # there is still no comp run created with pytest.raises(ComputationalRunNotFoundError): - await CompRunsRepository(aiopg_engine).get(user_id, project_id) + await CompRunsRepository(aiopg_engine).get( + published_project.project.prj_owner, published_project.project.uuid + ) + + comp_run = await create_comp_run(published_project.user, published_project.project) + await CompRunsRepository(aiopg_engine).get( + published_project.project.prj_owner, published_project.project.uuid + ) async def test_list(): diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py index f453da7ee28..f6a041b934e 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_scheduler_dask.py @@ -1046,6 +1046,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta assert messages[0].node_id == exp_started_task.node_id return RunningProject( + published_project.user, published_project.project, published_project.pipeline, tasks_in_db, diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index 76e900c5131..3d9c00b892a 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -67,7 +67,7 @@ async def _(**pipeline_kwargs) -> CompPipelineAtDB: yield _ # cleanup - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: await conn.execute( comp_pipeline.delete().where( comp_pipeline.c.project_id.in_(created_pipeline_ids) @@ -123,7 +123,7 @@ async def _( ), "node_class": to_node_class(node_data.key), "internal_id": internal_id + 1, - "submit": datetime.datetime.now(tz=datetime.UTC), + "submit": datetime.datetime.now(), "job_id": generate_dask_job_id( service_key=node_data.key, service_version=node_data.version, @@ -133,7 +133,7 @@ async def _( ), } task_config.update(**overrides_kwargs) - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: result = await conn.execute( comp_tasks.insert() .values(**task_config) @@ -147,7 +147,7 @@ async def _( yield _ # cleanup - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: await conn.execute( comp_tasks.delete().where(comp_tasks.c.task_id.in_(created_task_ids)) ) @@ -197,14 +197,14 @@ async def _( ) -> CompRunsAtDB: run_config = { "project_uuid": f"{project.uuid}", - "user_id": f"{user['id']}", + "user_id": user["id"], "iteration": 1, "result": StateType.NOT_STARTED, "metadata": run_metadata, "use_on_demand_clusters": False, } run_config.update(**run_kwargs) - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: result = await conn.execute( comp_runs.insert() .values(**jsonable_encoder(run_config)) @@ -217,7 +217,7 @@ async def _( yield _ # cleanup - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: await conn.execute( comp_runs.delete().where(comp_runs.c.run_id.in_(created_run_ids)) ) @@ -242,7 +242,7 @@ async def _(user: dict[str, Any], **cluster_kwargs) -> Cluster: new_cluster = Cluster.model_validate(cluster_config) assert new_cluster - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: # insert basic cluster created_cluster = ( await conn.execute( @@ -298,7 +298,7 @@ async def _(user: dict[str, Any], **cluster_kwargs) -> Cluster: yield _ # cleanup - async with sqlalchemy_async_engine.connect() as conn: + async with sqlalchemy_async_engine.begin() as conn: await conn.execute( clusters.delete().where(clusters.c.id.in_(created_cluster_ids)) ) @@ -318,6 +318,7 @@ async def publish_project( async def _() -> PublishedProject: created_project = await project(user, workbench=fake_workbench_without_outputs) return PublishedProject( + user=user, project=created_project, pipeline=await create_pipeline( project_id=f"{created_project.uuid}", @@ -352,6 +353,7 @@ async def running_project( created_project = await project(user, workbench=fake_workbench_without_outputs) now_time = arrow.utcnow().datetime return RunningProject( + user=user, project=created_project, pipeline=await create_pipeline( project_id=f"{created_project.uuid}", @@ -388,6 +390,7 @@ async def running_project_mark_for_cancellation( created_project = await project(user, workbench=fake_workbench_without_outputs) now_time = arrow.utcnow().datetime return RunningProject( + user=user, project=created_project, pipeline=await create_pipeline( project_id=f"{created_project.uuid}", From 2357932c5350f2b9e264929f61ec110517ec2f37 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:57:30 +0100 Subject: [PATCH 101/127] repository almost tested --- .../core/errors.py | 4 + .../modules/db/repositories/comp_runs.py | 19 +++- .../test_db_repositories_comp_runs.py | 90 ++++++++++++++++--- 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/errors.py b/services/director-v2/src/simcore_service_director_v2/core/errors.py index 18a5b674ed2..492e75bdeab 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/errors.py +++ b/services/director-v2/src/simcore_service_director_v2/core/errors.py @@ -35,6 +35,10 @@ class ConfigurationError(DirectorError): msg_template: str = "Application misconfiguration: {msg}" +class UserNotFoundError(DirectorError): + msg_template: str = "user {user_id} not found" + + class ProjectNotFoundError(DirectorError): msg_template: str = "project {project_id} not found" diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 9bd922ae29a..06def040890 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -16,7 +16,13 @@ from sqlalchemy.sql.elements import literal_column from sqlalchemy.sql.expression import desc -from ....core.errors import ClusterNotFoundError, ComputationalRunNotFoundError +from ....core.errors import ( + ClusterNotFoundError, + ComputationalRunNotFoundError, + DirectorError, + ProjectNotFoundError, + UserNotFoundError, +) from ....models.comp_runs import CompRunsAtDB, RunMetadataDict from ....utils.db import RUNNING_STATE_TO_DB from ..tables import comp_runs @@ -136,7 +142,16 @@ async def create( row = await result.first() return CompRunsAtDB.model_validate(row) except ForeignKeyViolation as exc: - raise ClusterNotFoundError(cluster_id=cluster_id) from exc + message = exc.args[0] + match message: + case s if "users" in s and "user_id" in s: + raise UserNotFoundError(user_id=user_id) from exc + case s if "projects" in s and "project_uuid" in s: + raise ProjectNotFoundError(project_id=project_id) from exc + case s if "clusters" in s and "cluster_id" in s: + raise ClusterNotFoundError(cluster_id=cluster_id) from exc + case _: + raise DirectorError from exc async def update( self, user_id: UserID, project_id: ProjectID, iteration: PositiveInt, **values diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index 3a08d5e6d88..2de38c72104 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -5,15 +5,21 @@ # pylint: disable=unused-argument # pylint: disable=unused-variable -from typing import Awaitable, Callable +from collections.abc import Awaitable, Callable import pytest from _helpers import PublishedProject from faker import Faker +from models_library.clusters import DEFAULT_CLUSTER_ID, Cluster from models_library.projects import ProjectID from models_library.users import UserID -from simcore_service_director_v2.core.errors import ComputationalRunNotFoundError -from simcore_service_director_v2.models.comp_runs import CompRunsAtDB +from simcore_service_director_v2.core.errors import ( + ClusterNotFoundError, + ComputationalRunNotFoundError, + ProjectNotFoundError, + UserNotFoundError, +) +from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.modules.db.repositories.comp_runs import ( CompRunsRepository, ) @@ -54,39 +60,95 @@ async def test_get( published_project.project.prj_owner, published_project.project.uuid ) - comp_run = await create_comp_run(published_project.user, published_project.project) + await create_comp_run(published_project.user, published_project.project) await CompRunsRepository(aiopg_engine).get( published_project.project.prj_owner, published_project.project.uuid ) -async def test_list(): - ... +async def test_list( + aiopg_engine, +): + assert await CompRunsRepository(aiopg_engine).list() == [] -async def test_create(): - ... +async def test_create( + aiopg_engine, + fake_user_id: UserID, + fake_project_id: ProjectID, + run_metadata: RunMetadataDict, + faker: Faker, + publish_project: Callable[[], Awaitable[PublishedProject]], + create_cluster: Callable[..., Awaitable[Cluster]], +): + with pytest.raises(ProjectNotFoundError): + await CompRunsRepository(aiopg_engine).create( + user_id=fake_user_id, + project_id=fake_project_id, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + published_project = await publish_project() + with pytest.raises(UserNotFoundError): + await CompRunsRepository(aiopg_engine).create( + user_id=fake_user_id, + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + + await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + + with pytest.raises(ClusterNotFoundError): + await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=faker.pyint(min_value=1), + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + cluster = await create_cluster(published_project.user) + await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=cluster.id, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) -async def test_update(): +async def test_update(aiopg_engine): ... -async def test_delete(): +async def test_delete(aiopg_engine): ... -async def test_set_run_result(): +async def test_set_run_result(aiopg_engine): ... -async def test_mark_for_cancellation(): +async def test_mark_for_cancellation(aiopg_engine): ... -async def test_mark_for_scheduling(): +async def test_mark_for_scheduling(aiopg_engine): ... -async def test_mark_scheduling_done(): +async def test_mark_scheduling_done(aiopg_engine): ... From 0963624de0e532ad580a7f357cb11afccd044704 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 23:06:25 +0100 Subject: [PATCH 102/127] repository almost tested --- .../test_db_repositories_comp_runs.py | 69 ++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index 2de38c72104..e335fa7f689 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -5,6 +5,7 @@ # pylint: disable=unused-argument # pylint: disable=unused-variable +import datetime from collections.abc import Awaitable, Callable import pytest @@ -101,7 +102,22 @@ async def test_create( use_on_demand_clusters=faker.pybool(), ) - await CompRunsRepository(aiopg_engine).create( + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got + + # creating a second one auto increment the iteration + created = await CompRunsRepository(aiopg_engine).create( user_id=published_project.user["id"], project_id=published_project.project.uuid, cluster_id=DEFAULT_CLUSTER_ID, @@ -109,6 +125,15 @@ async def test_create( metadata=run_metadata, use_on_demand_clusters=faker.pybool(), ) + assert created != got + assert created.iteration == got.iteration + 1 + + # getting without specifying the iteration returns the latest + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got with pytest.raises(ClusterNotFoundError): await CompRunsRepository(aiopg_engine).create( @@ -130,8 +155,46 @@ async def test_create( ) -async def test_update(aiopg_engine): - ... +async def test_update( + aiopg_engine, + fake_user_id: UserID, + fake_project_id: ProjectID, + run_metadata: RunMetadataDict, + faker: Faker, + publish_project: Callable[[], Awaitable[PublishedProject]], +): + # this updates nothing but also does not complain + updated = await CompRunsRepository(aiopg_engine).update( + fake_user_id, fake_project_id, faker.pyint(min_value=1) + ) + assert updated is None + # now let's create a valid one + published_project = await publish_project() + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got + + updated = await CompRunsRepository(aiopg_engine).update( + created.user_id, + created.project_uuid, + created.iteration, + scheduled=datetime.datetime.now(datetime.UTC), + ) + assert updated is not None + assert created != updated + assert created.scheduled is None + assert updated.scheduled is not None async def test_delete(aiopg_engine): From 743f82bcdbbc47c37c9283399cb81cfd806e6be0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 26 Nov 2024 23:17:35 +0100 Subject: [PATCH 103/127] repository almost tested --- .../test_db_repositories_comp_runs.py | 149 ++++++++++++++++-- 1 file changed, 139 insertions(+), 10 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index e335fa7f689..dce8d654e8b 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -13,6 +13,7 @@ from faker import Faker from models_library.clusters import DEFAULT_CLUSTER_ID, Cluster from models_library.projects import ProjectID +from models_library.projects_state import RunningState from models_library.users import UserID from simcore_service_director_v2.core.errors import ( ClusterNotFoundError, @@ -197,21 +198,149 @@ async def test_update( assert updated.scheduled is not None -async def test_delete(aiopg_engine): - ... +async def test_set_run_result( + aiopg_engine, + run_metadata: RunMetadataDict, + faker: Faker, + publish_project: Callable[[], Awaitable[PublishedProject]], +): + published_project = await publish_project() + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got + assert created.result is not RunningState.PENDING + assert created.ended is None + + updated = await CompRunsRepository(aiopg_engine).set_run_result( + user_id=created.user_id, + project_id=created.project_uuid, + iteration=created.iteration, + result_state=RunningState.PENDING, + final_state=False, + ) + assert updated + assert updated != created + assert updated.result is RunningState.PENDING + assert updated.ended is None + + final_updated = await CompRunsRepository(aiopg_engine).set_run_result( + user_id=created.user_id, + project_id=created.project_uuid, + iteration=created.iteration, + result_state=RunningState.ABORTED, + final_state=True, + ) + assert final_updated + assert final_updated != updated + assert final_updated.result is RunningState.ABORTED + assert final_updated.ended is not None -async def test_set_run_result(aiopg_engine): - ... +async def test_mark_for_cancellation( + aiopg_engine, + run_metadata: RunMetadataDict, + faker: Faker, + publish_project: Callable[[], Awaitable[PublishedProject]], +): + published_project = await publish_project() + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got + assert created.cancelled is None + updated = await CompRunsRepository(aiopg_engine).mark_for_cancellation( + user_id=created.user_id, + project_id=created.project_uuid, + iteration=created.iteration, + ) + assert updated + assert updated != created + assert updated.cancelled is not None -async def test_mark_for_cancellation(aiopg_engine): - ... +async def test_mark_for_scheduling( + aiopg_engine, + run_metadata: RunMetadataDict, + faker: Faker, + publish_project: Callable[[], Awaitable[PublishedProject]], +): + published_project = await publish_project() + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got + assert created.scheduled is None + assert created.processed is None + + updated = await CompRunsRepository(aiopg_engine).mark_for_scheduling( + user_id=created.user_id, + project_id=created.project_uuid, + iteration=created.iteration, + ) + assert updated + assert updated != created + assert updated.scheduled is not None + assert updated.processed is None -async def test_mark_for_scheduling(aiopg_engine): - ... +async def test_mark_scheduling_done( + aiopg_engine, + run_metadata: RunMetadataDict, + faker: Faker, + publish_project: Callable[[], Awaitable[PublishedProject]], +): + published_project = await publish_project() + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + got = await CompRunsRepository(aiopg_engine).get( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + ) + assert created == got + assert created.scheduled is None + assert created.processed is None -async def test_mark_scheduling_done(aiopg_engine): - ... + updated = await CompRunsRepository(aiopg_engine).mark_scheduling_done( + user_id=created.user_id, + project_id=created.project_uuid, + iteration=created.iteration, + ) + assert updated + assert updated != created + assert updated.scheduled is None + assert updated.processed is not None From 11eede83f179dcd83a26bdde993efb9532b26773 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:35:22 +0100 Subject: [PATCH 104/127] testing listing --- .../test_db_repositories_comp_runs.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index dce8d654e8b..b8d9efc3050 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -5,6 +5,7 @@ # pylint: disable=unused-argument # pylint: disable=unused-variable +import asyncio import datetime from collections.abc import Awaitable, Callable @@ -70,9 +71,42 @@ async def test_get( async def test_list( aiopg_engine, + publish_project: Callable[[], Awaitable[PublishedProject]], + run_metadata: RunMetadataDict, + faker: Faker, ): assert await CompRunsRepository(aiopg_engine).list() == [] + published_project = await publish_project() + assert await CompRunsRepository(aiopg_engine).list() == [] + + created = await CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=None, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + assert await CompRunsRepository(aiopg_engine).list() == [created] + + created = [created] + await asyncio.gather( + *( + CompRunsRepository(aiopg_engine).create( + user_id=published_project.user["id"], + project_id=published_project.project.uuid, + cluster_id=DEFAULT_CLUSTER_ID, + iteration=created.iteration + n + 1, + metadata=run_metadata, + use_on_demand_clusters=faker.pybool(), + ) + for n in range(50) + ) + ) + assert sorted( + await CompRunsRepository(aiopg_engine).list(), key=lambda x: x.iteration + ) == sorted(created, key=lambda x: x.iteration) + async def test_create( aiopg_engine, From 4028fbb6bb1cd663611397f5b38a7856b663a48e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:50:10 +0100 Subject: [PATCH 105/127] testing listing with filter --- .../test_db_repositories_comp_runs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index b8d9efc3050..c8dc6fd927b 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -107,6 +107,24 @@ async def test_list( await CompRunsRepository(aiopg_engine).list(), key=lambda x: x.iteration ) == sorted(created, key=lambda x: x.iteration) + # test with filter of state + any_state_but_published = { + s for s in RunningState if s is not RunningState.PUBLISHED + } + assert ( + await CompRunsRepository(aiopg_engine).list( + filter_by_state=any_state_but_published + ) + == [] + ) + + assert sorted( + await CompRunsRepository(aiopg_engine).list( + filter_by_state={RunningState.PUBLISHED} + ), + key=lambda x: x.iteration, + ) == sorted(created, key=lambda x: x.iteration) + async def test_create( aiopg_engine, From a9c7cf85b376fa381f5b7f5176af877a23b0fbe4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 22:27:47 +0100 Subject: [PATCH 106/127] listing with processed since works --- .../modules/comp_scheduler/_scheduler_base.py | 2 +- .../modules/db/repositories/comp_runs.py | 9 +- .../test_db_repositories_comp_runs.py | 83 ++++++++++++++++++- .../with_dbs/comp_scheduler/test_manager.py | 2 +- 4 files changed, 88 insertions(+), 8 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py index c7e732579d9..a16821d0fba 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_base.py @@ -233,7 +233,7 @@ async def _set_schedule_done( project_id: ProjectID, iteration: Iteration, ) -> None: - await CompRunsRepository.instance(self.db_engine).mark_scheduling_done( + await CompRunsRepository.instance(self.db_engine).mark_as_processed( user_id=user_id, project_id=project_id, iteration=iteration, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 06def040890..d879cfc49f5 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -64,7 +64,7 @@ async def list( *, filter_by_state: set[RunningState] | None = None, never_scheduled: bool = False, - processed_before: datetime.datetime | None = None, + processed_since: datetime.timedelta | None = None, scheduled_after: datetime.timedelta | None = None, ) -> list[CompRunsAtDB]: conditions = [] @@ -85,8 +85,9 @@ async def list( scheduled_cutoff = arrow.utcnow().datetime - scheduled_after scheduling_or_conditions.append(comp_runs.c.scheduled <= scheduled_cutoff) - if processed_before is not None: - scheduling_or_conditions.append(comp_runs.c.processed <= processed_before) + if processed_since is not None: + processed_cutoff = arrow.utcnow().datetime - processed_since + scheduling_or_conditions.append(comp_runs.c.processed <= processed_cutoff) if scheduling_or_conditions: conditions.append(sa.or_(*scheduling_or_conditions)) @@ -210,7 +211,7 @@ async def mark_for_scheduling( processed=None, ) - async def mark_scheduling_done( + async def mark_as_processed( self, *, user_id: UserID, project_id: ProjectID, iteration: PositiveInt ) -> CompRunsAtDB | None: return await self.update( diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index c8dc6fd927b..8aba0e06183 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -7,8 +7,11 @@ import asyncio import datetime +import random from collections.abc import Awaitable, Callable +from typing import cast +import arrow import pytest from _helpers import PublishedProject from faker import Faker @@ -23,6 +26,9 @@ UserNotFoundError, ) from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict +from simcore_service_director_v2.modules.comp_scheduler._constants import ( + SCHEDULER_INTERVAL, +) from simcore_service_director_v2.modules.db.repositories.comp_runs import ( CompRunsRepository, ) @@ -117,7 +123,6 @@ async def test_list( ) == [] ) - assert sorted( await CompRunsRepository(aiopg_engine).list( filter_by_state={RunningState.PUBLISHED} @@ -125,6 +130,80 @@ async def test_list( key=lambda x: x.iteration, ) == sorted(created, key=lambda x: x.iteration) + # test with never scheduled filter, let's create a bunch of scheduled entries, + assert sorted( + await CompRunsRepository(aiopg_engine).list(never_scheduled=True), + key=lambda x: x.iteration, + ) == sorted(created, key=lambda x: x.iteration) + comp_runs_marked_for_scheduling = random.sample(created, k=25) + await asyncio.gather( + *( + CompRunsRepository(aiopg_engine).mark_for_scheduling( + user_id=comp_run.user_id, + project_id=comp_run.project_uuid, + iteration=comp_run.iteration, + ) + for comp_run in comp_runs_marked_for_scheduling + ) + ) + # filter them away + created = [r for r in created if r not in comp_runs_marked_for_scheduling] + assert sorted( + await CompRunsRepository(aiopg_engine).list(never_scheduled=True), + key=lambda x: x.iteration, + ) == sorted(created, key=lambda x: x.iteration) + + # now mark a few of them as processed + comp_runs_marked_as_processed = random.sample(comp_runs_marked_for_scheduling, k=11) + await asyncio.gather( + *( + CompRunsRepository(aiopg_engine).mark_as_processed( + user_id=comp_run.user_id, + project_id=comp_run.project_uuid, + iteration=comp_run.iteration, + ) + for comp_run in comp_runs_marked_as_processed + ) + ) + # filter them away + comp_runs_marked_for_scheduling = [ + r + for r in comp_runs_marked_for_scheduling + if r not in comp_runs_marked_as_processed + ] + # since they were just marked as processed now, we will get nothing + assert ( + sorted( + await CompRunsRepository(aiopg_engine).list( + never_scheduled=False, processed_since=SCHEDULER_INTERVAL + ), + key=lambda x: x.iteration, + ) + == [] + ) + # now we artificially change the processed time and set it 2x the scheduler interval + fake_processed_time = arrow.utcnow().datetime - 2 * SCHEDULER_INTERVAL + comp_runs_marked_as_processed = await asyncio.gather( + *( + CompRunsRepository(aiopg_engine).update( + user_id=comp_run.user_id, + project_id=comp_run.project_uuid, + iteration=comp_run.iteration, + processed=fake_processed_time, + ) + for comp_run in comp_runs_marked_as_processed + ) + ) + # now we should get them + assert sorted( + await CompRunsRepository(aiopg_engine).list( + never_scheduled=False, processed_since=SCHEDULER_INTERVAL + ), + key=lambda x: x.iteration, + ) == sorted( + comp_runs_marked_as_processed, key=lambda x: cast(CompRunsAtDB, x).iteration + ) + async def test_create( aiopg_engine, @@ -387,7 +466,7 @@ async def test_mark_scheduling_done( assert created.scheduled is None assert created.processed is None - updated = await CompRunsRepository(aiopg_engine).mark_scheduling_done( + updated = await CompRunsRepository(aiopg_engine).mark_as_processed( user_id=created.user_id, project_id=created.project_uuid, iteration=created.iteration, diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index d3ae2726157..42f1159e2d0 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -191,7 +191,7 @@ async def test_schedule_all_pipelines( assert comp_run.modified == start_modified_time # once the worker is done, the schedule time is set back to None - await CompRunsRepository(aiopg_engine).mark_scheduling_done( + await CompRunsRepository(aiopg_engine).mark_as_processed( user_id=comp_run.user_id, project_id=comp_run.project_uuid, iteration=comp_run.iteration, From 76b494a60b4452318fbcf0fe2fcde3c3065a9fff Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:21:23 +0100 Subject: [PATCH 107/127] handling of processed and scheduled --- .../modules/comp_scheduler/_manager.py | 5 +- .../modules/db/repositories/comp_runs.py | 28 ++++++-- .../test_db_repositories_comp_runs.py | 72 +++++++++++++++---- .../with_dbs/comp_scheduler/test_manager.py | 8 ++- .../tests/unit/with_dbs/conftest.py | 2 +- 5 files changed, 92 insertions(+), 23 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index f134da556f7..0d2cd0af74a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -1,7 +1,6 @@ import logging from typing import Final -import arrow import networkx as nx from aiopg.sa import Engine from fastapi import FastAPI @@ -126,13 +125,13 @@ async def schedule_all_pipelines(app: FastAPI) -> None: runs_to_schedule = await CompRunsRepository.instance(db_engine).list( filter_by_state=SCHEDULED_STATES, never_scheduled=True, - processed_before=arrow.utcnow().datetime - SCHEDULER_INTERVAL, + processed_since=SCHEDULER_INTERVAL, ) possibly_lost_scheduled_pipelines = await CompRunsRepository.instance( db_engine ).list( filter_by_state=SCHEDULED_STATES, - scheduled_after=SCHEDULER_INTERVAL * _LOST_TASKS_FACTOR, + scheduled_since=SCHEDULER_INTERVAL * _LOST_TASKS_FACTOR, ) if possibly_lost_scheduled_pipelines: _logger.error( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index d879cfc49f5..f5aa581c565 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -65,7 +65,7 @@ async def list( filter_by_state: set[RunningState] | None = None, never_scheduled: bool = False, processed_since: datetime.timedelta | None = None, - scheduled_after: datetime.timedelta | None = None, + scheduled_since: datetime.timedelta | None = None, ) -> list[CompRunsAtDB]: conditions = [] if filter_by_state: @@ -81,13 +81,31 @@ async def list( scheduling_or_conditions = [] if never_scheduled: scheduling_or_conditions.append(comp_runs.c.scheduled.is_(None)) - if scheduled_after is not None: - scheduled_cutoff = arrow.utcnow().datetime - scheduled_after - scheduling_or_conditions.append(comp_runs.c.scheduled <= scheduled_cutoff) + if scheduled_since is not None: + # a scheduled run is a run that has been scheduled but not processed yet + # e.g. the processing timepoint is either null or before the scheduling timepoint + scheduled_cutoff = arrow.utcnow().datetime - scheduled_since + scheduling_filter = ( + comp_runs.c.scheduled.is_not(None) + & ( + comp_runs.c.processed.is_(None) + | (comp_runs.c.scheduled > comp_runs.c.processed) + ) + & (comp_runs.c.scheduled <= scheduled_cutoff) + ) + scheduling_or_conditions.append(scheduling_filter) if processed_since is not None: + # a processed run is a run that has been scheduled and processed + # and the processing timepoint is after the scheduling timepoint processed_cutoff = arrow.utcnow().datetime - processed_since - scheduling_or_conditions.append(comp_runs.c.processed <= processed_cutoff) + processed_filter = ( + comp_runs.c.processed.is_not(None) + & (comp_runs.c.processed > comp_runs.c.scheduled) + & (comp_runs.c.processed <= processed_cutoff) + ) + + scheduling_or_conditions.append(processed_filter) if scheduling_or_conditions: conditions.append(sa.or_(*scheduling_or_conditions)) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py index 8aba0e06183..ba903d1b069 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_db_repositories_comp_runs.py @@ -181,17 +181,25 @@ async def test_list( ) == [] ) - # now we artificially change the processed time and set it 2x the scheduler interval - fake_processed_time = arrow.utcnow().datetime - 2 * SCHEDULER_INTERVAL - comp_runs_marked_as_processed = await asyncio.gather( - *( - CompRunsRepository(aiopg_engine).update( - user_id=comp_run.user_id, - project_id=comp_run.project_uuid, - iteration=comp_run.iteration, - processed=fake_processed_time, - ) - for comp_run in comp_runs_marked_as_processed + # now we artificially change the scheduled/processed time and set it 2x the scheduler interval + # these are correctly processed ones, so we should get them back + fake_scheduled_time = arrow.utcnow().datetime - 2 * SCHEDULER_INTERVAL + fake_processed_time = fake_scheduled_time + 0.5 * SCHEDULER_INTERVAL + comp_runs_marked_as_processed = ( + cast( # NOTE: the cast here is ok since gather will raise if there is an error + list[CompRunsAtDB], + await asyncio.gather( + *( + CompRunsRepository(aiopg_engine).update( + user_id=comp_run.user_id, + project_id=comp_run.project_uuid, + iteration=comp_run.iteration, + scheduled=fake_scheduled_time, + processed=fake_processed_time, + ) + for comp_run in comp_runs_marked_as_processed + ) + ), ) ) # now we should get them @@ -200,8 +208,48 @@ async def test_list( never_scheduled=False, processed_since=SCHEDULER_INTERVAL ), key=lambda x: x.iteration, + ) == sorted(comp_runs_marked_as_processed, key=lambda x: x.iteration) + + # now some of them were never processed (e.g. processed time is either null or before schedule time) + comp_runs_waiting_for_processing_or_never_processed = random.sample( + comp_runs_marked_as_processed, k=6 + ) + comp_runs_marked_as_processed = [ + r + for r in comp_runs_marked_as_processed + if r not in comp_runs_waiting_for_processing_or_never_processed + ] + # now we artificially change the processed time to be before the scheduled time + comp_runs_waiting_for_processing_or_never_processed = cast( + list[CompRunsAtDB], + await asyncio.gather( + *( + CompRunsRepository(aiopg_engine).update( + user_id=comp_run.user_id, + project_id=comp_run.project_uuid, + iteration=comp_run.iteration, + scheduled=fake_processed_time, # NOTE: we invert here the timings + processed=random.choice([fake_scheduled_time, None]), # noqa: S311 + ) + for comp_run in comp_runs_waiting_for_processing_or_never_processed + ) + ), + ) + # so the processed ones shall remain + assert sorted( + await CompRunsRepository(aiopg_engine).list( + never_scheduled=False, processed_since=SCHEDULER_INTERVAL + ), + key=lambda x: x.iteration, + ) == sorted(comp_runs_marked_as_processed, key=lambda x: x.iteration) + # the ones waiting for scheduling now + assert sorted( + await CompRunsRepository(aiopg_engine).list( + never_scheduled=False, scheduled_since=SCHEDULER_INTERVAL + ), + key=lambda x: x.iteration, ) == sorted( - comp_runs_marked_as_processed, key=lambda x: cast(CompRunsAtDB, x).iteration + comp_runs_waiting_for_processing_or_never_processed, key=lambda x: x.iteration ) diff --git a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py index 42f1159e2d0..ac5bbbcc942 100644 --- a/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py +++ b/services/director-v2/tests/unit/with_dbs/comp_scheduler/test_manager.py @@ -178,6 +178,7 @@ async def test_schedule_all_pipelines( assert comp_run.metadata == run_metadata assert comp_run.result is RunningState.PUBLISHED assert comp_run.scheduled is not None + assert comp_run.processed is None start_schedule_time = comp_run.scheduled start_modified_time = comp_run.modified @@ -186,15 +187,18 @@ async def test_schedule_all_pipelines( scheduler_rabbit_client_parser.assert_not_called() comp_runs = await assert_comp_runs(sqlalchemy_async_engine, expected_total=1) comp_run = comp_runs[0] + assert comp_run.scheduled assert comp_run.scheduled == start_schedule_time, "scheduled time changed!" assert comp_run.cancelled is None assert comp_run.modified == start_modified_time - # once the worker is done, the schedule time is set back to None - await CompRunsRepository(aiopg_engine).mark_as_processed( + # to simulate that the worker did its job we will set times in the past + await CompRunsRepository(aiopg_engine).update( user_id=comp_run.user_id, project_id=comp_run.project_uuid, iteration=comp_run.iteration, + scheduled=comp_run.scheduled - 1.5 * SCHEDULER_INTERVAL, + processed=comp_run.scheduled - 1.1 * SCHEDULER_INTERVAL, ) # now we schedule a pipeline again, but we wait for the scheduler interval to pass diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index 3d9c00b892a..694d76bf4ab 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -123,7 +123,7 @@ async def _( ), "node_class": to_node_class(node_data.key), "internal_id": internal_id + 1, - "submit": datetime.datetime.now(), + "submit": datetime.datetime.now(datetime.UTC), "job_id": generate_dask_job_id( service_key=node_data.key, service_version=node_data.version, From f37a3b5aeb4c5661a4abe4d911a9d45a13d69745 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:24:25 +0100 Subject: [PATCH 108/127] doc --- .../modules/db/repositories/comp_runs.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index f5aa581c565..68798d89a29 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -67,6 +67,17 @@ async def list( processed_since: datetime.timedelta | None = None, scheduled_since: datetime.timedelta | None = None, ) -> list[CompRunsAtDB]: + """lists the computational runs: + filter_by_state AND (never_scheduled OR processed_since OR scheduled_since) + + + Keyword Arguments: + filter_by_state -- will return only the runs with result in filter_by_state (default: {None}) + never_scheduled -- will return the runs which were never scheduled (default: {False}) + processed_since -- will return the runs which were processed since X, which are not re-scheduled since then (default: {None}) + scheduled_since -- will return the runs which were scheduled since X, which are not processed since then (default: {None}) + """ + conditions = [] if filter_by_state: conditions.append( From 1580044aaa1ab947f95613518c6d9ff7e21caf81 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:36:27 +0100 Subject: [PATCH 109/127] convert comp_tasks timestamps to tz aware --- .../7ad64e963e0f_add_timezone_comp_tasks.py | 68 +++++++++++++++++++ .../models/comp_tasks.py | 13 +++- 2 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 packages/postgres-database/src/simcore_postgres_database/migration/versions/7ad64e963e0f_add_timezone_comp_tasks.py diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/7ad64e963e0f_add_timezone_comp_tasks.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/7ad64e963e0f_add_timezone_comp_tasks.py new file mode 100644 index 00000000000..fe56f4c548f --- /dev/null +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/7ad64e963e0f_add_timezone_comp_tasks.py @@ -0,0 +1,68 @@ +"""add_timezone_comp_tasks + +Revision ID: 7ad64e963e0f +Revises: b7f23f6d8aa2 +Create Date: 2024-11-27 22:28:51.898433+00:00 + +""" +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "7ad64e963e0f" +down_revision = "b7f23f6d8aa2" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column( + "comp_tasks", + "submit", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=True, + ) + op.alter_column( + "comp_tasks", + "start", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=True, + ) + op.alter_column( + "comp_tasks", + "end", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=True, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column( + "comp_tasks", + "end", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=True, + ) + op.alter_column( + "comp_tasks", + "start", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=True, + ) + op.alter_column( + "comp_tasks", + "submit", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=True, + ) + # ### end Alembic commands ### diff --git a/packages/postgres-database/src/simcore_postgres_database/models/comp_tasks.py b/packages/postgres-database/src/simcore_postgres_database/models/comp_tasks.py index 60bfc3f95c3..af5dc451cc3 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/comp_tasks.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/comp_tasks.py @@ -1,6 +1,7 @@ """ Computational Tasks Table """ + import enum import sqlalchemy as sa @@ -77,9 +78,15 @@ class NodeClass(enum.Enum): doc="current progress of the task if available", ), # utc timestamps for submission/start/end - sa.Column("submit", sa.DateTime, doc="UTC timestamp for task submission"), - sa.Column("start", sa.DateTime, doc="UTC timestamp when task started"), - sa.Column("end", sa.DateTime, doc="UTC timestamp for task completion"), + sa.Column( + "submit", sa.DateTime(timezone=True), doc="UTC timestamp for task submission" + ), + sa.Column( + "start", sa.DateTime(timezone=True), doc="UTC timestamp when task started" + ), + sa.Column( + "end", sa.DateTime(timezone=True), doc="UTC timestamp for task completion" + ), sa.Column( "last_heartbeat", sa.DateTime(timezone=True), From 7a586f45888fce8fb99dc596fcc937e40ad4915e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:52:44 +0100 Subject: [PATCH 110/127] convert timestamps to utc aware --- .../e05bdc5b3c7b_add_timezone_comp_runs.py | 86 +++++++++++++++++++ .../models/comp_runs.py | 24 ++---- 2 files changed, 91 insertions(+), 19 deletions(-) create mode 100644 packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py new file mode 100644 index 00000000000..6880b6c3ca9 --- /dev/null +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py @@ -0,0 +1,86 @@ +"""add_timezone_comp_runs + +Revision ID: e05bdc5b3c7b +Revises: 7ad64e963e0f +Create Date: 2024-11-27 22:51:21.112336+00:00 + +""" +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "e05bdc5b3c7b" +down_revision = "7ad64e963e0f" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column( + "comp_runs", + "created", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=False, + existing_server_default=sa.text("now()"), + ) + op.alter_column( + "comp_runs", + "modified", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=False, + existing_server_default=sa.text("now()"), + ) + op.alter_column( + "comp_runs", + "started", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=True, + ) + op.alter_column( + "comp_runs", + "ended", + existing_type=postgresql.TIMESTAMP(), + type_=sa.DateTime(timezone=True), + existing_nullable=True, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column( + "comp_runs", + "ended", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=True, + ) + op.alter_column( + "comp_runs", + "started", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=True, + ) + op.alter_column( + "comp_runs", + "modified", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=False, + existing_server_default=sa.text("now()"), + ) + op.alter_column( + "comp_runs", + "created", + existing_type=sa.DateTime(timezone=True), + type_=postgresql.TIMESTAMP(), + existing_nullable=False, + existing_server_default=sa.text("now()"), + ) + # ### end Alembic commands ### diff --git a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py index cb657c20801..d92227c07e2 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/comp_runs.py @@ -4,9 +4,8 @@ import sqlalchemy as sa from sqlalchemy.dialects.postgresql import JSONB -from sqlalchemy.sql import func -from ._common import RefActions +from ._common import RefActions, column_created_datetime, column_modified_datetime from .base import metadata from .comp_pipeline import StateType @@ -73,31 +72,18 @@ doc="The result of the run entry", ), # dag node id and class - sa.Column( - "created", - sa.DateTime(), - nullable=False, - server_default=func.now(), - doc="When the run entry was created", - ), - sa.Column( - "modified", - sa.DateTime(), - nullable=False, - server_default=func.now(), - onupdate=func.now(), # this will auto-update on modification - doc="When the run entry was last modified", - ), + column_created_datetime(timezone=True), + column_modified_datetime(timezone=True), # utc timestamps for submission/start/end sa.Column( "started", - sa.DateTime, + sa.DateTime(timezone=True), nullable=True, doc="When the run was started", ), sa.Column( "ended", - sa.DateTime, + sa.DateTime(timezone=True), nullable=True, doc="When the run was finished", ), From f4eecf99dbd9d2bb8f14d344f3abec6096f8b84a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:55:32 +0100 Subject: [PATCH 111/127] ensure metadata are jsonable encoded --- services/director-v2/tests/unit/with_dbs/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index 694d76bf4ab..ee8259f9f5b 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -200,14 +200,14 @@ async def _( "user_id": user["id"], "iteration": 1, "result": StateType.NOT_STARTED, - "metadata": run_metadata, + "metadata": jsonable_encoder(run_metadata), "use_on_demand_clusters": False, } run_config.update(**run_kwargs) async with sqlalchemy_async_engine.begin() as conn: result = await conn.execute( comp_runs.insert() - .values(**jsonable_encoder(run_config)) + .values(**run_config) .returning(sa.literal_column("*")) ) new_run = CompRunsAtDB.model_validate(result.first()) From f7898c27c734da8d2e085cc09ed9baeb96e124f4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 28 Nov 2024 09:11:06 +0100 Subject: [PATCH 112/127] refactor --- services/director-v2/tests/conftest.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/services/director-v2/tests/conftest.py b/services/director-v2/tests/conftest.py index fcc0db6dbf1..d8a11779bf3 100644 --- a/services/director-v2/tests/conftest.py +++ b/services/director-v2/tests/conftest.py @@ -196,17 +196,6 @@ def mock_env( ) -@pytest.fixture() -async def client(mock_env: EnvVarsDict) -> AsyncIterator[TestClient]: - settings = AppSettings.create_from_envs() - app = init_app(settings) - print("Application settings\n", settings.model_dump_json(indent=2)) - # NOTE: this way we ensure the events are run in the application - # since it starts the app on a test server - with TestClient(app, raise_server_exceptions=True) as test_client: - yield test_client - - @pytest.fixture() async def initialized_app(mock_env: EnvVarsDict) -> AsyncIterable[FastAPI]: settings = AppSettings.create_from_envs() @@ -216,6 +205,14 @@ async def initialized_app(mock_env: EnvVarsDict) -> AsyncIterable[FastAPI]: yield app +@pytest.fixture() +async def client(initialized_app: FastAPI) -> AsyncIterator[TestClient]: + # NOTE: this way we ensure the events are run in the application + # since it starts the app on a test server + with TestClient(initialized_app, raise_server_exceptions=True) as test_client: + yield test_client + + @pytest.fixture() async def async_client(initialized_app: FastAPI) -> AsyncIterable[httpx.AsyncClient]: async with httpx.AsyncClient( From adb865b44009c1aa127fa7a1880362bbd8813984 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 28 Nov 2024 18:33:18 +0100 Subject: [PATCH 113/127] linter --- .../modules/comp_scheduler/_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py index 7ec438bc589..28dca04dc53 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_models.py @@ -15,5 +15,5 @@ class SchedulePipelineRabbitMessage(RabbitMessageBase): project_id: ProjectID iteration: Iteration - def routing_key(self) -> str | None: + def routing_key(self) -> str | None: # pylint: disable=no-self-use # abstract return None From 5c54a68a1101bff9ee49bb86d470cf2fd8985bdb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 28 Nov 2024 19:17:58 +0100 Subject: [PATCH 114/127] mypy --- .../versions/e05bdc5b3c7b_add_timezone_comp_runs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py index 6880b6c3ca9..30f1af8867d 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py @@ -5,6 +5,7 @@ Create Date: 2024-11-27 22:51:21.112336+00:00 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql @@ -24,7 +25,7 @@ def upgrade(): existing_type=postgresql.TIMESTAMP(), type_=sa.DateTime(timezone=True), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default=sa.sql.func.now(), ) op.alter_column( "comp_runs", @@ -32,7 +33,7 @@ def upgrade(): existing_type=postgresql.TIMESTAMP(), type_=sa.DateTime(timezone=True), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default=sa.sql.func.now(), ) op.alter_column( "comp_runs", @@ -73,7 +74,7 @@ def downgrade(): existing_type=sa.DateTime(timezone=True), type_=postgresql.TIMESTAMP(), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default=sa.sql.func.now(), ) op.alter_column( "comp_runs", @@ -81,6 +82,6 @@ def downgrade(): existing_type=sa.DateTime(timezone=True), type_=postgresql.TIMESTAMP(), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default=sa.sql.func.now(), ) # ### end Alembic commands ### From 28d5b538359721912504d642e5c88f657265e67a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 28 Nov 2024 19:26:12 +0100 Subject: [PATCH 115/127] revert --- .../versions/e05bdc5b3c7b_add_timezone_comp_runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py index 30f1af8867d..85e67092ceb 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py @@ -25,7 +25,7 @@ def upgrade(): existing_type=postgresql.TIMESTAMP(), type_=sa.DateTime(timezone=True), existing_nullable=False, - existing_server_default=sa.sql.func.now(), + existing_server_default=sa.text("now()"), ) op.alter_column( "comp_runs", @@ -33,7 +33,7 @@ def upgrade(): existing_type=postgresql.TIMESTAMP(), type_=sa.DateTime(timezone=True), existing_nullable=False, - existing_server_default=sa.sql.func.now(), + existing_server_default=sa.text("now()"), ) op.alter_column( "comp_runs", @@ -74,7 +74,7 @@ def downgrade(): existing_type=sa.DateTime(timezone=True), type_=postgresql.TIMESTAMP(), existing_nullable=False, - existing_server_default=sa.sql.func.now(), + existing_server_default=sa.text("now()"), ) op.alter_column( "comp_runs", @@ -82,6 +82,6 @@ def downgrade(): existing_type=sa.DateTime(timezone=True), type_=postgresql.TIMESTAMP(), existing_nullable=False, - existing_server_default=sa.sql.func.now(), + existing_server_default=sa.text("now()"), ) # ### end Alembic commands ### From a642af11fbc275d823ff45457678adcedbca752c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 09:52:20 +0100 Subject: [PATCH 116/127] mypy --- .../versions/e05bdc5b3c7b_add_timezone_comp_runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py index 85e67092ceb..3d3d6c6896a 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e05bdc5b3c7b_add_timezone_comp_runs.py @@ -25,7 +25,7 @@ def upgrade(): existing_type=postgresql.TIMESTAMP(), type_=sa.DateTime(timezone=True), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default="now()", ) op.alter_column( "comp_runs", @@ -33,7 +33,7 @@ def upgrade(): existing_type=postgresql.TIMESTAMP(), type_=sa.DateTime(timezone=True), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default="now()", ) op.alter_column( "comp_runs", @@ -74,7 +74,7 @@ def downgrade(): existing_type=sa.DateTime(timezone=True), type_=postgresql.TIMESTAMP(), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default="now()", ) op.alter_column( "comp_runs", @@ -82,6 +82,6 @@ def downgrade(): existing_type=sa.DateTime(timezone=True), type_=postgresql.TIMESTAMP(), existing_nullable=False, - existing_server_default=sa.text("now()"), + existing_server_default="now()", ) # ### end Alembic commands ### From 3832cca241897fbe556e81f81d963fb369d98f00 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 10:05:18 +0100 Subject: [PATCH 117/127] the lifespan manager cannot be used with the TestClient --- services/director-v2/tests/conftest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/services/director-v2/tests/conftest.py b/services/director-v2/tests/conftest.py index d8a11779bf3..72b94ec3262 100644 --- a/services/director-v2/tests/conftest.py +++ b/services/director-v2/tests/conftest.py @@ -206,10 +206,14 @@ async def initialized_app(mock_env: EnvVarsDict) -> AsyncIterable[FastAPI]: @pytest.fixture() -async def client(initialized_app: FastAPI) -> AsyncIterator[TestClient]: +async def client(mock_env: EnvVarsDict) -> AsyncIterator[TestClient]: # NOTE: this way we ensure the events are run in the application # since it starts the app on a test server - with TestClient(initialized_app, raise_server_exceptions=True) as test_client: + settings = AppSettings.create_from_envs() + app = init_app(settings) + # NOTE: we cannot use the initialized_app fixture here as the TestClient also creates it + print("Application settings\n", settings.model_dump_json(indent=2)) + with TestClient(app, raise_server_exceptions=True) as test_client: yield test_client From 9f1b7481497804a71fbfed7a85db05e32c1740e3 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 13:15:18 +0100 Subject: [PATCH 118/127] added a silence_exceptions decorator --- .../src/servicelib/exception_utils.py | 35 ++++++++++++++++++- .../modules/comp_scheduler/_manager.py | 4 ++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/packages/service-library/src/servicelib/exception_utils.py b/packages/service-library/src/servicelib/exception_utils.py index 4f44d673838..57882507524 100644 --- a/packages/service-library/src/servicelib/exception_utils.py +++ b/packages/service-library/src/servicelib/exception_utils.py @@ -1,6 +1,9 @@ +import inspect import logging +from collections.abc import Callable from datetime import datetime -from typing import Final +from functools import wraps +from typing import Final, ParamSpec, TypeVar from pydantic import BaseModel, Field, NonNegativeFloat, PrivateAttr @@ -65,3 +68,33 @@ def else_reset(self) -> None: """error no longer occurs reset tracking""" self._first_exception_skip = None self._failure_counter = 0 + + +P = ParamSpec("P") +R = TypeVar("R") + + +def silence_exceptions( + exceptions: tuple[type[BaseException], ...] +) -> Callable[[Callable[P, R]], Callable[P, R]]: + def decorator(func: Callable[..., R]) -> Callable[..., R]: + @wraps(func) + def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | None: + try: + return func(*args, **kwargs) + except exceptions: + return None + + @wraps(func) + async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | None: + try: + assert inspect.isawaitable(func) # nosec + return await func(*args, **kwargs) + except exceptions: + return None + + if inspect.iscoroutinefunction(func): + return async_wrapper # type: ignore + return sync_wrapper # type: ignore + + return decorator diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py index 0d2cd0af74a..281c9fc4630 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_manager.py @@ -8,7 +8,9 @@ from models_library.projects import ProjectID from models_library.users import UserID from servicelib.background_task import start_periodic_task, stop_periodic_task +from servicelib.exception_utils import silence_exceptions from servicelib.logging_utils import log_context +from servicelib.redis import CouldNotAcquireLockError from servicelib.redis_utils import exclusive from servicelib.utils import limited_gather @@ -160,7 +162,7 @@ async def schedule_all_pipelines(app: FastAPI) -> None: async def setup_manager(app: FastAPI) -> None: app.state.scheduler_manager = start_periodic_task( - schedule_all_pipelines, + silence_exceptions((CouldNotAcquireLockError,))(schedule_all_pipelines), interval=SCHEDULER_INTERVAL, task_name=MODULE_NAME_SCHEDULER, app=app, From 570bfd03cfd9e275b5951c7cc9d43d097003f082 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 13:25:10 +0100 Subject: [PATCH 119/127] added test for silencing exceptions --- .../src/servicelib/exception_utils.py | 2 +- .../tests/test_exception_utils.py | 59 ++++++++++++++++++- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/packages/service-library/src/servicelib/exception_utils.py b/packages/service-library/src/servicelib/exception_utils.py index 57882507524..3046f870ded 100644 --- a/packages/service-library/src/servicelib/exception_utils.py +++ b/packages/service-library/src/servicelib/exception_utils.py @@ -88,7 +88,7 @@ def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | None: @wraps(func) async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | None: try: - assert inspect.isawaitable(func) # nosec + assert inspect.iscoroutinefunction(func) # nosec return await func(*args, **kwargs) except exceptions: return None diff --git a/packages/service-library/tests/test_exception_utils.py b/packages/service-library/tests/test_exception_utils.py index 299855e8241..fb55a9ae5af 100644 --- a/packages/service-library/tests/test_exception_utils.py +++ b/packages/service-library/tests/test_exception_utils.py @@ -4,7 +4,7 @@ import pytest from pydantic import PositiveFloat, PositiveInt -from servicelib.exception_utils import DelayedExceptionHandler +from servicelib.exception_utils import DelayedExceptionHandler, silence_exceptions TOLERANCE: Final[PositiveFloat] = 0.1 SLEEP_FOR: Final[PositiveFloat] = TOLERANCE * 0.1 @@ -49,3 +49,60 @@ def test_workflow_passes() -> None: def test_workflow_raises() -> None: with pytest.raises(TargetException): workflow(stop_raising_after=ITERATIONS + 1) + + +# Define some custom exceptions for testing +class CustomError(Exception): + pass + + +class AnotherCustomError(Exception): + pass + + +@silence_exceptions((CustomError,)) +def sync_function(*, raise_error: bool, raise_another_error: bool) -> str: + if raise_error: + raise CustomError + if raise_another_error: + raise AnotherCustomError + return "Success" + + +@silence_exceptions((CustomError,)) +async def async_function(*, raise_error: bool, raise_another_error: bool) -> str: + if raise_error: + raise CustomError + if raise_another_error: + raise AnotherCustomError + return "Success" + + +def test_sync_function_no_exception(): + result = sync_function(raise_error=False, raise_another_error=False) + assert result == "Success" + + +def test_sync_function_with_exception_is_silenced(): + result = sync_function(raise_error=True, raise_another_error=False) + assert result is None + + +async def test_async_function_no_exception(): + result = await async_function(raise_error=False, raise_another_error=False) + assert result == "Success" + + +async def test_async_function_with_exception_is_silenced(): + result = await async_function(raise_error=True, raise_another_error=False) + assert result is None + + +def test_sync_function_with_different_exception(): + with pytest.raises(AnotherCustomError): + sync_function(raise_error=False, raise_another_error=True) + + +async def test_sync_function_with_different_exception(): + with pytest.raises(AnotherCustomError): + await async_function(raise_error=False, raise_another_error=True) From 33e7259836be8e6dbd1130c58e2554c35aa11fdd Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 13:25:37 +0100 Subject: [PATCH 120/127] typo --- packages/service-library/tests/test_exception_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/service-library/tests/test_exception_utils.py b/packages/service-library/tests/test_exception_utils.py index fb55a9ae5af..a884d3dafb1 100644 --- a/packages/service-library/tests/test_exception_utils.py +++ b/packages/service-library/tests/test_exception_utils.py @@ -103,6 +103,6 @@ def test_sync_function_with_different_exception(): sync_function(raise_error=False, raise_another_error=True) -async def test_sync_function_with_different_exception(): +async def test_async_function_with_different_exception(): with pytest.raises(AnotherCustomError): await async_function(raise_error=False, raise_another_error=True) From 11cb145af68ccbd015ec361c4473e771456aeb03 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 15:53:52 +0100 Subject: [PATCH 121/127] fix serialization using context --- .../models/comp_pipelines.py | 19 ++++++++++++++++--- .../tests/unit/with_dbs/conftest.py | 7 ++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py b/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py index 5de823d826b..b790f733fb7 100644 --- a/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py +++ b/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py @@ -4,7 +4,13 @@ import networkx as nx from models_library.projects import ProjectID from models_library.projects_state import RunningState -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + SerializationInfo, + field_serializer, + field_validator, +) from simcore_postgres_database.models.comp_pipeline import StateType from ..utils.db import DB_TO_RUNNING_STATE @@ -15,9 +21,16 @@ class CompPipelineAtDB(BaseModel): dag_adjacency_list: dict[str, list[str]] # json serialization issue if using NodeID state: RunningState + @field_serializer("project_id") + @staticmethod + def _convert_uuid_to_str(v: ProjectID, info: SerializationInfo) -> str | ProjectID: + if info.context == "asyncpg": + return f"{v}" + return v + @field_validator("state", mode="before") @classmethod - def convert_state_from_state_type_enum_if_needed(cls, v): + def _convert_state_from_state_type_enum_if_needed(cls, v): if isinstance(v, str): # try to convert to a StateType, if it fails the validations will continue # and pydantic will try to convert it to a RunninState later on @@ -29,7 +42,7 @@ def convert_state_from_state_type_enum_if_needed(cls, v): @field_validator("dag_adjacency_list", mode="before") @classmethod - def auto_convert_dag(cls, v): + def _auto_convert_dag(cls, v): # this enforcement is here because the serialization using json is not happy with non str Dict keys, also comparison gets funny if the lists are having sometimes UUIDs or str. # NOTE: this might not be necessary anymore once we have something fully defined return {str(key): [str(n) for n in value] for key, value in v.items()} diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index ee8259f9f5b..96302476fb9 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -52,10 +52,15 @@ async def _(**pipeline_kwargs) -> CompPipelineAtDB: "state": StateType.NOT_STARTED, } pipeline_config.update(**pipeline_kwargs) + CompPipelineAtDB.model_validate(pipeline_config).model_dump(context="asyncpg") async with sqlalchemy_async_engine.begin() as conn: result = await conn.execute( comp_pipeline.insert() - .values(**pipeline_config) + .values( + **CompPipelineAtDB.model_validate(pipeline_config).model_dump( + context="asyncpg" + ) + ) .returning(sa.literal_column("*")) ) assert result From dede26a94e9df1271bb9fc5677e79a8efd172887 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 16:31:03 +0100 Subject: [PATCH 122/127] revert and fix --- .../models/comp_pipelines.py | 15 +-------------- .../director-v2/tests/unit/with_dbs/conftest.py | 7 +------ .../tests/unit/with_dbs/test_utils_rabbitmq.py | 2 +- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py b/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py index b790f733fb7..63017ee62e7 100644 --- a/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py +++ b/services/director-v2/src/simcore_service_director_v2/models/comp_pipelines.py @@ -4,13 +4,7 @@ import networkx as nx from models_library.projects import ProjectID from models_library.projects_state import RunningState -from pydantic import ( - BaseModel, - ConfigDict, - SerializationInfo, - field_serializer, - field_validator, -) +from pydantic import BaseModel, ConfigDict, field_validator from simcore_postgres_database.models.comp_pipeline import StateType from ..utils.db import DB_TO_RUNNING_STATE @@ -21,13 +15,6 @@ class CompPipelineAtDB(BaseModel): dag_adjacency_list: dict[str, list[str]] # json serialization issue if using NodeID state: RunningState - @field_serializer("project_id") - @staticmethod - def _convert_uuid_to_str(v: ProjectID, info: SerializationInfo) -> str | ProjectID: - if info.context == "asyncpg": - return f"{v}" - return v - @field_validator("state", mode="before") @classmethod def _convert_state_from_state_type_enum_if_needed(cls, v): diff --git a/services/director-v2/tests/unit/with_dbs/conftest.py b/services/director-v2/tests/unit/with_dbs/conftest.py index 96302476fb9..ee8259f9f5b 100644 --- a/services/director-v2/tests/unit/with_dbs/conftest.py +++ b/services/director-v2/tests/unit/with_dbs/conftest.py @@ -52,15 +52,10 @@ async def _(**pipeline_kwargs) -> CompPipelineAtDB: "state": StateType.NOT_STARTED, } pipeline_config.update(**pipeline_kwargs) - CompPipelineAtDB.model_validate(pipeline_config).model_dump(context="asyncpg") async with sqlalchemy_async_engine.begin() as conn: result = await conn.execute( comp_pipeline.insert() - .values( - **CompPipelineAtDB.model_validate(pipeline_config).model_dump( - context="asyncpg" - ) - ) + .values(**pipeline_config) .returning(sa.literal_column("*")) ) assert result diff --git a/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py b/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py index 32bfe4d52ff..8778d17245e 100644 --- a/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py +++ b/services/director-v2/tests/unit/with_dbs/test_utils_rabbitmq.py @@ -101,7 +101,7 @@ async def tasks( create_tasks: Callable[..., Awaitable[list[CompTaskAtDB]]], ) -> list[CompTaskAtDB]: await create_pipeline( - project_id=project.uuid, + project_id=f"{project.uuid}", dag_adjacency_list=fake_workbench_adjacency, ) comp_tasks = await create_tasks(user, project) From 7f279c567eed6b5ad783d9c6c1c5a58386dd145d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 29 Nov 2024 17:21:33 +0100 Subject: [PATCH 123/127] fix tests --- .../src/servicelib/exception_utils.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/packages/service-library/src/servicelib/exception_utils.py b/packages/service-library/src/servicelib/exception_utils.py index 3046f870ded..2de33fd98e6 100644 --- a/packages/service-library/src/servicelib/exception_utils.py +++ b/packages/service-library/src/servicelib/exception_utils.py @@ -3,7 +3,7 @@ from collections.abc import Callable from datetime import datetime from functools import wraps -from typing import Final, ParamSpec, TypeVar +from typing import Any, Final, ParamSpec, TypeVar from pydantic import BaseModel, Field, NonNegativeFloat, PrivateAttr @@ -73,28 +73,31 @@ def else_reset(self) -> None: P = ParamSpec("P") R = TypeVar("R") +F = TypeVar("F", bound=Callable[..., Any]) -def silence_exceptions( - exceptions: tuple[type[BaseException], ...] -) -> Callable[[Callable[P, R]], Callable[P, R]]: - def decorator(func: Callable[..., R]) -> Callable[..., R]: - @wraps(func) - def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | None: - try: - return func(*args, **kwargs) - except exceptions: - return None - @wraps(func) - async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> R | None: +def silence_exceptions(exceptions: tuple[type[BaseException], ...]) -> Callable[[F], F]: + def _decorator(func_or_coro: F) -> F: + + if inspect.iscoroutinefunction(func_or_coro): + + @wraps(func_or_coro) + async def _async_wrapper(*args, **kwargs) -> Any: + try: + assert inspect.iscoroutinefunction(func_or_coro) # nosec + return await func_or_coro(*args, **kwargs) + except exceptions: + return None + + return _async_wrapper # type: ignore[return-value] # decorators typing is hard + + @wraps(func_or_coro) + def _sync_wrapper(*args, **kwargs) -> Any: try: - assert inspect.iscoroutinefunction(func) # nosec - return await func(*args, **kwargs) + return func_or_coro(*args, **kwargs) except exceptions: return None - if inspect.iscoroutinefunction(func): - return async_wrapper # type: ignore - return sync_wrapper # type: ignore + return _sync_wrapper # type: ignore[return-value] # decorators typing is hard - return decorator + return _decorator From 3c93db21e47fcfd79d29864cfe12e2b79c5fcd99 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 2 Dec 2024 09:15:38 +0100 Subject: [PATCH 124/127] @pcrespov review: remove match and use a mapping --- .../modules/db/repositories/comp_runs.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 68798d89a29..50070611394 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -1,6 +1,6 @@ import datetime import logging -from typing import Any +from typing import Any, Final import arrow import sqlalchemy as sa @@ -20,7 +20,6 @@ ClusterNotFoundError, ComputationalRunNotFoundError, DirectorError, - ProjectNotFoundError, UserNotFoundError, ) from ....models.comp_runs import CompRunsAtDB, RunMetadataDict @@ -30,6 +29,20 @@ logger = logging.getLogger(__name__) +_POSTGRES_ERROR_TO_ERROR_MAP: Final[ + dict[tuple[str, ...], tuple[type[DirectorError], tuple[str, ...]]] +] = { + ("users", "user_id"): (UserNotFoundError, ("users", "user_id")), + ("projects", "project_uuid"): ( + UserNotFoundError, + ("projects", "project_id"), + ), + ("clusters", "cluster_id"): ( + ClusterNotFoundError, + ("clusters", "cluster_id"), + ), +} + class CompRunsRepository(BaseRepository): async def get( @@ -173,15 +186,13 @@ async def create( return CompRunsAtDB.model_validate(row) except ForeignKeyViolation as exc: message = exc.args[0] - match message: - case s if "users" in s and "user_id" in s: - raise UserNotFoundError(user_id=user_id) from exc - case s if "projects" in s and "project_uuid" in s: - raise ProjectNotFoundError(project_id=project_id) from exc - case s if "clusters" in s and "cluster_id" in s: - raise ClusterNotFoundError(cluster_id=cluster_id) from exc - case _: - raise DirectorError from exc + + for pg_keys, (exc_type, exc_keys) in _POSTGRES_ERROR_TO_ERROR_MAP.items(): + if all(k in message for k in pg_keys): + raise exc_type( + **{f"{k}": locals().get(k) for k in exc_keys} + ) from exc + raise DirectorError from exc async def update( self, user_id: UserID, project_id: ProjectID, iteration: PositiveInt, **values From b7d1d2c4c045fa928aeab347370e72369d637271 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 2 Dec 2024 09:56:52 +0100 Subject: [PATCH 125/127] fixed typo --- .../modules/db/repositories/comp_runs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index 50070611394..ae5ba212ec3 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -20,6 +20,7 @@ ClusterNotFoundError, ComputationalRunNotFoundError, DirectorError, + ProjectNotFoundError, UserNotFoundError, ) from ....models.comp_runs import CompRunsAtDB, RunMetadataDict @@ -34,7 +35,7 @@ ] = { ("users", "user_id"): (UserNotFoundError, ("users", "user_id")), ("projects", "project_uuid"): ( - UserNotFoundError, + ProjectNotFoundError, ("projects", "project_id"), ), ("clusters", "cluster_id"): ( From 56048864ec2ba9cba9475a4c43a677eecbc3326d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 2 Dec 2024 10:43:14 +0100 Subject: [PATCH 126/127] @pcrespov review: use more robust checks --- .../modules/db/repositories/comp_runs.py | 25 ++++++++++++------- .../with_dbs/test_api_route_computations.py | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py index ae5ba212ec3..b746407a8aa 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_runs.py @@ -30,19 +30,23 @@ logger = logging.getLogger(__name__) -_POSTGRES_ERROR_TO_ERROR_MAP: Final[ - dict[tuple[str, ...], tuple[type[DirectorError], tuple[str, ...]]] +_POSTGRES_FK_COLUMN_TO_ERROR_MAP: Final[ + dict[sa.Column, tuple[type[DirectorError], tuple[str, ...]]] ] = { - ("users", "user_id"): (UserNotFoundError, ("users", "user_id")), - ("projects", "project_uuid"): ( + comp_runs.c.user_id: (UserNotFoundError, ("users", "user_id")), + comp_runs.c.project_uuid: ( ProjectNotFoundError, ("projects", "project_id"), ), - ("clusters", "cluster_id"): ( + comp_runs.c.cluster_id: ( ClusterNotFoundError, ("clusters", "cluster_id"), ), } +_DEFAULT_FK_CONSTRAINT_TO_ERROR: Final[tuple[type[DirectorError], tuple]] = ( + DirectorError, + (), +) class CompRunsRepository(BaseRepository): @@ -186,10 +190,13 @@ async def create( row = await result.first() return CompRunsAtDB.model_validate(row) except ForeignKeyViolation as exc: - message = exc.args[0] - - for pg_keys, (exc_type, exc_keys) in _POSTGRES_ERROR_TO_ERROR_MAP.items(): - if all(k in message for k in pg_keys): + assert exc.diag.constraint_name # nosec # noqa: PT017 + for foreign_key in comp_runs.foreign_keys: + if exc.diag.constraint_name == foreign_key.name: + assert foreign_key.parent is not None # nosec + exc_type, exc_keys = _POSTGRES_FK_COLUMN_TO_ERROR_MAP[ + foreign_key.parent + ] raise exc_type( **{f"{k}": locals().get(k) for k in exc_keys} ) from exc diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py index 3c7076b0aec..b2070d4bb44 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py @@ -927,7 +927,7 @@ async def test_get_computation_from_not_started_computation_task( f"/v2/computations/{proj.uuid}?user_id={user['id']}" ) await create_pipeline( - project_id=proj.uuid, + project_id=f"{proj.uuid}", dag_adjacency_list=fake_workbench_adjacency, ) # create no task this should trigger an exception From a0e3990739ae93c9a89c137b9f53d7347c0e815a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:07:28 +0100 Subject: [PATCH 127/127] fixed syntax --- .../tests/unit/with_dbs/test_api_route_computations.py | 5 ++--- .../tests/unit/with_dbs/test_api_route_computations_tasks.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py index b2070d4bb44..2e75b18c009 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations.py @@ -87,7 +87,6 @@ def minimal_configuration( rabbit_service: RabbitSettings, redis_service: RedisSettings, monkeypatch: pytest.MonkeyPatch, - mocked_rabbit_mq_client: None, faker: Faker, ): monkeypatch.setenv("DIRECTOR_V2_DYNAMIC_SIDECAR_ENABLED", "false") @@ -885,7 +884,7 @@ async def test_get_computation_from_empty_project( assert response.status_code == status.HTTP_404_NOT_FOUND, response.text # create an empty pipeline await create_pipeline( - project_id=proj.uuid, + project_id=f"{proj.uuid}", ) response = await async_client.get(get_computation_url) assert response.status_code == status.HTTP_200_OK, response.text @@ -997,7 +996,7 @@ async def test_get_computation_from_published_computation_task( user = registered_user() proj = await project(user, workbench=fake_workbench_without_outputs) await create_pipeline( - project_id=proj.uuid, + project_id=f"{proj.uuid}", dag_adjacency_list=fake_workbench_adjacency, ) comp_tasks = await create_tasks( diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py b/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py index 80f329658b5..845983b99cb 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_computations_tasks.py @@ -126,7 +126,7 @@ async def project_id( # insert pipeline -> comp_pipeline await create_pipeline( - project_id=proj.uuid, + project_id=f"{proj.uuid}", dag_adjacency_list=fake_workbench_adjacency, ) # insert tasks -> comp_tasks