From 50bd5692f1b9f289c6c62e6a045437bdafe785ed Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Sep 2024 14:08:13 -0400 Subject: [PATCH 01/12] WIP: Add delete org background job with access to org ops Not yet working because of ModuleNotFound import error in container running main_bg.py --- backend/btrixcloud/background_jobs.py | 79 ++++++++++++-- backend/btrixcloud/crawlmanager.py | 28 +++++ backend/btrixcloud/main.py | 2 +- backend/btrixcloud/main_bg.py | 136 ++++++++++++++++++++++++ backend/btrixcloud/models.py | 19 +++- backend/btrixcloud/orgs.py | 15 ++- backend/test/test_z_delete_org.py | 27 ++++- chart/app-templates/delete_org_job.yaml | 43 ++++++++ chart/templates/configmap.yaml | 4 + 9 files changed, 337 insertions(+), 16 deletions(-) create mode 100644 backend/btrixcloud/main_bg.py create mode 100644 chart/app-templates/delete_org_job.yaml diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index de3601fd94..7f3c3db983 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -1,6 +1,7 @@ """k8s background jobs""" import asyncio +import os from datetime import datetime from typing import Optional, Tuple, Union, List, Dict, TYPE_CHECKING, cast from uuid import UUID @@ -19,6 +20,7 @@ BgJobType, CreateReplicaJob, DeleteReplicaJob, + DeleteOrgJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -273,6 +275,51 @@ async def create_delete_replica_job( ) return None + async def create_delete_org_job( + self, + org: Organization, + existing_job_id: Optional[str] = None, + ) -> Optional[str]: + """Create background job to delete org and its data""" + + try: + job_id = await self.crawl_manager.run_delete_org_job( + oid=str(org.id), + backend_image=os.environ.get("BACKEND_IMAGE", ""), + pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), + existing_job_id=existing_job_id, + ) + if existing_job_id: + delete_org_job = await self.get_background_job(existing_job_id, org.id) + previous_attempt = { + "started": delete_org_job.started, + "finished": delete_org_job.finished, + } + if delete_org_job.previousAttempts: + delete_org_job.previousAttempts.append(previous_attempt) + else: + delete_org_job.previousAttempts = [previous_attempt] + delete_org_job.started = dt_now() + delete_org_job.finished = None + delete_org_job.success = None + else: + delete_org_job = DeleteOrgJob( + id=job_id, + oid=org.id, + started=dt_now(), + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": delete_org_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + # pylint: disable=raise-missing-from + print(f"warning: delete org job could not be started: {exc}") + return None + async def job_finished( self, job_id: str, @@ -316,10 +363,13 @@ async def job_finished( ) async def get_background_job( - self, job_id: str, oid: UUID - ) -> Union[CreateReplicaJob, DeleteReplicaJob]: + self, job_id: str, oid: Optional[UUID] = None + ) -> Union[CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob]: """Get background job""" - query: dict[str, object] = {"_id": job_id, "oid": oid} + query: dict[str, object] = {"_id": job_id} + if oid: + query["oid"] = oid + res = await self.jobs.find_one(query) if not res: raise HTTPException(status_code=404, detail="job_not_found") @@ -331,9 +381,10 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.CREATE_REPLICA: return CreateReplicaJob.from_dict(data) - return 
DeleteReplicaJob.from_dict(data) + if data["type"] == BgJobType.DELETE_REPLICA: + return DeleteReplicaJob.from_dict(data) - # return BackgroundJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( self, @@ -432,9 +483,8 @@ async def retry_background_job( if job.success: raise HTTPException(status_code=400, detail="job_already_succeeded") - file = await self.get_replica_job_file(job, org) - if job.type == BgJobType.CREATE_REPLICA: + file = await self.get_replica_job_file(job, org) primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage) primary_endpoint, bucket_suffix = self.strip_bucket( primary_storage.endpoint_url @@ -452,6 +502,7 @@ async def retry_background_job( ) if job.type == BgJobType.DELETE_REPLICA: + file = await self.get_replica_job_file(job, org) await self.create_delete_replica_job( org, file, @@ -461,6 +512,12 @@ async def retry_background_job( existing_job_id=job_id, ) + if job.type == BgJobType.DELETE_ORG: + await self.create_delete_org_job( + org, + existing_job_id=job_id, + ) + return {"success": True} async def retry_failed_background_jobs( @@ -523,6 +580,14 @@ async def get_background_job( """Retrieve information for background job""" return await ops.get_background_job(job_id, org.id) + @app.post("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"]) + async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): + """Retry failed background jobs from all orgs""" + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + return await ops.get_background_job(job_id) + @router.post("/{job_id}/retry", response_model=SuccessResponse) async def retry_background_job( job_id: str, diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index dd2ef3d116..a77537ef49 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -110,6 +110,34 @@ async def run_replica_job( return job_id + async def run_delete_org_job( + self, + oid: str, + backend_image: str, + pull_policy: str, + existing_job_id: Optional[str] = None, + ): + """run job to delete org and all of its data""" + + if existing_job_id: + job_id = existing_job_id + else: + job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" + + params = { + "id": job_id, + "oid": oid, + "job_type": BgJobType.DELETE_ORG.value, + "backend_image": backend_image, + "pull_policy": pull_policy, + } + + data = self.templates.env.get_template("delete_org_job.yaml").render(params) + + await self.create_from_yaml(data) + + return job_id + async def create_crawl_job( self, crawlconfig: CrawlConfig, diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index cb1610a98a..a16ec0edf7 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -243,7 +243,7 @@ def main() -> None: init_uploads_api(*base_crawl_init) - org_ops.set_ops(base_crawl_ops, profiles, coll_ops) + org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops) user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py new file mode 100644 index 0000000000..fdc4556b44 --- /dev/null +++ b/backend/btrixcloud/main_bg.py @@ -0,0 +1,136 @@ +""" entrypoint module for background jobs """ + +import asyncio +import os +import sys +import traceback +from uuid import UUID + +from .crawlmanager import CrawlManager +from .db import init_db +from .emailsender import EmailSender + +# from 
.utils import register_exit_handler +from .models import BgJobType + +from .basecrawls import BaseCrawlOps +from .invites import InviteOps +from .users import init_user_manager +from .orgs import OrgOps +from .colls import CollectionOps +from .crawlconfigs import CrawlConfigOps +from .crawls import CrawlOps +from .profiles import ProfileOps +from .storages import StorageOps +from .webhooks import EventWebhookOps +from .background_jobs import BackgroundJobOps +from .pages import PageOps + +job_type = os.environ.get("BG_JOB_TYPE") +oid = os.environ.get("OID") + + +# ============================================================================ +# pylint: disable=too-many-function-args, duplicate-code, too-many-locals +async def main(): + """main init""" + email = EmailSender() + crawl_manager = None + + dbclient, mdb = init_db() + + invite_ops = InviteOps(mdb, email) + + user_manager = init_user_manager(mdb, email, invite_ops) + + org_ops = OrgOps(mdb, invite_ops, user_manager) + + event_webhook_ops = EventWebhookOps(mdb, org_ops) + + # pylint: disable=import-outside-toplevel + if not os.environ.get("KUBERNETES_SERVICE_HOST"): + print( + "Sorry, the Browsertrix Backend must be run inside a Kubernetes environment.\ + Kubernetes not detected (KUBERNETES_SERVICE_HOST is not set), Exiting" + ) + sys.exit(1) + + crawl_manager = CrawlManager() + + storage_ops = StorageOps(org_ops, crawl_manager) + + background_job_ops = BackgroundJobOps( + mdb, email, user_manager, org_ops, crawl_manager, storage_ops + ) + + profile_ops = ProfileOps( + mdb, org_ops, crawl_manager, storage_ops, background_job_ops + ) + + crawl_config_ops = CrawlConfigOps( + dbclient, + mdb, + user_manager, + org_ops, + crawl_manager, + profile_ops, + ) + + user_manager.set_ops(org_ops, crawl_config_ops, None) + + coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops) + + base_crawl_ops = BaseCrawlOps( + mdb, + user_manager, + org_ops, + crawl_config_ops, + coll_ops, + storage_ops, + event_webhook_ops, + background_job_ops, + ) + + crawl_ops = CrawlOps( + crawl_manager, + mdb, + user_manager, + org_ops, + crawl_config_ops, + coll_ops, + storage_ops, + event_webhook_ops, + background_job_ops, + ) + + page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + + crawl_ops.set_page_ops(page_ops) + + background_job_ops.set_ops(crawl_ops, profile_ops) + + org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops) + + # Refactor, improve error handling + if job_type == BgJobType.DELETE_REPLICA: + if not oid: + return + org = await org_ops.get_org_by_id(UUID(oid)) + if not org: + return + + try: + await org_ops.delete_org_and_data(org, user_manager) + return 0 + # pylint: disable=broad-exception-caught + except Exception: + traceback.print_exc() + return 1 + + print(f"Provided job type {job_type} not currently supported") + return 1 + + +# # ============================================================================ +if __name__ == "__main__": + asyncio.run(main()) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index a011cbf186..a34392a2ee 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1966,6 +1966,7 @@ class BgJobType(str, Enum): CREATE_REPLICA = "create-replica" DELETE_REPLICA = "delete-replica" + DELETE_ORG = "delete-org" # ============================================================================ @@ -2004,10 +2005,19 @@ class DeleteReplicaJob(BackgroundJob): replica_storage: StorageRef +# 
============================================================================ +class DeleteOrgJob(BackgroundJob): + """Model for tracking deletion of org data jobs""" + + type: Literal[BgJobType.DELETE_ORG] = BgJobType.DELETE_ORG + + # ============================================================================ # Union of all job types, for response model -AnyJob = RootModel[Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob]] +AnyJob = RootModel[ + Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob, DeleteOrgJob] +] # ============================================================================ @@ -2227,6 +2237,13 @@ class DeletedResponse(BaseModel): deleted: bool +# ============================================================================ +class DeletedResponseId(DeletedResponse): + """Response for delete API endpoints that return job id""" + + id: str + + # ============================================================================ class DeletedResponseQuota(DeletedResponse): """Response for delete API endpoints""" diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 2b278caf77..e0f6d647f3 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -66,7 +66,7 @@ PAUSED_PAYMENT_FAILED, REASON_PAUSED, ACTIVE, - DeletedResponse, + DeletedResponseId, UpdatedResponse, AddedResponse, AddedResponseId, @@ -93,8 +93,10 @@ from .colls import CollectionOps from .profiles import ProfileOps from .users import UserManager + from .background_jobs import BackgroundJobOps else: - InviteOps = BaseCrawlOps = ProfileOps = CollectionOps = UserManager = object + InviteOps = BaseCrawlOps = ProfileOps = CollectionOps = object + BackgroundJobOps = UserManager = object DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization") @@ -150,12 +152,14 @@ def set_ops( base_crawl_ops: BaseCrawlOps, profile_ops: ProfileOps, coll_ops: CollectionOps, + background_job_ops: BackgroundJobOps, ) -> None: """Set base crawl ops""" # pylint: disable=attribute-defined-outside-init self.base_crawl_ops = base_crawl_ops self.profile_ops = profile_ops self.coll_ops = coll_ops + self.background_job_ops = background_job_ops def set_default_primary_storage(self, storage: StorageRef): """set default primary storage""" @@ -1438,15 +1442,16 @@ async def get_org( org_out.execMinutesQuotaReached = ops.exec_mins_quota_reached(org) return org_out - @router.delete("", tags=["organizations"], response_model=DeletedResponse) + @router.delete("", tags=["organizations"], response_model=DeletedResponseId) async def delete_org( org: Organization = Depends(org_dep), user: User = Depends(user_dep) ): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - await ops.delete_org_and_data(org, user_manager) - return {"deleted": True} + job_id = await ops.background_job_ops.create_delete_org_job(org) + + return {"deleted": True, "id": job_id} @router.post("/rename", tags=["organizations"], response_model=UpdatedResponse) async def rename_org( diff --git a/backend/test/test_z_delete_org.py b/backend/test/test_z_delete_org.py index 2de04be119..7d875a7eee 100644 --- a/backend/test/test_z_delete_org.py +++ b/backend/test/test_z_delete_org.py @@ -54,9 +54,32 @@ def test_delete_org_superadmin(admin_auth_headers, default_org_id): f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers ) assert r.status_code == 200 - assert r.json()["deleted"] + data = r.json() + assert data["deleted"] + + job_id = data["id"] + + # Check that background job is launched and eventually 
succeeds + max_attempts = 18 + attempts = 1 + while attempts <= max_attempts: + try: + r = requests.get( + f"{API_PREFIX}/orgs/all/jobs/{job_id}", headers=admin_auth_headers + ) + assert r.status_code == 200 + data = r.json() + if data["success"]: + break + time.sleep(10) + except: + pass + attempts += 1 + + # Ensure org and items got deleted + r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers) + assert r.status_code == 404 - # Ensure items got deleted for item_id in item_ids: r = requests.get( f"{API_PREFIX}/orgs/all/all-crawls/{item_id}/replay.json", diff --git a/chart/app-templates/delete_org_job.yaml b/chart/app-templates/delete_org_job.yaml new file mode 100644 index 0000000000..bd59c12614 --- /dev/null +++ b/chart/app-templates/delete_org_job.yaml @@ -0,0 +1,43 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ id }}" + labels: + role: "background-job" + job_type: {{ job_type }} + btrix.org: {{ oid }} + +spec: + ttlSecondsAfterFinished: 0 + backoffLimit: 3 + template: + spec: + restartPolicy: Never + priorityClassName: bg-job + podFailurePolicy: + rules: + - action: FailJob + onExitCodes: + containerName: btrixbgjob + operator: NotIn + values: [0] + containers: + - name: btrixbgjob + image: {{ backend_image }} + imagePullPolicy: {{ pull_policy }} + env: + - name: BG_JOB_TYPE + value: {{ job_type }} + + - name: OID + value: {{ oid }} + + command: ["python3", "/app/btrixcloud/main_bg.py"] + + resources: + limits: + memory: "200Mi" + + requests: + memory: "200Mi" + cpu: "50m" diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 265850e3d4..73afa3575e 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -72,6 +72,10 @@ data: LOG_SENT_EMAILS: "{{ .Values.email.log_sent_emails }}" + BACKEND_IMAGE: "{{ .Values.backend_image }}" + + BACKEND_IMAGE_PULL_POLICY: "{{ .Values.backend_pull_policy }}" + --- apiVersion: v1 kind: ConfigMap From 17c9cfc94f0edfe948307ae0c9e1bb5f1910cb03 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Sep 2024 18:10:21 -0400 Subject: [PATCH 02/12] Refine test --- backend/test/test_z_delete_org.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/backend/test/test_z_delete_org.py b/backend/test/test_z_delete_org.py index 7d875a7eee..3b3b5bf659 100644 --- a/backend/test/test_z_delete_org.py +++ b/backend/test/test_z_delete_org.py @@ -62,18 +62,27 @@ def test_delete_org_superadmin(admin_auth_headers, default_org_id): # Check that background job is launched and eventually succeeds max_attempts = 18 attempts = 1 - while attempts <= max_attempts: + while True: try: r = requests.get( f"{API_PREFIX}/orgs/all/jobs/{job_id}", headers=admin_auth_headers ) assert r.status_code == 200 - data = r.json() - if data["success"]: + success = r.json()["success"] + + if success: break + + if success is False: + assert False + + if attempts >= max_attempts: + assert False + time.sleep(10) except: pass + attempts += 1 # Ensure org and items got deleted From 7797fd782084debeaea4964be86e918e4433e6f2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 5 Sep 2024 10:42:18 -0400 Subject: [PATCH 03/12] Improve error handling --- backend/btrixcloud/main_bg.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index fdc4556b44..c2fd58fbe3 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -111,13 +111,15 @@ async def main(): 
org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops)
 
-    # Refactor, improve error handling
+    # Run job
     if job_type == BgJobType.DELETE_REPLICA:
         if not oid:
-            return
+            print("Org id missing, quitting")
+            return 1
         org = await org_ops.get_org_by_id(UUID(oid))
         if not org:
-            return
+            print("Org id invalid, quitting")
+            return 1
 
         try:
             await org_ops.delete_org_and_data(org, user_manager)

From 8cb2b5e4caf7f07804cb35278056080139ffc35e Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 5 Sep 2024 11:51:36 -0400
Subject: [PATCH 04/12] WIP: Rename background-job template and load secrets

Loading from secrets is not working; not sure what the issue is yet
---
 backend/btrixcloud/crawlmanager.py      |  2 +-
 chart/app-templates/background_job.yaml | 61 +++++++++++++++++++++++++
 chart/app-templates/delete_org_job.yaml | 43 -----------------
 3 files changed, 62 insertions(+), 44 deletions(-)
 create mode 100644 chart/app-templates/background_job.yaml
 delete mode 100644 chart/app-templates/delete_org_job.yaml

diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index a77537ef49..e7dd594a47 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -132,7 +132,7 @@ async def run_delete_org_job(
             "pull_policy": pull_policy,
         }
 
-        data = self.templates.env.get_template("delete_org_job.yaml").render(params)
+        data = self.templates.env.get_template("background_job.yaml").render(params)
 
         await self.create_from_yaml(data)
 
diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml
new file mode 100644
index 0000000000..a0117b260d
--- /dev/null
+++ b/chart/app-templates/background_job.yaml
@@ -0,0 +1,61 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{ id }}"
+  labels:
+    role: "background-job"
+    job_type: {{ job_type }}
+    btrix.org: {{ oid }}
+
+spec:
+  ttlSecondsAfterFinished: 0
+  backoffLimit: 3
+  template:
+    spec:
+      restartPolicy: Never
+      priorityClassName: bg-job
+      podFailurePolicy:
+        rules:
+        - action: FailJob
+          onExitCodes:
+            containerName: btrixbgjob
+            operator: NotIn
+            values: [0]
+
+      volumes:
+        - name: ops-configs
+          secret:
+            secretName: ops-configs
+
+      containers:
+        - name: btrixbgjob
+          image: {{ backend_image }}
+          imagePullPolicy: {{ pull_policy }}
+          env:
+            - name: BG_JOB_TYPE
+              value: {{ job_type }}
+
+            - name: OID
+              value: {{ oid }}
+
+          envFrom:
+            - configMapRef:
+                name: backend-env-config
+            - secretRef:
+                name: backend-auth
+            - secretRef:
+                name: mongo-auth
+
+          volumeMounts:
+            - name: ops-configs
+              mountPath: /ops-configs/
+
+          command: ["python3", "-m", "btrixcloud.main_bg"]
+
+          resources:
+            limits:
+              memory: "200Mi"
+
+            requests:
+              memory: "200Mi"
+              cpu: "50m"
diff --git a/chart/app-templates/delete_org_job.yaml b/chart/app-templates/delete_org_job.yaml
deleted file mode 100644
index bd59c12614..0000000000
--- a/chart/app-templates/delete_org_job.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: "{{ id }}"
-  labels:
-    role: "background-job"
-    job_type: {{ job_type }}
-    btrix.org: {{ oid }}
-
-spec:
-  ttlSecondsAfterFinished: 0
-  backoffLimit: 3
-  template:
-    spec:
-      restartPolicy: Never
-      priorityClassName: bg-job
-      podFailurePolicy:
-        rules:
-        - action: FailJob
-          onExitCodes:
-            containerName: btrixbgjob
-            operator: NotIn
-            values: [0]
-      containers:
-        - name: btrixbgjob
-          image: {{ backend_image }}
-          imagePullPolicy: {{ pull_policy }}
-          env:
-            - name: BG_JOB_TYPE
-              value: {{ job_type }}
-
-            - name: OID
-              value: {{ oid }}
-
-          command: ["python3", "/app/btrixcloud/main_bg.py"]
-
-          resources:
-            limits:
-              memory: "200Mi"
-
-            requests:
-              memory: "200Mi"
-              cpu: "50m"

From 33daefaaa5869a8704c16d6d61f31e70b7820f7c Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 5 Sep 2024 12:02:55 -0400
Subject: [PATCH 05/12] Add failed, commented-out attempt to use default
 namespace

Likely not the right approach, but I wanted to see if the background
job template's inability to load secrets was due to the crawler
namespace not having access to them
---
 backend/btrixcloud/crawlmanager.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index e7dd594a47..4d821cfad2 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -136,6 +136,14 @@ async def run_delete_org_job(
 
         await self.create_from_yaml(data)
 
+        # Attempt to use default namespace to have access to secrets,
+        # results in 403 forbidden: default service account cannot create
+        # resources "jobs" in API group "batch" in namespace "default"
+        # await self.create_from_yaml(
+        #    data,
+        #    namespace=os.environ.get("DEFAULT_NAMESPACE", "default")
+        # )
+
         return job_id
 
     async def create_crawl_job(

From c48d184bca739e2802c31cf68fce0f6fe2bb0f8f Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 5 Sep 2024 12:22:19 -0400
Subject: [PATCH 06/12] Add necessary secrets and volumes to crawler namespace

Job now seems to run and exit successfully but doesn't actually delete
the org yet.
---
 backend/btrixcloud/crawlmanager.py      |  8 ---
 chart/app-templates/background_job.yaml |  2 -
 chart/templates/configmap.yaml          | 75 +++++++++++++++++++++++++
 chart/templates/secrets.yaml            | 13 +++++
 4 files changed, 88 insertions(+), 10 deletions(-)

diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index 4d821cfad2..e7dd594a47 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -136,14 +136,6 @@ async def run_delete_org_job(
 
         await self.create_from_yaml(data)
 
-        # Attempt to use default namespace to have access to secrets,
-        # results in 403 forbidden: default service account cannot create
-        # resources "jobs" in API group "batch" in namespace "default"
-        # await self.create_from_yaml(
-        #    data,
-        #    namespace=os.environ.get("DEFAULT_NAMESPACE", "default")
-        # )
-
         return job_id
 
     async def create_crawl_job(
diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml
index a0117b260d..132d3bf8fe 100644
--- a/chart/app-templates/background_job.yaml
+++ b/chart/app-templates/background_job.yaml
@@ -41,8 +41,6 @@ spec:
       envFrom:
         - configMapRef:
             name: backend-env-config
-        - secretRef:
-            name: backend-auth
         - secretRef:
             name: mongo-auth
 
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 73afa3575e..9b9d703d3e 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -76,6 +76,81 @@ data:
   BACKEND_IMAGE_PULL_POLICY: "{{ .Values.backend_pull_policy }}"
 
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: backend-env-config
+  namespace: {{ .Values.crawler_namespace }}
+
+data:
+  APP_ORIGIN: {{ .Values.ingress.tls | ternary "https" "http" }}://{{ or .Values.ingress.host ( print "localhost:" ( .Values.local_service_port | default 9870 )) }}
+
+  CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
+
+  DEFAULT_NAMESPACE: {{ .Release.Namespace }}
+
+  FRONTEND_ORIGIN: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }}
+
+  CRAWLER_FQDN_SUFFIX: 
".{{ .Values.crawler_namespace }}.svc.cluster.local" + + DEFAULT_ORG: "{{ .Values.default_org }}" + + INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}" + + REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}" + + REGISTER_TO_ORG_ID: "{{ .Values.registration_org_id }}" + + ALLOW_DUPE_INVITES: "{{ .Values.allow_dupe_invites | default 0 }}" + + JWT_TOKEN_LIFETIME_MINUTES: "{{ .Values.jwt_token_lifetime_minutes | default 60 }}" + + DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}" + + DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}" + + DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}" + + MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}" + + IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}" + + RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}" + + PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}" + + FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}" + + MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}" + + LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}" + + IS_LOCAL_MINIO: "{{ .Values.minio_local }}" + + STORAGES_JSON: "/ops-configs/storages.json" + + CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json" + + MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}" + + MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}" + + ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}" + + BILLING_ENABLED: "{{ .Values.billing_enabled }}" + + SIGN_UP_URL: "{{ .Values.sign_up_url }}" + + SALES_EMAIL: "{{ .Values.sales_email }}" + + LOG_SENT_EMAILS: "{{ .Values.email.log_sent_emails }}" + + BACKEND_IMAGE: "{{ .Values.backend_image }}" + + BACKEND_IMAGE_PULL_POLICY: "{{ .Values.backend_pull_policy }}" + --- apiVersion: v1 kind: ConfigMap diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index 4ee89e9f50..d3a8a33855 100644 --- a/chart/templates/secrets.yaml +++ b/chart/templates/secrets.yaml @@ -34,6 +34,19 @@ data: crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: ops-configs + namespace: {{ $.Values.crawler_namespace }} + +type: Opaque +data: + storages.json: {{ .Values.storages | toJson | b64enc | quote }} + crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }} + + {{- range $storage := .Values.storages }} --- apiVersion: v1 From d6ecc6254645aae239b1e9fecb0111e992c262ee Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 2 Oct 2024 12:30:58 -0400 Subject: [PATCH 07/12] Get background jobs working --- backend/btrixcloud/main_bg.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index c2fd58fbe3..8b56e6a8da 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -76,8 +76,6 @@ async def main(): profile_ops, ) - user_manager.set_ops(org_ops, crawl_config_ops, None) - coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops) base_crawl_ops = BaseCrawlOps( @@ -105,14 +103,21 @@ async def main(): page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + base_crawl_ops.set_page_ops(page_ops) crawl_ops.set_page_ops(page_ops) background_job_ops.set_ops(crawl_ops, profile_ops) org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops) + 
user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops) + + background_job_ops.set_ops(base_crawl_ops, profile_ops) + + crawl_config_ops.set_coll_ops(coll_ops) + # Run job - if job_type == BgJobType.DELETE_REPLICA: + if job_type == BgJobType.DELETE_ORG: if not oid: print("Org id missing, quitting") return 1 @@ -135,4 +140,5 @@ async def main(): # # ============================================================================ if __name__ == "__main__": - asyncio.run(main()) + return_code = asyncio.run(main()) + sys.exit(return_code) From 9c2c52f0bc00f4f601a04558244b14cb8fd22750 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 2 Oct 2024 13:05:33 -0400 Subject: [PATCH 08/12] Fix HTTP method for get job from all orgs --- backend/btrixcloud/background_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 7f3c3db983..e776572d26 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -580,7 +580,7 @@ async def get_background_job( """Retrieve information for background job""" return await ops.get_background_job(job_id, org.id) - @app.post("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"]) + @app.get("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"]) async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): """Retry failed background jobs from all orgs""" if not user.is_superuser: From b0385439758728c9a35be561ff498310e9df6ffb Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 8 Oct 2024 12:42:22 -0400 Subject: [PATCH 09/12] Move background jobs to default namespace --- backend/btrixcloud/crawlmanager.py | 6 ++- chart/templates/configmap.yaml | 74 ------------------------------ chart/templates/role.yaml | 31 +++++++++++++ chart/templates/secrets.yaml | 13 ------ 4 files changed, 35 insertions(+), 89 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index e7dd594a47..e215522051 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -22,6 +22,8 @@ class CrawlManager(K8sAPI): def __init__(self): super().__init__() + self.default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") + self.loop = asyncio.get_running_loop() # pylint: disable=too-many-arguments @@ -106,7 +108,7 @@ async def run_replica_job( data = self.templates.env.get_template("replica_job.yaml").render(params) - await self.create_from_yaml(data) + await self.create_from_yaml(data, namespace=self.default_namespace) return job_id @@ -134,7 +136,7 @@ async def run_delete_org_job( data = self.templates.env.get_template("background_job.yaml").render(params) - await self.create_from_yaml(data) + await self.create_from_yaml(data, namespace=self.default_namespace) return job_id diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 9b9d703d3e..8e34a13401 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -77,80 +77,6 @@ data: BACKEND_IMAGE_PULL_POLICY: "{{ .Values.backend_pull_policy }}" ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: backend-env-config - namespace: {{ .Values.crawler_namespace }} - -data: - APP_ORIGIN: {{ .Values.ingress.tls | ternary "https" "http" }}://{{ or .Values.ingress.host ( print "localhost:" ( .Values.local_service_port | default 9870 )) }} - - CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} - - DEFAULT_NAMESPACE: {{ 
.Release.Namespace }} - - FRONTEND_ORIGIN: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }} - - CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local" - - DEFAULT_ORG: "{{ .Values.default_org }}" - - INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}" - - REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}" - - REGISTER_TO_ORG_ID: "{{ .Values.registration_org_id }}" - - ALLOW_DUPE_INVITES: "{{ .Values.allow_dupe_invites | default 0 }}" - - JWT_TOKEN_LIFETIME_MINUTES: "{{ .Values.jwt_token_lifetime_minutes | default 60 }}" - - DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}" - - DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}" - - DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}" - - MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}" - - IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}" - - RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}" - - PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}" - - FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}" - - MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}" - - LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}" - - IS_LOCAL_MINIO: "{{ .Values.minio_local }}" - - STORAGES_JSON: "/ops-configs/storages.json" - - CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json" - - MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}" - - MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}" - - ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}" - - BILLING_ENABLED: "{{ .Values.billing_enabled }}" - - SIGN_UP_URL: "{{ .Values.sign_up_url }}" - - SALES_EMAIL: "{{ .Values.sales_email }}" - - LOG_SENT_EMAILS: "{{ .Values.email.log_sent_emails }}" - - BACKEND_IMAGE: "{{ .Values.backend_image }}" - - BACKEND_IMAGE_PULL_POLICY: "{{ .Values.backend_pull_policy }}" - --- apiVersion: v1 kind: ConfigMap diff --git a/chart/templates/role.yaml b/chart/templates/role.yaml index 16f860734b..5e31ef1da5 100644 --- a/chart/templates/role.yaml +++ b/chart/templates/role.yaml @@ -21,6 +21,17 @@ rules: resources: ["pods"] verbs: ["list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: {{ .Release.Namespace }} + name: bg-job +rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"] + --- kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -40,3 +51,23 @@ roleRef: kind: Role name: crawler-run apiGroup: rbac.authorization.k8s.io + +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: bg-job-role + namespace: {{ .Release.Namespace }} +subjects: +- kind: ServiceAccount + name: default + namespace: {{ .Release.Namespace }} + +- kind: User + name: system:anonymous + namespace: {{ .Release.Namespace }} + +roleRef: + kind: Role + name: bg-job + apiGroup: rbac.authorization.k8s.io diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index d3a8a33855..4ee89e9f50 100644 --- a/chart/templates/secrets.yaml +++ b/chart/templates/secrets.yaml @@ -34,19 +34,6 @@ data: crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: ops-configs - namespace: {{ $.Values.crawler_namespace }} - -type: Opaque -data: - storages.json: {{ 
.Values.storages | toJson | b64enc | quote }} - crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }} - - {{- range $storage := .Values.storages }} --- apiVersion: v1 From e96f082f04a0fb811f961c809744e7355dc11ee5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 8 Oct 2024 12:46:27 -0400 Subject: [PATCH 10/12] Temporarily return replica jobs to crawlers namespace for storage secrets --- backend/btrixcloud/crawlmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index e215522051..b7e2d722bb 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -108,7 +108,7 @@ async def run_replica_job( data = self.templates.env.get_template("replica_job.yaml").render(params) - await self.create_from_yaml(data, namespace=self.default_namespace) + await self.create_from_yaml(data) return job_id From d601507798aac898060fae7171738ba0c821a8de Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 8 Oct 2024 13:05:55 -0400 Subject: [PATCH 11/12] Add type annotation for namespace --- backend/btrixcloud/crawlmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 2a2e2745c6..a3d2354535 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -17,7 +17,7 @@ # ============================================================================ DEFAULT_PROXY_ID: str = os.environ.get("DEFAULT_PROXY_ID", "") -DEFAULT_NAMESPACE = os.environ.get("DEFAULT_NAMESPACE", "default") +DEFAULT_NAMESPACE: str = os.environ.get("DEFAULT_NAMESPACE", "default") # ============================================================================ From be14e5447723989d0dfc66b6dcea9e664b77ecc1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 8 Oct 2024 13:19:34 -0400 Subject: [PATCH 12/12] Update docstring --- backend/btrixcloud/background_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index e776572d26..bee8621ba0 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -582,7 +582,7 @@ async def get_background_job( @app.get("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"]) async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): - """Retry failed background jobs from all orgs""" + """Get background job from any org""" if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed")
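
Usage note: a client drives the new delete flow in two steps, the same way
the test suite above does: DELETE the org to get back a background job id,
then poll the superuser-only job endpoint until the job finishes. Below is
a minimal sketch, not part of the patches, assuming a running deployment
and a superuser token; API_PREFIX, the token, and org_id are placeholders,
and success is assumed to be None while the job is still running, as the
tests expect:

    import time

    import requests

    API_PREFIX = "http://localhost:30870/api"  # placeholder deployment URL
    headers = {"Authorization": "Bearer <superuser-token>"}  # placeholder token
    org_id = "<org-uuid>"  # placeholder org UUID

    # Superuser-only: launches the delete-org background job and returns
    # {"deleted": true, "id": <job id>}
    r = requests.delete(f"{API_PREFIX}/orgs/{org_id}", headers=headers)
    job_id = r.json()["id"]

    # Poll the cross-org job endpoint added (and fixed to use GET) in this series
    for _ in range(18):
        r = requests.get(f"{API_PREFIX}/orgs/all/jobs/{job_id}", headers=headers)
        success = r.json()["success"]
        if success:  # job finished successfully
            break
        if success is False:  # job finished and failed
            raise RuntimeError("delete-org job failed")
        time.sleep(10)  # success is None: job still running

On the command change in patch 04: running main_bg.py as a module with
"python3 -m btrixcloud.main_bg" rather than as a script path is likely what
resolves the ModuleNotFound error noted in patch 01, since executing the
file directly breaks the package-relative imports at the top of the module.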