Compute crawl execution time in operator (#1256)
* store execution time in operator:
- rename isNewCrash -> isNewExit, crashTime -> exitTime
- keep track of exitCode
- add execTime counter, incremented when a pod's status has both 'startedAt' and 'finishedAt' set (see the sketch after this list)
- ensure pods are complete before deleting
- store 'crawlExecSeconds' on crawl and org levels, add to Crawl, CrawlOut, Organization models
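
A minimal sketch of the execTime increment described above (not the operator's actual code; the helper names and the shape of the 'terminated' status dict are illustrative):

    from datetime import datetime


    def parse_k8s_time(value: str) -> datetime:
        """Parse a Kubernetes RFC3339 timestamp like '2023-10-10T01:02:03Z'."""
        return datetime.fromisoformat(value.replace("Z", "+00:00"))


    def increment_exec_time(exec_time: int, terminated: dict) -> int:
        """Add a terminated container's runtime to the running execTime counter.

        'terminated' stands in for a pod's containerStatuses[].state.terminated
        dict, which carries 'startedAt' and 'finishedAt' once the container exits.
        """
        started_at = terminated.get("startedAt")
        finished_at = terminated.get("finishedAt")
        if not started_at or not finished_at:
            # a missing 'finishedAt' is logged rather than counted (see below)
            return exec_time
        duration = parse_k8s_time(finished_at) - parse_k8s_time(started_at)
        return exec_time + int(duration.total_seconds())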

* support for fast cancel:
- set redis ':canceled' key to immediately cancel the crawl (sketched after this list)
- delete crawl pods to ensure each pod exits immediately
- in finalizer, don't wait for pods to complete when canceling (but still check if they terminated)
- for pods still running, count the time from pod.status.running.startedAt up to currentTime for all existing pods
- logging: log exec time and any missing finishedAt
- logging: don't log exit code 11 (interrupt due to time/size limits) as a crash
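
A rough sketch of the fast-cancel flow above, assuming redis-py's asyncio client and kubernetes_asyncio (the exact key format and the 'crawl' label selector are illustrative, not taken from this diff):

    from redis import asyncio as aioredis
    from kubernetes_asyncio import client


    async def fast_cancel_crawl(redis_url: str, crawl_id: str, namespace: str) -> None:
        # set the ':canceled' key so the crawler stops as soon as it sees it
        redis = aioredis.from_url(redis_url)
        await redis.set(f"{crawl_id}:canceled", "1")

        # delete the crawl pods so they exit immediately instead of
        # finishing their current work
        core_api = client.CoreV1Api()
        await core_api.delete_collection_namespaced_pod(
            namespace=namespace,
            label_selector=f"crawl={crawl_id}",
            grace_period_seconds=0,
        )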

* don't wait for pods to complete on failed crawls when running with an existing browsertrix-crawler image

---------
Co-authored-by: Tessa Walsh <[email protected]>
ikreymer authored Oct 10, 2023
1 parent 748c867 commit 5cad9ac
Showing 4 changed files with 175 additions and 36 deletions.
7 changes: 7 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -521,6 +521,13 @@ async def update_running_crawl_stats(self, crawl_id, stats):
         query = {"_id": crawl_id, "type": "crawl", "state": "running"}
         return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}})
 
+    async def store_exec_time(self, crawl_id, exec_time):
+        """set exec time, only if not already set"""
+        query = {"_id": crawl_id, "type": "crawl", "execTime": {"$in": [0, None]}}
+        return await self.crawls.find_one_and_update(
+            query, {"$set": {"execTime": exec_time}}
+        )
+
     async def get_crawl_state(self, crawl_id):
         """return current crawl state of a crawl"""
         res = await self.crawls.find_one(
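
Note on store_exec_time above: the {"$in": [0, None]} guard makes the write first-wins, so repeated operator syncs cannot overwrite an already-stored value. A hypothetical call site (the crawl_ops handle and the literal value are illustrative, not from this diff):

    # only the first call for a finished crawl takes effect; later calls
    # find execTime already set and the query matches nothing
    await crawl_ops.store_exec_time(crawl_id, exec_time=3600)
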
7 changes: 7 additions & 0 deletions backend/btrixcloud/models.py
@@ -374,6 +374,8 @@ class CrawlOut(BaseMongoModel):
 
     collectionIds: Optional[List[UUID4]] = []
 
+    crawlExecSeconds: int = 0
+
     # automated crawl fields
     config: Optional[RawCrawlConfig]
     cid: Optional[UUID4]
@@ -441,6 +443,8 @@ class Crawl(BaseCrawl, CrawlConfigCore):
 
     stopping: Optional[bool] = False
 
+    crawlExecSeconds: int = 0
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -666,6 +670,7 @@ class Organization(BaseMongoModel):
     storage: Union[S3Storage, DefaultStorage]
 
     usage: Dict[str, int] = {}
+    crawlExecSeconds: Dict[str, int] = {}
 
     bytesStored: int = 0
     bytesStoredCrawls: int = 0
@@ -713,6 +718,7 @@ async def serialize_for_user(self, user: User, user_manager):
 
         if not self.is_crawler(user):
             exclude.add("usage")
+            exclude.add("crawlExecSeconds")
 
         result = self.to_dict(
             exclude_unset=True,
@@ -747,6 +753,7 @@ class OrgOut(BaseMongoModel):
     name: str
     users: Optional[Dict[str, Any]]
     usage: Optional[Dict[str, int]]
+    crawlExecSeconds: Optional[Dict[str, int]]
     default: bool = False
     bytesStored: int
     bytesStoredCrawls: int
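
The org-level crawlExecSeconds mirrors the existing usage dict. A sketch of how the per-org counter might be incremented, assuming (as with usage) keys are "YYYY-MM" month strings — the helper name and update are illustrative, not from this diff:

    from datetime import datetime, timezone


    async def inc_org_crawl_exec_time(orgs, oid, exec_time: int) -> None:
        """Add exec_time seconds to the org's counter for the current month."""
        month_key = datetime.now(timezone.utc).strftime("%Y-%m")
        await orgs.find_one_and_update(
            {"_id": oid},
            {"$inc": {f"crawlExecSeconds.{month_key}": exec_time}},
        )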