Update the global status of the crawl task
Pandalei97 committed Dec 1, 2023
1 parent 3a12296 commit 4f11e10
Showing 2 changed files with 22 additions and 1 deletion.
app/celery_broker/metadata_utils.py (6 additions, 0 deletions)
@@ -87,6 +87,9 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
+            crawls.update_status(
+                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
+            )
             continue
         except Exception as e:
             logger.error(
@@ -99,5 +102,8 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
+            crawls.update_status(
+                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
+            )
             continue
     return handle_metadata_result(task, crawl_process, result, metadata_type)
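
Both hunks call into a crawls repository and a ProcessStatus enum that the module imports elsewhere; their definitions are not part of this commit. A minimal sketch of the interface these calls assume (class name, member values, and persistence details are guesses, not confirmed by the diff):

from enum import Enum


class ProcessStatus(str, Enum):
    # Members inferred from the statuses referenced in this commit.
    STARTED = "started"
    SUCCESS = "success"
    PARTIAL_ERROR = "partial_error"
    ERROR = "error"


class CrawlsRepository:
    """Hypothetical shape of the crawls repository used above."""

    def update_status(self, crawl_id: str, status: ProcessStatus) -> None:
        # Assumption: persists the crawl-level status in the backing store.
        ...

    def get(self, crawl_id: str):
        # Assumption: returns the stored crawl as a Pydantic model;
        # finalize_crawl_process calls .model_dump() on the result.
        ...


crawls = CrawlsRepository()

Marking the crawl PARTIAL_ERROR here, rather than ERROR, fits the surrounding flow: a single failed metadata task does not abort the loop (each handler ends with continue), so the crawl as a whole is degraded but not dead.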
app/celery_broker/tasks.py (16 additions, 1 deletion)
@@ -54,6 +54,9 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
+        crawls.update_status(
+            crawl_id=crawl.id, status=ProcessStatus.ERROR
+        )
         self.update_state(state='FAILURE')
         return crawl_process
     try:
@@ -63,6 +66,9 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
         logger.error(f"Error while uploading html files: {e}")
         # Html crawl will be considered failed if we can't upload the html files
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
+        crawls.update_status(
+            crawl_id=crawl.id, status=ProcessStatus.ERROR
+        )
         self.update_state(state='FAILURE')
         return crawl_process
 
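Both hunks in start_crawl_process follow the same pattern: record the failure on the HTML sub-task and on the whole crawl, mark the Celery task state FAILURE by hand, and return instead of raising, presumably so downstream tasks such as finalize_crawl_process still run. A condensed sketch of that pattern, reusing the names sketched above plus a hypothetical do_work step:

import logging

from celery import Celery

logger = logging.getLogger(__name__)
app = Celery("crawler", broker="redis://localhost:6379/0")  # assumed wiring


def do_work(crawl_id: str) -> None:
    """Hypothetical stand-in for the actual crawl step."""


@app.task(bind=True)
def crawl_step(self, crawl_id: str):
    try:
        do_work(crawl_id)
    except Exception as e:
        logger.error(f"Error while crawling: {e}")
        # Persist the failure, flag this Celery task as failed by hand,
        # and return normally so the rest of the workflow still executes.
        crawls.update_status(crawl_id=crawl_id, status=ProcessStatus.ERROR)
        self.update_state(state="FAILURE")
        return None

If these tasks run in a Celery chain, an unhandled exception would stop the chain before finalize_crawl_process could compute the global status; catching and returning avoids that.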
@@ -122,11 +128,20 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
         f"Crawl process ({crawl.id}) for website {crawl.config.url} ended"
     )
 
+    # Retrieve the current status of the crawl
+    current_crawl = crawls.get(crawl_id=crawl.id)
+
+    if current_crawl.status == ProcessStatus.STARTED:
+        crawls.update_status(
+            crawl_id=crawl.id, status=ProcessStatus.SUCCESS
+        )
+
     websites.store_last_crawl(
         website_id=crawl.website_id,
+        crawl=crawls.get(crawl_id=crawl.id).model_dump(),
     )
 
+
     # This task will always succeed, since it retrieves the last crawl
     self.update_state(state='SUCCESS')
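
The STARTED check makes the global status sticky: a crawl that reaches finalization still marked STARTED had no recorded failure and is promoted to SUCCESS, while an ERROR or PARTIAL_ERROR written by any earlier task is preserved. The rule, restated as a hypothetical helper (not part of the commit), using the same assumed enum as above:

def resolve_global_status(current: ProcessStatus) -> ProcessStatus:
    # Promote only a clean run; keep any failure recorded earlier.
    if current is ProcessStatus.STARTED:
        return ProcessStatus.SUCCESS
    return current


# For example:
assert resolve_global_status(ProcessStatus.STARTED) is ProcessStatus.SUCCESS
assert resolve_global_status(ProcessStatus.ERROR) is ProcessStatus.ERROR

Re-reading the crawl with crawls.get before store_last_crawl means the snapshot stored on the website reflects this final status rather than a possibly stale in-memory crawl object.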

