Update the global status of the crawl task
Pandalei97 committed Dec 1, 2023
1 parent 3a12296 commit 4f11e10
Showing 2 changed files with 22 additions and 1 deletion.
app/celery_broker/metadata_utils.py (6 additions, 0 deletions)
@@ -87,6 +87,9 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
+            crawls.update_status(
+                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
+            )
             continue
         except Exception as e:
             logger.error(
@@ -99,5 +102,8 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
+            crawls.update_status(
+                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
+            )
             continue
     return handle_metadata_result(task, crawl_process, result, metadata_type)
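
Both hunks call into a crawls repository and a ProcessStatus enum that the module imports elsewhere; their definitions are not part of this commit. A minimal sketch of the interface these calls assume (class name, member values, and persistence details are guesses, not confirmed by the diff):

from enum import Enum


class ProcessStatus(str, Enum):
    # Members inferred from the statuses referenced in this commit.
    STARTED = "started"
    SUCCESS = "success"
    PARTIAL_ERROR = "partial_error"
    ERROR = "error"


class CrawlsRepository:
    """Hypothetical shape of the crawls repository used above."""

    def update_status(self, crawl_id: str, status: ProcessStatus) -> None:
        # Assumption: persists the crawl-level status in the backing store.
        ...

    def get(self, crawl_id: str):
        # Assumption: returns the stored crawl as a Pydantic model;
        # finalize_crawl_process calls .model_dump() on the result.
        ...


crawls = CrawlsRepository()

Marking the crawl PARTIAL_ERROR here, rather than ERROR, fits the surrounding flow: a single failed metadata task does not abort the loop (each handler ends with continue), so the crawl as a whole is degraded but not dead.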
app/celery_broker/tasks.py (16 additions, 1 deletion)
@@ -54,6 +54,9 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
+        crawls.update_status(
+            crawl_id=crawl.id, status=ProcessStatus.ERROR
+        )
         self.update_state(state='FAILURE')
         return crawl_process
     try:
@@ -63,6 +66,9 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
         logger.error(f"Error while uploading html files: {e}")
         # Html crawl will be considered failed if we can't upload the html files
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
+        crawls.update_status(
+            crawl_id=crawl.id, status=ProcessStatus.ERROR
+        )
         self.update_state(state='FAILURE')
         return crawl_process
 
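Both hunks in start_crawl_process follow the same pattern: record the failure on the HTML sub-task and on the whole crawl, mark the Celery task state FAILURE by hand, and return instead of raising, presumably so downstream tasks such as finalize_crawl_process still run. A condensed sketch of that pattern, reusing the names sketched above plus a hypothetical do_work step:

import logging

from celery import Celery

logger = logging.getLogger(__name__)
app = Celery("crawler", broker="redis://localhost:6379/0")  # assumed wiring


def do_work(crawl_id: str) -> None:
    """Hypothetical stand-in for the actual crawl step."""


@app.task(bind=True)
def crawl_step(self, crawl_id: str):
    try:
        do_work(crawl_id)
    except Exception as e:
        logger.error(f"Error while crawling: {e}")
        # Persist the failure, flag this Celery task as failed by hand,
        # and return normally so the rest of the workflow still executes.
        crawls.update_status(crawl_id=crawl_id, status=ProcessStatus.ERROR)
        self.update_state(state="FAILURE")
        return None

If these tasks run in a Celery chain, an unhandled exception would stop the chain before finalize_crawl_process could compute the global status; catching and returning avoids that.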
@@ -122,11 +128,20 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
         f"Crawl process ({crawl.id}) for website {crawl.config.url} ended"
     )
 
+    # Retrieve the current status of the crawl
+    current_crawl = crawls.get(crawl_id=crawl.id)
+
+    if current_crawl.status == ProcessStatus.STARTED:
+        crawls.update_status(
+            crawl_id=crawl.id, status=ProcessStatus.SUCCESS
+        )
+
     websites.store_last_crawl(
         website_id=crawl.website_id,
+        crawl=crawls.get(crawl_id=crawl.id).model_dump(),
     )
 
+
     # This task will always succeed, since it retrieves the last crawl
     self.update_state(state='SUCCESS')
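
The STARTED check makes the global status sticky: a crawl that reaches finalization still marked STARTED had no recorded failure and is promoted to SUCCESS, while an ERROR or PARTIAL_ERROR written by any earlier task is preserved. The rule, restated as a hypothetical helper (not part of the commit), using the same assumed enum as above:

def resolve_global_status(current: ProcessStatus) -> ProcessStatus:
    # Promote only a clean run; keep any failure recorded earlier.
    if current is ProcessStatus.STARTED:
        return ProcessStatus.SUCCESS
    return current


# For example:
assert resolve_global_status(ProcessStatus.STARTED) is ProcessStatus.SUCCESS
assert resolve_global_status(ProcessStatus.ERROR) is ProcessStatus.ERROR

Re-reading the crawl with crawls.get before store_last_crawl means the snapshot stored on the website reflects this final status rather than a possibly stale in-memory crawl object.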

