
Commit

fix: only calculate crawl status at the beginning and the end of the crawl
Pandalei97 committed Dec 14, 2023
1 parent b58e64b commit 055fff6
Showing 3 changed files with 3 additions and 18 deletions.
6 changes: 0 additions & 6 deletions app/celery_broker/metadata_utils.py
@@ -87,9 +87,6 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
-            crawls.update_status(
-                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-            )
             continue
         except Exception as e:
             logger.error(
@@ -102,8 +99,5 @@ def metadata_task(
                 task_name=metadata_type,
                 task=task,
             )
-            crawls.update_status(
-                crawl_id=crawl_process.id, status=ProcessStatus.PARTIAL_ERROR
-            )
             continue
     return handle_metadata_result(task, crawl_process, result, metadata_type)
8 changes: 1 addition & 7 deletions app/celery_broker/tasks.py
@@ -45,9 +45,6 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
     except Exception as e:
         logger.error(f"Error while crawling html files: {e}")
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.ERROR
-        )
         self.update_state(state='FAILURE')
         return crawl_process
     try:
@@ -57,9 +54,6 @@ def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
         logger.error(f"Error while uploading html files: {e}")
         # Html crawl will be considered failed if we can't upload the html files
         set_html_crawl_status(crawl, self.request.id, ProcessStatus.ERROR)
-        crawls.update_status(
-            crawl_id=crawl.id, status=ProcessStatus.ERROR
-        )
         self.update_state(state='FAILURE')
         return crawl_process
 
@@ -141,7 +135,7 @@ def finalize_crawl_process(self, crawl_process: Optional[CrawlProcess], crawl: C
         current_crawl.status = ProcessStatus.ERROR
 
     crawls.update_status(
-        crawl_id=crawl.id, status=current_crawl.status, final_status=True
+        crawl_id=crawl.id, status=current_crawl.status
    )
 
     websites.store_last_crawl(
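
Taken together, the tasks.py changes mean the crawl-level status is written only twice: once when the crawl is launched and once in finalize_crawl_process; failed sub-tasks now only mark their own task status. A minimal runnable sketch of that lifecycle, using stand-ins for illustration (the ProcessStatus values mirror the enum referenced in the diff, FakeCrawls and the crawl id are hypothetical, and the STARTED call site is assumed since it is not part of this diff):

from enum import Enum


class ProcessStatus(str, Enum):  # stand-in for the real enum used in the diff
    STARTED = "started"
    SUCCESS = "success"
    ERROR = "error"
    PARTIAL_ERROR = "partial_error"


class FakeCrawls:  # stand-in for the crawls repository in app/repositories/crawls.py
    def update_status(self, crawl_id: str, status: ProcessStatus):
        print(f"crawl {crawl_id} -> {status.value}")


crawls = FakeCrawls()

# 1) At the beginning of the crawl (call site assumed, not shown in this diff):
crawls.update_status(crawl_id="abc123", status=ProcessStatus.STARTED)

# 2) Individual task failures now only mark their own task
#    (e.g. set_html_crawl_status) and no longer touch the crawl status.

# 3) At the end, finalize_crawl_process computes the overall status once:
crawls.update_status(crawl_id="abc123", status=ProcessStatus.ERROR)
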
7 changes: 2 additions & 5 deletions app/repositories/crawls.py
@@ -63,14 +63,11 @@ def update(self, data: CrawlModel):
             },
         )
 
-    def update_status(self, crawl_id: str, status: ProcessStatus, final_status: bool = False):
+    def update_status(self, crawl_id: str, status: ProcessStatus):
         update_dict = {"status": status}
         if status == ProcessStatus.STARTED:
             update_dict["started_at"] = french_datetime()
-        elif status == ProcessStatus.SUCCESS:
-            update_dict["finished_at"] = french_datetime()
-        # In finalize task, we should update the finished_at field regardless of the status
-        if final_status:
+        else:
             update_dict["finished_at"] = french_datetime()
         self.collection.update_one(
             filter={"id": crawl_id},
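
For reference, after this change update_status should read roughly as follows (a sketch reassembled from the hunk above; the update argument passed to update_one is assumed, since the diff is cut off after the filter line):

def update_status(self, crawl_id: str, status: ProcessStatus):
    update_dict = {"status": status}
    if status == ProcessStatus.STARTED:
        # The crawl is starting: stamp its start time.
        update_dict["started_at"] = french_datetime()
    else:
        # Any other status is now only written at the end of the crawl,
        # so always stamp the finish time.
        update_dict["finished_at"] = french_datetime()
    self.collection.update_one(
        filter={"id": crawl_id},
        update={"$set": update_dict},  # assumed: the diff truncates here
    )

The final_status flag is no longer needed because intermediate tasks no longer write crawl-level statuses; any non-STARTED status reaching this method is, by construction, the final one.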
