diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index a585c93..65e787f 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -20,6 +20,9 @@ jobs: steps: - uses: actions/checkout@v3 + - name : Install Packages + run : pip install -r requirements.txt + - name: test run: python -m unittest diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..6e2a7c3 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,17 @@ +name: Testing deployment + +on: + push: + +jobs: + unit-test: + name: run unit tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name : Install Packages + run : pip install -r requirements.txt + + - name: test + run: python -m unittest \ No newline at end of file diff --git a/README.md b/README.md index 961db5e..0594287 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,7 @@ This endpoint allows you to create a new website configuration end execute a cra | `depth` | `integer` | Maximum depth to crawl (**Default**: 2) | | `limit` | `integer` | Maximum pages to crawl (**Default**: 400) | | `headers` | `dict[str, str]` | Headers that will be passed to all crawl requests (**Default**: {})| -| `accessibility` | `MetadataConfig` | Accessibility configuration (**Default**: {'enabled':True, 'depth' 0}) | -| `good_practices` | `MetadataConfig` | Good Practices configuration (**Default**: {'enabled': False}) | +| `lighthouse` | `MetadataConfig` | Lighthouse configuration (**Default**: {'enabled':True, 'depth': 0}) | | `technologies` | `MetadataConfig` | Technologies configuration (**Default**: {'enabled': False}) | | `responsiveness` | `MetadataConfig` | Responsiveness configuration (**Default**: {'enabled': False}) | | `carbon_footprint` | `MetadataConfig` | Carbon Footprint configuration (**Default**: {'enabled': False}) | diff --git a/app/celery_broker/config.py b/app/celery_broker/config.py index 7c96683..1dd3105 100644 --- 
a/app/celery_broker/config.py +++ b/app/celery_broker/config.py @@ -6,9 +6,8 @@ class BaseConfig: CRAWL_QUEUE_NAME = "crawl_queue" - ACCESSIBILITY_QUEUE_NAME = "accessibility_queue" + LIGHTHOUSE_QUEUE_NAME = "lighthouse_queue" TECHNOLOGIES_QUEUE_NAME = "technologies_queue" - GOOD_PRACTICES_QUEUE_NAME = "good_practices_queue" RESPONSIVENESS_QUEUE_NAME = "responsiveness_queue" CARBON_QUEUE_NAME = "carbon_footprint_queue" UPLOAD_QUEUE_NAME = "upload_queue" @@ -29,20 +28,15 @@ class BaseConfig: routing_key=CRAWL_QUEUE_NAME, ), Queue( - ACCESSIBILITY_QUEUE_NAME, - Exchange(ACCESSIBILITY_QUEUE_NAME), - routing_key=ACCESSIBILITY_QUEUE_NAME, + LIGHTHOUSE_QUEUE_NAME, + Exchange(LIGHTHOUSE_QUEUE_NAME), + routing_key=LIGHTHOUSE_QUEUE_NAME, ), Queue( TECHNOLOGIES_QUEUE_NAME, Exchange(TECHNOLOGIES_QUEUE_NAME), routing_key=TECHNOLOGIES_QUEUE_NAME, ), - Queue( - GOOD_PRACTICES_QUEUE_NAME, - Exchange(GOOD_PRACTICES_QUEUE_NAME), - routing_key=GOOD_PRACTICES_QUEUE_NAME, - ), Queue( RESPONSIVENESS_QUEUE_NAME, Exchange(RESPONSIVENESS_QUEUE_NAME), @@ -62,18 +56,14 @@ class BaseConfig: task_routes = { "crawl": {"queue": CRAWL_QUEUE_NAME, "routing_key": CRAWL_QUEUE_NAME}, - "get_accessibility": { - "queue": ACCESSIBILITY_QUEUE_NAME, - "routing_key": ACCESSIBILITY_QUEUE_NAME, + "get_lighthouse": { + "queue": LIGHTHOUSE_QUEUE_NAME, + "routing_key": LIGHTHOUSE_QUEUE_NAME, }, "get_technologies": { "queue": TECHNOLOGIES_QUEUE_NAME, "routing_key": TECHNOLOGIES_QUEUE_NAME, }, - "get_good_practices": { - "queue": GOOD_PRACTICES_QUEUE_NAME, - "routing_key": GOOD_PRACTICES_QUEUE_NAME, - }, "get_responsiveness": { "queue": RESPONSIVENESS_QUEUE_NAME, "routing_key": RESPONSIVENESS_QUEUE_NAME, diff --git a/app/celery_broker/metadata_utils.py b/app/celery_broker/metadata_utils.py index 197c45a..57d8b8b 100644 --- a/app/celery_broker/metadata_utils.py +++ b/app/celery_broker/metadata_utils.py @@ -6,10 +6,7 @@ from app.models.enums import MetadataType, ProcessStatus from app.models.metadata import 
@celery_app.task(bind=True, name="get_lighthouse")
def get_lighthouse(self, crawl_process: CrawlProcess):
    """Celery task that gathers Lighthouse metadata for a crawl process.

    Builds a ``MetadataTask`` tagged with this task's request id and hands
    it, together with a fresh ``LighthouseCalculator``, to ``metadata_task``,
    whose return value is returned unchanged.
    """
    current_task = MetadataTask(task_id=self.request.id)
    lighthouse_calculator = LighthouseCalculator()
    return metadata_task(
        task=current_task,
        crawl_process=crawl_process,
        metadata_type=MetadataType.LIGHTHOUSE,
        calculator=lighthouse_calculator,
        method_name="get_lighthouse",
    )
-@celery_app.task(bind=True, name="get_good_practices") -def get_good_practices(self, crawl_process: CrawlProcess): - return metadata_task( - task=MetadataTask(task_id=self.request.id), - crawl_process=crawl_process, - metadata_type=MetadataType.GOOD_PRACTICES, - calculator=LighthouseWrapper(), - method_name="get_best_practices", - ) - - @celery_app.task(bind=True, name="get_responsiveness") def get_responsiveness(self, crawl_process: CrawlProcess): return metadata_task( @@ -133,9 +122,8 @@ def get_carbon_footprint(self, crawl_process: CrawlProcess): METADATA_TASK_REGISTRY = { - MetadataType.ACCESSIBILITY: get_accessibility, + MetadataType.LIGHTHOUSE: get_lighthouse, MetadataType.TECHNOLOGIES: get_technologies, - MetadataType.GOOD_PRACTICES: get_good_practices, MetadataType.RESPONSIVENESS: get_responsiveness, MetadataType.CARBON_FOOTPRINT: get_carbon_footprint, } diff --git a/app/models/crawl.py b/app/models/crawl.py index 362e400..2836907 100644 --- a/app/models/crawl.py +++ b/app/models/crawl.py @@ -5,7 +5,7 @@ from app.celery_broker.utils import french_datetime from app.models.enums import MetadataType, ProcessStatus -from app.models.metadata import MetadataConfig, AccessibilityModel, MetadataTask +from app.models.metadata import MetadataConfig, LighthouseModel, MetadataTask from app.models.utils import get_uuid, BaseTaskModel @@ -31,10 +31,9 @@ class CrawlModel(BaseModel): finished_at: datetime | None = None status: ProcessStatus = ProcessStatus.PENDING html_crawl: BaseTaskModel = Field(default_factory=BaseTaskModel) - accessibility: AccessibilityModel | None = None + lighthouse: LighthouseModel | None = None technologies_and_trackers: MetadataTask | None = None responsiveness: MetadataTask | None = None - good_practices: MetadataTask | None = None carbon_footprint: MetadataTask | None = None uploads: BaseTaskModel = Field(default_factory=BaseTaskModel) @@ -47,14 +46,12 @@ def enabled_metadata(self) -> list[MetadataType]: ] def init_tasks(self) -> None: - if 
MetadataType.ACCESSIBILITY in self.enabled_metadata: - self.accessibility = AccessibilityModel() + if MetadataType.LIGHTHOUSE in self.enabled_metadata: + self.lighthouse = LighthouseModel() if MetadataType.TECHNOLOGIES in self.enabled_metadata: self.technologies_and_trackers = MetadataTask() if MetadataType.RESPONSIVENESS in self.enabled_metadata: self.responsiveness = MetadataTask() - if MetadataType.GOOD_PRACTICES in self.enabled_metadata: - self.good_practices = MetadataTask() if MetadataType.CARBON_FOOTPRINT in self.enabled_metadata: self.carbon_footprint = MetadataTask() diff --git a/app/models/enums.py b/app/models/enums.py index 3eb22c9..0b25960 100644 --- a/app/models/enums.py +++ b/app/models/enums.py @@ -2,10 +2,9 @@ class MetadataType(StrEnum): - ACCESSIBILITY = "accessibility" + LIGHTHOUSE = "lighthouse" TECHNOLOGIES = "technologies_and_trackers" RESPONSIVENESS = "responsiveness" - GOOD_PRACTICES = "good_practices" CARBON_FOOTPRINT = "carbon_footprint" diff --git a/app/models/metadata.py b/app/models/metadata.py index 3e6d3d7..89c9ff9 100644 --- a/app/models/metadata.py +++ b/app/models/metadata.py @@ -14,5 +14,5 @@ class MetadataTask(BaseTaskModel): pass -class AccessibilityModel(MetadataTask): +class LighthouseModel(MetadataTask): score: float | None = None diff --git a/app/models/request.py b/app/models/request.py index 15cd119..718ba4d 100644 --- a/app/models/request.py +++ b/app/models/request.py @@ -11,10 +11,9 @@ class UpdateWebsiteRequest(BaseModel): depth: int | None = None limit: int | None = None - accessibility: MetadataConfig | None = None + lighthouse: MetadataConfig | None = None technologies_and_trackers: MetadataConfig | None = None responsiveness: MetadataConfig | None = None - good_practices: MetadataConfig | None = None carbon_footprint: MetadataConfig | None = None headers: dict[str, Any] | None = None tags: list[str] | None = None @@ -26,16 +25,13 @@ class CreateWebsiteRequest(BaseModel): url: str depth: int = Field(ge=0, 
default=2) limit: int = Field(ge=0, default=400) - accessibility: MetadataConfig = Field(default=MetadataConfig()) + lighthouse: MetadataConfig = Field(default=MetadataConfig()) technologies_and_trackers: MetadataConfig = Field( default=MetadataConfig(enabled=False) ) responsiveness: MetadataConfig = Field( default=MetadataConfig(enabled=False) ) - good_practices: MetadataConfig = Field( - default=MetadataConfig(enabled=False) - ) carbon_footprint: MetadataConfig = Field( default=MetadataConfig(enabled=False) ) diff --git a/app/models/website.py b/app/models/website.py index 943d2fd..aed9616 100644 --- a/app/models/website.py +++ b/app/models/website.py @@ -18,10 +18,9 @@ class WebsiteModel(BaseModel): url: str depth: int limit: int - accessibility: MetadataConfig + lighthouse: MetadataConfig technologies_and_trackers: MetadataConfig responsiveness: MetadataConfig - good_practices: MetadataConfig carbon_footprint: MetadataConfig headers: dict[str, Any] created_at: datetime = Field(default_factory=french_datetime) @@ -36,10 +35,9 @@ def to_config(self) -> CrawlConfig: url=self.url, parameters=CrawlParameters(depth=self.depth, limit=self.limit), metadata_config={ - MetadataType.ACCESSIBILITY: self.accessibility, + MetadataType.LIGHTHOUSE: self.lighthouse, MetadataType.TECHNOLOGIES: self.technologies_and_trackers, MetadataType.RESPONSIVENESS: self.responsiveness, - MetadataType.GOOD_PRACTICES: self.good_practices, MetadataType.CARBON_FOOTPRINT: self.carbon_footprint, }, headers=self.headers, diff --git a/app/services/accessibility_best_practices_calculator.py b/app/services/accessibility_best_practices_calculator.py deleted file mode 100644 index 6676da3..0000000 --- a/app/services/accessibility_best_practices_calculator.py +++ /dev/null @@ -1,67 +0,0 @@ -import json -import subprocess -from enum import StrEnum -from typing import Any - - -class LighthouseCategories(StrEnum): - ACCESSIBILITY = "accessibility" - BEST_PRACTICES = "best-practices" - - -class 
class LighthouseError(Exception):
    """Raised when a Lighthouse audit cannot be run or its report cannot be parsed."""


class LighthouseCalculator:
    """Run the ``lighthouse`` command-line tool and return its JSON report."""

    def get_lighthouse(self, url: str) -> dict[str, Any]:
        """Audit *url* with the Lighthouse CLI and return the parsed report.

        Args:
            url: Address of the page to audit. It is passed to the CLI as a
                single argument and is never interpreted by a shell.

        Returns:
            The Lighthouse JSON report decoded from the process's stdout.

        Raises:
            LighthouseError: If the executable cannot be started or its
                output is not valid JSON (for example, empty output after a
                failed run).
        """
        # An argument list (shell=False) is used on purpose: the URL comes
        # from crawled/user-provided data, and joining it into a shell string
        # would let metacharacters such as ``;`` or ``$()`` run arbitrary
        # commands on the worker.
        command = [
            "lighthouse",
            url,
            # One argv entry; the inner quotes of the former shell string
            # are not needed without a shell.
            "--chrome-flags=--no-sandbox --headless --disable-dev-shm-usage",
            "--output=json",
            "--disable-full-page-screenshot",
            "--no-enable-error-reporting",
            "--quiet",
        ]
        try:
            lighthouse_process = subprocess.run(command, stdout=subprocess.PIPE)
            result = json.loads(lighthouse_process.stdout)
        except Exception as e:
            # Deliberately broad: callers only handle LighthouseError, so
            # every failure mode (missing binary, bad JSON, ...) is funneled
            # into it with the original exception chained as the cause.
            raise LighthouseError(f"Lighthouse audit failed for {url!r}") from e
        return result
- + -