Commit

Merge pull request #30 from dataesr/dev

fix import
folland87 authored Nov 16, 2023
2 parents 7aa597f + 2654b8d commit abf87fc
Showing 9 changed files with 44 additions and 48 deletions.
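In short, this merge swaps the indirect import pattern (import app.repositories as repositories, then repositories.crawls / repositories.files / repositories.websites) for direct imports of the repository singletons, and it initializes the Mongo database handle eagerly instead of through init_database(). The sketch below only illustrates the module-level singleton pattern the new imports rely on: the class body is a simplified placeholder, and the trailing crawls = CrawlsRepository() assignment is an assumption, since the instantiation itself is not part of this diff.

# Illustrative sketch, not code from this repository. The real CrawlsRepository
# lives in app/repositories/crawls.py and talks to MongoDB; here the collection
# is faked so the snippet runs on its own.

class CrawlsRepository:
    """Operations for crawls collection (placeholder body)."""

    def __init__(self) -> None:
        # Stand-in for db[settings.MONGO_CRAWLS_COLLECTION]
        self.collection: list[dict] = []

    def create(self, data: dict) -> str:
        self.collection.append(data)
        return str(data.get("id", ""))


# Assumed module-level instance; callers then write
# "from app.repositories.crawls import crawls" and use it directly.
crawls = CrawlsRepository()

Routers and Celery tasks import these instances (crawls, files, websites) directly, which is what the per-file diffs below show.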
16 changes: 9 additions & 7 deletions app/api/crawls_router.py
@@ -1,7 +1,9 @@
 from fastapi import HTTPException, APIRouter, status as statuscode
 from fastapi.responses import StreamingResponse
 
-import app.repositories as repositories
+from app.repositories.crawls import crawls
+from app.repositories.files import files
+from app.repositories.websites import websites
 from app.api.utils import create_crawl, start_crawl
 from app.models.crawl import CrawlModel, ListCrawlResponse
 
@@ -19,10 +21,10 @@
     summary="Start a new crawl for an existing website, using stored configuration.",
 )
 def crawl_website(website_id: str):
-    if website := repositories.websites.get(website_id):
+    if website := websites.get(website_id):
         crawl = create_crawl(website)
         start_crawl(crawl)
-        repositories.websites.refresh_next_crawl(crawl.website_id)
+        websites.refresh_next_crawl(crawl.website_id)
         return crawl
     else:
         raise HTTPException(
@@ -42,10 +44,10 @@ def list_crawls(
     skip: int = 0,
     limit: int = 10,
 ):
-    crawls = repositories.crawls.list(
+    crawls_list = crawls.list(
         website_id=website_id, skip=skip, limit=limit
     )
-    return crawls
+    return crawls_list
 
 
 @crawls_router.get(
@@ -55,7 +57,7 @@ def list_crawls(
 )
 def get_crawl_files(crawl_id: str) -> StreamingResponse:
     """Zip the files from the storage service"""
-    zip_io = repositories.files.zip_all_crawl_files(crawl_id)
+    zip_io = files.zip_all_crawl_files(crawl_id)
     return StreamingResponse(
         iter([zip_io.getvalue()]),
         media_type="application/x-zip-compressed",
@@ -72,4 +74,4 @@ def get_crawl_files(crawl_id: str) -> StreamingResponse:
 )
 def delete_crawl(crawl_id: str) -> None:
     """Zip the files from the storage service"""
-    return repositories.files.delete_all_crawl_files(crawl_id)
+    return files.delete_all_crawl_files(crawl_id)
5 changes: 2 additions & 3 deletions app/api/utils.py
@@ -2,13 +2,12 @@
 
 from celery import group, chain
 
-import app.repositories as repositories
+from app.repositories.crawls import crawls
 from app.celery_broker.tasks import (
     METADATA_TASK_REGISTRY,
     start_crawl_process,
 )
 from app.models.crawl import CrawlModel
-from app.models.enums import ProcessStatus
 from app.models.website import WebsiteModel
 from app.services.crawler_logger import logger
 
@@ -27,7 +26,7 @@ def create_crawl(website: WebsiteModel) -> CrawlModel:
         config=website.to_config(),
     )
     crawl.init_tasks()
-    repositories.crawls.create(crawl)
+    crawls.create(crawl)
    return crawl
 
 
16 changes: 8 additions & 8 deletions app/api/websites_router.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, HTTPException, status as statuscode
 from pymongo.errors import DuplicateKeyError
 
-import app.repositories as repositories
+from app.repositories.websites import websites
 from app.api.utils import create_crawl, start_crawl
 from app.models.request import UpdateWebsiteRequest, CreateWebsiteRequest
 from app.models.website import WebsiteModel, ListWebsiteResponse
@@ -23,7 +23,7 @@
 def create_website(data: CreateWebsiteRequest):
     website = data.to_website_model()
     try:
-        repositories.websites.create(website)
+        websites.create(website)
     except DuplicateKeyError as e:
         raise HTTPException(
             status_code=statuscode.HTTP_409_CONFLICT,
@@ -49,7 +49,7 @@ def list_websites(
     status: str | None = None,
     sort: str = "created_at",
 ):
-    return repositories.websites.list(
+    return websites.list(
         query=query, tags=tags, status=status, skip=skip, limit=limit, sort=sort
     )
 
@@ -61,7 +61,7 @@ def list_websites(
     summary="Get a single website by its unique ID",
 )
 def get_website(website_id: str):
-    if data := repositories.websites.get(website_id):
+    if data := websites.get(website_id):
         return data
     else:
         raise HTTPException(
@@ -77,7 +77,7 @@ def get_website(website_id: str):
 )
 def patch_website(website_id: str, data: UpdateWebsiteRequest) -> None:
     try:
-        repositories.websites.update(website_id, data)
+        websites.update(website_id, data)
     except AssertionError as e:
         raise HTTPException(
             status_code=statuscode.HTTP_404_NOT_FOUND,
@@ -91,7 +91,7 @@ def patch_website(website_id: str, data: UpdateWebsiteRequest) -> None:
     summary="Delete a website by its unique ID",
 )
 def delete_website(website_id: str):
-    repositories.websites.delete(website_id)
+    websites.delete(website_id)
 
 
 @websites_router.post(
@@ -100,8 +100,8 @@ def delete_website(website_id: str):
     summary="Recrawl websites with next_crawl_at date passed",
 )
 def recrawl_cron():
-    for website in repositories.websites.list_to_recrawl().data:
+    for website in websites.list_to_recrawl().data:
         crawl = create_crawl(website)
         start_crawl(crawl)
-        repositories.websites.refresh_next_crawl(crawl.website_id)
+        websites.refresh_next_crawl(crawl.website_id)
     return crawl
6 changes: 3 additions & 3 deletions app/celery_broker/crawler_utils.py
@@ -2,7 +2,7 @@
 from scrapy.utils.project import get_project_settings
 from scrapy.utils.python import without_none_values
 
-import app.repositories as repositories
+from app.repositories.crawls import crawls
 from app.crawler.spider import MenesrSpider
 from app.models.crawl import CrawlModel
 from app.models.enums import ProcessStatus
@@ -11,7 +11,7 @@
 
 def update_crawl_status(crawl: CrawlModel, status: ProcessStatus):
     crawl.update_status(status=status)
-    repositories.crawls.update(crawl)
+    crawls.update(crawl)
 
 
 def init_crawler_settings(crawl_process: CrawlProcess):
@@ -38,7 +38,7 @@ def set_html_crawl_status(crawl: CrawlModel, request_id: str, status: ProcessSta
     crawl.html_crawl.update(
         task_id=request_id, status=status
     )
-    repositories.crawls.update_task(
+    crawls.update_task(
         crawl_id=crawl.id,
         task_name="html_crawl",
         task=crawl.html_crawl,
15 changes: 8 additions & 7 deletions app/celery_broker/metadata_utils.py
@@ -1,6 +1,7 @@
 import json
 
-import app.repositories as repositories
+from app.repositories.crawls import crawls
+from app.repositories.files import files
 from app.models.enums import MetadataType, ProcessStatus
 from app.models.metadata import MetadataTask
 from app.models.process import CrawlProcess
@@ -19,7 +20,7 @@ def handle_metadata_result(
 ):
     if not result:
         task.update(status=ProcessStatus.ERROR)
-        repositories.crawls.update_task(
+        crawls.update_task(
             crawl_id=crawl_process.id,
             task_name=metadata_type,
             task=task,
@@ -29,7 +30,7 @@
     store_metadata_result(crawl_process, result, metadata_type)
     if task.status == ProcessStatus.STARTED:
         task.update(status=ProcessStatus.SUCCESS)
-        repositories.crawls.update_task(
+        crawls.update_task(
            crawl_id=crawl_process.id,
            task_name=metadata_type,
            task=task,
@@ -41,7 +42,7 @@
 def store_metadata_result(
     crawl_process: CrawlProcess, result: dict, metadata_type: MetadataType
 ):
-    return repositories.files.store_metadata_file(
+    return files.store_metadata_file(
         crawl_id=crawl_process.id,
         object_name=f"{metadata_type}.json",
         content_type='application/json',
@@ -60,7 +61,7 @@ def metadata_task(
     result = {}
     task.update(status=ProcessStatus.STARTED)
     logger.debug(f"{metadata_type} started!")
-    repositories.crawls.update_task(
+    crawls.update_task(
         crawl_id=crawl_process.id,
         task_name=metadata_type,
         task=task,
@@ -81,7 +82,7 @@
         )
         if task.status != ProcessStatus.PARTIAL_ERROR:
             task.update(status=ProcessStatus.PARTIAL_ERROR)
-            repositories.crawls.update_task(
+            crawls.update_task(
                 crawl_id=crawl_process.id,
                 task_name=metadata_type,
                 task=task,
@@ -93,7 +94,7 @@
         )
         if task.status != ProcessStatus.PARTIAL_ERROR:
             task.update(status=ProcessStatus.PARTIAL_ERROR)
-            repositories.crawls.update_task(
+            crawls.update_task(
                 crawl_id=crawl_process.id,
                 task_name=metadata_type,
                 task=task,
7 changes: 4 additions & 3 deletions app/celery_broker/tasks.py
@@ -5,7 +5,8 @@
 from multiprocessing import Process, Manager
 
 # Local imports
-import app.repositories as repositories
+from app.repositories.crawls import crawls
+from app.repositories.files import files
 from app.celery_broker.crawler_utils import start_crawler_process, set_html_crawl_status
 from app.celery_broker.main import celery_app
 from app.celery_broker.metadata_utils import metadata_task
@@ -30,7 +31,7 @@
 
 @celery_app.task(bind=True, name="crawl")
 def start_crawl_process(self, crawl: CrawlModel) -> CrawlProcess:
-    repositories.crawls.update_status(
+    crawls.update_status(
         crawl_id=crawl.id, status=ProcessStatus.STARTED
     )
     logger.debug("Html crawl started!")
@@ -130,7 +131,7 @@ def upload_html(crawl: CrawlModel):
     for file in crawl_files_path.rglob("*.[hj][ts][mo][ln]"):
         file_path = str(file)
         file_name = file_path.removeprefix(local_files_folder).lstrip('/')
-        repositories.files.store_html_file(
+        files.store_html_file(
             object_name=file_name,
             file_path=file_path,
             content_type=assume_content_type(file_path),
21 changes: 8 additions & 13 deletions app/mongo.py
@@ -3,17 +3,12 @@
 
 
 client = MongoClient(host=settings.MONGO_URI)
-db = None
 
-
-def init_database():
-    global db
-    if db is None:
-        db = client[settings.MONGO_DBNAME]
-        db[settings.MONGO_WEBSITES_COLLECTION].create_index(
-            [("id", 1)], unique=True
-        )
-        db[settings.MONGO_WEBSITES_COLLECTION].create_index(
-            [("url", 1)], unique=True
-        )
-        db[settings.MONGO_CRAWLS_COLLECTION].create_index([("id", 1)], unique=True)
+db = client[settings.MONGO_DBNAME]
+db[settings.MONGO_WEBSITES_COLLECTION].create_index(
+    [("id", 1)], unique=True
+)
+db[settings.MONGO_WEBSITES_COLLECTION].create_index(
+    [("url", 1)], unique=True
+)
+db[settings.MONGO_CRAWLS_COLLECTION].create_index([("id", 1)], unique=True)
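A plausible motivation for this mongo.py change, read together with the import changes above: with the old lazy pattern, a module that ran "from app.mongo import db" copied the binding db = None at import time, and the later rebinding performed inside init_database() was invisible to that module. Binding db = client[settings.MONGO_DBNAME] at import time means every importer gets the real Database object from the start, and MongoDB's create_index is idempotent, so running it on every startup is harmless. The snippet below is a minimal, self-contained illustration of that from-import binding behaviour; it uses a stand-in namespace, not the real app.mongo module.

from types import SimpleNamespace

# Hypothetical stand-in for the old, lazily initialized app/mongo.py.
mongo = SimpleNamespace(db=None)

def init_database():
    # Mirrors the old "global db" rebinding inside app/mongo.py.
    mongo.db = {"crawls": "real-collection-handle"}

# "from app.mongo import db" copies whatever db points to at import time:
db = mongo.db        # copies None
init_database()      # rebinds mongo.db, but the local name db still points to None
print(db)            # -> None
print(mongo.db)      # -> {'crawls': 'real-collection-handle'}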
3 changes: 1 addition & 2 deletions app/repositories/crawls.py
@@ -5,14 +5,13 @@
 from app.models.crawl import CrawlModel, ListCrawlResponse
 from app.models.enums import ProcessStatus
 from app.models.metadata import MetadataTask
-from app.mongo import db, init_database
+from app.mongo import db
 
 
 class CrawlsRepository:
     """Operations for crawls collection"""
 
     def __init__(self):
-        init_database()
         self.collection = db[settings.MONGO_CRAWLS_COLLECTION]
 
     def create(self, data: CrawlModel) -> str:
3 changes: 1 addition & 2 deletions app/repositories/websites.py
@@ -7,14 +7,13 @@
 from app.models.enums import ProcessStatus
 from app.models.request import UpdateWebsiteRequest
 from app.models.website import WebsiteModel, ListWebsiteResponse
-from app.mongo import db, init_database
+from app.mongo import db
 
 
 class WebsitesRepository:
     """Operations for websites collection"""
 
     def __init__(self):
-        init_database()
         self.collection = db[settings.MONGO_WEBSITES_COLLECTION]
 
     def list(
