Commit
remove pydantic and serialisation lock
sbusso committed Mar 24, 2024
1 parent e38d22b commit 7465ca4
Showing 11 changed files with 179 additions and 50 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -163,3 +163,4 @@ cython_debug/
 #.idea/
 spidy.py
 output.json
+cache
23 changes: 22 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -18,6 +18,7 @@ parsel = "^1.9.0"
 courlan = "^1.0.0"
 trafilatura = "^1.8.0"
 httpx = "^0.27.0"
+hishel = "^0.0.24"

 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
57 changes: 57 additions & 0 deletions scrapework/cache.py
@@ -0,0 +1,54 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import hishel
+
+from scrapework.middleware import Middleware
+from scrapework.request import HTTPClient, Request
+
+
+class HishelClient(HTTPClient):
+    @classmethod
+    def build_client(cls, **kwargs) -> hishel.CacheClient:
+        return hishel.CacheClient(**kwargs)
+
+
+class CacheMiddleware(Middleware):
+    controller: Optional[hishel.Controller] = None
+    storage: Optional[hishel.FileStorage] = None
+    cache_dir: Optional[str] = None
+
+    def __init__(self, cache_dir: str, ttl: int = 3600):
+        super().__init__()
+        self.controller = hishel.Controller(
+            # Cache only GET and POST methods
+            cacheable_methods=["GET", "POST"],
+            # Cache only 200 status codes
+            cacheable_status_codes=[200],
+            # Use the stale response if there is a connection issue and the new response cannot be obtained.
+            allow_stale=True,
+            # First, revalidate the response and then utilize it.
+            # If the response has not changed, do not download the
+            # entire response data from the server; instead,
+            # use the one you have because you know it has not been modified.
+            always_revalidate=True,
+        )
+        cache_dir_path = os.path.join(os.getcwd(), cache_dir)
+
+        if not os.path.exists(cache_dir_path):
+            os.mkdir(cache_dir_path)
+
+        serializer = hishel.PickleSerializer()
+
+        self.storage = hishel.FileStorage(
+            serializer=serializer, base_path=Path(cache_dir_path), check_ttl_every=ttl
+        )
+
+        self.cache_dir = cache_dir
+
+    def process_request(self, request: Request):
+        request.cls_client = HishelClient
+        request.client_kwargs["controller"] = self.controller
+        request.client_kwargs["storage"] = self.storage
+        return request
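
A minimal usage sketch, based only on the classes added in this commit (the URL is illustrative, not from the repository):

from scrapework.cache import CacheMiddleware
from scrapework.request import Request

# "cache" is the directory also added to .gitignore above.
middleware = CacheMiddleware(cache_dir="cache", ttl=3600)

request = Request("https://example.com")  # illustrative URL
middleware.process_request(request)

# fetch() now builds a hishel.CacheClient with the configured controller
# and file storage, so repeated fetches of the same URL hit the disk cache.
response = request.fetch()
print(response.status_code)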
12 changes: 9 additions & 3 deletions scrapework/context.py
@@ -1,12 +1,18 @@
 import logging

-from pydantic import BaseModel


-class Context(BaseModel):
+class Context:
     logger: logging.Logger

     filename: str

+    def __init__(self, logger: logging.Logger, filename: str):
+        if not isinstance(logger, logging.Logger):
+            raise TypeError("logger must be an instance of logging.Logger")
+        if not isinstance(filename, str):
+            raise TypeError("filename must be a string")
+        self.logger = logger
+        self.filename = filename
-
-    class Config:
-        arbitrary_types_allowed = True
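
With pydantic gone, Context enforces its types by hand; a quick illustration of the new behaviour:

import logging

from scrapework.context import Context

ctx = Context(logger=logging.getLogger("spider"), filename="output.json")
ctx.logger.info("writing to %s", ctx.filename)

# Context(logger="oops", filename="output.json") would now raise TypeError.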
3 changes: 1 addition & 2 deletions scrapework/extractors.py
@@ -1,11 +1,10 @@
 from typing import Any, Dict, Iterable, Union

 from parsel import Selector
-from pydantic import BaseModel
 from trafilatura import bare_extraction


-class Extractor(BaseModel):
+class Extractor:
     def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
         raise NotImplementedError
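
Extractor is now a plain base class with a single overridable method. A sketch of a hypothetical subclass, assuming the response exposes the page body as .text (the class and field names are invented):

from typing import Any, Dict

from parsel import Selector

from scrapework.extractors import Extractor


class TitleExtractor(Extractor):
    def extract(self, response) -> Dict[str, Any]:
        # Parse the raw HTML and pull out the <title> text.
        selector = Selector(text=response.text)
        return {"title": selector.css("title::text").get()}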
26 changes: 17 additions & 9 deletions scrapework/middleware.py
@@ -1,23 +1,22 @@
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from random import choice
 from typing import List
 from urllib.parse import urlencode

-from pydantic import BaseModel
-
 from scrapework.request import Request


-class Proxy(BaseModel):
+class Proxy:
     url: str

-    def validate(self, value):
-        if not value.startswith("http"):
-            raise ValueError("Proxy url must start with http")
-        return value
+    def __init__(self, url: str):
+        if not url.startswith("http"):
+            raise ValueError("Proxy URL must start with http")
+
+        self.url = url


-class Middleware(BaseModel):
+class Middleware(ABC):
     @abstractmethod
     def process_request(self, request: Request):
         raise NotImplementedError
@@ -44,6 +43,9 @@ def process_request(self, request: Request):
 class MiddlewareProxy(Middleware):
     proxy: Proxy

+    def __init__(self, proxy: Proxy):
+        self.proxy = proxy
+
     def process_request(self, request: Request):
         if self.proxy:
             request.proxy = self.proxy.url
@@ -54,6 +56,9 @@ def process_request(self, request: Request):
 class MiddlewareScrapeOps(Middleware):
     api_key: str

+    def __init__(self, api_key: str):
+        self.api_key = api_key
+
     def process_request(self, request: Request):

         payload = {"api_key": self.api_key, "url": request.url}
@@ -65,6 +70,9 @@ def process_request(self, request: Request):
 class ProxyRotationMiddleware(Middleware):
     proxies: List[Proxy]  # "http://Username:[email protected]:20000",

+    def __init__(self, proxies: List[Proxy]):
+        self.proxies = proxies
+
     def process_request(self, request: Request):
         proxy = choice(self.proxies)
         request.proxy = proxy.url
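
A sketch of the rewritten constructors in use; the proxy endpoints are placeholders, not real credentials:

from scrapework.middleware import Proxy, ProxyRotationMiddleware
from scrapework.request import Request

rotation = ProxyRotationMiddleware(
    proxies=[
        Proxy("http://user:pass@proxy-a.example.com:20000"),
        Proxy("http://user:pass@proxy-b.example.com:20000"),
    ]
)

request = Request("https://example.com")  # illustrative URL
rotation.process_request(request)  # choice() picks one proxy at random
print(request.proxy)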
6 changes: 3 additions & 3 deletions scrapework/monitors.py
@@ -1,7 +1,7 @@
-from pydantic import BaseModel
+# This is a generic class to manage output expectation, like number of items, format, etc.
+from abc import ABC


-# This is a generic class to manage output expectation, like number of items, format, etc.
-class Expectations(BaseModel):
+class Expectations(ABC):
     def is_met(self):
         raise NotImplementedError
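
is_met() is the whole contract; a hypothetical concrete expectation (the class and attribute names are invented):

from scrapework.monitors import Expectations


class MinimumItemCount(Expectations):
    def __init__(self, items: list, minimum: int):
        self.items = items
        self.minimum = minimum

    def is_met(self) -> bool:
        # The scrape is acceptable only if it produced enough items.
        return len(self.items) >= self.minimum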
7 changes: 5 additions & 2 deletions scrapework/pipelines.py
@@ -3,12 +3,11 @@
 from typing import Any, Dict, Iterable, Union

 import boto3
-from pydantic import BaseModel, Field

 from scrapework.context import Context


-class Pipeline(ABC, BaseModel):
+class Pipeline(ABC):
     @abstractmethod
     def process_items(
         self,
@@ -31,6 +31,9 @@ def process_items(
 class S3Pipeline(Pipeline):
-    s3_bucket: str = Field(default_factory=str)
+    s3_bucket: str

+    def __init__(self, s3_bucket: str):
+        self.s3_bucket = s3_bucket
+
     def process_items(
         self, items: Union[Dict[str, Any], Iterable[Dict[str, Any]]], ctx: Context
     ):
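
Pipeline subclasses now take plain constructors. A hypothetical file-based pipeline built against the same process_items() signature, writing to the filename carried by Context:

import json
from typing import Any, Dict, Iterable, Union

from scrapework.context import Context
from scrapework.pipelines import Pipeline


class JsonFilePipeline(Pipeline):
    def process_items(
        self, items: Union[Dict[str, Any], Iterable[Dict[str, Any]]], ctx: Context
    ):
        # Materialize generators so json.dump can serialize them.
        data = items if isinstance(items, dict) else list(items)
        with open(ctx.filename, "w") as f:
            json.dump(data, f)
        ctx.logger.info("wrote items to %s", ctx.filename)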
63 changes: 48 additions & 15 deletions scrapework/request.py
@@ -1,19 +1,46 @@
 import logging
-from typing import Dict
+from abc import ABC, abstractmethod
+from typing import Any, Dict

 import httpx
 from httpx import HTTPError, TimeoutException
-from pydantic import BaseModel


-class Request(BaseModel):
+class HTTPClient(ABC):
+
+    @classmethod
+    @abstractmethod
+    def build_client(cls, **kwargs) -> httpx.Client:
+        pass
+
+
+class HttpxClient(HTTPClient):
+    @classmethod
+    def build_client(cls, **kwargs) -> httpx.Client:
+        return httpx.Client(**kwargs)
+
+
+class Request:
     url: str
     logger: logging.Logger
     headers: Dict[str, str] = {}
     timeout: int = 10
     follow_redirects: bool = False
     proxy: str | None = None
     retries: int = 0
+    cls_client: type[HTTPClient] = HttpxClient
+    client_kwargs: Dict[str, Any] = {}

-    class Config:
-        arbitrary_types_allowed = True
+    def __init__(self, url: str, **kwargs):
+        self.url = url
+        self.logger = kwargs.get("logger", logging.getLogger("request"))
+        self.headers = kwargs.get("headers", {})
+        self.timeout = kwargs.get("timeout", 10)
+        self.follow_redirects = kwargs.get("follow_redirects", False)
+        self.proxy = kwargs.get("proxy", None)
+        self.retries = kwargs.get("retries", 0)
+        self.cls_client = kwargs.get("cls_client", HttpxClient)
+        self.client_kwargs = kwargs.get("client_kwargs", {})
@@ -34,22 +61,25 @@ def fetch(self) -> httpx.Response:
             }
         else:
             mounts = {}
+        client = self.cls_client.build_client(
+            headers=self.headers,
+            timeout=self.timeout,
+            follow_redirects=self.follow_redirects,
+            mounts=mounts,
+            **self.client_kwargs,
+        )
         try:
-            with httpx.Client(
-                headers=self.headers,
-                timeout=self.timeout,
-                follow_redirects=self.follow_redirects,
-                mounts=mounts,
-            ) as client:
-
-                request = client.build_request(
-                    "GET",
-                    self.url,
-                )
+            request = client.build_request(
+                "GET",
+                self.url,
+            )

-                response = client.send(request)
+            response = client.send(
+                request,
+            )

-                return response
+            return response

         except TimeoutException as err:
             self.logger.error(f"TimeoutError fetching {self.url}: {err}")  # type: ignore
@@ -62,3 +92,6 @@ def fetch(self) -> httpx.Response:
         except Exception as err:
             self.logger.error(f"Exception fetching {self.url}: {err}")  # type: ignore
             raise err
+
+        finally:
+            client.close()
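
The client is now pluggable: fetch() delegates construction to cls_client, which is exactly the hook CacheMiddleware uses above to substitute hishel's CacheClient. A small sketch with the default client (the URL is illustrative):

from scrapework.request import Request

request = Request(
    "https://example.com",
    headers={"User-Agent": "scrapework"},
    timeout=5,
    follow_redirects=True,
)

# Builds an httpx.Client via HttpxClient.build_client(), sends a GET,
# and closes the client in the finally block.
response = request.fetch()
print(response.status_code)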