Commit
remove pydantic and serialisation lock
sbusso committed Mar 24, 2024
1 parent e38d22b commit 7465ca4
Showing 11 changed files with 179 additions and 50 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -163,3 +163,4 @@ cython_debug/
 #.idea/
 spidy.py
 output.json
+cache
23 changes: 22 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -18,6 +18,7 @@ parsel = "^1.9.0"
 courlan = "^1.0.0"
 trafilatura = "^1.8.0"
 httpx = "^0.27.0"
+hishel = "^0.0.24"

 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
57 changes: 57 additions & 0 deletions scrapework/cache.py
@@ -0,0 +1,54 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import hishel
+
+from scrapework.middleware import Middleware
+from scrapework.request import HTTPClient, Request
+
+
+class HishelClient(HTTPClient):
+    @classmethod
+    def build_client(cls, **kwargs) -> hishel.CacheClient:
+        return hishel.CacheClient(**kwargs)
+
+
+class CacheMiddleware(Middleware):
+    controller: Optional[hishel.Controller] = None
+    storage: Optional[hishel.FileStorage] = None
+    cache_dir: Optional[str] = None
+
+    def __init__(self, cache_dir: str, ttl: int = 3600):
+        super().__init__()
+        self.controller = hishel.Controller(
+            # Cache only GET and POST methods
+            cacheable_methods=["GET", "POST"],
+            # Cache only 200 status codes
+            cacheable_status_codes=[200],
+            # Use the stale response if there is a connection issue and the new response cannot be obtained.
+            allow_stale=True,
+            # First, revalidate the response and then utilize it.
+            # If the response has not changed, do not download the
+            # entire response data from the server; instead,
+            # use the one you have because you know it has not been modified.
+            always_revalidate=True,
+        )
+        cache_dir_path = os.path.join(os.getcwd(), cache_dir)
+
+        if not os.path.exists(cache_dir_path):
+            os.mkdir(cache_dir_path)
+
+        serializer = hishel.PickleSerializer()
+
+        self.storage = hishel.FileStorage(
+            serializer=serializer, base_path=Path(cache_dir_path), check_ttl_every=ttl
+        )
+
+        self.cache_dir = cache_dir
+
+    def process_request(self, request: Request):
+        request.cls_client = HishelClient
+        request.client_kwargs["controller"] = self.controller
+        request.client_kwargs["storage"] = self.storage
+        return request
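
A minimal usage sketch, based only on the classes added in this commit (the URL is illustrative, not from the repository):

from scrapework.cache import CacheMiddleware
from scrapework.request import Request

# "cache" is the directory also added to .gitignore above.
middleware = CacheMiddleware(cache_dir="cache", ttl=3600)

request = Request("https://example.com")  # illustrative URL
middleware.process_request(request)

# fetch() now builds a hishel.CacheClient with the configured controller
# and file storage, so repeated fetches of the same URL hit the disk cache.
response = request.fetch()
print(response.status_code)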
12 changes: 9 additions & 3 deletions scrapework/context.py
@@ -1,12 +1,18 @@
 import logging

-from pydantic import BaseModel


-class Context(BaseModel):
+class Context:
     logger: logging.Logger

     filename: str

+    def __init__(self, logger: logging.Logger, filename: str):
+        if not isinstance(logger, logging.Logger):
+            raise TypeError("logger must be an instance of logging.Logger")
+        if not isinstance(filename, str):
+            raise TypeError("filename must be a string")
+        self.logger = logger
+        self.filename = filename
-
-    class Config:
-        arbitrary_types_allowed = True
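
With pydantic gone, Context enforces its types by hand; a quick illustration of the new behaviour:

import logging

from scrapework.context import Context

ctx = Context(logger=logging.getLogger("spider"), filename="output.json")
ctx.logger.info("writing to %s", ctx.filename)

# Context(logger="oops", filename="output.json") would now raise TypeError.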
3 changes: 1 addition & 2 deletions scrapework/extractors.py
@@ -1,11 +1,10 @@
 from typing import Any, Dict, Iterable, Union

 from parsel import Selector
-from pydantic import BaseModel
 from trafilatura import bare_extraction


-class Extractor(BaseModel):
+class Extractor:
     def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
         raise NotImplementedError
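
Extractor is now a plain base class with a single overridable method. A sketch of a hypothetical subclass, assuming the response exposes the page body as .text (the class and field names are invented):

from typing import Any, Dict

from parsel import Selector

from scrapework.extractors import Extractor


class TitleExtractor(Extractor):
    def extract(self, response) -> Dict[str, Any]:
        # Parse the raw HTML and pull out the <title> text.
        selector = Selector(text=response.text)
        return {"title": selector.css("title::text").get()}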
26 changes: 17 additions & 9 deletions scrapework/middleware.py
@@ -1,23 +1,22 @@
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from random import choice
 from typing import List
 from urllib.parse import urlencode

-from pydantic import BaseModel
-
 from scrapework.request import Request


-class Proxy(BaseModel):
+class Proxy:
     url: str

-    def validate(self, value):
-        if not value.startswith("http"):
-            raise ValueError("Proxy url must start with http")
-        return value
+    def __init__(self, url: str):
+        if not url.startswith("http"):
+            raise ValueError("Proxy URL must start with http")
+
+        self.url = url


-class Middleware(BaseModel):
+class Middleware(ABC):
     @abstractmethod
     def process_request(self, request: Request):
         raise NotImplementedError
@@ -44,6 +43,9 @@ def process_request(self, request: Request):
 class MiddlewareProxy(Middleware):
     proxy: Proxy

+    def __init__(self, proxy: Proxy):
+        self.proxy = proxy
+
     def process_request(self, request: Request):
         if self.proxy:
             request.proxy = self.proxy.url
@@ -54,6 +56,9 @@ def process_request(self, request: Request):
 class MiddlewareScrapeOps(Middleware):
     api_key: str

+    def __init__(self, api_key: str):
+        self.api_key = api_key
+
     def process_request(self, request: Request):

         payload = {"api_key": self.api_key, "url": request.url}
@@ -65,6 +70,9 @@ def process_request(self, request: Request):
 class ProxyRotationMiddleware(Middleware):
     proxies: List[Proxy]  # "http://Username:[email protected]:20000",

+    def __init__(self, proxies: List[Proxy]):
+        self.proxies = proxies
+
     def process_request(self, request: Request):
         proxy = choice(self.proxies)
         request.proxy = proxy.url
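
A sketch of the rewritten constructors in use; the proxy endpoints are placeholders, not real credentials:

from scrapework.middleware import Proxy, ProxyRotationMiddleware
from scrapework.request import Request

rotation = ProxyRotationMiddleware(
    proxies=[
        Proxy("http://user:pass@proxy-a.example.com:20000"),
        Proxy("http://user:pass@proxy-b.example.com:20000"),
    ]
)

request = Request("https://example.com")  # illustrative URL
rotation.process_request(request)  # choice() picks one proxy at random
print(request.proxy)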
6 changes: 3 additions & 3 deletions scrapework/monitors.py
@@ -1,7 +1,7 @@
-from pydantic import BaseModel
+# This is a generic class to manage output expectation, like number of items, format, etc.
+from abc import ABC


-# This is a generic class to manage output expectation, like number of items, format, etc.
-class Expectations(BaseModel):
+class Expectations(ABC):
     def is_met(self):
         raise NotImplementedError
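
is_met() is the whole contract; a hypothetical concrete expectation (the class and attribute names are invented):

from scrapework.monitors import Expectations


class MinimumItemCount(Expectations):
    def __init__(self, items: list, minimum: int):
        self.items = items
        self.minimum = minimum

    def is_met(self) -> bool:
        # The scrape is acceptable only if it produced enough items.
        return len(self.items) >= self.minimum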
7 changes: 5 additions & 2 deletions scrapework/pipelines.py
@@ -3,12 +3,11 @@
 from typing import Any, Dict, Iterable, Union

 import boto3
-from pydantic import BaseModel, Field

 from scrapework.context import Context


-class Pipeline(ABC, BaseModel):
+class Pipeline(ABC):
     @abstractmethod
     def process_items(
         self,
@@ -31,6 +31,9 @@ def process_items(
 class S3Pipeline(Pipeline):
-    s3_bucket: str = Field(default_factory=str)
+    s3_bucket: str

+    def __init__(self, s3_bucket: str):
+        self.s3_bucket = s3_bucket
+
     def process_items(
         self, items: Union[Dict[str, Any], Iterable[Dict[str, Any]]], ctx: Context
     ):
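
Pipeline subclasses now take plain constructors. A hypothetical file-based pipeline built against the same process_items() signature, writing to the filename carried by Context:

import json
from typing import Any, Dict, Iterable, Union

from scrapework.context import Context
from scrapework.pipelines import Pipeline


class JsonFilePipeline(Pipeline):
    def process_items(
        self, items: Union[Dict[str, Any], Iterable[Dict[str, Any]]], ctx: Context
    ):
        # Materialize generators so json.dump can serialize them.
        data = items if isinstance(items, dict) else list(items)
        with open(ctx.filename, "w") as f:
            json.dump(data, f)
        ctx.logger.info("wrote items to %s", ctx.filename)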
63 changes: 48 additions & 15 deletions scrapework/request.py
@@ -1,19 +1,46 @@
 import logging
-from typing import Dict
+from abc import ABC, abstractmethod
+from typing import Any, Dict

 import httpx
 from httpx import HTTPError, TimeoutException
-from pydantic import BaseModel


-class Request(BaseModel):
+class HTTPClient(ABC):
+
+    @classmethod
+    @abstractmethod
+    def build_client(cls, **kwargs) -> httpx.Client:
+        pass
+
+
+class HttpxClient(HTTPClient):
+    @classmethod
+    def build_client(cls, **kwargs) -> httpx.Client:
+        return httpx.Client(**kwargs)
+
+
+class Request:
     url: str
     logger: logging.Logger
     headers: Dict[str, str] = {}
     timeout: int = 10
     follow_redirects: bool = False
     proxy: str | None = None
     retries: int = 0
+    cls_client: type[HTTPClient] = HttpxClient
+    client_kwargs: Dict[str, Any] = {}

-    class Config:
-        arbitrary_types_allowed = True
+    def __init__(self, url: str, **kwargs):
+        self.url = url
+        self.logger = kwargs.get("logger", logging.getLogger("request"))
+        self.headers = kwargs.get("headers", {})
+        self.timeout = kwargs.get("timeout", 10)
+        self.follow_redirects = kwargs.get("follow_redirects", False)
+        self.proxy = kwargs.get("proxy", None)
+        self.retries = kwargs.get("retries", 0)
+        self.cls_client = kwargs.get("cls_client", HttpxClient)
+        self.client_kwargs = kwargs.get("client_kwargs", {})
@@ -34,22 +61,25 @@ def fetch(self) -> httpx.Response:
             }
         else:
             mounts = {}
+        client = self.cls_client.build_client(
+            headers=self.headers,
+            timeout=self.timeout,
+            follow_redirects=self.follow_redirects,
+            mounts=mounts,
+            **self.client_kwargs,
+        )
         try:
-            with httpx.Client(
-                headers=self.headers,
-                timeout=self.timeout,
-                follow_redirects=self.follow_redirects,
-                mounts=mounts,
-            ) as client:
-
-                request = client.build_request(
-                    "GET",
-                    self.url,
-                )
+            request = client.build_request(
+                "GET",
+                self.url,
+            )

-                response = client.send(request)
+            response = client.send(
+                request,
+            )

-                return response
+            return response

         except TimeoutException as err:
             self.logger.error(f"TimeoutError fetching {self.url}: {err}")  # type: ignore
@@ -62,3 +92,6 @@ def fetch(self) -> httpx.Response:
         except Exception as err:
             self.logger.error(f"Exception fetching {self.url}: {err}")  # type: ignore
             raise err
+
+        finally:
+            client.close()
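
The client is now pluggable: fetch() delegates construction to cls_client, which is exactly the hook CacheMiddleware uses above to substitute hishel's CacheClient. A small sketch with the default client (the URL is illustrative):

from scrapework.request import Request

request = Request(
    "https://example.com",
    headers={"User-Agent": "scrapework"},
    timeout=5,
    follow_redirects=True,
)

# Builds an httpx.Client via HttpxClient.build_client(), sends a GET,
# and closes the client in the finally block.
response = request.fetch()
print(response.status_code)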