-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
remove pydantic and serialisation lock
- Loading branch information
Showing
11 changed files
with
179 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -163,3 +163,4 @@ cython_debug/ | |
#.idea/ | ||
spidy.py | ||
output.json | ||
cache |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import os | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
import hishel | ||
|
||
from scrapework.middleware import Middleware | ||
from scrapework.request import HTTPClient, Request | ||
|
||
|
||
class HishelClient(HTTPClient):
    """HTTP client variant whose underlying client is a hishel caching client."""

    @classmethod
    def build_client(cls, **kwargs) -> hishel.CacheClient:
        """Create a ``hishel.CacheClient``, forwarding every keyword argument to it."""
        cache_client = hishel.CacheClient(**kwargs)
        return cache_client
|
||
|
||
class CacheMiddleware(Middleware):
    """Middleware that routes requests through a hishel file-backed HTTP cache.

    On construction it builds a ``hishel.Controller`` (caching policy) and a
    ``hishel.FileStorage`` (on-disk store). ``process_request`` then attaches
    both to each request and switches the request's client class to
    ``HishelClient``.
    """

    controller: Optional[hishel.Controller] = None
    storage: Optional[hishel.FileStorage] = None
    cache_dir: Optional[str] = None

    def __init__(self, cache_dir: str, ttl: int = 3600):
        """Set up the cache policy and the on-disk storage.

        Args:
            cache_dir: Cache directory. A relative path is resolved against
                the current working directory; the directory (and any missing
                parents) is created if it does not exist.
            ttl: Lifetime of a stored response, in seconds.
        """
        super().__init__()
        self.controller = hishel.Controller(
            # Cache only GET and POST methods
            cacheable_methods=["GET", "POST"],
            # Cache only 200 status codes
            cacheable_status_codes=[200],
            # Use the stale response if there is a connection issue and the
            # new response cannot be obtained.
            allow_stale=True,
            # First, revalidate the response and then utilize it.
            # If the response has not changed, do not download the
            # entire response data from the server; instead,
            # use the one you have because you know it has not been modified.
            always_revalidate=True,
        )

        # An absolute ``cache_dir`` overrides the cwd prefix, exactly as it did
        # with ``os.path.join(os.getcwd(), cache_dir)``.
        cache_dir_path = Path.cwd() / cache_dir
        # ``exist_ok`` avoids the check-then-create race; ``parents`` allows a
        # nested cache directory such as "var/cache/http".
        cache_dir_path.mkdir(parents=True, exist_ok=True)

        # ``ttl`` is the entry lifetime. It used to be passed as
        # ``check_ttl_every`` (how often expired entries are swept), which left
        # the storage without any TTL at all.
        self.storage = hishel.FileStorage(base_path=cache_dir_path, ttl=ttl)

        self.cache_dir = cache_dir

    def process_request(self, request: Request):
        """Attach the caching client, controller and storage to ``request``.

        Args:
            request: The outgoing request; it is modified in place.

        Returns:
            The same ``request`` object.
        """
        request.cls_client = HishelClient
        request.client_kwargs["controller"] = self.controller
        request.client_kwargs["storage"] = self.storage
        return request
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,18 @@ | ||
import logging | ||
|
||
from pydantic import BaseModel | ||
|
||
|
||
class Context:
    """Plain container for the logger and output filename shared during a run.

    Both values are type-checked on construction, so a wrongly typed argument
    fails immediately with ``TypeError``.
    """

    logger: logging.Logger

    filename: str

    def __init__(self, logger: logging.Logger, filename: str):
        """Validate and store the logger and filename.

        Args:
            logger: Logger instance to store on the context.
            filename: Filename to store on the context.

        Raises:
            TypeError: If ``logger`` is not a ``logging.Logger`` or
                ``filename`` is not a ``str``.
        """
        if not isinstance(logger, logging.Logger):
            raise TypeError("logger must be an instance of logging.Logger")
        if not isinstance(filename, str):
            raise TypeError("filename must be a string")
        self.logger = logger
        self.filename = filename

    # The nested pydantic ``Config`` (arbitrary_types_allowed) was dropped:
    # this class is no longer a pydantic model, so it had no effect.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,22 @@ | ||
from abc import abstractmethod | ||
from abc import ABC, abstractmethod | ||
from random import choice | ||
from typing import List | ||
from urllib.parse import urlencode | ||
|
||
from pydantic import BaseModel | ||
|
||
from scrapework.request import Request | ||
|
||
|
||
class Proxy:
    """A proxy endpoint identified by its URL."""

    url: str

    def __init__(self, url: str):
        """Validate and store the proxy URL.

        Args:
            url: Proxy URL; it must start with ``"http"``.

        Raises:
            TypeError: If ``url`` is not a string.
            ValueError: If ``url`` does not start with ``"http"``.
        """
        # Reject non-strings explicitly so that e.g. ``Proxy(None)`` fails with
        # a clear TypeError instead of an AttributeError from ``.startswith``.
        if not isinstance(url, str):
            raise TypeError("Proxy URL must be a string")
        if not url.startswith("http"):
            raise ValueError("Proxy URL must start with http")

        self.url = url
|
||
|
||
class Middleware(BaseModel): | ||
class Middleware(ABC): | ||
@abstractmethod | ||
def process_request(self, request: Request): | ||
raise NotImplementedError | ||
|
@@ -44,6 +43,9 @@ def process_request(self, request: Request): | |
class MiddlewareProxy(Middleware): | ||
proxy: Proxy | ||
|
||
def __init__(self, proxy: Proxy): | ||
self.proxy = proxy | ||
|
||
def process_request(self, request: Request): | ||
if self.proxy: | ||
request.proxy = self.proxy.url | ||
|
@@ -54,6 +56,9 @@ def process_request(self, request: Request): | |
class MiddlewareScrapeOps(Middleware): | ||
api_key: str | ||
|
||
def __init__(self, api_key: str): | ||
self.api_key = api_key | ||
|
||
def process_request(self, request: Request): | ||
|
||
payload = {"api_key": self.api_key, "url": request.url} | ||
|
@@ -65,6 +70,9 @@ def process_request(self, request: Request): | |
class ProxyRotationMiddleware(Middleware): | ||
proxies: List[Proxy] # "http://Username:[email protected]:20000", | ||
|
||
def __init__(self, proxies: List[Proxy]): | ||
self.proxies = proxies | ||
|
||
def process_request(self, request: Request): | ||
proxy = choice(self.proxies) | ||
request.proxy = proxy.url | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
from pydantic import BaseModel | ||
# This is a generic class to manage output expectation, like number of items, format, etc. | ||
from abc import ABC | ||
|
||
|
||
# This is a generic class to manage output expectation, like number of items, format, etc. | ||
class Expectations(ABC):
    """Generic base for output expectations (number of items, format, etc.)."""

    def is_met(self):
        """Report whether the expectation holds; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.