rename spider to scraper and change the logging instance
sbusso committed Mar 23, 2024
1 parent fa7b43d commit 3134b33
Showing 4 changed files with 32 additions and 19 deletions.
3 changes: 3 additions & 0 deletions scrapework/context.py
@@ -8,3 +8,6 @@
 class Context(BaseModel):
     logger: logging.Logger
     config: EnvConfig
+
+    class Config:
+        arbitrary_types_allowed = True
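The added Config is pydantic's opt-in for field types the library has no validator for, such as logging.Logger; without it, declaring the logger field fails at model definition time. A minimal sketch of what the opt-in permits (not part of the commit; the "demo" logger name is invented):

import logging

from pydantic import BaseModel


class Context(BaseModel):
    logger: logging.Logger

    class Config:
        arbitrary_types_allowed = True  # accept types pydantic cannot validate itself


# With the opt-in, pydantic only checks isinstance(value, logging.Logger).
ctx = Context(logger=logging.getLogger("demo"))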
35 changes: 20 additions & 15 deletions scrapework/logger.py
@@ -1,19 +1,24 @@
 import logging
 
-# Configure the logger
-# Configure the logger for the entire module
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-formatter = logging.Formatter(
-    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
-)
 
-# Configure file handler
-file_handler = logging.FileHandler("spidy.log")
-file_handler.setFormatter(formatter)
-logger.addHandler(file_handler)
+def new_logger(name):
+    name = name
 
-# Optionally, configure console handler
-console_handler = logging.StreamHandler()
-console_handler.setFormatter(formatter)
-logger.addHandler(console_handler)
+    # Configure the logger
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+    )
+
+    # Configure file handler
+    file_handler = logging.FileHandler(f"{name}.log")
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    # Optionally, configure console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+
+    return logger
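For reference, a hedged usage sketch of the factory this file now exposes (the "products" name is invented): each caller gets a logger keyed by the name it passes, writing to <name>.log and the console, instead of every import sharing the single module-level logger that wrote to spidy.log:

from scrapework.logger import new_logger

logger = new_logger("products")  # handlers: products.log file + console stream
logger.info("starting scrape")

Because the factory builds on logging.getLogger(name), calling new_logger twice with the same name attaches a second pair of handlers to the same cached logger, duplicating output.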
3 changes: 3 additions & 0 deletions scrapework/request.py
@@ -15,6 +15,9 @@ class Request(BaseModel):
     proxy: str | None = None
     retries: int = 0
 
+    class Config:
+        arbitrary_types_allowed = True
+
     def fetch(self) -> httpx.Response:
         """
         Fetches the HTML content of a given URL.
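The opt-in added here mirrors the one in Context above. As a hedged sketch of how the annotated fetch is meant to be called (the url field and the example.com address are assumptions; only proxy and retries are visible in this hunk):

from scrapework.request import Request

request = Request(url="https://example.com")  # url field assumed, not shown in the hunk
response = request.fetch()                    # httpx.Response, per the return annotation
print(response.status_code)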
10 changes: 6 additions & 4 deletions scrapework/spider.py → scrapework/scraper.py
@@ -8,14 +8,14 @@
 from scrapework.config import EnvConfig
 from scrapework.context import Context
 from scrapework.extractors import BodyExtractor
-from scrapework.logger import logger
+from scrapework.logger import new_logger
 from scrapework.middleware import Middleware
 from scrapework.pipelines import Pipeline
 from scrapework.request import Request
 
 
-class Spider(BaseModel, ABC):
-    name: ClassVar[str] = "base_spider"
+class Scraper(BaseModel, ABC):
+    name: ClassVar[str] = "base_scraper"
     start_urls: List[str] = []
     pipelines: List[Pipeline] = []
     base_url: str = ""
@@ -24,7 +24,7 @@ class Spider(BaseModel, ABC):
         Callable[[Response], Union[Dict[str, Any], Iterable[Dict[str, Any]]]]
     ] = None
     middlewares: List[Middleware] = []
-    logger: ClassVar[logging.Logger] = logger
+    logger: logging.Logger = logging.getLogger(name)
     config: EnvConfig = Field(default_factory=EnvConfig)
 
     def __init__(self, **args):
@@ -40,6 +40,8 @@ def __init__(self, **args):
         if not self.filename:
             self.filename = f"{self.name}.json"
 
+        self.logger = new_logger(self.name)
+
     class SpiderConfig(EnvConfig):
         pass
 
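For downstream code, the rename shifts both the import path and the base class; a minimal, hypothetical subclass against the renamed module could look like this (the MyScraper name and URL are invented, and any abstract hooks the ABC requires are omitted since they are not visible in this diff):

from scrapework.scraper import Scraper  # previously: from scrapework.spider import Spider


class MyScraper(Scraper):  # previously subclassed Spider
    name = "my_scraper"  # also becomes the log file name, my_scraper.log, via new_logger
    start_urls = ["https://example.com"]

With the __init__ change above, each such scraper now logs through a logger named after itself rather than through the shared logger the old "from scrapework.logger import logger" provided.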