request known_encoding kwarg now defaults to utf-8
Assumed unknown encoding remains the default for executors and crawlers
Fix #890
Yomguithereal committed Nov 16, 2023
1 parent 2a016af commit 1b7e0ff
Showing 16 changed files with 28 additions and 29 deletions.
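
In practice, call sites that know their payload is UTF-8 no longer have to say so, while the old detection behavior stays one keyword away. A minimal sketch, assuming only what the touched modules already show (minet.web.request, Response.json() and Response.text()); the URLs are illustrative:

    from minet.web import request

    # Before this commit, API clients had to declare UTF-8 at every call site:
    # response = request("https://api.example.com/data", known_encoding="utf-8")

    # After it, "utf-8" is the default, so the plain call is enough:
    response = request("https://api.example.com/data")
    data = response.json()

    # Passing None explicitly should restore the previous behavior, where the
    # encoding is treated as unknown and inferred from the response, e.g. for
    # arbitrary web pages whose charset cannot be assumed:
    html = request("https://example.com", known_encoding=None).text()
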
2 changes: 1 addition & 1 deletion minet/buzzsumo/client.py
@@ -77,7 +77,7 @@ def __init__(self, token):
@rate_limited_method()
def request(self, url):
try:
- response = request(url, known_encoding="utf-8")
+ response = request(url)
data = response.json()
except JSONDecodeError:
raise BuzzSumoBadRequestError
6 changes: 6 additions & 0 deletions minet/crawl/crawler.py
@@ -107,6 +107,7 @@ def __init__(
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
stateful_redirects: bool = False,
spoof_ua: bool = False,
+ known_encoding: Optional[str] = None,
callback: Optional[
Callable[
[
@@ -139,6 +140,9 @@ def __init__(
if compressed:
self.default_kwargs["compressed"] = True

+ if known_encoding is not None:
+ self.default_kwargs["known_encoding"] = known_encoding

def __call__(
self, job: CrawlJob[CrawlJobDataType]
) -> Optional[
@@ -288,6 +292,7 @@ def __init__(
request_args: Optional[RequestArgsType[CrawlJobDataType]] = None,
use_pycurl: bool = False,
compressed: bool = False,
+ known_encoding: Optional[str] = None,
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
stateful_redirects: bool = False,
spoof_ua: bool = False,
@@ -398,6 +403,7 @@ def __init__(
"spoof_ua": spoof_ua,
"use_pycurl": use_pycurl,
"compressed": compressed,
"known_encoding": known_encoding
}

def __repr__(self):
2 changes: 1 addition & 1 deletion minet/crowdtangle/client.py
@@ -56,7 +56,7 @@ def __init__(self, token, rate_limit=None):

@retrying_method()
def __request(self, url):
- response = request(url, pool_manager=self.pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=self.pool_manager)

# Bad auth
if response.status == 401:
7 changes: 7 additions & 0 deletions minet/executors.py
@@ -105,6 +105,7 @@ class ExecutorRequestKwargs(TypedDict, Generic[ItemType, CallbackResultType]):
buffer_size: NotRequired[int]
domain_parallelism: NotRequired[int]
max_redirects: NotRequired[int]
+ known_encoding: NotRequired[Optional[str]]


class ExecutorResolveKwargs(TypedDict, Generic[ItemType, CallbackResultType]):
@@ -362,6 +363,7 @@ def __init__(
follow_js_relocation: bool = False,
infer_redirection: bool = False,
canonicalize: bool = False,
+ known_encoding: Optional[str] = None,
callback: Optional[
Union[
Callable[[ItemType, str, Response], CallbackResultType],
@@ -404,6 +406,9 @@ def __init__(
if compressed:
self.default_kwargs["compressed"] = True

+ if known_encoding is not None:
+ self.default_kwargs["known_encoding"] = known_encoding

def __call__(
self, payload: HTTPWorkerPayloadBase[ItemType]
) -> Optional[
@@ -589,6 +594,7 @@ def request(
buffer_size: int = DEFAULT_IMAP_BUFFER_SIZE,
domain_parallelism: int = DEFAULT_DOMAIN_PARALLELISM,
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
+ known_encoding: Optional[str] = None,
callback: Optional[
Callable[[ItemType, str, Response], CallbackResultType]
] = None,
@@ -608,6 +614,7 @@ def request(
max_redirects=max_redirects,
use_pycurl=use_pycurl,
compressed=compressed,
+ known_encoding=known_encoding,
callback=callback,
)

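These executor changes only thread the new kwarg through: the default stays None, so bulk fetching keeps treating encodings as unknown (i.e. detected per response) unless the caller opts in. A hedged usage sketch (the HTTPThreadPoolExecutor name and the iteration pattern are assumptions; only the known_encoding kwarg comes from the diff above):

    from minet.executors import HTTPThreadPoolExecutor  # class name assumed

    urls = ["https://example.com/a", "https://example.com/b"]

    with HTTPThreadPoolExecutor() as executor:
        # Default: known_encoding stays None, so each response's encoding is
        # still detected, matching the pre-commit behavior for bulk fetching.
        for result in executor.request(urls):
            ...

        # Sources known to be UTF-8 can forward the hint once; it is passed
        # down to every underlying request() call.
        for result in executor.request(urls, known_encoding="utf-8"):
            ...
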
1 change: 0 additions & 1 deletion minet/facebook/mobile_scraper.py
@@ -640,7 +640,6 @@ def request_page(self, url):
pool_manager=self.pool_manager,
cookie=self.cookie,
headers={"User-Agent": "curl/7.68.0", "Accept-Language": "en"},
- known_encoding="utf-8",
)

return response.text()
2 changes: 1 addition & 1 deletion minet/google/sheets.py
@@ -62,7 +62,7 @@ def export_google_sheets_as_csv(
while True:
attempts -= 1

- response = request(export_url, cookie=cookie, known_encoding="utf-8")
+ response = request(export_url, cookie=cookie)

if response.status == 404:
raise GoogleSheetsNotFoundError
8 changes: 2 additions & 6 deletions minet/instagram/api_scraper.py
@@ -222,11 +222,7 @@ def request_json(self, url, magic_token=False):
headers["X-IG-App-ID"] = self.magic_token

response = request(
- url,
- pool_manager=self.pool_manager,
- spoof_ua=True,
- headers=headers,
- known_encoding="utf-8",
+ url, pool_manager=self.pool_manager, spoof_ua=True, headers=headers
)

text = response.text()
@@ -281,7 +277,7 @@ def request_json(self, url, magic_token=False):
return data

def get_magic_token(self):
response = request("https://www.instagram.com/disney", known_encoding="utf-8")
response = request("https://www.instagram.com/disney")

if response.status >= 400:
return None
2 changes: 1 addition & 1 deletion minet/mediacloud/search.py
@@ -133,7 +133,7 @@ def generator():
last_processed_stories_id=last_processed_stories_id,
)

- response = request(url, pool_manager=pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=pool_manager)
data = response.json()

if response.status >= 500:
2 changes: 1 addition & 1 deletion minet/mediacloud/topic.py
@@ -50,7 +50,7 @@ def mediacloud_topic_stories(
from_media_id=from_media_id,
)

- response = request(url, pool_manager=pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=pool_manager)
data = response.json()

if "stories" not in data or len(data["stories"]) == 0:
2 changes: 1 addition & 1 deletion minet/mediacloud/utils.py
@@ -56,7 +56,7 @@ def make_simple_call(
if query is not None:
url += "&" + ("&".join("%s=%s" % (str(k), str(v)) for k, v in query.items()))

- response = request(url, pool_manager=pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=pool_manager)
data = response.json()

if response.status >= 500:
2 changes: 1 addition & 1 deletion minet/telegram/scraper.py
@@ -328,7 +328,7 @@ def __init__(self, throttle=TELEGRAM_DEFAULT_THROTTLE):
@rate_limited_method()
@retrying_method()
def request_page(self, url):
- response = request(url, pool_manager=self.pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=self.pool_manager)

return response.text()

6 changes: 1 addition & 5 deletions minet/tiktok/api_scraper.py
@@ -64,11 +64,7 @@ def request_json(self, url):
headers = {"Cookie": self.cookie}

response = request(
- url,
- pool_manager=self.pool_manager,
- spoof_ua=True,
- headers=headers,
- known_encoding="utf-8",
+ url, pool_manager=self.pool_manager, spoof_ua=True, headers=headers
)

if response.status >= 400:
3 changes: 1 addition & 2 deletions minet/twitter/api_scraper.py
@@ -481,8 +481,7 @@ def request(self, url, headers=None, method="GET"):
pool_manager=self.pool_manager,
spoof_ua=True,
method=method,
- headers=headers,
- known_encoding="utf-8",
+ headers=headers
)

# def acquire_guest_token(self):
7 changes: 3 additions & 4 deletions minet/web.py
@@ -703,7 +703,7 @@ def __init__(
headers: HTTPHeaderDict,
status: int,
body: bytes,
- known_encoding: Optional[str] = None,
+ known_encoding: Optional[str] = "utf-8",
):
self.__url = url
self.__stack = stack
@@ -944,7 +944,7 @@ def request(
follow_js_relocation: bool = False,
infer_redirection: bool = False,
canonicalize: bool = False,
- known_encoding: Optional[str] = None,
+ known_encoding: Optional[str] = "utf-8",
timeout: Optional[AnyTimeout] = None,
body: Optional[Union[str, bytes]] = None,
json_body: Optional[Any] = None,
@@ -1127,8 +1127,7 @@ def request_jsonrpc(
url,
pool_manager=pool_manager,
method="POST",
- json_body={"method": method, "params": params},
- known_encoding="utf-8",
+ json_body={"method": method, "params": params}
)


4 changes: 1 addition & 3 deletions minet/youtube/client.py
@@ -92,9 +92,7 @@ def __init__(self, key):
def request_json(self, url):
while True:
final_url = add_query_argument(url, "key", self.current_key)
- response = request(
- final_url, pool_manager=self.pool_manager, known_encoding="utf-8"
- )
+ response = request(final_url, pool_manager=self.pool_manager)
data = response.json()

if response.status == 403:
1 change: 0 additions & 1 deletion minet/youtube/scraper.py
@@ -48,7 +48,6 @@ def __init__(self):
def request(self, url, spoof_ua: bool = False) -> Response:
return request(
url,
- known_encoding="utf-8",
pool_manager=self.pool_manager,
spoof_ua=spoof_ua,
)
