request known_encoding kwarg now defaults to utf-8
Assumed unknown encoding remains the default for executors and crawlers
Fix #890
Yomguithereal committed Nov 16, 2023
1 parent 2a016af commit 1b7e0ff
Showing 16 changed files with 28 additions and 29 deletions.
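
In practice, call sites that know their payload is UTF-8 no longer have to say so, while the old detection behavior stays one keyword away. A minimal sketch, assuming only what the touched modules already show (minet.web.request, Response.json() and Response.text()); the URLs are illustrative:

    from minet.web import request

    # Before this commit, API clients had to declare UTF-8 at every call site:
    # response = request("https://api.example.com/data", known_encoding="utf-8")

    # After it, "utf-8" is the default, so the plain call is enough:
    response = request("https://api.example.com/data")
    data = response.json()

    # Passing None explicitly should restore the previous behavior, where the
    # encoding is treated as unknown and inferred from the response, e.g. for
    # arbitrary web pages whose charset cannot be assumed:
    html = request("https://example.com", known_encoding=None).text()
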
2 changes: 1 addition & 1 deletion minet/buzzsumo/client.py
@@ -77,7 +77,7 @@ def __init__(self, token):
@rate_limited_method()
def request(self, url):
try:
- response = request(url, known_encoding="utf-8")
+ response = request(url)
data = response.json()
except JSONDecodeError:
raise BuzzSumoBadRequestError
6 changes: 6 additions & 0 deletions minet/crawl/crawler.py
@@ -107,6 +107,7 @@ def __init__(
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
stateful_redirects: bool = False,
spoof_ua: bool = False,
+ known_encoding: Optional[str] = None,
callback: Optional[
Callable[
[
@@ -139,6 +140,9 @@ def __init__(
if compressed:
self.default_kwargs["compressed"] = True

+ if known_encoding is not None:
+ self.default_kwargs["known_encoding"] = known_encoding

def __call__(
self, job: CrawlJob[CrawlJobDataType]
) -> Optional[
@@ -288,6 +292,7 @@ def __init__(
request_args: Optional[RequestArgsType[CrawlJobDataType]] = None,
use_pycurl: bool = False,
compressed: bool = False,
+ known_encoding: Optional[str] = None,
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
stateful_redirects: bool = False,
spoof_ua: bool = False,
@@ -398,6 +403,7 @@ def __init__(
"spoof_ua": spoof_ua,
"use_pycurl": use_pycurl,
"compressed": compressed,
"known_encoding": known_encoding
}

def __repr__(self):
2 changes: 1 addition & 1 deletion minet/crowdtangle/client.py
@@ -56,7 +56,7 @@ def __init__(self, token, rate_limit=None):

@retrying_method()
def __request(self, url):
- response = request(url, pool_manager=self.pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=self.pool_manager)

# Bad auth
if response.status == 401:
7 changes: 7 additions & 0 deletions minet/executors.py
@@ -105,6 +105,7 @@ class ExecutorRequestKwargs(TypedDict, Generic[ItemType, CallbackResultType]):
buffer_size: NotRequired[int]
domain_parallelism: NotRequired[int]
max_redirects: NotRequired[int]
+ known_encoding: NotRequired[Optional[str]]


class ExecutorResolveKwargs(TypedDict, Generic[ItemType, CallbackResultType]):
@@ -362,6 +363,7 @@ def __init__(
follow_js_relocation: bool = False,
infer_redirection: bool = False,
canonicalize: bool = False,
+ known_encoding: Optional[str] = None,
callback: Optional[
Union[
Callable[[ItemType, str, Response], CallbackResultType],
@@ -404,6 +406,9 @@ def __init__(
if compressed:
self.default_kwargs["compressed"] = True

+ if known_encoding is not None:
+ self.default_kwargs["known_encoding"] = known_encoding

def __call__(
self, payload: HTTPWorkerPayloadBase[ItemType]
) -> Optional[
@@ -589,6 +594,7 @@ def request(
buffer_size: int = DEFAULT_IMAP_BUFFER_SIZE,
domain_parallelism: int = DEFAULT_DOMAIN_PARALLELISM,
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
+ known_encoding: Optional[str] = None,
callback: Optional[
Callable[[ItemType, str, Response], CallbackResultType]
] = None,
@@ -608,6 +614,7 @@ def request(
max_redirects=max_redirects,
use_pycurl=use_pycurl,
compressed=compressed,
+ known_encoding=known_encoding,
callback=callback,
)

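These executor changes only thread the new kwarg through: the default stays None, so bulk fetching keeps treating encodings as unknown (i.e. detected per response) unless the caller opts in. A hedged usage sketch (the HTTPThreadPoolExecutor name and the iteration pattern are assumptions; only the known_encoding kwarg comes from the diff above):

    from minet.executors import HTTPThreadPoolExecutor  # class name assumed

    urls = ["https://example.com/a", "https://example.com/b"]

    with HTTPThreadPoolExecutor() as executor:
        # Default: known_encoding stays None, so each response's encoding is
        # still detected, matching the pre-commit behavior for bulk fetching.
        for result in executor.request(urls):
            ...

        # Sources known to be UTF-8 can forward the hint once; it is passed
        # down to every underlying request() call.
        for result in executor.request(urls, known_encoding="utf-8"):
            ...
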
1 change: 0 additions & 1 deletion minet/facebook/mobile_scraper.py
@@ -640,7 +640,6 @@ def request_page(self, url):
pool_manager=self.pool_manager,
cookie=self.cookie,
headers={"User-Agent": "curl/7.68.0", "Accept-Language": "en"},
- known_encoding="utf-8",
)

return response.text()
2 changes: 1 addition & 1 deletion minet/google/sheets.py
@@ -62,7 +62,7 @@ def export_google_sheets_as_csv(
while True:
attempts -= 1

- response = request(export_url, cookie=cookie, known_encoding="utf-8")
+ response = request(export_url, cookie=cookie)

if response.status == 404:
raise GoogleSheetsNotFoundError
8 changes: 2 additions & 6 deletions minet/instagram/api_scraper.py
@@ -222,11 +222,7 @@ def request_json(self, url, magic_token=False):
headers["X-IG-App-ID"] = self.magic_token

response = request(
- url,
- pool_manager=self.pool_manager,
- spoof_ua=True,
- headers=headers,
- known_encoding="utf-8",
+ url, pool_manager=self.pool_manager, spoof_ua=True, headers=headers
)

text = response.text()
@@ -281,7 +277,7 @@ def request_json(self, url, magic_token=False):
return data

def get_magic_token(self):
response = request("https://www.instagram.com/disney", known_encoding="utf-8")
response = request("https://www.instagram.com/disney")

if response.status >= 400:
return None
2 changes: 1 addition & 1 deletion minet/mediacloud/search.py
@@ -133,7 +133,7 @@ def generator():
last_processed_stories_id=last_processed_stories_id,
)

- response = request(url, pool_manager=pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=pool_manager)
data = response.json()

if response.status >= 500:
2 changes: 1 addition & 1 deletion minet/mediacloud/topic.py
@@ -50,7 +50,7 @@ def mediacloud_topic_stories(
from_media_id=from_media_id,
)

- response = request(url, pool_manager=pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=pool_manager)
data = response.json()

if "stories" not in data or len(data["stories"]) == 0:
2 changes: 1 addition & 1 deletion minet/mediacloud/utils.py
@@ -56,7 +56,7 @@ def make_simple_call(
if query is not None:
url += "&" + ("&".join("%s=%s" % (str(k), str(v)) for k, v in query.items()))

- response = request(url, pool_manager=pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=pool_manager)
data = response.json()

if response.status >= 500:
2 changes: 1 addition & 1 deletion minet/telegram/scraper.py
@@ -328,7 +328,7 @@ def __init__(self, throttle=TELEGRAM_DEFAULT_THROTTLE):
@rate_limited_method()
@retrying_method()
def request_page(self, url):
- response = request(url, pool_manager=self.pool_manager, known_encoding="utf-8")
+ response = request(url, pool_manager=self.pool_manager)

return response.text()

6 changes: 1 addition & 5 deletions minet/tiktok/api_scraper.py
@@ -64,11 +64,7 @@ def request_json(self, url):
headers = {"Cookie": self.cookie}

response = request(
- url,
- pool_manager=self.pool_manager,
- spoof_ua=True,
- headers=headers,
- known_encoding="utf-8",
+ url, pool_manager=self.pool_manager, spoof_ua=True, headers=headers
)

if response.status >= 400:
3 changes: 1 addition & 2 deletions minet/twitter/api_scraper.py
@@ -481,8 +481,7 @@ def request(self, url, headers=None, method="GET"):
pool_manager=self.pool_manager,
spoof_ua=True,
method=method,
- headers=headers,
- known_encoding="utf-8",
+ headers=headers
)

# def acquire_guest_token(self):
7 changes: 3 additions & 4 deletions minet/web.py
@@ -703,7 +703,7 @@ def __init__(
headers: HTTPHeaderDict,
status: int,
body: bytes,
- known_encoding: Optional[str] = None,
+ known_encoding: Optional[str] = "utf-8",
):
self.__url = url
self.__stack = stack
@@ -944,7 +944,7 @@ def request(
follow_js_relocation: bool = False,
infer_redirection: bool = False,
canonicalize: bool = False,
- known_encoding: Optional[str] = None,
+ known_encoding: Optional[str] = "utf-8",
timeout: Optional[AnyTimeout] = None,
body: Optional[Union[str, bytes]] = None,
json_body: Optional[Any] = None,
@@ -1127,8 +1127,7 @@ def request_jsonrpc(
url,
pool_manager=pool_manager,
method="POST",
- json_body={"method": method, "params": params},
- known_encoding="utf-8",
+ json_body={"method": method, "params": params}
)


4 changes: 1 addition & 3 deletions minet/youtube/client.py
@@ -92,9 +92,7 @@ def __init__(self, key):
def request_json(self, url):
while True:
final_url = add_query_argument(url, "key", self.current_key)
- response = request(
- final_url, pool_manager=self.pool_manager, known_encoding="utf-8"
- )
+ response = request(final_url, pool_manager=self.pool_manager)
data = response.json()

if response.status == 403:
1 change: 0 additions & 1 deletion minet/youtube/scraper.py
@@ -48,7 +48,6 @@ def __init__(self):
def request(self, url, spoof_ua: bool = False) -> Response:
return request(
url,
- known_encoding="utf-8",
pool_manager=self.pool_manager,
spoof_ua=spoof_ua,
)
