Skip to content

Commit

Permalink
Merge pull request #142 from creativecommons/update-gcs-fetch
Browse files: browse the repository at this point in the history
GCS fetch: add discovery num_retries and query rate_delay
  • Loading branch information
TimidRobot authored Nov 27, 2024
2 parents 670d1a6 + 6a5e557 commit 5b2f7d7
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions scripts/1-fetch/gcs_fetch.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@
import time
import traceback
import urllib.parse
from copy import copy

# Third-party
import googleapiclient.discovery
Expand All @@ -34,14 +35,14 @@
load_dotenv(PATHS["dotenv"])

# Constants
DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")
CX = os.getenv("GCS_CX")
BASE_URL = "https://www.googleapis.com/customsearch/v1"
FILE1_COUNT = os.path.join(PATHS["data_phase"], "gcs_1_count.csv")
FILE2_LANGUAGE = os.path.join(
PATHS["data_phase"], "gcs_2_count_by_language.csv"
)
FILE3_COUNTRY = os.path.join(PATHS["data_phase"], "gcs_3_count_by_country.csv")
GCS_CX = os.getenv("GCS_CX")
GCS_DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")
HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
HEADER2_LANGUAGE = ["PLAN_INDEX", "TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
HEADER3_COUNTRY = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
Expand Down Expand Up @@ -87,7 +88,11 @@ def get_search_service():
"""
LOGGER.info("Getting Google Custom Search API Service.")
return googleapiclient.discovery.build(
"customsearch", "v1", developerKey=DEVELOPER_KEY, cache_discovery=False
"customsearch",
"v1",
developerKey=GCS_DEVELOPER_KEY,
cache_discovery=False,
num_retries=5,
)


Expand Down Expand Up @@ -184,21 +189,15 @@ def query_gcs(args, service, last_completed_plan_index, plan):

max_tries = 5
initial_delay = 1 # in seconds
rate_delay = copy(initial_delay) # query gently
start = last_completed_plan_index + 1
stop = start + args.limit

for plan_row in plan[start:stop]: # noqa: E203
index = plan.index(plan_row)
query_info = f"index: {index}, tool: {plan_row['TOOL_IDENTIFIER']}"
encoded_tool_url = urllib.parse.quote(plan_row["TOOL_URL"], safe=":/")
query_params = {
"cx": CX,
# "num": records_per_query,
# "start": start_index,
# "cr": cr,
# "lr": lr,
"q": encoded_tool_url,
}
query_params = {"cx": GCS_CX, "q": encoded_tool_url}
if plan_row["COUNTRY"]:
query_info = f"{query_info}, country: {plan_row['COUNTRY']}"
query_params["cr"] = plan_row["CR"]
Expand All @@ -222,6 +221,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
results.get("searchInformation", {}).get("totalResults", 0)
)
success = True
time.sleep(rate_delay)
break # no need to try again

except HttpError as e:
Expand All @@ -230,7 +230,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
"Quota exceeded" in e.reason
and "Queries per day" in e.reason
):
LOGGER.warning(f"{e.status_code}: {e.reason}.")
LOGGER.warning(f"{e.status_code}: {e.reason}")
return # abort queries
else:
LOGGER.warning(
Expand Down

0 comments on commit 5b2f7d7

Please sign in to comment.