diff --git a/analyze/data_analysis.py b/analyze/data_analysis.py
index 76a037cf..52b96213 100644
--- a/analyze/data_analysis.py
+++ b/analyze/data_analysis.py
@@ -138,10 +138,10 @@ def time_trend_helper(df):
     Extract year-wise count of entries from a DataFrame.
 
     Args:
-    df (DataFrame): Input DataFrame containing dates.
+    - df (DataFrame): Input DataFrame containing dates.
 
     Returns:
-    DataFrame: DataFrame with counts of entries per year.
+    - DataFrame: DataFrame with counts of entries per year.
     """
     year_list = []
     for date_row in df["dates"][0:]:
@@ -167,7 +167,7 @@ def time_trend(csv_path):
     Generate a line graph to show the time trend of the license usage.
 
     Args:
-    csv_path (str): Path to the CSV file.
+    - csv_path (str): Path to the CSV file.
     """
     df = pd.read_csv(csv_path)
     count_df = time_trend_helper(df)
@@ -205,10 +205,10 @@ def time_trend_compile_helper(yearly_count):
     Filter yearly trend data for the years between 2018 and 2022.
 
     Args:
-    yearly_count (DataFrame): DataFrame with "year" and "Counts" columns.
+    - yearly_count (DataFrame): DataFrame with "year" and "Counts" columns.
 
     Returns:
-    DataFrame: Filtered yearly count data.
+    - DataFrame: Filtered yearly count data.
     """
     Years = np.arange(2018, 2023)
     yearly_count["year"] = list(yearly_count.index)
@@ -370,10 +370,10 @@ def view_compare_helper(df):
     Calculate maximum views of pictures under a license.
 
     Args:
-    df (DataFrame): Input DataFrame.
+    - df (DataFrame): Input DataFrame.
 
     Returns:
-    int: Maximum views.
+    - int: Maximum views.
     """
     highest_view = int(max(df["views"]))
     df = df.sort_values("views", ascending=False)
diff --git a/deviantart/deviantart_scratcher.py b/deviantart/deviantart_scratcher.py
index 8be036c3..4093bfd8 100755
--- a/deviantart/deviantart_scratcher.py
+++ b/deviantart/deviantart_scratcher.py
@@ -17,26 +17,36 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+# Set up current working directory
 CWD = os.path.dirname(os.path.abspath(__file__))
+# Load environment variables
 dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
 load_dotenv(dotenv_path)
+# Get the current date
 today = dt.datetime.today()
+# Retrieve API keys
 API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
 API_KEYS_IND = 0
+# Set up file path for CSV report
 DATA_WRITE_FILE = (
     f"{CWD}" f"/data_deviantart_{today.year}_{today.month}_{today.day}.csv"
 )
+# Retrieve Programmable Search Engine key from environment variables
 PSE_KEY = os.getenv("PSE_KEY")
 
 
 def get_license_list():
-    """Provides the list of license from 2018's record of Creative Commons.
+    """
+    Provides the list of licenses from 2018's record of Creative Commons.
+
     Returns:
-        np.array: An np array containing all license types that should be
-        searched via Programmable Search Engine.
+    - np.array: An array containing all license types that should be
+      searched via Programmable Search Engine.
     """
+    # Read license data from file
     cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
+    # Define regex pattern to extract license types
     license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
     license_list = (
         cc_license_data[0]
@@ -48,14 +58,14 @@
 
 def get_request_url(license):
-    """Provides the API Endpoint URL for specified parameter combinations.
+    """
+    Provides the API Endpoint URL for specified parameter combinations.
 
     Args:
-        license:
-            A string representing the type of license, and should be a segment
-            of its URL towards the license description.
+    - license (str): A string representing the type of license. It's a
+      segment of the URL towards the license description.
+
     Returns:
-        string: A string representing the API Endpoint URL for the query
-        specified by this function's parameters.
+    - str: The API Endpoint URL for the query specified by parameters.
     """
     try:
         api_key = API_KEYS[API_KEYS_IND]
@@ -73,19 +83,19 @@
 
 def get_response_elems(license):
-    """Provides the metadata for query of specified parameters
+    """
+    Provides the metadata for query of specified parameters.
 
     Args:
-        license:
-            A string representing the type of license, and should be a segment
-            of its URL towards the license description. Alternatively, the
-            default None value stands for having no assumption about license
-            type.
+    - license (str): A string representing the type of license. It's a
+      segment of the URL towards the license description. If not provided,
+      it defaults to None, indicating no assumption about the license type.
 
     Returns:
-        dict: A dictionary mapping metadata to its value provided from the API
-        query of specified parameters.
+    - dict: A dictionary mapping metadata to its value provided from the API
+      query.
     """
     try:
+        # Make a request to the API and handle potential retries
         request_url = get_request_url(license)
         max_retries = Retry(
             total=5,
@@ -104,6 +114,7 @@
         return search_data_dict
     except Exception as e:
         if isinstance(e, requests.exceptions.HTTPError):
+            # If quota limit exceeded, switch to the next API key
             global API_KEYS_IND
             API_KEYS_IND += 1
             print(
@@ -115,20 +126,18 @@
 
 def set_up_data_file():
-    """Writes the header row to file to contain DeviantArt data."""
+    """Writes the header row to the file to contain DeviantArt data."""
     header_title = "LICENSE TYPE,Document Count"
     with open(DATA_WRITE_FILE, "w") as f:
         f.write(f"{header_title}\n")
 
 
 def record_license_data(license_type):
-    """Writes the row for LICENSE_TYPE to file to contain DeviantArt data.
+    """Writes the row for LICENSE_TYPE to the file to contain DeviantArt data.
 
     Args:
-        license_type:
-            A string representing the type of license, and should be a segment
-            of its URL towards the license description. Alternatively, the
-            default None value stands for having no assumption about license
-            type.
+    - license_type (str): A string representing the type of license. It's a
+      segment of the URL towards the license description. If not provided,
+      it defaults to None, indicating no assumption about the license type.
     """
     data_log = (
         f"{license_type},"
@@ -139,10 +148,14 @@
 
 
 def record_all_licenses():
-    """Records the data of all license types findable in the license list and
-    records these data into the DATA_WRITE_FILE as specified in that constant.
     """
+    Records the data for all available license types listed in the license
+    list and writes this data into the DATA_WRITE_FILE, as specified by the
+    constant.
+    """
+    # Get the list of license types
     license_list = get_license_list()
+    # Record data for each license type
     for license_type in license_list:
         record_license_data(license_type)
 
diff --git a/flickr/data_cleaning.py b/flickr/data_cleaning.py
index 286b5aa8..993cba29 100644
--- a/flickr/data_cleaning.py
+++ b/flickr/data_cleaning.py
@@ -20,9 +20,15 @@
 import pandas as pd
 
 
-def drop_empty_column(csv_path, new_csv_path):  # attribute is string
+def drop_empty_column(csv_path, new_csv_path):
+    """
+    Drops columns with 'Unnamed' in the name from the CSV file.
+    Args:
+    - csv_path (str): Path to the original CSV file.
+    - new_csv_path (str): Path to save the cleaned CSV file.
+    """
     df = pd.read_csv(csv_path)
-    for col in df.columns:  # to get the column list
+    for col in df.columns:
         if "Unnamed" in col:
             data = df.drop(col, axis=1)
             print("Dropping column", col)
@@ -30,22 +36,29 @@ def drop_empty_column(csv_path, new_csv_path):  # attribute is string
     print("Dropping empty columns")
 
 
-def drop_duplicate_id(csv_path, new_csv_path):  # attribute is string
+def drop_duplicate_id(csv_path, new_csv_path):
+    """
+    Drops duplicate rows based on the 'id' column from the CSV file.
+
+    Args:
+    - csv_path (str): Path to the original CSV file.
+    - new_csv_path (str): Path to save the cleaned CSV file.
+    """
     df = pd.read_csv(csv_path)
     data = df.drop_duplicates(subset=["id"])
     data.to_csv(new_csv_path)
     print("Dropping duplicates")
 
 
-def save_new_data(
-    csv_path, column_name_list, new_csv_path
-):  # attribute is string
+def save_new_data(csv_path, column_name_list, new_csv_path):
     """
-    column_name_list must belongs to the
-    existing column names from original csv
-    csv_path is the path of original csv
-    This function generate a new dataframe
-    to save final data with useful columns
+    Saves specified columns from the original CSV file to a new CSV file.
+
+    Args:
+    - csv_path (str): Path to the original CSV file.
+    - column_name_list (list of str): List of column names to be saved
+      (belongs to the existing column names from original csv)
+    - new_csv_path (str): Path to save the new CSV file.
     """
     df = pd.read_csv(csv_path)
     new_df = pd.DataFrame()
diff --git a/flickr/photos.py b/flickr/photos.py
index 6f2d92c4..5e9cdf55 100644
--- a/flickr/photos.py
+++ b/flickr/photos.py
@@ -1,3 +1,8 @@
+"""
+Fetching photo information from Flickr API for photos under
+each Creative Commons license and saving the data into a JSON file
+"""
+
 # Standard library
 import json
 import os
@@ -9,24 +14,29 @@
 import flickrapi
 from dotenv import load_dotenv
 
+# Get the current working directory
 CWD = os.path.dirname(os.path.abspath(__file__))
+# Load environment variables
 dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
 load_dotenv(dotenv_path)
 
 
 def main():
+    # Initialize Flickr API instance
     flickr = flickrapi.FlickrAPI(
         os.getenv("FLICKR_API_KEY"),
         os.getenv("FLICKR_API_SECRET"),
         format="json",
     )
-    # use search method to pull general photo info under each cc license data
-    # saved in photos.json
+    # Dictionary to store photo data for each Creative Commons license
    dic = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 9: 0, 10: 0}
+    # Use search method to retrieve photo info for each license
+    # and store it in the dictionary
     for i in dic.keys():
         photosJson = flickr.photos.search(license=i, per_page=500)
         dic[i] = [json.loads(photosJson.decode("utf-8"))]
+    # Save the dictionary containing photo data to a JSON file
     with open(os.path.join(CWD, "photos.json"), "w") as json_file:
         json.dump(dic, json_file)
diff --git a/flickr/photos_detail.py b/flickr/photos_detail.py
index 16700f44..97679fb5 100644
--- a/flickr/photos_detail.py
+++ b/flickr/photos_detail.py
@@ -20,17 +20,26 @@
 import pandas as pd
 from dotenv import load_dotenv
 
+# Set up current working directory
 CWD = os.path.dirname(os.path.abspath(__file__))
+# Load environment variables
 dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
 load_dotenv(dotenv_path)
 
+# Global variable: Number of retries for error handling
 RETRIES = 0
 
 
 def to_df(datalist, namelist):
     """
-    this is to transform pulled and queried data into dataframe
-    by iterating through the list of columns
+    Transform data into a DataFrame.
+
+    Args:
+    - datalist (list): List of lists containing data.
+    - namelist (list): List of column names.
+
+    Returns:
+    - df (DataFrame): DataFrame constructed from the data.
     """
     df = [pd.DataFrame() for ind in range(len(datalist))]
     df = pd.DataFrame(datalist).transpose()
@@ -40,25 +49,32 @@
 
 def df_to_csv(temp_list, name_list, temp_csv, final_csv):
     """
-    This function is to save the data first into datafram and then csv
-    temp_csv is the csv that used for saving data every 100 seconds
-    temp_csv is set to prevent data from losing when script stops
-    final_csv is the final csv for one certain license
-    pd.concat(map...) means to merge temp CSV to final CSV
-    both temp_csv and final_csv should be path in form of string
-    note that the map(pd.read_csv) means overwrite data
-    so duplicate issue solved
+    Save data to a temporary CSV and then merge it into the final CSV.
+
+    Args:
+    - temp_list (list): List of lists containing the data to save.
+    - name_list (list): List of column names.
+    - temp_csv (str): Temporary CSV file path, used to save data every
+      100 seconds so it is not lost if the script stops.
+    - final_csv (str): Final CSV file path.
     """
     df = to_df(temp_list, name_list)
     df.to_csv(temp_csv)
+    # Merge temporary CSV with final CSV, ignoring index to avoid duplication
     df = pd.concat(map(pd.read_csv, [temp_csv, final_csv]), ignore_index=True)
     df.to_csv(final_csv)
 
 
 def creat_lisoflis(size):
     """
-    this is to create one list of list [[],[],[]] to save
-    all the columns with each column as a list
+    Create a list of lists [[], [], []] to save all the columns, with
+    each column as a list.
+
+    Args:
+    - size (int): Size of the list of lists.
+
+    Returns:
+    - temp_list (list): List of empty lists.
     """
     temp_list = [[] for i in range(size)]
     return temp_list
@@ -66,8 +82,11 @@
 
 def clean_saveas_csv(old_csv_str, new_csv_str):
     """
-    when iterating through all the data in one license
-    clean empty columns and save the csv to a new one
+    Clean empty columns and save the CSV to a new file.
+
+    Args:
+    - old_csv_str (str): Path to the old CSV file.
+    - new_csv_str (str): Path to the new CSV file.
     """
     data = pd.read_csv(old_csv_str, low_memory=False)
     for col in list(data.columns):
@@ -77,14 +96,30 @@
 
 
 def query_helper1(raw, part, detail, temp_list, index):
-    """Helper function 1 for query_data"""
-    # part and detail should be string
+    """
+    Helper function 1 for querying data.
+
+    Args:
+    - raw (dict): Raw data from API.
+    - part (str): Part of the data.
+    - detail (str): Detail to be queried.
+    - temp_list (list): List to store queried data.
+    - index (int): Index of the data in temp_list.
+    """
     queried_raw = raw["photo"][part][detail]
     yield queried_raw
 
 
 def query_helper2(raw, part, temp_list, index):
-    """Helper function 2 for query_data"""
+    """
+    Helper function 2 for querying data.
+
+    Args:
+    - raw (dict): Raw data from API.
+    - part (str): Part of the data.
+    - temp_list (list): List to store queried data.
+    - index (int): Index of the data in temp_list.
+    """
     # part should be string
     queried_raw = raw["photo"][part]
     yield queried_raw
@@ -92,12 +127,16 @@
 
 def query_data(raw_data, name_list, data_list):
     """
-    Function for querying the useful data
-    from raw pulled data
-    in our case useful data is supposed to be this
+    Query useful data from raw pulled data and store it in lists.
+    In our case useful data is supposed to be this
     name list: ["id", "dateuploaded", "isfavorite", "license", "realname",
     "location", "title", "description", "dates", "views",
     "comments", "tags"]
+
+    Args:
+    - raw_data (dict): Raw data from API.
+    - name_list (list): List of column names.
+    - data_list (list): List of lists to store data.
     """
     for a in range(0, len(name_list)):
         if (0 <= a < 4) or a == 9:
@@ -114,12 +153,9 @@
         elif a == 8:
             temp = query_helper1(raw_data, name_list[a], "taken", data_list, a)
             data_list[a].append(next(temp))
-
-        # some photo id has more than one sub ids included
-        # each corresponds to certain tag(s)
-        # therefore we save tags of each id as a list
-        # further clean/query may be needed in analyzing
-        # this column of data
+        # Each photo ID can have multiple tags,
+        # so we save the tags for each ID as a list.
+        # Further cleaning or analysis may be required for this data column.
         if a == 11:
             tags = raw_data["photo"]["tags"]["tag"]
             if tags:
@@ -135,10 +171,14 @@
 
 def page1_reset(final_csv, raw_data):
     """
-    change total equals to the total picture number under current license
-    everytime moving to the 1st page of a new license
-    and set the final CSV as empty if is at the 1st page
-    final_csv is the path in the form of string
+    Reset page count and update total picture count.
+
+    Args:
+    - final_csv (str): Path to the final CSV file.
+    - raw_data (dict): Raw data from API call.
+
+    Returns:
+    - int: Total number of pages.
     """
     data = pd.read_csv(final_csv, low_memory=False)
     for col in list(data.columns):
@@ -152,16 +192,17 @@ def main():
     record_txt_path = os.path.join(CWD, "rec.txt")
     hs_csv_path = os.path.join(CWD, "hs.csv")
 
+    # Initialize Flickr API instance
     flickr = flickrapi.FlickrAPI(
         os.getenv("FLICKR_API_KEY"),
         os.getenv("FLICKR_API_SECRET"),
         format="json",
     )
-    # below is the cc licenses list
+    # List of Creative Commons licenses
     license_list = [1, 2, 3, 4, 5, 6, 9, 10]
-    # we want to have these 11 columns of data saved in final csv
-    # name_lis is the header of final table
+    # List of column names for the final CSV
+    # name_list is the header of final table
     # temp_list is in the form of list within list, which saves the actual data
     # each internal list is a column: ie. temp_list[0] saves the data of id
     # number
@@ -180,31 +221,29 @@
     ]
     temp_list = creat_lisoflis(len(name_list))
 
-    # use rec txt to record j(current page), i(current license), and total
-    # every time iterating through one page of photos
-    # to pick up from where the script errors or stops
+    # Read current page, license, and total from record text file
+    # Resumes iteration from the last processed page if the script
+    # encounters errors or stops.
     with open(record_txt_path) as f:
         readed = f.read().split(" ")
-        j = int(readed[0])
-        i = int(readed[1])
-        total = int(readed[2])
+        j = int(readed[0])  # Current page
+        i = int(readed[1])  # Current license
+        total = int(readed[2])  # Total number of pages
 
     while i in license_list:
+        # Iterate through pages
         while j <= total:
-            # use search method to pull photo id in each license
+            # Use search method to pull photo ids for each license
            photosJson = flickr.photos.search(license=i, per_page=100, page=j)
             time.sleep(1)
             photos = json.loads(photosJson.decode("utf-8"))
             id = [x["id"] for x in photos["photos"]["photo"]]
 
-            # change total equals to the total picture number
-            # and set the final CSV as empty
+            # Reset total and clear final CSV if on the first page
             if j == 1:
                 total = page1_reset(final_csv_path, photos)
 
-            # use getInfo method to get more detailed photo
-            # info from inputting photo id
-            # and query data and save into list (temp_list)
-            # as columns of final dataset
+            # Use getInfo method to get detailed photo info from photo ids
+            # Query data and save into temp_list as columns of final dataset
             for index in range(0, len(id)):
                 detailJson = flickr.photos.getInfo(
                     license=i, photo_id=id[index]
                 )
@@ -238,18 +277,18 @@
                         RETRIES,
                     )
 
-            # save csv
+            # Save data to CSV
             df_to_csv(temp_list, name_list, hs_csv_path, final_csv_path)
 
-            # update j (the current page number in txt)
+            # Update current page in record text file
             with open(record_txt_path, "w") as f:
                 f.write(f"{j} {i} {total}")
 
-            # set list to empty everytime after saving the data into
-            # the csv file to prevent from saving duplicate data
+            # Clear temp_list every time after saving the data into
+            # the csv file to prevent duplication
             temp_list = creat_lisoflis(len(name_list))
 
-            # if current page has reached the max limit of total pages
-            # reset j to 1 and update i to the license in the dictionary
+            # If reached max limit of pages, reset j to 1 and
+            # update i to the license in the dictionary
             if j == total + 1 or j > total:
                 license_i_path = os.path.join(CWD, f"license{i}.csv")
                 clean_saveas_csv(final_csv_path, license_i_path)
@@ -260,8 +299,7 @@
             with open(record_txt_path, "w") as f:
                 f.write(f"{j} {i} {total}")
 
-            # below is to clear list everytime
-            # before rerun (to prevent duplicate)
+            # Clear temp_list before rerun to prevent duplication
             temp_list = creat_lisoflis(len(name_list))
             break
diff --git a/wikipedia/wikipedia_scratcher.py b/wikipedia/wikipedia_scratcher.py
index af4a0b2e..519e9960 100755
--- a/wikipedia/wikipedia_scratcher.py
+++ b/wikipedia/wikipedia_scratcher.py
@@ -23,7 +23,8 @@
 
 
 def get_wiki_langs():
-    """Provides the list of language to find Creative Commons usage data on.
+    """
+    Provides the list of languages to find Creative Commons usage data on.
 
     The codes represent the language codes defined by ISO 639-1 and ISO 639-3,
     and the decision of which language code to use is usually determined by the
@@ -31,24 +32,23 @@
     (https://en.wikipedia.org/wiki/List_of_Wikipedias#Wikipedia_edition_codes)
 
     Returns:
-        pd.DataFrame: A Dataframe containing information of each Wikipedia
-        language and its respective encoding on web address.
+    - pd.DataFrame: A Dataframe containing information of each Wikipedia
+      language and its respective encoding on web address.
     """
     return pd.read_csv(f"{CWD}/language-codes_csv.csv")
 
 
 def get_request_url(lang="en"):
-    """Provides the API Endpoint URL for specified parameter combinations.
+    """
+    Provides the API Endpoint URL for specified parameter combinations.
 
     Args:
-        lang:
-            A string representing the language that the search results are
-            presented in. Alternatively, the default value is by Wikipedia
-            customs "en".
+    - lang: A string representing the language that the search results are
+      presented in. Defaults to "en" per Wikipedia convention.
 
     Returns:
-        string: A string representing the API Endpoint URL for the query
-        specified by this function's parameters.
+    - string: A string representing the API Endpoint URL for the query
+      specified by this function's parameters.
     """
     base_url = (
         r"wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=statistics"
@@ -59,17 +59,16 @@
 
 def get_response_elems(language="en"):
-    """Provides the metadata for query of specified parameters
+    """
+    Provides the metadata for query of specified parameters.
 
     Args:
-        language:
-            A string representing the language that the search results are
-            presented in. Alternatively, the default value is by Wikipedia
-            customs "en".
+    - language: A string representing the language that the search results
+      are presented in. Defaults to "en" per Wikipedia convention.
 
     Returns:
-        dict: A dictionary mapping metadata to its value provided from the API
-        query of specified parameters.
+    - dict: A dictionary mapping metadata to its value provided from the API
+      query of specified parameters.
     """
     search_data = None
     try:
@@ -114,10 +113,8 @@ def record_lang_data(lang="en"):
     """Writes the row for LICENSE_TYPE to file to contain Google Query data.
 
     Args:
-        lang:
-            A string representing the language that the search results are
-            presented in. Alternatively, the default value is by Wikipedia
-            customs "en".
+    - lang: A string representing the language that the search results are
+      presented in. Defaults to "en" per Wikipedia convention.
     """
     response = get_response_elems(lang)
     if response != {}:
@@ -141,8 +138,8 @@
     Wikipedia texts are licensed under CC-BY-SA 3.0
 
     Returns:
-        pd.DataFrame: A DataFrame recording the number of CC-licensed documents
-        per search query of assumption.
+    - pd.DataFrame: A DataFrame recording the number of CC-licensed documents
+      per search query of assumption.
     """
     return pd.read_csv(DATA_WRITE_FILE).set_index("language")
diff --git a/youtube/youtube_scratcher.py b/youtube/youtube_scratcher.py
index 7d74b584..884bead3 100755
--- a/youtube/youtube_scratcher.py
+++ b/youtube/youtube_scratcher.py
@@ -16,12 +16,17 @@
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+# Get the current working directory
 CWD = os.path.dirname(os.path.abspath(__file__))
+# Load environment variables
 dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
 load_dotenv(dotenv_path)
+# Get the current date
 today = dt.datetime.today()
+# Get the YouTube API key
 API_KEY = os.getenv("YOUTUBE_API_KEY")
+# Set up file path for CSV report
 DATA_WRITE_FILE = (
     f"{CWD}" f"/data_youtube_{today.year}_{today.month}_{today.day}.csv"
 )
@@ -31,13 +36,14 @@
 
 
 def get_next_time_search_interval():
-    """Provides the next searching interval of time for Creative Commons
+    """
+    Provides the next searching interval of time for Creative Commons
     licensed video.
 
     Yields:
-        tuple: A tuple representing the time search interval currently dealt
-        via 2 RFC 3339 formatted date-time values (by YouTube API Standards),
-        and the current starting year and month of the interval.
+    - tuple: A tuple representing the time search interval currently dealt
+      via 2 RFC 3339 formatted date-time values (by YouTube API Standards),
+      and the current starting year and month of the interval.
     """
     cur_year, cur_month = 2009, 1
     while cur_year * 100 + cur_month <= today.year * 100 + today.month:
@@ -66,17 +72,18 @@
 
 
 def get_request_url(time=None):
-    """Provides the API Endpoint URL for specified parameter combinations.
+    """
+    Provides the API Endpoint URL for specified parameter combinations.
 
     Args:
-        time: A tuple indicating whether this query is related to video time
-        occurrence, and the time interval which it would like to investigate.
-        Defaults to None to indicate the query is not related to video time
-        occurrence.
+    - time: A tuple indicating whether this query is related to video time
+      occurrence, and the time interval which it would like to investigate.
+      Defaults to None to indicate the query is not related to video time
+      occurrence.
 
     Returns:
-        string: A string representing the API Endpoint URL for the query
-        specified by this function's parameters.
+    - string: A string representing the API Endpoint URL for the query
+      specified by this function's parameters.
     """
     base_url = (
         r"https://youtube.googleapis.com/youtube/v3/search?part=snippet"
@@ -92,17 +99,18 @@
 
 
 def get_response_elems(time=None):
-    """Provides the metadata for query of specified parameters
+    """
+    Provides the metadata for query of specified parameters.
 
     Args:
-        time: A tuple indicating whether this query is related to video time
-        occurrence, and the time interval which it would like to investigate.
-        Defaults to None to indicate the query is not related to video time
-        occurrence.
+    - time: A tuple indicating whether this query is related to video time
+      occurrence, and the time interval which it would like to investigate.
+      Defaults to None to indicate the query is not related to video time
+      occurrence.
 
     Returns:
-        dict: A dictionary mapping metadata to its value provided from the API
-        query of specified parameters.
+    - dict: A dictionary mapping metadata to its value provided from the API
+      query of specified parameters.
     """
     search_data = None
     try:
@@ -114,6 +122,7 @@
         )
         session = requests.Session()
         session.mount("https://", HTTPAdapter(max_retries=max_retries))
+        # Send GET request to YouTube API
         with session.get(request_url) as response:
            response.raise_for_status()
             search_data = response.json()
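
One pattern recurs across the deviantart_scratcher.py and youtube_scratcher.py hunks above: each query goes through a requests.Session mounted with a urllib3 Retry adapter, and the DeviantArt scraper additionally advances to the next Google API key when a request fails with an HTTP error. The sketch below condenses that pattern into a single self-contained function; the placeholder key list, the params={"key": ...} argument, and the re-raise on failure are illustrative assumptions rather than the repository's exact code.

# Minimal sketch of the retry-and-key-rotation pattern, under the assumptions
# stated above; not the repository's exact implementation.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

API_KEYS = ["key-one", "key-two"]  # hypothetical keys; real ones come from .env
API_KEYS_IND = 0


def query_with_retries(base_url):
    """Fetch JSON from base_url, retrying transient failures and rotating
    to the next API key when the current one is rejected."""
    global API_KEYS_IND
    # Retry transient/server errors with exponential backoff, as the
    # scrapers do.
    max_retries = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=[403, 408, 429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=max_retries))
    try:
        with session.get(
            base_url, params={"key": API_KEYS[API_KEYS_IND]}
        ) as response:
            response.raise_for_status()
            return response.json()
    except requests.exceptions.HTTPError:
        # Quota exhausted or key rejected: advance to the next key so the
        # caller can rebuild the request URL and try again.
        API_KEYS_IND += 1
        raise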