Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding comments in the codebase #73

Merged
merged 7 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions analyze/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,10 @@ def time_trend_helper(df):
Extract year-wise count of entries from a DataFrame.

Args:
df (DataFrame): Input DataFrame containing dates.
- df (DataFrame): Input DataFrame containing dates.

Returns:
DataFrame: DataFrame with counts of entries per year.
- DataFrame: DataFrame with counts of entries per year.
"""
year_list = []
for date_row in df["dates"][0:]:
Expand All @@ -167,7 +167,7 @@ def time_trend(csv_path):
Generate a line graph to show the time trend of the license usage.

Args:
csv_path (str): Path to the CSV file.
- csv_path (str): Path to the CSV file.
"""
df = pd.read_csv(csv_path)
count_df = time_trend_helper(df)
Expand Down Expand Up @@ -205,10 +205,10 @@ def time_trend_compile_helper(yearly_count):
Filter yearly trend data for the years between 2018 and 2022.

Args:
yearly_count (DataFrame): DataFrame with "year" and "Counts" columns.
- yearly_count (DataFrame): DataFrame with "year" and "Counts" columns.

Returns:
DataFrame: Filtered yearly count data.
- DataFrame: Filtered yearly count data.
"""
Years = np.arange(2018, 2023)
yearly_count["year"] = list(yearly_count.index)
Expand Down Expand Up @@ -370,10 +370,10 @@ def view_compare_helper(df):
Calculate maximum views of pictures under a license.

Args:
df (DataFrame): Input DataFrame.
- df (DataFrame): Input DataFrame.

Returns:
int: Maximum views.
- int: Maximum views.
"""
highest_view = int(max(df["views"]))
df = df.sort_values("views", ascending=False)
Expand Down
65 changes: 39 additions & 26 deletions deviantart/deviantart_scratcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,36 @@
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Set up current working directory
CWD = os.path.dirname(os.path.abspath(__file__))
# Load environment variables
dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
load_dotenv(dotenv_path)

# Get the current date
today = dt.datetime.today()
# Retrieve API keys
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
API_KEYS_IND = 0
# Set up file path for CSV report
DATA_WRITE_FILE = (
f"{CWD}" f"/data_deviantart_{today.year}_{today.month}_{today.day}.csv"
)
# Retrieve Programmable Search Engine key from environment variables
PSE_KEY = os.getenv("PSE_KEY")


def get_license_list():
"""Provides the list of license from 2018's record of Creative Commons.
"""
Provides the list of license from 2018's record of Creative Commons.

Returns:
np.array: An np array containing all license types that should be
searched via Programmable Search Engine.
- np.array: An array containing all license types that should be
searched via Programmable Search Engine.
"""
# Read license data from file
cc_license_data = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
# Define regex pattern to extract license types
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
license_list = (
cc_license_data[0]
Expand All @@ -48,14 +58,14 @@ def get_license_list():


def get_request_url(license):
"""Provides the API Endpoint URL for specified parameter combinations.
"""
Provides the API Endpoint URL for specified parameter combinations.
Args:
license:
A string representing the type of license, and should be a segment
of its URL towards the license description.
- license (str): A string representing the type of license. It's a
segment of the URL towards the license description.

Returns:
string: A string representing the API Endpoint URL for the query
specified by this function's parameters.
- str: The API Endpoint URL for the query specified by parameters.
"""
try:
api_key = API_KEYS[API_KEYS_IND]
Expand All @@ -73,19 +83,19 @@ def get_request_url(license):


def get_response_elems(license):
"""Provides the metadata for query of specified parameters
"""
Provides the metadata for query of specified parameters
Args:
license:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
- license (str): A string representing the type of license.
It's a segment of the URL towards the license description. If not provided,
it defaults to None, indicating no assumption about the license type.

Returns:
dict: A dictionary mapping metadata to its value provided from the API
query of specified parameters.
- dict: A dictionary mapping metadata to its value provided from the API
query.
"""
try:
# Make a request to the API and handle potential retries
request_url = get_request_url(license)
max_retries = Retry(
total=5,
Expand All @@ -104,6 +114,7 @@ def get_response_elems(license):
return search_data_dict
except Exception as e:
if isinstance(e, requests.exceptions.HTTPError):
# If quota limit exceeded, switch to the next API key
global API_KEYS_IND
API_KEYS_IND += 1
print(
Expand All @@ -115,20 +126,18 @@ def get_response_elems(license):


def set_up_data_file():
    """Create the CSV report at DATA_WRITE_FILE and write its header row."""
    with open(DATA_WRITE_FILE, "w") as f:
        f.write("LICENSE TYPE,Document Count\n")


def record_license_data(license_type):
"""Writes the row for LICENSE_TYPE to file to contain DeviantArt data.
"""Writes the row for LICENSE_TYPE to the file to contain DeviantArt data.
Args:
license_type:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
- license_type(str): A string representing the type of license.
It's a segment of the URL towards the license description. If not provided,
it defaults to None, indicating no assumption about the license type.
"""
data_log = (
f"{license_type},"
Expand All @@ -139,10 +148,14 @@ def record_license_data(license_type):


def record_all_licenses():
    """
    Iterate over every license type returned by get_license_list() and
    append its document count to the CSV report at DATA_WRITE_FILE.
    """
    # One row per license type, written by the per-license helper
    for license_type in get_license_list():
        record_license_data(license_type)

Expand Down
35 changes: 24 additions & 11 deletions flickr/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,45 @@
import pandas as pd


def drop_empty_column(csv_path, new_csv_path):
    """
    Drop every column whose name contains 'Unnamed' from the CSV file.

    Args:
        csv_path (str): Path to the original CSV file.
        new_csv_path (str): Path to save the cleaned CSV file.
    """
    df = pd.read_csv(csv_path)
    # Collect all matching columns first, then drop them in one call.
    # The previous loop re-derived `data` from the original `df` on every
    # iteration, so only the LAST 'Unnamed' column was actually removed,
    # and `data` was undefined when no column matched.
    unnamed_cols = [col for col in df.columns if "Unnamed" in col]
    for col in unnamed_cols:
        print("Dropping column", col)
    df.drop(columns=unnamed_cols).to_csv(new_csv_path)
    print("Dropping empty columns")


def drop_duplicate_id(csv_path, new_csv_path):
    """
    Remove rows that repeat an 'id' value, keeping the first occurrence.

    Args:
        csv_path (str): Path to the original CSV file.
        new_csv_path (str): Path to save the cleaned CSV file.
    """
    deduplicated = pd.read_csv(csv_path).drop_duplicates(subset=["id"])
    deduplicated.to_csv(new_csv_path)
    print("Dropping duplicates")


def save_new_data(
csv_path, column_name_list, new_csv_path
): # attribute is string
def save_new_data(csv_path, column_name_list, new_csv_path):
"""
column_name_list must belongs to the
existing column names from original csv
csv_path is the path of original csv
This function generate a new dataframe
to save final data with useful columns
Saves specified columns from the original CSV file to a new CSV file.

Args:
- csv_path (str): Path to the original CSV file.
- column_name_list (list of str): List of column names to be saved
(belongs to the existing column names from original csv)
- new_csv_path (str): Path to save the new CSV file.
"""
df = pd.read_csv(csv_path)
new_df = pd.DataFrame()
Expand Down
14 changes: 12 additions & 2 deletions flickr/photos.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
Fetching photo information from Flickr API for photos under
each Creative Commons license and saving the data into a JSON file
"""

# Standard library
import json
import os
Expand All @@ -9,24 +14,29 @@
import flickrapi
from dotenv import load_dotenv

# Get the current working directory
CWD = os.path.dirname(os.path.abspath(__file__))
# Load environment variables
dotenv_path = os.path.join(os.path.dirname(CWD), ".env")
load_dotenv(dotenv_path)


def main():
    """Query the Flickr API for photos under each Creative Commons
    license and save the raw search results to photos.json.
    """
    # Initialize the Flickr API client; credentials come from the
    # environment (.env loaded at module import)
    flickr = flickrapi.FlickrAPI(
        os.getenv("FLICKR_API_KEY"),
        os.getenv("FLICKR_API_SECRET"),
        format="json",
    )

    # Keys are Flickr license ids (presumably the CC license codes —
    # verify against the Flickr licenses.getInfo endpoint); each value is
    # replaced below by the parsed search results for that license.
    dic = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 9: 0, 10: 0}
    # Pull general photo info for each license via the search method,
    # 500 results per page, and store the decoded JSON in the dictionary
    for i in dic.keys():
        photosJson = flickr.photos.search(license=i, per_page=500)
        dic[i] = [json.loads(photosJson.decode("utf-8"))]
    # Persist the collected photo data next to this script
    with open(os.path.join(CWD, "photos.json"), "w") as json_file:
        json.dump(dic, json_file)

Expand Down
Loading
Loading