Skip to content

Commit

Permalink
Merge pull request #96 from IamMQaisar/shared_module
Browse files Browse the repository at this point in the history
Fixed: Refactor scripts (#91), add logging (#85), and clean up code and comments (#86) by moving shared functionality into a shared library/module
  • Loading branch information
TimidRobot authored Apr 2, 2024
2 parents f2c4b4c + b3d5e21 commit 8412423
Show file tree
Hide file tree
Showing 18 changed files with 44,437 additions and 1,835 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ directories to check:
- [pypa/pipenv][pipenv]: _Python Development Workflow for Humans._
- [pre-commit][pre-commit]: _A framework for managing and maintaining
multi-language pre-commit hooks._
- [Logging][logging]: _Built-in Python logging module to implement a flexible logging system across shared modules._
- [Logging][logging]: _Utilize the built-in Python logging module to implement a flexible logging system from a shared module._

[ccospyguide]: https://opensource.creativecommons.org/contributing-code/python-guidelines/
[black]: https://github.com/psf/black
[flake8]: https://github.com/PyCQA/flake8
[isort]: https://pycqa.github.io/isort/
[pipenv]: https://github.com/pypa/pipenv
[pre-commit]: https://pre-commit.com/
[logging]: https://docs.python.org/3/howto/logging.html
[logging]: https://docs.python.org/3/library/logging.html


### GitHub Actions
Expand Down
75 changes: 29 additions & 46 deletions analyze/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
"""

# Standard library
import logging
import os.path
import os
import re
import sys
import traceback
Expand All @@ -16,36 +15,17 @@
import pandas as pd
import plotly.express as px
import seaborn as sns

warnings.filterwarnings("ignore")

# Third-party
from wordcloud import STOPWORDS, WordCloud # noqa: E402

# Set the current working directory
PATH_WORK_DIR = os.path.dirname(os.path.abspath(__file__))

# Set the current working directory
CWD = os.path.dirname(os.path.abspath(__file__))

# Set up the logger
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)
sys.path.append(".")
# First-party/Local
import quantify # noqa: E402

# Define both the handler and the formatter
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
)

# Add formatter to the handler
handler.setFormatter(formatter)

# Add handler to the logger
LOG.addHandler(handler)
# Warning suppression /!\ Caution /!\
warnings.filterwarnings("ignore")

# Log the start of the script execution
LOG.info("Script execution started.")
# Setup PATH_WORK_DIR, and LOGGER using quantify.setup()
_, PATH_WORK_DIR, _, _, LOGGER = quantify.setup(__file__)


def tags_frequency(csv_path, column_names):
Expand All @@ -59,7 +39,7 @@ def tags_frequency(csv_path, column_names):
Example: ["tags", "description"]
"""
LOG.info("Generating word cloud based on tags.")
LOGGER.info("Generating word cloud based on tags.")

df = pd.read_csv(csv_path)
# Process each column containing tags
Expand All @@ -79,7 +59,7 @@ def tags_frequency(csv_path, column_names):
and str(row) != ""
and str(row) != "nan"
):
LOG.debug(f"Processing row: {row}")
LOGGER.debug(f"Processing row: {row}")
if "ChineseinUS.org" in str(row):
row = "ChineseinUS"
list2 += re.split(r"\s|(?<!\d)[,.](?!\d)", str(row))
Expand Down Expand Up @@ -168,7 +148,7 @@ def time_trend_helper(df):
Returns:
- DataFrame: DataFrame with counts of entries per year.
"""
LOG.info("Extracting year-wise count of entries.")
LOGGER.info("Extracting year-wise count of entries.")

year_list = []
for date_row in df["dates"][0:]:
Expand Down Expand Up @@ -196,7 +176,7 @@ def time_trend(csv_path):
Args:
- csv_path (str): Path to the CSV file.
"""
LOG.info("Generating time trend line graph.")
LOGGER.info("Generating time trend line graph.")

df = pd.read_csv(csv_path)
count_df = time_trend_helper(df)
Expand Down Expand Up @@ -239,7 +219,7 @@ def time_trend_compile_helper(yearly_count):
Returns:
- DataFrame: Filtered yearly count data.
"""
LOG.info("Filtering yearly trend data.")
LOGGER.info("Filtering yearly trend data.")

Years = np.arange(2018, 2023)
yearly_count["year"] = list(yearly_count.index)
Expand All @@ -249,7 +229,7 @@ def time_trend_compile_helper(yearly_count):
int(yearly_count["year"][num]) >= 2018
):
counts.append(yearly_count["Counts"][num])
LOG.info(f"{counts}")
LOGGER.info(f"{counts}")
final_yearly_count = pd.DataFrame(
list(zip(Years, counts)), columns=["Years", "Yearly_counts"]
)
Expand All @@ -260,7 +240,7 @@ def time_trend_compile():
"""
Compile yearly trends for different licenses and plot them.
"""
LOG.info("Compiling yearly trends for different licenses.")
LOGGER.info("Compiling yearly trends for different licenses.")

license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
Expand Down Expand Up @@ -319,7 +299,7 @@ def time_trend_compile():
yearly_count6 = time_trend_compile_helper(yearly_count6)
yearly_count9 = time_trend_compile_helper(yearly_count9)
yearly_count10 = time_trend_compile_helper(yearly_count10)
LOG.info(f"{yearly_count1}")
LOGGER.info(f"{yearly_count1}")

# Plot yearly trend for all licenses
plt.plot(
Expand Down Expand Up @@ -408,20 +388,22 @@ def view_compare_helper(df):
Returns:
- int: Maximum views.
"""
LOG.info("Calculating maximum views of pictures under a license.")
LOGGER.info("Calculating maximum views of pictures under a license.")

highest_view = int(max(df["views"]))
df = df.sort_values("views", ascending=False)
LOG.info(f"DataFrame sorted by views in descending order: {df}")
LOG.info(f"Maximum views found: {highest_view}")
LOGGER.info(f"DataFrame sorted by views in descending order: {df}")
LOGGER.info(f"Maximum views found: {highest_view}")
return highest_view


def view_compare():
"""
Compare maximum views of pictures under different licenses.
"""
LOG.info("Comparing maximum views of pictures under different licenses.")
LOGGER.info(
"Comparing maximum views of pictures under different licenses."
)

license1 = pd.read_csv(
os.path.join(PATH_WORK_DIR, "../flickr/dataset/cleaned_license1.csv")
Expand Down Expand Up @@ -461,7 +443,7 @@ def view_compare():
maxs = []
for lic in licenses:
maxs.append(view_compare_helper(lic))
LOG.info(f"{maxs}")
LOGGER.info(f"{maxs}")
# Create DataFrame to store license and their maximum views
temp_data = pd.DataFrame()
temp_data["Licenses"] = [
Expand Down Expand Up @@ -517,7 +499,9 @@ def total_usage():
"""
Generate a bar plot showing the total usage of different licenses.
"""
LOG.info("Generating bar plot showing total usage of different licenses.")
LOGGER.info(
"Generating bar plot showing total usage of different licenses."
)

# Reads the license total file as the input dataset
df = pd.read_csv(
Expand All @@ -538,15 +522,14 @@ def main():


if __name__ == "__main__":
# Exception Handling
try:
main()
except SystemExit as e:
LOG.error(f"System exit with code: {e.code}")
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOG.info("(130) Halted via KeyboardInterrupt.")
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}")
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)
84 changes: 36 additions & 48 deletions deviantart/deviantart_scratcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
data.
"""
# Standard library
import logging
import os
import sys
import traceback
Expand All @@ -20,56 +19,47 @@
# First-party/Local
import quantify # noqa: E402

PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup(
__file__
# Setup paths, Date and LOGGER using quantify.setup()
PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY, LOGGER = (
quantify.setup(__file__)
)

# Load environment variables
load_dotenv(PATH_DOTENV)

# Retrieve API keys
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")

# Global Variable for API_KEYS indexing
API_KEYS_IND = 0

# Gets API_KEYS and PSE_KEY from .env file
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
PSE_KEY = os.getenv("PSE_KEY")

# Set up file path for CSV report
DATA_WRITE_FILE = os.path.join(
PATH_WORK_DIR,
f"data_deviantart_"
f"/data_deviantart_"
f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv",
)
# Retrieve Programmable Search Engine key from environment variables
PSE_KEY = os.getenv("PSE_KEY")

# Set up the logger
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)

# Define both the handler and the formatter
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
)

# Add formatter to the handler
handler.setFormatter(formatter)

# Add handler to the logger
LOG.addHandler(handler)

# Log the start of the script execution
LOG.info("Script execution started.")
LOGGER.info("Script execution started.")


def get_license_list():
"""
Provides the list of license from 2018's record of Creative Commons.
Returns:
- np.array: An array containing all license types that should be
searched via Programmable Search Engine.
- np.array:
An np array containing all license types that should be searched
via Programmable Search Engine (PSE).
"""
LOG.info("Retrieving list of license from Creative Commons' record.")
LOGGER.info("Retrieving list of license from Creative Commons' record.")

# Read license data from file
cc_license_data = pd.read_csv(
os.path.join(PATH_WORK_DIR, "legal-tool-paths.txt"), header=None
f"{PATH_REPO_ROOT}/legal-tool-paths.txt", header=None
)
# Define regex pattern to extract license types
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
Expand All @@ -92,7 +82,9 @@ def get_request_url(license):
Returns:
- str: The API Endpoint URL for the query specified by parameters.
"""
LOG.info(f"Generating API Endpoint URL for specified license: {license}")
LOGGER.info(
f"Generating API Endpoint URL for specified license: {license}"
)

try:
api_key = API_KEYS[API_KEYS_IND]
Expand All @@ -104,7 +96,7 @@ def get_request_url(license):
)
except Exception as e:
if isinstance(e, IndexError):
LOG.exception("Depleted all API Keys provided")
LOGGER.error("Depleted all API Keys provided")
else:
raise e

Expand All @@ -121,7 +113,7 @@ def get_response_elems(license):
- dict: A dictionary mapping metadata to its value provided from the API
query.
"""
LOG.info("Making a request to the API and handling potential retries.")
LOGGER.info("Making a request to the API and handling potential retries.")

try:
# Make a request to the API and handle potential retries
Expand All @@ -146,16 +138,14 @@ def get_response_elems(license):
# If quota limit exceeded, switch to the next API key
global API_KEYS_IND
API_KEYS_IND += 1
LOG.exception("Changing API KEYS due to depletion of quota")
LOGGER.error("Changing API KEYS due to depletion of quota")
return get_response_elems(license)
else:
raise e


def set_up_data_file():
"""Writes the header row to the file to contain DeviantArt data."""
LOG.info("Setting up data file by writing the header row.")

# Writes the header row to the file to contain DeviantArt data.
header_title = "LICENSE TYPE,Document Count"
with open(DATA_WRITE_FILE, "w") as f:
f.write(f"{header_title}\n")
Expand All @@ -164,11 +154,13 @@ def set_up_data_file():
def record_license_data(license_type):
"""Writes the row for LICENSE_TYPE to the file to contain DeviantArt data.
Args:
- license_type(str): A string representing the type of license.
It's a segment of the URL towards the license description. If not provided,
it defaults to None, indicating no assumption about the license type.
- license_type:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
"""
LOG.info(
LOGGER.info(
"Writing the row for license type %s to contain DeviantArt data",
license_type,
)
Expand All @@ -187,11 +179,8 @@ def record_all_licenses():
list and writes this data into the DATA_WRITE_FILE, as specified by the
constant.
"""
LOG.info("Recording data for all available license types.")

# Get the list of license types
# Gets the list of license types and record data for each license type
license_list = get_license_list()
# Record data for each license types
for license_type in license_list:
record_license_data(license_type)

Expand All @@ -202,15 +191,14 @@ def main():


if __name__ == "__main__":
# Exception Handling
try:
main()
except SystemExit as e:
LOG.error(f"System exit with code: {e.code}")
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOG.info("(130) Halted via KeyboardInterrupt.")
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}")
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)
Loading

0 comments on commit 8412423

Please sign in to comment.