Skip to content

Commit

Permalink
Merge pull request #96 from IamMQaisar/shared_module
Browse files Browse the repository at this point in the history
Fixed: Refactor scripts (#91), add logging (#85), and clean up code and comments (#86) by moving shared functionality into a shared library/module
  • Loading branch information
TimidRobot authored Apr 2, 2024
2 parents f2c4b4c + b3d5e21 commit 8412423
Show file tree
Hide file tree
Showing 18 changed files with 44,437 additions and 1,835 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ directories to check:
- [pypa/pipenv][pipenv]: _Python Development Workflow for Humans._
- [pre-commit][pre-commit]: _A framework for managing and maintaining
multi-language pre-commit hooks._
- [Logging][logging]: _Built-in Python logging module to implement a flexible logging system across shared modules._
- [Logging][logging]: _Utilize the built-in Python logging module to implement a flexible logging system from a shared module._

[ccospyguide]: https://opensource.creativecommons.org/contributing-code/python-guidelines/
[black]: https://github.com/psf/black
[flake8]: https://github.com/PyCQA/flake8
[isort]: https://pycqa.github.io/isort/
[pipenv]: https://github.com/pypa/pipenv
[pre-commit]: https://pre-commit.com/
[logging]: https://docs.python.org/3/howto/logging.html
[logging]: https://docs.python.org/3/library/logging.html


### GitHub Actions
Expand Down
75 changes: 29 additions & 46 deletions analyze/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
"""

# Standard library
import logging
import os.path
import os
import re
import sys
import traceback
Expand All @@ -16,36 +15,17 @@
import pandas as pd
import plotly.express as px
import seaborn as sns

warnings.filterwarnings("ignore")

# Third-party
from wordcloud import STOPWORDS, WordCloud # noqa: E402

# Set the current working directory
PATH_WORK_DIR = os.path.dirname(os.path.abspath(__file__))

# Set the current working directory
CWD = os.path.dirname(os.path.abspath(__file__))

# Set up the logger
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)
sys.path.append(".")
# First-party/Local
import quantify # noqa: E402

# Define both the handler and the formatter
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
)

# Add formatter to the handler
handler.setFormatter(formatter)

# Add handler to the logger
LOG.addHandler(handler)
# Warning suppression /!\ Caution /!\
warnings.filterwarnings("ignore")

# Log the start of the script execution
LOG.info("Script execution started.")
# Setup PATH_WORK_DIR, and LOGGER using quantify.setup()
_, PATH_WORK_DIR, _, _, LOGGER = quantify.setup(__file__)


def tags_frequency(csv_path, column_names):
Expand All @@ -59,7 +39,7 @@ def tags_frequency(csv_path, column_names):
Example: ["tags", "description"]
"""
LOG.info("Generating word cloud based on tags.")
LOGGER.info("Generating word cloud based on tags.")

df = pd.read_csv(csv_path)
# Process each column containing tags
Expand All @@ -79,7 +59,7 @@ def tags_frequency(csv_path, column_names):
and str(row) != ""
and str(row) != "nan"
):
LOG.debug(f"Processing row: {row}")
LOGGER.debug(f"Processing row: {row}")
if "ChineseinUS.org" in str(row):
row = "ChineseinUS"
list2 += re.split(r"\s|(?<!\d)[,.](?!\d)", str(row))
Expand Down Expand Up @@ -168,7 +148,7 @@ def time_trend_helper(df):
Returns:
- DataFrame: DataFrame with counts of entries per year.
"""
LOG.info("Extracting year-wise count of entries.")
LOGGER.info("Extracting year-wise count of entries.")

year_list = []
for date_row in df["dates"][0:]:
Expand Down Expand Up @@ -196,7 +176,7 @@ def time_trend(csv_path):
Args:
- csv_path (str): Path to the CSV file.
"""
LOG.info("Generating time trend line graph.")
LOGGER.info("Generating time trend line graph.")

df = pd.read_csv(csv_path)
count_df = time_trend_helper(df)
Expand Down Expand Up @@ -239,7 +219,7 @@ def time_trend_compile_helper(yearly_count):
Returns:
- DataFrame: Filtered yearly count data.
"""
LOG.info("Filtering yearly trend data.")
LOGGER.info("Filtering yearly trend data.")

Years = np.arange(2018, 2023)
yearly_count["year"] = list(yearly_count.index)
Expand All @@ -249,7 +229,7 @@ def time_trend_compile_helper(yearly_count):
int(yearly_count["year"][num]) >= 2018
):
counts.append(yearly_count["Counts"][num])
LOG.info(f"{counts}")
LOGGER.info(f"{counts}")
final_yearly_count = pd.DataFrame(
list(zip(Years, counts)), columns=["Years", "Yearly_counts"]
)
Expand All @@ -260,7 +240,7 @@ def time_trend_compile():
"""
Compile yearly trends for different licenses and plot them.
"""
LOG.info("Compiling yearly trends for different licenses.")
LOGGER.info("Compiling yearly trends for different licenses.")

license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
Expand Down Expand Up @@ -319,7 +299,7 @@ def time_trend_compile():
yearly_count6 = time_trend_compile_helper(yearly_count6)
yearly_count9 = time_trend_compile_helper(yearly_count9)
yearly_count10 = time_trend_compile_helper(yearly_count10)
LOG.info(f"{yearly_count1}")
LOGGER.info(f"{yearly_count1}")

# Plot yearly trend for all licenses
plt.plot(
Expand Down Expand Up @@ -408,20 +388,22 @@ def view_compare_helper(df):
Returns:
- int: Maximum views.
"""
LOG.info("Calculating maximum views of pictures under a license.")
LOGGER.info("Calculating maximum views of pictures under a license.")

highest_view = int(max(df["views"]))
df = df.sort_values("views", ascending=False)
LOG.info(f"DataFrame sorted by views in descending order: {df}")
LOG.info(f"Maximum views found: {highest_view}")
LOGGER.info(f"DataFrame sorted by views in descending order: {df}")
LOGGER.info(f"Maximum views found: {highest_view}")
return highest_view


def view_compare():
"""
Compare maximum views of pictures under different licenses.
"""
LOG.info("Comparing maximum views of pictures under different licenses.")
LOGGER.info(
"Comparing maximum views of pictures under different licenses."
)

license1 = pd.read_csv(
os.path.join(PATH_WORK_DIR, "../flickr/dataset/cleaned_license1.csv")
Expand Down Expand Up @@ -461,7 +443,7 @@ def view_compare():
maxs = []
for lic in licenses:
maxs.append(view_compare_helper(lic))
LOG.info(f"{maxs}")
LOGGER.info(f"{maxs}")
# Create DataFrame to store license and their maximum views
temp_data = pd.DataFrame()
temp_data["Licenses"] = [
Expand Down Expand Up @@ -517,7 +499,9 @@ def total_usage():
"""
Generate a bar plot showing the total usage of different licenses.
"""
LOG.info("Generating bar plot showing total usage of different licenses.")
LOGGER.info(
"Generating bar plot showing total usage of different licenses."
)

# Reads the license total file as the input dataset
df = pd.read_csv(
Expand All @@ -538,15 +522,14 @@ def main():


if __name__ == "__main__":
# Exception Handling
try:
main()
except SystemExit as e:
LOG.error(f"System exit with code: {e.code}")
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOG.info("(130) Halted via KeyboardInterrupt.")
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}")
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)
84 changes: 36 additions & 48 deletions deviantart/deviantart_scratcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
data.
"""
# Standard library
import logging
import os
import sys
import traceback
Expand All @@ -20,56 +19,47 @@
# First-party/Local
import quantify # noqa: E402

PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY = quantify.setup(
__file__
# Setup paths, Date and LOGGER using quantify.setup()
PATH_REPO_ROOT, PATH_WORK_DIR, PATH_DOTENV, DATETIME_TODAY, LOGGER = (
quantify.setup(__file__)
)

# Load environment variables
load_dotenv(PATH_DOTENV)

# Retrieve API keys
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")

# Global Variable for API_KEYS indexing
API_KEYS_IND = 0

# Gets API_KEYS and PSE_KEY from .env file
API_KEYS = os.getenv("GOOGLE_API_KEYS").split(",")
PSE_KEY = os.getenv("PSE_KEY")

# Set up file path for CSV report
DATA_WRITE_FILE = os.path.join(
PATH_WORK_DIR,
f"data_deviantart_"
f"/data_deviantart_"
f"{DATETIME_TODAY.year}_{DATETIME_TODAY.month}_{DATETIME_TODAY.day}.csv",
)
# Retrieve Programmable Search Engine key from environment variables
PSE_KEY = os.getenv("PSE_KEY")

# Set up the logger
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)

# Define both the handler and the formatter
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
)

# Add formatter to the handler
handler.setFormatter(formatter)

# Add handler to the logger
LOG.addHandler(handler)

# Log the start of the script execution
LOG.info("Script execution started.")
LOGGER.info("Script execution started.")


def get_license_list():
"""
Provides the list of license from 2018's record of Creative Commons.
Returns:
- np.array: An array containing all license types that should be
searched via Programmable Search Engine.
- np.array:
An np array containing all license types that should be searched
via Programmable Search Engine (PSE).
"""
LOG.info("Retrieving list of license from Creative Commons' record.")
LOGGER.info("Retrieving list of license from Creative Commons' record.")

# Read license data from file
cc_license_data = pd.read_csv(
os.path.join(PATH_WORK_DIR, "legal-tool-paths.txt"), header=None
f"{PATH_REPO_ROOT}/legal-tool-paths.txt", header=None
)
# Define regex pattern to extract license types
license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
Expand All @@ -92,7 +82,9 @@ def get_request_url(license):
Returns:
- str: The API Endpoint URL for the query specified by parameters.
"""
LOG.info(f"Generating API Endpoint URL for specified license: {license}")
LOGGER.info(
f"Generating API Endpoint URL for specified license: {license}"
)

try:
api_key = API_KEYS[API_KEYS_IND]
Expand All @@ -104,7 +96,7 @@ def get_request_url(license):
)
except Exception as e:
if isinstance(e, IndexError):
LOG.exception("Depleted all API Keys provided")
LOGGER.error("Depleted all API Keys provided")
else:
raise e

Expand All @@ -121,7 +113,7 @@ def get_response_elems(license):
- dict: A dictionary mapping metadata to its value provided from the API
query.
"""
LOG.info("Making a request to the API and handling potential retries.")
LOGGER.info("Making a request to the API and handling potential retries.")

try:
# Make a request to the API and handle potential retries
Expand All @@ -146,16 +138,14 @@ def get_response_elems(license):
# If quota limit exceeded, switch to the next API key
global API_KEYS_IND
API_KEYS_IND += 1
LOG.exception("Changing API KEYS due to depletion of quota")
LOGGER.error("Changing API KEYS due to depletion of quota")
return get_response_elems(license)
else:
raise e


def set_up_data_file():
"""Writes the header row to the file to contain DeviantArt data."""
LOG.info("Setting up data file by writing the header row.")

# Writes the header row to the file to contain DeviantArt data.
header_title = "LICENSE TYPE,Document Count"
with open(DATA_WRITE_FILE, "w") as f:
f.write(f"{header_title}\n")
Expand All @@ -164,11 +154,13 @@ def set_up_data_file():
def record_license_data(license_type):
"""Writes the row for LICENSE_TYPE to the file to contain DeviantArt data.
Args:
- license_type(str): A string representing the type of license.
It's a segment of the URL towards the license description. If not provided,
it defaults to None, indicating no assumption about the license type.
- license_type:
A string representing the type of license, and should be a segment
of its URL towards the license description. Alternatively, the
default None value stands for having no assumption about license
type.
"""
LOG.info(
LOGGER.info(
"Writing the row for license type %s to contain DeviantArt data",
license_type,
)
Expand All @@ -187,11 +179,8 @@ def record_all_licenses():
list and writes this data into the DATA_WRITE_FILE, as specified by the
constant.
"""
LOG.info("Recording data for all available license types.")

# Get the list of license types
# Gets the list of license types and record data for each license type
license_list = get_license_list()
# Record data for each license types
for license_type in license_list:
record_license_data(license_type)

Expand All @@ -202,15 +191,14 @@ def main():


if __name__ == "__main__":
# Exception Handling
try:
main()
except SystemExit as e:
LOG.error(f"System exit with code: {e.code}")
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOG.info("(130) Halted via KeyboardInterrupt.")
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOG.error(f"(1) Unhandled exception: {traceback.format_exc()}")
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)
Loading

0 comments on commit 8412423

Please sign in to comment.