Enhancements to Repo #10

Open · wants to merge 1 commit into base: main
4 changes: 3 additions & 1 deletion data/generate_dataset_for_demo.py
@@ -6,7 +6,9 @@

from .prob_distribution import ProbDistribution


""" Jon:
Typo in the function name.
"""
def laod_data(dataset_path: str, features: list, no_rows: int) -> pd.DataFrame:
"""Loads the dataset from `dataset_path`, selects the `features`, and keeps at most `no_rows` rows.

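Jon's fix here is mechanical: rename `laod_data` to `load_data`. A minimal sketch of the corrected function, assuming the body simply reads a CSV, selects the columns, and slices the rows (the actual body is not shown in the diff):

```python
import pandas as pd


def load_data(dataset_path: str, features: list, no_rows: int) -> pd.DataFrame:
    """Load the dataset from `dataset_path`, select `features`, and keep at most `no_rows` rows."""
    df = pd.read_csv(dataset_path)
    return df[features].head(no_rows)
```

Renaming also means updating every call site, which is why catching the typo early in review is cheap.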
5 changes: 4 additions & 1 deletion data/get_data.py
@@ -6,7 +6,10 @@
import gdown
import pandas as pd


""" Jon:
Utility functions like this shouldn't be doing any form of IO.
They should return what they produce, and IO should be handled a step above.
"""
def download_dataset(url: str, output: str) -> None:
"""Download the dataset using the gdown library.

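One way to apply Jon's "no IO in utilities" suggestion is to separate the pure transformation from the write step. The function names below are hypothetical illustrations, not code from the repo:

```python
import io

import pandas as pd


def parse_dataset(raw_csv: str) -> pd.DataFrame:
    """Pure step: turn raw CSV text into a DataFrame. No file or network IO here."""
    return pd.read_csv(io.StringIO(raw_csv))


def save_dataset(df: pd.DataFrame, output: str) -> None:
    """IO step, kept separate so the caller decides where (or whether) data is written."""
    df.to_csv(output, index=False)
```

Keeping the IO at the edges makes `parse_dataset` trivially unit-testable, which is usually the point of this kind of refactor.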
13 changes: 13 additions & 0 deletions download_dataset.py
@@ -2,6 +2,19 @@
from config.config import logger
from data.get_data import download_dataset, preprocess_dataset

""" Jon:
The code below should be split into several functions, with a 'main' function
that calls them. It's not best practice to put this logic directly inside the
`if __name__ == "__main__":` block.

You could also make `dataset_url` and `output` either parameters passed to the
script or, better, values stored in some form of configuration file.

It's worth considering whether the 'preprocess_dataset' function should be
called here at all: it mixes responsibilities, and you already have a
'prepare_dataset' file that deals with these sorts of things. This file should
only contain code to download the data.
"""
if __name__ == "__main__":
# Path to the dataset on google drive
dataset_url = (
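A sketch of the structure Jon describes for `download_dataset.py`. Here `download_dataset` is stubbed in place of the real import from `data.get_data`, and the config values are placeholders, not the repo's actual URL or path:

```python
def download_dataset(url: str, output: str) -> None:
    """Stub standing in for data.get_data.download_dataset."""
    print(f"downloading {url} -> {output}")


def get_config() -> dict:
    """Hypothetical: in practice these would come from a config file or CLI arguments."""
    return {"dataset_url": "https://example.com/dataset", "output": "data/raw.csv"}


def main() -> None:
    cfg = get_config()
    download_dataset(cfg["dataset_url"], cfg["output"])


if __name__ == "__main__":
    main()
```

With this shape, the `if __name__ == "__main__":` block contains a single call, and `main` can be imported and tested without triggering a download at import time.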
11 changes: 11 additions & 0 deletions prepare_dataset.py
@@ -10,6 +10,17 @@
generate_reference_data,
)

""" Jon:
This code should be more functional, with the steps split into separate
functions that return what the follow-on steps need. As with the
download_dataset feedback, the `if __name__ == "__main__":` block should only
call a 'main' function, which then runs the steps in sequence.

'features' should probably be a global constant or, better yet, live in a
configuration file. The same goes for `dataset_path` and `save_dir`: both
could be arguments passed to the script or configuration entries.
"""
if __name__ == "__main__":
logger.info("Generating reference data")
# same path where preprocessed dataset is stored by preprocess_dataset function in download_dataset.py
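The same pattern applied to `prepare_dataset.py`, as Jon suggests: each step is a function returning what the next step needs, and a `main` runs them in order. All names and config values below are hypothetical placeholders; the real steps (`generate_reference_data` etc.) are not shown in the diff:

```python
from dataclasses import dataclass, field


@dataclass
class Config:
    """Hypothetical config; the real values might live in a YAML file or config/config.py."""
    dataset_path: str = "data/preprocessed.csv"
    save_dir: str = "reference/"
    features: list = field(default_factory=lambda: ["feature_a", "feature_b"])


def load_inputs(cfg: Config) -> dict:
    # Placeholder for the step that reads the preprocessed dataset.
    return {"rows": []}


def build_reference(data: dict, cfg: Config) -> dict:
    # Placeholder for generate_reference_data; returns its result
    # instead of writing it to disk itself.
    return {"reference": data["rows"], "features": cfg.features}


def main() -> None:
    cfg = Config()
    data = load_inputs(cfg)
    reference = build_reference(data, cfg)
    # IO happens last, at the top level.
    print(f"saving {len(reference['features'])} features to {cfg.save_dir}")


if __name__ == "__main__":
    main()
```

Because each step takes the config explicitly, swapping `dataset_path` or `features` for script arguments later is a one-line change in `main`.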