diff --git a/pycytominer/cyto_utils/DeepProfiler_processing.py b/pycytominer/cyto_utils/DeepProfiler_processing.py index 30f5c75d..254f4291 100644 --- a/pycytominer/cyto_utils/DeepProfiler_processing.py +++ b/pycytominer/cyto_utils/DeepProfiler_processing.py @@ -211,7 +211,7 @@ def setup_aggregate(self): def aggregate_deep(self): """ - Main function of this class. Aggregates the profiles into a pandas dataframe. + Aggregate the DeepProfiler profiles into a pandas dataframe. For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated. If files are missing, we throw a warning but continue the code. @@ -303,7 +303,9 @@ def aggregate_deep(self): class SingleCellDeepProfiler: - """This class holds functions needed to analyze single cells from the DeepProfiler (DP) run. Only pycytominer.normalization() is implemented. + """Class that holds functions needed to analyze single cells from the DeepProfiler (DP) run. + + Only pycytominer.normalization() is implemented. Attributes ---------- @@ -352,9 +354,9 @@ def __init__( def get_single_cells( self, output=False, location_x_col_index=0, location_y_col_index=1 ): - """ - Sets up the single_cells attribute or output as a variable. This is a helper function to normalize_deep_single_cells(). - single_cells is a pandas dataframe in the format expected by pycytominer.normalize(). + """Set up a single_cells dataframe in the format expected by pycytominer.normalize(). + + Helper function to normalize_deep_single_cells() that sets up the single_cells attribute or outputs it as a dataframe. Arguments ----------- @@ -409,7 +411,7 @@ def normalize_deep_single_cells( spherize_epsilon=1e-6, ): """ - Normalizes all cells into a pandas dataframe. + Normalize all cells into a pandas dataframe. For each file in the DP project features folder, the features from each cell are loaded. These features are put into a profiles dataframe for use in pycytominer.normalize. 
diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py index 25456868..f9a0407e 100644 --- a/pycytominer/cyto_utils/__init__.py +++ b/pycytominer/cyto_utils/__init__.py @@ -1,3 +1,5 @@ +"""A variety of utility functions for working with cytominer data.""" + from .output import output from .util import ( check_compartments, diff --git a/pycytominer/cyto_utils/annotate_custom.py b/pycytominer/cyto_utils/annotate_custom.py index c1259c87..d06edb01 100644 --- a/pycytominer/cyto_utils/annotate_custom.py +++ b/pycytominer/cyto_utils/annotate_custom.py @@ -1,3 +1,5 @@ +"""Functions to annotate data frames with custom options according to CMAP specifications.""" + import numpy as np diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index a3059511..04ee9077 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -12,8 +12,7 @@ class CellLocation: - """This class holds all the functions augment a metadata file with X,Y - locations of cells in each image. + """Class holding all the functions to augment a metadata file with X,Y locations of cells in each image. In the metadata file, which is either a CSV or a Parquet file, - Each row is single multi-channel image @@ -376,6 +375,7 @@ def _load_single_cell(self): def add_cell_location(self): """Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column. + Optionally, save the augmented metadata file as a Parquet file. 
Returns diff --git a/pycytominer/cyto_utils/cell_locations_cmd.py b/pycytominer/cyto_utils/cell_locations_cmd.py index f0e08891..3b96965f 100644 --- a/pycytominer/cyto_utils/cell_locations_cmd.py +++ b/pycytominer/cyto_utils/cell_locations_cmd.py @@ -1,3 +1,5 @@ +"""CLI for cell location calculations.""" + from pycytominer.cyto_utils.cell_locations import CellLocation import fire diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index ddd9d19a..0e07ae61 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -1,3 +1,5 @@ +"""Module containing the SingleCells class, which is used to interact with single cell morphological profiles.""" + from typing import Dict, Union, Optional import numpy as np @@ -25,8 +27,7 @@ class SingleCells: - """This is a class to interact with single cell morphological profiles. Interaction - includes aggregation, normalization, and output. + """Class to interact with single cell morphological profiles including aggregation, normalization, and output. Attributes ---------- @@ -115,7 +116,7 @@ def __init__( object_feature="Metadata_ObjectNumber", default_datatype_float=np.float64, ): - """Constructor method.""" + """Construct a SingleCells object.""" # Check compartments specified check_compartments(compartments) @@ -179,7 +180,7 @@ def __init__( self.load_image(image_table_name=self.image_table_name) def _check_subsampling(self): - """Internal method checking if subsampling options were specified correctly. + """Check if subsampling options were specified correctly. Returns ------- @@ -192,7 +193,7 @@ def _check_subsampling(self): ), "Do not set both subsample_frac and subsample_n" def set_output_file(self, output_file): - """Setting operation to conveniently rename output file. + """Set or modify output file. 
Parameters ---------- @@ -207,7 +208,7 @@ def set_output_file(self, output_file): self.output_file = output_file def set_subsample_frac(self, subsample_frac): - """Setting operation to conveniently update the subsample fraction. + """Set or update the subsample fraction. Parameters ---------- @@ -223,7 +224,7 @@ def set_subsample_frac(self, subsample_frac): self._check_subsampling() def set_subsample_n(self, subsample_n): - """Setting operation to conveniently update the subsample n. + """Set or update the subsample n. Parameters ---------- @@ -242,7 +243,7 @@ def set_subsample_n(self, subsample_n): self._check_subsampling() def set_subsample_random_state(self, random_state): - """Setting operation to conveniently update the subsample random state. + """Set or update the subsample random state. Parameters ---------- @@ -435,7 +436,7 @@ def split_column_categories(self, col_names): return meta_cols, feat_cols def load_compartment(self, compartment): - """Creates the compartment dataframe. + """Create the compartment dataframe. Note: makes use of default_datatype_float attribute for setting a default floating point datatype. @@ -590,8 +591,7 @@ def _compartment_df_generator( compartment, n_aggregation_memory_strata=1, ): - """A generator function that returns chunks of the entire compartment - table from disk. + """Yield chunks of the entire compartment table from disk. We want to return dataframes with all compartment entries within unique combinations of self.merge_cols when aggregated by self.strata @@ -881,9 +881,7 @@ def aggregate_profiles( def _sqlite_strata_conditions(df, dtypes, n=1): - """Given a dataframe where columns are merge_cols and rows are unique - value combinations that appear as aggregation strata, return a list - of strings which constitute valid SQLite conditional statements. + """Construct a list of strings which constitute valid SQLite conditional statements. 
Parameters ---------- diff --git a/pycytominer/cyto_utils/collate.py b/pycytominer/cyto_utils/collate.py index 794be8d4..39b3e12b 100644 --- a/pycytominer/cyto_utils/collate.py +++ b/pycytominer/cyto_utils/collate.py @@ -1,3 +1,5 @@ +"""Module that provides functions for collating CellProfiler-created CSVs into a single SQLite file.""" + import os import pathlib import subprocess diff --git a/pycytominer/cyto_utils/collate_cmd.py b/pycytominer/cyto_utils/collate_cmd.py index 7819d4bd..6f012a01 100644 --- a/pycytominer/cyto_utils/collate_cmd.py +++ b/pycytominer/cyto_utils/collate_cmd.py @@ -1,3 +1,5 @@ +"""Command line interface for collate function in pycytominer.cyto_utils.collate.""" + import argparse from pycytominer.cyto_utils.collate import collate diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py index 995152c7..bc05b21f 100644 --- a/pycytominer/cyto_utils/features.py +++ b/pycytominer/cyto_utils/features.py @@ -179,7 +179,7 @@ def drop_outlier_features( def convert_compartment_format_to_list(compartments): - """Converts compartment to a list. + """Convert cell painting compartments to a list. Parameters ---------- diff --git a/pycytominer/cyto_utils/load.py b/pycytominer/cyto_utils/load.py index 543e1dd0..198fd990 100644 --- a/pycytominer/cyto_utils/load.py +++ b/pycytominer/cyto_utils/load.py @@ -1,3 +1,5 @@ +"""Module for loading data from various file formats.""" + import csv import gzip import pathlib @@ -7,7 +9,7 @@ def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool: - """Checks if the provided file path is a parquet file. + """Check if the provided file path is a parquet file. Identify parquet files by inspecting the file extensions. If the file does not end with `parquet`, this will return False, else True. 
diff --git a/pycytominer/cyto_utils/modz.py b/pycytominer/cyto_utils/modz.py index b3174c2d..d1fdf414 100644 --- a/pycytominer/cyto_utils/modz.py +++ b/pycytominer/cyto_utils/modz.py @@ -1,3 +1,5 @@ +"""Module for performing a modified z score transformation.""" + import numpy as np from pycytominer.cyto_utils.util import ( get_pairwise_correlation, diff --git a/pycytominer/cyto_utils/single_cell_ingest_utils.py b/pycytominer/cyto_utils/single_cell_ingest_utils.py index 41a2d75c..d9dcc070 100644 --- a/pycytominer/cyto_utils/single_cell_ingest_utils.py +++ b/pycytominer/cyto_utils/single_cell_ingest_utils.py @@ -1,3 +1,5 @@ +"""Utility functions for single cell ingest.""" + from collections import Counter from pycytominer.cyto_utils import get_default_compartments @@ -75,8 +77,7 @@ def assert_linking_cols_complete(linking_cols="default", compartments="default") def provide_linking_cols_feature_name_update(linking_cols="default"): - """Output a dictionary to use to update pandas dataframe column names. The linking - cols must be Metadata. + """Output a dictionary to use to update pandas dataframe column names from linking cols in the Metadata. Parameters ---------- diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py index 8320d700..3a7302ad 100644 --- a/pycytominer/cyto_utils/util.py +++ b/pycytominer/cyto_utils/util.py @@ -14,7 +14,7 @@ def get_default_compartments(): - """Returns default compartments. + """Return default compartments. Returns ------- @@ -26,7 +26,7 @@ def get_default_compartments(): def check_compartments(compartments): - """Checks if the input compartments are noncanonical compartments. + """Check if the input compartments are noncanonical compartments. 
Parameters ---------- @@ -56,13 +56,13 @@ def check_compartments(compartments): def load_known_metadata_dictionary(metadata_file=default_metadata_file): - """From a tab separated text file (two columns: ["compartment", "feature"]), load - previously known metadata columns per compartment. + """Load previously known metadata columns per compartment from metadata text file. Parameters ---------- metadata_file : str, optional - File location of the metadata text file. Uses a default dictionary if you do not specify. + File location of the metadata text file which should be a tab-separated file with two columns: ["compartment", "feature"]. + If not provided, the default metadata file will be used. Returns ------- diff --git a/pycytominer/cyto_utils/write_gct.py b/pycytominer/cyto_utils/write_gct.py index 9e6828f4..02bba65b 100644 --- a/pycytominer/cyto_utils/write_gct.py +++ b/pycytominer/cyto_utils/write_gct.py @@ -1,4 +1,5 @@ -""" +"""Module to write a gct file from a pandas DataFrame. + Transform profiles into a gct (Gene Cluster Text) file A gct is a tab deliminted text file that traditionally stores gene expression data File Format Description: https://clue.io/connectopedia/gct_format. diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py index 02f74b0e..9d164e7b 100644 --- a/pycytominer/feature_select.py +++ b/pycytominer/feature_select.py @@ -35,7 +35,7 @@ def feature_select( noise_removal_perturb_groups=None, noise_removal_stdev_cutoff=None, ): - """Performs feature selection based on the given operation. + """Perform feature selection based on the given operation. 
Parameters ---------- diff --git a/pycytominer/operations/__init__.py b/pycytominer/operations/__init__.py index 39a59b30..07f5bba8 100644 --- a/pycytominer/operations/__init__.py +++ b/pycytominer/operations/__init__.py @@ -1,3 +1,5 @@ +"""Module containing statistical operations for data processing.""" + from .correlation_threshold import correlation_threshold from .get_na_columns import get_na_columns from .noise_removal import noise_removal diff --git a/pycytominer/operations/correlation_threshold.py b/pycytominer/operations/correlation_threshold.py index 961eeb5c..d6526949 100644 --- a/pycytominer/operations/correlation_threshold.py +++ b/pycytominer/operations/correlation_threshold.py @@ -1,5 +1,6 @@ -""" -Returns list of features such that no two features have a correlation greater than a +"""Module for correlation threshold operation. + +The correlation threshold operation returns a list of features such that no two features have a correlation greater than a specified threshold. """ @@ -79,8 +80,9 @@ def correlation_threshold( def determine_high_cor_pair(correlation_row, sorted_correlation_pairs): - """Select highest correlated variable given a correlation row with columns: - ["pair_a", "pair_b", "correlation"]. For use in a pandas.apply(). + """Select highest correlated variable given a correlation row. + + From a row with columns: ["pair_a", "pair_b", "correlation"]. For use in a pandas.apply(). Parameters ---------- diff --git a/pycytominer/operations/get_na_columns.py b/pycytominer/operations/get_na_columns.py index f340bc1e..69393318 100644 --- a/pycytominer/operations/get_na_columns.py +++ b/pycytominer/operations/get_na_columns.py @@ -1,4 +1,5 @@ -""" +"""Function to get columns with NA values above a certain threshold. + Remove variables with specified threshold of NA values Note: This was called `drop_na_columns` in cytominer for R. 
""" diff --git a/pycytominer/operations/noise_removal.py b/pycytominer/operations/noise_removal.py index eceef964..f52c03ff 100644 --- a/pycytominer/operations/noise_removal.py +++ b/pycytominer/operations/noise_removal.py @@ -10,7 +10,7 @@ def noise_removal( samples="all", noise_removal_stdev_cutoff=0.8, ): - """ + """Remove features with excessive standard deviation within the same perturbation group. Parameters ---------- diff --git a/pycytominer/operations/transform.py b/pycytominer/operations/transform.py index b5373709..5077d977 100644 --- a/pycytominer/operations/transform.py +++ b/pycytominer/operations/transform.py @@ -13,9 +13,9 @@ class Spherize(BaseEstimator, TransformerMixin): - """Class to apply a sphering transform (aka whitening) data in the base sklearn - transform API. Note, this implementation is modified/inspired from the following - sources: + """Class to apply a sphering transform (aka whitening) data in the base sklearn transform API. + + This implementation is modified/inspired from the following sources: 1) A custom function written by Juan C. Caicedo 2) A custom ZCA function at https://github.com/mwv/zca 3) Notes from Niranj Chandrasekaran (https://github.com/cytomining/pycytominer/issues/90) @@ -33,7 +33,8 @@ class Spherize(BaseEstimator, TransformerMixin): """ def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False): - """ + """Construct a Spherize object. + Parameters ---------- epsilon : float, default 1e-6 diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index cdbefa44..d4b999fc 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -1,5 +1,5 @@ -""" -Remove variables with near-zero variance. +"""Remove variables with near-zero variance. + Modified from caret::nearZeroVar(). 
""" @@ -79,7 +79,6 @@ def variance_threshold( def calculate_frequency(feature_column, freq_cut): """Calculate frequency of second most common to most common feature. - Used in pandas.apply(). Parameters ---------- diff --git a/pyproject.toml b/pyproject.toml index ef7c977b..1f25ac59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,6 +118,9 @@ target-version = "py38" line-length = 88 fix = true extend-include = ["*.ipynb"] +exclude = [ + "walkthroughs/nbconverted/*", +] [tool.ruff.lint] select = [ @@ -170,8 +173,12 @@ convention = "numpy" [tool.ruff.lint.per-file-ignores] # Ignore `E402` and `F401` (unusued imports) in all `__init__.py` files "__init__.py" = ["E402", "F401"] -# Ignore assert statements in tests -"tests/*" = ["S101"] +"tests/*" = [ + # Allow assert statements in tests + "S101", + # Disable docstring checks in tests + "D", +] "pycytominer/cyto_utils/*" = [ # I (isort) is ignored due to circular dependencies in the cyto_utils module "I",