From 1672dea4331560c7bb56bc13248df58dffca821e Mon Sep 17 00:00:00 2001 From: Luciano Lorenti Date: Mon, 13 May 2024 22:29:52 +0200 Subject: [PATCH 1/7] Iterators: Allow sample weights to use dataframes, series and arrays (#39) * Iterators: Allow sample weights to use dataframes, series and arrays * Tests: Add tests for sample weight in keras, and scikit-learn * Bump Version --- .bumpversion.cfg | 2 +- ceruleo/__init__.py | 2 +- ceruleo/iterators/sample_weight.py | 38 +++++++--- tests/test_iterators.py | 109 ++++++++++++++++++++--------- tests/test_models.py | 27 +++++++ tests/test_sklearn.py | 50 ++++++++++--- 6 files changed, 174 insertions(+), 54 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 745b5c21..a1bdf753 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.3 +current_version = 3.0.4 commit = True tag = True diff --git a/ceruleo/__init__.py b/ceruleo/__init__.py index 02280252..b8e01a54 100644 --- a/ceruleo/__init__.py +++ b/ceruleo/__init__.py @@ -9,4 +9,4 @@ CACHE_PATH.mkdir(parents=True, exist_ok=True) -__version__ = "3.0.3" +__version__ = "3.0.4" diff --git a/ceruleo/iterators/sample_weight.py b/ceruleo/iterators/sample_weight.py index ac36d997..da1fa86c 100644 --- a/ceruleo/iterators/sample_weight.py +++ b/ceruleo/iterators/sample_weight.py @@ -1,9 +1,8 @@ -from abc import abstractmethod -from signal import signal from typing import Any, Callable, Union import numpy as np +import pandas as pd class AbstractSampleWeights: @@ -11,10 +10,26 @@ class AbstractSampleWeights: The base class for the sample weight provider """ - def __call__(self, y, i: int, metadata): + def __call__(self, y: Union[np.ndarray, pd.DataFrame], i: int, metadata): raise NotImplementedError + +def get_value(y: Union[np.ndarray, pd.DataFrame], i:int) -> float: + if isinstance(y, np.ndarray): + if len(y.shape) > 1: + return y[i, 0] + else: + return y[i] + elif isinstance(y, pd.DataFrame): + return y.iloc[i, 0] + 
elif isinstance(y, pd.Series): + return y.iloc[i] + else: + raise ValueError(f"Unsupported type {type(y)}") + + + class NotWeighted(AbstractSampleWeights): """ Simplest sample weight provider @@ -22,7 +37,7 @@ class NotWeighted(AbstractSampleWeights): Provide 1 as a sample weight for every sample """ - def __call__(self, y, i: int, metadata): + def __call__(self, y: Union[np.ndarray, pd.DataFrame], i: int, metadata): return 1 @@ -41,8 +56,10 @@ class RULInverseWeighted(AbstractSampleWeights): Weight each sample by the inverse of the RUL """ - def __call__(self, y, i: int, metadata): - return 1 / (y[i, 0] + 1) + def __call__(self, y : Union[np.ndarray, pd.DataFrame], i: int, metadata): + return 1 / (get_value(y, i) + 1) + + class InverseToLengthWeighted(AbstractSampleWeights): @@ -53,8 +70,8 @@ class InverseToLengthWeighted(AbstractSampleWeights): """ - def __call__(self, y, i: int, metadata): - return 1 / y[0] + def __call__(self, y:Union[np.ndarray, pd.DataFrame], i: int, metadata): + return 1 / get_value(y, 0) class ExponentialDecay(AbstractSampleWeights): @@ -64,8 +81,9 @@ class ExponentialDecay(AbstractSampleWeights): """ def __init__(self, *, near_0_at: float): + super().__init__() self.alpha = -((near_0_at) ** 2) / np.log(0.000001) - def __call__(self, y, i: int, metadata): - return (1 + np.exp(-(y[i, 0] ** 2) / self.alpha)) ** 2 + def __call__(self, y:Union[np.ndarray, pd.DataFrame], i: int, metadata): + return ( np.exp(-(get_value(y,i) ** 2) / self.alpha)) diff --git a/tests/test_iterators.py b/tests/test_iterators.py index 49357c55..6d3de450 100644 --- a/tests/test_iterators.py +++ b/tests/test_iterators.py @@ -3,27 +3,31 @@ from ceruleo.dataset.ts_dataset import AbstractPDMDataset from ceruleo.iterators.batcher import Batcher from ceruleo.iterators.iterators import WindowedDatasetIterator -from ceruleo.transformation import Pipeline, Transformer +from ceruleo.iterators.sample_weight import ( + ExponentialDecay, + InverseToLengthWeighted, + NotWeighted, + 
RULInverseWeighted, +) +from ceruleo.transformation import Transformer from ceruleo.transformation.features.scalers import MinMaxScaler from ceruleo.transformation.features.selection import ByNameFeatureSelector class SimpleDataset(AbstractPDMDataset): def __init__(self): - self.lives = [ - pd.DataFrame({ - 'feature1': np.array(range(0, 100)), - 'RUL': np.array(range(0, 100)) - })] - + pd.DataFrame( + {"feature1": np.array(range(0, 100)), "RUL": np.array(range(0, 100))} + ) + ] def get_time_series(self, i: int): return self.lives[i] @property def rul_column(self): - return 'RUL' + return "RUL" @property def n_time_series(self): @@ -32,22 +36,26 @@ def n_time_series(self): class MockDataset(AbstractPDMDataset): def __init__(self, nlives: int): - self.lives = [ - pd.DataFrame({ - 'feature1': np.linspace(0, (i+1)*100, 50), - 'feature2': np.linspace(-25, (i+1)*500, 50), - 'RUL': np.linspace(100, 0, 50) - }) - for i in range(nlives-1)] + pd.DataFrame( + { + "feature1": np.linspace(0, (i + 1) * 100, 50), + "feature2": np.linspace(-25, (i + 1) * 500, 50), + "RUL": np.linspace(100, 0, 50), + } + ) + for i in range(nlives - 1) + ] self.lives.append( - pd.DataFrame({ - 'feature1': np.linspace(0, 5*100, 50), - 'feature2': np.linspace(-25, 5*500, 50), - 'feature3': np.linspace(-25, 5*500, 50), - 'RUL': np.linspace(100, 0, 50) - }) + pd.DataFrame( + { + "feature1": np.linspace(0, 5 * 100, 50), + "feature2": np.linspace(-25, 5 * 500, 50), + "feature3": np.linspace(-25, 5 * 500, 50), + "RUL": np.linspace(100, 0, 50), + } + ) ) def get_time_series(self, i: int): @@ -55,25 +63,25 @@ def get_time_series(self, i: int): @property def rul_column(self): - return 'RUL' + return "RUL" @property def n_time_series(self): return len(self.lives) -class TestIterators(): +class TestIterators: def test_iterators(self): - features = ['feature1', 'feature2'] + features = ["feature1", "feature2"] x = ByNameFeatureSelector(features=features) x = MinMaxScaler(range=(-1, 1))(x) - y = 
ByNameFeatureSelector(features=['RUL']) + y = ByNameFeatureSelector(features=["RUL"]) transformer = Transformer(x, y) batch_size = 15 window_size = 5 ds = MockDataset(5) - + transformer.fit(ds) b = Batcher.new(ds.map(transformer), window_size, batch_size, 1) X, y, w = next(b) @@ -84,14 +92,47 @@ def test_iterators(self): def test_2(self): dataset = SimpleDataset() - pipe = ByNameFeatureSelector(features=['feature1']) - y_pipe = ByNameFeatureSelector(features=['RUL']) - transformer_raw = Transformer( - pipelineX=pipe, - pipelineY=y_pipe - ) + pipe = ByNameFeatureSelector(features=["feature1"]) + y_pipe = ByNameFeatureSelector(features=["RUL"]) + transformer_raw = Transformer(pipelineX=pipe, pipelineY=y_pipe) transformer_raw.fit(dataset) - it = WindowedDatasetIterator(dataset.map(transformer_raw), 5) + it = WindowedDatasetIterator(dataset.map(transformer_raw), 5) X, y, sw = next(it) - assert np.all(X == np.array([[0,1,2,3,4]]).T) + assert np.all(X == np.array([[0, 1, 2, 3, 4]]).T) assert y[0][0] == 4 + + +def build_elements(): + a = np.linspace(10, 0, 11) + b = np.vstack((a, a)).T + c = pd.DataFrame(a, columns=["RUL"]) + d = pd.DataFrame(b, columns=["RUL", "RUL2"]) + e = pd.Series(a) + return [a, b, c, d, e] + + +class TestSampleWeight: + def test_not_weighted(self): + nw = NotWeighted() + for el in build_elements(): + assert nw(el, 0, None) == 1 + assert nw(el, 5, None) == 1 + + def test_rul_inverse_weighted(self): + inverse = RULInverseWeighted() + for el in build_elements(): + assert inverse(el, 0, None) == 1.0/(10 + 1) + assert inverse(el, 5, None) == 1.0/(5 + 1) + assert inverse(el, 10, None) == 1.0 + + def test_InverseToLengthWeighted(self): + inverse = InverseToLengthWeighted() + for el in build_elements(): + assert inverse(el, 0, None) == 1/(10.0) + assert inverse(el, 5, None) == 1/(10.0) + + def test_ExponentialDecay(self): + exp = ExponentialDecay(near_0_at=3) + for el in build_elements(): + assert exp(el, 0, None) < 0.00001 + assert exp(el, 9, None) > 0.21 
\ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index 97529108..1d2f76ad 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -12,6 +12,7 @@ from ceruleo.dataset.ts_dataset import AbstractPDMDataset from ceruleo.iterators.iterators import WindowedDatasetIterator +from ceruleo.iterators.sample_weight import RULInverseWeighted from ceruleo.iterators.shufflers import AllShuffled from ceruleo.iterators.utils import true_values from ceruleo.models.baseline import BaselineModel, FixedValueBaselineModel @@ -211,6 +212,32 @@ def test_keras(self): assert mae < mae_before_fit + train_iterator = WindowedDatasetIterator( + train_dataset.map(transformer), + window_size=window_size, + step=1, + shuffler=AllShuffled(), + sample_weight=RULInverseWeighted(), + ) + + val_iterator = WindowedDatasetIterator( + val_dataset.map(transformer), window_size=window_size, step=1 + ) + model.compile(loss="mae", optimizer=tf.keras.optimizers.SGD(0.01)) + y_true = true_values(val_iterator) + y_pred_before_fit = model.predict(tf_regression_dataset(val_iterator).batch(64)) + mae_before_fit = np.mean(np.abs(y_pred_before_fit.ravel() - y_true.ravel())) + model.fit( + tf_regression_dataset(train_iterator).batch(4), + validation_data=tf_regression_dataset(val_iterator).batch(64), + epochs=15, + ) + y_pred = model.predict(tf_regression_dataset(val_iterator).batch(64)) + + mae = np.mean(np.abs(y_pred.ravel() - y_true.ravel())) + + assert mae < mae_before_fit + def test_xgboost(self): features = ["feature1", "feature2"] diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index e7533832..b2de39c1 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1,16 +1,19 @@ +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import GridSearchCV + from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset, sensor_indices -from ceruleo.transformation import Transformer -from 
ceruleo.transformation.functional.pipeline.pipeline import make_pipeline -from ceruleo.transformation.features.selection import ByNameFeatureSelector -from ceruleo.transformation.features.scalers import MinMaxScaler +from ceruleo.iterators.sample_weight import RULInverseWeighted from ceruleo.models.sklearn import ( + CeruleoMetricWrapper, CeruleoRegressor, TimeSeriesWindowTransformer, - CeruleoMetricWrapper ) -from sklearn.linear_model import Ridge -from sklearn.model_selection import GridSearchCV -from sklearn.ensemble import RandomForestRegressor +from ceruleo.transformation import Transformer +from ceruleo.transformation.features.scalers import MinMaxScaler +from ceruleo.transformation.features.selection import ByNameFeatureSelector +from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline + def test_gridsearch_cv(): train_dataset = CMAPSSDataset(train=True, models='FD001') @@ -48,3 +51,34 @@ def test_gridsearch_cv(): grid_search.fit(train_dataset) assert grid_search is not None + + + +def test_sample_weights(): + train_dataset = CMAPSSDataset(train=True, models='FD001') + FEATURES = [train_dataset[0].columns[i] for i in sensor_indices] + transformer = Transformer( + pipelineX=make_pipeline( + ByNameFeatureSelector(features=FEATURES), + MinMaxScaler(range=(-1, 1)) + + ), + pipelineY=make_pipeline( + ByNameFeatureSelector(features=['RUL']), + ) + ) + + + regressor_gs = CeruleoRegressor( + TimeSeriesWindowTransformer( + transformer, + window_size=32, + sample_weight=RULInverseWeighted(), + padding=True, + step=1), + Ridge(alpha=15)) + + + + regressor_gs = regressor_gs.fit(train_dataset) + assert regressor_gs is not None From ef911b65ba68db9ffcdc397e8fb9de61705e2710 Mon Sep 17 00:00:00 2001 From: Luciano Lorenti Date: Fri, 14 Jun 2024 11:30:10 +0200 Subject: [PATCH 2/7] Setup: Allow using pydantic 1 --- .bumpversion.cfg | 2 +- ceruleo/__init__.py | 2 +- ceruleo/dataset/analysis/correlation.py | 4 +++- ceruleo/dataset/analysis/sample_rate.py | 3 
++- ceruleo/utils/__init__.py | 7 +++++++ pyproject.toml | 2 +- 6 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index a1bdf753..748342a9 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.4 +current_version = 3.0.5 commit = True tag = True diff --git a/ceruleo/__init__.py b/ceruleo/__init__.py index b8e01a54..b7e1ee29 100644 --- a/ceruleo/__init__.py +++ b/ceruleo/__init__.py @@ -9,4 +9,4 @@ CACHE_PATH.mkdir(parents=True, exist_ok=True) -__version__ = "3.0.4" +__version__ = "3.0.5" diff --git a/ceruleo/dataset/analysis/correlation.py b/ceruleo/dataset/analysis/correlation.py index 6fc0e27d..6262bf5e 100644 --- a/ceruleo/dataset/analysis/correlation.py +++ b/ceruleo/dataset/analysis/correlation.py @@ -6,6 +6,8 @@ from ceruleo.dataset.utils import iterate_over_features from pydantic import BaseModel +from ceruleo.utils import pydantic_to_dict + class CorrelationAnalysisElement(BaseModel): mean_correlation: float @@ -31,7 +33,7 @@ def get(self, feature_1: str, feature_2: str) -> CorrelationAnalysisElement: def to_pandas(self) -> pd.DataFrame: return ( pd.DataFrame.from_dict( - {(k[0], k[1]): v.model_dump() for k, v in self.data.items()}, + {(k[0], k[1]): pydantic_to_dict(v) for k, v in self.data.items()}, orient="index", ) .reset_index() diff --git a/ceruleo/dataset/analysis/sample_rate.py b/ceruleo/dataset/analysis/sample_rate.py index 6559a774..6bacc5da 100644 --- a/ceruleo/dataset/analysis/sample_rate.py +++ b/ceruleo/dataset/analysis/sample_rate.py @@ -6,6 +6,7 @@ from pydantic import BaseModel from ceruleo.dataset.ts_dataset import AbstractPDMDataset +from ceruleo.utils import pydantic_to_dict logger = logging.getLogger(__name__) @@ -16,7 +17,7 @@ class SampleRateAnalysis(BaseModel): std: float def to_pandas(self) -> pd.Series: - return pd.Series(self.model_dump()).to_frame().T + return pd.Series(pydantic_to_dict(self)).to_frame().T def sample_rate(ds: 
AbstractPDMDataset, unit: str = "s") -> np.ndarray: diff --git a/ceruleo/utils/__init__.py b/ceruleo/utils/__init__.py index e69de29b..b9c5b94b 100644 --- a/ceruleo/utils/__init__.py +++ b/ceruleo/utils/__init__.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel, version + +def pydantic_to_dict(b:BaseModel): + if version.VERSION.startswith("1"): + return b.dict() + else: + return b.model_dump() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f6fdd2db..51a7b555 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "antropy >= 0.1.5", "uncertainties >= 3.1", "PyWavelets >= 1.3", - "pydantic>=2.6.2", + "pydantic >= 1.0.0,<3.0.0" ] From 2fd3ebfe16a14de2ece3678b7531af027c077a7d Mon Sep 17 00:00:00 2001 From: lrolando Date: Fri, 14 Jun 2024 18:00:45 +0200 Subject: [PATCH 3/7] Analysis: Improve --- ceruleo/dataset/analysis/sample_rate.py | 18 ++++-- ceruleo/dataset/ts_dataset.py | 18 +++--- ceruleo/graphics/duration.py | 78 ++++++++++++++----------- 3 files changed, 65 insertions(+), 49 deletions(-) diff --git a/ceruleo/dataset/analysis/sample_rate.py b/ceruleo/dataset/analysis/sample_rate.py index 6bacc5da..fc4d6396 100644 --- a/ceruleo/dataset/analysis/sample_rate.py +++ b/ceruleo/dataset/analysis/sample_rate.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd from pydantic import BaseModel - +from typing import List from ceruleo.dataset.ts_dataset import AbstractPDMDataset from ceruleo.utils import pydantic_to_dict @@ -12,13 +12,17 @@ class SampleRateAnalysis(BaseModel): - mode: float + median: float mean: float std: float + unit: str def to_pandas(self) -> pd.Series: return pd.Series(pydantic_to_dict(self)).to_frame().T + def __repr__(self) -> str: + return f"Mode: {self.median} | {self.mean} +- {self.std} [{self.unit}]" + def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray: """Obtain an array of time difference between two consecutive samples @@ -33,9 +37,10 @@ def sample_rate(ds: 
AbstractPDMDataset, unit: str = "s") -> np.ndarray: Array of time differences """ - time_diff = [] + time_diff : List[float ]= [] for life in ds: diff = np.diff(life.index.values) + diff = diff[diff <= np.median(diff)] if pd.api.types.is_timedelta64_ns_dtype(diff.dtype): diff = diff / np.timedelta64(1, unit) time_diff.extend(diff) @@ -44,10 +49,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray: def sample_rate_summary( - ds: AbstractPDMDataset, unit: Optional[str] = "s" + ds: AbstractPDMDataset, unit: str = "s" ) -> SampleRateAnalysis: """ - Obtain the mean, mode and standard deviation of the sample rate of the dataset + Obtain the mean, median and standard deviation of the sample rate of the dataset Parameters: ds: The dataset @@ -60,5 +65,6 @@ def sample_rate_summary( return SampleRateAnalysis( mean=np.mean(sr), std=np.std(sr), - mode=pd.Series(sr).mode().values[0], + median=np.median(sr), + unit=unit ) diff --git a/ceruleo/dataset/ts_dataset.py b/ceruleo/dataset/ts_dataset.py index fab9a0e9..915ceea3 100644 --- a/ceruleo/dataset/ts_dataset.py +++ b/ceruleo/dataset/ts_dataset.py @@ -65,14 +65,6 @@ def number_of_samples_of_time_series(self, i: int) -> int: def rul_column(self) -> str: raise NotImplementedError - def duration(self, life: pd.DataFrame) -> float: - return life[self.rul_column].max() - - def number_of_samples(self) -> List[int]: - return [ - self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self))) - ] - def duration(self, life: pd.DataFrame) -> float: """Obtain the duration of the time-series @@ -82,8 +74,14 @@ def duration(self, life: pd.DataFrame) -> float: Returns: Duration of the life """ - v = life.index - return v.max() - v.min() + return life[self.rul_column].max() + + def number_of_samples(self) -> List[int]: + return [ + self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self))) + ] + + def durations(self, show_progress: bool = False) -> List[float]: """ diff --git 
a/ceruleo/graphics/duration.py b/ceruleo/graphics/duration.py index 4b133c21..35ac267d 100644 --- a/ceruleo/graphics/duration.py +++ b/ceruleo/graphics/duration.py @@ -7,7 +7,9 @@ import numpy as np import seaborn as sns from ceruleo.dataset.ts_dataset import AbstractPDMDataset - +from datetime import timedelta +from typing import Iterable +import pandas as pd def add_vertical_line(ax, v_x, label, color, line, n_lines): @@ -29,8 +31,8 @@ def durations_histogram( label: Union[str, List[str]] = "1", bins: int = 15, units: str = "m", - vlines: Tuple[float, str] = [], - ax:matplotlib.axes.Axes=None, + vlines: List[Tuple[float, str]] = [], + ax:Optional[matplotlib.axes.Axes]=None, add_mean: bool = True, add_median: bool = True, transform: Callable[[float], float] = lambda x: x, @@ -68,10 +70,13 @@ def durations_histogram( """ if isinstance(datasets, list): + assert isinstance(label,list) assert len(datasets) == len(label) + label_list = label else: + assert isinstance(label, str) datasets = [datasets] - label = [label] + label_list = [label] durations = [] for ds in datasets: @@ -80,7 +85,7 @@ def durations_histogram( return histogram_from_durations( durations, xlabel=xlabel, - label=label, + label=label_list, bins=bins, units=units, vlines=vlines, @@ -93,50 +98,57 @@ def durations_histogram( ) + def histogram_from_durations( - durations: Union[List[float], List[List[float]]], + durations: List[List[float]], xlabel: str, - label: Union[str, List[str]] = "", + label: List[str], bins: int = 15, units: str = "m", vlines: List[Tuple[float, str]] = [], ax=None, add_mean: bool = True, add_median: bool = True, - threshold: float = np.inf, + threshold: float = np.inf, color=None, alpha=1.0, - **kwargs, + **kwargs ) -> matplotlib.axes.Axes: if ax is None: _, ax = plt.subplots(1, 1, **kwargs) - if isinstance(durations[0], list): - assert isinstance(label, list) - assert len(durations) == len(label) - else: - durations = [durations] - label = [label] + + assert isinstance(label, 
list) + assert len(durations) == len(label) + + + elem_is_timedelta = isinstance(durations[0][0], timedelta) + + + for l, dur in zip(label, durations): if len(l) > 0: l += " " vlines = copy(vlines) + durations_array = np.array(dur) + if elem_is_timedelta: + durations_array = durations / pd.Timedelta(1, units) if add_mean: - vlines.append((np.mean(dur), l + "Mean")) + vlines.append((float(np.mean(durations_array)), l + "Mean")) if add_median: - vlines.append((np.median(dur), l + "Median")) - dur = [d for d in dur if d < threshold] - ax.hist(dur, bins, color=color, alpha=alpha, label=l) + vlines.append((float(np.median(durations_array)), l + "Median")) + durations_array = durations_array[durations_array matplotlib.axes.Axes: - if isinstance(durations[0], list): - assert isinstance(xlabel, list) - assert len(durations) == len(xlabel) - else: - durations = [durations] - xlabel = [xlabel] + assert isinstance(xlabel, list) + assert len(durations) == len(xlabel) if ax is None: fig, ax = plt.subplots(**kwargs) From 2f3c8babbc2c4f4b99d693658e5a184bb03af1ab Mon Sep 17 00:00:00 2001 From: lrolando Date: Fri, 14 Jun 2024 22:36:45 +0200 Subject: [PATCH 4/7] Transformation: Allow the option for missing NA in MixMaxScaler --- ceruleo/transformation/features/scalers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ceruleo/transformation/features/scalers.py b/ceruleo/transformation/features/scalers.py index 00a81fb0..4f9108dd 100644 --- a/ceruleo/transformation/features/scalers.py +++ b/ceruleo/transformation/features/scalers.py @@ -119,7 +119,9 @@ class MinMaxScaler(TransformerStep): Parameters: range: Desired range of transformed data. 
clip: Set to True to clip transformed values of held-out data to provided, by default True + fillna: Wheter to fill NaN with a value name: Name of the step, by default None + """ def __init__( @@ -127,6 +129,7 @@ def __init__( *, range: tuple, clip: bool = True, + fillna: Optional[float] = None, name: Optional[str] = None, ): super().__init__(name=name) @@ -136,6 +139,7 @@ def __init__( self.data_min = None self.data_max = None self.clip = clip + self.fillna = fillna def partial_fit(self, df: pd.DataFrame, y=None): """ @@ -192,7 +196,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: / divisor[mask] * (self.max - self.min) ) + self.min - X.loc[:, ~mask] = 0 + if self.fillna is not None: + X.loc[:, ~mask] = self.fillna except: raise if self.clip: From 19ee5ee0d808e62abc8aabf4c50b2be1e5403a76 Mon Sep 17 00:00:00 2001 From: lrolando Date: Fri, 14 Jun 2024 22:37:36 +0200 Subject: [PATCH 5/7] Graphics: Show better labels in duration histogram --- ceruleo/graphics/duration.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ceruleo/graphics/duration.py b/ceruleo/graphics/duration.py index 35ac267d..91aaf5bd 100644 --- a/ceruleo/graphics/duration.py +++ b/ceruleo/graphics/duration.py @@ -27,8 +27,9 @@ def add_vertical_line(ax, v_x, label, color, line, n_lines): def durations_histogram( datasets: Union[AbstractPDMDataset, List[AbstractPDMDataset]], - xlabel: str = 'Cycle Duration', - label: Union[str, List[str]] = "1", + *, + label: Union[str, List[str]], + xlabel: str = 'Cycle Duration', bins: int = 15, units: str = "m", vlines: List[Tuple[float, str]] = [], @@ -142,12 +143,12 @@ def histogram_from_durations( durations_array = durations_array[durations_array Date: Fri, 14 Jun 2024 22:38:15 +0200 Subject: [PATCH 6/7] Analysis: Provide a better html repr for the sample rate metric --- ceruleo/dataset/analysis/sample_rate.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ceruleo/dataset/analysis/sample_rate.py 
b/ceruleo/dataset/analysis/sample_rate.py index fc4d6396..15608682 100644 --- a/ceruleo/dataset/analysis/sample_rate.py +++ b/ceruleo/dataset/analysis/sample_rate.py @@ -21,7 +21,15 @@ def to_pandas(self) -> pd.Series: return pd.Series(pydantic_to_dict(self)).to_frame().T def __repr__(self) -> str: - return f"Mode: {self.median} | {self.mean} +- {self.std} [{self.unit}]" + return f"Median: {self.median} | {self.mean} +- {self.std} [{self.unit}]" + + + def _repr_html_(self) -> str: + return f"""
+        <div>
+            <p> Median: {self.median} [{self.unit}] </p>
+        </div>
+        <div>
+            <p> Mean +- Std: {self.mean:.3f} +- {self.std:.3f} [{self.unit}] </p>
+        </div>
+ """ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray: From 5f0427259874929360fdb9ed506a76aba092ab64 Mon Sep 17 00:00:00 2001 From: lrolando Date: Fri, 14 Jun 2024 22:38:49 +0200 Subject: [PATCH 7/7] Analysis: Add a function for obtain the numeric analysis as a dataframe --- .../dataset/analysis/numerical_features.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/ceruleo/dataset/analysis/numerical_features.py b/ceruleo/dataset/analysis/numerical_features.py index ff6e2a6e..234a1af9 100644 --- a/ceruleo/dataset/analysis/numerical_features.py +++ b/ceruleo/dataset/analysis/numerical_features.py @@ -12,6 +12,8 @@ from ceruleo.dataset.transformed import TransformedDataset from ceruleo.dataset.ts_dataset import AbstractPDMDataset from ceruleo.dataset.utils import iterate_over_features_and_target +import pandas as pd + class MetricType(str, Enum): @@ -263,3 +265,47 @@ def analysis( analysis_single_cycle(X, y, data_per_cycle, column_names, what_to_compute) return merge_cycle_analysis(data_per_cycle) + + +def analysis_dataframe( + dataset: Union[TransformedDataset, AbstractPDMDataset], + *, + show_progress: bool = False, + what_to_compute: List[str] = [], +) -> pd.DataFrame: + """ + Compute analysis of numerical features + + Parameters: + dataset: A transformed dataset with features and target + show_progress: Wether to show the progress when computing the features + what_to_compute: Elements available to compute: + + - std + - Correlation + - Autocorrelation + - Monotonicity + - Number of unique elements + - Mutual information + - Null + - Entropy + + + Returns: + NumericalFeaturesAnalysis + """ + rr = analysis(dataset, show_progress=show_progress, what_to_compute=what_to_compute) + out = {} + for k in rr.keys(): + for metric in rr[k].metric.keys(): + if (metric, "mean") not in out: + out[(metric.value, "mean")] = [] + out[(metric.value, "std")] = [] + out[(metric.value, "max")] = [] + out[(metric.value, "min")] = [] + 
out[(metric.value, "mean")].append(rr[k].metric[metric].mean) + out[(metric.value, "std")].append(rr[k].metric[metric].std) + out[(metric.value, "max")].append(rr[k].metric[metric].max) + out[(metric.value, "min")].append(rr[k].metric[metric].min) + + return pd.DataFrame(out, index=rr.keys()).sort_values(by=("null", "mean"), ascending=False) \ No newline at end of file