Analysis: Improve Data analysis notebook
lucianolorenti committed Jun 16, 2024
Merge commit 12d1192 (2 parents: 4a21890 + 5f04272)
Showing 11 changed files with 2,365 additions and 591 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 3.0.4
+current_version = 3.0.5
commit = True
tag = True

2 changes: 1 addition & 1 deletion ceruleo/__init__.py
@@ -9,4 +9,4 @@
CACHE_PATH.mkdir(parents=True, exist_ok=True)


__version__ = "3.0.4"
__version__ = "3.0.5"
4 changes: 3 additions & 1 deletion ceruleo/dataset/analysis/correlation.py
@@ -6,6 +6,8 @@
from ceruleo.dataset.utils import iterate_over_features
from pydantic import BaseModel

+from ceruleo.utils import pydantic_to_dict


class CorrelationAnalysisElement(BaseModel):
mean_correlation: float
@@ -31,7 +33,7 @@ def get(self, feature_1: str, feature_2: str) -> CorrelationAnalysisElement:
def to_pandas(self) -> pd.DataFrame:
return (
pd.DataFrame.from_dict(
-            {(k[0], k[1]): v.model_dump() for k, v in self.data.items()},
+            {(k[0], k[1]): pydantic_to_dict(v) for k, v in self.data.items()},
orient="index",
)
.reset_index()
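Both this file and sample_rate.py below swap direct `model_dump()` calls for a shared `pydantic_to_dict` helper imported from `ceruleo.utils`. The helper's body is not part of this diff; the following is a minimal sketch of what such a shim plausibly looks like, assuming its purpose is to serialize a model regardless of the installed pydantic major version.

# Hypothetical sketch only: the real ceruleo/utils.py implementation is not shown in this commit.
from typing import Any, Dict
from pydantic import BaseModel

def pydantic_to_dict(model: BaseModel) -> Dict[str, Any]:
    # pydantic v2 exposes model_dump(); v1 only has dict().
    if hasattr(model, "model_dump"):
        return model.model_dump()
    return model.dict()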
110 changes: 92 additions & 18 deletions ceruleo/dataset/analysis/numerical_features.py
@@ -1,6 +1,5 @@

from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union

import antropy as ant
import numpy as np
@@ -12,6 +11,7 @@
from ceruleo.dataset.transformed import TransformedDataset
from ceruleo.dataset.ts_dataset import AbstractPDMDataset
from ceruleo.dataset.utils import iterate_over_features_and_target
+import pandas as pd


class MetricType(str, Enum):
@@ -29,7 +29,7 @@ def from_str(s: str) -> "MetricType":
return MetricType(s)


-class MetricValues(BaseModel):
+class MetricValuesSummary(BaseModel):
mean: float
std: float
max: float
@@ -38,7 +38,28 @@

class NumericalFeaturesAnalysis(BaseModel):
feature: str
-    metric: Dict[MetricType, MetricValues]
+    metric: Dict[MetricType, List[float]]

    def summarize(self) -> Dict[MetricType, MetricValuesSummary]:
        out = {}
        for metric in self.metric.keys():
            mean = np.nanmean(self.metric[metric])
            std = np.nanstd(self.metric[metric])
            max_ = np.nanmax(self.metric[metric])
            min_ = np.nanmin(self.metric[metric])
            out[metric] = MetricValuesSummary(mean=mean, std=std, max=max_, min=min_)
        return out

    def __getitem__(self, key: str) -> List[float]:
        return self.metric[MetricType.from_str(key)]

    def _repr_html_(self) -> str:
        out = "<table>"
        out += "<tr><th>Metric</th><th>Mean</th><th>Std</th><th>Max</th><th>Min</th></tr>"
        for metric, summary in self.summarize().items():
            out += f"<tr><td>{metric}</td><td>{summary.mean}</td><td>{summary.std}</td><td>{summary.max}</td><td>{summary.min}</td></tr>"
        out += "</table>"
        return out
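The key change in this class: `metric` now stores the raw per-cycle values instead of pre-aggregated `MetricValues`, and aggregation becomes an explicit `summarize()` call. A sketch of how the new API reads; the feature name and values are invented for illustration, and "std" is assumed to be a valid MetricType value since it appears in the metrics dict below.

# Hypothetical usage of the refactored model; values are made up.
analysis = NumericalFeaturesAnalysis(
    feature="temperature",
    metric={MetricType.from_str("std"): [0.5, 0.7, 0.6]},  # one value per run-to-failure cycle
)
summary = analysis.summarize()[MetricType.from_str("std")]
print(summary.mean, summary.max)  # ~0.6 0.7, aggregated across cycles
raw = analysis["std"]             # the per-cycle list, via __getitem__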


def entropy(s: np.ndarray) -> float:
@@ -120,15 +141,15 @@ def n_unique(s: np.ndarray) -> int:

def null(s: np.ndarray) -> float:
"""
Null proportion for a given feature
Null percentage for a given feature
Parameters:
s: A feature
Returns:
Null proportion
Null percentage
"""
return np.mean(~np.isfinite(s))
return np.mean(~np.isfinite(s)) * 100
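Note this is a behavioural change, not just a docstring fix: `null` now returns a value in [0, 100] rather than [0, 1], and NaN and inf both count as non-finite. A quick worked check:

import numpy as np

s = np.array([1.0, np.nan, 2.0, np.inf])
# ~np.isfinite(s) -> [False, True, False, True]; mean = 0.5; * 100 -> 50.0
print(np.mean(~np.isfinite(s)) * 100)  # 50.0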


def mutual_information(x: np.ndarray, y: np.ndarray) -> float:
@@ -151,17 +172,19 @@ def mutual_information(x: np.ndarray, y: np.ndarray) -> float:

metrics = {
    "std": lambda x, y: np.std(x),
-    "correlation": lambda x, y: correlation(x, y),
    "autocorrelation": lambda x, y: autocorrelation(x),
    "monotonicity": lambda x, y: monotonicity(x),
    "number_of_unique_elements": lambda x, y: n_unique(x),
-    "mutual_information": mutual_information,
    "null": lambda x, y: null(x),
    "entropy": lambda x, y: entropy(x),
+    "mutual_information": mutual_information,
+    "correlation": lambda x, y: correlation(x, y),
}
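The reordering above drops no metrics; every entry keeps the uniform `(x, y)` signature so the cycle loop can dispatch them interchangeably, even though only `correlation` and `mutual_information` actually use the target. A small dispatch sketch with invented arrays:

import numpy as np

x = np.linspace(0.0, 1.0, 100)    # a feature over one cycle
y = np.linspace(100.0, 0.0, 100)  # the RUL target, ignored by these two metrics
for name in ("std", "null"):
    print(name, metrics[name](x, y))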


-def analysis_single_cycle(
+def analyze_single_cycle(
X: np.ndarray,
y: np.ndarray,
out: Dict[str, Dict[MetricType, List[float]]],
@@ -206,16 +229,11 @@ def merge_cycle_analysis(
for column_name in data.keys():
for what in data[column_name]:
metric_type = MetricType.from_str(what)
-            out[column_name].metric[metric_type] = MetricValues(
-                mean=np.nanmean(data[column_name][what]),
-                std=np.nanstd(data[column_name][what]),
-                max=np.nanmax(data[column_name][what]),
-                min=np.nanmin(data[column_name][what]),
-            )
+            out[column_name].metric[metric_type] = data[column_name][what]
return out


-def analysis(
+def analyze(
dataset: Union[TransformedDataset, AbstractPDMDataset],
*,
show_progress: bool = False,
@@ -260,6 +278,62 @@ def analysis(
}
for X, y in iterate_over_features_and_target(dataset):
y = np.squeeze(y)
-    analysis_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)
+    analyze_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)

return merge_cycle_analysis(data_per_cycle)


def analyze_as_dataframe(
    dataset: Union[TransformedDataset, AbstractPDMDataset],
    *,
    show_progress: bool = False,
    what_to_compute: List[str] = [],
) -> pd.DataFrame:
    """
    Compute analysis of numerical features

    Parameters:
        dataset: A transformed dataset with features and target
        show_progress: Whether to show the progress when computing the features
        what_to_compute: Elements available to compute:

            - std
            - Correlation
            - Autocorrelation
            - Monotonicity
            - Number of unique elements
            - Mutual information
            - Null
            - Entropy

    Returns:
        pd.DataFrame: one row per feature, with a (metric, statistic) column MultiIndex
    """
    rr = analyze(dataset, show_progress=show_progress, what_to_compute=what_to_compute)

    out: Dict[Tuple[str, str], List[float]] = {}

    for k, metrics in rr.items():
        metrics_summary = metrics.summarize()
        for metric_name, metric_values in metrics_summary.items():
            key_mean = (metric_name.value, "Mean value across the cycles")
            key_std = (metric_name.value, "Standard deviation across the cycles")
            key_max = (metric_name.value, "Maximum value found in a cycle")
            key_min = (metric_name.value, "Minimum value found in a cycle")

            if key_mean not in out:
                out[key_mean] = []
                out[key_std] = []
                out[key_max] = []
                out[key_min] = []

            out[key_mean].append(metric_values.mean)
            out[key_std].append(metric_values.std)
            out[key_max].append(metric_values.max)
            out[key_min].append(metric_values.min)

    return pd.DataFrame(out, index=rr.keys())
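A sketch of the intended call; the dataset construction is elided since it depends on the concrete `AbstractPDMDataset` subclass in use, and the lowercase metric names are assumed to match the keys of the `metrics` dict above.

# Hypothetical usage; `dataset` is any AbstractPDMDataset or
# TransformedDataset instance built elsewhere.
df = analyze_as_dataframe(
    dataset,
    show_progress=True,
    what_to_compute=["std", "null", "monotonicity"],
)
# One row per feature; columns form a (metric, statistic) MultiIndex,
# e.g. ("std", "Mean value across the cycles").
print(df.head())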
29 changes: 22 additions & 7 deletions ceruleo/dataset/analysis/sample_rate.py
@@ -4,19 +4,32 @@
import numpy as np
import pandas as pd
from pydantic import BaseModel

+from typing import List
from ceruleo.dataset.ts_dataset import AbstractPDMDataset
+from ceruleo.utils import pydantic_to_dict

logger = logging.getLogger(__name__)


class SampleRateAnalysis(BaseModel):
-    mode: float
+    median: float
mean: float
std: float
unit: str

def to_pandas(self) -> pd.Series:
-        return pd.Series(self.model_dump()).to_frame().T
+        return pd.Series(pydantic_to_dict(self)).to_frame().T

    def __repr__(self) -> str:
        return f"Median: {self.median} | {self.mean} +- {self.std} [{self.unit}]"

    def _repr_html_(self) -> str:
        return f"""<div>
        <p> <span style="font-weight:bold"> Median: </span> {self.median} [{self.unit}] </p>
        <p> <span style="font-weight:bold"> Mean +- Std: </span> {self.mean:.3f} +- {self.std:.3f} [{self.unit}] </p>
        </div>
        """


def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
@@ -32,9 +45,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
Array of time differences
"""
-    time_diff = []
+    time_diff: List[float] = []
for life in ds:
diff = np.diff(life.index.values)
+        diff = diff[diff <= np.median(diff)]
if pd.api.types.is_timedelta64_ns_dtype(diff.dtype):
diff = diff / np.timedelta64(1, unit)
time_diff.extend(diff)
Expand All @@ -43,10 +57,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:


def sample_rate_summary(
-    ds: AbstractPDMDataset, unit: Optional[str] = "s"
+    ds: AbstractPDMDataset, unit: str = "s"
) -> SampleRateAnalysis:
"""
-    Obtain the mean, mode and standard deviation of the sample rate of the dataset
+    Obtain the mean, median and standard deviation of the sample rate of the dataset
Parameters:
ds: The dataset
@@ -59,5 +73,6 @@ def sample_rate_summary(
return SampleRateAnalysis(
mean=np.mean(sr),
std=np.std(sr),
-        mode=pd.Series(sr).mode().values[0],
+        median=np.median(sr),
unit=unit
)
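The summary object now reports the median in place of the mode, and the new filtering line in `sample_rate` discards time gaps above each life's median before aggregating. Usage is unchanged; a sketch, again assuming an `AbstractPDMDataset` instance named `dataset`:

summary = sample_rate_summary(dataset, unit="s")
print(summary)       # Median: ... | mean +- std [s]
summary.to_pandas()  # one-row DataFrame built via pydantic_to_dict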
18 changes: 8 additions & 10 deletions ceruleo/dataset/ts_dataset.py
@@ -65,14 +65,6 @@ def number_of_samples_of_time_series(self, i: int) -> int:
def rul_column(self) -> str:
raise NotImplementedError

-    def duration(self, life: pd.DataFrame) -> float:
-        return life[self.rul_column].max()
-
-    def number_of_samples(self) -> List[int]:
-        return [
-            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
-        ]

def duration(self, life: pd.DataFrame) -> float:
"""Obtain the duration of the time-series
@@ -82,8 +74,14 @@ def duration(self, life: pd.DataFrame) -> float:
Returns:
Duration of the life
"""
-        v = life.index
-        return v.max() - v.min()
+        return life[self.rul_column].max()

+    def number_of_samples(self) -> List[int]:
+        return [
+            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
+        ]



def durations(self, show_progress: bool = False) -> List[float]:
"""
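The net effect of the reshuffle above is a semantic change: `duration` now returns the maximum of the RUL column rather than the span of the index. A tiny illustration of the difference (column name and values invented):

import pandas as pd

life = pd.DataFrame({"RUL": [120.0, 60.0, 0.0]}, index=[10, 20, 30])
print(life["RUL"].max())                    # 120.0 -> what duration() returns now
print(life.index.max() - life.index.min())  # 20    -> what it returned before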
