Analysis: Improve Data analysis notebook
lucianolorenti committed Jun 16, 2024
Merge commit 12d1192 (2 parents: 4a21890 + 5f04272)
Showing 11 changed files with 2,365 additions and 591 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 3.0.4
+current_version = 3.0.5
commit = True
tag = True

2 changes: 1 addition & 1 deletion ceruleo/__init__.py
@@ -9,4 +9,4 @@
CACHE_PATH.mkdir(parents=True, exist_ok=True)


__version__ = "3.0.4"
__version__ = "3.0.5"
4 changes: 3 additions & 1 deletion ceruleo/dataset/analysis/correlation.py
@@ -6,6 +6,8 @@
from ceruleo.dataset.utils import iterate_over_features
from pydantic import BaseModel

+from ceruleo.utils import pydantic_to_dict


class CorrelationAnalysisElement(BaseModel):
mean_correlation: float
@@ -31,7 +33,7 @@ def get(self, feature_1: str, feature_2: str) -> CorrelationAnalysisElement:
def to_pandas(self) -> pd.DataFrame:
return (
pd.DataFrame.from_dict(
-            {(k[0], k[1]): v.model_dump() for k, v in self.data.items()},
+            {(k[0], k[1]): pydantic_to_dict(v) for k, v in self.data.items()},
orient="index",
)
.reset_index()
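Both this file and sample_rate.py below swap direct `model_dump()` calls for a shared `pydantic_to_dict` helper imported from `ceruleo.utils`. The helper's body is not part of this diff; the following is a minimal sketch of what such a shim plausibly looks like, assuming its purpose is to serialize a model regardless of the installed pydantic major version.

# Hypothetical sketch only: the real ceruleo/utils.py implementation is not shown in this commit.
from typing import Any, Dict
from pydantic import BaseModel

def pydantic_to_dict(model: BaseModel) -> Dict[str, Any]:
    # pydantic v2 exposes model_dump(); v1 only has dict().
    if hasattr(model, "model_dump"):
        return model.model_dump()
    return model.dict()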
110 changes: 92 additions & 18 deletions ceruleo/dataset/analysis/numerical_features.py
@@ -1,6 +1,5 @@

from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union

import antropy as ant
import numpy as np
@@ -12,6 +11,7 @@
from ceruleo.dataset.transformed import TransformedDataset
from ceruleo.dataset.ts_dataset import AbstractPDMDataset
from ceruleo.dataset.utils import iterate_over_features_and_target
+import pandas as pd


class MetricType(str, Enum):
@@ -29,7 +29,7 @@ def from_str(s: str) -> "MetricType":
return MetricType(s)


-class MetricValues(BaseModel):
+class MetricValuesSummary(BaseModel):
mean: float
std: float
max: float
@@ -38,7 +38,28 @@

class NumericalFeaturesAnalysis(BaseModel):
feature: str
-    metric: Dict[MetricType, MetricValues]
+    metric: Dict[MetricType, List[float]]

    def summarize(self) -> Dict[MetricType, MetricValuesSummary]:
        out = {}
        for metric in self.metric.keys():
            mean = np.nanmean(self.metric[metric])
            std = np.nanstd(self.metric[metric])
            max_ = np.nanmax(self.metric[metric])
            min_ = np.nanmin(self.metric[metric])
            out[metric] = MetricValuesSummary(mean=mean, std=std, max=max_, min=min_)
        return out

    def __getitem__(self, key: str) -> List[float]:
        return self.metric[MetricType.from_str(key)]

    def _repr_html_(self) -> str:
        out = "<table>"
        out += "<tr><th>Metric</th><th>Mean</th><th>Std</th><th>Max</th><th>Min</th></tr>"
        for metric, summary in self.summarize().items():
            out += f"<tr><td>{metric}</td><td>{summary.mean}</td><td>{summary.std}</td><td>{summary.max}</td><td>{summary.min}</td></tr>"
        out += "</table>"
        return out
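The key change in this class: `metric` now stores the raw per-cycle values instead of pre-aggregated `MetricValues`, and aggregation becomes an explicit `summarize()` call. A sketch of how the new API reads; the feature name and values are invented for illustration, and "std" is assumed to be a valid MetricType value since it appears in the metrics dict below.

# Hypothetical usage of the refactored model; values are made up.
analysis = NumericalFeaturesAnalysis(
    feature="temperature",
    metric={MetricType.from_str("std"): [0.5, 0.7, 0.6]},  # one value per run-to-failure cycle
)
summary = analysis.summarize()[MetricType.from_str("std")]
print(summary.mean, summary.max)  # ~0.6 0.7, aggregated across cycles
raw = analysis["std"]             # the per-cycle list, via __getitem__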


def entropy(s: np.ndarray) -> float:
@@ -120,15 +141,15 @@ def n_unique(s: np.ndarray) -> int:

def null(s: np.ndarray) -> float:
"""
Null proportion for a given feature
Null percentage for a given feature
Parameters:
s: A feature
Returns:
Null proportion
Null percentage
"""
return np.mean(~np.isfinite(s))
return np.mean(~np.isfinite(s)) * 100
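Note this is a behavioural change, not just a docstring fix: `null` now returns a value in [0, 100] rather than [0, 1], and NaN and inf both count as non-finite. A quick worked check:

import numpy as np

s = np.array([1.0, np.nan, 2.0, np.inf])
# ~np.isfinite(s) -> [False, True, False, True]; mean = 0.5; * 100 -> 50.0
print(np.mean(~np.isfinite(s)) * 100)  # 50.0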


def mutual_information(x: np.ndarray, y: np.ndarray) -> float:
@@ -151,17 +172,19 @@ def mutual_information(x: np.ndarray, y: np.ndarray) -> float:

metrics = {
    "std": lambda x, y: np.std(x),
-    "correlation": lambda x, y: correlation(x, y),
    "autocorrelation": lambda x, y: autocorrelation(x),
    "monotonicity": lambda x, y: monotonicity(x),
    "number_of_unique_elements": lambda x, y: n_unique(x),
-    "mutual_information": mutual_information,
    "null": lambda x, y: null(x),
    "entropy": lambda x, y: entropy(x),
+    "mutual_information": mutual_information,
+    "correlation": lambda x, y: correlation(x, y),
}
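The reordering above drops no metrics; every entry keeps the uniform `(x, y)` signature so the cycle loop can dispatch them interchangeably, even though only `correlation` and `mutual_information` actually use the target. A small dispatch sketch with invented arrays:

import numpy as np

x = np.linspace(0.0, 1.0, 100)    # a feature over one cycle
y = np.linspace(100.0, 0.0, 100)  # the RUL target, ignored by these two metrics
for name in ("std", "null"):
    print(name, metrics[name](x, y))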


-def analysis_single_cycle(
+def analyze_single_cycle(
X: np.ndarray,
y: np.ndarray,
out: Dict[str, Dict[MetricType, List[float]]],
@@ -206,16 +229,11 @@ def merge_cycle_analysis(
for column_name in data.keys():
for what in data[column_name]:
metric_type = MetricType.from_str(what)
-            out[column_name].metric[metric_type] = MetricValues(
-                mean=np.nanmean(data[column_name][what]),
-                std=np.nanstd(data[column_name][what]),
-                max=np.nanmax(data[column_name][what]),
-                min=np.nanmin(data[column_name][what]),
-            )
+            out[column_name].metric[metric_type] = data[column_name][what]
return out


-def analysis(
+def analyze(
dataset: Union[TransformedDataset, AbstractPDMDataset],
*,
show_progress: bool = False,
@@ -260,6 +278,62 @@ def analysis(
}
for X, y in iterate_over_features_and_target(dataset):
y = np.squeeze(y)
-    analysis_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)
+    analyze_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)

return merge_cycle_analysis(data_per_cycle)


def analyze_as_dataframe(
    dataset: Union[TransformedDataset, AbstractPDMDataset],
    *,
    show_progress: bool = False,
    what_to_compute: List[str] = [],
) -> pd.DataFrame:
    """
    Compute analysis of numerical features

    Parameters:
        dataset: A transformed dataset with features and target
        show_progress: Whether to show the progress when computing the features
        what_to_compute: Elements available to compute:

            - std
            - Correlation
            - Autocorrelation
            - Monotonicity
            - Number of unique elements
            - Mutual information
            - Null
            - Entropy

    Returns:
        pd.DataFrame: one row per feature, with a (metric, statistic) column MultiIndex
    """
    rr = analyze(dataset, show_progress=show_progress, what_to_compute=what_to_compute)

    out: Dict[Tuple[str, str], List[float]] = {}

    for k, metrics in rr.items():
        metrics_summary = metrics.summarize()
        for metric_name, metric_values in metrics_summary.items():
            key_mean = (metric_name.value, "Mean value across the cycles")
            key_std = (metric_name.value, "Standard deviation across the cycles")
            key_max = (metric_name.value, "Maximum value found in a cycle")
            key_min = (metric_name.value, "Minimum value found in a cycle")

            if key_mean not in out:
                out[key_mean] = []
                out[key_std] = []
                out[key_max] = []
                out[key_min] = []

            out[key_mean].append(metric_values.mean)
            out[key_std].append(metric_values.std)
            out[key_max].append(metric_values.max)
            out[key_min].append(metric_values.min)

    return pd.DataFrame(out, index=rr.keys())
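A sketch of the intended call; the dataset construction is elided since it depends on the concrete `AbstractPDMDataset` subclass in use, and the lowercase metric names are assumed to match the keys of the `metrics` dict above.

# Hypothetical usage; `dataset` is any AbstractPDMDataset or
# TransformedDataset instance built elsewhere.
df = analyze_as_dataframe(
    dataset,
    show_progress=True,
    what_to_compute=["std", "null", "monotonicity"],
)
# One row per feature; columns form a (metric, statistic) MultiIndex,
# e.g. ("std", "Mean value across the cycles").
print(df.head())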
29 changes: 22 additions & 7 deletions ceruleo/dataset/analysis/sample_rate.py
@@ -4,19 +4,32 @@
import numpy as np
import pandas as pd
from pydantic import BaseModel

+from typing import List
from ceruleo.dataset.ts_dataset import AbstractPDMDataset
+from ceruleo.utils import pydantic_to_dict

logger = logging.getLogger(__name__)


class SampleRateAnalysis(BaseModel):
-    mode: float
+    median: float
mean: float
std: float
unit: str

def to_pandas(self) -> pd.Series:
-        return pd.Series(self.model_dump()).to_frame().T
+        return pd.Series(pydantic_to_dict(self)).to_frame().T

    def __repr__(self) -> str:
        return f"Median: {self.median} | {self.mean} +- {self.std} [{self.unit}]"

    def _repr_html_(self) -> str:
        return f"""<div>
        <p> <span style="font-weight:bold"> Median: </span> {self.median} [{self.unit}] </p>
        <p> <span style="font-weight:bold"> Mean +- Std: </span> {self.mean:.3f} +- {self.std:.3f} [{self.unit}] </p>
        </div>
        """


def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
@@ -32,9 +45,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
Array of time differences
"""
-    time_diff = []
+    time_diff: List[float] = []
for life in ds:
diff = np.diff(life.index.values)
+        diff = diff[diff <= np.median(diff)]
if pd.api.types.is_timedelta64_ns_dtype(diff.dtype):
diff = diff / np.timedelta64(1, unit)
time_diff.extend(diff)
Expand All @@ -43,10 +57,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:


def sample_rate_summary(
-    ds: AbstractPDMDataset, unit: Optional[str] = "s"
+    ds: AbstractPDMDataset, unit: str = "s"
) -> SampleRateAnalysis:
"""
-    Obtain the mean, mode and standard deviation of the sample rate of the dataset
+    Obtain the mean, median and standard deviation of the sample rate of the dataset
Parameters:
ds: The dataset
@@ -59,5 +73,6 @@ def sample_rate_summary(
return SampleRateAnalysis(
mean=np.mean(sr),
std=np.std(sr),
-        mode=pd.Series(sr).mode().values[0],
+        median=np.median(sr),
unit=unit
)
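The summary object now reports the median in place of the mode, and the new filtering line in `sample_rate` discards time gaps above each life's median before aggregating. Usage is unchanged; a sketch, again assuming an `AbstractPDMDataset` instance named `dataset`:

summary = sample_rate_summary(dataset, unit="s")
print(summary)       # Median: ... | mean +- std [s]
summary.to_pandas()  # one-row DataFrame built via pydantic_to_dict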
18 changes: 8 additions & 10 deletions ceruleo/dataset/ts_dataset.py
@@ -65,14 +65,6 @@ def number_of_samples_of_time_series(self, i: int) -> int:
def rul_column(self) -> str:
raise NotImplementedError

-    def duration(self, life: pd.DataFrame) -> float:
-        return life[self.rul_column].max()
-
-    def number_of_samples(self) -> List[int]:
-        return [
-            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
-        ]

def duration(self, life: pd.DataFrame) -> float:
"""Obtain the duration of the time-series
@@ -82,8 +74,14 @@ def duration(self, life: pd.DataFrame) -> float:
Returns:
Duration of the life
"""
-        v = life.index
-        return v.max() - v.min()
+        return life[self.rul_column].max()

+    def number_of_samples(self) -> List[int]:
+        return [
+            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
+        ]



def durations(self, show_progress: bool = False) -> List[float]:
"""
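The net effect of the reshuffle above is a semantic change: `duration` now returns the maximum of the RUL column rather than the span of the index. A tiny illustration of the difference (column name and values invented):

import pandas as pd

life = pd.DataFrame({"RUL": [120.0, 60.0, 0.0]}, index=[10, 20, 30])
print(life["RUL"].max())                    # 120.0 -> what duration() returns now
print(life.index.max() - life.index.min())  # 20    -> what it returned before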
