Skip to content

Commit

Permalink
probabilistic support
Browse files Browse the repository at this point in the history
  • Loading branch information
dennisbader committed Nov 18, 2023
1 parent 8326e60 commit 68cfa01
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 49 deletions.
77 changes: 30 additions & 47 deletions darts/dataprocessing/transformers/midas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Mixed-data sampling (MIDAS) Transformer
------------------
---------------------------------------
"""
from typing import Any, Dict, List, Mapping, Optional, Sequence, Union

Expand All @@ -12,7 +12,7 @@
FittableDataTransformer,
InvertibleDataTransformer,
)
from darts.logging import get_logger, raise_if, raise_if_not, raise_log
from darts.logging import get_logger, raise_log
from darts.timeseries import _finite_rows_boundaries
from darts.utils.timeseries_generation import generate_index

Expand Down Expand Up @@ -75,14 +75,12 @@ def __init__(
name='Month', freq='MS')
>>> print(monthly_series.values()[:4])
[[112.], [118.], [132.], [129.]]
>>> midas = MIDAS(low_freq="QS")
>>> quarterly_series = midas.fit_transform(monthly_series)
>>> print(quarterly_series.time_index[:3])
DatetimeIndex(['1949-01-01', '1949-04-01', '1949-07-01'], dtype='datetime64[ns]', name='Month', freq='QS-JAN')
>>> print(quarterly_series.values()[:3])
[[112. 118. 132.], [129. 121. 135.], [148. 148. 136.]]
>>> inversed_quaterly = midas.inverse_transform(quarterly_series)
>>> print(inversed_quaterly.time_index[:4])
DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01'], dtype='datetime64[ns]',
Expand Down Expand Up @@ -252,9 +250,9 @@ def up_sample(low_df: pd.DataFrame, high_period):
arr_out.fill(np.nan)

arr = np.lib.stride_tricks.sliding_window_view(
arr, window_shape=max_size, axis=0
arr, window_shape=(max_size, n_cols_in, n_samples)
)
arr = arr.reshape((len(arr), n_cols_out, n_samples))
arr = arr.reshape((-1, n_cols_out, n_samples))

# the first resampled index might not have all dates from higher freq
size_group_first = group_sizes.iloc[0]
Expand Down Expand Up @@ -290,14 +288,11 @@ def up_sample(low_df: pd.DataFrame, high_period):

arr = arr_out

# TODO: remove this
arr = np.concatenate([arr[:, i::max_size] for i in range(max_size)], axis=1)

ts = MIDAS._create_midas_df(
series=series,
arr=arr,
time_index=time_index,
n_midas_cols=max_size,
n_midas=max_size,
drop_static_covariates=drop_static_covariates,
inverse_transform=False,
feature_sep=feature_sep,
Expand Down Expand Up @@ -339,18 +334,17 @@ def ts_inverse_transform(
series_n_components = series.n_components

n_orig_components = series_n_components // n_midas_components
# original ts was univariate
if n_orig_components == 1:
series_values = series.values(copy=False).flatten()
else:
series_values = series.values(copy=False).reshape((-1, n_orig_components))

if len(series) == 0:
# placeholders for empty series
start_time = pd.Timestamp("2020-01-01")
shift = 0
series_values = np.empty((0, n_orig_components))
series_values = np.empty((0, n_orig_components, series.n_samples))
else:
series_values = series.all_values(copy=False).reshape(
-1, n_orig_components, series.n_samples
)

# remove the rows containing only NaNs at the extremities of the array, necessary to adjust the time index
first_finite_row, last_finite_row = _finite_rows_boundaries(
series_values, how="all"
Expand Down Expand Up @@ -384,7 +378,7 @@ def ts_inverse_transform(
series=series,
arr=series_values,
time_index=time_index,
n_midas_cols=n_midas_components,
n_midas=n_midas_components,
drop_static_covariates=drop_static_covariates,
inverse_transform=True,
feature_sep=feature_sep,
Expand All @@ -398,17 +392,11 @@ def _verify_series(
low_freq: Optional[str] = None,
):
"""Some sanity checks on the input, the high_freq and low_freq arguments are mutually exclusive"""
raise_if(
series.is_probabilistic,
"MIDAS Transformer cannot be applied to probabilistic/stochastic TimeSeries",
logger,
)

raise_if_not(
isinstance(series.time_index, pd.DatetimeIndex),
"MIDAS input series must have a pd.Datetime index",
logger,
)
if not isinstance(series.time_index, pd.DatetimeIndex):
raise_log(
ValueError("MIDAS input series must have a pd.Datetime index"),
logger,
)

series_freq_str = series.freq_str
input_freq = [series_freq_str]
Expand All @@ -435,26 +423,26 @@ def _verify_series(

@staticmethod
def _process_static_covariates(
static_covariates: Union[None, pd.Series, pd.DataFrame],
index_or_multiple: int,
series: TimeSeries,
n_midas: int,
drop_static_covariates: bool,
inverse_transform: bool,
) -> Optional[Union[pd.Series, pd.DataFrame]]:
"""If static covariates are component-specific, they must be reshaped appropriately.
`index_or_multiple` has a different meaning depending on the transformation:
- transform : multiple, to repeat the static covariates for the new components
- inverse_transform : index, to remove the duplicated static covariates
"""
If static covariates are component-specific, they must be reshaped appropriately.
"""
static_covariates = series.static_covariates
if drop_static_covariates:
return None
elif (
static_covariates is not None
and static_covariates.index.name == "component"
):
if inverse_transform:
return static_covariates[:index_or_multiple]
cols_orig = series.n_components // n_midas
return static_covariates[:cols_orig]
else:
return pd.concat([static_covariates] * index_or_multiple)
return pd.concat([static_covariates] * n_midas)
else:
return static_covariates

Expand All @@ -463,7 +451,7 @@ def _create_midas_df(
series: TimeSeries,
arr: np.ndarray,
time_index: Union[pd.DatetimeIndex, pd.RangeIndex],
n_midas_cols: int,
n_midas: int,
drop_static_covariates: bool,
inverse_transform: bool,
feature_sep: str,
Expand All @@ -472,23 +460,18 @@ def _create_midas_df(
Function creating the lower frequency dataframe out of a higher frequency dataframe.
"""
if not inverse_transform:
# TODO: revert this to [f"{col}_{i}" for col in series.columns for i in range(n_midas_cols)]
index_or_multiple = n_midas_cols
cols = [
f"{col}{feature_sep}{i}"
for i in range(n_midas_cols)
for i in range(n_midas)
for col in series.columns
]
else:
index_or_multiple = series.n_components // n_midas_cols
cols = [
feature_sep.join(series.components[i].split(feature_sep)[:-1])
for i in range(index_or_multiple)
]
cols_orig = series.n_components // n_midas
cols = series.components[:cols_orig].str.split(feature_sep).str[0].tolist()

static_covariates = MIDAS._process_static_covariates(
static_covariates=series.static_covariates,
index_or_multiple=index_or_multiple,
series=series,
n_midas=n_midas,
drop_static_covariates=drop_static_covariates,
inverse_transform=inverse_transform,
)
Expand Down
137 changes: 137 additions & 0 deletions darts/tests/dataprocessing/transformers/test_midas.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,86 @@ def test_not_complete_monthly_to_quarterly(self):
== inversed_quarterly_not_complete_ts_midas
)

def test_probabilistic_complete_monthly_to_quarterly(self):
    """
    MIDAS round trip on a probabilistic monthly series whose months line up with full quarters.
    """

    def as_probabilistic(deterministic_ts):
        # build 3 samples by offsetting the original values by 0.00, 0.01 and 0.02,
        # concatenated along axis 2 of the value array
        base_vals = deterministic_ts.all_values(copy=False)
        sample_vals = [base_vals + k * 0.01 for k in range(3)]
        return TimeSeries.from_times_and_values(
            times=deterministic_ts.time_index,
            values=np.concatenate(sample_vals, axis=2),
            columns=deterministic_ts.columns.tolist(),
        )

    # probabilistic input (monthly) and expected output (quarterly)
    monthly_prob = as_probabilistic(self.monthly_ts)
    quarterly_prob = as_probabilistic(self.quarterly_ts)

    # transform to quarter start
    transformer = MIDAS(low_freq="QS")
    transformed = transformer.fit_transform(monthly_prob)
    assert transformed == quarterly_prob

    # the inverse transformation must give back the monthly input
    restored = transformer.inverse_transform(transformed)
    assert restored == monthly_prob

def test_probabilistic_not_complete_monthly_to_quarterly(self):
    """
    MIDAS round trip on a probabilistic monthly series whose months do not line up with full quarters.
    """

    def as_probabilistic(deterministic_ts):
        # build 3 samples by offsetting the original values by 0.00, 0.01 and 0.02,
        # concatenated along axis 2 of the value array
        base_vals = deterministic_ts.all_values(copy=False)
        sample_vals = [base_vals + k * 0.01 for k in range(3)]
        return TimeSeries.from_times_and_values(
            times=deterministic_ts.time_index,
            values=np.concatenate(sample_vals, axis=2),
            columns=deterministic_ts.columns.tolist(),
        )

    # probabilistic input (monthly) and expected output (quarterly)
    monthly_prob = as_probabilistic(self.monthly_not_complete_ts)
    quarterly_prob = as_probabilistic(self.quarterly_not_complete_ts)

    # without stripping, the quarters with missing months are kept
    transformer = MIDAS(low_freq="QS", strip=False)
    transformed = transformer.fit_transform(monthly_prob)
    assert transformed == quarterly_prob

    restored = transformer.inverse_transform(transformed)
    assert restored == monthly_prob

    # with strip=True, only the one quarter that has all 3 months remains
    transformer = MIDAS(low_freq="QS", strip=True)
    transformed = transformer.fit_transform(monthly_prob)
    assert transformed == quarterly_prob[1:2]

    restored = transformer.inverse_transform(transformed)
    assert monthly_prob[1:4] == restored

def test_multivariate_monthly_to_quarterly(self):
"""
Check that multivariate monthly to quarterly is properly transformed
Expand Down Expand Up @@ -133,6 +213,63 @@ def test_multivariate_monthly_to_quarterly(self):
)
assert stacked_monthly_ts == multivar_inversed_quarterly_ts_midas

def test_probabilistic_multivariate_monthly_to_quarterly(self):
    """
    MIDAS round trip (monthly to quarterly and back) on a probabilistic series with two components.
    """

    def as_probabilistic(deterministic_ts):
        # build 3 samples by offsetting the original values by 0.00, 0.01 and 0.02,
        # concatenated along axis 2 of the value array
        base_vals = deterministic_ts.all_values(copy=False)
        sample_vals = [base_vals + k * 0.01 for k in range(3)]
        return TimeSeries.from_times_and_values(
            times=deterministic_ts.time_index,
            values=np.concatenate(sample_vals, axis=2),
            columns=deterministic_ts.columns.tolist(),
        )

    # add a second component to the monthly fixture, then make it probabilistic
    second_component = TimeSeries.from_times_and_values(
        times=self.monthly_ts.time_index,
        values=np.arange(10, 19),
        columns=["other"],
    )
    monthly_multivar = self.monthly_ts.stack(second_component)
    monthly_prob = as_probabilistic(monthly_multivar)

    # expected result: the two components alternate within each row
    expected_columns = [
        "values_midas_0",
        "other_midas_0",
        "values_midas_1",
        "other_midas_1",
        "values_midas_2",
        "other_midas_2",
    ]
    expected_values = np.array(
        [[1, 10, 2, 11, 3, 12], [4, 13, 5, 14, 6, 15], [7, 16, 8, 17, 9, 18]]
    )
    quarterly_multivar = TimeSeries.from_times_and_values(
        times=self.quarterly_ts.time_index,
        values=expected_values,
        columns=expected_columns,
    )
    quarterly_prob = as_probabilistic(quarterly_multivar)

    transformer = MIDAS(low_freq="QS")
    transformed = transformer.fit_transform(monthly_prob)
    assert transformed == quarterly_prob

    # the inverse transformation must give back the monthly input
    restored = transformer.inverse_transform(transformed)
    assert monthly_prob == restored

def test_ts_with_missing_data(self):
"""
Check that multivariate monthly to quarterly with missing data in the middle is properly transformed.
Expand Down
9 changes: 7 additions & 2 deletions darts/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5299,18 +5299,23 @@ def _finite_rows_boundaries(
Parameters
----------
values
1D or 2D numpy array where the first dimension corresponds to entries/rows, and the second to components/columns
1D, 2D or 3D numpy array where the first dimension corresponds to entries/rows, the second to components/
columns, and the third (if present) to samples
how
Define if the entries containing `NaN` in all the components ('all') or in any of the components ('any')
should be stripped. Default: 'all'
"""
dims = values.shape

raise_if(
len(dims) > 2, f"Expected 1D or 2D array, received {len(dims)}D array", logger
len(dims) > 3, f"Expected 1D to 3D array, received {len(dims)}D array", logger
)

finite_rows = ~np.isnan(values)

if len(dims) == 3:
finite_rows = finite_rows.all(axis=2)

if len(dims) > 1 and dims[1] > 1:
if how == "any":
finite_rows = finite_rows.all(axis=1)
Expand Down

0 comments on commit 68cfa01

Please sign in to comment.