probabilistic support

unit8co · Nov 18, 2023 · 68cfa01 · 68cfa01
1 parent 8326e60
commit 68cfa01
Show file tree

Hide file tree

Showing 3 changed files with 174 additions and 49 deletions.
diff --git a/darts/dataprocessing/transformers/midas.py b/darts/dataprocessing/transformers/midas.py
@@ -1,6 +1,6 @@
 """
 Mixed-data sampling (MIDAS) Transformer
-------------------
+---------------------------------------
 """
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 
@@ -12,7 +12,7 @@
     FittableDataTransformer,
     InvertibleDataTransformer,
 )
-from darts.logging import get_logger, raise_if, raise_if_not, raise_log
+from darts.logging import get_logger, raise_log
 from darts.timeseries import _finite_rows_boundaries
 from darts.utils.timeseries_generation import generate_index
 
@@ -75,14 +75,12 @@ def __init__(
         name='Month', freq='MS')
         >>> print(monthly_series.values()[:4])
         [[112.], [118.], [132.], [129.]]
-
         >>> midas = MIDAS(low_freq="QS")
         >>> quarterly_series = midas.fit_transform(monthly_series)
         >>> print(quarterly_series.time_index[:3])
         DatetimeIndex(['1949-01-01', '1949-04-01', '1949-07-01'], dtype='datetime64[ns]', name='Month', freq='QS-JAN')
         >>> print(quarterly_series.values()[:3])
         [[112. 118. 132.], [129. 121. 135.], [148. 148. 136.]]
-
         >>> inversed_quaterly = midas.inverse_transform(quarterly_series)
         >>> print(inversed_quaterly.time_index[:4])
         DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01'], dtype='datetime64[ns]',
@@ -252,9 +250,9 @@ def up_sample(low_df: pd.DataFrame, high_period):
             arr_out.fill(np.nan)
 
             arr = np.lib.stride_tricks.sliding_window_view(
-                arr, window_shape=max_size, axis=0
+                arr, window_shape=(max_size, n_cols_in, n_samples)
             )
-            arr = arr.reshape((len(arr), n_cols_out, n_samples))
+            arr = arr.reshape((-1, n_cols_out, n_samples))
 
             # the first resampled index might not have all dates from higher freq
             size_group_first = group_sizes.iloc[0]
@@ -290,14 +288,11 @@ def up_sample(low_df: pd.DataFrame, high_period):
 
             arr = arr_out
 
-            # TODO: remove this
-            arr = np.concatenate([arr[:, i::max_size] for i in range(max_size)], axis=1)
-
         ts = MIDAS._create_midas_df(
             series=series,
             arr=arr,
             time_index=time_index,
-            n_midas_cols=max_size,
+            n_midas=max_size,
             drop_static_covariates=drop_static_covariates,
             inverse_transform=False,
             feature_sep=feature_sep,
@@ -339,18 +334,17 @@ def ts_inverse_transform(
         series_n_components = series.n_components
 
         n_orig_components = series_n_components // n_midas_components
-        # original ts was univariate
-        if n_orig_components == 1:
-            series_values = series.values(copy=False).flatten()
-        else:
-            series_values = series.values(copy=False).reshape((-1, n_orig_components))
 
         if len(series) == 0:
             # placeholders for empty series
             start_time = pd.Timestamp("2020-01-01")
             shift = 0
-            series_values = np.empty((0, n_orig_components))
+            series_values = np.empty((0, n_orig_components, series.n_samples))
         else:
+            series_values = series.all_values(copy=False).reshape(
+                -1, n_orig_components, series.n_samples
+            )
+
             # remove the rows containing only NaNs at the extremities of the array, necessary to adjust the time index
             first_finite_row, last_finite_row = _finite_rows_boundaries(
                 series_values, how="all"
@@ -384,7 +378,7 @@ def ts_inverse_transform(
             series=series,
             arr=series_values,
             time_index=time_index,
-            n_midas_cols=n_midas_components,
+            n_midas=n_midas_components,
             drop_static_covariates=drop_static_covariates,
             inverse_transform=True,
             feature_sep=feature_sep,
@@ -398,17 +392,11 @@ def _verify_series(
         low_freq: Optional[str] = None,
     ):
         """Some sanity checks on the input, the high_freq and low_freq arguments are mutually exclusive"""
-        raise_if(
-            series.is_probabilistic,
-            "MIDAS Transformer cannot be applied to probabilistic/stochastic TimeSeries",
-            logger,
-        )
-
-        raise_if_not(
-            isinstance(series.time_index, pd.DatetimeIndex),
-            "MIDAS input series must have a pd.Datetime index",
-            logger,
-        )
+        if not isinstance(series.time_index, pd.DatetimeIndex):
+            raise_log(
+                ValueError("MIDAS input series must have a pd.Datetime index"),
+                logger,
+            )
 
         series_freq_str = series.freq_str
         input_freq = [series_freq_str]
@@ -435,26 +423,26 @@ def _verify_series(
 
     @staticmethod
     def _process_static_covariates(
-        static_covariates: Union[None, pd.Series, pd.DataFrame],
-        index_or_multiple: int,
+        series: TimeSeries,
+        n_midas: int,
         drop_static_covariates: bool,
         inverse_transform: bool,
     ) -> Optional[Union[pd.Series, pd.DataFrame]]:
-        """If static covariates are component-specific, they must be reshaped appropriately.
-        `index_or_multiple` has a different meaning depending on the transformation:
-        - transform : multiple, to repeat the static covariates for the new components
-        - inverse_transform : index, to remove the duplciated static covariates
         """
+        If static covariates are component-specific, they must be reshaped appropriately.
+        """
+        static_covariates = series.static_covariates
         if drop_static_covariates:
             return None
         elif (
             static_covariates is not None
             and static_covariates.index.name == "component"
         ):
             if inverse_transform:
-                return static_covariates[:index_or_multiple]
+                cols_orig = series.n_components // n_midas
+                return static_covariates[:cols_orig]
             else:
-                return pd.concat([static_covariates] * index_or_multiple)
+                return pd.concat([static_covariates] * n_midas)
         else:
             return static_covariates
 
@@ -463,7 +451,7 @@ def _create_midas_df(
         series: TimeSeries,
         arr: np.ndarray,
         time_index: Union[pd.DatetimeIndex, pd.RangeIndex],
-        n_midas_cols: int,
+        n_midas: int,
         drop_static_covariates: bool,
         inverse_transform: bool,
         feature_sep: str,
@@ -472,23 +460,18 @@ def _create_midas_df(
         Function creating the lower frequency dataframe out of a higher frequency dataframe.
         """
         if not inverse_transform:
-            # TODO: revert this to [f"{col}_{i}" for col in series.columns for i in range(n_midas_cols)]
-            index_or_multiple = n_midas_cols
             cols = [
                 f"{col}{feature_sep}{i}"
-                for i in range(n_midas_cols)
+                for i in range(n_midas)
                 for col in series.columns
             ]
         else:
-            index_or_multiple = series.n_components // n_midas_cols
-            cols = [
-                feature_sep.join(series.components[i].split(feature_sep)[:-1])
-                for i in range(index_or_multiple)
-            ]
+            cols_orig = series.n_components // n_midas
+            cols = series.components[:cols_orig].str.split(feature_sep).str[0].tolist()
 
         static_covariates = MIDAS._process_static_covariates(
-            static_covariates=series.static_covariates,
-            index_or_multiple=index_or_multiple,
+            series=series,
+            n_midas=n_midas,
             drop_static_covariates=drop_static_covariates,
             inverse_transform=inverse_transform,
         )

diff --git a/darts/tests/dataprocessing/transformers/test_midas.py b/darts/tests/dataprocessing/transformers/test_midas.py
@@ -96,6 +96,86 @@ def test_not_complete_monthly_to_quarterly(self):
             == inversed_quarterly_not_complete_ts_midas
         )
 
+    def test_probabilistic_complete_monthly_to_quarterly(self):
+        """
+        Tests MIDAS on a probabilistic monthly series that is aligned with quarters.
+        """
+
+        # generate probabilistic monthly series
+        all_vals_monthly = self.monthly_ts.all_values(copy=False)
+        prob_values_monthly = np.concatenate(
+            [all_vals_monthly + i * 0.01 for i in range(3)], axis=2
+        )
+        ts_prob_monthly = TimeSeries.from_times_and_values(
+            times=self.monthly_ts.time_index,
+            values=prob_values_monthly,
+            columns=self.monthly_ts.columns.tolist(),
+        )
+        # generate probabilistic quarterly series
+        all_vals_quarterly = self.quarterly_ts.all_values(copy=False)
+        prob_values_quarterly = np.concatenate(
+            [all_vals_quarterly + i * 0.01 for i in range(3)], axis=2
+        )
+        ts_prob_quarterly = TimeSeries.from_times_and_values(
+            times=self.quarterly_ts.time_index,
+            values=prob_values_quarterly,
+            columns=self.quarterly_ts.columns.tolist(),
+        )
+
+        # to quarter start
+        midas_1 = MIDAS(low_freq="QS")
+
+        quarterly_ts_midas = midas_1.fit_transform(ts_prob_monthly)
+        assert quarterly_ts_midas == ts_prob_quarterly
+
+        inversed_quarterly_ts_midas = midas_1.inverse_transform(quarterly_ts_midas)
+        assert inversed_quarterly_ts_midas == ts_prob_monthly
+
+    def test_probabilistic_not_complete_monthly_to_quarterly(self):
+        """
+        Tests MIDAS on a probabilistic monthly series that is not aligned with quarters.
+        """
+        # generate probabilistic monthly series
+        all_vals_monthly = self.monthly_not_complete_ts.all_values(copy=False)
+        prob_values_monthly = np.concatenate(
+            [all_vals_monthly + i * 0.01 for i in range(3)], axis=2
+        )
+        ts_prob_monthly = TimeSeries.from_times_and_values(
+            times=self.monthly_not_complete_ts.time_index,
+            values=prob_values_monthly,
+            columns=self.monthly_not_complete_ts.columns.tolist(),
+        )
+        # generate probabilistic quarterly series
+        all_vals_quarterly = self.quarterly_not_complete_ts.all_values(copy=False)
+        prob_values_quarterly = np.concatenate(
+            [all_vals_quarterly + i * 0.01 for i in range(3)], axis=2
+        )
+        ts_prob_quarterly = TimeSeries.from_times_and_values(
+            times=self.quarterly_not_complete_ts.time_index,
+            values=prob_values_quarterly,
+            columns=self.quarterly_not_complete_ts.columns.tolist(),
+        )
+
+        # monthly series with missing values
+        midas = MIDAS(low_freq="QS", strip=False)
+        quarterly_not_complete_ts_midas = midas.fit_transform(ts_prob_monthly)
+        assert quarterly_not_complete_ts_midas == ts_prob_quarterly
+
+        inversed_quarterly_not_complete_ts_midas = midas.inverse_transform(
+            quarterly_not_complete_ts_midas
+        )
+        assert inversed_quarterly_not_complete_ts_midas == ts_prob_monthly
+
+        # when strip=True we only get the one quarter that has all 3 months
+        midas = MIDAS(low_freq="QS", strip=True)
+        quarterly_not_complete_ts_midas = midas.fit_transform(ts_prob_monthly)
+        assert quarterly_not_complete_ts_midas == ts_prob_quarterly[1:2]
+
+        inversed_quarterly_not_complete_ts_midas = midas.inverse_transform(
+            quarterly_not_complete_ts_midas
+        )
+        assert ts_prob_monthly[1:4] == inversed_quarterly_not_complete_ts_midas
+
     def test_multivariate_monthly_to_quarterly(self):
         """
         Check that multivariate monthly to quarterly is properly transformed
@@ -133,6 +213,63 @@ def test_multivariate_monthly_to_quarterly(self):
         )
         assert stacked_monthly_ts == multivar_inversed_quarterly_ts_midas
 
+    def test_probabilistic_multivariate_monthly_to_quarterly(self):
+        """
+        Check that probabilistic multivariate monthly to quarterly is properly transformed
+        """
+        monthly_ts = self.monthly_ts.stack(
+            TimeSeries.from_times_and_values(
+                times=self.monthly_ts.time_index,
+                values=np.arange(10, 19),
+                columns=["other"],
+            )
+        )
+        # generate probabilistic monthly series
+        all_vals_monthly = monthly_ts.all_values(copy=False)
+        prob_values_monthly = np.concatenate(
+            [all_vals_monthly + i * 0.01 for i in range(3)], axis=2
+        )
+        ts_prob_monthly = TimeSeries.from_times_and_values(
+            times=monthly_ts.time_index,
+            values=prob_values_monthly,
+            columns=monthly_ts.columns.tolist(),
+        )
+
+        # the original components alternate within each midas index
+        quarterly_ts = TimeSeries.from_times_and_values(
+            times=self.quarterly_ts.time_index,
+            values=np.array(
+                [[1, 10, 2, 11, 3, 12], [4, 13, 5, 14, 6, 15], [7, 16, 8, 17, 9, 18]]
+            ),
+            columns=[
+                "values_midas_0",
+                "other_midas_0",
+                "values_midas_1",
+                "other_midas_1",
+                "values_midas_2",
+                "other_midas_2",
+            ],
+        )
+        # generate probabilistic quarterly series
+        all_vals_quarterly = quarterly_ts.all_values(copy=False)
+        prob_values_quarterly = np.concatenate(
+            [all_vals_quarterly + i * 0.01 for i in range(3)], axis=2
+        )
+        ts_prob_quarterly = TimeSeries.from_times_and_values(
+            times=quarterly_ts.time_index,
+            values=prob_values_quarterly,
+            columns=quarterly_ts.columns.tolist(),
+        )
+
+        midas_1 = MIDAS(low_freq="QS")
+        multivar_quarterly_ts_midas = midas_1.fit_transform(ts_prob_monthly)
+        assert multivar_quarterly_ts_midas == ts_prob_quarterly
+
+        multivar_inversed_quarterly_ts_midas = midas_1.inverse_transform(
+            multivar_quarterly_ts_midas
+        )
+        assert ts_prob_monthly == multivar_inversed_quarterly_ts_midas
+
     def test_ts_with_missing_data(self):
         """
         Check that multivariate monthly to quarterly with missing data in the middle is properly transformed.

diff --git a/darts/timeseries.py b/darts/timeseries.py
@@ -5299,18 +5299,23 @@ def _finite_rows_boundaries(
     Parameters
     ----------
     values
-        1D or 2D numpy array where the first dimension correspond to entries/rows, and the second to components/columns
+        1D, 2D or 3D numpy array where the first dimension corresponds to entries/rows, the second to components/
+        columns, and the third (if present) to samples
     how
         Define if the entries containing `NaN` in all the components ('all') or in any of the components ('any')
         should be stripped. Default: 'all'
     """
     dims = values.shape
 
     raise_if(
-        len(dims) > 2, f"Expected 1D or 2D array, received {len(dims)}D array", logger
+        len(dims) > 3, f"Expected 1D to 3D array, received {len(dims)}D array", logger
     )
 
     finite_rows = ~np.isnan(values)
+
+    if len(dims) == 3:
+        finite_rows = finite_rows.all(axis=2)
+
     if len(dims) > 1 and dims[1] > 1:
         if how == "any":
             finite_rows = finite_rows.all(axis=1)