diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py
index cd4f32f1e9..b1efa3ea0e 100644
--- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py
+++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py
@@ -72,7 +72,8 @@ def get_feature_times(
         output_chunk_length: Optional[int],
         max_samples_per_ts: Optional[int],
         output_chunk_shift: int,
-    ):
+        stride: int,
+    ) -> pd.Index:
         """
         Helper function that returns the times shared by all specified series that can be used
         to create features and labels. This is performed by using the helper functions
@@ -101,6 +102,9 @@ def get_feature_times(
             future, lags_future
         )
         times = times.intersection(future_times)
+        # Apply stride
+        if stride > 1:
+            times = times[::stride]
         # Take most recent `max_samples_per_ts` samples if requested:
         if (max_samples_per_ts is not None) and (len(times) > max_samples_per_ts):
             times = times[-max_samples_per_ts:]
@@ -433,6 +437,7 @@ def helper_create_expected_lagged_data(
         output_chunk_shift: int,
         multi_models: bool,
         max_samples_per_ts: Optional[int],
+        stride: int,
     ) -> tuple[np.ndarray, np.ndarray, Any]:
         """Helper function to create the X and y arrays by building them block by block (one per covariates)."""
         feats_times = self.get_feature_times(
@@ -445,6 +450,7 @@ def helper_create_expected_lagged_data(
             output_chunk_length,
             max_samples_per_ts,
             output_chunk_shift,
+            stride,
         )
         # Construct `X` by constructing each block, then concatenate these
         # blocks together along component axis:
@@ -487,6 +493,7 @@ def helper_check_lagged_data(
         max_samples_per_ts: Optional[int],
         use_moving_windows: bool,
         concatenate: bool,
+        stride: int,
         **kwargs,
     ):
         """Helper function to call the `create_lagged_training_data()` method with lags argument either in the list
@@ -537,6 +544,7 @@ def helper_check_lagged_data(
             use_moving_windows=use_moving_windows,
             output_chunk_shift=output_chunk_shift,
             concatenate=concatenate,
+            stride=stride,
         )
         # should have the exact same number of indexes
         assert len(times) == len(expected_times_x) == len(expected_times_y)
@@ -642,10 +650,13 @@ def helper_check_lagged_data(
     min_n_ts = 8 + max(output_chunk_shift_combos)
 
     @pytest.mark.parametrize(
-        "series_type",
-        ["datetime", "integer"],
+        "params",
+        product(
+            ["datetime", "integer"],  # series_type
+            [1, 3],  # stride
+        ),
     )
-    def test_lagged_training_data_equal_freq(self, series_type: str):
+    def test_lagged_training_data_equal_freq(self, params):
         """
         Tests that `create_lagged_training_data` produces `X`, `y`, and `times`
         outputs that are consistent with those generated by using the helper
@@ -659,6 +670,7 @@ def test_lagged_training_data_equal_freq(self, series_type: str):
         are of the same frequency, the implementation of the 'moving window'
         method is being tested here.
         """
+        series_type, stride = params
         # Define datetime index timeseries - each has different number of components,
         # different start times, different lengths, and different values, but
         # they're all of the same frequency:
@@ -749,6 +761,7 @@ def test_lagged_training_data_equal_freq(self, series_type: str):
                         output_chunk_shift,
                         multi_models,
                         max_samples_per_ts,
+                        stride,
                     )
                 )
 
@@ -770,6 +783,7 @@ def test_lagged_training_data_equal_freq(self, series_type: str):
                 "max_samples_per_ts": max_samples_per_ts,
                 "use_moving_windows": True,
                 "concatenate": True,
+                "stride": stride,
             }
 
             self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -777,10 +791,13 @@ def test_lagged_training_data_equal_freq(self, series_type: str):
             self.helper_check_lagged_data(convert_lags_to_dict=True, **kwargs)
 
     @pytest.mark.parametrize(
-        "series_type",
-        ["datetime", "integer"],
+        "params",
+        product(
+            ["datetime", "integer"],  # series_type
+            [1, 3],  # stride
+        ),
     )
-    def test_lagged_training_data_unequal_freq(self, series_type):
+    def test_lagged_training_data_unequal_freq(self, params):
         """
         Tests that `create_lagged_training_data` produces `X`, `y`, and `times`
         outputs that are consistent with those generated by using the helper
@@ -794,6 +811,7 @@ def test_lagged_training_data_unequal_freq(self, series_type):
         are *not* of the same frequency, the implementation of the 'time intersection'
         method is being tested here.
         """
+        series_type, stride = params
         # Define range index timeseries - each has different number of components,
         # different start times, different lengths, different values, and different
         # frequencies:
@@ -869,6 +887,7 @@ def test_lagged_training_data_unequal_freq(self, series_type):
                         output_chunk_shift,
                         multi_models,
                         max_samples_per_ts,
+                        stride,
                     )
                 )
 
@@ -890,6 +909,7 @@ def test_lagged_training_data_unequal_freq(self, series_type):
                 "max_samples_per_ts": max_samples_per_ts,
                 "use_moving_windows": False,
                 "concatenate": True,
+                "stride": stride,
             }
 
             self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -901,10 +921,13 @@ def test_lagged_training_data_unequal_freq(self, series_type):
             )
 
     @pytest.mark.parametrize(
-        "series_type",
-        ["datetime", "integer"],
+        "params",
+        product(
+            ["datetime", "integer"],  # series_type
+            [1, 3],  # stride
+        ),
     )
-    def test_lagged_training_data_method_consistency(self, series_type):
+    def test_lagged_training_data_method_consistency(self, params):
         """
         Tests that `create_lagged_training_data` produces the same result
         when `use_moving_windows = False` and when `use_moving_windows = True`
@@ -918,6 +941,7 @@ def test_lagged_training_data_method_consistency(self, series_type):
         # Define datetime index timeseries - each has different number of components,
         # different start times, different lengths, different values, and of
         # different frequencies:
+        series_type, stride = params
         if series_type == "integer":
             target = helper_create_multivariate_linear_timeseries(
                 n_components=2, start_value=0, end_value=10, start=2, length=20, freq=1
@@ -991,6 +1015,7 @@ def test_lagged_training_data_method_consistency(self, series_type):
                 multi_models=multi_models,
                 use_moving_windows=True,
                 output_chunk_shift=output_chunk_shift,
+                stride=stride,
             )
             # Using time intersection method:
             X_ti, y_ti, times_ti, _, _ = create_lagged_training_data(
@@ -1006,6 +1031,7 @@ def test_lagged_training_data_method_consistency(self, series_type):
                 multi_models=multi_models,
                 use_moving_windows=False,
                 output_chunk_shift=output_chunk_shift,
+                stride=stride,
             )
             assert np.allclose(X_mw, X_ti)
             assert np.allclose(y_mw, y_ti)
@@ -1021,6 +1047,7 @@ def test_lagged_training_data_method_consistency(self, series_type):
             [0, 1, 3],
             [False, True],
             ["datetime", "integer"],
+            [1, 3],  # stride
         ),
     )
     def test_lagged_training_data_single_lag_single_component_same_series(self, config):
@@ -1032,7 +1059,7 @@ def test_lagged_training_data_single_lag_single_component_same_series(self, conf
         same time series, and the expected `y` can be formed by taking a single
         slice from the `target`.
         """
-        output_chunk_shift, use_moving_windows, series_type = config
+        output_chunk_shift, use_moving_windows, series_type, stride = config
         if series_type == "integer":
             series = linear_timeseries(start=0, length=15)
         else:
@@ -1069,6 +1096,12 @@ def test_lagged_training_data_single_lag_single_component_same_series(self, conf
         )
         expected_X = np.expand_dims(expected_X, axis=-1)
 
+        if stride > 1:
+            expected_X = expected_X[::stride]
+            expected_y = expected_y[::stride]
+            expected_times_x = expected_times_x[::stride]
+            expected_times_y = expected_times_y[::stride]
+
         kwargs = {
             "expected_X": expected_X,
             "expected_y": expected_y,
@@ -1087,6 +1120,7 @@ def test_lagged_training_data_single_lag_single_component_same_series(self, conf
             "max_samples_per_ts": None,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1196,6 +1230,7 @@ def test_lagged_training_data_extend_past_and_future_covariates(self, config):
             "max_samples_per_ts": max_samples_per_ts,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": 1,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1211,8 +1246,8 @@ def test_lagged_training_data_extend_past_and_future_covariates(self, config):
 
     @pytest.mark.parametrize(
         "config",
-        itertools.product(
-            [0, 1, 3], [False, True], ["datetime", "integer"], [False, True]
+        product(
+            [0, 1, 3], [False, True], ["datetime", "integer"], [False, True], [1, 3]
         ),
     )
     def test_lagged_training_data_single_point(self, config):
@@ -1220,7 +1255,9 @@ def test_lagged_training_data_single_point(self, config):
         Tests that `create_lagged_training_data` correctly handles case
         where only one possible training point can be generated.
         """
-        output_chunk_shift, use_moving_windows, series_type, multi_models = config
+        output_chunk_shift, use_moving_windows, series_type, multi_models, stride = (
+            config
+        )
         # Can only create feature using first value of series (i.e. `0`)
         # and can only create label using last value of series (i.e. `1`)
         if series_type == "integer":
@@ -1244,6 +1281,11 @@ def test_lagged_training_data_single_point(self, config):
             length=1,
             freq=target.freq,
         )
+        if stride > 1:
+            expected_X = expected_X[::stride]
+            expected_y = expected_y[::stride]
+            expected_times = expected_times[::stride]
+
         # Test correctness for 'moving window' and for 'time intersection' methods, as well
         # as for different `multi_models` values:
         kwargs = {
@@ -1264,6 +1306,7 @@ def test_lagged_training_data_single_point(self, config):
             "max_samples_per_ts": None,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1280,7 +1323,7 @@ def test_lagged_training_data_single_point(self, config):
     @pytest.mark.parametrize(
         "config",
         itertools.product(
-            [0, 1, 3], [False, True], ["datetime", "integer"], [False, True]
+            [0, 1, 3], [False, True], ["datetime", "integer"], [False, True], [1, 3]
         ),
     )
     def test_lagged_training_data_zero_lags(self, config):
@@ -1295,7 +1338,9 @@ def test_lagged_training_data_zero_lags(self, config):
         # only possible feature that can be created using these series utilises
         # the value of `future` at the same time as the label (i.e. a lag
         # of `0` away from the only feature time):
-        output_chunk_shift, use_moving_windows, series_type, multi_models = config
+        output_chunk_shift, use_moving_windows, series_type, multi_models, stride = (
+            config
+        )
 
         if series_type == "integer":
             target = linear_timeseries(
@@ -1329,6 +1374,11 @@ def test_lagged_training_data_zero_lags(self, config):
             length=1,
             freq=target.freq,
         )
+        if stride > 1:
+            expected_X = expected_X[::stride]
+            expected_y = expected_y[::stride]
+            expected_times = expected_times[::stride]
+
         # Check correctness for 'moving windows' and 'time intersection' methods, as
         # well as for different `multi_models` values:
         kwargs = {
@@ -1349,6 +1399,7 @@ def test_lagged_training_data_zero_lags(self, config):
             "max_samples_per_ts": None,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1364,13 +1415,14 @@ def test_lagged_training_data_zero_lags(self, config):
 
     @pytest.mark.parametrize(
         "config",
-        itertools.product(
+        product(
             [0, 1, 3],
             [False, True],
             ["datetime", "integer"],
             [False, True],
             [-1, 0, 1],
             [-2, 0, 2],
+            [1, 3],
         ),
     )
     def test_lagged_training_data_no_target_lags_future_covariates(self, config):
@@ -1390,6 +1442,7 @@ def test_lagged_training_data_no_target_lags_future_covariates(self, config):
             multi_models,
             cov_start_shift,
             cov_lag,
+            stride,
         ) = config
 
         # adapt covariate start, length, and target length so that only 1 sample can be extracted
@@ -1429,6 +1482,11 @@ def test_lagged_training_data_no_target_lags_future_covariates(self, config):
             length=1,
             freq=target.freq,
         )
+        if stride > 1:
+            expected_X = expected_X[::stride]
+            expected_y = expected_y[::stride]
+            expected_times = expected_times[::stride]
+
         # Check correctness for 'moving windows' and 'time intersection' methods, as
         # well as for different `multi_models` values:
         kwargs = {
@@ -1449,6 +1507,7 @@ def test_lagged_training_data_no_target_lags_future_covariates(self, config):
             "max_samples_per_ts": None,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1471,6 +1530,7 @@ def test_lagged_training_data_no_target_lags_future_covariates(self, config):
             [False, True],
             [-1, 0],
             [-2, -1],
+            [1, 3],
         ),
     )
     def test_lagged_training_data_no_target_lags_past_covariates(self, config):
@@ -1489,6 +1549,7 @@ def test_lagged_training_data_no_target_lags_past_covariates(self, config):
             multi_models,
             cov_start_shift,
             cov_lag,
+            stride,
         ) = config
 
         # adapt covariate start, length, and target length so that only 1 sample can be extracted
@@ -1528,6 +1589,11 @@ def test_lagged_training_data_no_target_lags_past_covariates(self, config):
             length=1,
             freq=target.freq,
         )
+        if stride > 1:
+            expected_X = expected_X[::stride]
+            expected_y = expected_y[::stride]
+            expected_times = expected_times[::stride]
+
         # Check correctness for 'moving windows' and 'time intersection' methods, as
         # well as for different `multi_models` values:
         kwargs = {
@@ -1548,6 +1614,7 @@ def test_lagged_training_data_no_target_lags_past_covariates(self, config):
             "max_samples_per_ts": None,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1564,7 +1631,7 @@ def test_lagged_training_data_no_target_lags_past_covariates(self, config):
     @pytest.mark.parametrize(
         "config",
         itertools.product(
-            [0, 1, 3], [False, True], ["datetime", "integer"], [False, True]
+            [0, 1, 3], [False, True], ["datetime", "integer"], [False, True], [1, 3]
         ),
     )
     def test_lagged_training_data_positive_lags(self, config):
@@ -1580,7 +1647,9 @@ def test_lagged_training_data_positive_lags(self, config):
         # only possible feature that can be created using these series utilises
         # the value of `future` one timestep after the time of the label (i.e. a lag
         # of `1` away from the only feature time):
-        output_chunk_shift, use_moving_windows, series_type, multi_models = config
+        output_chunk_shift, use_moving_windows, series_type, multi_models, stride = (
+            config
+        )
 
         if series_type == "integer":
             target = linear_timeseries(
@@ -1613,6 +1682,11 @@ def test_lagged_training_data_positive_lags(self, config):
             length=1,
             freq=target.freq,
         )
+        if stride > 1:
+            expected_X = expected_X[::stride]
+            expected_y = expected_y[::stride]
+            expected_times = expected_times[::stride]
+
         # Check correctness for 'moving windows' and 'time intersection' methods, as
         # well as for different `multi_models` values:
         kwargs = {
@@ -1633,6 +1707,7 @@ def test_lagged_training_data_positive_lags(self, config):
             "max_samples_per_ts": None,
             "use_moving_windows": use_moving_windows,
             "concatenate": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(convert_lags_to_dict=False, **kwargs)
@@ -1653,6 +1728,7 @@ def test_lagged_training_data_positive_lags(self, config):
             [1, 2],
             [True, False],
             ["datetime", "integer"],
+            [1, 3],
         ),
     )
     def test_lagged_training_data_comp_wise_lags(self, config):
@@ -1662,7 +1738,9 @@ def test_lagged_training_data_comp_wise_lags(self, config):
 
         Note that this is supported only when use_moving_window=True.
         """
-        output_chunk_shift, output_chunk_length, multi_models, series_type = config
+        output_chunk_shift, output_chunk_length, multi_models, series_type, stride = (
+            config
+        )
 
         lags_tg = {"target_0": [-4, -1], "target_1": [-4, -1]}
         lags_pc = [-3]
@@ -1716,6 +1794,7 @@ def test_lagged_training_data_comp_wise_lags(self, config):
             output_chunk_length,
             None,
             output_chunk_shift,
+            stride=stride,
         )
 
         # reorder the features to obtain target_0_lag-4, target_1_lag-4, target_0_lag-1, target_1_lag-1
@@ -1762,6 +1841,10 @@ def test_lagged_training_data_comp_wise_lags(self, config):
             multi_models,
             output_chunk_shift,
         )[:, :, np.newaxis]
+        # no further striding is needed here: `stride` was already passed to
+        # `get_feature_times` above, so `feats_times` (and the expected arrays
+        # built from it) are already subsampled to every `stride`-th sample;
+        # re-slicing with `[::stride]` would apply the stride twice
 
         # lags are already in dict format
         self.helper_check_lagged_data(
@@ -1783,9 +1866,14 @@ def test_lagged_training_data_comp_wise_lags(self, config):
             max_samples_per_ts=None,
             use_moving_windows=True,
             concatenate=True,
+            stride=stride,
         )
 
-    def test_lagged_training_data_sequence_inputs(self):
+    @pytest.mark.parametrize(
+        "stride",
+        [1, 3],
+    )
+    def test_lagged_training_data_sequence_inputs(self, stride):
         """
         Tests that `create_lagged_training_data` correctly handles being
         passed a sequence of `TimeSeries` inputs, as opposed to individual
@@ -1806,12 +1894,14 @@ def test_lagged_training_data_sequence_inputs(self):
         expected_X_2 = np.concatenate(
             3 * [target_2.all_values(copy=False)[:-1, :, :]], axis=1
         )
-        expected_X = np.concatenate([expected_X_1, expected_X_2], axis=0)
-        expected_y_1 = target_1.all_values(copy=False)[1:, :, :]
-        expected_y_2 = target_2.all_values(copy=False)[1:, :, :]
+        expected_X = np.concatenate(
+            [expected_X_1[::stride], expected_X_2[::stride]], axis=0
+        )
+        expected_y_1 = target_1.all_values(copy=False)[1::stride, :, :]
+        expected_y_2 = target_2.all_values(copy=False)[1::stride, :, :]
         expected_y = np.concatenate([expected_y_1, expected_y_2], axis=0)
-        expected_times_1 = target_1.time_index[1:]
-        expected_times_2 = target_2.time_index[1:]
+        expected_times_1 = target_1.time_index[1::stride]
+        expected_times_2 = target_2.time_index[1::stride]
 
         kwargs = {
             "expected_X": expected_X,
@@ -1830,6 +1920,7 @@ def test_lagged_training_data_sequence_inputs(self):
             "multi_models": True,
             "max_samples_per_ts": None,
             "use_moving_windows": True,
+            "stride": stride,
         }
 
         # concatenate=True
@@ -1848,7 +1939,11 @@ def test_lagged_training_data_sequence_inputs(self):
             convert_lags_to_dict=True, concatenate=False, **kwargs
         )
 
-    def test_lagged_training_data_stochastic_series(self):
+    @pytest.mark.parametrize(
+        "stride",
+        [1, 3],
+    )
+    def test_lagged_training_data_stochastic_series(self, stride):
         """
         Tests that `create_lagged_training_data` is correctly vectorised
         over the sample axes of the input `TimeSeries`.
@@ -1863,10 +1958,10 @@ def test_lagged_training_data_stochastic_series(self):
         output_chunk_length = 1
         # Expected solution:
         expected_X = np.concatenate(
-            3 * [target.all_values(copy=False)[:-1, :, :]], axis=1
+            3 * [target.all_values(copy=False)[:-1:stride, :, :]], axis=1
         )
-        expected_y = target.all_values(copy=False)[1:, :, :]
-        expected_times = target.time_index[1:]
+        expected_y = target.all_values(copy=False)[1::stride, :, :]
+        expected_times = target.time_index[1::stride]
 
         kwargs = {
             "expected_X": expected_X,
@@ -1885,6 +1980,7 @@ def test_lagged_training_data_stochastic_series(self):
             "multi_models": True,
             "max_samples_per_ts": None,
             "use_moving_windows": True,
+            "stride": stride,
         }
 
         self.helper_check_lagged_data(
@@ -2729,6 +2825,7 @@ def test_correct_generated_weights_exponential(self, config):
             ["D", "2D", 2],
             [True, False],
             [True, False],
+            [1, 3],
         ),
     )
     def test_correct_user_weights(self, config):
@@ -2751,14 +2848,18 @@ def test_correct_user_weights(self, config):
             freq,
             single_series,
             univar_series,
+            stride,
         ) = config
+        lags = [-4, -1]
         if not isinstance(freq, int):
             freq = pd.tseries.frequencies.to_offset(freq)
             start = pd.Timestamp("2000-01-01")
         else:
             start = 1
-        train_y = linear_timeseries(start=start, length=training_size, freq=freq)
+        train_y = linear_timeseries(
+            start=start, end_value=training_size - 1, length=training_size, freq=freq
+        )
         if not univar_series:
             train_y.stack(train_y)
 
@@ -2776,13 +2877,14 @@ def test_correct_user_weights(self, config):
             ts_weights.stack(ts_weights + 1.0)
 
         _, y, _, _, weights = create_lagged_training_data(
-            lags=[-4, -1],
+            lags=lags,
             target_series=train_y if single_series else [train_y] * 2,
             output_chunk_length=ocl,
             uses_static_covariates=False,
             sample_weight=ts_weights if single_series else [ts_weights] * 2,
             output_chunk_shift=ocs,
             use_moving_windows=use_moving_windows,
+            stride=stride,
         )
 
         # weights shape must match label shape, since we have one
@@ -2796,11 +2898,15 @@ def test_correct_user_weights(self, config):
 
         # the weights correspond to the same sample and time index as the `y` labels
         expected_weights = []
-        len_y_single = len(y) if single_series else int(len(y) / 2)
+        len_y_single = len(y) if single_series else len(y) // 2
         for i in range(ocl):
-            mask = slice(-(i + len_y_single), -i if i else None)
+            # shifted by the steps required to create the first set of features
+            first_label_idx = -min(lags) + ocs + i
+            # make enough room for all the strided labels
+            last_label_idx = first_label_idx + len_y_single * stride
+            mask = slice(first_label_idx, last_label_idx, stride)
             expected_weights.append(weights_exact[mask])
-        expected_weights = np.concatenate(expected_weights, axis=1)[:, ::-1]
+        expected_weights = np.concatenate(expected_weights, axis=1)
         if not single_series:
             expected_weights = np.concatenate([expected_weights] * 2, axis=0)
         np.testing.assert_array_almost_equal(weights[:, :, 0], expected_weights)
diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py
index d29f3ac299..4785617c84 100644
--- a/darts/utils/data/tabularization.py
+++ b/darts/utils/data/tabularization.py
@@ -1155,8 +1155,7 @@ def _create_lagged_data_by_moving_window(
     # must take `(num_samples - 1)` values ahead of `first_window_end_idx`
     vals = vals[
         first_window_start_idx : first_window_end_idx
-        + num_samples * stride
-        - 1,
+        + (num_samples - 1) * stride,
         :,
         :,
     ]
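
Reviewer note: the `tabularization.py` hunk above is the substantive fix — the old slice bound requested `stride - 1` more values than any window uses. A minimal, self-contained sketch of the window arithmetic (illustrative names only, not darts internals; assumes NumPy >= 1.20 for `sliding_window_view`):

    import numpy as np

    # `num_samples` windows of length `window_len`, spaced `stride` apart,
    # need exactly `(num_samples - 1) * stride` values beyond the end of
    # the first window - the bound used by the fix above.
    vals = np.arange(100)
    window_len, stride, num_samples = 4, 3, 6

    first_window_start_idx = 0
    first_window_end_idx = first_window_start_idx + window_len
    trimmed = vals[
        first_window_start_idx : first_window_end_idx + (num_samples - 1) * stride
    ]

    # Every `stride`-th window over the trimmed block:
    windows = np.lib.stride_tricks.sliding_window_view(trimmed, window_len)[::stride]
    assert windows.shape[0] == num_samples
    # The last window ends flush with the trimmed block - nothing is spare:
    assert windows[-1][-1] == trimmed[-1]
    # The old bound, `+ num_samples * stride - 1`, over-reads by `stride - 1`
    # trailing values that belong to no window whenever `stride > 1`.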