updating doc branch (#850)
* autoregressor unit test code (#843)

* autoregressor unit test code

* fixed error

* fix error

* Correcting indexes (#844)

* Correcting indexes

* Update tsa.py

* Update tsa.py

* adding inferences for TS

* Removed the name of a column that was removed from the africa-education CSV file (#845)

* 832 AR, MA, ARMA, ARIMA unit test (#849)

* autoregressor unit test code

* ARIMA UT

* UT for the common functions of AR, MA, ARMA, and ARIMA, and score tests for AR and ARMA

* sphinx docstring changes for vdataframe/typing (#846)

* sphinx docstring changes for vdataframe/typing

* Multiple corrections

 - be consistent with the variable name: we use data for the dataset.
 - be consistent in the notes and naming.
 - some docs were using old methods like Tablesample.to_vdf: use vDataFrame instead.
 - some indentation needed to be fixed.

* black

---------

Co-authored-by: Badr <[email protected]>

* Time Series improvement (#848)

* Time Series improvement

 - HC correction
 - handling of both full forecasting and one-step-ahead forecasting
 - possibility to filter the ts steps
 - improved quality of the scores and reports

* correct std_err in plots

* Update base.py

* Update tsa.py

* Update tsa.py

---------

Co-authored-by: Vikash Singh <[email protected]>
Co-authored-by: Badr Ouali <[email protected]>
Co-authored-by: Abhishek Sharma <[email protected]>
Co-authored-by: Badr <[email protected]>
5 people authored Nov 10, 2023
1 parent 29da343 commit 42f38c8
Showing 13 changed files with 1,902 additions and 123 deletions.
944 changes: 937 additions & 7 deletions verticapy/core/vdataframe/_typing.py

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions verticapy/datasets/loaders.py
@@ -896,7 +896,6 @@ def load_africa_education(
"SPUPPR16": "Varchar(20)",
"zpmealsc": "Varchar(32)",
"PREPEAT": "Varchar(20)",
"zmaloct": "Float",
"zpses": "Numeric(7,3)",
"SPUPPR06": "Varchar(20)",
"zraloct": "Float",
@@ -962,7 +961,6 @@ def load_africa_education(
"SPUPPR16",
"zpmealsc",
"PREPEAT",
"zmaloct",
"zpses",
"SPUPPR06",
"zraloct",
184 changes: 170 additions & 14 deletions verticapy/machine_learning/vertica/tsa.py
@@ -16,6 +16,8 @@
"""
from abc import abstractmethod
import copy
import datetime
from dateutil.relativedelta import relativedelta
from typing import Literal, Optional, Union

import numpy as np
@@ -403,6 +405,9 @@ def predict(
output_standard_errors: bool = False,
output_index: bool = False,
output_estimated_ts: bool = False,
freq: Literal[None, "m", "months", "y", "year", "infer"] = "infer",
filter_step: Optional[int] = None,
method: Literal["auto", "forecast"] = "auto",
) -> vDataFrame:
"""
Predicts using the input relation.
@@ -464,6 +469,47 @@
Boolean, whether to return the estimated abscissa of
each prediction. The real one is hard to obtain due to
interval computations.
freq: str, optional
How to compute the delta.
- m/month:
We assume that the data is organized on a monthly
basis.
- y/year:
We assume that the data is organized on a yearly
basis.
- infer:
When making inferences, the system will attempt to
identify the best option, which may involve more
computational resources.
- None:
The inference is based on the average of the difference
between 'ts' and its lag.
filter_step: int, optional
Integer parameter that determines the frequency of
predictions. You can adjust it according to your
specific requirements, such as setting it to 3 for
predictions every third step.
.. note::
It is only utilized when "output_estimated_ts" is set to
True.
method: str, optional
Forecasting method. One of the following:
- auto:
the model initially utilizes the true values at each step
for forecasting. However, when it reaches a point where it
can no longer rely on true values, it transitions to using
its own predictions for further forecasting. This method is
often referred to as "one step ahead" forecasting.
- forecast:
the model initiates forecasting from an initial value
and entirely disregards any subsequent true values. This
approach involves forecasting based solely on the model's
own predictions and does not consider actual observations
after the start point.
Returns
-------
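
For orientation, a minimal usage sketch of these new predict() options follows; the demo dataset, ARIMA model, column names, and numeric values are illustrative assumptions, not taken from this commit:

# Sketch only: dataset/model/column names below are assumptions for illustration.
from verticapy.datasets import load_airline_passengers
from verticapy.machine_learning.vertica.tsa import ARIMA

data = load_airline_passengers()            # assumed demo table with "date"/"passengers"
model = ARIMA(order=(2, 1, 1))
model.fit(data, ts="date", y="passengers")

# Default method="auto": one-step-ahead forecasting that uses true values while
# they are available, then continues on its own predictions.
one_step = model.predict(data, "date", "passengers", start=100, npredictions=30)

# method="forecast": forecast from `start` onward and ignore later true values.
# output_estimated_ts + freq + filter_step return an estimated time axis and
# keep only every third predicted step.
forecast = model.predict(
    data,
    "date",
    "passengers",
    start=100,
    npredictions=30,
    method="forecast",
    output_estimated_ts=True,
    freq="infer",
    filter_step=3,
)
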
@@ -482,10 +528,23 @@
if isinstance(y, NoneType):
y = self.y
ar_ma = True
if isinstance(start, (int, float)):
start_predict = int(start + 1)
elif not (isinstance(start, NoneType)):
start_predict = int(start)
else:
start_predict = None
where = ""
if isinstance(filter_step, NoneType):
filter_step = 1
elif filter_step < 1:
raise ValueError("Parameter 'filter_step' must be greater or equal to 1.")
else:
where = f" WHERE MOD(idx, {filter_step}) = 0"
sql = "SELECT " + self.deploySQL(
ts=ts,
y=y,
start=start,
start=start_predict,
npredictions=npredictions,
output_standard_errors=(
output_standard_errors or output_index or output_estimated_ts
@@ -494,7 +553,10 @@
)
no_relation = True
if not (isinstance(vdf, NoneType)):
sql += f" FROM {vdf}"
relation = vdf
if not (isinstance(start, NoneType)) and str(method).lower() == "forecast":
relation = f"(SELECT * FROM {vdf} ORDER BY {ts} LIMIT {start}) VERTICAPY_SUBTABLE"
sql += f" FROM {relation}"
no_relation = False
if output_index or output_estimated_ts:
j = self.n_
@@ -512,28 +574,64 @@
else:
output_standard_errors = ""
stde_out = ""
if ar_ma:
order_by = ""
else:
order_by = 'ORDER BY "std_err"'
sql = f"""
SELECT
ROW_NUMBER() OVER ({order_by}) + {j} - 1 AS idx,
ROW_NUMBER() OVER () + {j} - 1 AS idx,
prediction{output_standard_errors}
FROM ({sql}) VERTICAPY_SUBTABLE"""
if output_estimated_ts:
if isinstance(freq, str):
freq = freq.lower()
if freq == "infer":
infer_sql = f"""
SELECT
{self.ts}
FROM {self.input_relation}
WHERE {self.ts} IS NOT NULL
ORDER BY 1
LIMIT 100"""
res = _executeSQL(
infer_sql, title="Finding the right delta.", method="fetchall"
)
res = [l[0] for l in res]
n = len(res)
for i in range(1, n):
if not (isinstance(res[i], datetime.date)):
freq = None
break
dm = ((res[i] - res[i - 1]) / 28).days
dy = ((res[i] - res[i - 1]) / 365).days
if res[i - 1] + relativedelta(months=dm) == res[i] and freq != "y":
freq = "m"
elif res[i - 1] + relativedelta(years=dy) == res[i] and freq != "m":
freq = "y"
else:
freq = None
break
min_value = f"(SELECT MIN({self.ts}) FROM {self.input_relation})"
if freq in ("m", "months", "y", "year"):
delta_ts = f"MONTHS_BETWEEN({self.ts}, LAG({self.ts}) OVER (ORDER BY {self.ts})) AS delta"
else:
delta_ts = (
f"{self.ts} - LAG({self.ts}) OVER (ORDER BY {self.ts}) AS delta"
)
delta = f"""
(SELECT
AVG(delta)
AVG(delta)
FROM (SELECT
{self.ts} - LAG({self.ts}) OVER (ORDER BY {self.ts}) AS delta
{delta_ts}
FROM {self.input_relation}) VERTICAPY_SUBTABLE)"""
if freq in ("m", "months"):
estimation = f"TIMESTAMPADD(MONTH, (idx * {delta})::int, {min_value})::date AS {self.ts}"
elif freq in ("y", "year"):
estimation = f"TIMESTAMPADD(YEAR, (idx * {delta} / 12)::int, {min_value})::date AS {self.ts}"
else:
estimation = f"idx * {delta} + {min_value} AS {self.ts}"
sql = f"""
SELECT
idx * {delta} + {min_value} AS {self.ts},
{estimation},
prediction{stde_out}
FROM ({sql}) VERTICAPY_SUBTABLE"""
FROM ({sql}) VERTICAPY_SUBTABLE{where}"""
return vDataFrame(clean_query(sql))
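
The frequency inference above can be illustrated with a small standalone sketch; it mirrors the relativedelta check in this hunk but is not part of the library API:

# Standalone sketch of the "infer" idea: guess whether consecutive dates
# advance by whole months or whole years, else give up (None).
import datetime
from dateutil.relativedelta import relativedelta

def infer_freq(dates):
    freq = "infer"
    for prev, cur in zip(dates, dates[1:]):
        if not isinstance(cur, datetime.date):
            return None
        dm = ((cur - prev) / 28).days
        dy = ((cur - prev) / 365).days
        if prev + relativedelta(months=dm) == cur and freq != "y":
            freq = "m"
        elif prev + relativedelta(years=dy) == cur and freq != "m":
            freq = "y"
        else:
            return None
    return freq

print(infer_freq([datetime.date(2020, m, 1) for m in range(1, 6)]))  # 'm'
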

# Model Evaluation Methods.
@@ -542,6 +640,7 @@ def _evaluation_relation(
self,
start: Optional[int] = None,
npredictions: Optional[int] = None,
method: Literal["auto", "forecast"] = "auto",
):
"""
Returns the relation needed to evaluate the
@@ -567,6 +666,7 @@ def _evaluation_relation(
start=start,
npredictions=npredictions,
output_index=True,
method=method,
)
sql = f"""
(SELECT
@@ -575,7 +675,7 @@
FROM
(
SELECT
ROW_NUMBER() OVER (ORDER BY {self.ts}) AS idx,
ROW_NUMBER() OVER (ORDER BY {self.ts}) - 1 AS idx,
{self.y} AS y_true
FROM {test_relation}
) AS true_values
@@ -592,6 +692,7 @@ def regression_report(
] = None,
start: Optional[int] = None,
npredictions: Optional[int] = None,
method: Literal["auto", "forecast"] = "auto",
) -> Union[float, TableSample]:
"""
Computes a regression report using multiple metrics to
@@ -655,6 +756,21 @@
npredictions: int, optional
Integer greater or equal to 1, the number of predicted
timesteps.
method: str, optional
Forecasting method. One of the following:
- auto:
the model initially utilizes the true values at each step
for forecasting. However, when it reaches a point where it
can no longer rely on true values, it transitions to using
its own predictions for further forecasting. This method is
often referred to as "one step ahead" forecasting.
- forecast:
the model initiates forecasting from an initial value
and entirely disregards any subsequent true values. This
approach involves forecasting based solely on the model's
own predictions and does not consider actual observations
after the start point.
Returns
-------
@@ -664,7 +780,9 @@
return mt.regression_report(
"y_true",
"y_pred",
self._evaluation_relation(start=start, npredictions=npredictions),
self._evaluation_relation(
start=start, npredictions=npredictions, method=method
),
metrics=metrics,
k=1,
)
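
A hedged evaluation sketch with the new method parameter, reusing the illustrative model from the predict sketch above:

# Sketch only: report computed on pure forecasts, i.e. true values after
# `start` are not fed back into the predictions.
report = model.regression_report(start=100, npredictions=30, method="forecast")
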
@@ -679,6 +797,7 @@ def score(
] = "r2",
start: Optional[int] = None,
npredictions: Optional[int] = None,
method: Literal["auto", "forecast"] = "auto",
) -> float:
"""
Computes the model score.
@@ -732,6 +851,21 @@
npredictions: int, optional
Integer greater or equal to 1, the number of predicted
timesteps.
method: str, optional
Forecasting method. One of the following:
- auto:
the model initially utilizes the true values at each step
for forecasting. However, when it reaches a point where it
can no longer rely on true values, it transitions to using
its own predictions for further forecasting. This method is
often referred to as "one step ahead" forecasting.
- forecast:
the model initiates forecasting from an initial value
and entirely disregards any subsequent true values. This
approach involves forecasting based solely on the model's
own predictions and does not consider actual observations
after the start point.
Returns
-------
@@ -753,7 +887,11 @@
arg = [
"y_true",
"y_pred",
self._evaluation_relation(start=start, npredictions=npredictions),
self._evaluation_relation(
start=start,
npredictions=npredictions,
method=method,
),
]
if metric in ("aic", "bic") or adj:
arg += [1]
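
Similarly, a hedged sketch comparing the two forecasting modes with score(), again with the illustrative names used earlier:

# Sketch only: compare one-step-ahead and pure-forecast R2 scores.
r2_auto = model.score(metric="r2", start=100, npredictions=30, method="auto")
r2_forecast = model.score(metric="r2", start=100, npredictions=30, method="forecast")
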
@@ -770,6 +908,7 @@ def plot(
y: Optional[str] = None,
start: Optional[int] = None,
npredictions: int = 10,
method: Literal["auto", "forecast"] = "auto",
chart: Optional[PlottingObject] = None,
**style_kwargs,
) -> PlottingObject:
@@ -824,6 +963,21 @@
npredictions: int, optional
Integer greater or equal to 1, the number of predicted
timesteps.
method: str, optional
Forecasting method. One of the following:
- auto:
the model initially utilizes the true values at each step
for forecasting. However, when it reaches a point where it
can no longer rely on true values, it transitions to using
its own predictions for further forecasting. This method is
often referred to as "one step ahead" forecasting.
- forecast:
the model initiates forecasting from an initial value
and entirely disregards any subsequent true values. This
approach involves forecasting based solely on the model's
own predictions and does not consider actual observations
after the start point.
chart: PlottingObject, optional
The chart object to plot on.
**style_kwargs
Expand Down Expand Up @@ -854,9 +1008,11 @@ def plot(
start=start,
npredictions=npredictions,
output_standard_errors=True,
method=method,
),
start=start,
dataset_provided=dataset_provided,
method=method,
).draw(**kwargs)
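
Finally, a hedged plotting sketch with the new method option; it assumes that, as for predict, the training relation is used when no vDataFrame is passed:

# Sketch only: draw 30 pure-forecast steps after index 100 on the training
# relation (assumed default when no vDataFrame is given).
chart = model.plot(start=100, npredictions=30, method="forecast")
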

