From 67a8280b1c3a6277a3986f030b09d81d8323fa29 Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:03:59 +1000
Subject: [PATCH 1/8] minor typos and LaTeX fixes

---
 notebooks/Module 1.6.2 - Stationarity.ipynb               | 2 +-
 notebooks/Module 1.6.4 - ARMA.ipynb                       | 4 ++--
 notebooks/Module 1.7.1 - Kalman Filter Introduction.ipynb | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/notebooks/Module 1.6.2 - Stationarity.ipynb b/notebooks/Module 1.6.2 - Stationarity.ipynb
index a64e1a9..80caab5 100644
--- a/notebooks/Module 1.6.2 - Stationarity.ipynb
+++ b/notebooks/Module 1.6.2 - Stationarity.ipynb
@@ -259,7 +259,7 @@
 "\n",
 "An autoregressive model is one where the current value of a time series is dependent on the previous values, along with some $\\beta$ value to create a linear relationship. Effectively, it is a lagged OLS, except that the variables are previous values of the predicted variable:\n",
 "\n",
-"$X_t = c + \\sum_{i=0}^n{\\beta_i X_{t-n+1}} + u_t$\n",
+"$X_t = c + \\sum_{i=1}^n{\\beta_i X_{t-i}} + u_t$\n",
 "\n",
 "Where:\n",
 "* $c$ is a constant. In previous OLS models, we simply added a constant to $X$ and another value for $\\beta$, but that doesn't make sense for this data, as it is time-sequential. Here, we add it separately, but it has the same effect.\n",
diff --git a/notebooks/Module 1.6.4 - ARMA.ipynb b/notebooks/Module 1.6.4 - ARMA.ipynb
index e2279ed..810dffd 100644
--- a/notebooks/Module 1.6.4 - ARMA.ipynb
+++ b/notebooks/Module 1.6.4 - ARMA.ipynb
@@ -25,7 +25,7 @@
 "\n",
 "The autoregressive model predicts the value of a variable in a time series. We use the notation $AR(n)$ for an autoregressive model with $n$ periods.\n",
 "\n",
-"$AR(n) X_t = c + \\sum_{i=0}^n{\\beta_i X_{t-n+1}} + u_t$\n",
+"$AR(n) X_t = c + \\sum_{i=1}^n{\\beta_i X_{t-i}} + u_t$\n",
 "\n",
 "We can simplify in the case of an AR(1) model, that is $n=1$. This simplifies further if we also assume a zero mean (which can be done by demeaning the data beforehand) and an error term that is white noise:\n",
 "\n",
@@ -49,7 +49,7 @@
 "\n",
 "An $ARMA(p, q)$ model, where $p$ is the lag in the autoregressive model and $q$ is the lag in the moving-average model, is given as:\n",
 "\n",
-"$X_t = c + \\epsilon_t + \\sum_{i=1}^{q}{\\beta X_{t-i}} + \\sum_{i=1}^{p}\\theta_i\\epsilon_{t-i}$\n",
+"$X_t = c + \\epsilon_t + \\sum_{i=1}^{p}{\\beta_i X_{t-i}} + \\sum_{i=1}^{q}\\theta_i\\epsilon_{t-i}$\n",
 "\n",
 "(where $c$ is the bias, and would be 0 if the data was demeaned beforehand - and therefore could be set as the overall mean)\n",
 "\n",
diff --git a/notebooks/Module 1.7.1 - Kalman Filter Introduction.ipynb b/notebooks/Module 1.7.1 - Kalman Filter Introduction.ipynb
index cf2dfe9..449ee82 100644
--- a/notebooks/Module 1.7.1 - Kalman Filter Introduction.ipynb
+++ b/notebooks/Module 1.7.1 - Kalman Filter Introduction.ipynb
@@ -21,7 +21,7 @@
 "\n",
 "# 1.7.1 Introduction to Kalman Filters\n",
 "\n",
-"Imagine we have data coming in from a sensor at regular time intervals. This could be anything from a thermometer measuring the temperature, a image recognition system measuring the number of people in a room, or people placing buy orders on a given stock. To simplify our example, we will imagine that the thing we are measuring is constant during the time we are measuring (constant temperature, number of people in the room, and constant \"actual\" price for the stock).\n",
+"Imagine we have data coming in from a sensor at regular time intervals. This could be anything from a thermometer measuring the temperature, an image recognition system measuring the number of people in a room, or people placing buy orders on a given stock. To simplify our example, we will imagine that the thing we are measuring is constant during the time we are measuring (constant temperature, number of people in the room, and constant \"actual\" price for the stock).\n",
 "\n",
 "As those measurements come in, there will be a measurement itself, and some error from the \"true\" value we are measuring. For instance, there may be 20 people in a room, but one is obscured and our image recognition system doesn't pick one up, giving an estimate of 19. After a minute, we check again, and a painting on the wall has been incorrectly counted as a person, giving an estimate of 21. These errors could be large or small (but are assumed to be iid normal).\n",
 "\n",
@@ -438,7 +438,7 @@
 "\n",
 "* $A$ is a state change matrix. In our case, one that adds velocity to the old position\n",
 "* $B$ is the control variable matrix, which accounts for added factors, such as the acceleration of an object (i.e. if we are tracking a falling object, this would account for gravity).\n",
-"* $u_t$ is the control variable matrix \n",
+"* $u_t$ is the control vector\n",
 "* $w$ is the noise in the process. It is optional, but if properly modelled it can improve the results.\n",
 "\n",
 "\n",
@@ -455,7 +455,7 @@
 "\n",
 "Computing the dot product $AX_{t-1}$ will produce the new position and velocity $X_t$, if we assume that there is no velocity change. Velocity change is managed by the matrix product $Bu_t$.\n",
 "\n",
-"For example if we had an object in free fall (and ignored wind resistance), it would be accelerating at a rate of $-9.8ms^-2$. This would give $u_t = [0, -9.8]^T$. The matrix B would then be:\n",
+"For example, if we had an object in free fall (and ignored wind resistance), it would be accelerating at a rate of $-9.8ms^{-2}$. This would give $u_t = [0, -9.8]^T$. The matrix $B$ would then be:\n",
 "\n",
 "$B = \\begin{bmatrix}\n",
 "\\frac{1}{2}\\Delta t^2 & 0 \\\\\n",

From 4454d9d0cc6c3b5a28ec6b16fbfd4a126abc317e Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:05:06 +1000
Subject: [PATCH 2/8] remove reference to non-existent solution

---
 notebooks/Module 2.2.3 - GARCH.ipynb | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/notebooks/Module 2.2.3 - GARCH.ipynb b/notebooks/Module 2.2.3 - GARCH.ipynb
index a949143..827cbf5 100644
--- a/notebooks/Module 2.2.3 - GARCH.ipynb
+++ b/notebooks/Module 2.2.3 - GARCH.ipynb
@@ -445,8 +445,6 @@
 "\n",
-"We'll now combine the steps we have covered, specifically ARIMA and GARCH, to fit a model to predict the price of the market. While this is an exercise here, a template for this code, with some parts missing, is available at:\n",
+"We'll now combine the steps we have covered, specifically ARIMA and GARCH, to fit a model to predict the price of the market.\n",
 "\n",
-"`solutions/arima_garch_prediction_template.py`\n",
-"\n",
-"If you get stuck, feel free to start with this template and fill out the details. If you are more confident, try solving the exercise without it.\n",
+"Work through the exercise using the earlier sections of this module as a guide.\n",
 "\n",
 "The general process for using ARIMA and GARCH together for forecasting is to:\n",

From 65c7d453e4653825511e6456e5f82113fba9b2db Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:07:48 +1000
Subject: [PATCH 3/8] More typos and formatting fixes

numerous typos, e.g.:

- missing closing parenthesis in 2.4.1, and cells appeared in the wrong order
  (the solutions reference cell was after another cell)
- incorrect name of solution for 2.5.1

Rename why_not_normal to why_not_straight, as the name of the file gives away
the answer to the exercise.
---
 notebooks/Module 2.4.1 - Residual Analysis.ipynb              | 6 +++---
 .../Module 2.5.1 - Model and Estimate Instability.ipynb       | 2 +-
 notebooks/solutions/arima_seasonal.py                         | 2 +-
 notebooks/solutions/dot.py                                    | 5 +++--
 notebooks/solutions/multiple_comparisons.py                   | 3 ++-
 .../solutions/{why_not_normal.py => why_not_straight.py}      | 0
 6 files changed, 10 insertions(+), 8 deletions(-)
 rename notebooks/solutions/{why_not_normal.py => why_not_straight.py} (100%)

diff --git a/notebooks/Module 2.4.1 - Residual Analysis.ipynb b/notebooks/Module 2.4.1 - Residual Analysis.ipynb
index 2e1d562..8a2eb40 100644
--- a/notebooks/Module 2.4.1 - Residual Analysis.ipynb
+++ b/notebooks/Module 2.4.1 - Residual Analysis.ipynb
@@ -275,7 +275,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"There is a pattern here (we will come to that, but the residuals are still centred around zero. Due to this, the most common cause of your residuals not being centred around zero is actually a *coding* error, where some issue in the handling of your data has occurred or a computer bug.\n",
+"There is a pattern here (we will come to that), but the residuals are still centred around zero. Because of this, the most common cause of residuals that are not centred around zero is actually a *coding* error: some issue has occurred in the handling of your data, or there is a bug elsewhere in your code.\n",
 "\n",
 "That said, if you forget to add a constant, it can happen too. Here we fit a linear model to our linear data, but forget the constant:"
 ]
 },
@@ -477,14 +477,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Other patterns may be present in your data. For instance, seasonal trends are seen in many datasets, and this shows significantly in a residual plot."
+"*For solutions, see `solutions/residual_analysis_one.py`*"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"*For solutions, see `solutions/residual_analysis_one.py`*"
+"Other patterns may be present in your data. For instance, seasonal trends are seen in many datasets, and this shows significantly in a residual plot."
 ]
 },
 {
diff --git a/notebooks/Module 2.5.1 - Model and Estimate Instability.ipynb b/notebooks/Module 2.5.1 - Model and Estimate Instability.ipynb
index 035faaf..f60c06d 100644
--- a/notebooks/Module 2.5.1 - Model and Estimate Instability.ipynb
+++ b/notebooks/Module 2.5.1 - Model and Estimate Instability.ipynb
@@ -599,7 +599,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"*For solutions, see `solutions/control_chart.py`*"
+"*For solutions, see `solutions/control_charts.py`*"
 ]
 }
 ],
diff --git a/notebooks/solutions/arima_seasonal.py b/notebooks/solutions/arima_seasonal.py
index 4229699..9cc2865 100644
--- a/notebooks/solutions/arima_seasonal.py
+++ b/notebooks/solutions/arima_seasonal.py
@@ -1,4 +1,4 @@
-#1. In the above example we have specified a seasonal ARMIMA model of 0, 1, 0 (P,Q,Q) with a period of 4, since
+#1. In the above example we have specified a seasonal ARIMA model of 0, 1, 0 (P,D,Q) with a period of 4, since
 #   we are using quarterly data.
 
 #2.
diff --git a/notebooks/solutions/dot.py b/notebooks/solutions/dot.py
index 5f699ad..b0b3d22 100644
--- a/notebooks/solutions/dot.py
+++ b/notebooks/solutions/dot.py
@@ -1,3 +1,4 @@
-# Because $X$ is a n by k matrix, and $\beta$ is a k by 1 matrix, it needs to be on the right hand side
-# for the matrix multiplication to be valid?
+# Because $X$ is an n by k matrix, and $\beta$ is a k by 1 matrix, it needs to be
+# on the right hand side for the matrix multiplication to be valid. You could
+# alternatively write $(\beta^T X^T)^T$.
 
diff --git a/notebooks/solutions/multiple_comparisons.py b/notebooks/solutions/multiple_comparisons.py
index aec7fc2..f2aac48 100644
--- a/notebooks/solutions/multiple_comparisons.py
+++ b/notebooks/solutions/multiple_comparisons.py
@@ -1,2 +1,3 @@
-# After modify the threshold, the result is no longer significant.
\ No newline at end of file
+# see FWER (https://en.wikipedia.org/wiki/Family-wise_error_rate) to adjust the significance threshold
+# After modifying the threshold, the result is no longer significant.
diff --git a/notebooks/solutions/why_not_normal.py b/notebooks/solutions/why_not_straight.py
similarity index 100%
rename from notebooks/solutions/why_not_normal.py
rename to notebooks/solutions/why_not_straight.py

From 018c2702ece0eaeca2e55d199495652a40bafa41 Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:13:08 +1000
Subject: [PATCH 4/8] Add extended exercise solution for 1.1.2
 cdf_relationships

---
 notebooks/solutions/cdf_relationships.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/notebooks/solutions/cdf_relationships.py b/notebooks/solutions/cdf_relationships.py
index 556e2af..14669d0 100644
--- a/notebooks/solutions/cdf_relationships.py
+++ b/notebooks/solutions/cdf_relationships.py
@@ -15,3 +15,22 @@ plt.plot(x_values_2, y_2)
 
 print("This function is *decreasing*, as when the standard deviation is higher, "
       "the normal distribution 'spreads out'.")
+
+# Extended Exercise
+
+# faces of a single die
+opts = np.arange(1, 7)
+# all combinations of the first and second die, flattened, with the frequency of each sum
+nums, counts = np.unique((opts[None] + opts[:, None]).flatten(), return_counts=True)
+# add zero-probability options on either end of the distribution
+nums = np.concatenate(([nums.min()-1], nums, [nums.max()+1]))
+probs = np.concatenate(([0], counts / counts.sum(), [0]))  # this only holds true for a uniform distribution
+csum = np.cumsum(probs)
+
+# add steps and plot
+plt.plot(
+    np.stack([nums, nums]).T.flatten()[1:],
+    np.stack([csum, csum]).T.flatten()[:-1]
+)
+plt.xlabel('sum')
+plt.ylabel('p(sum <= value)')

From 5f0b6ac98b88756b5d3ca19bdc116e4081a27425 Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:17:37 +1000
Subject: [PATCH 5/8] stats.kstest not correctly used

It seems to require z-scores when the distribution argument is 'norm'. The
suggested fix is to pass the cdf function of a normal distribution with the
correct parameters set.
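
For illustration, a minimal sketch of the corrected call (assuming `heights`
is the sample under test and scipy.stats is imported as `stats`, as in the
notebooks):

    params = stats.norm.fit(heights)  # returns a plain (mu, sigma) tuple
    stats.kstest(heights, stats.norm(*params).cdf)

Since stats.norm.fit returns a tuple of parameters rather than a distribution
object, the frozen distribution has to be constructed before taking its .cdf.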
---
 notebooks/solutions/scipy_normal_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/solutions/scipy_normal_tests.py b/notebooks/solutions/scipy_normal_tests.py
index 8a50af8..dbb8a00 100644
--- a/notebooks/solutions/scipy_normal_tests.py
+++ b/notebooks/solutions/scipy_normal_tests.py
@@ -1,4 +1,4 @@
 stats.normaltest(heights)
 
-stats.kstest(heights, 'norm')
\ No newline at end of file
+stats.kstest(heights, stats.norm(*stats.norm.fit(heights)).cdf)

From bdc02f2017d8305cf6317def02daf470cc7af1ef Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:19:49 +1000
Subject: [PATCH 6/8] add more efficient generator solution for
 rolling_forecast

---
 notebooks/solutions/rolling_forecast.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/notebooks/solutions/rolling_forecast.py b/notebooks/solutions/rolling_forecast.py
index 0e8d5c9..3706ad8 100644
--- a/notebooks/solutions/rolling_forecast.py
+++ b/notebooks/solutions/rolling_forecast.py
@@ -54,5 +54,12 @@ def rolling_forecasting_origin_generator(time_series, m=1):
     return
-
-
+# This is a more advanced function, using only Python generators.
+from itertools import count
+def rolling_forecasting_origin(time_series, m=1):
+    yield from zip(
+        # len(time_series)-m+2 is the final training length required, plus 1
+        # (the +1 accounts for the range starting from 1)
+        map(np.arange, range(1, len(time_series)-m+2)),
+        count(start=m)
+    )

From 6fa12e446f77ff52706505b45557b658e45ffca1 Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:20:52 +1000
Subject: [PATCH 7/8] minor fixes to solutions

---
 notebooks/bayesian_updating_plot.py     | 2 +-
 notebooks/solutions/rolling_forecast.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/bayesian_updating_plot.py b/notebooks/bayesian_updating_plot.py
index abd8891..bf19cb9 100644
--- a/notebooks/bayesian_updating_plot.py
+++ b/notebooks/bayesian_updating_plot.py
@@ -9,7 +9,7 @@
 # For the already prepared, I'm using Binomial's conj. prior.
 
 for k, N in enumerate(n_trials):
-    sx = plt.subplot(len(n_trials)/2, 2, k+1)
+    sx = plt.subplot(len(n_trials)//2, 2, k+1)
     plt.xlabel("$p$, probability of heads") \
         if k in [0, len(n_trials)-1] else None
     plt.setp(sx.get_yticklabels(), visible=False)
diff --git a/notebooks/solutions/rolling_forecast.py b/notebooks/solutions/rolling_forecast.py
index 3706ad8..a1d22ae 100644
--- a/notebooks/solutions/rolling_forecast.py
+++ b/notebooks/solutions/rolling_forecast.py
@@ -24,7 +24,7 @@
 # Let's test our function!
 dates = np.array('2015-07-04', dtype=np.datetime64) + np.arange(100)
 dates
-rolling_forecasting_origin(date, 10)
+rolling_forecasting_origin(dates, 10)
 
 # Here is essentially the same function code, but presented as a python generator which
 # can be iterated over, for example in a for loop.
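
The generator added in PATCH 6 can be smoke-tested directly; a minimal usage
sketch, assuming numpy is imported as np (as in the notebooks) and using
made-up data:

    series = np.arange(20)
    for train_idx, test_idx in rolling_forecasting_origin(series, m=5):
        # each step yields a growing training window and the index to forecast
        print(f"train on {len(train_idx)} points, forecast index {test_idx}")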
From a38b5598696cf13fca1bb5e2aa123cc1a1b89050 Mon Sep 17 00:00:00 2001
From: Hixan
Date: Wed, 22 Sep 2021 12:37:34 +1000
Subject: [PATCH 8/8] add solution to final exercise in 1.6.4

---
 notebooks/Module 1.6.4 - ARMA.ipynb           |  7 ++++++
 notebooks/solutions/arma_check_white_noise.py | 23 +++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 notebooks/solutions/arma_check_white_noise.py

diff --git a/notebooks/Module 1.6.4 - ARMA.ipynb b/notebooks/Module 1.6.4 - ARMA.ipynb
index 810dffd..feca6bf 100644
--- a/notebooks/Module 1.6.4 - ARMA.ipynb
+++ b/notebooks/Module 1.6.4 - ARMA.ipynb
@@ -783,6 +783,13 @@
 "\n",
 "Perform a formal analysis to identify if the residuals are white noise in both the training and testing case."
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"*For solutions, see `solutions/arma_check_white_noise.py`*"
+]
 }
 ],
 "metadata": {
diff --git a/notebooks/solutions/arma_check_white_noise.py b/notebooks/solutions/arma_check_white_noise.py
new file mode 100644
index 0000000..e62556a
--- /dev/null
+++ b/notebooks/solutions/arma_check_white_noise.py
@@ -0,0 +1,23 @@
+
+def check_white_noise(data: np.ndarray):
+    # check for zero mean
+    _, p_mean = stats.ttest_1samp(data, 0)
+    if p_mean > 0.05:
+        print('data has mean of 0 (unable to reject)')
+    else:
+        print('data does not have a mean of 0!')
+    # check for homoskedasticity by comparing the two halves of the series
+    _, p_skedastic = stats.levene(*data.reshape(2, -1))
+    if p_skedastic > 0.05:
+        print('data is homoskedastic (unable to reject)')
+    else:
+        print('data is HETEROskedastic!')
+    # plot to check for autocorrelation
+    pd.plotting.autocorrelation_plot(data)
+    plt.title('autocorrelation plot')
+    plt.show()
+
+print('Training results:')
+check_white_noise(train_residuals)
+print('Testing results:')
+check_white_noise(np.r_[0, test_residuals])
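
As a quick self-check of the new solution, a minimal sketch with synthetic
data (this assumes the notebook's imports: numpy as np, scipy.stats as stats,
pandas as pd and matplotlib.pyplot as plt; the seed and sample size are
arbitrary):

    rng = np.random.default_rng(42)
    noise = rng.normal(0, 1, size=500)  # genuine white noise; even length suits the reshape
    check_white_noise(noise)            # both tests should fail to reject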