diff --git a/doc_conf/api.rst b/doc_conf/api.rst index 885873f3..c6cb3d57 100644 --- a/doc_conf/api.rst +++ b/doc_conf/api.rst @@ -20,8 +20,10 @@ Functions clustered_inference data_simulation desparsified_lasso + desparsified_lasso_pvalue + desparsified_group_lasso_pvalue ensemble_clustered_inference - group_reid + reid hd_inference knockoff_aggregation model_x_knockoff diff --git a/doc_conf/references.bib b/doc_conf/references.bib index 8fa09830..91d0ed97 100644 --- a/doc_conf/references.bib +++ b/doc_conf/references.bib @@ -177,4 +177,103 @@ @article{liuFastPowerfulConditional2021 archiveprefix = {arxiv}, keywords = {Statistics - Methodology}, file = {/home/ahmad/Zotero/storage/8HRQZX3H/Liu et al. - 2021 - Fast and Powerful Conditional Randomization Testin.pdf;/home/ahmad/Zotero/storage/YFNDKN2B/2006.html} -} \ No newline at end of file +} + +@article{zhang2014confidence, + title={Confidence intervals for low dimensional parameters in high dimensional linear models}, + author={Zhang, Cun-Hui and Zhang, Stephanie S}, + journal={Journal of the Royal Statistical Society Series B: Statistical Methodology}, + volume={76}, + number={1}, + pages={217--242}, + year={2014}, + publisher={Oxford University Press} +} + +@article{van2014asymptotically, + title={On asymptotically optimal confidence regions and tests for high-dimensional models}, + author={van de Geer, Sara and B{\"u}hlmann, Peter and Ritov, Ya'acov and Dezeure, Ruben}, + journal={The Annals of Statistics}, + pages={1166--1202}, + year={2014}, + publisher={JSTOR} +} + +@article{javanmard2014confidence, + title={Confidence intervals and hypothesis testing for high-dimensional regression}, + author={Javanmard, Adel and Montanari, Andrea}, + journal={The Journal of Machine Learning Research}, + volume={15}, + number={1}, + pages={2869--2909}, + year={2014}, + publisher={JMLR. 
org} +} + +@article{bellec2022biasing, + title={De-biasing the lasso with degrees-of-freedom adjustment}, + author={Bellec, Pierre C and Zhang, Cun-Hui}, + journal={Bernoulli}, + volume={28}, + number={2}, + pages={713--743}, + year={2022}, + publisher={Bernoulli Society for Mathematical Statistics and Probability} +} + +@article{celentano2023lasso, + title={The lasso with general gaussian designs with applications to hypothesis testing}, + author={Celentano, Michael and Montanari, Andrea and Wei, Yuting}, + journal={The Annals of Statistics}, + volume={51}, + number={5}, + pages={2194--2220}, + year={2023}, + publisher={Institute of Mathematical Statistics} +} + +@phdthesis{chevalier2020statistical, + title={Statistical control of sparse models in high dimension}, + author={Chevalier, J{\'e}r{\^o}me-Alexis}, + year={2020}, + school={Universit{\'e} Paris-Saclay} +} + +@article{fan2012variance, + title={Variance estimation using refitted cross-validation in ultrahigh dimensional regression}, + author={Fan, Jianqing and Guo, Shaojun and Hao, Ning}, + journal={Journal of the Royal Statistical Society Series B: Statistical Methodology}, + volume={74}, + number={1}, + pages={37--65}, + year={2012}, + publisher={Oxford University Press} +} + +@article{reid2016study, + title={A study of error variance estimation in lasso regression}, + author={Reid, Stephen and Tibshirani, Robert and Friedman, Jerome}, + journal={Statistica Sinica}, + pages={35--67}, + year={2016}, + publisher={JSTOR} +} + +@article{chevalier2020statistical, + title={Statistical control for spatio-temporal meg/eeg source imaging with desparsified mutli-task lasso}, + author={Chevalier, J{\'e}r{\^o}me-Alexis and Salmon, Joseph and Gramfort, Alexandre and Thirion, Bertrand}, + journal={Advances in Neural Information Processing Systems}, + volume={33}, + pages={1759--1770}, + year={2020} +} + +@article{eshel2003yule, + title={The yule walker equations for the AR coefficients}, + author={Eshel, Gidon}, + journal={Internet resource}, + volume={2}, + pages={68--73}, + year={2003} +} + diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index b19168d6..07890e5d 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -61,10 +61,13 @@ from sklearn.feature_extraction import image from hidimstat.clustered_inference import clustered_inference -from hidimstat.desparsified_lasso import desparsified_lasso +from hidimstat.desparsified_lasso import ( + desparsified_lasso, + desparsified_lasso_pvalue, +) from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference from hidimstat.scenario import multivariate_simulation -from hidimstat.stat_tools import pval_from_cb, zscore_from_pval +from hidimstat.stat_tools import zscore_from_pval ############################################################################# # Specific plotting functions @@ -237,8 +240,10 @@ def plot(maps, titles): # and referred to as Desparsified Lasso. 
# compute desparsified lasso -beta_hat, cb_min, cb_max = desparsified_lasso(X_init, y, n_jobs=n_jobs) -pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb(cb_min, cb_max) +beta_hat, sigma_hat, omega_diag = desparsified_lasso(X_init, y, n_jobs=n_jobs) +pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( + desparsified_lasso_pvalue(X_init.shape[0], beta_hat, sigma_hat, omega_diag) +) # compute estimated support (first method) zscore = zscore_from_pval(pval, one_minus_pval) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index cdc21880..b9ae28b3 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,12 +1,16 @@ from .adaptive_permutation_threshold import ada_svr from .clustered_inference import clustered_inference, hd_inference -from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso +from .desparsified_lasso import ( + desparsified_lasso, + desparsified_lasso_pvalue, + desparsified_group_lasso_pvalue, +) from .Dnn_learner_single import DnnLearnerSingle from .ensemble_clustered_inference import ensemble_clustered_inference from .knockoff_aggregation import knockoff_aggregation from .knockoffs import model_x_knockoff from .multi_sample_split import aggregate_quantiles -from .noise_std import group_reid, reid +from .noise_std import reid from .permutation_test import permutation_test_cv from .scenario import multivariate_1D_simulation from .standardized_svr import standardized_svr @@ -26,10 +30,11 @@ "clustered_inference", "dcrt_zero", "desparsified_lasso", - "desparsified_group_lasso", + "desparsified_lasso_pvalue", + "desparsified_group_lasso_pvalue", "DnnLearnerSingle", "ensemble_clustered_inference", - "group_reid", + "reid", "hd_inference", "knockoff_aggregation", "model_x_knockoff", diff --git a/src/hidimstat/clustered_inference.py b/src/hidimstat/clustered_inference.py index 5202f0a8..69166c39 100644 --- a/src/hidimstat/clustered_inference.py +++ b/src/hidimstat/clustered_inference.py @@ -1,10 +1,12 @@ import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.utils import resample -from sklearn.utils.validation import check_memory -from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso -from .stat_tools import pval_from_cb +from .desparsified_lasso import ( + desparsified_lasso, + desparsified_lasso_pvalue, + desparsified_group_lasso_pvalue, +) def _subsampling(n_samples, train_size, groups=None, seed=0): @@ -45,7 +47,7 @@ def _ward_clustering(X_init, ward, train_index): return X_reduced, ward -def hd_inference(X, y, method, n_jobs=1, memory=None, verbose=0, **kwargs): +def hd_inference(X, y, method, n_jobs=1, verbose=0, **kwargs): """Wrap-up high-dimensional inference procedures Parameters @@ -65,11 +67,6 @@ def hd_inference(X, y, method, n_jobs=1, memory=None, verbose=0, **kwargs): n_jobs : int or None, optional (default=1) Number of CPUs to use during parallel steps such as inference. - memory : str or joblib.Memory object, optional (default=None) - Used to cache the output of the computation of the clustering - and the inference. By default, no caching is done. If a string is - given, it is the path to the caching directory. - verbose: int, optional (default=1) The verbosity level. If `verbose > 0`, we print a message before runing the clustered inference. 
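As an illustrative, standalone sketch (not part of this patch), the refactored two-step single-response API can be exercised on synthetic data: desparsified_lasso returns the de-biased estimates, the noise level and the precision-matrix diagonal, and desparsified_lasso_pvalue turns them into p-values and confidence bounds. All data below is simulated for the example.

    import numpy as np
    from hidimstat.desparsified_lasso import (
        desparsified_lasso,
        desparsified_lasso_pvalue,
    )

    rng = np.random.default_rng(0)
    n_samples, n_features = 60, 40
    X = rng.standard_normal((n_samples, n_features))
    beta_true = np.zeros(n_features)
    beta_true[:3] = 2.0  # small active support
    y = X @ beta_true + 0.5 * rng.standard_normal(n_samples)

    # Step 1: de-biased estimates, noise level and precision-matrix diagonal
    beta_hat, sigma_hat, omega_diag = desparsified_lasso(X, y, n_jobs=1)

    # Step 2: p-values and 95% confidence bounds derived from the step-1 outputs
    pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = (
        desparsified_lasso_pvalue(
            X.shape[0], beta_hat, sigma_hat, omega_diag, confidence=0.95
        )
    )
    print("selected features:", np.where(pval_corr < 0.05)[0])
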
@@ -96,34 +93,28 @@ def hd_inference(X, y, method, n_jobs=1, memory=None, verbose=0, **kwargs): one_minus_pval_corr : ndarray, shape (n_features,) One minus the p-value corrected for multiple testing. """ - - if method == "desparsified-lasso": - - beta_hat, cb_min, cb_max = desparsified_lasso( - X, - y, - confidence=0.95, - n_jobs=n_jobs, - memory=memory, - verbose=verbose, - **kwargs, - ) - pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb( - cb_min, cb_max, confidence=0.95 - ) - - elif method == "desparsified-group-lasso": - - beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - desparsified_group_lasso( - X, y, n_jobs=n_jobs, memory=memory, verbose=verbose, **kwargs + if method not in ["desparsified-lasso", "desparsified-group-lasso"]: + raise ValueError("Unknow method") + group = method == "desparsified-group-lasso" + print("hd_inference", group, kwargs) + beta_hat, theta_hat, omega_diag = desparsified_lasso( + X, y, group=group, n_jobs=n_jobs, verbose=verbose, **kwargs + ) + if not group: + pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( + desparsified_lasso_pvalue( + X.shape[0], + beta_hat, + theta_hat, + omega_diag, + confidence=0.95, + **kwargs, ) ) - else: - - raise ValueError("Unknow method") - + pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + desparsified_group_lasso_pvalue(beta_hat, theta_hat, omega_diag, **kwargs) + ) return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr @@ -178,7 +169,6 @@ def clustered_inference( method="desparsified-lasso", seed=0, n_jobs=1, - memory=None, verbose=1, **kwargs, ): @@ -220,11 +210,6 @@ def clustered_inference( n_jobs : int or None, optional (default=1) Number of CPUs to use during parallel steps such as inference. - memory : str or joblib.Memory object, optional (default=None) - Used to cache the output of the computation of the clustering - and the inference. By default, no caching is done. If a string is - given, it is the path to the caching directory. - verbose: int, optional (default=1) The verbosity level. If `verbose > 0`, we print a message before runing the clustered inference. @@ -257,9 +242,6 @@ def clustered_inference( Spatially relaxed inference on high-dimensional linear models. arXiv preprint arXiv:2106.02590. 
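For the higher-level entry point, a minimal usage sketch of clustered_inference (not taken from the patch; the FeatureAgglomeration setup mirrors the pattern used in test_clustered_inference.py and the data is synthetic):

    import numpy as np
    from sklearn.cluster import FeatureAgglomeration
    from hidimstat.clustered_inference import clustered_inference

    rng = np.random.default_rng(1)
    n_samples, n_features, n_clusters = 50, 100, 10
    X_init = rng.standard_normal((n_samples, n_features))
    y = X_init[:, :5].sum(axis=1) + 0.1 * rng.standard_normal(n_samples)

    # Ward clustering of features, then inference on the reduced problem
    ward = FeatureAgglomeration(n_clusters=n_clusters, linkage="ward")
    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
        clustered_inference(X_init, y, ward, n_clusters, method="desparsified-lasso")
    )
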
""" - - memory = check_memory(memory) - n_samples, n_features = X_init.shape if verbose > 0: @@ -273,20 +255,26 @@ def clustered_inference( train_index = _subsampling(n_samples, train_size, groups=groups, seed=seed) # Clustering - X, ward = memory.cache(_ward_clustering)(X_init, ward, train_index) + X, ward = _ward_clustering(X_init, ward, train_index) # Preprocessing X = StandardScaler().fit_transform(X) y = y - np.mean(y) # Inference: computing reduced parameter vector and stats + print("Clustered inference", kwargs) beta_hat_, pval_, pval_corr_, one_minus_pval_, one_minus_pval_corr_ = hd_inference( - X, y, method, n_jobs=n_jobs, memory=memory, **kwargs + X, y, method, n_jobs=n_jobs, **kwargs ) # De-grouping beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = _degrouping( - ward, beta_hat_, pval_, pval_corr_, one_minus_pval_, one_minus_pval_corr_ + ward, + beta_hat_, + pval_, + pval_corr_, + one_minus_pval_, + one_minus_pval_corr_, ) return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index aaa5e03e..4e2fd2a5 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -4,141 +4,111 @@ from scipy import stats from scipy.linalg import inv from sklearn.linear_model import Lasso -from sklearn.utils.validation import check_memory -from .noise_std import group_reid, reid -from .stat_tools import pval_from_two_sided_pval_and_sign - - -def _compute_all_residuals( - X, alphas, gram, max_iter=5000, tol=1e-3, method="lasso", n_jobs=1, verbose=0 -): - """Nodewise Lasso. Compute all the residuals: regressing each column of the - design matrix against the other columns""" - - n_samples, n_features = X.shape - - results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_compute_residuals)( - X=X, - column_index=i, - alpha=alphas[i], - gram=gram, - max_iter=max_iter, - tol=tol, - method=method, - ) - for i in range(n_features) - ) - - results = np.asarray(results, dtype=object) - Z = np.stack(results[:, 0], axis=1) - omega_diag = np.stack(results[:, 1]) - - return Z, omega_diag - - -def _compute_residuals( - X, column_index, alpha, gram, max_iter=5000, tol=1e-3, method="lasso" -): - """Compute the residuals of the regression of a given column of the - design matrix against the other columns""" - - n_samples, n_features = X.shape - i = column_index - - X_new = np.delete(X, i, axis=1) - y = np.copy(X[:, i]) - - if method == "lasso": - - gram_ = np.delete(np.delete(gram, i, axis=0), i, axis=1) - clf = Lasso(alpha=alpha, precompute=gram_, max_iter=max_iter, tol=tol) - - else: - - ValueError("The only regression method available is 'lasso'") - - clf.fit(X_new, y) - z = y - clf.predict(X_new) - - omega_diag_i = n_samples * np.sum(z**2) / np.dot(y, z) ** 2 - - return z, omega_diag_i +from hidimstat.noise_std import reid +from hidimstat.stat_tools import pval_from_two_sided_pval_and_sign +from hidimstat.stat_tools import pval_from_cb def desparsified_lasso( X, y, dof_ajdustement=False, - confidence=0.95, max_iter=5000, tol=1e-3, - residual_method="lasso", alpha_max_fraction=0.01, + eps=1e-2, + tol_reid=1e-4, + n_split=5, n_jobs=1, - memory=None, + seed=0, verbose=0, + group=False, + cov=None, + noise_method="AR", + order=1, + fit_Y=True, + stationary=True, ): - """Desparsified Lasso with confidence intervals + """ + Desparsified Lasso with confidence intervals + + Algorithm based on Algorithm 1 of d-Lasso and d-MTLasso in + :cite:`chevalier2020statistical`. 
Parameters ---------- X : ndarray, shape (n_samples, n_features) - Data. + Input data matrix. - y : ndarray, shape (n_samples,) - Target. + y : ndarray, shape (n_samples,) or (n_samples, n_times) + Target vector for single response or matrix for multiple + responses. dof_ajdustement : bool, optional (default=False) - If True, makes the degrees of freedom adjustement (cf. [4]_ and [5]_). - Otherwise, the original Desparsified Lasso estimator is computed - (cf. [1]_ and [2]_ and [3]_). - - confidence : float, optional (default=0.95) - Confidence level used to compute the confidence intervals. - Each value should be in the range [0, 1]. + If True, applies degrees of freedom adjustment. + If False, computes original Desparsified Lasso estimator. max_iter : int, optional (default=5000) - The maximum number of iterations when regressing, by Lasso, - each column of the design matrix against the others. + Maximum iterations for Nodewise Lasso regressions. tol : float, optional (default=1e-3) - The tolerance for the optimization of the Lasso problems: if the - updates are smaller than `tol`, the optimization code checks the - dual gap for optimality and continues until it is smaller than `tol`. - - residual_method : str, optional (default='lasso') - Method used for computing the residuals of the Nodewise Lasso. - Currently the only method available is 'lasso'. + Convergence tolerance for optimization. alpha_max_fraction : float, optional (default=0.01) - Only used if method='lasso'. - Then alpha = alpha_max_fraction * alpha_max. + Fraction of max alpha used for Lasso regularization. - n_jobs : int or None, optional (default=1) - Number of CPUs to use during the Nodewise Lasso. + eps : float, optional (default=1e-2) + Small constant used in noise estimation. + + tol_reid : float, optional (default=1e-4) + Tolerance for Reid estimation. + + n_split : int, optional (default=5) + Number of splits for cross-validation in Reid procedure. + + n_jobs : int, optional (default=1) + Number of parallel jobs. Use -1 for all CPUs. + + seed : int, default=0 + Random seed for reproducibility. - memory : str or joblib.Memory object, optional (default=None) - Used to cache the output of the computation of the Nodewise Lasso. - By default, no caching is done. If a string is given, it is the path - to the caching directory. + verbose : int, default=0 + Verbosity level for logging. - verbose: int, optional (default=1) - The verbosity level: if non zero, progress messages are printed - when computing the Nodewise Lasso in parralel. - The frequency of the messages increases with the verbosity level. + group : bool, default=False + If True, use group Lasso for multiple responses. + + cov : ndarray, shape (n_times, n_times), default=None + Temporal covariance matrix of the noise. + If None, it is estimated. + + noise_method : {'AR', 'simple'}, default='AR' + Method to estimate noise covariance: + - 'simple': Uses median correlation between consecutive + timepoints + - 'AR': Fits autoregressive model of specified order + + order : int, default=1 + Order of AR model when noise_method='AR'. Must be < n_times. + + fit_Y : bool, default=True + Whether to fit Y in noise estimation. + + stationary : bool, default=True + Whether to assume stationary noise in estimation. Returns ------- - beta_hat : array, shape (n_features,) - Estimated parameter vector. + beta_hat : ndarray, shape (n_features,) or (n_features, n_times) + Desparsified Lasso coefficient estimates. 
- cb_min : array, shape (n_features) - Lower bound of the confidence intervals on the parameter vector. + sigma_hat/theta_hat : float or ndarray, shape (n_times, n_times) + Estimated noise level (single response) or precision matrix + (multiple responses). - cb_max : array, shape (n_features) - Upper bound of the confidence intervals on the parameter vector. + omega_diag : ndarray, shape (n_features,) + Diagonal elements of the precision matrix. Notes ----- @@ -152,271 +122,378 @@ def desparsified_lasso( References ---------- - .. [1] Zhang, C. H., & Zhang, S. S. (2014). Confidence intervals for - low dimensional parameters in high dimensional linear models. - Journal of the Royal Statistical Society: Series B: Statistical - Methodology, 217-242. - - .. [2] Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014). - On asymptotically optimal confidence regions and tests for - high-dimensional models. Annals of Statistics, 42(3), 1166-1202. - - .. [3] Javanmard, A., & Montanari, A. (2014). Confidence intervals and - hypothesis testing for high-dimensional regression. The Journal - of Machine Learning Research, 15(1), 2869-2909. - - .. [4] Bellec, P. C., & Zhang, C. H. (2019). De-biasing the lasso with - degrees-of-freedom adjustment. arXiv preprint arXiv:1902.08885. - - .. [5] Celentano, M., Montanari, A., & Wei, Y. (2020). The Lasso with - general Gaussian designs with applications to hypothesis testing. - arXiv preprint arXiv:2007.13716. + .. footbibliography:: """ - X = np.asarray(X) - - n_samples, n_features = X.shape - - memory = check_memory(memory) + X_ = np.asarray(X) + + n_samples, n_features = X_.shape + if group: + n_times = y.shape[1] + if cov is not None and cov.shape != (n_times, n_times): + raise ValueError( + f'Shape of "cov" should be ({n_times}, {n_times}),' + + f' the shape of "cov" was ({cov.shape}) instead' + ) + + # centering the data and the target variable + y_ = y - np.mean(y) + X_ = X_ - np.mean(X_, axis=0) + + # Lasso regression and noise standard deviation estimation + # TODO: other estimation of the noise standard deviation? + sigma_hat, beta_reid = reid( + X_, + y_, + eps=eps, + tol=tol_reid, + max_iter=max_iter, + n_split=n_split, + n_jobs=n_jobs, + seed=seed, + # for group + group=group, + method=noise_method, + order=order, + fit_Y=fit_Y, + stationary=stationary, + ) - y = y - np.mean(y) - X = X - np.mean(X, axis=0) - gram = np.dot(X.T, X) - gram_nodiag = gram - np.diag(np.diag(gram)) + # compute the Gram matrix + gram = np.dot(X_.T, X_) + gram_nodiag = np.copy(gram) + np.fill_diagonal(gram_nodiag, 0) + # define the alphas for the Nodewise Lasso + # TODO why don't use the function _alpha_max instead of this? 
list_alpha_max = np.max(np.abs(gram_nodiag), axis=0) / n_samples alphas = alpha_max_fraction * list_alpha_max # Calculating precision matrix (Nodewise Lasso) - Z, omega_diag = memory.cache(_compute_all_residuals, ignore=["n_jobs"])( - X, + Z, omega_diag = _compute_all_residuals( + X_, alphas, gram, max_iter=max_iter, tol=tol, - method=residual_method, n_jobs=n_jobs, verbose=verbose, ) - # Lasso regression - sigma_hat, beta_lasso = reid(X, y, n_jobs=n_jobs) - # Computing the degrees of freedom adjustement if dof_ajdustement: - coef_max = np.max(np.abs(beta_lasso)) - support = np.sum(np.abs(beta_lasso) > 0.01 * coef_max) + coef_max = np.max(np.abs(beta_reid)) + support = np.sum(np.abs(beta_reid) > 0.01 * coef_max) support = min(support, n_samples - 1) dof_factor = n_samples / (n_samples - support) else: dof_factor = 1 # Computing Desparsified Lasso estimator and confidence intervals - beta_bias = dof_factor * np.dot(y.T, Z) / np.sum(X * Z, axis=0) + # Estimating the coefficient vector + beta_bias = dof_factor * np.dot(y_.T, Z) / np.sum(X_ * Z, axis=0) - P = ((Z.T.dot(X)).T / np.sum(X * Z, axis=0)).T + # beta hat + P = (np.dot(X_.T, Z) / np.sum(X_ * Z, axis=0)).T P_nodiag = P - np.diag(np.diag(P)) Id = np.identity(n_features) P_nodiag = dof_factor * P_nodiag + (dof_factor - 1) * Id - - beta_hat = beta_bias - P_nodiag.dot(beta_lasso) - + beta_hat = beta_bias.T - P_nodiag.dot(beta_reid.T) + # confidence intervals omega_diag = omega_diag * dof_factor**2 - omega_invsqrt_diag = omega_diag ** (-0.5) + if not group: + return beta_hat, sigma_hat, omega_diag + else: + cov_hat = sigma_hat + if cov is not None: + cov_hat = cov + theta_hat = n_samples * inv(cov_hat) + return beta_hat, theta_hat, omega_diag + + +def desparsified_lasso_pvalue( + n_samples, + beta_hat, + sigma_hat, + omega_diag, + confidence=0.95, + distribution="norm", + eps=1e-14, + confidence_interval_only=False, +): + """ + Calculate confidence intervals and p-values for desparsified lasso estimators. + This function computes confidence intervals for the desparsified lasso + estimator beta_hat. + It can also return p-values derived from these confidence intervals. + Parameters + ---------- + n_samples : float + The number of samples + beta_hat : ndarray, shape (n_features,) + The desparsified lasso coefficient estimates. + sigma_hat : float + Estimated noise level. + omega_diag : ndarray, shape (n_features,) + Diagonal elements of the precision matrix estimate. + confidence : float, default=0.95 + Confidence level for intervals, must be in [0, 1]. + distribution : str, default="norm" + Distribution to use for p-value calculation. + Currently only "norm" supported. + eps : float, default=1e-14 + Small value to avoid numerical issues in p-value calculation. + confidence_interval_only : bool, optional (default=False) + If True, return only confidence intervals. + If False, also return p-values. 
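A standalone numeric sketch of the interval formula documented for this helper, cb = beta_hat +/- z_{(1+confidence)/2} * sigma_hat * sqrt(omega_jj / n). The array values below are made up purely for illustration.

    import numpy as np
    from scipy import stats

    n_samples = 100
    beta_hat = np.array([1.2, 0.1, -0.8])
    sigma_hat = 0.5
    omega_diag = np.array([1.1, 0.9, 1.3])  # diagonal of the precision-matrix estimate
    confidence = 0.95

    quantile = stats.norm.ppf(1 - (1 - confidence) / 2)  # about 1.96
    radius = quantile * sigma_hat * np.sqrt(omega_diag / n_samples)
    cb_min, cb_max = beta_hat - radius, beta_hat + radius
    print(np.round(cb_min, 3), np.round(cb_max, 3))
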
+ Returns + ------- + If confidence_interval_only=True: + cb_min : ndarray, shape (n_features,) + Lower bounds of confidence intervals + cb_max : ndarray, shape (n_features,) + Upper bounds of confidence intervals + If confidence_interval_only=False: + pval : ndarray, shape (n_features,) + P-values + pval_corr : ndarray, shape (n_features,) + Corrected p-values + one_minus_pval : ndarray, shape (n_features,) + 1 - p-values + one_minus_pval_corr : ndarray, shape (n_features,) + 1 - corrected p-values + cb_min : ndarray, shape (n_features,) + Lower bounds of confidence intervals + cb_max : ndarray, shape (n_features,) + Upper bounds of confidence intervals + """ + # define the quantile for the confidence intervals quantile = stats.norm.ppf(1 - (1 - confidence) / 2) - + # TODO:why the double inverse of omega_diag? + omega_invsqrt_diag = omega_diag ** (-0.5) confint_radius = np.abs( quantile * sigma_hat / (np.sqrt(n_samples) * omega_invsqrt_diag) ) cb_max = beta_hat + confint_radius cb_min = beta_hat - confint_radius - return beta_hat, cb_min, cb_max + if confidence_interval_only: + return cb_min, cb_max + pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb( + cb_min, cb_max, confidence=confidence, distribution=distribution, eps=eps + ) + return pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max -def desparsified_group_lasso( - X, - Y, - cov=None, - test="chi2", - max_iter=5000, - tol=1e-3, - residual_method="lasso", - alpha_max_fraction=0.01, - noise_method="AR", - order=1, - n_jobs=1, - memory=None, - verbose=0, -): - """Desparsified Group Lasso + +def desparsified_group_lasso_pvalue(beta_hat, theta_hat, omega_diag, test="chi2"): + """ + Compute p-values for the desparsified group Lasso estimator using + chi-squared or F tests Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Data. + beta_hat : ndarray, shape (n_features, n_times) + Estimated parameter matrix from desparsified group Lasso. - Y : ndarray, shape (n_samples, n_times) - Target. + theta_hat : ndarray, shape (n_times, n_times) + Estimated precision matrix (inverse covariance). - cov : ndarray, shape (n_times, n_times), optional (default=None) - If None, a temporal covariance matrix of the noise is estimated. - Otherwise, `cov` is the temporal covariance matrix of the noise. + omega_diag : ndarray, shape (n_features,) + Diagonal elements of the precision matrix. - test : str, optional (default='chi2') - Statistical test used to compute p-values. 'chi2' corresponds - to a chi-squared test and 'F' corresponds to an F-test. + test : {'chi2', 'F'}, default='chi2' + Statistical test for computing p-values: + - 'chi2': Chi-squared test (recommended for large samples) + - 'F': F-test (better for small samples) - max_iter : int, optional (default=5000) - The maximum number of iterations when regressing, by Lasso, - each column of the design matrix against the others. + Returns + ------- + pval : ndarray, shape (n_features,) + Raw p-values, numerically accurate for positive effects + (p-values close to 0). - tol : float, optional (default=1e-3) - The tolerance for the optimization of the Lasso problems: if the - updates are smaller than `tol`, the optimization code checks the - dual gap for optimality and continues until it is smaller than `tol`. + pval_corr : ndarray, shape (n_features,) + P-values corrected for multiple testing using + Benjamini-Hochberg procedure. - residual_method : str, optional (default='lasso') - Method used for computing the residuals of the Nodewise Lasso. 
- Currently the only method available is 'lasso'. + one_minus_pval : ndarray, shape (n_features,) + 1 - p-values, numerically accurate for negative effects + (p-values close to 1). - alpha_max_fraction : float, optional (default=0.01) - Only used if method='lasso'. - Then alpha = alpha_max_fraction * alpha_max. + one_minus_pval_corr : ndarray, shape (n_features,) + 1 - corrected p-values. - noise_method : str, optional (default='simple') - If 'simple', the correlation matrix is estimated by taking the - median of the correlation between two consecutive time steps - and the noise standard deviation for each time step is estimated - by taking the median of the standard deviations for every time step. - If 'AR', the order of the AR model is given by `order` and - Yule-Walker method is used to estimate the covariance matrix. + Notes + ----- + The chi-squared test assumes asymptotic normality while the F-test + makes no such assumption and is preferable for small sample sizes. + P-values are computed based on score statistics from the estimated + coefficients and precision matrix. + """ + n_features, n_times = beta_hat.shape + n_samples = omega_diag.shape[0] - order : int, optional (default=1) - If `method=AR`, `order` gives the order of the estimated autoregressive - model. `order` must be smaller than the number of time steps. + # Compute the two-sided p-values + if test == "chi2": + chi2_scores = np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / omega_diag + two_sided_pval = np.minimum(2 * stats.chi2.sf(chi2_scores, df=n_times), 1.0) + elif test == "F": + f_scores = ( + np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / omega_diag / n_times + ) + two_sided_pval = np.minimum( + 2 * stats.f.sf(f_scores, dfd=n_samples, dfn=n_times), 1.0 + ) + else: + raise ValueError(f"Unknown test '{test}'") - n_jobs : int or None, optional (default=1) - Number of CPUs to use during the Nodewise Lasso. + # Compute the p-values + sign_beta = np.sign(np.sum(beta_hat, axis=1)) + pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) + ) - memory : str or joblib.Memory object, optional (default=None) - Used to cache the output of the computation of the Nodewise Lasso. - By default, no caching is done. If a string is given, it is the path - to the caching directory. + return pval, pval_corr, one_minus_pval, one_minus_pval_corr - verbose: int, optional (default=1) - The verbosity level: if non zero, progress messages are printed - when computing the Nodewise Lasso in parralel. - The frequency of the messages increases with the verbosity level. - Returns - ------- - beta_hat : ndarray, shape (n_features, n_times) - Estimated parameter matrix. +def _compute_all_residuals( + X, alphas, gram, max_iter=5000, tol=1e-3, n_jobs=1, verbose=0 +): + """ + Nodewise Lasso for computing residuals and precision matrix diagonal. - pval : ndarray, shape (n_features,) - p-value, with numerically accurate values for - positive effects (ie., for p-value close to zero). + For each feature, fits a Lasso regression against all other features + to estimate the precision matrix and residuals needed for the + desparsified Lasso estimator. - pval_corr : ndarray, shape (n_features,) - p-value corrected for multiple testing. + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data matrix. 
- one_minus_pval : ndarray, shape (n_features,) - One minus the p-value, with numerically accurate values - for negative effects (ie., for p-value close to one). + alphas : ndarray, shape (n_features,) + Lasso regularization parameters, one per feature. + + gram : ndarray, shape (n_features, n_features) + Precomputed Gram matrix X.T @ X to speed up computations. + + max_iter : int, optional (default=5000) + Maximum number of iterations for Lasso optimization. + + tol : float, optional (default=1e-3) + Convergence tolerance for Lasso optimization. + + n_jobs : int or None, optional (default=1) + Number of parallel jobs. None means using all processors. + + verbose : int, optional (default=0) + Controls the verbosity when fitting the models: + 0 = silent + 1 = progress bar + >1 = more detailed output + + Returns + ------- + Z : ndarray, shape (n_samples, n_features) + Matrix of residuals from nodewise regressions. + + omega_diag : ndarray, shape (n_features,) + Diagonal entries of the precision matrix estimate. - one_minus_pval_corr : ndarray, shape (n_features,) - One minus the p-value corrected for multiple testing. Notes ----- - The columns of `X` and the matrix `Y` are always centered, this ensures - that the intercepts of the Nodewise Lasso problems are all equal to zero - and the intercept of the noise model is also equal to zero. Since - the values of the intercepts are not of interest, the centering avoids - the consideration of unecessary additional parameters. - Also, you may consider to center and scale `X` beforehand, notably if - the data contained in `X` has not been prescaled from measurements. + This implements the nodewise Lasso procedure from :cite:`chevalier2020statistical` + for estimating entries of the precision matrix needed in the + desparsified Lasso. The procedure regresses each feature against + all others using Lasso to obtain residuals and precision matrix estimates. References ---------- - .. [1] Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). - Statistical control for spatio-temporal MEG/EEG source imaging with - desparsified multi-task Lasso. In NeurIPS 2020-34h Conference on - Neural Information Processing Systems. + .. 
footbibliography:: """ - X = np.asarray(X) - n_samples, n_features = X.shape - n_times = Y.shape[1] - - memory = check_memory(memory) - if cov is not None and cov.shape != (n_times, n_times): - raise ValueError( - f'Shape of "cov" should be ({n_times}, {n_times}),' - + f' the shape of "cov" was ({cov.shape}) instead' + results = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_compute_residuals)( + X=X, + column_index=i, + alpha=alphas[i], + gram=gram, + max_iter=max_iter, + tol=tol, ) + for i in range(n_features) + ) - Y = Y - np.mean(Y) - X = X - np.mean(X, axis=0) - gram = np.dot(X.T, X) - gram_nodiag = gram - np.diag(np.diag(gram)) + # Unpacking the results + results = np.asarray(results, dtype=object) + Z = np.stack(results[:, 0], axis=1) + omega_diag = np.stack(results[:, 1]) - list_alpha_max = np.max(np.abs(gram_nodiag), axis=0) / n_samples - alphas = alpha_max_fraction * list_alpha_max + return Z, omega_diag - # Calculating precision matrix (Nodewise Lasso) - Z, omega_diag = memory.cache(_compute_all_residuals, ignore=["n_jobs"])( - X, - alphas, - gram, - max_iter=max_iter, - tol=tol, - method=residual_method, - n_jobs=n_jobs, - verbose=verbose, - ) - # Group Lasso regression - cov_hat, beta_mtl = group_reid( - X, Y, method=noise_method, order=order, n_jobs=n_jobs - ) +def _compute_residuals(X, column_index, alpha, gram, max_iter=5000, tol=1e-3): + """ + Compute nodewise Lasso regression for desparsified Lasso estimation - if cov is not None: - cov_hat = cov + For feature i, regresses X[:,i] against all other features to + obtain residuals and precision matrix diagonal entry needed for debiasing. - theta_hat = n_samples * inv(cov_hat) + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Centered input data matrix - # Estimating the coefficient vector - beta_bias = Y.T.dot(Z) / np.sum(X * Z, axis=0) + column_index : int + Index i of feature to regress - beta_mtl = beta_mtl.T - beta_bias = beta_bias.T + alpha : float + Lasso regularization parameter - P = (np.dot(X.T, Z) / np.sum(X * Z, axis=0)).T - P_nodiag = P - np.diag(np.diag(P)) + gram : ndarray, shape (n_features, n_features) + Precomputed X.T @ X matrix - beta_hat = beta_bias - P_nodiag.dot(beta_mtl) + max_iter : int, default=5000 + Maximum Lasso iterations - if test == "chi2": + tol : float, default=1e-3 + Optimization tolerance - chi2_scores = np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / omega_diag - two_sided_pval = np.minimum(2 * stats.chi2.sf(chi2_scores, df=n_times), 1.0) + Returns + ------- + z : ndarray, shape (n_samples,) + Residuals from regression - if test == "F": + omega_diag_i : float + Diagonal entry i of precision matrix estimate, + computed as n * ||z||^2 / ^2 - f_scores = ( - np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / omega_diag / n_times - ) - two_sided_pval = np.minimum( - 2 * stats.f.sf(f_scores, dfd=n_samples, dfn=n_times), 1.0 - ) + Notes + ----- + Uses sklearn's Lasso with precomputed Gram matrix for efficiency. + """ - sign_beta = np.sign(np.sum(beta_hat, axis=1)) - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) - ) + n_samples, n_features = X.shape + i = column_index - return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr + # Removing the column to regress against the others + X_new = np.delete(X, i, axis=1) + y_new = np.copy(X[:, i]) + + # Method used for computing the residuals of the Nodewise Lasso. 
+ # here we use the Lasso method + gram_ = np.delete(np.delete(gram, i, axis=0), i, axis=1) + clf = Lasso(alpha=alpha, precompute=gram_, max_iter=max_iter, tol=tol) + + # Fitting the Lasso model and computing the residuals + clf.fit(X_new, y_new) + z = y_new - clf.predict(X_new) + + # Computing the diagonal of the covariance matrix + omega_diag_i = n_samples * np.sum(z**2) / np.dot(y_new, z) ** 2 + + return z, omega_diag_i diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 2c4233e6..ab289f68 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -57,7 +57,6 @@ def ensemble_clustered_inference( gamma_min=0.2, n_bootstraps=25, n_jobs=1, - memory=None, verbose=1, **kwargs, ): @@ -113,11 +112,6 @@ def ensemble_clustered_inference( Number of CPUs used to compute several clustered inference algorithms at the same time. - memory : str, optional (default=None) - Used to cache the output of the computation of the clustering - and the inference. By default, no caching is done. If a string is - given, it is the path to the caching directory. - verbose: int, optional (default=1) The verbosity level. If `verbose > 0`, we print a message before runing the clustered inference. @@ -150,13 +144,6 @@ def ensemble_clustered_inference( Spatially relaxed inference on high-dimensional linear models. arXiv preprint arXiv:2106.02590. """ - - if memory is not None and not isinstance(memory, str): - raise ValueError( - "'memory' must be None or a string corresponding " - + "to the path of the caching directory." - ) - # Clustered inference algorithms results = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(clustered_inference)( @@ -169,7 +156,6 @@ def ensemble_clustered_inference( method=inference_method, seed=i, n_jobs=1, - memory=memory, verbose=verbose, **kwargs, ) diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py index 276b2c7d..4ad5347b 100644 --- a/src/hidimstat/noise_std.py +++ b/src/hidimstat/noise_std.py @@ -5,175 +5,105 @@ from sklearn.model_selection import KFold -def reid(X, y, eps=1e-2, tol=1e-4, max_iter=10000, n_jobs=1, seed=0): - """Estimation of noise standard deviation using Reid procedure - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Data. - - y : ndarray, shape (n_samples,) - Target. - - eps: float, optional (default=1e-2) - Length of the cross-validation path. - eps=1e-2 means that alpha_min / alpha_max = 1e-2. - - tol : float, optional (default=1e-4) - The tolerance for the optimization: if the updates are smaller - than `tol`, the optimization code checks the dual gap for optimality - and continues until it is smaller than `tol`. - - max_iter : int, optional (default=1e4) - The maximum number of iterations. - - n_jobs : int or None, optional (default=1) - Number of CPUs to use during the cross validation. - - seed: int, optional (default=0) - Seed passed in the KFold object which is used to cross-validate - LassoCV. This seed controls the partitioning randomness. - - Returns - ------- - sigma_hat : float - Estimated noise standard deviation. - - beta_hat : array, shape (n_features,) - Estimated parameter vector. - - References - ---------- - .. [1] Reid, S., Tibshirani, R., & Friedman, J. (2016). A study of error - variance estimation in lasso regression. Statistica Sinica, 35-67. 
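A standalone sketch of the nodewise-Lasso step for a single column i, as documented above: regress X[:, i] on the remaining columns, keep the residual z and the precision-diagonal estimate n * ||z||^2 / <x_i, z>^2. The regularization strength alpha is arbitrary here; in the library it comes from alpha_max_fraction.

    import numpy as np
    from sklearn.linear_model import Lasso

    rng = np.random.default_rng(3)
    n_samples, n_features, i = 80, 20, 0
    X = rng.standard_normal((n_samples, n_features))
    X = X - X.mean(axis=0)  # centering, as in desparsified_lasso

    # regress column i against all other columns
    X_minus_i = np.delete(X, i, axis=1)
    x_i = X[:, i]
    clf = Lasso(alpha=0.05, fit_intercept=False, max_iter=5000, tol=1e-3)
    clf.fit(X_minus_i, x_i)
    z = x_i - clf.predict(X_minus_i)

    # diagonal entry i of the precision-matrix estimate
    omega_diag_i = n_samples * np.sum(z**2) / np.dot(x_i, z) ** 2
    print(omega_diag_i)
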
- """ - - X = np.asarray(X) - n_samples, n_features = X.shape - - if max_iter // 5 <= n_features: - max_iter = n_features * 5 - print(f"'max_iter' has been increased to {max_iter}") - - cv = KFold(n_splits=5, shuffle=True, random_state=seed) - - clf_lasso_cv = LassoCV( - eps=eps, fit_intercept=False, cv=cv, tol=tol, max_iter=max_iter, n_jobs=n_jobs - ) - - clf_lasso_cv.fit(X, y) - beta_hat = clf_lasso_cv.coef_ - residual = clf_lasso_cv.predict(X) - y - coef_max = np.max(np.abs(beta_hat)) - support = np.sum(np.abs(beta_hat) > tol * coef_max) - - # avoid dividing by 0 - support = min(support, n_samples - 1) - - sigma_hat = norm(residual) / np.sqrt(n_samples - support) - - return sigma_hat, beta_hat - - -def group_reid( +def reid( X, - Y, - fit_Y=True, - stationary=True, - method="simple", - order=1, + y, eps=1e-2, tol=1e-4, max_iter=10000, + n_split=5, n_jobs=1, seed=0, + group=False, + fit_Y=True, + stationary=True, + method="simple", + order=1, ): - """Estimation of the covariance matrix using group Reid procedure + """ + Residual sum of squares based estimators for noise standard deviation + estimation. + + This implementation follows the procedure described in + :footcite:`fan2012variance` and :cite:`reid2016study`. It uses Lasso with + cross-validation to estimate both the noise standard deviation and model + coefficients. + + For group, the implementation is based on the procedure + from :footcite:`chevalier2020statistical`. Parameters ---------- X : ndarray, shape (n_samples, n_features) - Data. + Input data matrix. + + y : ndarray, shape (n_samples,)/(n_samples, n_times) + Target vector. The time means the presence of groups. - Y : ndarray, shape (n_samples, n_times) - Target. + eps : float, optional (default=1e-2) + Length of the cross-validation path, where alpha_min / alpha_max = eps. + Smaller values create a finer grid. - fit_Y : bool, optional (default=True) - If True, Y will be regressed against X by MultiTaskLassoCV - and the covariance matrix is estimated on the residuals. - Otherwise, covariance matrix is estimated directly on Y. + tol : float, optional (default=1e-4) + Tolerance for optimization convergence. The algorithm stops + when updates are smaller than tol and dual gap is smaller than tol. - stationary : bool, optional (default=True) - If True, noise is considered to have the same magnitude for each - time step. Otherwise, magnitude of the noise is not constant. + max_iter : int, optional (default=10000) + Maximum number of iterations for the optimization algorithm. - method : str, optional (default='simple') - If 'simple', the correlation matrix is estimated by taking the - median of the correlation between two consecutive time steps - and the noise standard deviation for each time step is estimated - by taking the median of the standard deviations for every time step. - If 'AR', the order of the AR model is given by `order` and - Yule-Walker method is used to estimate the covariance matrix. + n_split : int, optional (default=5) + Number of folds for cross-validation. - order : int, optional (default=1) - If `stationary=True` and `method=AR`, `order` gives the - order of the estimated autoregressive model. `order` must - be smaller than the number of time steps. + n_jobs : int, optional (default=1) + Number of parallel jobs for cross-validation. + -1 means using all processors. - eps : float, optional (default=1e-2) - Length of the cross-validation path. - eps=1e-2 means that alpha_min / alpha_max = 1e-2. 
+ seed : int, optional (default=0) + Random seed for reproducible cross-validation splits. - tol : float, optional (default=1e-4) - The tolerance for the optimization: if the updates are smaller - than `tol`, the optimization code checks the dual gap for optimality - and continues until it is smaller than `tol`. + fit_Y : bool, (default=True) + Whether to use MultiTaskLassoCV to fit Y against X. + If False, covariance is estimated directly from Y. - max_iter : int, optional (default=1e4) - The maximum number of iterations. + stationary : bool, (default=True) + Whether noise has constant magnitude across time steps. - n_jobs : int or None, optional (default=1) - Number of CPUs to use during the cross validation. + method : {'simple', 'AR'}, (default='simple') + Covariance estimation method: + - 'simple': Uses median correlation between consecutive time steps + - 'AR': Uses Yule-Walker method with specified order - seed: int, optional (default=0) - Seed passed in the KFold object which is used to cross-validate - LassoCV. This seed controls also the partitioning randomness. + order : int, default=1 + Order of AR model when method='AR'. Must be < n_times. Returns ------- - cov_hat : ndarray, shape (n_times, n_times) - Estimated covariance matrix. + sigma_hat/cov_hat : float/ndarray, shape (n_times, n_times) + Estimated noise standard deviation based on residuals + or estimated covariance matrix for group. - beta_hat : ndarray, shape (n_features, n_times) - Estimated parameter matrix. + beta_hat : ndarray, shape (n_features,)/(n_features, n_times) + Estimated sparse coefficient vector from Lasso regression. References ---------- - .. [1] Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). - Statistical control for spatio-temporal MEG/EEG source imaging with - desparsified multi-task Lasso. In NeurIPS 2020-34h Conference on - Neural Information Processing Systems. + .. 
footbibliography:: """ - X = np.asarray(X) - n_samples, n_features = X.shape - n_times = Y.shape[1] - - if method == "simple": - print("Group reid: simple cov estimation") - else: - print(f"Group reid: {method}{order} cov estimation") - - if (max_iter // 5) <= n_features: - max_iter = n_features * 5 - print(f"'max_iter' has been increased to {max_iter}") - - cv = KFold(n_splits=5, shuffle=True, random_state=seed) - + X_ = np.asarray(X) + n_samples, n_features = X_.shape + if group: + n_times = y.shape[1] if fit_Y: - - clf_mtlcv = MultiTaskLassoCV( + # check if max_iter is large enough + if max_iter // n_split <= n_features: + max_iter = n_features * n_split + print(f"'max_iter' has been increased to {max_iter}") + + # use the cross-validation for define the best alpha of Lasso + cv = KFold(n_splits=n_split, shuffle=True, random_state=seed) + Refit_CV = MultiTaskLassoCV if group else LassoCV + clf_cv = Refit_CV( eps=eps, fit_intercept=False, cv=cv, @@ -181,107 +111,146 @@ def group_reid( max_iter=max_iter, n_jobs=n_jobs, ) + clf_cv.fit(X_, y) - clf_mtlcv.fit(X, Y) - beta_hat = clf_mtlcv.coef_ - residual = clf_mtlcv.predict(X) - Y - row_max = np.max(np.sum(np.abs(beta_hat), axis=0)) - support = np.sum(np.sum(np.abs(beta_hat), axis=0) > tol * row_max) + # Estimate the support of the variable importance + beta_hat = clf_cv.coef_ + residual = clf_cv.predict(X_) - y - # avoid dividing by 0 - support = min(support, n_samples - 1) + # get the number of non-zero coefficients + coef_sum = np.sum(np.abs(beta_hat), axis=0) + size_support = np.sum(coef_sum > tol * coef_sum.max()) + # avoid dividing by 0 + size_support = min(size_support, n_samples - 1) else: - + # null model + # TODO Why do we need a null model? beta_hat = np.zeros((n_features, n_times)) - residual = np.copy(Y) - support = 0 + residual = np.copy(y) + size_support = 0 - sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - support) + # estimate the noise standard deviation (eq. 7 in `fan2012variance`) + sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - size_support) - if stationary: - sigma_hat = np.median(sigma_hat_raw) * np.ones(n_times) - corr_emp = np.corrcoef(residual.T) - else: - sigma_hat = sigma_hat_raw - residual_rescaled = residual / sigma_hat - corr_emp = np.corrcoef(residual_rescaled.T) - - # Median method - if not stationary or method == "simple": - - rho_hat = np.median(np.diag(corr_emp, 1)) - corr_hat = toeplitz(np.geomspace(1, rho_hat ** (n_times - 1), n_times)) - cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat - - # Yule-Walker method - elif stationary and method == "AR": - - if order > n_times - 1: - raise ValueError( - "The requested AR order is to high with " - + "respect to the number of time steps." 
- ) - - rho_ar = np.zeros(order + 1) - rho_ar[0] = 1 - - for i in range(1, order + 1): - rho_ar[i] = np.median(np.diag(corr_emp, i)) - - A = toeplitz(rho_ar[:-1]) - coef_ar = solve(A, rho_ar[1:]) - - residual_estimate = np.zeros((n_samples, n_times - order)) - - for i in range(order): - # time window used to estimate the residual from AR model - start = order - i - 1 - end = -i - 1 - residual_estimate += coef_ar[i] * residual[:, start:end] - - residual_diff = residual[:, order:] - residual_estimate - sigma_eps = np.median(norm(residual_diff, axis=0) / np.sqrt(n_samples)) - - rho_ar_full = np.zeros(n_times) - rho_ar_full[: rho_ar.size] = rho_ar - - for i in range(order + 1, n_times): - start = i - order - end = i - rho_ar_full[i] = np.dot(coef_ar[::-1], rho_ar_full[start:end]) - - corr_hat = toeplitz(rho_ar_full) - sigma_hat[:] = sigma_eps / np.sqrt((1 - np.dot(coef_ar, rho_ar[1:]))) - cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat + if not group: + return sigma_hat_raw, beta_hat + ## Computation of the covariance matrix for group else: - raise ValueError("Unknown method for estimating the covariance matrix") - - return cov_hat, beta_hat + if method == "simple": + print("Group reid: simple cov estimation") + elif method == "AR": + print(f"Group reid: {method}{order} cov estimation") + if order > n_times - 1: + raise ValueError( + "The requested AR order is to high with " + + "respect to the number of time steps." + ) + elif not stationary: + raise ValueError( + "The AR method is not compatible with the non-stationary" + + " noise assumption." + ) + else: + raise ValueError("Unknown method for estimating the covariance matrix") + ## compute emperical correlation of the residual + if stationary: + # consideration of stationary noise + # (section 2.5 of `chevalier2020statistical`) + sigma_hat = np.median(sigma_hat_raw) * np.ones(n_times) + # compute rho from the empirical correlation matrix + # (section 2.5 of `chevalier2020statistical`) + corr_emp = np.corrcoef(residual.T) + else: + sigma_hat = sigma_hat_raw + residual_rescaled = residual / sigma_hat + corr_emp = np.corrcoef(residual_rescaled.T) + + # TODO: Why the name of the method is different + # than the name of the "function"? 
+ # Median method + if not stationary or method == "simple": + rho_hat = np.median(np.diag(corr_emp, 1)) + # estimate M (section 2.5 of `chevalier2020statistical`) + corr_hat = toeplitz(np.geomspace(1, rho_hat ** (n_times - 1), n_times)) + cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat + + # Yule-Walker method (algorithm in section 3 of `eshel2003yule`) + elif stationary and method == "AR": + # compute the autocorrelation coefficients of the AR model + rho_ar = np.zeros(order + 1) + rho_ar[0] = 1 + + for i in range(1, order + 1): + rho_ar[i] = np.median(np.diag(corr_emp, i)) + + # solve the Yule-Walker equations (see eq.2 in `eshel2003yule`) + R = toeplitz(rho_ar[:-1]) + coef_ar = solve(R, rho_ar[1:]) + + # estimate the variance of the noise from the AR model + residual_estimate = np.zeros((n_samples, n_times - order)) + for i in range(order): + # time window used to estimate the residual from AR model + start = order - i - 1 + end = -i - 1 + residual_estimate += coef_ar[i] * residual[:, start:end] + residual_diff = residual[:, order:] - residual_estimate + sigma_eps = np.median(norm(residual_diff, axis=0) / np.sqrt(n_samples)) + + # estimation of the autocorrelation matrices + rho_ar_full = np.zeros(n_times) + rho_ar_full[: rho_ar.size] = rho_ar + for i in range(order + 1, n_times): + start = i - order + end = i + rho_ar_full[i] = np.dot(coef_ar[::-1], rho_ar_full[start:end]) + corr_hat = toeplitz(rho_ar_full) + + # estimation of the variance of an AR process + # from wikipedia it should be: + # VAR(X_t)=\frac{\sigma_\epsilon^2}{1-\phi^2} + # TODO there is a short difference between the code + # and the above formula + sigma_hat[:] = sigma_eps / np.sqrt((1 - np.dot(coef_ar, rho_ar[1:]))) + # estimation of the covariance based on the + # correlation matrix and sigma + # COV(X_t, X_t) = COR(X_t, X_t) * \sigma^2 + cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat + + return cov_hat, beta_hat def empirical_snr(X, y, beta, noise=None): - """Compute the SNR for the linear model: y = X beta + noise + """ + Compute the empirical signal-to-noise ratio (SNR) for + the linear model y = X @ beta + noise. Parameters ---------- - X : ndarray or scipy.sparse matrix, shape (n_samples, n_features) - Data. + X : ndarray, shape (n_samples, n_features) + Design matrix. y : ndarray, shape (n_samples,) - Target. + Target vector. beta : ndarray, shape (n_features,) - True parameter vector. + Parameter vector. - noise : ndarray, shape (n_samples,), optional (default=None) - True error vector. + noise : ndarray, shape (n_samples,), optional + Noise vector. If None, computed as y - X @ beta. Returns ------- snr_hat : float - Empirical signal-to-noise ratio. + Empirical SNR computed as var(signal) / var(noise). + + Notes + ----- + SNR measures the ratio of signal power to noise power, + indicating model estimation quality. + Higher values suggest better signal recovery. 
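A standalone sketch of the Yule-Walker step used above for method='AR' with order=1, assuming simulated AR(1) residuals with a known lag-1 correlation: estimate the autocorrelation from the empirical correlation matrix, solve the (here 1x1) Yule-Walker system, and rebuild the full Toeplitz correlation matrix.

    import numpy as np
    from scipy.linalg import toeplitz, solve

    rng = np.random.default_rng(4)
    n_samples, n_times, rho_true = 200, 6, 0.6

    # simulate AR(1) residuals with lag-1 correlation rho_true and unit variance
    noise = rng.standard_normal((n_samples, n_times))
    residual = np.copy(noise)
    for t in range(1, n_times):
        residual[:, t] = (
            rho_true * residual[:, t - 1] + np.sqrt(1 - rho_true**2) * noise[:, t]
        )

    corr_emp = np.corrcoef(residual.T)
    order = 1
    rho_ar = np.array(
        [1.0] + [np.median(np.diag(corr_emp, k)) for k in range(1, order + 1)]
    )
    coef_ar = solve(toeplitz(rho_ar[:-1]), rho_ar[1:])  # Yule-Walker equations

    # extend the autocorrelation sequence and build the full correlation matrix
    rho_full = np.zeros(n_times)
    rho_full[: rho_ar.size] = rho_ar
    for k in range(order + 1, n_times):
        rho_full[k] = np.dot(coef_ar[::-1], rho_full[k - order : k])
    corr_hat = toeplitz(rho_full)
    print(np.round(coef_ar, 2))  # should be close to rho_true
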
""" X = np.asarray(X) @@ -290,8 +259,7 @@ def empirical_snr(X, y, beta, noise=None): if noise is None: noise = y - signal - sig_signal = np.linalg.norm(signal - np.mean(signal)) - sig_noise = np.linalg.norm(noise - np.mean(noise)) - snr_hat = (sig_signal / sig_noise) ** 2 + # compute signal-to-noise ratio + snr_hat = np.var(signal) / np.var(noise) return snr_hat diff --git a/src/hidimstat/stat_tools.py b/src/hidimstat/stat_tools.py index 0ced13e9..e3640882 100644 --- a/src/hidimstat/stat_tools.py +++ b/src/hidimstat/stat_tools.py @@ -54,7 +54,7 @@ def pval_corr_from_pval(one_sided_pval): return one_sided_pval_corr -def pval_from_scale(beta, scale, distrib="norm", eps=1e-14): +def pval_from_scale(beta, scale, distribution="norm", eps=1e-14): """Computing one-sided p-values from the value of the parameter and its scale. @@ -66,7 +66,7 @@ def pval_from_scale(beta, scale, distrib="norm", eps=1e-14): scale : ndarray, shape (n_features,) Value of the standard deviation of the parameters. - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -97,7 +97,7 @@ def pval_from_scale(beta, scale, distrib="norm", eps=1e-14): pval = np.zeros(n_features) + 0.5 one_minus_pval = np.zeros(n_features) + 0.5 - if distrib == "norm": + if distribution == "norm": pval[index_no_nan] = norm.sf(beta[index_no_nan], scale=scale[index_no_nan]) one_minus_pval[index_no_nan] = norm.cdf( @@ -113,7 +113,7 @@ def pval_from_scale(beta, scale, distrib="norm", eps=1e-14): return pval, pval_corr, one_minus_pval, one_minus_pval_corr -def zscore_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): +def zscore_from_cb(cb_min, cb_max, confidence=0.95, distribution="norm"): """Computing z-scores from confidence intervals. Parameters @@ -128,7 +128,7 @@ def zscore_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): Confidence level used to compute the confidence intervals. Each value should be in the range [0, 1]. - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -138,7 +138,7 @@ def zscore_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): z-scores. """ - if distrib == "norm": + if distribution == "norm": quantile = norm.ppf(1 - (1 - confidence) / 2) beta_hat = (cb_min + cb_max) / 2 @@ -148,7 +148,7 @@ def zscore_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): return zscore -def pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm", eps=1e-14): +def pval_from_cb(cb_min, cb_max, confidence=0.95, distribution="norm", eps=1e-14): """Computing one-sided p-values from confidence intervals. Parameters @@ -163,7 +163,7 @@ def pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm", eps=1e-14): Confidence level used to compute the confidence intervals. Each value should be in the range [0, 1]. - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -187,9 +187,11 @@ def pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm", eps=1e-14): One minus the p-value corrected for multiple testing. 
""" - zscore = zscore_from_cb(cb_min, cb_max, confidence=confidence, distrib=distrib) + zscore = zscore_from_cb( + cb_min, cb_max, confidence=confidence, distribution=distribution + ) - if distrib == "norm": + if distribution == "norm": pval = norm.sf(zscore) one_minus_pval = norm.cdf(zscore) @@ -203,7 +205,7 @@ def pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm", eps=1e-14): return pval, pval_corr, one_minus_pval, one_minus_pval_corr -def two_sided_pval_from_zscore(zscore, distrib="norm"): +def two_sided_pval_from_zscore(zscore, distribution="norm"): """Computing two-sided p-values from z-scores. Parameters @@ -211,7 +213,7 @@ def two_sided_pval_from_zscore(zscore, distrib="norm"): zscore : ndarray, shape (n_features,) z-scores. - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -225,7 +227,7 @@ def two_sided_pval_from_zscore(zscore, distrib="norm"): """ n_features = zscore.size - if distrib == "norm": + if distribution == "norm": two_sided_pval = 2 * norm.sf(np.abs(zscore)) two_sided_pval_corr = np.minimum(1, two_sided_pval * n_features) @@ -233,7 +235,7 @@ def two_sided_pval_from_zscore(zscore, distrib="norm"): return two_sided_pval, two_sided_pval_corr -def two_sided_pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): +def two_sided_pval_from_cb(cb_min, cb_max, confidence=0.95, distribution="norm"): """Computing two-sided p-values from confidence intervals. Parameters @@ -248,7 +250,7 @@ def two_sided_pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): Confidence level used to compute the confidence intervals. Each value should be in the range [0, 1]. - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -260,16 +262,18 @@ def two_sided_pval_from_cb(cb_min, cb_max, confidence=0.95, distrib="norm"): two_sided_pval_corr : ndarray, shape (n_features,) Two-sided p-values corrected for multiple testing. """ - zscore = zscore_from_cb(cb_min, cb_max, confidence=confidence, distrib=distrib) + zscore = zscore_from_cb( + cb_min, cb_max, confidence=confidence, distribution=distribution + ) two_sided_pval, two_sided_pval_corr = two_sided_pval_from_zscore( - zscore, distrib=distrib + zscore, distribution=distribution ) return two_sided_pval, two_sided_pval_corr -def zscore_from_pval(pval, one_minus_pval=None, distrib="norm"): +def zscore_from_pval(pval, one_minus_pval=None, distribution="norm"): """Computing z-scores from one-sided p-values. Parameters @@ -282,7 +286,7 @@ def zscore_from_pval(pval, one_minus_pval=None, distrib="norm"): One minus the p-value, with numerically accurate values for negative effects (ie., for p-value close to one). - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -292,7 +296,7 @@ def zscore_from_pval(pval, one_minus_pval=None, distrib="norm"): z-scores. 
""" - if distrib == "norm": + if distribution == "norm": zscore = norm.isf(pval) @@ -357,7 +361,7 @@ def pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign, eps=1e-14) return pval, pval_corr, one_minus_pval, one_minus_pval_corr -def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib="norm"): +def two_sided_pval_from_pval(pval, one_minus_pval=None, distribution="norm"): """Computing two-sided p-value from one-sided p-values. Parameters @@ -370,7 +374,7 @@ def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib="norm"): One minus the p-value, with numerically accurate values for negative effects (ie., for p-value close to one). - distrib : str, opitonal (default='norm') + distribution : str, opitonal (default='norm') Type of distribution assumed for the underlying estimator. 'norm' means normal and is the only value accepted at the moment. @@ -383,10 +387,10 @@ def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib="norm"): Two-sided p-values corrected for multiple testing. """ - zscore = zscore_from_pval(pval, one_minus_pval, distrib=distrib) + zscore = zscore_from_pval(pval, one_minus_pval, distribution=distribution) two_sided_pval, two_sided_pval_corr = two_sided_pval_from_zscore( - zscore, distrib=distrib + zscore, distribution=distribution ) return two_sided_pval, two_sided_pval_corr diff --git a/test/test_clustered_inference.py b/test/test_clustered_inference.py index 94e04ef0..fb7450d9 100644 --- a/test/test_clustered_inference.py +++ b/test/test_clustered_inference.py @@ -2,6 +2,7 @@ Test the clustered_inference module """ +import pytest import numpy as np from numpy.testing import assert_almost_equal from sklearn.cluster import FeatureAgglomeration @@ -51,6 +52,12 @@ def test_clustered_inference(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) + # test exception + with pytest.raises(ValueError, match="Unknow method"): + beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + clustered_inference(X_init, y, ward, n_clusters, method="test") + ) + beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( clustered_inference(X_init, y, ward, n_clusters) ) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 713bd2c5..89953fc4 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -2,11 +2,16 @@ Test the desparsified_lasso module """ +import pytest import numpy as np from numpy.testing import assert_almost_equal, assert_equal from scipy.linalg import toeplitz -from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso +from hidimstat.desparsified_lasso import ( + desparsified_lasso, + desparsified_lasso_pvalue, + desparsified_group_lasso_pvalue, +) from hidimstat.scenario import ( multivariate_1D_simulation, multivariate_temporal_simulation, @@ -18,7 +23,7 @@ def test_desparsified_lasso(): a support of size 1. 
     Computing 99% confidence bounds and checking that they contains
     the true parameter vector."""
-    n_samples, n_features = 50, 50
+    n_samples, n_features = 52, 50
     support_size = 1
     sigma = 0.1
     rho = 0.0
@@ -32,20 +37,42 @@ def test_desparsified_lasso():
         shuffle=False,
         seed=2,
     )
+    expected_pval_corr = np.concatenate(
+        (np.zeros(support_size), 0.5 * np.ones(n_features - support_size))
+    )
 
-    beta_hat, cb_min, cb_max = desparsified_lasso(X, y, confidence=0.99)
+    beta_hat, sigma_hat, omega_diag = desparsified_lasso(X, y)
+    pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = (
+        desparsified_lasso_pvalue(
+            X.shape[0], beta_hat, sigma_hat, omega_diag, confidence=0.99
+        )
+    )
 
     assert_almost_equal(beta_hat, beta, decimal=1)
     assert_equal(cb_min < beta, True)
     assert_equal(cb_max > beta, True)
+    assert_almost_equal(pval_corr, expected_pval_corr, decimal=1)
 
-    beta_hat, cb_min, cb_max = desparsified_lasso(
-        X, y, dof_ajdustement=True, confidence=0.99
+    beta_hat, sigma_hat, omega_diag = desparsified_lasso(X, y, dof_ajdustement=True)
+    pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = (
+        desparsified_lasso_pvalue(
+            X.shape[0], beta_hat, sigma_hat, omega_diag, confidence=0.99
+        )
     )
-
+    cb_min_bis, cb_max_bis = desparsified_lasso_pvalue(
+        X.shape[0],
+        beta_hat,
+        sigma_hat,
+        omega_diag,
+        confidence=0.99,
+        confidence_interval_only=True,
+    )
+    assert np.all(cb_min_bis == cb_min)
+    assert np.all(cb_max_bis == cb_max)
     assert_almost_equal(beta_hat, beta, decimal=1)
     assert_equal(cb_min < beta, True)
     assert_equal(cb_max > beta, True)
+    assert_almost_equal(pval_corr, expected_pval_corr, decimal=1)
 
 
 def test_desparsified_group_lasso():
@@ -72,8 +99,9 @@
         rho_noise=rho,
     )
 
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
-        desparsified_group_lasso(X, Y, cov=cov)
+    beta_hat, theta_hat, omega_diag = desparsified_lasso(X, Y, group=True, cov=cov)
+    pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
+        desparsified_group_lasso_pvalue(beta_hat, theta_hat, omega_diag)
     )
 
     expected_pval_corr = np.concatenate(
@@ -83,8 +111,9 @@
     assert_almost_equal(beta_hat, beta, decimal=1)
     assert_almost_equal(pval_corr, expected_pval_corr, decimal=1)
 
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
-        desparsified_group_lasso(X, Y, test="F")
+    beta_hat, theta_hat, omega_diag = desparsified_lasso(X, Y, group=True)
+    pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
+        desparsified_group_lasso_pvalue(beta_hat, theta_hat, omega_diag, test="F")
     )
 
     assert_almost_equal(beta_hat, beta, decimal=1)
@@ -93,5 +122,8 @@
     # Testing error is raised when the covariance matrix has wrong shape
     bad_cov = np.delete(cov, 0, axis=1)
     np.testing.assert_raises(
-        ValueError, desparsified_group_lasso, X=X, Y=Y, cov=bad_cov
+        ValueError, desparsified_lasso, X=X, y=Y, group=True, cov=bad_cov
     )
+
+    with pytest.raises(ValueError, match=f"Unknown test 'r2'"):
+        desparsified_group_lasso_pvalue(beta_hat, theta_hat, omega_diag, test="r2")
diff --git a/test/test_noise_std.py b/test/test_noise_std.py
index cb890529..2e90e069 100644
--- a/test/test_noise_std.py
+++ b/test/test_noise_std.py
@@ -2,11 +2,12 @@
 Test the noise_std module
 """
 
+import pytest
 import numpy as np
 from numpy.testing import assert_almost_equal
 from scipy.linalg import toeplitz
 
-from hidimstat.noise_std import empirical_snr, group_reid, reid
+from hidimstat.noise_std import empirical_snr, reid
 from hidimstat.scenario import (
     multivariate_1D_simulation,
     multivariate_temporal_simulation,
@@ -84,13 +85,13 @@
     )
 
     # max_iter=1 to get a better coverage
-    cov_hat, _ = group_reid(X, Y, tol=1e-3, max_iter=1)
+    cov_hat, _ = reid(X, Y, group=True, tol=1e-3, max_iter=1)
 
     error_ratio = cov_hat / cov
     assert_almost_equal(np.max(error_ratio), 1.0, decimal=0)
     assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1)
 
-    cov_hat, _ = group_reid(X, Y, method="AR")
+    cov_hat, _ = reid(X, Y, group=True, method="AR")
 
     error_ratio = cov_hat / cov
     assert_almost_equal(np.max(error_ratio), 1.0, decimal=0)
@@ -110,25 +111,57 @@
         seed=1,
     )
 
-    cov_hat, _ = group_reid(X, Y)
+    cov_hat, _ = reid(X, Y, group=True)
 
     error_ratio = cov_hat / cov
     assert_almost_equal(np.max(error_ratio), 1.0, decimal=0)
     assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1)
 
-    cov_hat, _ = group_reid(X, Y, fit_Y=False, stationary=False)
+    cov_hat, _ = reid(X, Y, group=True, fit_Y=False, stationary=False)
 
     error_ratio = cov_hat / cov
     assert_almost_equal(np.max(error_ratio), 1.0, decimal=0)
     assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=0)
 
-    cov_hat, _ = group_reid(X, Y, method="AR")
+    cov_hat, _ = reid(X, Y, group=True, method="AR")
 
     error_ratio = cov_hat / cov
     assert_almost_equal(np.max(error_ratio), 1.0, decimal=0)
     assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1)
 
 
+def test_reid_exception():
+    "Test for testing the exceptions on the arguments of reid function"
+    n_samples, n_features = 50, 30
+    n_times = 10
+    sigma = 1.0
+    rho = 0.9
+
+    # First expe
+    # ##########
+    support_size = 2
+
+    X, y, beta, noise = multivariate_temporal_simulation(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_times=n_times,
+        support_size=support_size,
+        sigma=sigma,
+        rho_noise=rho,
+    )
+
+    with pytest.raises(
+        ValueError, match="Unknown method for estimating the covariance matrix"
+    ):
+        _, _ = reid(X, y, method="test", group=True)
+    with pytest.raises(
+        ValueError, match="The AR method is not compatible with the non-stationary"
+    ):
+        _, _ = reid(X, y, method="AR", stationary=False, group=True)
+    with pytest.raises(ValueError, match="The requested AR order is to high with"):
+        _, _ = reid(X, y, method="AR", order=1e4, group=True)
+
+
 def test_empirical_snr():
     """Computing empirical signal to noise ratio from the target `y`,
     the data `X` and the true parameter vector `beta` in a simple