-
Notifications
You must be signed in to change notification settings - Fork 159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ENH] New experimental module: imbalance in collection transformers #2498
base: main
Are you sure you want to change the base?
Changes from all commits
aff5ed5
4bec820
4ef2fa3
5db24f3
d9b35b7
97c7466
a440a90
6e24ef0
c731117
770ea75
460a378
2366305
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
"""Supervised transformers to rebalance collections of time series.""" | ||
|
||
__all__ = ["ADASYN", "SMOTE"] | ||
|
||
from aeon.transformations.collection.imbalance._adasyn import ADASYN | ||
from aeon.transformations.collection.imbalance._smote import SMOTE |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
"""ADASYN over sampling algorithm.""" | ||
|
||
import numpy as np | ||
from sklearn.utils import check_random_state | ||
|
||
from aeon.transformations.collection.imbalance._smote import SMOTE | ||
|
||
__maintainer__ = ["TonyBagnall"] | ||
__all__ = ["ADASYN"] | ||
|
||
|
||
class ADASYN(SMOTE):
    """
    Over-sampling using Adaptive Synthetic Sampling (ADASYN).

    Adaptation of imblearn.over_sampling.ADASYN
    original authors:
    #         Guillaume Lemaitre <[email protected]>
    #         Christos Aridas
    # License: MIT

    This transformer extends SMOTE, but it generates different number of
    samples depending on an estimate of the local distribution of the class
    to be oversampled.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Seed or generator handed to ``sklearn.utils.check_random_state`` at the
        start of ``_transform``; the resulting generator draws both the neighbour
        picked for each synthetic series and the interpolation step.
    k_neighbors : int, default=5
        Number of nearest neighbours, forwarded unchanged to ``SMOTE``. It is used
        both to estimate how many neighbours of a case belong to another class
        and to pick the same-class neighbour a synthetic series is interpolated
        towards.
    """

    def __init__(self, random_state=None, k_neighbors=5):
        super().__init__(random_state=random_state, k_neighbors=k_neighbors)

    def _transform(self, X, y=None):
        """Over-sample every class listed in ``sampling_strategy_``.

        Parameters
        ----------
        X : np.ndarray
            Collection of shape (n_cases, 1, n_timepoints); the channel axis is
            squeezed out before the neighbour searches.
        y : np.ndarray
            Class labels of the cases in ``X``.

        Returns
        -------
        X_resampled : np.ndarray
            The original series followed by the synthetic ones, with the channel
            axis restored.
        y_resampled : np.ndarray
            Labels matching ``X_resampled``.

        Raises
        ------
        RuntimeError
            If no neighbour of any case of a class belongs to another class.
        ValueError
            If rounding leaves zero series to generate for a class.
        """
        X = np.squeeze(X, axis=1)
        rng = check_random_state(self.random_state)
        X_parts = [X.copy()]
        y_parts = [y.copy()]

        # one pass per class that has to be topped up
        for label, n_requested in self.sampling_strategy_.items():
            if n_requested == 0:
                continue
            class_idx = np.flatnonzero(y == label)
            X_class = X[class_idx]

            # Neighbourhoods over the whole collection give, for each case of the
            # class, the share of its neighbours that carry a different label
            # (one-vs-rest, so no extra parameter is needed for multi-class).
            self.nn_.fit(X)
            global_nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
            n_neighbors = self.nn_.n_neighbors - 1
            ratio_nn = (y[global_nns] != label).sum(axis=1) / n_neighbors
            ratio_total = np.sum(ratio_nn)
            if not ratio_total:
                raise RuntimeError(
                    "Not any neigbours belong to the majority"
                    " class. This case will induce a NaN case"
                    " with a division by zero. ADASYN is not"
                    " suited for this specific dataset."
                    " Use SMOTE instead."
                )
            weights = ratio_nn / ratio_total
            per_case_counts = np.rint(weights * n_requested).astype(int)
            # rounding can move the total away from the requested amount
            n_generated = np.sum(per_case_counts)
            if not n_generated:
                raise ValueError(
                    "No samples will be generated with the provided ratio settings."
                )

            # refit on the class alone so that synthetic series are interpolated
            # towards neighbours of the same class
            self.nn_.fit(X_class)
            class_nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]

            rows = np.repeat(np.arange(len(class_idx)), per_case_counts)
            cols = rng.choice(n_neighbors, size=n_generated)
            base = X_class[rows]
            toward_neighbour = X_class[class_nns[rows, cols]] - base
            steps = rng.uniform(size=(n_generated, 1))
            X_new = (base + steps * toward_neighbour).astype(X.dtype)

            X_parts.append(X_new)
            y_parts.append(np.full(n_generated, fill_value=label, dtype=y.dtype))

        X_resampled = np.vstack(X_parts)
        y_resampled = np.hstack(y_parts)

        # put the channel axis back
        return X_resampled[:, np.newaxis, :], y_resampled
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,244 @@ | ||
"""SMOTE over sampling algorithm. | ||
|
||
See more in imblearn.over_sampling.SMOTE | ||
original authors: | ||
# Guillaume Lemaitre <[email protected]> | ||
# Fernando Nogueira | ||
# Christos Aridas | ||
# Dzianis Dudnik | ||
# License: MIT | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In adasyn the attribution is in the class comment. I dont mind which having it at the top or in the class but it should be consistent. |
||
|
||
from collections import OrderedDict | ||
|
||
import numpy as np | ||
from sklearn.neighbors import NearestNeighbors | ||
from sklearn.utils import check_random_state | ||
|
||
from aeon.transformations.collection import BaseCollectionTransformer | ||
|
||
__maintainer__ = ["TonyBagnall"] | ||
__all__ = ["SMOTE"] | ||
|
||
|
||
class SMOTE(BaseCollectionTransformer):
    """
    Over-sampling using the Synthetic Minority Over-sampling TEchnique (SMOTE)[1]_.

    An adaptation of the imbalanced-learn implementation of SMOTE in
    imblearn.over_sampling.SMOTE. Every class except the majority class is
    over-sampled until it has as many cases as the majority class; the number of
    series to generate per class is computed in ``_fit`` and stored in
    ``sampling_strategy_``.

    Parameters
    ----------
    k_neighbors : int, default=5
        The number of nearest neighbors used to define the neighborhood of samples
        to use to generate the synthetic time series. A
        `~sklearn.neighbors.NearestNeighbors` instance with ``k_neighbors + 1``
        neighbours is created in ``_fit``; the first neighbour returned for each
        series is discarded in ``_transform``.
    random_state : int, RandomState instance or None, default=None
        If `int`, random_state is the seed used by the random number generator;
        If `RandomState` instance, random_state is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.

    Attributes
    ----------
    nn_ : sklearn.neighbors.NearestNeighbors
        Neighbour search object created in ``_fit``.
    sampling_strategy_ : OrderedDict
        Maps each non-majority class label to the number of synthetic series
        needed to match the majority class, sorted by label.

    See Also
    --------
    ADASYN

    References
    ----------
    .. [1] Chawla et al. SMOTE: synthetic minority over-sampling technique, Journal
    of Artificial Intelligence Research 16(1): 321–357, 2002.
        https://dl.acm.org/doi/10.5555/1622407.1622416
    """

    _tags = {
        "requires_y": True,
    }

    def __init__(self, k_neighbors: int = 5, random_state=None):
        self.random_state = random_state
        self.k_neighbors = k_neighbors
        super().__init__()

    def _fit(self, X, y=None):
        """Create the neighbour search object and the per-class sampling targets.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series; not inspected here.
        y : np.ndarray
            Class labels, used to count the cases of each class.

        Returns
        -------
        self
        """
        # set the additional_neighbor required by SMOTE
        self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1)

        # generate sampling target by targeting all classes except the majority
        unique, counts = np.unique(y, return_counts=True)
        target_stats = dict(zip(unique, counts))
        n_sample_majority = max(target_stats.values())
        class_majority = max(target_stats, key=target_stats.get)
        sampling_strategy = {
            key: n_sample_majority - value
            for (key, value) in target_stats.items()
            if key != class_majority
        }
        self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items()))
        return self

    def _transform(self, X, y=None):
        """Append synthetic series for every class in ``sampling_strategy_``.

        Parameters
        ----------
        X : np.ndarray
            Collection of shape (n_cases, 1, n_timepoints).
        y : np.ndarray
            Class labels of the cases in ``X``.

        Returns
        -------
        X_resampled : np.ndarray
            The original series followed by the synthetic ones, of shape
            (n_cases + n_new, 1, n_timepoints).
        y_resampled : np.ndarray
            Labels matching ``X_resampled``.

        Raises
        ------
        ValueError
            If a class to over-sample has fewer than ``k_neighbors + 1`` cases.
        """
        # remove the channel dimension to be compatible with sklearn
        X = np.squeeze(X, axis=1)
        X_resampled = [X.copy()]
        y_resampled = [y.copy()]

        # got the minority class label and the number needs to be generated
        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = X[target_class_indices]

            # The neighbour query below needs at least n_neighbors fitted cases.
            # Fail here with an actionable message instead of the generic one
            # raised from inside sklearn (same exception type).
            n_class_cases = X_class.shape[0]
            if n_class_cases < self.nn_.n_neighbors:
                raise ValueError(
                    f"Class {class_sample!r} has {n_class_cases} case(s), but "
                    f"k_neighbors={self.nn_.n_neighbors - 1} requires at least "
                    f"{self.nn_.n_neighbors} cases of a class to generate "
                    "synthetic series. Reduce k_neighbors."
                )

            self.nn_.fit(X_class)
            # drop the first returned neighbour of every case
            nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(
                X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
            )
            X_resampled.append(X_new)
            y_resampled.append(y_new)
        X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)
        # restore the channel dimension
        X_resampled = X_resampled[:, np.newaxis, :]
        return X_resampled, y_resampled

    def _make_samples(
        self,
        X: np.ndarray,
        y_dtype,
        y_type,
        nn_data: np.ndarray,
        nn_num: np.ndarray,
        n_samples: int,
        step_size: float = 1.0,
        y=None,
    ) -> tuple:
        """Make artificial samples constructed based on nearest neighbours.

        Parameters
        ----------
        X : np.ndarray
            Shape (n_cases, n_timepoints), time series from which the new series will
            be created.
        y_dtype : dtype
            The data type of the targets.
        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.
        nn_data : np.ndarray of shape (n_samples_all, n_timepoints)
            Data set carrying all the neighbours to be used.
        nn_num : np.ndarray of shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in `nn_data`.
        n_samples : int
            The number of samples to generate.
        step_size : float, default=1.0
            The step size to create samples.
        y : np.ndarray of shape (n_samples_all,), default=None
            The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
            weight the distances in the sample generation process. Not passed by
            ``_transform``.

        Returns
        -------
        X_new : np.ndarray
            Synthetically generated samples of shape (n_samples, n_timepoints).
        y_new : np.ndarray
            Target values for synthetic samples of shape (n_samples,).
        """
        random_state = check_random_state(self.random_state)
        # each draw is a flat index into nn_num, i.e. one (case, neighbour) pair
        samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples)

        # np.newaxis for backwards compatability with random_state
        steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis]
        rows = np.floor_divide(samples_indices, nn_num.shape[1])
        cols = np.mod(samples_indices, nn_num.shape[1])

        X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
        y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
        return X_new, y_new

    def _generate_samples(
        self,
        X: np.ndarray,
        nn_data: np.ndarray,
        nn_num: np.ndarray,
        rows: np.ndarray,
        cols: np.ndarray,
        steps: np.ndarray,
        y_type=None,
        y=None,
    ) -> np.ndarray:
        r"""Generate synthetic samples.

        The rule for the generation is:

        .. math::
           \mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{U}(0, 1) \times
           (\mathbf{s_{nn}} - \mathbf{s_{i}}) \,

        where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is
        the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of
        \mathbf{s_{i}} and \mathcal{U}(0, 1) is a random number between [0, 1).

        Parameters
        ----------
        X : np.ndarray
            Series from which the points will be created of shape (n_cases,
            n_timepoints).
        nn_data : np.ndarray of shape (n_samples_all, n_timepoints)
            Data set carrying all the neighbours to be used.
        nn_num : np.ndarray of shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in `nn_data`.
        rows : np.ndarray of shape (n_samples,), dtype=int
            Indices pointing at feature vector in X which will be used
            as a base for creating new samples.
        cols : np.ndarray of shape (n_samples,), dtype=int
            Indices pointing at which nearest neighbor of base feature vector
            will be used when creating new samples.
        steps : np.ndarray of shape (n_samples, 1), dtype=float
            Step sizes for new samples, one per row so that they broadcast over
            the time points.
        y_type : str, int or None, default=None
            Class label of the current target classes for which we want to generate
            samples.
        y : np.ndarray of shape (n_samples_all,), default=None
            The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
            weight the distances in the sample generation process.

        Returns
        -------
        X_new : np.ndarray of shape (n_samples, n_timepoints)
            Synthetically generated samples, cast to the dtype of ``X``.
        """
        # vector from each base series towards its selected neighbour
        diffs = nn_data[nn_num[rows, cols]] - X[rows]
        if y is not None:  # only entering for BorderlineSMOTE-2
            random_state = check_random_state(self.random_state)
            # shrink the move when the selected neighbour is of another class
            mask_pair_samples = y[nn_num[rows, cols]] != y_type
            diffs[mask_pair_samples] *= random_state.uniform(
                low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1)
            )
        X_new = X[rows] + steps * diffs
        return X_new.astype(X.dtype)

    @classmethod
    def _get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. This
            estimator defines no special sets, so the same parameters are returned
            for every value.

        Returns
        -------
        params : dict or list of dict, default={}
            Parameters to create testing instances of the class.
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
        """
        return {"k_neighbors": 1}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Test resampling transformers.""" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was this here before or what is this?