Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] New experimental module: imbalance in collection transformers #2498

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ does not apply:
- `segmentation`
- `similarity_search`
- `visualisation`
- `transformations.collection.imbalance`

| Overview | |
|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this here before or what is this?

Expand Down
6 changes: 6 additions & 0 deletions aeon/transformations/collection/imbalance/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Supervised transformers to rebalance colelctions of time series."""

__all__ = ["ADASYN", "SMOTE"]

from aeon.transformations.collection.imbalance._adasyn import ADASYN
from aeon.transformations.collection.imbalance._smote import SMOTE
87 changes: 87 additions & 0 deletions aeon/transformations/collection/imbalance/_adasyn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""ADASYN over sampling algorithm."""

import numpy as np
from sklearn.utils import check_random_state

from aeon.transformations.collection.imbalance._smote import SMOTE

__maintainer__ = ["TonyBagnall"]
__all__ = ["ADASYN"]


class ADASYN(SMOTE):
"""
Over-sampling using Adaptive Synthetic Sampling (ADASYN).

Adaptation of imblearn.over_sampling.ADASYN
original authors:
# Guillaume Lemaitre <[email protected]>
# Christos Aridas
# License: MIT

This transformer extends SMOTE, but it generates different number of
samples depending on an estimate of the local distribution of the class
to be oversampled.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Params need documenting

"""

def __init__(self, random_state=None, k_neighbors=5):
super().__init__(random_state=random_state, k_neighbors=k_neighbors)

def _transform(self, X, y=None):
X = np.squeeze(X, axis=1)
random_state = check_random_state(self.random_state)
X_resampled = [X.copy()]
y_resampled = [y.copy()]

# got the minority class label and the number needs to be generated
for class_sample, n_samples in self.sampling_strategy_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
X_class = X[target_class_indices]

self.nn_.fit(X)
nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
# The ratio is computed using a one-vs-rest manner. Using majority
# in multi-class would lead to slightly different results at the
# cost of introducing a new parameter.
n_neighbors = self.nn_.n_neighbors - 1
ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors
if not np.sum(ratio_nn):
raise RuntimeError(
"Not any neigbours belong to the majority"
" class. This case will induce a NaN case"
" with a division by zero. ADASYN is not"
" suited for this specific dataset."
" Use SMOTE instead."
)
ratio_nn /= np.sum(ratio_nn)
n_samples_generate = np.rint(ratio_nn * n_samples).astype(int)
# rounding may cause new amount for n_samples
n_samples = np.sum(n_samples_generate)
if not n_samples:
raise ValueError(
"No samples will be generated with the provided ratio settings."
)

# the nearest neighbors need to be fitted only on the current class
# to find the class NN to generate new samples
self.nn_.fit(X_class)
nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]

enumerated_class_indices = np.arange(len(target_class_indices))
rows = np.repeat(enumerated_class_indices, n_samples_generate)
cols = random_state.choice(n_neighbors, size=n_samples)
diffs = X_class[nns[rows, cols]] - X_class[rows]
steps = random_state.uniform(size=(n_samples, 1))
X_new = X_class[rows] + steps * diffs

X_new = X_new.astype(X.dtype)
y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype)
X_resampled.append(X_new)
y_resampled.append(y_new)
X_resampled = np.vstack(X_resampled)
y_resampled = np.hstack(y_resampled)

X_resampled = X_resampled[:, np.newaxis, :]
return X_resampled, y_resampled
244 changes: 244 additions & 0 deletions aeon/transformations/collection/imbalance/_smote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
"""SMOTE over sampling algorithm.

See more in imblearn.over_sampling.SMOTE
original authors:
# Guillaume Lemaitre <[email protected]>
# Fernando Nogueira
# Christos Aridas
# Dzianis Dudnik
# License: MIT
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In adasyn the attribution is in the class comment. I dont mind which having it at the top or in the class but it should be consistent.


from collections import OrderedDict

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_random_state

from aeon.transformations.collection import BaseCollectionTransformer

__maintainer__ = ["TonyBagnall"]
__all__ = ["SMOTE"]


class SMOTE(BaseCollectionTransformer):
"""
Over-sampling using the Synthetic Minority Over-sampling TEchnique (SMOTE)[1]_.

An adaptation of the imbalance-learn implementation of SMOTE in
imblearn.over_sampling.SMOTE. sampling_strategy is sampling target by
targeting all classes but not the majority, which is directly expressed in
_fit.sampling_strategy.

Parameters
----------
k_neighbors : int, default=5
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This typing is wrong? The comment suggests it can be a int or an instance of a KNN classifier?

The number of nearest neighbors used to define the neighborhood of samples
to use to generate the synthetic time series.
`~sklearn.neighbors.NearestNeighbors` instance will be fitted in this case.
random_state : int, RandomState instance or None, default=None
If `int`, random_state is the seed used by the random number generator;
If `RandomState` instance, random_state is the random number generator;
If `None`, the random number generator is the `RandomState` instance used
by `np.random`.

See Also
--------
ADASYN

References
----------
.. [1] Chawla et al. SMOTE: synthetic minority over-sampling technique, Journal
of Artificial Intelligence Research 16(1): 321–357, 2002.
https://dl.acm.org/doi/10.5555/1622407.1622416
"""

_tags = {
"requires_y": True,
}

def __init__(self, k_neighbors=5, random_state=None):
self.random_state = random_state
self.k_neighbors = k_neighbors
super().__init__()

def _fit(self, X, y=None):
# set the additional_neighbor required by SMOTE
self.nn_ = NearestNeighbors(n_neighbors=self.k_neighbors + 1)

# generate sampling target by targeting all classes except the majority
unique, counts = np.unique(y, return_counts=True)
target_stats = dict(zip(unique, counts))
n_sample_majority = max(target_stats.values())
class_majority = max(target_stats, key=target_stats.get)
sampling_strategy = {
key: n_sample_majority - value
for (key, value) in target_stats.items()
if key != class_majority
}
self.sampling_strategy_ = OrderedDict(sorted(sampling_strategy.items()))
return self

def _transform(self, X, y=None):
# remove the channel dimension to be compatible with sklearn
X = np.squeeze(X, axis=1)
X_resampled = [X.copy()]
y_resampled = [y.copy()]

# got the minority class label and the number needs to be generated
for class_sample, n_samples in self.sampling_strategy_.items():
if n_samples == 0:
continue
target_class_indices = np.flatnonzero(y == class_sample)
X_class = X[target_class_indices]

self.nn_.fit(X_class)
nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
X_new, y_new = self._make_samples(
X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
)
X_resampled.append(X_new)
y_resampled.append(y_new)
X_resampled = np.vstack(X_resampled)
y_resampled = np.hstack(y_resampled)
X_resampled = X_resampled[:, np.newaxis, :]
return X_resampled, y_resampled

def _make_samples(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to have actual types on the parameters e.g. X: np.ndarray (would be nice on all the params in this PR)

self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None
):
"""Make artificial samples constructed based on nearest neighbours.

Parameters
----------
X : np.ndarray
Shape (n_cases, n_timepoints), time series from which the new series will
be created.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove new line between params comments

y_dtype : dtype
The data type of the targets.

y_type : str or int
The minority target value, just so the function can return the
target values for the synthetic variables with correct length in
a clear format.

nn_data : ndarray of shape (n_samples_all, n_features)
Data set carrying all the neighbours to be used

nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours)
The nearest neighbours of each sample in `nn_data`.

n_samples : int
The number of samples to generate.

step_size : float, default=1.0
The step size to create samples.

y : ndarray of shape (n_samples_all,), default=None
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
weight the distances in the sample generation process.

Returns
-------
X_new : ndarray
Synthetically generated samples of shape (n_samples_new, n_timepoints).

y_new : ndarray
Target values for synthetic samples of shape (n_samples_new,).
"""
random_state = check_random_state(self.random_state)
samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples)

# np.newaxis for backwards compatability with random_state
steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis]
rows = np.floor_divide(samples_indices, nn_num.shape[1])
cols = np.mod(samples_indices, nn_num.shape[1])

X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
return X_new, y_new

def _generate_samples(
self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Param types again would be nice

):
r"""Generate a synthetic sample.

The rule for the generation is:

.. math::
\mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times
(\mathbf{s_{i}} - \mathbf{s_{nn}}) \,

where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is
the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of
\mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1).

Parameters
----------
X : np.ndarray
Series from which the points will be created of shape (n_cases,
n_timepoints).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove lines between

nn_data : ndarray of shape (n_samples_all, n_features)
Data set carrying all the neighbours to be used.

nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours)
The nearest neighbours of each sample in `nn_data`.

rows : ndarray of shape (n_samples,), dtype=int
Indices pointing at feature vector in X which will be used
as a base for creating new samples.

cols : ndarray of shape (n_samples,), dtype=int
Indices pointing at which nearest neighbor of base feature vector
will be used when creating new samples.

steps : ndarray of shape (n_samples,), dtype=float
Step sizes for new samples.

y_type : str, int or None, default=None
Class label of the current target classes for which we want to generate
samples.

y : ndarray of shape (n_samples_all,), default=None
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
weight the distances in the sample generation process.

Returns
-------
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Synthetically generated samples.
"""
diffs = nn_data[nn_num[rows, cols]] - X[rows]
if y is not None: # only entering for BorderlineSMOTE-2
random_state = check_random_state(self.random_state)
mask_pair_samples = y[nn_num[rows, cols]] != y_type
diffs[mask_pair_samples] *= random_state.uniform(
low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1)
)
X_new = X[rows] + steps * diffs
return X_new.astype(X.dtype)

@classmethod
def _get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.
ClassifierChannelEnsemble provides the following special sets:
- "results_comparison" - used in some classifiers to compare against
previously generated results where the default set of parameters
cannot produce suitable probability estimates

Returns
-------
params : dict or list of dict, default={}
Parameters to create testing instances of the class.
Each dict are parameters to construct an "interesting" test instance, i.e.,
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
"""
return {"k_neighbors": 1}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Test resampling transformers."""
Loading