
Commit

Improve visualization for clustering-based threshold autogeneration (#2)
* use rejects and thresholds in autogen cluster plots
* add custom projection for cluster plots
* add option for saving clustering plots to files
svirpioj authored Nov 22, 2023
1 parent b8045a1 commit f35ce1a
Showing 2 changed files with 209 additions and 17 deletions.
15 changes: 9 additions & 6 deletions bin/opusfilter-autogen
@@ -43,9 +43,9 @@ parser.add_argument('--clusters', '-k', default=2, type=int, metavar='INT',
parser.add_argument('--work-dir', default='work',
help='Location of the source and target files for the generated configuration (default %(default)s)')
parser.add_argument('--inter-dir', help='Save intermediate files in this directory (use a temporary directory if not given)')
-parser.add_argument('--plot', action='store_true',
-help=('Show a scatter plot of the clustering and histograms of feature data distributions; '
-'only for the clustering method'))
parser.add_argument('--plot', metavar='PATH', default=None, type=str,
help=('Create histograms of feature data distributions and a scatter plot of the clustering; '
'give path to plot the PDF files to, or "-" for interactive plots; only for the clustering method'))
parser.add_argument('--list-defaults', action='store_true', help='List default filters of the method to the output and quit')
parser.add_argument('--add-filter', nargs=2, action='append', default=[], metavar=('CLASS', 'JSON'),
help=('Instead of using default filters, add a filter of CLASS with JSON parameters object '
@@ -78,9 +78,12 @@ if args.list_defaults:

filters = filtergen.set_filter_thresholds()

-if args.method == 'clustering' and args.plot:
-    filtergen.scoredata.plot(plt)
-    plt.show()
if args.method == 'clustering' and args.plot is not None:
    if args.plot == '-':
        filtergen.scoredata.plot(plt)
        plt.show()
    else:
        filtergen.scoredata.plot(plt, path=args.plot)

generator = ConfigurationGenerator(
files=[os.path.abspath(f) for f in args.files], langs=args.langs, workdir=args.work_dir)
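For reference, the two modes of the reworked --plot option correspond roughly to the following use of the clustering API (a minimal sketch, not part of the commit; the score file name and output directory are hypothetical, while ScoreClusters(score_file, k=2), get_result_df() and plot(plt, path=...) are the interfaces shown in the diff below):

    import matplotlib.pyplot as plt
    from opusfilter.autogen_cluster import ScoreClusters

    scoredata = ScoreClusters('scores.txt.gz', k=2)  # hypothetical score file path
    scoredata.get_result_df()   # computes the thresholds and rejects that plot() annotates
    scoredata.plot(plt)         # equivalent of --plot - : interactive figures
    plt.show()
    # scoredata.plot(plt, path='plots')  # equivalent of --plot plots: writes PDFs into an existing directory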
211 changes: 200 additions & 11 deletions opusfilter/autogen_cluster.py
@@ -1,13 +1,17 @@
"""Unsupervised threshold selection for filters"""

from collections import Counter
import itertools
import logging
import os

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
-from sklearn import preprocessing, random_projection
from sklearn import decomposition, preprocessing, random_projection
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.utils.validation import check_is_fitted
import numpy as np

from . import CLEAN_LOW
@@ -18,6 +22,147 @@
logger = logging.getLogger(__name__)


class ArcProjection(BaseEstimator, TransformerMixin):
"""Project data to two dimensions using evenly distributed unit vectors
The assignment of unit vectors to the original features is optimized
so that the L2 norm of the difference between the correlation coefficients
of the original features and the dot products of the unit vectors is minimized.
Parameters
----------
arc : float, 'auto' or 'full', default='auto'
Length of the arc used for projection vectors. For 'auto', the
length is set as the arccosine of the minimum correlation of
the input features. For 'full', the full circle (2 * pi) is used.
Attributes
----------
n_components_ : int
Number of components, always 2.
components_ : ndarray of shape (n_components, n_features)
Matrix used for the projection.
"""

def __init__(self, arc='auto'):
self.n_components_ = 2
self.arc = arc

def _best_permutation(self, current, n_features, eval_func):
"""Greedy search for best permutation of projection matrix"""
permutation = np.arange(n_features)
while True:
logger.debug("current permutation: %s", permutation)
minpair, mincost = self._best_single_swap(current, eval_func)
logger.debug("best swap: %s (%s)", minpair, mincost)
if minpair is None:
return permutation, current, mincost
fwd, rev = list(minpair), list(reversed(minpair))
current[fwd] = current[rev]
permutation[fwd] = permutation[rev]

@staticmethod
def _best_single_swap(matrix, eval_func):
"""Find best swap of matrix rows based in eval_func"""
mincost = eval_func(matrix)
minpair = None
for pair in itertools.combinations(range(matrix.shape[0]), 2):
fwd, rev = list(pair), list(reversed(pair))
matrix[fwd] = matrix[rev]
cost = eval_func(matrix)
if cost < mincost:
mincost = cost
minpair = pair
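# Clarifying comment (not in the original code): the next assignment swaps the rows
# back; fancy-index assignment copies the right-hand side first, so repeating it
# restores the original row order before trying the next pair.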
matrix[fwd] = matrix[rev]
return minpair, mincost

def _make_matrix(self, corrmat, n_features):
"""Generate the random projection matrix.
Parameters
----------
corrmat : ndarray of shape (n_features, n_features),
Correlation matrix for the original features
n_features : int,
Dimensionality of the original source space.
Returns
-------
components : ndarray of shape (n_components, n_features)
The generated projection matrix.
"""
def costf(matrix):
return ((corrmat - matrix @ matrix.T)**2).sum()

if self.arc == 'full':
dist = 2 * np.pi / n_features
elif self.arc == 'auto':
dist = np.arccos(corrmat.min()) / (n_features - 1)
else:
dist = self.arc / (n_features - 1)
matrix = np.array([
[np.cos(idx * dist), np.sin(idx * dist)] for idx in range(n_features)
])
_, best_m, best_cost = self._best_permutation(matrix.copy(), n_features, costf)
logger.debug("fcorr:\n%s", corrmat.round(3))
logger.debug("vdist:\n%s", (best_m @ best_m.T).round(3))
logger.debug("diff:\n%s", (corrmat - best_m @ best_m.T).round(3))
logger.debug("cost: %s", best_cost)
return best_m.T
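# Illustration (not part of the commit): with two perfectly correlated features,
# corrmat = [[1, 1], [1, 1]] and arc='auto' give dist = arccos(1) / 1 = 0, so both
# unit vectors become [1, 0]; then matrix @ matrix.T equals corrmat and the cost
# ((corrmat - matrix @ matrix.T) ** 2).sum() is exactly 0.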

def fit(self, X: np.array, y=None):
"""Generate a projection matrix.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
Training set; used to compute the correlation matrix of the input features.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
ArcProjection class instance.
"""
X = self._validate_data(
X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32]
)

n_features = X.shape[1]
corrmat = np.corrcoef(X.T)
self.components_ = self._make_matrix(
corrmat, n_features
).astype(X.dtype, copy=False)

return self

def transform(self, X):
"""Project the data by using matrix product with the random matrix.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
The input data to project into a smaller dimensional space.
Returns
-------
X_new : ndarray of shape (n_samples, n_components)
Projected array.
"""
check_is_fitted(self)
X = self._validate_data(
X, accept_sparse=["csr", "csc"], reset=False, dtype=[np.float64, np.float32]
)

return X @ self.components_.T
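# Illustration (not part of the commit): ArcProjection works like other sklearn
# transformers. With three synthetic, positively correlated features (the data
# below is made up), fit_transform returns a two-dimensional embedding and
# components_ holds one unit vector per original feature:
#
#     rng = np.random.default_rng(0)
#     base = rng.normal(size=(1000, 1))
#     X = np.hstack([base + 0.5 * rng.normal(size=(1000, 1)) for _ in range(3)])
#     proj = ArcProjection(arc='auto')
#     X_2d = proj.fit_transform(X)     # shape (1000, 2)
#     print(proj.components_.shape)    # (2, 3)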


class ScoreClusters:
"""Cluster segments by filter scores
@@ -42,6 +187,8 @@ def __init__(self, score_file, k=2):
self.labels = self.kmeans.labels_
self.cluster_centers = self.scaler.inverse_transform(self.kmeans.cluster_centers_) * self.direction_vector
self._noisy_label = self._get_noisy_label()
self.rejects = None
self.thresholds = None

@property
def noisy_label(self):
@@ -119,17 +266,41 @@ def get_rejects(self):

def get_result_df(self):
"""Return dataframe containing the thresholds and reject booleans"""
self.rejects = self.get_rejects()
self.thresholds = self.get_thresholds()
return pd.DataFrame.from_dict(
-{'name': self.get_columns(),
-'threshold': self.get_thresholds(),
-'reject': self.get_rejects()})
{'name': self.get_columns(), 'threshold': self.thresholds, 'reject': self.rejects})

-def plot(self, plt):
def plot(self, plt, path=None, apply_rejects=True, projection='arc'):
"""Plot clustering and histograms"""
if projection == 'pca':
proj = decomposition.PCA(n_components=2)
elif projection == 'random':
proj = random_projection.GaussianRandomProjection(n_components=2)
elif projection == 'arc':
proj = ArcProjection()
else:
raise ValueError(f"Unknown projection: {projection}")
col_index = {col: idx for idx, col in enumerate(self.get_columns())}
index_col = dict(enumerate(self.get_columns()))
if apply_rejects and self.rejects:
indices = [idx for idx, reject in enumerate(self.rejects) if not reject]
cols = [index_col[idx] for idx in indices]
index_col = dict(enumerate(cols))
data_t = proj.fit_transform(self.standard_data[:, indices])
centroids = proj.transform(self.kmeans.cluster_centers_[:, indices])
else:
data_t = proj.fit_transform(self.standard_data)
centroids = proj.transform(self.kmeans.cluster_centers_)
plt.figure()
for idx in range(proj.components_.shape[1]):
plt.arrow(0, 0, proj.components_[0, idx], proj.components_[1, idx], head_width=0.01, fc='k')
plt.text(proj.components_[0, idx], proj.components_[1, idx], index_col[idx])
plt.gca().set_aspect('equal', adjustable='box')
plt.title('Projection vectors for clustering')
if path is not None:
plt.savefig(os.path.join(path, 'projection.pdf'))
plt.figure(figsize=(10, 10))
-projection = random_projection.GaussianRandomProjection(n_components=2)
-data_t = projection.fit_transform(self.standard_data)
-centroids = projection.transform(self.kmeans.cluster_centers_)
for label_id in range(self.k):
points = np.where(self.labels == label_id)
plt.scatter(data_t[points, 0], data_t[points, 1],
Expand All @@ -141,10 +312,28 @@ def plot(self, plt):
marker='+', c='brown' if label_id == self.noisy_label else 'darkblue',
label='noisy centroid' if label_id == self.noisy_label else 'clean centroid')
plt.legend()
-plt.title('Clusters')
plt.title('Clustering')
if path is not None:
plt.savefig(os.path.join(path, 'clustering.pdf'))
noisy_samples = self.df.iloc[np.where(self.labels == self.noisy_label)]
clean_samples = self.df.iloc[np.where(self.labels != self.noisy_label)]
-noisy_samples.hist(bins=100, figsize=(10, 10))
subplots_n = noisy_samples.hist(bins=100, figsize=(10, 10))
fig_noisy = plt.gcf()
plt.suptitle('Histograms for noisy samples')
-clean_samples.hist(bins=100, figsize=(10, 10))
subplots_c = clean_samples.hist(bins=100, figsize=(10, 10))
fig_clean = plt.gcf()
plt.suptitle('Histograms for clean samples')
if apply_rejects and self.rejects:
for axes in ([axes for sublist in subplots_c for axes in sublist] +
[axes for sublist in subplots_n for axes in sublist]):
title = axes.get_title()
if not title:
continue
idx = col_index[title]
if self.rejects[idx]:
axes.text(0.5, 1, 'REJECTED', horizontalalignment='center',
verticalalignment='top', transform=axes.transAxes, color='r')
axes.axvline(self.thresholds[idx], color='k', linestyle='--', alpha=0.5)
if path is not None:
fig_clean.savefig(os.path.join(path, 'histogram_clean.pdf'))
fig_noisy.savefig(os.path.join(path, 'histogram_noisy.pdf'))
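The new keyword arguments of plot() can also be called directly; a minimal sketch assuming a ScoreClusters instance named scoredata and an existing plots directory (both names are placeholders; the keyword arguments and output file names come from the code above):

    import os
    import matplotlib.pyplot as plt

    os.makedirs('plots', exist_ok=True)
    scoredata.plot(plt, path='plots')                    # default: ArcProjection; rejected columns dropped if get_result_df() was called first
    scoredata.plot(plt, path='plots', projection='pca')  # decomposition.PCA instead
    scoredata.plot(plt, path='plots', projection='random', apply_rejects=False)  # GaussianRandomProjection, keep all columns
    # each call writes projection.pdf, clustering.pdf, histogram_clean.pdf and histogram_noisy.pdf into plots/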
