Skip to content

Commit

Permalink
Merge pull request jasonlaska#15 from cv3d/scikit_learn_0.19.1
Browse files Browse the repository at this point in the history
Support scikit-learn v0.20
  • Loading branch information
jasonlaska authored Nov 13, 2018
2 parents 823e0f4 + e58ed98 commit e1a171c
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 28 deletions.
6 changes: 3 additions & 3 deletions example-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-r requirements.txt
tabulate==0.7.7
matplotlib==2.0.2
seaborn==0.8
tabulate==0.8.2
matplotlib==3.0.2
seaborn==0.9
1 change: 0 additions & 1 deletion examples/make_sphere_graphic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
colors = ['b', 'r', 'g']
for nn in range(n_clusters):
ax.scatter(Xs[nn][0, :], Xs[nn][1, :], Xs[nn][2, :], c=colors[nn])
ax.hold(True)

ax.set_aspect('equal')
plt.axis('off')
Expand Down
8 changes: 2 additions & 6 deletions examples/small_mix.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ def r_input(val=None):
xlim=[-1.1, 1.1], ylim=[-1.1, 1.1])
for ex in X_0:
plt.plot(ex[0], ex[1], 'r+')
plt.hold(True)

for ex in X_1:
plt.plot(ex[0], ex[1], 'b+')
plt.hold(True)

ax.set_aspect('equal')
plt.title('Original data')
plt.show()
Expand All @@ -142,7 +142,6 @@ def r_input(val=None):
plt.plot(ex[0], ex[1], 'r+')
else:
plt.plot(ex[0], ex[1], 'b+')
plt.hold(True)

ax.set_aspect('equal')
plt.title('K-means clustering')
Expand All @@ -156,7 +155,6 @@ def r_input(val=None):
plt.plot(ex[0], ex[1], 'r+')
else:
plt.plot(ex[0], ex[1], 'b+')
plt.hold(True)

ax.set_aspect('equal')
plt.title('Spherical K-means clustering')
Expand All @@ -170,7 +168,6 @@ def r_input(val=None):
plt.plot(ex[0], ex[1], 'r+')
else:
plt.plot(ex[0], ex[1], 'b+')
plt.hold(True)

ax.set_aspect('equal')
plt.title('soft-movMF clustering')
Expand All @@ -184,7 +181,6 @@ def r_input(val=None):
plt.plot(ex[0], ex[1], 'r+')
else:
plt.plot(ex[0], ex[1], 'b+')
plt.hold(True)

ax.set_aspect('equal')
plt.title('hard-movMF clustering')
Expand Down
5 changes: 0 additions & 5 deletions examples/small_mix_3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def r_input(val=None):
adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
zlim=[-1.1, 1.1])
ax.scatter(X_0[:, 0], X_0[:, 1], X_0[:, 2], c='r')
ax.hold(True)
ax.scatter(X_1[:, 0], X_1[:, 1], X_1[:, 2], c='b')
ax.set_aspect('equal')
plt.title('Original data')
Expand All @@ -131,7 +130,6 @@ def r_input(val=None):
adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
zlim=[-1.1, 1.1])
ax.scatter(X[km.labels_ == km_mu_0_idx, 0], X[km.labels_ == km_mu_0_idx, 1], X[km.labels_ == km_mu_0_idx, 2], c='r')
plt.hold(True)
ax.scatter(X[km.labels_ == km_mu_1_idx, 0], X[km.labels_ == km_mu_1_idx, 1], X[km.labels_ == km_mu_1_idx, 2], c='b')
ax.set_aspect('equal')
plt.title('K-means clustering')
Expand All @@ -142,7 +140,6 @@ def r_input(val=None):
adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
zlim=[-1.1, 1.1])
ax.scatter(X[skm.labels_ == skm_mu_0_idx, 0], X[skm.labels_ == skm_mu_0_idx, 1], X[skm.labels_ == skm_mu_0_idx, 2], c='r')
plt.hold(True)
ax.scatter(X[skm.labels_ == skm_mu_1_idx, 0], X[skm.labels_ == skm_mu_1_idx, 1], X[skm.labels_ == skm_mu_1_idx, 2], c='b')
ax.set_aspect('equal')
plt.title('Spherical K-means clustering')
Expand All @@ -153,7 +150,6 @@ def r_input(val=None):
adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
zlim=[-1.1, 1.1])
ax.scatter(X[vmf_soft.labels_ == vmf_soft_mu_0_idx, 0], X[vmf_soft.labels_ == vmf_soft_mu_0_idx, 1], X[vmf_soft.labels_ == vmf_soft_mu_0_idx, 2], c='r')
plt.hold(True)
ax.scatter(X[vmf_soft.labels_ == vmf_soft_mu_1_idx, 0], X[vmf_soft.labels_ == vmf_soft_mu_1_idx, 1], X[vmf_soft.labels_ == vmf_soft_mu_1_idx, 2], c='b')
ax.set_aspect('equal')
plt.title('soft-movMF clustering')
Expand All @@ -164,7 +160,6 @@ def r_input(val=None):
adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
zlim=[-1.1, 1.1])
ax.scatter(X[vmf_hard.labels_ == vmf_hard_mu_0_idx, 0], X[vmf_hard.labels_ == vmf_hard_mu_0_idx, 1], X[vmf_hard.labels_ == vmf_hard_mu_0_idx, 2], c='r')
plt.hold(True)
ax.scatter(X[vmf_hard.labels_ == vmf_hard_mu_1_idx, 0], X[vmf_hard.labels_ == vmf_hard_mu_1_idx, 1], X[vmf_hard.labels_ == vmf_hard_mu_1_idx, 2], c='b')
ax.set_aspect('equal')
plt.title('hard-movMF clustering')
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
numpy
scipy
scikit-learn>=0.19.0
scikit-learn>=0.20
pytest
nose
black==18.6b4
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

setup(
name='spherecluster',
version='0.1.6',
version='0.1.7',
description='Clustering on the unit hypersphere in scikit-learn.',
author='Jason Laska',
author_email='[email protected]',
Expand Down
44 changes: 37 additions & 7 deletions spherecluster/spherical_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@

from sklearn.cluster import KMeans
from sklearn.cluster.k_means_ import (
_check_sample_weight,
_init_centroids,
_labels_inertia,
_tolerance,
_validate_center_shape,
)
from sklearn.utils import check_array, check_random_state, as_float_array
from sklearn.utils import check_array, check_random_state
from sklearn.utils.validation import _num_samples
from sklearn.cluster import _k_means
from sklearn.preprocessing import normalize
from sklearn.externals.joblib import Parallel, delayed
Expand All @@ -20,6 +22,7 @@
def _spherical_kmeans_single_lloyd(
X,
n_clusters,
sample_weight=None,
max_iter=300,
init="k-means++",
verbose=False,
Expand All @@ -33,6 +36,8 @@ def _spherical_kmeans_single_lloyd(
"""
random_state = check_random_state(random_state)

sample_weight = _check_sample_weight(X, sample_weight)

best_labels, best_inertia, best_centers = None, None, None

# init
Expand All @@ -56,6 +61,7 @@ def _spherical_kmeans_single_lloyd(
# this doesn't really matter.
labels, inertia = _labels_inertia(
X,
sample_weight,
x_squared_norms,
centers,
precompute_distances=precompute_distances,
Expand All @@ -64,9 +70,13 @@ def _spherical_kmeans_single_lloyd(

# computation of the means
if sp.issparse(X):
centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
centers = _k_means._centers_sparse(
X, sample_weight, labels, n_clusters, distances
)
else:
centers = _k_means._centers_dense(X, labels, n_clusters, distances)
centers = _k_means._centers_dense(
X, sample_weight, labels, n_clusters, distances
)

# l2-normalize centers (this is the main contribution here)
centers = normalize(centers)
Expand All @@ -93,6 +103,7 @@ def _spherical_kmeans_single_lloyd(
# match cluster centers
best_labels, best_inertia = _labels_inertia(
X,
sample_weight,
x_squared_norms,
best_centers,
precompute_distances=precompute_distances,
Expand All @@ -105,6 +116,7 @@ def _spherical_kmeans_single_lloyd(
def spherical_k_means(
X,
n_clusters,
sample_weight=None,
init="k-means++",
n_init=10,
max_iter=300,
Expand Down Expand Up @@ -132,11 +144,20 @@ def spherical_k_means(
)

best_inertia = np.infty
X = as_float_array(X, copy=copy_x)
# avoid forcing order when copy_x=False
order = "C" if copy_x else None
X = check_array(
X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x
)
# verify that the number of samples given is larger than k
if _num_samples(X) < n_clusters:
raise ValueError(
"n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters)
)
tol = _tolerance(X, tol)

if hasattr(init, "__array__"):
init = check_array(init, dtype=X.dtype.type, copy=True)
init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
_validate_center_shape(X, n_clusters, init)

if n_init != 1:
Expand All @@ -159,6 +180,7 @@ def spherical_k_means(
labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
X,
n_clusters,
sample_weight,
max_iter=max_iter,
init=init,
verbose=verbose,
Expand All @@ -180,6 +202,7 @@ def spherical_k_means(
delayed(_spherical_kmeans_single_lloyd)(
X,
n_clusters,
sample_weight,
max_iter=max_iter,
init=init,
verbose=verbose,
Expand Down Expand Up @@ -303,25 +326,32 @@ def __init__(
self.n_jobs = n_jobs
self.normalize = normalize

def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""Compute k-means clustering.
Parameters
----------
X : array-like or sparse matrix, shape=(n_samples, n_features)
y : Ignored
not used, present here for API consistency by convention.
sample_weight : array-like, shape (n_samples,), optional
The weights for each observation in X. If None, all observations
are assigned equal weight (default: None)
"""
if self.normalize:
X = normalize(X)

random_state = check_random_state(self.random_state)
X = self._check_fit_data(X)

# TODO: add check that all data is unit-normalized

self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means(
X,
n_clusters=self.n_clusters,
sample_weight=sample_weight,
init=self.init,
n_init=self.n_init,
max_iter=self.max_iter,
Expand Down
1 change: 0 additions & 1 deletion spherecluster/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@

def test_estimator_spherical_k_means():
    """Run ``check_estimator`` on the ``SphericalKMeans`` class.

    The value produced by ``check_estimator`` is handed back to the caller
    unchanged.
    """
    outcome = check_estimator(SphericalKMeans)
    return outcome

5 changes: 3 additions & 2 deletions spherecluster/tests/test_von_mises_fisher_mixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,9 @@ def test_integration_sparse(params_in):
X = sp.sparse.csr_matrix((n_examples, n_features))
for ee in range(n_examples):
ridx = np.random.randint(n_features, size=(n_nonzero))
X[ee, ridx] = np.random.randn(n_nonzero)
X[ee, :] /= sp.sparse.linalg.norm(X[ee, :])
random_values = np.random.randn(n_nonzero)
random_values = random_values / np.linalg.norm(random_values)
X[ee, ridx] = random_values

params_in.update({"n_clusters": n_clusters})
movmf = VonMisesFisherMixture(**params_in)
Expand Down
2 changes: 1 addition & 1 deletion spherecluster/von_mises_fisher_mixture.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import scipy.sparse as sp
from scipy.special import iv # modified Bessel function of first kind, I_v
from numpy import i0 # modified Bessel function of first kind order 0, I_0
from scipy.misc import logsumexp
from scipy.special import logsumexp

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape
Expand Down

0 comments on commit e1a171c

Please sign in to comment.