Merge pull request jasonlaska#15 from cv3d/scikit_learn_0.19.1

Support scikit-learn v0.20
itamar-dw · Nov 13, 2018 · e1a171c · e1a171c
2 parents 823e0f4 + e58ed98
commit e1a171c
Show file tree

Hide file tree

Showing 10 changed files with 48 additions and 28 deletions.
diff --git a/example-requirements.txt b/example-requirements.txt
@@ -1,4 +1,4 @@
 -r requirements.txt
-tabulate==0.7.7
-matplotlib==2.0.2
-seaborn==0.8
+tabulate==0.8.2
+matplotlib==3.0.2
+seaborn==0.9
diff --git a/examples/make_sphere_graphic.py b/examples/make_sphere_graphic.py
@@ -31,7 +31,6 @@
 colors = ['b', 'r', 'g']
 for nn in range(n_clusters):
     ax.scatter(Xs[nn][0, :], Xs[nn][1, :], Xs[nn][2, :], c=colors[nn])
-    ax.hold(True)
 
 ax.set_aspect('equal')
 plt.axis('off')

diff --git a/examples/small_mix.py b/examples/small_mix.py
@@ -126,10 +126,10 @@ def r_input(val=None):
         xlim=[-1.1, 1.1], ylim=[-1.1, 1.1])
 for ex in X_0:
     plt.plot(ex[0], ex[1], 'r+')
-    plt.hold(True)
+
 for ex in X_1:
     plt.plot(ex[0], ex[1], 'b+')
-    plt.hold(True)
+
 ax.set_aspect('equal')
 plt.title('Original data')
 plt.show()
@@ -142,7 +142,6 @@ def r_input(val=None):
         plt.plot(ex[0], ex[1], 'r+')
     else:
         plt.plot(ex[0], ex[1], 'b+')
-    plt.hold(True)
 
 ax.set_aspect('equal')
 plt.title('K-means clustering')
@@ -156,7 +155,6 @@ def r_input(val=None):
         plt.plot(ex[0], ex[1], 'r+')
     else:
         plt.plot(ex[0], ex[1], 'b+')
-    plt.hold(True)
 
 ax.set_aspect('equal')
 plt.title('Spherical K-means clustering')
@@ -170,7 +168,6 @@ def r_input(val=None):
         plt.plot(ex[0], ex[1], 'r+')
     else:
         plt.plot(ex[0], ex[1], 'b+')
-    plt.hold(True)
 
 ax.set_aspect('equal')
 plt.title('soft-movMF clustering')
@@ -184,7 +181,6 @@ def r_input(val=None):
         plt.plot(ex[0], ex[1], 'r+')
     else:
         plt.plot(ex[0], ex[1], 'b+')
-    plt.hold(True)
 
 ax.set_aspect('equal')
 plt.title('hard-movMF clustering')

diff --git a/examples/small_mix_3d.py b/examples/small_mix_3d.py
@@ -120,7 +120,6 @@ def r_input(val=None):
         adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
         zlim=[-1.1, 1.1])
 ax.scatter(X_0[:, 0], X_0[:, 1], X_0[:, 2], c='r')
-ax.hold(True)
 ax.scatter(X_1[:, 0], X_1[:, 1], X_1[:, 2], c='b')
 ax.set_aspect('equal')
 plt.title('Original data')
@@ -131,7 +130,6 @@ def r_input(val=None):
         adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
         zlim=[-1.1, 1.1])
 ax.scatter(X[km.labels_ == km_mu_0_idx, 0], X[km.labels_ == km_mu_0_idx, 1], X[km.labels_ == km_mu_0_idx, 2], c='r')
-plt.hold(True)
 ax.scatter(X[km.labels_ == km_mu_1_idx, 0], X[km.labels_ == km_mu_1_idx, 1], X[km.labels_ == km_mu_1_idx, 2], c='b')
 ax.set_aspect('equal')
 plt.title('K-means clustering')
@@ -142,7 +140,6 @@ def r_input(val=None):
         adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
         zlim=[-1.1, 1.1])
 ax.scatter(X[skm.labels_ == skm_mu_0_idx, 0], X[skm.labels_ == skm_mu_0_idx, 1], X[skm.labels_ == skm_mu_0_idx, 2], c='r')
-plt.hold(True)
 ax.scatter(X[skm.labels_ == skm_mu_1_idx, 0], X[skm.labels_ == skm_mu_1_idx, 1], X[skm.labels_ == skm_mu_1_idx, 2], c='b')
 ax.set_aspect('equal')
 plt.title('Spherical K-means clustering')
@@ -153,7 +150,6 @@ def r_input(val=None):
         adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
         zlim=[-1.1, 1.1])
 ax.scatter(X[vmf_soft.labels_ == vmf_soft_mu_0_idx, 0], X[vmf_soft.labels_ == vmf_soft_mu_0_idx, 1], X[vmf_soft.labels_ == vmf_soft_mu_0_idx, 2], c='r')
-plt.hold(True)
 ax.scatter(X[vmf_soft.labels_ == vmf_soft_mu_1_idx, 0], X[vmf_soft.labels_ == vmf_soft_mu_1_idx, 1], X[vmf_soft.labels_ == vmf_soft_mu_1_idx, 2], c='b')
 ax.set_aspect('equal')
 plt.title('soft-movMF clustering')
@@ -164,7 +160,6 @@ def r_input(val=None):
         adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1],
         zlim=[-1.1, 1.1])
 ax.scatter(X[vmf_hard.labels_ == vmf_hard_mu_0_idx, 0], X[vmf_hard.labels_ == vmf_hard_mu_0_idx, 1], X[vmf_hard.labels_ == vmf_hard_mu_0_idx, 2], c='r')
-plt.hold(True)
 ax.scatter(X[vmf_hard.labels_ == vmf_hard_mu_1_idx, 0], X[vmf_hard.labels_ == vmf_hard_mu_1_idx, 1], X[vmf_hard.labels_ == vmf_hard_mu_1_idx, 2], c='b')
 ax.set_aspect('equal')
 plt.title('hard-movMF clustering')

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 numpy
 scipy
-scikit-learn>=0.19.0
+scikit-learn>=0.20
 pytest
 nose
 black==18.6b4
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name='spherecluster',
-    version='0.1.6',
+    version='0.1.7',
     description='Clustering on the unit hypersphere in scikit-learn.',
     author='Jason Laska',
     author_email='[email protected]',

diff --git a/spherecluster/spherical_kmeans.py b/spherecluster/spherical_kmeans.py
@@ -5,12 +5,14 @@
 
 from sklearn.cluster import KMeans
 from sklearn.cluster.k_means_ import (
+    _check_sample_weight,
     _init_centroids,
     _labels_inertia,
     _tolerance,
     _validate_center_shape,
 )
-from sklearn.utils import check_array, check_random_state, as_float_array
+from sklearn.utils import check_array, check_random_state
+from sklearn.utils.validation import _num_samples
 from sklearn.cluster import _k_means
 from sklearn.preprocessing import normalize
 from sklearn.externals.joblib import Parallel, delayed
@@ -20,6 +22,7 @@
 def _spherical_kmeans_single_lloyd(
     X,
     n_clusters,
+    sample_weight=None,
     max_iter=300,
     init="k-means++",
     verbose=False,
@@ -33,6 +36,8 @@ def _spherical_kmeans_single_lloyd(
     """
     random_state = check_random_state(random_state)
 
+    sample_weight = _check_sample_weight(X, sample_weight)
+
     best_labels, best_inertia, best_centers = None, None, None
 
     # init
@@ -56,6 +61,7 @@ def _spherical_kmeans_single_lloyd(
         #       this doesn't really matter.
         labels, inertia = _labels_inertia(
             X,
+            sample_weight,
             x_squared_norms,
             centers,
             precompute_distances=precompute_distances,
@@ -64,9 +70,13 @@ def _spherical_kmeans_single_lloyd(
 
         # computation of the means
         if sp.issparse(X):
-            centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
+            centers = _k_means._centers_sparse(
+                X, sample_weight, labels, n_clusters, distances
+            )
         else:
-            centers = _k_means._centers_dense(X, labels, n_clusters, distances)
+            centers = _k_means._centers_dense(
+                X, sample_weight, labels, n_clusters, distances
+            )
 
         # l2-normalize centers (this is the main contribution here)
         centers = normalize(centers)
@@ -93,6 +103,7 @@ def _spherical_kmeans_single_lloyd(
         # match cluster centers
         best_labels, best_inertia = _labels_inertia(
             X,
+            sample_weight,
             x_squared_norms,
             best_centers,
             precompute_distances=precompute_distances,
@@ -105,6 +116,7 @@ def _spherical_kmeans_single_lloyd(
 def spherical_k_means(
     X,
     n_clusters,
+    sample_weight=None,
     init="k-means++",
     n_init=10,
     max_iter=300,
@@ -132,11 +144,20 @@ def spherical_k_means(
         )
 
     best_inertia = np.infty
-    X = as_float_array(X, copy=copy_x)
+    # avoid forcing order when copy_x=False
+    order = "C" if copy_x else None
+    X = check_array(
+        X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x
+    )
+    # verify that the number of samples given is at least k
+    if _num_samples(X) < n_clusters:
+        raise ValueError(
+            "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters)
+        )
     tol = _tolerance(X, tol)
 
     if hasattr(init, "__array__"):
-        init = check_array(init, dtype=X.dtype.type, copy=True)
+        init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
         _validate_center_shape(X, n_clusters, init)
 
         if n_init != 1:
@@ -159,6 +180,7 @@ def spherical_k_means(
             labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
                 X,
                 n_clusters,
+                sample_weight,
                 max_iter=max_iter,
                 init=init,
                 verbose=verbose,
@@ -180,6 +202,7 @@ def spherical_k_means(
             delayed(_spherical_kmeans_single_lloyd)(
                 X,
                 n_clusters,
+                sample_weight,
                 max_iter=max_iter,
                 init=init,
                 verbose=verbose,
@@ -303,25 +326,32 @@ def __init__(
         self.n_jobs = n_jobs
         self.normalize = normalize
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, sample_weight=None):
         """Compute k-means clustering.
 
         Parameters
         ----------
 
         X : array-like or sparse matrix, shape=(n_samples, n_features)
+
+        y : Ignored
+            not used, present here for API consistency by convention.
+
+        sample_weight : array-like, shape (n_samples,), optional
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight (default: None)
         """
         if self.normalize:
             X = normalize(X)
 
         random_state = check_random_state(self.random_state)
-        X = self._check_fit_data(X)
 
         # TODO: add check that all data is unit-normalized
 
         self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means(
             X,
             n_clusters=self.n_clusters,
+            sample_weight=sample_weight,
             init=self.init,
             n_init=self.n_init,
             max_iter=self.max_iter,

diff --git a/spherecluster/tests/test_common.py b/spherecluster/tests/test_common.py
@@ -4,4 +4,3 @@
 
 def test_estimator_spherical_k_means():
     return check_estimator(SphericalKMeans)
-
diff --git a/spherecluster/tests/test_von_mises_fisher_mixture.py b/spherecluster/tests/test_von_mises_fisher_mixture.py
@@ -171,8 +171,9 @@ def test_integration_sparse(params_in):
     X = sp.sparse.csr_matrix((n_examples, n_features))
     for ee in range(n_examples):
         ridx = np.random.randint(n_features, size=(n_nonzero))
-        X[ee, ridx] = np.random.randn(n_nonzero)
-        X[ee, :] /= sp.sparse.linalg.norm(X[ee, :])
+        random_values = np.random.randn(n_nonzero)
+        random_values = random_values / np.linalg.norm(random_values)
+        X[ee, ridx] = random_values
 
     params_in.update({"n_clusters": n_clusters})
     movmf = VonMisesFisherMixture(**params_in)

diff --git a/spherecluster/von_mises_fisher_mixture.py b/spherecluster/von_mises_fisher_mixture.py
@@ -4,7 +4,7 @@
 import scipy.sparse as sp
 from scipy.special import iv  # modified Bessel function of first kind, I_v
 from numpy import i0  # modified Bessel function of first kind order 0, I_0
-from scipy.misc import logsumexp
+from scipy.special import logsumexp
 
 from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
 from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape
Original file line number	Diff line number	Diff line change
Expand Up		@@ -4,4 +4,3 @@

		def test_estimator_spherical_k_means():
		    return check_estimator(SphericalKMeans)