
Archipelago - dict transformer for vectorizing persistence diagrams #1017

Closed
wants to merge 29 commits

29 commits:
de408e9
add archipelago class
Nov 28, 2023
927c789
give default quantiser sklearn.KMeans to Atol method
Dec 19, 2023
3e95419
homology_dimensions and settlers for Archipelago class
Dec 19, 2023
3bc232a
n_init parameter for sklearn KMeans
Dec 20, 2023
431a692
archipelago island
Dec 20, 2023
aaef59f
mature version compatible with gudhi.representations.vector_methods
Jan 4, 2024
adc8668
Atol try/catch
Jan 4, 2024
4685748
fix docstrings
Jan 4, 2024
2cf60e4
typo
Jan 4, 2024
6524320
Merge branch 'GUDHI:master' into archipelago
martinroyer Jan 4, 2024
5561d09
docstring correct
Jan 4, 2024
5367ea1
refactor removing input preprocessing, instead we take raw dgm format…
Jan 4, 2024
95bd156
prints
Jan 4, 2024
b4de687
Revert try/catch optimizer fit in Atol
martinroyer Jan 5, 2024
89be488
fix set_output from sklearn so as to return pandas without importing …
Jan 5, 2024
f5dc92d
default KMeans parameter
Jan 5, 2024
6bfb164
change confusing Atol __call__ function
Jan 5, 2024
23a5e47
define get_feature_names_out for Atol
Jan 5, 2024
a59af1b
test fixes
Jan 5, 2024
9fadd61
hopefully fix atol test following `n_init="auto"` in KMeans
Jan 5, 2024
0b1c6b9
revert value changes to doc
Jan 8, 2024
aa0d3cb
updated docstring
Jan 8, 2024
4afb0ef
tentative change n_init value for test compatibility 3.7
Jan 12, 2024
2df14d9
remove try except
Jan 12, 2024
f01ac19
call vectorizer get_geature_names_out if exists
Jan 12, 2024
c43b190
more sklearn logic
Jan 12, 2024
b74dcae
atol fixes:
Jun 7, 2024
26eef83
add test for representations interface fit/transform/...
Jun 7, 2024
5d8af99
remove archipelago
Jun 7, 2024
3 changes: 2 additions & 1 deletion src/python/gudhi/representations/__init__.py
@@ -2,5 +2,6 @@
from .metrics import *
from .preprocessing import *
from .vector_methods import *
+from .archipelago import *

-__all__ = ["kernel_methods", "metrics", "preprocessing", "vector_methods"]
+__all__ = ["kernel_methods", "metrics", "preprocessing", "vector_methods", "archipelago"]
127 changes: 127 additions & 0 deletions src/python/gudhi/representations/archipelago.py
@@ -0,0 +1,127 @@
# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s): Martin Royer

import copy

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

from gudhi.representations.vector_methods import Atol, TopologicalVector, BettiCurve


class Archipelago(BaseEstimator, TransformerMixin):
"""
Transformer that dictionary-wraps persistence diagram vectorizers, i.e. objects from gudhi.representations.vector_methods.
Member:

(this may be more natural with the new format)
Maybe we should say that this class is helpful to avoid going through DimensionSelector+FeatureUnion or similar. It could even be nice to have an example somewhere comparing the two (we can add it later, though).

One provides persistence diagram vectorizers (by way of either `island` or `island_dict`), and the Archipelago object will
fit on and transform (i.e. vectorize) lists or series of persistence diagrams.
The object is sklearn-API consistent.

Parameters:
island: island for populating the archipelago, i.e. the object used to vectorize diagrams in each homology
dimension. Must be `copy.deepcopy`-able. *Will be ignored if island_dict is given*.
island_dict: island dict for populating the archipelago, i.e. a dictionary of objects to vectorize persistence
diagrams keyed by homology dimension.

Examples
>>> pdiagram1 = [(0, (0.0, 2.34)), (0, (0.0, 0.956)), (1, (0.536, 0.856)), (2, (1.202, 1.734))]
Member:

I am a bit reluctant to add more code that uses the old format for persistence diagrams, when we are trying to replace it with something more convenient (see #925, https://github.com/GUDHI/gudhi-devel/blob/master/src/python/gudhi/sklearn/cubical_persistence.py or #691). Of course we could change archipelago later to accept the new format, but it may become complicated if we try to be compatible with both.

Collaborator Author:

I'm not sure what the new format is, but it seems it will be better optimized for Archipelago, so all the better. Do you mean you want me to wait for those PRs to land before processing this one?

Member:

I'm not sure what the new format is but it seems it will be better optimized for Archipelago, so all the better.

It is the one we discussed before the holidays. Instead of a list of (dim,(birth,death)), if the user asked for homology in dimensions [i,j,k], they get [dgm_i, dgm_j, dgm_k] where each dgm is a numpy array of shape (*,2).

It is better in that you already have numpy arrays for each dimension. However, because it is a list and not a dict, you do not have access to the labels [i,j,k], the user has to provide them separately if you need them for something. It also makes it ambiguous, if we refer to the 0-th diagram, whether it corresponds to homology in dimension 0, or in dimension i (the first of the list).

Do you mean you want me to wait for those PR to come online before processing this one?

I don't know what I want 😉
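[Editorial note] To make the format discussion concrete, here is a small sketch of the conversion from the old format (a list of `(dim, (birth, death))` pairs) to the new one (one `(n, 2)` numpy array per requested dimension). The helper name `split_by_dimension` is hypothetical, not a GUDHI function:

```python
import numpy as np

def split_by_dimension(pdiagram, homology_dimensions):
    # Old format: list of (dim, (birth, death)) pairs, all dimensions mixed.
    # New format: one numpy array of shape (n, 2) per requested dimension.
    # reshape(-1, 2) turns an empty selection into a (0, 2) array.
    return [
        np.array([pt for (dim, pt) in pdiagram if dim == d]).reshape(-1, 2)
        for d in homology_dimensions
    ]

pdiagram1 = [(0, (0.0, 2.34)), (0, (0.0, 0.956)), (1, (0.536, 0.856)), (2, (1.202, 1.734))]
dgms = split_by_dimension(pdiagram1, homology_dimensions=[0, 1, 2])
# dgms[0] has shape (2, 2); dgms[1] and dgms[2] have shape (1, 2)
```

Note that, as the comment above points out, the list output carries no dimension labels: the caller must remember that `dgms[k]` corresponds to `homology_dimensions[k]`.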

>>> pdiagram2 = [(0, (0.0, 3.34)), (0, (0.0, 2.956)), (1, (0.536, 1.856)), (2, (1.202, 2.734))]
>>> pdiagram3 = [(0, (1.0, 4.34)), (0, (2.0, 3.956)), (1, (1.536, 2.856)), (2, (3.202, 4.734))]
>>> list_pdiags = [pdiagram1, pdiagram2, pdiagram3]
>>> archipelago = Archipelago(island=Atol())
>>> archipelago.fit(X=list_pdiags)
>>> archipelago.transform(X=list_pdiags)
>>> archipelago = Archipelago(island_dict={2: BettiCurve(resolution=4), 0:Atol()})
>>> import pandas as pd
>>> series_pdiags = pd.Series(list_pdiags)
>>> archipelago.set_output(transform="pandas")
>>> archipelago.fit(X=series_pdiags)
>>> archipelago.transform(X=series_pdiags)
Member:

Looking at https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html, using the new format of persistence diagrams, and ignoring pandas for now, is this equivalent to the following?

from gudhi.representations import BettiCurve, Atol, Archipelago
import numpy as np
from sklearn.compose import ColumnTransformer
Xpdiagram1 = [np.array([[0.0, 2.34], [0.0, 0.956]]), np.array([[0.536, 0.856]]), np.array([[1.202, 1.734]])]
Xpdiagram2 = [np.array([[0.0, 3.34], [0.0, 2.956]]), np.array([[0.536, 1.856]]), np.array([[1.202, 2.734]])]
Xpdiagram3 = [np.array([[1.0, 3.34], [2.0, 2.956]]), np.array([[1.536, 2.856]]), np.array([[3.202, 4.734]])]
Xlist_pdiags = [Xpdiagram1, Xpdiagram2, Xpdiagram3]
ct=ColumnTransformer([("0",Atol(),0),("1",Atol(),1),("2",Atol(),2)])
print(ct.fit_transform(Xlist_pdiags))
ct=ColumnTransformer([("0",Atol(),0),("2",BettiCurve(resolution=4),2)])
print(ct.fit_transform(Xlist_pdiags))

I had to make a couple of changes in BettiCurve to let this run (I don't know exactly what `not X` is meant to test for)

@@ -381,7 +381,7 @@
         if not self.is_fitted():
             raise NotFittedError("Not fitted.")
 
-        if not X:
+        if X is None or len(X) == 0:
             X = [np.zeros((0, 2))]
         
         N = len(X)
@@ -408,7 +408,7 @@
 
         return np.array(bettis, dtype=int)[:, 0:-1]
 
-    def fit_transform(self, X):
+    def fit_transform(self, X, y=None):
         """
         The result is the same as fit(X) followed by transform(X), but potentially faster.
         """

and I get similar results, but they differ from what Archipelago returns for dimension 0 (maybe it is just related to random generators).

Collaborator Author (@martinroyer, Jan 15, 2024):

Hmm, they differ because you made a mistake in copying; it should be

Xpdiagram3 = [np.array([[1.0, 4.34], [2.0, 3.956]]), np.array([[1.536, 2.856]]), np.array([[3.202, 4.734]])]

and then the results are equal indeed.

Also this seems to be working:

ct.set_output(transform="pandas")
Xdf_pdiags = pd.DataFrame(Xlist_pdiags)
print(ct.fit_transform(Xdf_pdiags))

so this is possibly the end for this PR, with the new pdiags format.

Collaborator Author:

So the one thing this PR does better than your proposition is handling dimensions that fail at fit, by not registering the key into the archipelago_ dict.
That seems desirable for now?
This makes me think that a lot of methods from vector_methods are not protected against things like
X is None or len(X) == 0
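[Editorial note] The defensive check from the BettiCurve patch above can be factored into a small guard; a minimal sketch (the helper name `normalize_input` is illustrative, not from GUDHI):

```python
import numpy as np

def normalize_input(X):
    # `if not X:` raises "truth value is ambiguous" when X is a
    # multi-element numpy array; the explicit check below handles
    # None, empty lists, empty Series, and empty arrays uniformly.
    if X is None or len(X) == 0:
        X = [np.zeros((0, 2))]  # one empty diagram, as in the BettiCurve patch
    return X
```

Non-empty input passes through unchanged, so the guard can sit at the top of a `transform` without affecting the normal path.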

Collaborator Author:

After further discussion with Marc, I believe the better philosophy for gudhi is what he suggests:

  • if one asks for vectorizing of some homology dimensions (say 0, 1 and 2) we provide the structure for the dimensions (for instance with ColumnTransformers from this conversation),
  • and because there can be no homology features in some dimensions, if it turns out there is no data to vectorize, we should print a warning (e.g. "no information in dimension d, problems ahead, consider asking for different homology dimensions") but still let it crash, instead of what I was doing, which was removing the problematic dimension.

So we can definitely drop this PR now.

We should (somehow?) salvage the changes to vector_methods: default method for Atol so that it can be initialized with empty arguments, and some tweaks that can be generalized to all vector methods such as implementing get_feature_names_out so that we can use the set_output feature from sklearn.

Member:

Yes, we should still include the other changes.

For reference, the discussion was mostly about how to handle empty diagrams in some dimensions. If the user asks for persistence in dimensions 0, 1 and 2, but the diagrams in dimension 2 are all empty (especially during fit), we have several options:

  1. we can throw an exception (possibly just for the vectorizers that do need fitting, like Atol, or Landscape without an explicit sample_range);
  2. we can (print a warning and) continue; for this dimension, transform will either (preferably) return a harmless constant, or return nonsense (including NaN);
  3. we can (print a warning and) skip that dimension henceforth; transform will act as if the user had only asked for dimensions 0 and 1.

@martinroyer chose the last option in this PR. From a pure ML perspective, that option is convenient: a user can ask the pipeline to aggregate the results in several dimensions and not care about the details; if one dimension is useless and the pipeline can automatically ignore it, that user is happy. Ignoring that dimension also has the advantage over returning a constant that it avoids a useless increase in dimension. It also goes well with the dictionary-like pandas output.
From a lower-level perspective, the last option can be confusing: the output of transform does not have the expected shape (I can ask for 3 landscapes with 5 points each but get a vector of dimension 10 because one dimension was ignored).
Both options have their advantages. Note that this choice can be handled either inside each vectorizer (say Atol(skip_on_fit_error=True), for which fit([]) defines an empty list of centers, and after that transform returns a list of empty arrays, which hopefully disappear when ColumnTransformer tries to append them to the rest), or in a wrapper like ColumnTransformer.
In any case, we should check how we handle (lists of) empty diagrams.
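[Editorial note] For illustration, option 3 (the behaviour of this PR's wrapper, which drops a dimension whose fit fails) can be sketched with a stand-in vectorizer; none of the names below are GUDHI classes:

```python
import numpy as np

class MeanVectorizer:
    # Stand-in vectorizer: fitting fails when every diagram is empty.
    def fit(self, X):
        nonempty = [d for d in X if len(d)]
        if not nonempty:
            raise ValueError("no points to fit on")
        self.center_ = np.concatenate(nonempty).mean(axis=0)
        return self

def fit_skipping_empty(vectorizers, dgms_by_dim):
    # Option 3: dimensions whose fit raises are dropped from the result,
    # mirroring how this PR leaves the key out of archipelago_.
    fitted = {}
    for dim, X in dgms_by_dim.items():
        try:
            fitted[dim] = vectorizers[dim].fit(X)
        except ValueError:
            print(f"skipping dimension {dim}: fit failed")
    return fitted

dgms_by_dim = {0: [np.array([[0.0, 1.0]]), np.array([[0.0, 2.0]])],
               2: [np.zeros((0, 2)), np.zeros((0, 2))]}  # all empty in dim 2
fitted = fit_skipping_empty({0: MeanVectorizer(), 2: MeanVectorizer()}, dgms_by_dim)
# fitted keeps only dimension 0; transform would later ignore dimension 2
```

This is exactly the shape surprise described above: downstream code asking for features in dimensions 0 and 2 silently receives only dimension 0.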

Collaborator Author (@martinroyer, Feb 13, 2024):

[PS: redrafted to reflect that this is consistent with @mglisse's option number 2]

Adding to that, I believe there are two "empty diagrams" cases to discuss, that of fit and that of transform.

As for fit, if we hold the view that once someone asks for vectorizing (say in dimensions 0 and 1) we should provide the structure for doing so (Marc's option 2, "provide the structure at all costs"):
for Atol, that implies that even without data there should always be centers at the end of fit, so we could for instance do random generation:

    def fit(self, X, y=None, sample_weight=None):
        if not hasattr(self.quantiser, 'fit'):
            raise TypeError("quantiser %s has no `fit` attribute." % (self.quantiser))

        # In fitting we remove infinite birth/death time points so that every center is finite
        X = [dgm[~np.isinf(dgm).any(axis=1), :] for dgm in X if len(dgm)]

        if len(X) < self.quantiser.n_clusters:
            # If no point to fit, let's arbitrarily put centers in [0, 1)
            X += [np.array([[np.random.random(), np.random.random()]]) for _ in range(self.quantiser.n_clusters - len(X))]

(This would ensure that the quantiser has enough points to fit n_clusters regardless of the input X.)

As for transform, one convention could be that empty diagrams are thought of as empty measures, and therefore vectorized as 0 (no mass). This ensures that no error is thrown, which seems pretty desirable at transform time.
For Atol, one way of achieving this is to replace empty diagrams with np.array([[np.inf, np.inf]]).

That looks good to me.
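[Editorial note] This convention can be checked with a numpy-only sketch of a Gaussian contrast like Atol's (`gaussian_contrast` is an illustrative stand-in, not Atol's actual method): a point at infinity lies at infinite distance from every finite center, so it contributes exactly zero mass:

```python
import numpy as np

def gaussian_contrast(measure, centers, inertias):
    # One weight per center: sum over points of exp(-||x - c||^2 / inertia).
    dists = np.linalg.norm(measure[:, None, :] - centers[None, :, :], axis=-1)
    return np.exp(-dists**2 / inertias).sum(axis=0)

centers = np.array([[0.5, 0.5]])
inertias = np.array([1.0])

# Convention: replace an empty diagram by a single point at infinity...
empty_as_inf = np.array([[np.inf, np.inf]])
vec = gaussian_contrast(empty_as_inf, centers, inertias)
# ...which vectorizes to exactly zero mass, so transform never raises on empty input.
```

A finite point at a center would instead contribute exp(0) = 1, so the zero output is unambiguous "no mass", not a degenerate case.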

Collaborator (@MathieuCarriere, Feb 16, 2024):

I am globally in favor of option (2), that is, returning a constant for all data instances when the corresponding persistence diagrams are all empty. I think it matches ML practices better, where one usually uses homology dimensions without thinking too much beforehand about whether the persistence diagrams make sense or not. If they do not (and thus constants are returned), pipelines will not crash, but will contain redundant, useless constants that can be spotted a posteriori by post-analysis of the results, as is sometimes done in other contexts.

That being said, I don't know if this is the behavior of the different representation methods that we have...

    """

    _parameter_constraints = {
        "island": [object, None],
        "island_dict": [dict, None],
    }

    def __init__(self, island=None, island_dict=None):
        self.island = island
        self.island_dict = island_dict

    def fit(self, X, y=None):
        """
        Calibration step: create and fit an `island` vectorizer to each matching diagram element.

        Args:
            X (list of diagrams): input persistence diagrams to fit vectorizers on.
            y: optional labels for each diagram.

        Returns:
            self
        """
        self._validate_params()
        if self.island is None and self.island_dict is None:
            self.island = Atol()
        self.archipelago_ = {}
        self._running_transform_names = ""

        max_dimension = max(dim for pdiagram in X for (dim, _) in pdiagram)
        by_dim_list_pdiags = [[
            np.array([_ for (dim, _) in pdiagram if dim == dimension]) for dimension in range(0, max_dimension + 1)
        ] for pdiagram in X]
        for dimension in range(0, max_dimension + 1):
            this_dim_list_pdiags = [pdiags[dimension] for pdiags in by_dim_list_pdiags]
            if not len(this_dim_list_pdiags):
                continue
            if self.island_dict is not None and dimension in self.island_dict.keys():
                island = self.island_dict[dimension]
            elif self.island_dict is not None:
                continue
            else:
                island = copy.deepcopy(self.island)
            island.fit(X=this_dim_list_pdiags, y=y)
            print(f"[Archipelago] Fit of homology dimension {dimension} with object {island.__class__} succeeded.")
            self.archipelago_[dimension] = island
        return self

    def transform(self, X, y=None):
        """
        Apply measure vectorisation on a list of persistence diagrams.

        Args:
            X (list of diagrams): input persistence diagrams to vectorize.
            y: Ignored, present for API consistency by convention.

        Returns:
            vectors: array of shape (len(X), n_features) whose columns are the vectorized homology dimensions
            in increasing order.
        """
        max_dimension = max(dim for pdiagram in X for (dim, _) in pdiagram)
        by_dim_list_pdiags = [[
            np.array([_ for (dim, _) in pdiagram if dim == dimension]) for dimension in range(0, max_dimension + 1)
        ] for pdiagram in X]

        archipelago_vectorized = []
        running_transform_names = []
        for dimension in range(0, max_dimension + 1):
            if dimension not in self.archipelago_.keys():
                print(f"[Archipelago] Encountered homology dimension {dimension} that has not been fitted on. Will ignore this key.")
                continue
            this_dim_list_pdiags = [pdiags[dimension] for pdiags in by_dim_list_pdiags]
            vectorized_dgms = self.archipelago_[dimension].transform(this_dim_list_pdiags)
            if hasattr(self.archipelago_[dimension], 'get_feature_names_out') and callable(self.archipelago_[dimension].get_feature_names_out):
                this_dim_names = self.archipelago_[dimension].get_feature_names_out()
            else:
                this_dim_names = [f"Feat{i + 1}" for i in range(vectorized_dgms.shape[1])]
            running_transform_names += [f"(D{dimension}) {name}" for name in this_dim_names]
            archipelago_vectorized.append(vectorized_dgms)
        self._running_transform_names = running_transform_names
        return np.concatenate(archipelago_vectorized, axis=1)

    def get_feature_names_out(self):
        return self._running_transform_names
26 changes: 20 additions & 6 deletions src/python/gudhi/representations/vector_methods.py
@@ -15,6 +15,8 @@
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import pairwise
+from sklearn.cluster import KMeans

try:
# New location since 1.0
from sklearn.metrics import DistanceMetric
@@ -719,21 +721,26 @@ class Atol(BaseEstimator, TransformerMixin):
>>> a = np.array([[1, 2, 4], [1, 4, 0], [1, 0, 4]])
>>> b = np.array([[4, 2, 0], [4, 4, 0], [4, 0, 2]])
>>> c = np.array([[3, 2, -1], [1, 2, -1]])
->>> atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006))
+>>> atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006, n_init=10))
Comment on lines -722 to +724 — Collaborator Author:
@VincentRouvreau I believe this is what you need part1

>>> atol_vectoriser.fit(X=[a, b, c]).centers
array([[ 2.6 , 2.8 , -0.4 ],
[ 2. , 0.66666667, 3.33333333]])
->>> atol_vectoriser(a)
+>>> atol_vectoriser._transform(a)
array([0.42375966, 1.18168665])
->>> atol_vectoriser(c)
+>>> atol_vectoriser._transform(c)
array([1.25157463, 0.02062512])
>>> atol_vectoriser.transform(X=[a, b, c])
array([[0.42375966, 1.18168665],
[1.06330156, 0.29861028],
[1.25157463, 0.02062512]])
"""
# Note the example above must be up to date with the one in tests called test_atol_doc
-def __init__(self, quantiser, weighting_method="cloud", contrast="gaussian"):
+def __init__(
+    self,
+    quantiser=KMeans(n_clusters=2, random_state=202312, n_init=10),
+    weighting_method="cloud",
+    contrast="gaussian"
+):
"""
Constructor for the Atol measure vectorisation class.

@@ -751,6 +758,7 @@ def __init__(self, quantiser, weighting_method="cloud", contrast="gaussian"):
self.quantiser = quantiser
self.contrast = contrast
self.weighting_method = weighting_method
+self._running_transform_names = ""

def get_contrast(self):
return {
@@ -790,7 +798,9 @@ def fit(self, X, y=None, sample_weight=None):

measures_concat = np.concatenate(X)
weights_concat = np.concatenate(sample_weight)

self.quantiser.fit(X=measures_concat, sample_weight=weights_concat)

self.centers = self.quantiser.cluster_centers_
# Hack, but some people are unhappy if the order depends on the version of sklearn
self.centers = self.centers[np.lexsort(self.centers.T)]
@@ -805,7 +815,7 @@
self.inertias = np.min(dist_centers, axis=0)/2
return self

-def __call__(self, measure, sample_weight=None):
+def _transform(self, measure, sample_weight=None):
"""
Apply measure vectorisation on a single measure. Only available after `fit` has been called.

@@ -834,4 +844,8 @@ def transform(self, X, sample_weight=None):
"""
if sample_weight is None:
sample_weight = [self.get_weighting_method()(measure) for measure in X]
-return np.stack([self(measure, sample_weight=weight) for measure, weight in zip(X, sample_weight)])
+self._running_transform_names = [f"Atol Center {i + 1}" for i in range(self.quantiser.n_clusters)]
+return np.stack([self._transform(measure, sample_weight=weight) for measure, weight in zip(X, sample_weight)])

+def get_feature_names_out(self):
+    return self._running_transform_names
16 changes: 8 additions & 8 deletions src/python/test/test_representations.py
@@ -118,23 +118,23 @@ def test_atol_doc():
b = np.array([[4, 2, 0], [4, 4, 0], [4, 0, 2]])
c = np.array([[3, 2, -1], [1, 2, -1]])

-atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006))
+atol_vectoriser = Atol(quantiser=KMeans(n_clusters=2, random_state=202006, n_init=10))
# Atol will do
# X = np.concatenate([a,b,c])
-# kmeans = KMeans(n_clusters=2, random_state=202006).fit(X)
+# kmeans = KMeans(n_clusters=2, random_state=202006, n_init=10).fit(X)
# kmeans.labels_ will be : array([1, 0, 1, 0, 0, 1, 0, 0])
first_cluster = np.asarray([a[0], a[2], b[2]])
-second_cluster = np.asarray([a[1], b[0], b[2], c[0], c[1]])
+second_cluster = np.asarray([a[1], b[0], b[1], c[0], c[1]])
Comment on lines -121 to +127 — Collaborator Author:
@VincentRouvreau I believe this is what you need part2


# Check the center of the first_cluster and second_cluster are in Atol centers
centers = atol_vectoriser.fit(X=[a, b, c]).centers
np.isclose(centers, first_cluster.mean(axis=0)).all(1).any()
np.isclose(centers, second_cluster.mean(axis=0)).all(1).any()

vectorization = atol_vectoriser.transform(X=[a, b, c])
-assert np.allclose(vectorization[0], atol_vectoriser(a))
-assert np.allclose(vectorization[1], atol_vectoriser(b))
-assert np.allclose(vectorization[2], atol_vectoriser(c))
+assert np.allclose(vectorization[0], atol_vectoriser._transform(a))
+assert np.allclose(vectorization[1], atol_vectoriser._transform(b))
+assert np.allclose(vectorization[2], atol_vectoriser._transform(c))


def test_dummy_atol():
@@ -145,12 +145,12 @@ def test_dummy_atol():
for weighting_method in ["cloud", "iidproba"]:
for contrast in ["gaussian", "laplacian", "indicator"]:
atol_vectoriser = Atol(
-quantiser=KMeans(n_clusters=1, random_state=202006),
+quantiser=KMeans(n_clusters=1, random_state=202006, n_init=10),
Collaborator Author:
@VincentRouvreau I believe this is what you need part3/3

weighting_method=weighting_method,
contrast=contrast,
)
atol_vectoriser.fit([a, b, c])
-atol_vectoriser(a)
+atol_vectoriser._transform(a)
atol_vectoriser.transform(X=[a, b, c])

