Skip to content

Commit

Permalink
PR review. landmark_positions and other arbitrary kwargs now accepted…
Browse files Browse the repository at this point in the history
… by .fit() and .fit_transform()
  • Loading branch information
jacob golding committed Oct 17, 2024
1 parent 7abfd5b commit 803e23e
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 44 deletions.
2 changes: 1 addition & 1 deletion doc/transform_landmarked_pumap.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Again, we get good results on our initial embedding of ``x1``. If we pass ``x2``
Re-training Parametric UMAP with landmarks
------------------------------------------

To update our embedding to include the new class, we'll fine-tune our existing ``ParametricUMAP`` model. Doing this without any other changes will start from where we left off, but our embedding space's structure may drift and change. This is because the UMAP loss function is invariant to scaling, translation, and rotation, as it is only concerned with the relative positions and distances between points.
To update our embedding to include the new class, we'll fine-tune our existing ``ParametricUMAP`` model. Doing this without any other changes will start from where we left off, but our embedding space's structure may drift and change. This is because the UMAP loss function is invariant to translation and rotation, as it is only concerned with the relative positions and distances between points.

In order to keep our embedding space more consistent, we'll use the landmarks option for ``ParametricUMAP``. We retrain the model on the ``x2`` partition, along with some points chosen as landmarks from ``x1``. We'll choose 1% of the samples in ``x1`` to be included, along with their current position in the embedding space to be used in the landmarks loss function.

Expand Down
2 changes: 1 addition & 1 deletion notebooks/MNIST_Landmarks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"metadata": {},
"outputs": [],
"source": [
"# !pip install keras scikit-learn umap-learn matplotlib numpy pandas"
"!pip install keras scikit-learn umap-learn matplotlib numpy pandas"
]
},
{
Expand Down
56 changes: 19 additions & 37 deletions umap/parametric_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def __init__(
self.keras_fit_kwargs = keras_fit_kwargs # arguments for model.fit
self.parametric_model = None

# Pass the random state on to keras. This will set the numpy,
# Pass the random state on to keras. This will set the numpy,
# backend, and python random seeds
# For reproducible training.
if isinstance(self.random_state, int):
Expand Down Expand Up @@ -175,10 +175,6 @@ def fit(self, X, y=None, precomputed_distances=None, landmark_positions=None):
Points that are not landmarks should have nan coordinates.
"""

# We may have set landmark positions in a call to fit_transform
if landmark_positions is None and hasattr(self, "landmark_positions"):
landmark_positions = self.landmark_positions

if landmark_positions is not None:
len_X = len(X)
len_land = len(landmark_positions)
Expand All @@ -188,16 +184,6 @@ def fit(self, X, y=None, precomputed_distances=None, landmark_positions=None):
= {len_land}, while it must be equal."
)

# store landmark_positions after checking it is in the right format.
if landmark_positions is not None:
self.landmark_positions = check_array(
landmark_positions,
dtype=np.float32,
force_all_finite="allow-nan",
)
else:
self.landmark_positions = landmark_positions

if self.metric == "precomputed":
if precomputed_distances is None:
raise ValueError(
Expand All @@ -207,18 +193,12 @@ def fit(self, X, y=None, precomputed_distances=None, landmark_positions=None):
# prepare X for training the network
self._X = X
# generate the graph on precomputed distances
out = super().fit(precomputed_distances, y)
# delete landmark positions
if self.landmark_positions is not None:
delattr(self, "landmark_positions")
return out
return super().fit(
precomputed_distances, y, landmark_positions=landmark_positions
)

else:
out = super().fit(X, y)
# delete landmark positions
if self.landmark_positions is not None:
delattr(self, "landmark_positions")
return out
return super().fit(X, y, landmark_positions=landmark_positions)

def fit_transform(
self, X, y=None, precomputed_distances=None, landmark_positions=None
Expand Down Expand Up @@ -260,8 +240,6 @@ def fit_transform(
= {len_land}, while it must be equal."
)

self.landmark_positions = landmark_positions

if self.metric == "precomputed":
if precomputed_distances is None:
raise ValueError(
Expand All @@ -273,11 +251,13 @@ def fit_transform(
# generate the graph on precomputed distances
# landmark positions are cleaned up inside the
# .fit() component of .fit_transform()
return super().fit_transform(precomputed_distances, y)
return super().fit_transform(
precomputed_distances, y, landmark_positions=landmark_positions
)
else:
# landmark positions are cleaned up inside the
# .fit() component of .fit_transform()
return super().fit_transform(X, y)
return super().fit_transform(X, y, landmark_positions=landmark_positions)

def transform(self, X, batch_size=None):
"""Transform X into the existing embedded space and return that
Expand Down Expand Up @@ -340,7 +320,7 @@ def _define_model(self):
optimizer=self.optimizer,
)

def _fit_embed_data(self, X, n_epochs, init, random_state):
def _fit_embed_data(self, X, n_epochs, init, random_state, landmark_positions=None):

if self.metric == "precomputed":
X = self._X
Expand All @@ -358,11 +338,13 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
"Data should be scaled to the range 0-1 for cross-entropy reconstruction loss."
)

# check if landmark positions were passed to fit.
if hasattr(self, "landmark_positions"):
landmark_positions = self.landmark_positions
else:
landmark_positions = None
# Make sure landmark_positions is float32.
if landmark_positions is not None:
landmark_positions = check_array(
landmark_positions,
dtype=np.float32,
force_all_finite="allow-nan",
)

# get dataset of edges
(
Expand Down Expand Up @@ -1204,8 +1186,8 @@ def _landmark_loss(self, y, y_pred):
y_to = y["landmark_to"]

# Euclidean distance between y and y_pred, ignoring nans.
# Before computing difference, replace all predicted and
# landmark embeddings with 0 if there isn't a landmark.
# Before computing difference, replace all predicted and
# landmark embeddings with 0 if there isn't a landmark.
clean_y_pred_to = ops.where(
ops.isnan(y_to),
x1=ops.zeros_like(y_pred["embedding_to"]),
Expand Down
17 changes: 12 additions & 5 deletions umap/umap_.py
Original file line number Diff line number Diff line change
Expand Up @@ -2336,7 +2336,7 @@ def __sub__(self, other):

return result

def fit(self, X, y=None, force_all_finite=True):
def fit(self, X, y=None, force_all_finite=True, **kwargs):
"""Fit X into an embedded space.
Optionally use y for supervised dimension reduction.
Expand All @@ -2360,6 +2360,9 @@ def fit(self, X, y=None, force_all_finite=True):
- False: accepts np.inf, np.nan, pd.NA in array.
- 'allow-nan': accepts only np.nan and pd.NA values in array.
Values cannot be infinite.
**kwargs : optional
Any additional keyword arguments are passed to _fit_embed_data.
"""
if self.metric in ("bit_hamming", "bit_jaccard"):
X = check_array(
Expand Down Expand Up @@ -2816,6 +2819,7 @@ def fit(self, X, y=None, force_all_finite=True):
epochs,
init,
random_state, # JH why raw data?
**kwargs,
)

if self.n_epochs_list is not None:
Expand Down Expand Up @@ -2853,9 +2857,10 @@ def fit(self, X, y=None, force_all_finite=True):

return self

def _fit_embed_data(self, X, n_epochs, init, random_state):
def _fit_embed_data(self, X, n_epochs, init, random_state, **kwargs):
"""A method wrapper for simplicial_set_embedding that can be
replaced by subclasses.
replaced by subclasses. Arbitrary keyword arguments can be passed
through .fit() and .fit_transform().
"""
return simplicial_set_embedding(
X,
Expand All @@ -2882,7 +2887,7 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
tqdm_kwds=self.tqdm_kwds,
)

def fit_transform(self, X, y=None, force_all_finite=True):
def fit_transform(self, X, y=None, force_all_finite=True, **kwargs):
"""Fit X into an embedded space and return that transformed
output.
Expand All @@ -2904,6 +2909,8 @@ def fit_transform(self, X, y=None, force_all_finite=True):
- 'allow-nan': accepts only np.nan and pd.NA values in array.
Values cannot be infinite.
**kwargs : optional
    Any additional keyword arguments are passed to _fit_embed_data.
Returns
-------
X_new : array, shape (n_samples, n_components)
Expand All @@ -2918,7 +2925,7 @@ def fit_transform(self, X, y=None, force_all_finite=True):
r_emb: array, shape (n_samples)
Local radii of data points in the embedding (log-transformed).
"""
self.fit(X, y, force_all_finite)
self.fit(X, y, force_all_finite, **kwargs)
if self.transform_mode == "embedding":
if self.output_dens:
return self.embedding_, self.rad_orig_, self.rad_emb_
Expand Down

0 comments on commit 803e23e

Please sign in to comment.