Skip to content

Commit

Permalink
PR review. landmark_positions and other arbitrary kwargs now accepted…
Browse files Browse the repository at this point in the history
… by .fit() and .fit_transform()
  • Loading branch information
jacob golding committed Oct 17, 2024
1 parent 7abfd5b commit 803e23e
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 44 deletions.
2 changes: 1 addition & 1 deletion doc/transform_landmarked_pumap.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Again, we get good results on our initial embedding of ``x1``. If we pass ``x2``
Re-training Parametric UMAP with landmarks
------------------------------------------

To update our embedding to include the new class, we'll fine-tune our existing ``ParametricUMAP`` model. Doing this without any other changes will start from where we left off, but our embedding space's structure may drift and change. This is because the UMAP loss function is invariant to scaling, translation, and rotation, as it is only concerned with the relative positions and distances between points.
To update our embedding to include the new class, we'll fine-tune our existing ``ParametricUMAP`` model. Doing this without any other changes will start from where we left off, but our embedding space's structure may drift and change. This is because the UMAP loss function is invariant to translation and rotation, as it is only concerned with the relative positions and distances between points.

In order to keep our embedding space more consistent, we'll use the landmarks option for ``ParametricUMAP``. We retrain the model on the ``x2`` partition, along with some points chosen as landmarks from ``x1``. We'll choose 1% of the samples in ``x1`` to be included, along with their current position in the embedding space to be used in the landmarks loss function.

Expand Down
2 changes: 1 addition & 1 deletion notebooks/MNIST_Landmarks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"metadata": {},
"outputs": [],
"source": [
"# !pip install keras scikit-learn umap-learn matplotlib numpy pandas"
"!pip install keras scikit-learn umap-learn matplotlib numpy pandas"
]
},
{
Expand Down
56 changes: 19 additions & 37 deletions umap/parametric_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def __init__(
self.keras_fit_kwargs = keras_fit_kwargs # arguments for model.fit
self.parametric_model = None

# Pass the random state on to keras. This will set the numpy,
# Pass the random state on to keras. This will set the numpy,
# backend, and python random seeds
# For reproducible training.
if isinstance(self.random_state, int):
Expand Down Expand Up @@ -175,10 +175,6 @@ def fit(self, X, y=None, precomputed_distances=None, landmark_positions=None):
Points that are not landmarks should have nan coordinates.
"""

# We may have set landmark positions in a call to fit_transform
if landmark_positions is None and hasattr(self, "landmark_positions"):
landmark_positions = self.landmark_positions

if landmark_positions is not None:
len_X = len(X)
len_land = len(landmark_positions)
Expand All @@ -188,16 +184,6 @@ def fit(self, X, y=None, precomputed_distances=None, landmark_positions=None):
= {len_land}, while it must be equal."
)

# store landmark_positions after checking it is in the right format.
if landmark_positions is not None:
self.landmark_positions = check_array(
landmark_positions,
dtype=np.float32,
force_all_finite="allow-nan",
)
else:
self.landmark_positions = landmark_positions

if self.metric == "precomputed":
if precomputed_distances is None:
raise ValueError(
Expand All @@ -207,18 +193,12 @@ def fit(self, X, y=None, precomputed_distances=None, landmark_positions=None):
# prepare X for training the network
self._X = X
# generate the graph on precomputed distances
out = super().fit(precomputed_distances, y)
# delete landmark positions
if self.landmark_positions is not None:
delattr(self, "landmark_positions")
return out
return super().fit(
precomputed_distances, y, landmark_positions=landmark_positions
)

else:
out = super().fit(X, y)
# delete landmark positions
if self.landmark_positions is not None:
delattr(self, "landmark_positions")
return out
return super().fit(X, y, landmark_positions=landmark_positions)

def fit_transform(
self, X, y=None, precomputed_distances=None, landmark_positions=None
Expand Down Expand Up @@ -260,8 +240,6 @@ def fit_transform(
= {len_land}, while it must be equal."
)

self.landmark_positions = landmark_positions

if self.metric == "precomputed":
if precomputed_distances is None:
raise ValueError(
Expand All @@ -273,11 +251,13 @@ def fit_transform(
# generate the graph on precomputed distances
# landmark positions are cleaned up inside the
# .fit() component of .fit_transform()
return super().fit_transform(precomputed_distances, y)
return super().fit_transform(
precomputed_distances, y, landmark_positions=landmark_positions
)
else:
# landmark positions are cleaned up inside the
# .fit() component of .fit_transform()
return super().fit_transform(X, y)
return super().fit_transform(X, y, landmark_positions=landmark_positions)

def transform(self, X, batch_size=None):
"""Transform X into the existing embedded space and return that
Expand Down Expand Up @@ -340,7 +320,7 @@ def _define_model(self):
optimizer=self.optimizer,
)

def _fit_embed_data(self, X, n_epochs, init, random_state):
def _fit_embed_data(self, X, n_epochs, init, random_state, landmark_positions=None):

if self.metric == "precomputed":
X = self._X
Expand All @@ -358,11 +338,13 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
"Data should be scaled to the range 0-1 for cross-entropy reconstruction loss."
)

# check if landmark positions were passed to fit.
if hasattr(self, "landmark_positions"):
landmark_positions = self.landmark_positions
else:
landmark_positions = None
# Make sure landmark_positions is float32.
if landmark_positions is not None:
landmark_positions = check_array(
landmark_positions,
dtype=np.float32,
force_all_finite="allow-nan",
)

# get dataset of edges
(
Expand Down Expand Up @@ -1204,8 +1186,8 @@ def _landmark_loss(self, y, y_pred):
y_to = y["landmark_to"]

# Euclidean distance between y and y_pred, ignoring nans.
# Before computing difference, replace all predicted and
# landmark embeddings with 0 if there isn't a landmark.
# Before computing difference, replace all predicted and
# landmark embeddings with 0 if there isn't a landmark.
clean_y_pred_to = ops.where(
ops.isnan(y_to),
x1=ops.zeros_like(y_pred["embedding_to"]),
Expand Down
17 changes: 12 additions & 5 deletions umap/umap_.py
Original file line number Diff line number Diff line change
Expand Up @@ -2336,7 +2336,7 @@ def __sub__(self, other):

return result

def fit(self, X, y=None, force_all_finite=True):
def fit(self, X, y=None, force_all_finite=True, **kwargs):
"""Fit X into an embedded space.
Optionally use y for supervised dimension reduction.
Expand All @@ -2360,6 +2360,9 @@ def fit(self, X, y=None, force_all_finite=True):
- False: accepts np.inf, np.nan, pd.NA in array.
- 'allow-nan': accepts only np.nan and pd.NA values in array.
Values cannot be infinite.
**kwargs : optional
Any additional keyword arguments are passed to _fit_embed_data.
"""
if self.metric in ("bit_hamming", "bit_jaccard"):
X = check_array(
Expand Down Expand Up @@ -2816,6 +2819,7 @@ def fit(self, X, y=None, force_all_finite=True):
epochs,
init,
random_state, # JH why raw data?
**kwargs,
)

if self.n_epochs_list is not None:
Expand Down Expand Up @@ -2853,9 +2857,10 @@ def fit(self, X, y=None, force_all_finite=True):

return self

def _fit_embed_data(self, X, n_epochs, init, random_state):
def _fit_embed_data(self, X, n_epochs, init, random_state, **kwargs):
"""A method wrapper for simplicial_set_embedding that can be
replaced by subclasses.
replaced by subclasses. Arbitrary keyword arguments can be passed
through .fit() and .fit_transform().
"""
return simplicial_set_embedding(
X,
Expand All @@ -2882,7 +2887,7 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
tqdm_kwds=self.tqdm_kwds,
)

def fit_transform(self, X, y=None, force_all_finite=True):
def fit_transform(self, X, y=None, force_all_finite=True, **kwargs):
"""Fit X into an embedded space and return that transformed
output.
Expand All @@ -2904,6 +2909,8 @@ def fit_transform(self, X, y=None, force_all_finite=True):
- 'allow-nan': accepts only np.nan and pd.NA values in array.
Values cannot be infinite.
**kwargs : optional
    Any additional keyword arguments are passed to _fit_embed_data.
Returns
-------
X_new : array, shape (n_samples, n_components)
Expand All @@ -2918,7 +2925,7 @@ def fit_transform(self, X, y=None, force_all_finite=True):
r_emb: array, shape (n_samples)
Local radii of data points in the embedding (log-transformed).
"""
self.fit(X, y, force_all_finite)
self.fit(X, y, force_all_finite, **kwargs)
if self.transform_mode == "embedding":
if self.output_dens:
return self.embedding_, self.rad_orig_, self.rad_emb_
Expand Down

0 comments on commit 803e23e

Please sign in to comment.