tenstorrent · sgligorijevicTT · Jan 8, 2025 · Jan 9, 2025 · Jan 10, 2025 · Jan 13, 2025
@@ -13,3 +13,5 @@ lit
 pybind11
 pytest
 transformers
+fsspec
+einops
@@ -0,0 +1,81 @@
+# SPDX-FileCopyrightText: (c) 2024 Google LLC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Taken from https://github.com/google-research/vision_transformer/blob/c6de1e5378c9831a8477feb30994971bdc409e46/vit_jax/models_mixer.py
+
+# Copyright 2024 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+import einops
+import flax.linen as nn
+import jax.numpy as jnp
+
+
+class MlpBlock(nn.Module):
+    mlp_dim: int
+
+    @nn.compact
+    def __call__(self, x):
+        y = nn.Dense(self.mlp_dim)(x)
+        y = nn.gelu(y)
+        return nn.Dense(x.shape[-1])(y)
+
+
+class MixerBlock(nn.Module):
+    """Mixer block layer."""
+
+    tokens_mlp_dim: int
+    channels_mlp_dim: int
+
+    @nn.compact
+    def __call__(self, x):
+        y = nn.LayerNorm()(x)
+        y = jnp.swapaxes(y, 1, 2)
+        y = MlpBlock(self.tokens_mlp_dim, name="token_mixing")(y)
+        y = jnp.swapaxes(y, 1, 2)
+        x = x + y
+        y = nn.LayerNorm()(x)
+        return x + MlpBlock(self.channels_mlp_dim, name="channel_mixing")(y)
+
+
+class MlpMixer(nn.Module):
+    """Mixer architecture."""
+
+    patches: Any
+    num_classes: int
+    num_blocks: int
+    hidden_dim: int
+    tokens_mlp_dim: int
+    channels_mlp_dim: int
+    model_name: Optional[str] = None
+
+    @nn.compact
+    def __call__(self, inputs, train):
+        del train
+        x = nn.Conv(
+            self.hidden_dim, self.patches.size, strides=self.patches.size, name="stem"
+        )(inputs)
+        x = einops.rearrange(x, "n h w c -> n (h w) c")
+        for _ in range(self.num_blocks):
+            x = MixerBlock(self.tokens_mlp_dim, self.channels_mlp_dim)(x)
+        x = nn.LayerNorm(name="pre_head_layer_norm")(x)
+        x = jnp.mean(x, axis=1)
+        if self.num_classes:
+            x = nn.Dense(
+                self.num_classes, kernel_init=nn.initializers.zeros, name="head"
+            )(x)
+        return x
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+from typing import Dict, Sequence
+
+
+import jax
+import jax.numpy as jnp
+import numpy
+import pytest
+import fsspec
+from flax import linen as nn
+import flax.traverse_util
+from infra import ModelTester, RunMode
+from .model_implementation import MlpMixer
+
+
+# Hyperparameters used by MlpMixerTester to construct the MlpMixer model.
+# NOTE(review): values presumably match the Mixer-B/16 imagenet21k checkpoint
+# loaded by Mixer_B_16_pretrained -- confirm against the upstream config.
+patch_size = 16  # side length from which the `patches` argument is built
+num_classes = 21843  # passed as `num_classes` (output size of the "head" layer)
+num_blocks = 12  # passed as `num_blocks` (number of MixerBlock layers)
+hidden_dim = 768  # passed as `hidden_dim` (feature count of the "stem" conv)
+token_mlp_dim = 384  # passed as `tokens_mlp_dim`
+channel_mlp_dim = 3072  # passed as `channels_mlp_dim`
+
+
+def Mixer_B_16_pretrained():
+    # TODO(stefan): Discuss how weights should be handled org wide
+    link = "https://storage.googleapis.com/mixer_models/imagenet21k/Mixer-B_16.npz"
+    with fsspec.open("filecache::" + link, cache_storage="/tmp/files/") as f:
+        weights = numpy.load(f, encoding="bytes")
+        state_dict = {k: v for k, v in weights.items()}
+        pytree = flax.traverse_util.unflatten_dict(state_dict, sep="/")
+    return {"params": pytree}
+
+
+class MlpMixerTester(ModelTester):
+    """Tester for MlpMixer model."""
+
+    # @override
+    def _get_model(self) -> nn.Module:
+        patch = jnp.ones((patch_size, patch_size))
+        return MlpMixer(
+            patches=patch,
+            num_classes=num_classes,
+            num_blocks=num_blocks,
+            hidden_dim=hidden_dim,
+            tokens_mlp_dim=token_mlp_dim,
+            channels_mlp_dim=channel_mlp_dim,
+        )
+
+    # @override
+    def _get_forward_method_name(self) -> str:
+        return "apply"
+
+    # @override
+    def _get_input_activations(self) -> Sequence[jax.Array]:
+        key = jax.random.PRNGKey(42)
+        random_image = jax.random.normal(key, (1, 196, 196, 3))
+        return random_image
+
+    # @override
+    def _get_forward_method_args(self):
+        ins = self._get_input_activations()
+        weights = Mixer_B_16_pretrained()
+        # Required to bypass "Initializer expected to generate shape (16, 16, 3, 768) but got shape (256, 3, 768)"
+        kernel = weights["params"]["stem"]["kernel"]
+        kernel = kernel.reshape(-1, 3, hidden_dim)
+        weights["params"]["stem"]["kernel"] = kernel
+
+        # Alternatively, weights could be randomly initialized like this:
+        # weights = self._model.init(jax.random.PRNGKey(42), ins, train=False)
+
+        return [weights, ins]
+
+    # @override
+    def _get_forward_method_kwargs(self) -> Dict[str, jax.Array]:
+        return {"train": False}
+
+
+# ----- Fixtures -----
+@pytest.fixture
+def inference_tester() -> MlpMixerTester:
+    return MlpMixerTester()
+
+
+@pytest.fixture
+def training_tester() -> MlpMixerTester:
+    return MlpMixerTester(RunMode.TRAINING)
+
+
+# ----- Tests -----
+@pytest.mark.skip(
+    reason="error: failed to legalize operation 'ttir.convolution' that was explicitly marked illegal"
+)
+def test_mlpmixer(inference_tester: MlpMixerTester):
+    """Run the tester's ``test()`` on the inference fixture (currently skipped)."""
+    inference_tester.test()
+
+
+@pytest.mark.skip(reason="Support for training not implemented")
+def test_mlpmixer_training(training_tester: MlpMixerTester):
+    """Run the tester's ``test()`` on the training fixture (currently skipped)."""
+    training_tester.test()
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,5 @@ lit @@
     pybind11
     pytest
     transformers
+    fsspec
+    einops