Improve: Equivalent encode_multimodal
ashvardanian committed Mar 28, 2024
1 parent 4754253 commit 08df284
Showing 6 changed files with 130 additions and 39 deletions.
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -7,11 +7,14 @@
"logits",
"multimodal",
"ndarray",
"numpy",
"ONNX",
"onnxruntime",
"preprocess",
"pretrained",
"probs",
"pypi",
"rerank",
"reranker",
"reranking",
"softmax",
3 changes: 3 additions & 0 deletions CONTRIBUTING.md
@@ -5,9 +5,12 @@ Before submitting any changes, please make sure that the tests pass.

```sh
pip install -e . # For core dependencies

pip install -e ".[torch]" # For PyTorch
pip install -e ".[onnx]" # For ONNX on CPU
pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms
pip install -e ".[torch,onnx]" # For PyTorch and ONNX Python tests

pytest python/scripts/ -s -x -Wd -v
pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch
```
128 changes: 102 additions & 26 deletions README.md
@@ -36,13 +36,14 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr

## Features

- __Tiny Embeddings__: 64-dimensional Matryoshka-style embeddings for extremely fast [search][usearch].
- __Tiny Embeddings__: 64-dimensional [Matryoshka][matryoshka]-style embeddings for extremely fast [search][usearch].
- __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors.
- __Portable__: Models come with native ONNX support, making them easy to deploy on any platform.
- __Quantization Aware__: Downcasted embeddings from `f32` to `i8` without losing much recall.
- __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall.
- __Multilingual__: Trained on a balanced dataset, the models deliver great recall across more than [20 languages](#evaluation).

[usearch]: https://github.com/unum-cloud/usearch
[matryoshka]: https://arxiv.org/abs/2205.13147

## Models

@@ -118,27 +119,116 @@ similarity = (image_embedding * text_embedding).sum(axis=1)

To search for similar items, the embeddings can be compared using cosine similarity.
The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match.
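
For instance, here is a minimal sketch of that comparison with NumPy, assuming both embeddings are plain `f32` vectors; the helper below is illustrative rather than part of the UForm API:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Normalize both vectors, then take their dot product
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
```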

### Reranking

Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list.
The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match.

```python
# For PyTorch
joint_embedding = model.encode_multimodal(
image_features=image_features,
text_features=text_features,
attention_mask=text_data['attention_mask']
)
score = model.get_matching_scores(joint_embedding)

# For ONNX
score, joint_embedding = model.encode_multimodal(
image_features=image_features,
text_features=text_features,
attention_mask=text_data['attention_mask'],
return_scores=True
return_scores=True,
)
```

### Down-casting, Quantization, Matryoshka, and Slicing

Depending on the application, the embeddings can be down-cast to smaller numeric representations without losing much recall.
Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support.
Switching to `i8` with linear scaling is also possible, but the loss of recall becomes noticeable on larger collections with millions of searchable entries.
Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search.

```python
import numpy as np

f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
```

An alternative approach to quantization is to use Matryoshka embeddings, where the embeddings are sliced into smaller parts and the search is performed in a hierarchical manner.

```python
import numpy as np

large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
small_embedding: np.ndarray = large_embedding[:, :256]
tiny_embedding: np.ndarray = large_embedding[:, :64]
```

Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics library.
When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement over Torch, NumPy, SciPy, and vanilla Python using SimSIMD][report-simsimd].

```python
from simsimd import cosine, hamming

distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU
distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU
distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU
distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU
```

Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement over FAISS and other vector-search solutions using USearch][report-usearch].
Here are a couple of examples:

```python
from usearch.index import Index

f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings
f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings
i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings
b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings
```
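
As a quick usage sketch, adding and querying such an index follows the usual USearch pattern; the integer key and the random query vector below are placeholders, not values from this README:

```python
import numpy as np
from usearch.index import Index

index = Index(ndim=64, metric='cos', dtype='f16')
vector = np.random.rand(64).astype(np.float32)  # placeholder embedding
index.add(42, vector)                           # 42 is an arbitrary integer key
matches = index.search(vector, 10)              # retrieve the 10 nearest neighbors
```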

[github-usearch]: https://github.com/unum-cloud/usearch
[github-simsimd]: https://github.com/ashvardanian/simsimd
[report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel
[report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/

### Compact Packaging

PyTorch is a heavy dependency to carry, especially if you run on edge or IoT devices.
Using the vanilla ONNX runtime, one can significantly reduce memory consumption and deployment latency.

```sh
$ conda create -n uform_torch python=3.10 -y
$ conda create -n uform_onnx python=3.10 -y
$ conda activate uform_torch && pip install -e ".[torch]" && conda deactivate
$ conda activate uform_onnx && pip install -e ".[onnx]" && conda deactivate
$ du -sh $(conda info --envs | grep 'uform_torch' | awk '{print $2}')
> 5.2G ~/conda/envs/uform_torch
$ du -sh $(conda info --envs | grep 'uform_onnx' | awk '{print $2}')
> 461M ~/conda/envs/uform_onnx
```

Most of that weight can be further reduced to 100 MB for both the model and the runtime.
You can pick one of many supported [ONNX execution providers][onnx-providers], which include XNNPACK, CUDA and TensorRT for Nvidia GPUs, OpenVINO on Intel, DirectML on Windows, ROCm on AMD, CoreML on Apple devices, and more to come.

[onnx-providers]: https://onnxruntime.ai/docs/execution-providers/
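
As a minimal sketch of provider selection with the `onnxruntime` package directly (the model path and provider list below are illustrative, not part of the UForm API):

```python
import onnxruntime as ort

session = ort.InferenceSession(
    "text_encoder.onnx",  # placeholder path to an exported ONNX model
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # providers actually available at runtime
```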

---

The configuration process may include a few additional steps, depending on the environment.
When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.

```sh
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12
pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
export CUDA_PATH="/usr/local/cuda-12/bin"
export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
pytest python/scripts/test_embeddings.py -s -x -Wd -v
```

[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu

## Chat, Image Captioning and Question Answering

UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
@@ -335,20 +425,6 @@ Results for VQAv2 evaluation.
> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section. <br/>
> ³ We used the `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.

## Size

Torch is a heavy dependency and most models are too large to run on edge and IoT devices.
Using the ONNX runtime, one can significantly reduce memory consumption and deployment latency.

```sh
$ conda create -n env_torch python=3.10 -y
$ conda create -n env_onnx python=3.10 -y
$ conda activate env_torch && pip install -e ".[torch]" && conda deactivate
$ conda activate env_onnx && pip install -e ".[onnx-gpu]" && conda deactivate
$ du -sh $(conda info --envs | grep 'env_torch' | awk '{print $2}')
$ du -sh $(conda info --envs | grep 'env_onnx' | awk '{print $2}')
```

## Speed

On Nvidia RTX 3090, the following performance is expected on text encoding.
23 changes: 13 additions & 10 deletions python/scripts/test_embeddings.py
@@ -21,17 +21,17 @@
onnx_available = False

torch_models = [
# "unum-cloud/uform-vl-english",
# "unum-cloud/uform-vl-multilingual-v2",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]

onnx_models_and_providers = [
# ("unum-cloud/uform-vl-english-large", "cpu", "fp32"),
# ("unum-cloud/uform-vl-english-small", "cpu", "fp32"),
# ("unum-cloud/uform-vl-english-large", "gpu", "fp32"),
# ("unum-cloud/uform-vl-english-small", "gpu", "fp32"),
("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
("unum-cloud/uform-vl-english-small", "cpu", "fp32"),
("unum-cloud/uform-vl-english-large", "cpu", "fp32"),
("unum-cloud/uform-vl-english-small", "gpu", "fp32"),
("unum-cloud/uform-vl-english-large", "gpu", "fp32"),
("unum-cloud/uform-vl-english-small", "gpu", "fp16"),
("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
]

@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@@ -52,11 +52,14 @@ def test_torch_one_embedding(model_name: str):
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"

# Test reranking
joint_embedding = model.encode_multimodal(
image_features=image_features, text_features=text_features, attention_mask=text_data["attention_mask"]
score, joint_embedding = model.encode_multimodal(
image_features=image_features,
text_features=text_features,
attention_mask=text_data["attention_mask"],
return_scores=True,
)
score = model.get_matching_scores(joint_embedding)
assert score.shape[0] == 1, "Matching score batch size is not 1"
assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"


@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
2 changes: 1 addition & 1 deletion python/uform/onnx_models.py
@@ -155,7 +155,7 @@ def encode_multimodal(
text_features: Optional[ndarray] = None,
attention_mask: Optional[ndarray] = None,
return_scores: bool = False,
) -> ndarray:
) -> Union[ndarray, Tuple[ndarray, ndarray]]:
"""Passes preprocessed texts (or precomputed texts features) and
preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings.
10 changes: 8 additions & 2 deletions python/uform/torch_models.py
@@ -398,7 +398,8 @@ def encode_multimodal(
image_features: Optional[Tensor] = None,
text_features: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
) -> Tensor:
return_scores: bool = False,
) -> Union[Tensor, Tuple[Tensor, Tensor]]:
"""Passes preprocessed texts (or precomputed texts features) and
preprocessed images (or precomputed images features) through multimodal encoded to produce multimodal joint embeddings.
@@ -424,11 +425,16 @@ text["attention_mask"],
text["attention_mask"],
)

return self.text_encoder.forward_multimodal(
embeddings = self.text_encoder.forward_multimodal(
text_features,
attention_mask if attention_mask is not None else text["attention_mask"],
image_features,
)

if return_scores:
return self.get_matching_scores(embeddings), embeddings

return embeddings

def get_matching_scores(self, embeddings: Tensor) -> Tensor:
"""Computes the probability that there is a match between images and texts based on their multimodal embeddings
