Add the option to select the openclip model #284

Closed
wants to merge 3 commits
README.md (6 changes: 3 additions & 3 deletions)
@@ -169,7 +169,7 @@ clip_inference turn a set of text+image into clip embeddings
* **write_batch_size** Write batch size (default *10**6*)
* **wds_image_key** Key to use for images in webdataset. (default *jpg*)
* **wds_caption_key** Key to use for captions in webdataset. (default *txt*)
- * **clip_model** CLIP model to load (default *ViT-B/32*). Specify it as `"open_clip:ViT-B-32-quickgelu"` to use the [open_clip](https://github.com/mlfoundations/open_clip).
+ * **clip_model** CLIP model to load (default *ViT-B/32*). Specify it as `"open_clip:ViT-B-32-quickgelu"` to use the [open_clip](https://github.com/mlfoundations/open_clip). You can also select a specific pretrained open_clip checkpoint (downloaded on first use), for example `"open_clip:ViT-L-14 | datacomp_xl_s13b_b90k"`. To list the available open_clip models and checkpoints, run `import open_clip; print(open_clip.list_pretrained())` (a short discovery sketch follows this option list).
* **mclip_model** MCLIP model to load (default *sentence-transformers/clip-ViT-B-32-multilingual-v1*)
* **use_mclip** If False it performs the inference using CLIP; MCLIP otherwise (default *False*)
* **use_jit** uses jit for the clip model (default *True*)
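For quick reference, here is a minimal discovery sketch (not part of this PR's diff) that prints every architecture/checkpoint pair open_clip ships, in the same `"open_clip:<model> | <checkpoint>"` form the `clip_model` option expects. It assumes the `open_clip_torch` package is installed:

```python
import open_clip

# list_pretrained() yields (model_name, checkpoint_tag) tuples,
# e.g. ("ViT-L-14", "datacomp_xl_s13b_b90k").
for model_name, checkpoint in open_clip.list_pretrained():
    print(f"open_clip:{model_name} | {checkpoint}")
```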
@@ -223,7 +223,7 @@ The API is very similar to `clip-retrieval inference` with some minor changes:
* **enable_metadata** Enable metadata processing (default *False*)
* **wds_image_key** Key to use for images in webdataset. (default *jpg*)
* **wds_caption_key** Key to use for captions in webdataset. (default *txt*)
- * **clip_model** CLIP model to load (default *ViT-B/32*). Specify it as `"open_clip:ViT-B-32-quickgelu"` to use the [open_clip](https://github.com/mlfoundations/open_clip).
+ * **clip_model** CLIP model to load (default *ViT-B/32*). Specify it as `"open_clip:ViT-B-32-quickgelu"` to use the [open_clip](https://github.com/mlfoundations/open_clip). You can also select a specific pretrained open_clip checkpoint (downloaded on first use), for example `"open_clip:ViT-L-14 | datacomp_xl_s13b_b90k"`. To list the available open_clip models and checkpoints, run `import open_clip; print(open_clip.list_pretrained())`.
* **mclip_model** MCLIP model to load (default *sentence-transformers/clip-ViT-B-32-multilingual-v1*)
* **use_mclip** If False it performs the inference using CLIP; MCLIP otherwise (default *False*)
* **use_jit** uses jit for the clip model (default *True*)
@@ -304,7 +304,7 @@ clip-retrieval back --port 1234 --indices-paths indices_paths.json

Options:
* `--use_jit True` uses jit for the clip model
- * `--clip_model "ViT-B/32"` allows choosing the clip model to use. Prefix with `"open_clip:"` to use an [open_clip](https://github.com/mlfoundations/open_clip) model.
+ * `--clip_model "ViT-B/32"` allows choosing the clip model to use. Prefix with `"open_clip:"` to use an [open_clip](https://github.com/mlfoundations/open_clip) model. You can also select a specific pretrained open_clip checkpoint (downloaded on first use), for example `"open_clip:ViT-L-14 | datacomp_xl_s13b_b90k"`. To list the available open_clip models and checkpoints, run `import open_clip; print(open_clip.list_pretrained())` (a quick load check is sketched after this options list).
* `--enable_mclip_option True` loads the mclip model, making it possible to search in any language.
* `--columns_to_return='["url", "image_path", "caption", "NSFW"]` allows you to specify which columns should be fetched from the metadata and returned by the backend. It's useful to specify less in case of hdf5 caching to speed up the queries.
* `--enable_faiss_memory_mapping=True` option can be passed to use an index with memory mapping.
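As referenced above, here is a hedged sanity check (not part of this PR's diff): load the model string once on CPU before starting the backend, so a typo in the model or checkpoint name fails fast. It assumes `clip-retrieval` is installed and reuses `load_clip_without_warmup` from `clip_retrieval/load_clip.py` as modified below; note that the checkpoint is downloaded on first use and can be large:

```python
from clip_retrieval.load_clip import load_clip_without_warmup

# Same string you would pass to --clip_model; the part after " | " selects the checkpoint.
model, preprocess = load_clip_without_warmup(
    "open_clip:ViT-L-14 | datacomp_xl_s13b_b90k",
    use_jit=False,  # jit is unnecessary for a smoke test
    device="cpu",   # CPU is enough to validate the name and fetch the weights
    clip_cache_path=None,
)
print("loaded:", type(model).__name__)
```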
clip_retrieval/load_clip.py (10 changes: 8 additions & 2 deletions)
@@ -44,8 +44,10 @@ def load_open_clip(clip_model, use_jit=True, device="cuda", clip_cache_path=None

torch.backends.cuda.matmul.allow_tf32 = True

- pretrained = dict(open_clip.list_pretrained())
- checkpoint = pretrained[clip_model]
+ clip_model = clip_model.split(" | ")
+ checkpoint = dict(open_clip.list_pretrained())[clip_model[0]] if len(clip_model) < 2 else clip_model[1]
+ clip_model = clip_model[0]
+ print(f"Loading OpenClip model {clip_model} with {checkpoint} checkpoint")
model, _, preprocess = open_clip.create_model_and_transforms(
clip_model, pretrained=checkpoint, device=device, jit=use_jit, cache_dir=clip_cache_path
)
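To spell out what the added lines do, here is a standalone sketch that mirrors the parsing above (it is not part of the diff, and the helper name `resolve_open_clip` is made up for illustration): a plain architecture name keeps the previous behaviour of looking up the checkpoint that `open_clip.list_pretrained()` pairs with it, while a `"model | checkpoint"` string selects an explicit checkpoint:

```python
import open_clip

def resolve_open_clip(clip_model: str):
    """Hypothetical helper mirroring the parsing added in load_open_clip."""
    parts = clip_model.split(" | ")
    if len(parts) < 2:
        # No explicit checkpoint: fall back to the tag dict(list_pretrained()) holds for this model.
        checkpoint = dict(open_clip.list_pretrained())[parts[0]]
    else:
        # Explicit checkpoint given after " | ".
        checkpoint = parts[1]
    return parts[0], checkpoint

print(resolve_open_clip("ViT-B-32-quickgelu"))                # checkpoint taken from list_pretrained()
print(resolve_open_clip("ViT-L-14 | datacomp_xl_s13b_b90k"))  # ('ViT-L-14', 'datacomp_xl_s13b_b90k')
```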
@@ -61,6 +63,8 @@ def get_tokenizer(clip_model):
import open_clip # pylint: disable=import-outside-toplevel

clip_model = clip_model[len("open_clip:") :]
+ clip_model = clip_model.split(" | ")
+ clip_model = clip_model[0]
return open_clip.get_tokenizer(clip_model)
else:
return lambda t: clip.tokenize(t, truncate=True)
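A short usage sketch (not part of the diff) showing why the tokenizer path also needs the split: `open_clip.get_tokenizer` only takes the architecture name, so the checkpoint part of the string is dropped. It assumes `clip-retrieval` and `open_clip_torch` are installed:

```python
from clip_retrieval.load_clip import get_tokenizer

# The " | datacomp_xl_s13b_b90k" suffix is stripped before open_clip.get_tokenizer is called.
tokenize = get_tokenizer("open_clip:ViT-L-14 | datacomp_xl_s13b_b90k")
tokens = tokenize(["a photo of a cat", "a photo of a dog"])
print(tokens.shape)  # e.g. torch.Size([2, 77]) for CLIP-style tokenizers
```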
@@ -71,6 +75,8 @@ def load_clip_without_warmup(clip_model, use_jit, device, clip_cache_path):
"""Load clip"""
if clip_model.startswith("open_clip:"):
clip_model = clip_model[len("open_clip:") :]
+ clip_model = clip_model.split(" | ")
+ clip_model = clip_model[0]
model, preprocess = load_open_clip(clip_model, use_jit, device, clip_cache_path)
else:
model, preprocess = clip.load(clip_model, device=device, jit=use_jit, download_root=clip_cache_path)