Add Replicate demo and API #191

Open · wants to merge 5 commits into base: main
1 change: 1 addition & 0 deletions README.md
@@ -14,6 +14,7 @@
</p>

## Recent updates
- 🔥 **News**: ``2024/9/25``: Web demos are available on Replicate! Try CogVLM2 here [![Replicate](https://replicate.com/chenxwh/cogvlm2/badge)](https://replicate.com/chenxwh/cogvlm2) and CogVLM2-Video here [![Replicate](https://replicate.com/chenxwh/cogvlm2-video/badge)](https://replicate.com/chenxwh/cogvlm2-video).
- 🔥 **News**: ``2024/8/30``: The [CogVLM2 paper](https://arxiv.org/abs/2408.16500) has been published on arXiv.
- 🔥 **News**: ``2024/7/12``: We have released the CogVLM2-Video [online web demo](http://cogvlm2-online.cogviewai.cn:7868/); feel free to try it out.
- 🔥 **News**: ``2024/7/8``: We released the video understanding version of the CogVLM2 model, the CogVLM2-Video model.
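For context, the hosted image demo linked above can also be called programmatically. Below is a minimal sketch using the Replicate Python client (not part of this PR); it assumes `pip install replicate` and a `REPLICATE_API_TOKEN` in the environment, the model slug comes from the badge URL, the input names mirror `predict.py` further down, and `example.jpg` is a placeholder file:

```python
# Sketch: querying the hosted CogVLM2 image demo through the Replicate Python client.
# The version hash is omitted; recent client versions resolve the latest version
# and upload a local file handle passed as an input value.
import replicate

output = replicate.run(
    "chenxwh/cogvlm2",
    input={
        "input_image": open("example.jpg", "rb"),  # placeholder local image
        "prompt": "Describe this image.",
        "top_p": 0.9,
        "temperature": 0.7,
        "max_new_tokens": 2048,
    },
)
print(output)
```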
40 changes: 40 additions & 0 deletions web_demo/replicate/cog.yaml
@@ -0,0 +1,40 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
# set to true if your model requires a GPU
gpu: true

# a list of ubuntu apt packages to install
system_packages:
- "libgl1-mesa-glx"
- "libglib2.0-0"

# python version in the form '3.11' or '3.11.4'
python_version: "3.11"

# a list of packages in the format <package-name>==<version>
python_packages:
- decord>=0.6.0
- pytorchvideo==0.1.5
- xformers
- torch==2.1.0
- torchvision==0.16.0
- transformers==4.42.4
- huggingface-hub>=0.23.0
- pillow
- chainlit>=1.0
- timm>=0.9.16
- openai>=1.30.1
- loguru>=0.7.2
- accelerate
- einops
- sse-starlette>=2.1.0
- bitsandbytes>=0.43.1 # for int4 quantization
run:
- pip install ipython
- curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

# predict.py defines how predictions are run on your model
# predict: "predict.py:Predictor"
predict: "predict_video.py:Predictor"
108 changes: 108 additions & 0 deletions web_demo/replicate/predict.py
@@ -0,0 +1,108 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python


import os
import time
import subprocess
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from cog import BasePredictor, Input, Path

MODEL_CACHE = "model_cache_image"
MODEL_URL = (
f"https://weights.replicate.delivery/default/THUDM/CogVLM2/{MODEL_CACHE}.tar"
)
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HOME"] = MODEL_CACHE
os.environ["TORCH_HOME"] = MODEL_CACHE
os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE
os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE

TORCH_TYPE = torch.bfloat16
DEVICE = "cuda:0"


def download_weights(url, dest):
start = time.time()
print("downloading url: ", url)
print("downloading to: ", dest)
subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
def setup(self) -> None:
"""Load the model into memory to make running multiple predictions efficient"""

if not os.path.exists(MODEL_CACHE):
download_weights(MODEL_URL, MODEL_CACHE)

# model_id: THUDM/cogvlm2-llama3-chat-19B, use 8 bit quantization
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_CACHE,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(load_in_8bit=True),
low_cpu_mem_usage=True,
).eval()

self.tokenizer = AutoTokenizer.from_pretrained(
MODEL_CACHE, trust_remote_code=True
)

def predict(
self,
input_image: Path = Input(description="Input image"),
prompt: str = Input(description="Input prompt", default="Describe this image."),
top_p: float = Input(
description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens",
ge=0.0,
le=1.0,
default=0.9,
),
temperature: float = Input(
description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic",
default=0.7,
ge=0.0,
),
max_new_tokens: int = Input(
description="Maximum number of tokens to generate. A word is generally 2-3 tokens",
default=2048,
ge=0,
),
) -> str:
"""Run a single prediction on the model"""
image = Image.open(str(input_image)).convert("RGB")

input_by_model = self.model.build_conversation_input_ids(
self.tokenizer, query=prompt, images=[image], template_version="chat"
)

inputs = {
"input_ids": input_by_model["input_ids"].unsqueeze(0).to(DEVICE),
"token_type_ids": input_by_model["token_type_ids"].unsqueeze(0).to(DEVICE),
"attention_mask": input_by_model["attention_mask"].unsqueeze(0).to(DEVICE),
"images": (
[[input_by_model["images"][0].to(DEVICE).to(TORCH_TYPE)]]
if image is not None
else None
),
}
gen_kwargs = {
"max_new_tokens": max_new_tokens,
"pad_token_id": 128002,
# "top_k": 1,
"do_sample": True,
"top_p": top_p,
"temperature": temperature,
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs["input_ids"].shape[1] :]
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

return response
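For local debugging outside the Cog server, the predictor can also be instantiated directly. A minimal sketch follows (placeholder image path; all arguments are passed explicitly because the `Input(...)` defaults are only applied when Cog drives `predict()`):

```python
# Sketch: smoke-testing the image predictor without the Cog HTTP server.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads weights into model_cache_image on first run

print(
    predictor.predict(
        input_image="example.jpg",  # placeholder; Image.open(str(...)) accepts a plain path
        prompt="Describe this image.",
        top_p=0.9,
        temperature=0.7,
        max_new_tokens=512,
    )
)
```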
142 changes: 142 additions & 0 deletions web_demo/replicate/predict_video.py
@@ -0,0 +1,142 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python


import os
import io
import time
import subprocess
import numpy as np
import torch
from PIL import Image
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from cog import BasePredictor, Input, Path

MODEL_CACHE = "model_cache_video"
MODEL_URL = (
f"https://weights.replicate.delivery/default/THUDM/CogVLM2/{MODEL_CACHE}.tar"
)
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HOME"] = MODEL_CACHE
os.environ["TORCH_HOME"] = MODEL_CACHE
os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE
os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE

TORCH_TYPE = torch.bfloat16
DEVICE = "cuda:0"


def download_weights(url, dest):
start = time.time()
print("downloading url: ", url)
print("downloading to: ", dest)
subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
def setup(self) -> None:
"""Load the model into memory to make running multiple predictions efficient"""

if not os.path.exists(MODEL_CACHE):
download_weights(MODEL_URL, MODEL_CACHE)

# model_id: THUDM/cogvlm2-video-llama3-chat, use 8 bit quantization
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_CACHE,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_8bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True,
).eval()

self.tokenizer = AutoTokenizer.from_pretrained(
MODEL_CACHE, trust_remote_code=True
)

def predict(
self,
input_video: Path = Input(description="Input video"),
prompt: str = Input(description="Input prompt", default="Describe this video."),
top_p: float = Input(
description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens",
ge=0.0,
le=1.0,
default=0.1,
),
temperature: float = Input(
description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic",
default=0.1,
ge=0.0,
),
max_new_tokens: int = Input(
description="Maximum number of tokens to generate. A word is generally 2-3 tokens",
default=2048,
ge=0,
),
) -> str:
"""Run a single prediction on the model"""
video = load_video(str(input_video))

inputs = self.model.build_conversation_input_ids(
tokenizer=self.tokenizer,
query=prompt,
images=[video],
template_version="chat",
)

inputs = {
"input_ids": inputs["input_ids"].unsqueeze(0).to(DEVICE),
"token_type_ids": inputs["token_type_ids"].unsqueeze(0).to(DEVICE),
"attention_mask": inputs["attention_mask"].unsqueeze(0).to(DEVICE),
"images": [[inputs["images"][0].to("cuda").to(TORCH_TYPE)]],
}
gen_kwargs = {
"max_new_tokens": max_new_tokens,
"pad_token_id": 128002,
# "top_k": 1,
"do_sample": True,
"top_p": top_p,
"temperature": temperature,
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs["input_ids"].shape[1] :]
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

return response


def load_video(video_path):
bridge.set_bridge("torch")
with open(video_path, "rb") as f:
mp4_stream = f.read()
num_frames = 24

if mp4_stream is not None:
decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
else:
decord_vr = VideoReader(video_path, ctx=cpu(0))
frame_id_list = None
total_frames = len(decord_vr)

    # 'chat' sampling strategy: take the frame closest to each whole second, up to num_frames frames
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = []
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index)
if len(frame_id_list) >= num_frames:
break
video_data = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
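As a quick way to eyeball `load_video`'s sampling behaviour (the frame closest to each whole second, capped at 24 frames, returned channels-first), a check along these lines could be used; `example.mp4` is a placeholder:

```python
# Sketch: inspecting load_video's output; expects a tensor shaped (3, <=24, H, W).
from predict_video import load_video

frames = load_video("example.mp4")  # placeholder clip
print(frames.shape)  # e.g. torch.Size([3, 24, 720, 1280]) for a clip of 24 s or longer
```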