From 173eb918f453229430eb6242b6f0501fd359e47f Mon Sep 17 00:00:00 2001 From: lawrence-cj Date: Sun, 12 Jan 2025 14:53:11 +0800 Subject: [PATCH] update README.md; Signed-off-by: lawrence-cj --- README.md | 1 + asset/docs/model_zoo.md | 9 ++++----- diffusion/model/builder.py | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index edaf08a..9bffddf 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ As a result, Sana-0.6B is very competitive with modern giant diffusion model (e. ## 🔥🔥 News +- (🔥 New) \[2025/1/12\] DC-AE tiling enables Sana-4K to generate 4096x4096px images within 22GB GPU memory.[\[Guidance\]](asset/docs/model_zoo.md#-3-4k-models) - (🔥 New) \[2025/1/11\] Sana code-base license changed to Apache 2.0. - (🔥 New) \[2025/1/10\] Inference Sana with 8bit quantization.[\[Guidance\]](asset/docs/8bit_sana.md#quantization) - (🔥 New) \[2025/1/8\] 4K resolution [Sana models](asset/docs/model_zoo.md) is supported in [Sana-ComfyUI](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) and [work flow](asset/docs/ComfyUI/Sana_FlowEuler_4K.json) is also prepared. [\[4K guidance\]](asset/docs/ComfyUI/comfyui.md) diff --git a/asset/docs/model_zoo.md b/asset/docs/model_zoo.md index c1729d7..01ea915 100644 --- a/asset/docs/model_zoo.md +++ b/asset/docs/model_zoo.md @@ -77,11 +77,9 @@ image = pipe( image[0].save('sana.png') ``` -#### 2). For 4K models +## ❗ 3. 
4K models -4K models need [patch_conv](https://github.com/mit-han-lab/patch_conv) to avoid OOM issue.(80GB GPU is recommended) - -run `pip install patch_conv` first, then +4K models need VAE tiling to avoid OOM issue.(24GB GPU is recommended) ```python # run `pip install git+https://github.com/huggingface/diffusers` before use Sana in diffusers @@ -99,7 +97,8 @@ pipe.vae.to(torch.bfloat16) pipe.text_encoder.to(torch.bfloat16) # for 4096x4096 image generation OOM issue, feel free adjust the tile size -pipe.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_min_width=1024) +if pipe.transformer.config.sample_size == 128: +    pipe.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_min_width=1024) prompt = 'a cyberpunk cat with a neon sign that says "Sana"' image = pipe( diff --git a/diffusion/model/builder.py b/diffusion/model/builder.py index 82438f0..c2ab81f 100755 --- a/diffusion/model/builder.py +++ b/diffusion/model/builder.py @@ -135,6 +135,7 @@ def vae_decode(name, vae, latent): scaling_factor = ae.cfg.scaling_factor if ae.cfg.scaling_factor else 0.41407 if latent.shape[-1] * vae_scale_factor > 4000 or latent.shape[-2] * vae_scale_factor > 4000: from patch_conv import convert_model + ae = convert_model(ae, splits=4) samples = ae.decode(latent.detach() / scaling_factor) elif "AutoencoderDC" in name: