From 6c0a0380ffe135f8d254ba982f34ce5e8c160418 Mon Sep 17 00:00:00 2001
From: Anatoly Belikov
Date: Wed, 11 Dec 2024 15:17:32 +0300
Subject: [PATCH] use sequential cpu offload for limited gpu ram

---
 multigen/worker.py        | 12 +++++++++---
 requirements.txt          |  2 +-
 tests/test_worker_flux.py |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/multigen/worker.py b/multigen/worker.py
index 413fa62..822e40f 100755
--- a/multigen/worker.py
+++ b/multigen/worker.py
@@ -64,9 +64,15 @@ def _get_pipeline(self, pipe_class, model_id, model_type, cnet=None, quantize_dt
             # use quantisation by default for now
             cls = pipe_class._classflux
             if device.type == 'cuda':
-                quantize_dtype = qfloat8
-                # offload_device = device.index
-                # device = torch.device('cpu')
+                mb = torch.cuda.get_device_properties(device.index).total_memory / 1024 / 1024
+                # quantize if there is more than 23 GB of memory
+                # if less use cpu offload
+                if 23000 < mb:
+                    self.logger.debug(f"set quantisation for the pipe on cuda:{device.index} has {mb}Mb")
+                    quantize_dtype = qfloat8
+                else:
+                    offload_device = device.index
+                    device = torch.device('cpu')
         else:
             cls = pipe_class._class
         pipeline = self._loader.load_pipeline(cls, model_id, torch_dtype=torch.bfloat16,
diff --git a/requirements.txt b/requirements.txt
index 3a5edf0..5b68b48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ psutil
 sentencepiece
 protobuf
 accelerate
-optimum-quanto
+optimum-quanto==0.2.4
diff --git a/tests/test_worker_flux.py b/tests/test_worker_flux.py
index 1b4a70c..ae00911 100755
--- a/tests/test_worker_flux.py
+++ b/tests/test_worker_flux.py
@@ -68,7 +68,7 @@ def on_new_image(*args, **kwargs):
         nonlocal c
         c += 1
 
-    num_runs = 25
+    num_runs = 15
     for i in range(num_runs):
         if len(sessions) - 1 < i:
             i %= len(sessions)
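
The new branch in _get_pipeline sizes up the card before choosing a strategy: torch.cuda.get_device_properties(index).total_memory reports bytes, hence the two divisions by 1024 to get mebibytes. For context, a minimal standalone sketch of the same check; pick_strategy and threshold_mb are illustrative names, not code from this repository:

    import torch

    def pick_strategy(device: torch.device, threshold_mb: float = 23000.0) -> str:
        """Return 'quantize' on large-VRAM GPUs, 'cpu_offload' otherwise."""
        if device.type != 'cuda':
            return 'plain'
        # total_memory is in bytes; convert to mebibytes as the patch does
        mb = torch.cuda.get_device_properties(device.index).total_memory / 1024 / 1024
        return 'quantize' if mb > threshold_mb else 'cpu_offload'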
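
On the large-VRAM path the pipe is quantised to qfloat8 via optimum-quanto, which the patch pins to 0.2.4. The usual quanto pattern for a Flux pipeline looks roughly like the sketch below; the model id is an assumption, and the repo's loader presumably applies quantize_dtype in an equivalent way:

    import torch
    from diffusers import FluxPipeline
    from optimum.quanto import freeze, qfloat8, quantize

    # Model id is an assumption, for illustration only.
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    # Swap the transformer's weights for 8-bit float versions, then freeze
    # so the quantised tensors become the module's parameters.
    quantize(pipe.transformer, weights=qfloat8)
    freeze(pipe.transformer)
    pipe.to('cuda')

The 23000 MB threshold presumably reflects that even a quantised Flux pipeline still occupies most of a 24 GB card once the text encoders are loaded, so anything smaller goes to offloading instead.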
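
On the low-VRAM path the patch records offload_device and retargets loading to CPU, presumably so the loader can enable sequential CPU offload afterwards, i.e. the mechanism named in the subject line. With stock diffusers that looks like:

    import torch
    from diffusers import FluxPipeline

    # Model id is an assumption, for illustration only.
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    # accelerate keeps all weights in system RAM and streams each submodule
    # to the GPU only for its forward pass, so peak VRAM stays small at the
    # cost of throughput.
    pipe.enable_sequential_cpu_offload(device=torch.device('cuda', 0))
    image = pipe('a photo of a cat', num_inference_steps=4).images[0]

The slowdown from offloading is also a plausible reason the test loop drops num_runs from 25 to 15.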