Merge pull request #89 from noskill/ref

Use sequential CPU offload for limited GPU RAM
singnet · Dec 17, 2024 · 05f2c07 · 05f2c07
2 parents f112432 + 7154d5a
commit 05f2c07
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 5 deletions.
diff --git a/multigen/worker.py b/multigen/worker.py
@@ -64,9 +64,15 @@ def _get_pipeline(self, pipe_class, model_id, model_type, cnet=None, quantize_dt
                 # use quantisation by default for now
                 cls = pipe_class._classflux
                 if device.type == 'cuda':
-                    quantize_dtype = qfloat8
-                    # offload_device = device.index
-                    # device = torch.device('cpu')
+                    mb = torch.cuda.get_device_properties(device.index).total_memory / 1024 / 1024
+                    # quantize when the GPU has more than 23000 MiB (~23 GB) of total memory;
+                    # otherwise keep the pipeline on the CPU and offload to this GPU
+                    if 23000 < mb:
+                        self.logger.debug(f"set quantisation for the pipe on cuda:{device.index} has {mb}Mb")
+                        quantize_dtype = qfloat8
+                    else:
+                        offload_device = device.index
+                        device = torch.device('cpu')
             else:
                 cls = pipe_class._class
             pipeline = self._loader.load_pipeline(cls, model_id, torch_dtype=torch.bfloat16, 

diff --git a/requirements.txt b/requirements.txt
@@ -10,4 +10,4 @@ psutil
 sentencepiece
 protobuf
 accelerate
-optimum-quanto
+optimum-quanto==0.2.4
diff --git a/tests/test_worker_flux.py b/tests/test_worker_flux.py
@@ -68,7 +68,7 @@ def on_new_image(*args, **kwargs):
             nonlocal c
             c += 1
 
-        num_runs = 25
+        num_runs = 15
         for i in range(num_runs):
             if len(sessions) - 1 < i:
                 i %= len(sessions)