From 6c0a0380ffe135f8d254ba982f34ce5e8c160418 Mon Sep 17 00:00:00 2001
From: Anatoly Belikov
Date: Wed, 11 Dec 2024 15:17:32 +0300
Subject: [PATCH] use sequential cpu offload for limited gpu ram

---
 multigen/worker.py        | 12 +++++++++---
 requirements.txt          |  2 +-
 tests/test_worker_flux.py |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/multigen/worker.py b/multigen/worker.py
index 413fa62..822e40f 100755
--- a/multigen/worker.py
+++ b/multigen/worker.py
@@ -64,9 +64,15 @@ def _get_pipeline(self, pipe_class, model_id, model_type, cnet=None, quantize_dt
             # use quantisation by default for now
             cls = pipe_class._classflux
             if device.type == 'cuda':
-                quantize_dtype = qfloat8
-                # offload_device = device.index
-                # device = torch.device('cpu')
+                mb = torch.cuda.get_device_properties(device.index).total_memory / 1024 / 1024
+                # quantize if there is more than 23 GB of memory
+                # if less use cpu offload
+                if 23000 < mb:
+                    self.logger.debug(f"set quantisation for the pipe on cuda:{device.index} has {mb}Mb")
+                    quantize_dtype = qfloat8
+                else:
+                    offload_device = device.index
+                    device = torch.device('cpu')
         else:
             cls = pipe_class._class
         pipeline = self._loader.load_pipeline(cls, model_id, torch_dtype=torch.bfloat16,
diff --git a/requirements.txt b/requirements.txt
index 3a5edf0..5b68b48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ psutil
 sentencepiece
 protobuf
 accelerate
-optimum-quanto
+optimum-quanto==0.2.4
diff --git a/tests/test_worker_flux.py b/tests/test_worker_flux.py
index 1b4a70c..ae00911 100755
--- a/tests/test_worker_flux.py
+++ b/tests/test_worker_flux.py
@@ -68,7 +68,7 @@ def on_new_image(*args, **kwargs):
         nonlocal c
         c += 1
 
-    num_runs = 25
+    num_runs = 15
     for i in range(num_runs):
         if len(sessions) - 1 < i:
             i %= len(sessions)
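
The new branch in _get_pipeline sizes up the card before choosing a strategy: torch.cuda.get_device_properties(index).total_memory reports bytes, hence the two divisions by 1024 to get mebibytes. For context, a minimal standalone sketch of the same check; pick_strategy and threshold_mb are illustrative names, not code from this repository:

    import torch

    def pick_strategy(device: torch.device, threshold_mb: float = 23000.0) -> str:
        """Return 'quantize' on large-VRAM GPUs, 'cpu_offload' otherwise."""
        if device.type != 'cuda':
            return 'plain'
        # total_memory is in bytes; convert to mebibytes as the patch does
        mb = torch.cuda.get_device_properties(device.index).total_memory / 1024 / 1024
        return 'quantize' if mb > threshold_mb else 'cpu_offload'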
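
On the large-VRAM path the pipe is quantised to qfloat8 via optimum-quanto, which the patch pins to 0.2.4. The usual quanto pattern for a Flux pipeline looks roughly like the sketch below; the model id is an assumption, and the repo's loader presumably applies quantize_dtype in an equivalent way:

    import torch
    from diffusers import FluxPipeline
    from optimum.quanto import freeze, qfloat8, quantize

    # Model id is an assumption, for illustration only.
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    # Swap the transformer's weights for 8-bit float versions, then freeze
    # so the quantised tensors become the module's parameters.
    quantize(pipe.transformer, weights=qfloat8)
    freeze(pipe.transformer)
    pipe.to('cuda')

The 23000 MB threshold presumably reflects that even a quantised Flux pipeline still occupies most of a 24 GB card once the text encoders are loaded, so anything smaller goes to offloading instead.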
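
On the low-VRAM path the patch records offload_device and retargets loading to CPU, presumably so the loader can enable sequential CPU offload afterwards, i.e. the mechanism named in the subject line. With stock diffusers that looks like:

    import torch
    from diffusers import FluxPipeline

    # Model id is an assumption, for illustration only.
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    # accelerate keeps all weights in system RAM and streams each submodule
    # to the GPU only for its forward pass, so peak VRAM stays small at the
    # cost of throughput.
    pipe.enable_sequential_cpu_offload(device=torch.device('cuda', 0))
    image = pipe('a photo of a cat', num_inference_steps=4).images[0]

The slowdown from offloading is also a plausible reason the test loop drops num_runs from 25 to 15.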