From 6a9de1fcf3e1c50b5dbdc537934708b75994e2c8 Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Mon, 6 Jan 2025 20:38:17 +0000
Subject: [PATCH] Change definition of VRAM in use for the ModelCache from sum
 of model weights to the total torch.cuda.memory_allocated().

---
 .../load/model_cache/model_cache.py           | 38 ++++++++++++-------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
index f1d3f8cf9ef..98462a54c53 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -247,7 +247,6 @@ def unlock(self, cache_entry: CacheRecord) -> None:
     def _load_locked_model(self, cache_entry: CacheRecord) -> None:
         """Helper function for self.lock(). Loads a locked model into VRAM."""
         start_time = time.time()
-        vram_available = self._get_vram_available()
 
         # Calculate model_vram_needed, the amount of additional VRAM that will be used if we fully load the model into
         # VRAM.
@@ -255,9 +254,7 @@ def _load_locked_model(self, cache_entry: CacheRecord) -> None:
         model_total_bytes = cache_entry.cached_model.total_bytes()
         model_vram_needed = model_total_bytes - model_cur_vram_bytes
 
-        # The amount of VRAM that must be freed to make room for model_vram_needed.
-        vram_bytes_to_free = max(0, model_vram_needed - vram_available)
-
+        vram_available = self._get_vram_available()
         self._logger.debug(
             f"Before unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}"
         )
@@ -266,7 +263,7 @@ def _load_locked_model(self, cache_entry: CacheRecord) -> None:
         # 1. If the model can fit entirely in VRAM, then make enough room for it to be loaded fully.
         # 2. If the model can't fit fully into VRAM, then unload all other models and load as much of the model as
         #    possible.
-        vram_bytes_freed = self._offload_unlocked_models(vram_bytes_to_free)
+        vram_bytes_freed = self._offload_unlocked_models(model_vram_needed)
         self._logger.debug(f"Unloaded models (if necessary): vram_bytes_freed={(vram_bytes_freed/MB):.2f}MB")
 
         # Check the updated vram_available after offloading.
@@ -278,7 +275,9 @@ def _load_locked_model(self, cache_entry: CacheRecord) -> None:
         # Move as much of the model as possible into VRAM.
         # For testing, only allow 10% of the model to be loaded into VRAM.
         # vram_available = int(model_vram_needed * 0.1)
-        model_bytes_loaded = self._move_model_to_vram(cache_entry, vram_available)
+        # We add 1 MB to the available VRAM to account for small errors in memory tracking (e.g. off-by-one). A fully
+        # loaded model is much faster than a 95% loaded model.
+        model_bytes_loaded = self._move_model_to_vram(cache_entry, vram_available + MB)
         model_cur_vram_bytes = cache_entry.cached_model.cur_vram_bytes()
 
         vram_available = self._get_vram_available()
@@ -330,7 +329,14 @@ def _get_vram_available(self) -> int:
 
     def _get_vram_in_use(self) -> int:
         """Get the amount of VRAM currently in use by the cache."""
-        return sum(ce.cached_model.cur_vram_bytes() for ce in self._cached_models.values())
+        if self._execution_device.type == "cuda":
+            return torch.cuda.memory_allocated()
+        elif self._execution_device.type == "mps":
+            return torch.mps.current_allocated_memory()
+        else:
+            raise ValueError(f"Unsupported execution device type: {self._execution_device.type}")
+        # Alternative definition of VRAM in use:
+        # return sum(ce.cached_model.cur_vram_bytes() for ce in self._cached_models.values())
 
     def _get_ram_available(self) -> int:
         """Get the amount of RAM available for the cache to use, while keeping memory pressure under control."""
@@ -357,24 +363,28 @@ def _get_vram_state_str(self, model_cur_vram_bytes: int, model_total_bytes: int,
             + f"vram_available={(vram_available/MB):.0f} MB, "
         )
 
-    def _offload_unlocked_models(self, vram_bytes_to_free: int) -> int:
-        """Offload models from the execution_device until vram_bytes_to_free bytes are freed, or all models are
+    def _offload_unlocked_models(self, vram_bytes_required: int) -> int:
+        """Offload models from the execution_device until vram_bytes_required bytes are available, or all models are
         offloaded. Of course, locked models are not offloaded.
 
         Returns:
-            int: The number of bytes freed.
+            int: The number of bytes freed based on believed model sizes. The actual change in VRAM may be different.
         """
-        self._logger.debug(f"Offloading unlocked models with goal of freeing {vram_bytes_to_free/MB:.2f}MB of VRAM.")
+        self._logger.debug(
+            f"Offloading unlocked models with goal of making room for {vram_bytes_required/MB:.2f}MB of VRAM."
+        )
         vram_bytes_freed = 0
         # TODO(ryand): Give more thought to the offloading policy used here.
         cache_entries_increasing_size = sorted(self._cached_models.values(), key=lambda x: x.cached_model.total_bytes())
         for cache_entry in cache_entries_increasing_size:
-            if vram_bytes_freed >= vram_bytes_to_free:
+            # We do not fully trust the count of bytes freed, so we check again on each iteration.
+            vram_available = self._get_vram_available()
+            vram_bytes_to_free = vram_bytes_required - vram_available
+            if vram_bytes_to_free <= 0:
                 break
            if cache_entry.is_locked:
                 continue
-
-            cache_entry_bytes_freed = self._move_model_to_ram(cache_entry, vram_bytes_to_free - vram_bytes_freed)
+            cache_entry_bytes_freed = self._move_model_to_ram(cache_entry, vram_bytes_to_free)
             if cache_entry_bytes_freed > 0:
                 self._logger.debug(
                     f"Unloaded {cache_entry.key} from VRAM to free {(cache_entry_bytes_freed/MB):.0f} MB."
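
The core idea of the patch is to stop trusting the cache's own bookkeeping of model weight sizes and instead ask the torch allocator how much device memory is actually in use. The sketch below is a standalone illustration of that accounting, not part of the patch itself: the function name `get_vram_in_use` and the demo allocation are hypothetical, and the real implementation is a `ModelCache` method that reads `self._execution_device`.

```python
import torch

MB = 2**20  # bytes per mebibyte


def get_vram_in_use(execution_device: torch.device) -> int:
    """Minimal sketch of allocator-based VRAM accounting.

    Rather than summing the believed sizes of cached model weights, query the
    torch allocator for the number of bytes currently allocated on the device.
    """
    if execution_device.type == "cuda":
        return torch.cuda.memory_allocated()
    elif execution_device.type == "mps":
        return torch.mps.current_allocated_memory()
    raise ValueError(f"Unsupported execution device type: {execution_device.type}")


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        before = get_vram_in_use(device)
        buffer = torch.empty(64 * MB, dtype=torch.uint8, device=device)  # allocate ~64 MB
        after = get_vram_in_use(device)
        print(f"VRAM in use grew by {(after - before) / MB:.0f} MB")
```

Because the allocator's number can drift from the sum of model sizes (fragmentation, non-model tensors, rounding), the patch also rechecks `_get_vram_available()` on every iteration of the offload loop instead of relying on a running total of bytes believed freed.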