From 1dc330383ca0b19d5eba0d55a6b418d584dc75de Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Thu, 16 Jan 2025 20:24:27 +0000
Subject: [PATCH] Revise the logic for calculating the RAM model cache limit.

---
 .../load/model_cache/model_cache.py | 102 +++++++++++++-----
 1 file changed, 76 insertions(+), 26 deletions(-)

diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
index 5ef6aefe613..237f5324fff 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -123,6 +123,8 @@ def __init__(
         self._cached_models: Dict[str, CacheRecord] = {}
         self._cache_stack: List[str] = []
 
+        self._ram_cache_size_bytes = self._calc_ram_available_to_model_cache()
+
     @property
     def stats(self) -> Optional[CacheStats]:
         """Return collected CacheStats object."""
@@ -388,41 +390,89 @@ def _get_vram_in_use(self) -> int:
         # Alternative definition of VRAM in use:
         # return sum(ce.cached_model.cur_vram_bytes() for ce in self._cached_models.values())
 
-    def _get_ram_available(self) -> int:
-        """Get the amount of RAM available for the cache to use, while keeping memory pressure under control."""
+    def _calc_ram_available_to_model_cache(self) -> int:
+        """Calculate the amount of RAM available for the cache to use."""
         # If self._max_ram_cache_size_gb is set, then it overrides the default logic.
         if self._max_ram_cache_size_gb is not None:
-            ram_total_available_to_cache = int(self._max_ram_cache_size_gb * GB)
-            return ram_total_available_to_cache - self._get_ram_in_use()
-
-        virtual_memory = psutil.virtual_memory()
-        ram_total = virtual_memory.total
-        ram_available = virtual_memory.available
-        ram_used = ram_total - ram_available
-
-        # The total size of all the models in the cache will often be larger than the amount of RAM reported by psutil
-        # (due to lazy-loading and OS RAM caching behaviour). We could just rely on the psutil values, but it feels
-        # like a bad idea to over-fill the model cache. So, for now, we'll try to keep the total size of models in the
-        # cache under the total amount of system RAM.
-        cache_ram_used = self._get_ram_in_use()
-        ram_used = max(cache_ram_used, ram_used)
-
-        # Aim to keep 10% of RAM free.
-        ram_available_based_on_memory_usage = int(ram_total * 0.9) - ram_used
+            self._logger.info(f"Using user-defined RAM cache size: {self._max_ram_cache_size_gb} GB.")
+            return int(self._max_ram_cache_size_gb * GB)
+
+        # Heuristics for dynamically calculating the RAM cache size, **in order of increasing priority**:
+        # 1. As an initial default, use 50% of the total RAM for InvokeAI.
+        #    - Assume a 2GB baseline for InvokeAI's non-model RAM usage, and use the rest of the RAM for the model cache.
+        # 2. On a system with a lot of RAM (e.g. 64GB+), users probably don't want InvokeAI to eat up too much RAM.
+        #    There are diminishing returns to storing more and more models. So, we apply an upper bound.
+        #    - On systems without a CUDA device, the upper bound is 32GB.
+        #    - On systems with a CUDA device, the upper bound is 2x the amount of VRAM.
+        # 3. On systems with a CUDA device, the minimum should be the VRAM size (less the working memory).
+        #    - Setting lower than this would mean that we sometimes kick models out of the cache when there is room for
+        #      all models in VRAM.
+        #    - Consider an extreme case of a system with 8GB RAM / 24GB VRAM. I haven't tested this, but I think
+        #      you'd still want the RAM cache size to be ~24GB (less the working memory). (Though you'd probably want to
+        #      set `keep_ram_copy_of_weights: false` in this case.)
+        # 4. Absolute minimum of 4GB.
+
+        # NOTE(ryand): We explored dynamically adjusting the RAM cache size based on memory pressure (using psutil), but
+        # decided against it for now, for the following reasons:
+        # - It was surprisingly difficult to get memory metrics with consistent definitions across OSes. (If you go
+        #   down this path again, don't underestimate the amount of complexity here and be sure to test rigorously on all
+        #   OSes.)
+        # - Making the RAM cache size dynamic opens the door for performance regressions that are hard to diagnose and
+        #   hard for users to understand. It is better for users to see that their RAM is maxed out, and then override
+        #   the default value if desired.
+
+        # Lookup the total VRAM size for the CUDA execution device.
+        total_cuda_vram_bytes: int | None = None
+        if self._execution_device.type == "cuda":
+            _, total_cuda_vram_bytes = torch.cuda.mem_get_info(self._execution_device)
+
+        # Apply heuristic 1.
+        # ------------------
+        heuristics_applied = [1]
+        total_system_ram_bytes = psutil.virtual_memory().total
+        # Assumed baseline RAM used by InvokeAI for non-model stuff.
+        baseline_ram_used_by_invokeai = 2 * GB
+        ram_available_to_model_cache = int(total_system_ram_bytes * 0.5 - baseline_ram_used_by_invokeai)
+
+        # Apply heuristic 2.
+        # ------------------
+        max_ram_cache_size_bytes = 32 * GB
+        if total_cuda_vram_bytes is not None:
+            max_ram_cache_size_bytes = 2 * total_cuda_vram_bytes
+        if ram_available_to_model_cache > max_ram_cache_size_bytes:
+            heuristics_applied.append(2)
+            ram_available_to_model_cache = max_ram_cache_size_bytes
+
+        # Apply heuristic 3.
+        # ------------------
+        if total_cuda_vram_bytes is not None:
+            if self._max_vram_cache_size_gb is not None:
+                min_ram_cache_size_bytes = int(self._max_vram_cache_size_gb * GB)
+            else:
+                min_ram_cache_size_bytes = total_cuda_vram_bytes - int(self._execution_device_working_mem_gb * GB)
+            if ram_available_to_model_cache < min_ram_cache_size_bytes:
+                heuristics_applied.append(3)
+                ram_available_to_model_cache = min_ram_cache_size_bytes
 
-        # If we are running out of RAM, then there's an increased likelihood that we will run into this issue:
-        # https://github.com/invoke-ai/InvokeAI/issues/7513
-        # To keep things running smoothly, there's a minimum RAM cache size that we always allow (even if this means
-        # using swap).
-        min_ram_cache_size_bytes = 4 * GB
-        ram_available_based_on_min_cache_size = min_ram_cache_size_bytes - cache_ram_used
+        # Apply heuristic 4.
+        # ------------------
+        if ram_available_to_model_cache < 4 * GB:
+            heuristics_applied.append(4)
+            ram_available_to_model_cache = 4 * GB
 
-        return max(ram_available_based_on_memory_usage, ram_available_based_on_min_cache_size)
+        self._logger.info(
+            f"Calculated model RAM cache size: {ram_available_to_model_cache / MB:.2f} MB. Heuristics applied: {heuristics_applied}."
+        )
+        return ram_available_to_model_cache
 
     def _get_ram_in_use(self) -> int:
         """Get the amount of RAM currently in use."""
         return sum(ce.cached_model.total_bytes() for ce in self._cached_models.values())
 
+    def _get_ram_available(self) -> int:
+        """Get the amount of RAM available for the cache to use."""
+        return self._ram_cache_size_bytes - self._get_ram_in_use()
+
     def _capture_memory_snapshot(self) -> Optional[MemorySnapshot]:
         if self._log_memory_usage:
             return MemorySnapshot.capture()
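
Note for reviewers: the sketch below is not part of the patch. It re-derives the four heuristics in isolation and evaluates them for a few hypothetical hardware configurations, purely to show which rule dominates where. It assumes a 3 GB working-memory allowance and ignores the `max_ram_cache_size_gb` / `max_vram_cache_size_gb` overrides that the real method also consults.

GB = 2**30


def calc_ram_cache_size_bytes(
    total_system_ram_bytes: int,
    total_cuda_vram_bytes: int | None,
    working_mem_bytes: int = 3 * GB,  # assumed working-memory allowance, for illustration only
) -> int:
    """Standalone approximation of the patch's heuristic chain (overrides ignored)."""
    # Heuristic 1: start from 50% of system RAM, less a 2GB baseline for non-model usage.
    size = int(total_system_ram_bytes * 0.5 - 2 * GB)

    # Heuristic 2: cap at 32GB, or at 2x VRAM when a CUDA device is present.
    upper_bound = 2 * total_cuda_vram_bytes if total_cuda_vram_bytes is not None else 32 * GB
    size = min(size, upper_bound)

    # Heuristic 3: on CUDA systems, never drop below VRAM minus the working memory.
    if total_cuda_vram_bytes is not None:
        size = max(size, total_cuda_vram_bytes - working_mem_bytes)

    # Heuristic 4: absolute minimum of 4GB.
    return max(size, 4 * GB)


if __name__ == "__main__":
    # Hypothetical configurations, chosen only to illustrate which heuristic wins.
    configs = {
        "16GB RAM / 8GB VRAM": (16 * GB, 8 * GB),
        "64GB RAM / 24GB VRAM": (64 * GB, 24 * GB),
        "8GB RAM / 24GB VRAM": (8 * GB, 24 * GB),
        "128GB RAM / no CUDA device": (128 * GB, None),
    }
    for name, (ram, vram) in configs.items():
        print(f"{name}: {calc_ram_cache_size_bytes(ram, vram) / GB:.1f} GB RAM cache")

With these inputs, the 50% default (heuristic 1) wins on the first two machines (6 GB and 30 GB), the 32GB cap (heuristic 2) wins on the CUDA-less 128GB machine, and the VRAM-minus-working-memory floor (heuristic 3) wins on the 8GB RAM / 24GB VRAM machine, giving roughly 21 GB, which matches the "~24GB less the working memory" case discussed in the comments.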