From 7505d4e755564c8bfd34b261588e76a78c445c44 Mon Sep 17 00:00:00 2001 From: Andriy Berestovskyy Date: Wed, 22 Jan 2025 01:04:46 +0100 Subject: [PATCH] fix: EXC-1815: Fix sandbox processes metric (#3527) The monitoring thread may collect information about each sandbox process for a few seconds. Consequently, metrics scraping might occur during this collection phase, leading to an underreporting of the number of sandboxes. This PR addresses this issue by: 1. Collecting metrics into small vectors of `f64` values. This allows for more efficient and consistent data gathering. 2. Observing the metrics within a tight loop. This ensures that the latest and most accurate data is captured. While this approach may not completely eliminate the possibility of an incomplete sandbox count, it should significantly reduce the likelihood of this occurring. --- .../sandboxed_execution_controller.rs | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs index 15bfa97a703..93b3f602180 100644 --- a/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs +++ b/rs/canister_sandbox/src/replica_controller/sandboxed_execution_controller.rs @@ -1238,6 +1238,8 @@ impl SandboxedExecutionController { let sandbox_processes = get_sandbox_process_stats(&backends); #[allow(unused_mut)] // for MacOS let mut sandbox_processes_rss = Vec::with_capacity(sandbox_processes.len()); + let mut active_last_used = Vec::with_capacity(sandbox_processes.len()); + let mut evicted_last_used = Vec::with_capacity(sandbox_processes.len()); #[cfg(target_os = "linux")] { @@ -1278,14 +1280,10 @@ impl SandboxedExecutionController { .unwrap_or_else(|| std::time::Duration::from_secs(0)); match status { SandboxProcessStatus::Active => { - metrics - .sandboxed_execution_subprocess_active_last_used - .observe(time_since_last_usage.as_secs_f64()); + active_last_used.push(time_since_last_usage.as_secs_f64()); } SandboxProcessStatus::Evicted => { - metrics - .sandboxed_execution_subprocess_evicted_last_used - .observe(time_since_last_usage.as_secs_f64()); + evicted_last_used.push(time_since_last_usage.as_secs_f64()); } } } @@ -1313,18 +1311,24 @@ impl SandboxedExecutionController { .unwrap_or_else(|| std::time::Duration::from_secs(0)); match status { SandboxProcessStatus::Active => { - metrics - .sandboxed_execution_subprocess_active_last_used - .observe(time_since_last_usage.as_secs_f64()); + active_last_used.push(time_since_last_usage.as_secs_f64()); } SandboxProcessStatus::Evicted => { - metrics - .sandboxed_execution_subprocess_evicted_last_used - .observe(time_since_last_usage.as_secs_f64()); + evicted_last_used.push(time_since_last_usage.as_secs_f64()); } } } } + for o in active_last_used { + metrics + .sandboxed_execution_subprocess_active_last_used + .observe(o); + } + for o in evicted_last_used { + metrics + .sandboxed_execution_subprocess_evicted_last_used + .observe(o); + } { let mut guard = backends.lock().unwrap();