Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(cat-gateway): Add memory metrics to improve observability #1499

Merged
merged 20 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions catalyst-gateway/bin/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ bech32 = "0.11.0"
const_format = "0.2.33"
regex = "1.11.1"
minijinja = "2.5.0"
stats_alloc = "0.1.10"
memory-stats = "1.0.0"

[dev-dependencies]
proptest = "1.5.0"
Expand Down
1 change: 1 addition & 0 deletions catalyst-gateway/bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ mod cli;
mod db;
mod jinja;
mod logger;
mod metrics;
mod service;
mod settings;
mod utils;
Expand Down
1 change: 1 addition & 0 deletions catalyst-gateway/bin/src/metrics/chain_follower.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
//! Metrics related to Chain Follower analytics.
1 change: 1 addition & 0 deletions catalyst-gateway/bin/src/metrics/chain_indexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
//! Metrics related to Chain Indexer analytics.
60 changes: 60 additions & 0 deletions catalyst-gateway/bin/src/metrics/endpoint.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//! Metrics related to endpoint analytics.

use std::sync::LazyLock;

use prometheus::{register_histogram_vec, register_int_counter_vec, HistogramVec, IntCounterVec};

/// Label names attached to the per-endpoint HTTP metrics in this module
/// (request duration, CPU time and request count).
const METRIC_LABELS: [&str; 3] = ["endpoint", "method", "status_code"];
/// Label names attached to the per-client request counter.
const CLIENT_METRIC_LABELS: [&str; 2] = ["client", "status_code"];

// Prometheus Metrics maintained by the service

/// HTTP request duration histogram, in milliseconds, labelled by `METRIC_LABELS`.
///
/// Registered lazily on first access through `register_histogram_vec!`.
///
/// # Panics
///
/// The first access panics if registration returns an error (the result is unwrapped).
pub(crate) static HTTP_REQ_DURATION_MS: LazyLock<HistogramVec> = LazyLock::new(|| {
register_histogram_vec!(
"http_request_duration_ms",
"Duration of HTTP requests in milliseconds",
&METRIC_LABELS
)
.unwrap()
});

/// HTTP request CPU time histogram, in milliseconds, labelled by `METRIC_LABELS`.
///
/// Registered lazily on first access through `register_histogram_vec!`.
///
/// # Panics
///
/// The first access panics if registration returns an error (the result is unwrapped).
pub(crate) static HTTP_REQ_CPU_TIME_MS: LazyLock<HistogramVec> = LazyLock::new(|| {
register_histogram_vec!(
"http_request_cpu_time_ms",
"CPU Time of HTTP requests in milliseconds",
&METRIC_LABELS
)
.unwrap()
});

// No Tacho implemented to enable this.
// static ref HTTP_REQUEST_RATE: GaugeVec = register_gauge_vec!(
// "http_request_rate",
// "Rate of HTTP requests per second",
// &METRIC_LABELS
// )
// .unwrap();

/// HTTP request counter (an integer counter, not a histogram), labelled by
/// `METRIC_LABELS`.
///
/// Registered lazily on first access through `register_int_counter_vec!`.
///
/// # Panics
///
/// The first access panics if registration returns an error (the result is unwrapped).
pub(crate) static HTTP_REQUEST_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
register_int_counter_vec!(
"http_request_count",
"Number of HTTP requests",
&METRIC_LABELS
)
.unwrap()
});

/// Per-client HTTP request counter (an integer counter, not a histogram),
/// labelled by `CLIENT_METRIC_LABELS`.
///
/// Registered lazily on first access through `register_int_counter_vec!`.
///
/// # Panics
///
/// The first access panics if registration returns an error (the result is unwrapped).
pub(crate) static CLIENT_REQUEST_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
register_int_counter_vec!(
"client_request_count",
"Number of HTTP requests per client",
&CLIENT_METRIC_LABELS
)
.unwrap()
});
97 changes: 97 additions & 0 deletions catalyst-gateway/bin/src/metrics/memory.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
//! Metrics related to memory analytics.

use std::{
alloc::System,
sync::{
atomic::{AtomicBool, Ordering},
Arc, LazyLock, RwLock,
},
thread,
time::Duration,
};

use memory_stats::memory_stats;
use stats_alloc::{Region, Stats, StatsAlloc, INSTRUMENTED_SYSTEM};
use tracing::log::error;

/// Use the instrumented allocator for gathering allocation statistics.
/// Note: This wraps the global allocator (the system allocator, `System`).
/// All structs that use the global allocator can be tracked.
#[global_allocator]
static GLOBAL: &StatsAlloc<System> = &INSTRUMENTED_SYSTEM;

/// A global, thread-safe container for memory metrics.
/// It starts out as `MemoryMetrics::default()` and is written by the background
/// thread spawned in `MemoryMetrics::init_metrics_updater`.
static GLOBAL_METRICS: LazyLock<Arc<RwLock<MemoryMetrics>>> =
LazyLock::new(|| Arc::new(RwLock::new(MemoryMetrics::default())));

/// This is to prevent the init function from accidentally being called multiple times.
/// It is set to `true` by the first call to `MemoryMetrics::init_metrics_updater`;
/// later calls see `true` and return without spawning another thread.
static IS_INITIALIZED: AtomicBool = AtomicBool::new(false);

/// Interval for updating memory metrics, in milliseconds.
/// The updater thread sleeps for this long between two updates.
const UPDATE_INTERVAL_MILLI: u64 = 1000;

/// A structure for storing memory metrics, including allocator statistics
/// and physical/virtual memory usage.
///
/// All fields are overwritten together by `MemoryMetrics::update`.
#[derive(Debug, Default, Clone)]
pub(crate) struct MemoryMetrics {
/// Statistics from the global allocator, including allocations and deallocations from
/// `stats_alloc::Stats`. The updater thread fills this with the value returned by
/// `Region::change`.
pub(crate) allocator_stats: Stats,
/// Physical memory usage of the application as reported by `memory_stats()`;
/// `None` when `memory_stats()` returns no data.
pub(crate) physical_usage: Option<usize>,
/// Virtual memory usage of the application as reported by `memory_stats()`;
/// `None` when `memory_stats()` returns no data.
pub(crate) virtual_usage: Option<usize>,
}

impl MemoryMetrics {
    /// Updates the memory metrics with the latest allocator statistics and memory stats.
    ///
    /// Physical and virtual usage are re-read through `memory_stats()`; both are
    /// set to `None` when it returns no data.
    ///
    /// # Arguments
    /// * `allocator_stats` - Current statistics from the global allocator.
    fn update(&mut self, allocator_stats: Stats) {
        self.allocator_stats = allocator_stats;

        let mem_stats = memory_stats();
        self.physical_usage = mem_stats.as_ref().map(|stats| stats.physical_mem);
        self.virtual_usage = mem_stats.as_ref().map(|stats| stats.virtual_mem);
    }

    /// Starts a background thread to periodically update memory metrics.
    ///
    /// This function spawns a thread that updates the global `MemoryMetrics`
    /// structure at regular intervals defined by `UPDATE_INTERVAL_MILLI`.
    /// The thread loops forever; it is never joined or stopped.
    ///
    /// Only the first call has any effect: `IS_INITIALIZED` is atomically
    /// swapped to `true`, and every later call returns immediately.
    pub(crate) fn init_metrics_updater() {
        if IS_INITIALIZED.swap(true, Ordering::SeqCst) {
            return;
        }

        // Allocator statistics are reported relative to this region.
        let stats = Region::new(GLOBAL);

        thread::spawn(move || {
            let interval = Duration::from_millis(UPDATE_INTERVAL_MILLI);
            loop {
                let allocator_stats = stats.change();
                // Take the write lock directly. Acquiring a read lock first and
                // requesting the write lock while that read guard is still alive
                // (as a `match` scrutinee temporary) blocks this thread on its
                // own lock, so the metrics would never be updated.
                match GLOBAL_METRICS.write() {
                    Ok(mut writable_metrics) => {
                        writable_metrics.update(allocator_stats);
                    },
                    Err(err) => {
                        error!("Failed to acquire write lock on metrics: {:?}", err);
                    },
                }
                // The write guard is released at the end of the `match` above,
                // so the lock is not held while sleeping.
                thread::sleep(interval);
            }
        });
    }
}
20 changes: 20 additions & 0 deletions catalyst-gateway/bin/src/metrics/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//! This module contains submodules related to metrics report and analytics.

use prometheus::{default_registry, Registry};

pub(crate) mod chain_follower;
pub(crate) mod chain_indexer;
pub(crate) mod endpoint;
pub(crate) mod memory;

/// Initialize Prometheus metrics.
///
/// Starts the background memory metrics updater
/// (`memory::MemoryMetrics::init_metrics_updater`) before returning the registry.
///
/// ## Returns
///
/// Returns a clone of the default prometheus registry.
#[must_use]
pub(crate) fn init_prometheus() -> Registry {
memory::MemoryMetrics::init_metrics_updater();

default_registry().clone()
}
3 changes: 2 additions & 1 deletion catalyst-gateway/bin/src/service/poem_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ use poem::{
};

use crate::{
metrics::init_prometheus,
service::{
api::mk_api,
docs::{docs, favicon},
utilities::{
catch_panic::{set_panic_hook, ServicePanicHandler},
middleware::tracing_mw::{init_prometheus, Tracing},
middleware::tracing_mw::Tracing,
},
},
settings::Settings,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Full Tracing and metrics middleware.
use std::{sync::LazyLock, time::Instant};

use std::time::Instant;

use cpu_time::ProcessTime; // ThreadTime doesn't work.
use poem::{
Expand All @@ -8,74 +9,17 @@ use poem::{
Endpoint, Error, FromRequest, IntoResponse, Middleware, PathPattern, Request, Response, Result,
};
use poem_openapi::OperationId;
use prometheus::{
default_registry, register_histogram_vec, register_int_counter_vec, HistogramVec,
IntCounterVec, Registry,
};
use tracing::{error, field, Instrument, Level, Span};
use ulid::Ulid;
use uuid::Uuid;

use crate::{settings::Settings, utils::blake2b_hash::generate_uuid_string_from_data};

/// Label names attached to the per-endpoint HTTP metrics in this module
/// (request duration, CPU time and request count).
const METRIC_LABELS: [&str; 3] = ["endpoint", "method", "status_code"];
/// Label names attached to the per-client request counter.
const CLIENT_METRIC_LABELS: [&str; 2] = ["client", "status_code"];

// Prometheus Metrics maintained by the service

/// HTTP request duration histogram, in milliseconds, labelled by `METRIC_LABELS`.
///
/// Registered lazily on first access; that access panics if registration
/// returns an error (the result is unwrapped).
static HTTP_REQ_DURATION_MS: LazyLock<HistogramVec> = LazyLock::new(|| {
// NOTE(review): presumably silences a lint raised inside the macro expansion — confirm.
#[allow(clippy::ignored_unit_patterns)]
register_histogram_vec!(
"http_request_duration_ms",
"Duration of HTTP requests in milliseconds",
&METRIC_LABELS
)
.unwrap()
});

/// HTTP request CPU time histogram, in milliseconds, labelled by `METRIC_LABELS`.
///
/// Registered lazily on first access; that access panics if registration
/// returns an error (the result is unwrapped).
static HTTP_REQ_CPU_TIME_MS: LazyLock<HistogramVec> = LazyLock::new(|| {
// NOTE(review): presumably silences a lint raised inside the macro expansion — confirm.
#[allow(clippy::ignored_unit_patterns)]
register_histogram_vec!(
"http_request_cpu_time_ms",
"CPU Time of HTTP requests in milliseconds",
&METRIC_LABELS
)
.unwrap()
});

// No Tacho implemented to enable this.
// static ref HTTP_REQUEST_RATE: GaugeVec = register_gauge_vec!(
// "http_request_rate",
// "Rate of HTTP requests per second",
// &METRIC_LABELS
// )
// .unwrap();

/// HTTP request counter (an integer counter, not a histogram), labelled by
/// `METRIC_LABELS`.
///
/// Registered lazily on first access; that access panics if registration
/// returns an error (the result is unwrapped).
static HTTP_REQUEST_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
// NOTE(review): presumably silences a lint raised inside the macro expansion — confirm.
#[allow(clippy::ignored_unit_patterns)]
register_int_counter_vec!(
"http_request_count",
"Number of HTTP requests",
&METRIC_LABELS
)
.unwrap()
});

/// Per-client HTTP request counter (an integer counter, not a histogram),
/// labelled by `CLIENT_METRIC_LABELS`.
///
/// Registered lazily on first access; that access panics if registration
/// returns an error (the result is unwrapped).
static CLIENT_REQUEST_COUNT: LazyLock<IntCounterVec> = LazyLock::new(|| {
// NOTE(review): presumably silences a lint raised inside the macro expansion — confirm.
#[allow(clippy::ignored_unit_patterns)]
register_int_counter_vec!(
"client_request_count",
"Number of HTTP requests per client",
&CLIENT_METRIC_LABELS
)
.unwrap()
});
use crate::{
metrics::endpoint::{
CLIENT_REQUEST_COUNT, HTTP_REQUEST_COUNT, HTTP_REQ_CPU_TIME_MS, HTTP_REQ_DURATION_MS,
},
settings::Settings,
utils::blake2b_hash::generate_uuid_string_from_data,
};

// Currently no way to get these values. TODO.
// Panic Request Count histogram.
Expand Down Expand Up @@ -389,13 +333,3 @@ impl<E: Endpoint> Endpoint for TracingEndpoint<E> {
response
}
}

/// Initialize Prometheus metrics.
///
/// ## Returns
///
/// Returns a clone of the default prometheus registry.
#[must_use]
pub(crate) fn init_prometheus() -> Registry {
default_registry().clone()
}
Loading