Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: EXC-1765: Fix canister_age metric #3443

Merged
merged 2 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions rs/execution_environment/src/scheduler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,24 +603,27 @@ impl SchedulerImpl {
// are not polluted by canisters that haven't had any messages for a long time.
for canister_id in &round_filtered_canisters.active_canister_ids {
let canister_state = state.canister_state(canister_id).unwrap();
let canister_age = current_round.get()
- canister_state
.scheduler_state
.last_full_execution_round
.get();
self.metrics.canister_age.observe(canister_age as f64);
// If `canister_age` > 1 / `compute_allocation` the canister ought to have been
// scheduled.
let allocation = Ratio::new(
canister_state
.scheduler_state
.compute_allocation
.as_percent(),
100,
);
if *allocation.numer() > 0 && Ratio::from_integer(canister_age) > allocation.recip() {
self.metrics.canister_compute_allocation_violation.inc();
}
// Newly created canisters have `last_full_execution_round` set to zero,
// and hence skew the `canister_age` metric.
let last_full_execution_round =
canister_state.scheduler_state.last_full_execution_round;
if last_full_execution_round.get() != 0 {
let canister_age = current_round.get() - last_full_execution_round.get();
self.metrics.canister_age.observe(canister_age as f64);
// If `canister_age` > 1 / `compute_allocation` the canister ought to have been
// scheduled.
let allocation = Ratio::new(
canister_state
.scheduler_state
.compute_allocation
.as_percent(),
100,
);
if *allocation.numer() > 0 && Ratio::from_integer(canister_age) > allocation.recip()
{
self.metrics.canister_compute_allocation_violation.inc();
}
};
}

for (message_id, status) in ingress_execution_results {
Expand Down
4 changes: 2 additions & 2 deletions rs/execution_environment/src/scheduler/scheduler_metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ impl SchedulerMetrics {
canister_age: metrics_registry.histogram(
"scheduler_canister_age_rounds",
"Number of rounds for which a canister was not scheduled.",
// 1, 2, 5, …, 100, 200, 500
decimal_buckets(0, 2),
// 1, 2, 5, …, 1000, 2000, 5000
decimal_buckets(0, 3),
),
canister_compute_allocation_violation: metrics_registry.int_counter(
"scheduler_compute_allocation_violations",
Expand Down
47 changes: 31 additions & 16 deletions rs/execution_environment/src/scheduler/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2685,21 +2685,23 @@ fn can_record_metrics_single_scheduler_thread() {

#[test]
fn can_record_metrics_for_a_round() {
let num_canisters = 3;
let scheduler_cores = num_canisters as usize - 1;
let instructions = 5;
let mut test = SchedulerTestBuilder::new()
.with_scheduler_config(SchedulerConfig {
scheduler_cores: 2,
max_instructions_per_round: NumInstructions::from(25),
max_instructions_per_message: NumInstructions::from(5),
max_instructions_per_message_without_dts: NumInstructions::new(5),
max_instructions_per_slice: NumInstructions::from(5),
scheduler_cores,
max_instructions_per_round: NumInstructions::from(instructions * 2),
max_instructions_per_message: NumInstructions::from(instructions),
max_instructions_per_message_without_dts: NumInstructions::new(instructions),
max_instructions_per_slice: NumInstructions::from(instructions),
instruction_overhead_per_execution: NumInstructions::from(0),
instruction_overhead_per_canister: NumInstructions::from(0),
instruction_overhead_per_canister_for_finalization: NumInstructions::from(0),
..SchedulerConfig::application_subnet()
})
.build();

let num_canisters = 3;
// The first two canisters have an `Allocation` of 45% and the last 9%. We'll be
// forced to execute the first two and then run out of instructions (based on
// the limits) which will result in a violation of third canister's
Expand All @@ -2715,13 +2717,10 @@ fn can_record_metrics_for_a_round() {
None,
);
for _ in 0..5 {
test.send_ingress(canister, ingress(5));
test.send_ingress(canister, ingress(instructions));
}
}

// For allocation violation to happen, the canister age should be more than `100/9 = 11 rounds`
test.advance_to_round(ExecutionRound::from(12));

for canister in test.state_mut().canister_states.values_mut() {
canister.scheduler_state.time_of_last_allocation_charge =
UNIX_EPOCH + Duration::from_secs(1);
Expand All @@ -2744,10 +2743,11 @@ fn can_record_metrics_for_a_round() {

let metrics = &test.scheduler().metrics;
assert_eq!(
metrics.executable_canisters_per_round.get_sample_sum() as i64,
3
metrics.executable_canisters_per_round.get_sample_sum() as u64,
num_canisters
);
assert_eq!(metrics.canister_age.get_sample_sum() as i64, 12);
// The canister age metric is not observed for newly created canisters.
assert_eq!(metrics.canister_age.get_sample_sum() as i64, 0);
assert_eq!(metrics.round_preparation_duration.get_sample_count(), 1);
assert_eq!(metrics.round_preparation_ingress.get_sample_count(), 1);
assert_eq!(metrics.round_scheduling_duration.get_sample_count(), 1);
Expand All @@ -2762,23 +2762,38 @@ fn can_record_metrics_for_a_round() {
);
assert_eq!(metrics.round_finalization_ingress.get_sample_count(), 1);
assert_eq!(metrics.round_finalization_charge.get_sample_count(), 1);
assert_eq!(metrics.canister_compute_allocation_violation.get(), 1);
// Compute allocation violation is not observed for newly created canisters.
assert_eq!(metrics.canister_compute_allocation_violation.get(), 0);
assert_eq!(
metrics.canister_messages_where_cycles_were_charged.get(),
10
scheduler_cores as u64 * 2
);

assert_eq!(
test.state()
.metadata
.subnet_metrics
.update_transactions_total,
10
scheduler_cores as u64 * 2
);
assert_eq!(
test.state().metadata.subnet_metrics.num_canisters,
num_canisters
);

// Bump up the round number.
test.execute_round(ExecutionRoundType::OrdinaryRound);

// For allocation violation to happen, the canister age should be more than `100/9 = 11 rounds`
// plus 2 rounds already executed.
test.advance_to_round(ExecutionRound::from(11 + 2));
test.execute_round(ExecutionRoundType::OrdinaryRound);

let metrics = &test.scheduler().metrics;
// The canister age metric should be observed now.
assert_eq!(metrics.canister_age.get_sample_sum() as i64, 12);
// Compute allocation violation should also be observed now.
assert_eq!(metrics.canister_compute_allocation_violation.get(), 1);
}

/// Check that when a canister is scheduled and can't prepay for execution, the
Expand Down
Loading