Skip to content

Commit

Permalink
Add new scheduler metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
severinson committed Nov 15, 2023
1 parent 651bebe commit df59fbc
Show file tree
Hide file tree
Showing 9 changed files with 424 additions and 38 deletions.
1 change: 0 additions & 1 deletion internal/armada/configuration/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,6 @@ type MetricsConfig struct {
}

// SchedulerMetricsConfig groups scheduler-specific metrics settings: one boolean
// toggle and two histogram configurations.
// NOTE(review): the per-field descriptions below are inferred from the field
// names only; confirm against the code that consumes this struct.
type SchedulerMetricsConfig struct {
// Presumably enables metrics broken down per queue and node — confirm.
EnablePerQueueNodeMetrics bool
// Histogram settings, presumably for the schedule cycle time metric — confirm.
ScheduleCycleTimeHistogramSettings HistogramConfig
// Histogram settings, presumably for the reconcile cycle time metric — confirm.
ReconcileCycleTimeHistogramSettings HistogramConfig
}
Expand Down
35 changes: 35 additions & 0 deletions internal/scheduler/configuration/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/armadaproject/armada/internal/common/config"
grpcconfig "github.com/armadaproject/armada/internal/common/grpc/configuration"
"github.com/armadaproject/armada/pkg/client"
v1 "k8s.io/api/core/v1"
)

const (
Expand All @@ -30,6 +31,9 @@ type Configuration struct {
Leader LeaderConfig
// Configuration controlling metrics
Metrics configuration.MetricsConfig
// Configuration for new scheduler metrics.
// Due to replace metrics configured via the above entry.
SchedulerMetrics MetricsConfig
// Scheduler configuration (this is shared with the old scheduler)
Scheduling configuration.SchedulingConfig
Auth authconfig.AuthConfig
Expand All @@ -55,6 +59,37 @@ type Configuration struct {
PulsarSendTimeout time.Duration `validate:"required"`
}

// MetricsConfig contains the configuration for the new scheduler metrics.
type MetricsConfig struct {
// If true, disable metric collection and publishing.
Disabled bool
// The scheduler exports metrics tracking job failures.
// These metrics may be annotated by flags indicating the type of error.
// For example, if TrackedErrorRegexByLabel contains the following entry,
// "isCudaError": "/CUDA/"
// then job failure metrics will have a label derived from that entry.
// NOTE(review): the original sentence was cut off here; presumably the label is
// named by the map key (isCudaError) and its value indicates whether the job
// error matches the regex (/CUDA/) — confirm against the metrics code.
TrackedErrorRegexByLabel map[string]string
// Metrics are exported for these resources.
TrackedResourceNames []v1.ResourceName
// Controls the cycle time metrics.
CycleTimeConfig PrometheusSummaryConfig
}

// PrometheusSummaryConfig contains the relevant config for a prometheus.Summary.
// NOTE(review): the defaults described below presumably are those applied by the
// Prometheus client library once these values are passed to it; this struct
// itself only carries the values — confirm at the call site.
type PrometheusSummaryConfig struct {
// Objectives defines the quantile rank estimates with their respective
// absolute error. If Objectives[q] = e, then the value reported for q
// will be the φ-quantile value for some φ between q-e and q+e. The
// default value is an empty map, resulting in a summary without
// quantiles.
Objectives map[float64]float64

// MaxAge defines the duration for which an observation stays relevant
// for the summary. Only applies to pre-calculated quantiles, does not
// apply to _sum and _count. Must be positive. The default value is
// DefMaxAge (presumably the Prometheus client's DefMaxAge constant — confirm).
MaxAge time.Duration
}

type LeaderConfig struct {
// Valid modes are "standalone" or "kubernetes"
Mode string `validate:"required"`
Expand Down
4 changes: 2 additions & 2 deletions internal/scheduler/database/job_repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ type JobRepository interface {
// These updates are guaranteed to be consistent with each other
FetchJobUpdates(ctx *armadacontext.Context, jobSerial int64, jobRunSerial int64) ([]Job, []Run, error)

// FetchJobRunErrors returns all armadaevents.JobRunErrors for the provided job run ids. The returned map is
// keyed by job run id. Any dbRuns which don't have errors will be absent from the map.
// FetchJobRunErrors returns all armadaevents.JobRunErrors for the provided job run ids. The returned map is
// keyed by job run id. Any dbRuns which don't have errors will be absent from the map.
FetchJobRunErrors(ctx *armadacontext.Context, runIds []uuid.UUID) (map[uuid.UUID]*armadaevents.Error, error)

// CountReceivedPartitions returns a count of the number of partition messages present in the database corresponding
Expand Down
4 changes: 2 additions & 2 deletions internal/scheduler/jobdb/job_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func (run *JobRun) Id() uuid.UUID {
return run.id
}

// JobId returns the id of the job this run is associated with.
func (run *JobRun) JobId() string {
return run.jobId
}
Expand All @@ -105,7 +105,7 @@ func (run *JobRun) NodeId() string {
return run.nodeId
}

// NodeName returns the name of the node to which the JobRun is assigned.
func (run *JobRun) NodeName() string {
return run.nodeName
}
Expand Down
Loading

0 comments on commit df59fbc

Please sign in to comment.