Skip to content

Commit

Permalink
fix: add legacy IPAM metrics back to IPAMv2 (#2970)
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Baker <[email protected]>
  • Loading branch information
rbtr authored Sep 4, 2024
1 parent b7ce09f commit ff46b57
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 62 deletions.
59 changes: 19 additions & 40 deletions cns/ipampool/metrics.go → cns/ipampool/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
package ipampool
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

const (
subnetLabel = "subnet"
subnetCIDRLabel = "subnet_cidr"
podnetARMIDLabel = "podnet_arm_id"
SubnetLabel = "subnet"
SubnetCIDRLabel = "subnet_cidr"
PodnetARMIDLabel = "podnet_arm_id"
customerMetricLabel = "customer_metric"
customerMetricLabelValue = "customer metric"
subnetExhaustionStateLabel = "subnet_exhaustion_state"
SubnetExhaustionStateLabel = "subnet_exhaustion_state"
SubnetIPExhausted = 1
SubnetIPNotExhausted = 0
)
Expand All @@ -23,110 +23,110 @@ var (
Help: "IPs currently in use by Pods on this CNS Node.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamAvailableIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_available_ips",
Help: "IPs available on this CNS Node for use by a Pod.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamBatchSize = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_batch_size",
Help: "IPAM IP pool scaling batch size.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamCurrentAvailableIPcount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_current_available_ips",
Help: "Current available IP count.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamExpectedAvailableIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_expect_available_ips",
Help: "Expected future available IP count assuming the Requested IP count is honored.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamMaxIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_max_ips",
Help: "Maximum Secondary IPs allowed on this Node.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamPendingProgramIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_pending_programming_ips",
Help: "IPs reserved but not yet available (Pending Programming).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamPendingReleaseIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_pending_release_ips",
Help: "IPs reserved but not available anymore (Pending Release).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamPrimaryIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_primary_ips",
Help: "NC Primary IP count (reserved from Pod Subnet for DNS and IMDS SNAT).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamRequestedIPConfigCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_requested_ips",
Help: "Secondary Pod Subnet IPs requested by this CNS Node (for Pods).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamSecondaryIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_secondary_ips",
Help: "Node NC Secondary IP count (reserved usable by Pods).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamTotalIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_total_ips",
Help: "Count of total IP pool size allocated to CNS by DNC.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamSubnetExhaustionState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_subnet_exhaustion_state",
Help: "IPAM view of subnet exhaustion state",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamSubnetExhaustionCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "cx_ipam_subnet_exhaustion_state_count_total",
Help: "Count of the number of times the ipam pool monitor sees subnet exhaustion",
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel, subnetExhaustionStateLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel, SubnetExhaustionStateLabel},
)
)

Expand All @@ -148,24 +148,3 @@ func init() {
IpamSubnetExhaustionCount,
)
}

// observeIPPoolState publishes one snapshot of the IP pool to the IPAM gauges.
// Every gauge is written with the same label values, taken from meta: the
// subnet name, the subnet CIDR and the subnet ARM ID.
func observeIPPoolState(state ipPoolState, meta metaState) {
	labelValues := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}

	// Derived values are computed once up front so the writes below read as a flat list.
	primaryCount := len(meta.primaryIPAddresses)
	totalCount := state.secondaryIPs + int64(primaryCount)
	exhaustion := float64(SubnetIPNotExhausted)
	if meta.exhausted {
		exhaustion = float64(SubnetIPExhausted)
	}

	IpamAllocatedIPCount.WithLabelValues(labelValues...).Set(float64(state.allocatedToPods))
	IpamAvailableIPCount.WithLabelValues(labelValues...).Set(float64(state.available))
	IpamBatchSize.WithLabelValues(labelValues...).Set(float64(meta.batch))
	IpamCurrentAvailableIPcount.WithLabelValues(labelValues...).Set(float64(state.currentAvailableIPs))
	IpamExpectedAvailableIPCount.WithLabelValues(labelValues...).Set(float64(state.expectedAvailableIPs))
	IpamMaxIPCount.WithLabelValues(labelValues...).Set(float64(meta.max))
	IpamPendingProgramIPCount.WithLabelValues(labelValues...).Set(float64(state.pendingProgramming))
	IpamPendingReleaseIPCount.WithLabelValues(labelValues...).Set(float64(state.pendingRelease))
	IpamPrimaryIPCount.WithLabelValues(labelValues...).Set(float64(primaryCount))
	IpamRequestedIPConfigCount.WithLabelValues(labelValues...).Set(float64(state.requestedIPs))
	IpamSecondaryIPCount.WithLabelValues(labelValues...).Set(float64(state.secondaryIPs))
	IpamTotalIPCount.WithLabelValues(labelValues...).Set(float64(totalCount))
	IpamSubnetExhaustionState.WithLabelValues(labelValues...).Set(exhaustion)
}
157 changes: 157 additions & 0 deletions cns/ipampool/metrics/observer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package metrics

import (
"context"
"fmt"
"net/netip"

"github.com/Azure/azure-container-networking/cns"
"github.com/Azure/azure-container-networking/cns/types"
"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
"github.com/pkg/errors"
)

// subnetARMIDTemplate is the fmt format string for a Subnet ARM ID:
// /subscriptions/$(SUB)/resourceGroups/$(GROUP)/providers/Microsoft.Network/virtualNetworks/$(VNET)/subnets/$(SUBNET)
// Its four %s verbs are filled, in order, with the subscription, resource group, VNET and subnet IDs.
const subnetARMIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s"

// ipPoolState is the current actual state of the CNS IP pool.
// observeMetrics builds one from the IP configuration map and the NodeNetworkConfig
// and then reports every field through its corresponding gauge.
type ipPoolState struct {
// allocatedToPods are the IPs CNS gives to Pods (IPs whose state is types.Assigned).
allocatedToPods int64
// available are the IPs in state "Available".
available int64
// currentAvailableIPs are the current available IPs: secondaryIPs - allocatedToPods - pendingRelease.
currentAvailableIPs int64
// expectedAvailableIPs are the "future" available IPs, if the requested IP count is honored: requestedIPs - allocatedToPods.
expectedAvailableIPs int64
// pendingProgramming are the IPs in state "PendingProgramming".
pendingProgramming int64
// pendingRelease are the IPs in state "PendingRelease".
pendingRelease int64
// requestedIPs are the IPs CNS has requested that it be allocated by DNC
// (read from the NodeNetworkConfig's Spec.RequestedIPCount).
requestedIPs int64
// secondaryIPs are all the IPs given to CNS by DNC, not including the primary IP of the NC
// (counted as the number of entries in the IP configuration map).
secondaryIPs int64
}

// metaState is the Monitor's configuration state for the IP pool.
// Its subnet fields supply the label values for every gauge; the remaining
// fields are reported as gauge values themselves.
type metaState struct {
// batch is the value reported by the IpamBatchSize gauge.
batch int64
// exhausted is true when a ClusterSubnetState reports Status.Exhausted.
exhausted bool
// max is the value reported by the IpamMaxIPCount gauge.
max int64
// primaryIPAddresses is the set of NetworkContainer primary IPs, keyed by address string.
primaryIPAddresses map[string]struct{}
// subnet is the SubnetName of the first NetworkContainer; used as a metric label value.
subnet string
// subnetARMID is the ARM ID generated for the first NetworkContainer; used as a metric label value.
subnetARMID string
// subnetCIDR is the SubnetAddressSpace of the first NetworkContainer; used as a metric label value.
subnetCIDR string
}

// NewLegacyMetricsObserver binds the given context and client functions into a
// single no-argument function. Each call of the returned function runs one
// observation of the legacy IPAM pool metrics via observeMetrics and returns
// its error. The captured ctx is reused for every invocation.
//
//nolint:lll // ignore line length
func NewLegacyMetricsObserver(ctx context.Context, ipcli func() map[string]cns.IPConfigurationStatus, nnccli func(context.Context) (*v1alpha.NodeNetworkConfig, error), csscli func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) func() error {
	observe := func() error {
		return observeMetrics(ctx, ipcli, nnccli, csscli)
	}
	return observe
}

// generateARMID renders the Subnet ARM ID for nc from its subscription,
// resource group, VNET and subnet IDs using subnetARMIDTemplate.
// If any of those four attributes is empty, it returns an empty string instead.
func generateARMID(nc *v1alpha.NetworkContainer) string {
	complete := nc.SubscriptionID != "" &&
		nc.ResourceGroupID != "" &&
		nc.VNETID != "" &&
		nc.SubnetID != ""
	if !complete {
		return ""
	}
	return fmt.Sprintf(subnetARMIDTemplate, nc.SubscriptionID, nc.ResourceGroupID, nc.VNETID, nc.SubnetID)
}

// observeMetrics takes one snapshot of the IP pool and publishes it to the legacy IPAM gauges.
// The clients are invoked synchronously in the order csscli, nnccli, ipcli. An error from
// csscli or nnccli is returned as-is, and a VNETBlock NetworkContainer whose PrimaryIP does
// not parse as a prefix also aborts the observation; in every error case no gauge is written.
//
//nolint:lll // ignore line length
func observeMetrics(ctx context.Context, ipcli func() map[string]cns.IPConfigurationStatus, nnccli func(context.Context) (*v1alpha.NodeNetworkConfig, error), csscli func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) error {
	subnetStates, err := csscli(ctx)
	if err != nil {
		return err
	}
	nnc, err := nnccli(ctx)
	if err != nil {
		return err
	}
	ipConfigs := ipcli()

	// NOTE(review): batch and max are never assigned in this function, so the
	// IpamBatchSize and IpamMaxIPCount gauges below are always set to 0 — confirm this is intended.
	meta := metaState{primaryIPAddresses: make(map[string]struct{})}

	// A single exhausted ClusterSubnetState is enough to report exhaustion.
	for idx := range subnetStates {
		if !subnetStates[idx].Status.Exhausted {
			continue
		}
		meta.exhausted = true
		break
	}

	ncs := nnc.Status.NetworkContainers
	if len(ncs) > 0 {
		// The first NetworkContainer supplies the label values for every gauge.
		first := &ncs[0]
		meta.subnet = first.SubnetName
		meta.subnetCIDR = first.SubnetAddressSpace
		meta.subnetARMID = generateARMID(first)
	}

	// Collect the distinct primary IPs. An NC with an empty Type is handled like a VNET NC,
	// whose PrimaryIP is stored verbatim; a VNETBlock NC carries a prefix, of which only the
	// address part is stored.
	for idx := range ncs {
		nc := &ncs[idx]
		switch nc.Type {
		case "", v1alpha.VNET:
			meta.primaryIPAddresses[nc.PrimaryIP] = struct{}{}
		case v1alpha.VNETBlock:
			prefix, parseErr := netip.ParsePrefix(nc.PrimaryIP)
			if parseErr != nil {
				return errors.Wrapf(parseErr, "unable to parse ip prefix: %s", nc.PrimaryIP)
			}
			meta.primaryIPAddresses[prefix.Addr().String()] = struct{}{}
		}
	}

	// Tally the IP configurations by state; states not listed here are only
	// reflected in the secondaryIPs total.
	state := ipPoolState{
		requestedIPs: nnc.Spec.RequestedIPCount,
		secondaryIPs: int64(len(ipConfigs)),
	}
	for _, ipConfig := range ipConfigs {
		switch ipConfig.GetState() {
		case types.Assigned:
			state.allocatedToPods++
		case types.Available:
			state.available++
		case types.PendingProgramming:
			state.pendingProgramming++
		case types.PendingRelease:
			state.pendingRelease++
		}
	}
	state.currentAvailableIPs = state.secondaryIPs - state.allocatedToPods - state.pendingRelease
	state.expectedAvailableIPs = state.requestedIPs - state.allocatedToPods

	primaryCount := len(meta.primaryIPAddresses)
	exhaustion := float64(SubnetIPNotExhausted)
	if meta.exhausted {
		exhaustion = float64(SubnetIPExhausted)
	}

	labelValues := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}
	IpamAllocatedIPCount.WithLabelValues(labelValues...).Set(float64(state.allocatedToPods))
	IpamAvailableIPCount.WithLabelValues(labelValues...).Set(float64(state.available))
	IpamBatchSize.WithLabelValues(labelValues...).Set(float64(meta.batch))
	IpamCurrentAvailableIPcount.WithLabelValues(labelValues...).Set(float64(state.currentAvailableIPs))
	IpamExpectedAvailableIPCount.WithLabelValues(labelValues...).Set(float64(state.expectedAvailableIPs))
	IpamMaxIPCount.WithLabelValues(labelValues...).Set(float64(meta.max))
	IpamPendingProgramIPCount.WithLabelValues(labelValues...).Set(float64(state.pendingProgramming))
	IpamPendingReleaseIPCount.WithLabelValues(labelValues...).Set(float64(state.pendingRelease))
	IpamPrimaryIPCount.WithLabelValues(labelValues...).Set(float64(primaryCount))
	IpamRequestedIPConfigCount.WithLabelValues(labelValues...).Set(float64(state.requestedIPs))
	IpamSecondaryIPCount.WithLabelValues(labelValues...).Set(float64(state.secondaryIPs))
	IpamTotalIPCount.WithLabelValues(labelValues...).Set(float64(state.secondaryIPs + int64(primaryCount)))
	IpamSubnetExhaustionState.WithLabelValues(labelValues...).Set(exhaustion)
	return nil
}
28 changes: 25 additions & 3 deletions cns/ipampool/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/Azure/azure-container-networking/cns"
"github.com/Azure/azure-container-networking/cns/ipampool/metrics"
"github.com/Azure/azure-container-networking/cns/logger"
"github.com/Azure/azure-container-networking/cns/metric"
"github.com/Azure/azure-container-networking/cns/types"
Expand Down Expand Up @@ -105,9 +106,9 @@ func (pm *Monitor) Start(ctx context.Context) error {
case css := <-pm.cssSource: // received an updated ClusterSubnetState
pm.metastate.exhausted = css.Status.Exhausted
logger.Printf("subnet exhausted status = %t", pm.metastate.exhausted)
IpamSubnetExhaustionCount.With(prometheus.Labels{
subnetLabel: pm.metastate.subnet, subnetCIDRLabel: pm.metastate.subnetCIDR,
podnetARMIDLabel: pm.metastate.subnetARMID, subnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted),
metrics.IpamSubnetExhaustionCount.With(prometheus.Labels{
metrics.SubnetLabel: pm.metastate.subnet, metrics.SubnetCIDRLabel: pm.metastate.subnetCIDR,
metrics.PodnetARMIDLabel: pm.metastate.subnetARMID, metrics.SubnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted),
}).Inc()
select {
default:
Expand Down Expand Up @@ -482,6 +483,27 @@ func (pm *Monitor) clampScaler(scaler *v1alpha.Scaler) {
}
}

// observeIPPoolState publishes one snapshot of the IP pool to the gauges
// exported by the metrics package. Every gauge is written with the same label
// values, taken from meta: the subnet name, the subnet CIDR and the subnet ARM ID.
func observeIPPoolState(state ipPoolState, meta metaState) {
	labelValues := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}

	// Derived values are computed once up front so the writes below read as a flat list.
	primaryCount := len(meta.primaryIPAddresses)
	totalCount := state.secondaryIPs + int64(primaryCount)
	exhaustion := float64(metrics.SubnetIPNotExhausted)
	if meta.exhausted {
		exhaustion = float64(metrics.SubnetIPExhausted)
	}

	metrics.IpamAllocatedIPCount.WithLabelValues(labelValues...).Set(float64(state.allocatedToPods))
	metrics.IpamAvailableIPCount.WithLabelValues(labelValues...).Set(float64(state.available))
	metrics.IpamBatchSize.WithLabelValues(labelValues...).Set(float64(meta.batch))
	metrics.IpamCurrentAvailableIPcount.WithLabelValues(labelValues...).Set(float64(state.currentAvailableIPs))
	metrics.IpamExpectedAvailableIPCount.WithLabelValues(labelValues...).Set(float64(state.expectedAvailableIPs))
	metrics.IpamMaxIPCount.WithLabelValues(labelValues...).Set(float64(meta.max))
	metrics.IpamPendingProgramIPCount.WithLabelValues(labelValues...).Set(float64(state.pendingProgramming))
	metrics.IpamPendingReleaseIPCount.WithLabelValues(labelValues...).Set(float64(state.pendingRelease))
	metrics.IpamPrimaryIPCount.WithLabelValues(labelValues...).Set(float64(primaryCount))
	metrics.IpamRequestedIPConfigCount.WithLabelValues(labelValues...).Set(float64(state.requestedIPs))
	metrics.IpamSecondaryIPCount.WithLabelValues(labelValues...).Set(float64(state.secondaryIPs))
	metrics.IpamTotalIPCount.WithLabelValues(labelValues...).Set(float64(totalCount))
	metrics.IpamSubnetExhaustionState.WithLabelValues(labelValues...).Set(exhaustion)
}

// CalculateMinFreeIPs calculates the minimum free IP quantity based on the Scaler
// in the passed NodeNetworkConfig.
// Half of odd batches are rounded up!
Expand Down
Loading

0 comments on commit ff46b57

Please sign in to comment.