diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2f62a8f..3e0b3c0 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -23,7 +23,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 14ca26c..fcaa940 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -24,12 +24,12 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Lint - uses: golangci/golangci-lint-action@v5 + uses: golangci/golangci-lint-action@v6 with: - version: 'v1.58.0' + version: 'v1.60' args: -v --timeout 5m skip-cache: true @@ -49,7 +49,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Test run: go test -v ./... @@ -62,7 +62,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build @@ -76,7 +76,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 30daa65..b69f714 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build @@ -37,7 +37,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build @@ -57,7 +57,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build @@ -77,7 +77,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v4 with: - go-version: '1.22' + go-version: '1.23' - name: Build run: make build diff --git a/Makefile b/Makefile index d7eda53..ae7a602 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ LINTER_BIN ?= golangci-lint DOCKER_BIN ?= docker -TARGETS := knavigator klient +TARGETS := knavigator klient node-resource-exporter CMD_DIR := ./cmd OUTPUT_DIR := ./bin diff --git a/charts/node-resource-exporter/.helmignore b/charts/node-resource-exporter/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/node-resource-exporter/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/node-resource-exporter/Chart.yaml b/charts/node-resource-exporter/Chart.yaml new file mode 100644 index 0000000..2139a8a --- /dev/null +++ b/charts/node-resource-exporter/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: node-resource-exporter +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/charts/node-resource-exporter/templates/NOTES.txt b/charts/node-resource-exporter/templates/NOTES.txt new file mode 100644 index 0000000..dd5fae9 --- /dev/null +++ b/charts/node-resource-exporter/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "node-resource-exporter.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "node-resource-exporter.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "node-resource-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "node-resource-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/charts/node-resource-exporter/templates/_helpers.tpl b/charts/node-resource-exporter/templates/_helpers.tpl new file mode 100644 index 0000000..b36c596 --- /dev/null +++ b/charts/node-resource-exporter/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "node-resource-exporter.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "node-resource-exporter.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "node-resource-exporter.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "node-resource-exporter.labels" -}} +helm.sh/chart: {{ include "node-resource-exporter.chart" . }} +{{ include "node-resource-exporter.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "node-resource-exporter.selectorLabels" -}} +app.kubernetes.io/name: {{ include "node-resource-exporter.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "node-resource-exporter.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "node-resource-exporter.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/node-resource-exporter/templates/deployment.yaml b/charts/node-resource-exporter/templates/deployment.yaml new file mode 100644 index 0000000..5459a18 --- /dev/null +++ b/charts/node-resource-exporter/templates/deployment.yaml @@ -0,0 +1,80 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "node-resource-exporter.fullname" . }} + namespace: {{.Release.Namespace}} + labels: + {{- include "node-resource-exporter.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "node-resource-exporter.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "node-resource-exporter.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "node-resource-exporter.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - /usr/local/bin/node-resource-exporter + args: + - -p + - "{{ .Values.cmd.port }}" + - -r + - {{ .Values.cmd.resources }} + - -l + - {{ .Values.cmd.nodeLabels }} + - -v + - "{{ .Values.cmd.verbosity }}" + {{- if .Values.cmd.namespace }} + - -n + - {{ .Values.cmd.namespace }} + {{- end }} + ports: + - name: metrics + containerPort: {{ .Values.cmd.port }} + protocol: TCP + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/charts/node-resource-exporter/templates/hpa.yaml b/charts/node-resource-exporter/templates/hpa.yaml new file mode 100644 index 0000000..ad1d7b4 --- /dev/null +++ b/charts/node-resource-exporter/templates/hpa.yaml @@ -0,0 +1,34 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "node-resource-exporter.fullname" . }} + namespace: {{.Release.Namespace}} + labels: + {{- include "node-resource-exporter.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "node-resource-exporter.fullname" . }} + namespace: {{.Release.Namespace}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/charts/node-resource-exporter/templates/ingress.yaml b/charts/node-resource-exporter/templates/ingress.yaml new file mode 100644 index 0000000..e4ec258 --- /dev/null +++ b/charts/node-resource-exporter/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "node-resource-exporter.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "node-resource-exporter.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/node-resource-exporter/templates/rbac.yaml b/charts/node-resource-exporter/templates/rbac.yaml new file mode 100644 index 0000000..70b03af --- /dev/null +++ b/charts/node-resource-exporter/templates/rbac.yaml @@ -0,0 +1,22 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-resource-exporter.serviceAccountName" . }} +rules: +- apiGroups: [""] + resources: ["*"] + verbs: [get,list,watch] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-resource-exporter.serviceAccountName" . }} +subjects: +- kind: ServiceAccount + name: {{ include "node-resource-exporter.serviceAccountName" . }} + namespace: {{.Release.Namespace}} + apiGroup: "" +roleRef: + kind: ClusterRole + name: {{ include "node-resource-exporter.serviceAccountName" . }} + apiGroup: "" diff --git a/charts/node-resource-exporter/templates/service.yaml b/charts/node-resource-exporter/templates/service.yaml new file mode 100644 index 0000000..cc6c8b0 --- /dev/null +++ b/charts/node-resource-exporter/templates/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "node-resource-exporter.fullname" . }} + namespace: {{.Release.Namespace}} + labels: + {{- include "node-resource-exporter.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: metrics + protocol: TCP + name: metrics + selector: + {{- include "node-resource-exporter.selectorLabels" . | nindent 4 }} diff --git a/charts/node-resource-exporter/templates/serviceaccount.yaml b/charts/node-resource-exporter/templates/serviceaccount.yaml new file mode 100644 index 0000000..bdc2119 --- /dev/null +++ b/charts/node-resource-exporter/templates/serviceaccount.yaml @@ -0,0 +1,14 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-resource-exporter.serviceAccountName" . }} + namespace: {{.Release.Namespace}} + labels: + {{- include "node-resource-exporter.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/charts/node-resource-exporter/templates/servicemonitor.yaml b/charts/node-resource-exporter/templates/servicemonitor.yaml new file mode 100644 index 0000000..42af668 --- /dev/null +++ b/charts/node-resource-exporter/templates/servicemonitor.yaml @@ -0,0 +1,20 @@ +{{- if .Values.monitoring.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app: prometheus + name: {{ include "node-resource-exporter.fullname" . }} + namespace: {{.Release.Namespace}} +spec: + endpoints: + - interval: 10s + port: metrics + scheme: http + namespaceSelector: + matchNames: + - {{.Release.Namespace}} + selector: + matchLabels: + {{- include "node-resource-exporter.selectorLabels" . | nindent 6 }} +{{- end }} diff --git a/charts/node-resource-exporter/values.yaml b/charts/node-resource-exporter/values.yaml new file mode 100644 index 0000000..5a21b46 --- /dev/null +++ b/charts/node-resource-exporter/values.yaml @@ -0,0 +1,117 @@ +# Default values for node-resource-exporter. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: ghcr.io/nvidia/knavigator + pullPolicy: Always + # Overrides the image tag whose default is the chart appVersion. + tag: ff4aeb1 + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +cmd: + port: 9090 + namespace: default + resources: "cpu,memory,nvidia.com/gpu" + nodeLabels: "nodeInstance,nodeType" + verbosity: 1 + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 8080 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. +volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +monitoring: + enabled: true diff --git a/cmd/node-resource-exporter/main.go b/cmd/node-resource-exporter/main.go new file mode 100644 index 0000000..3e54b2e --- /dev/null +++ b/cmd/node-resource-exporter/main.go @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "context" + "flag" + "fmt" + "net/http" + "os" + "strings" + "syscall" + "time" + + "github.com/oklog/run" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + log "k8s.io/klog/v2" + + "github.com/NVIDIA/knavigator/pkg/metrics" +) + +type args struct { + port int + namespace string + nodeLabels string + resources string +} + +func main() { + var args args + flag.IntVar(&args.port, "p", 8080, "Prometheus target port") + flag.StringVar(&args.namespace, "n", "", "Tracking namespace (all if not set)") + flag.StringVar(&args.resources, "r", "", "Comma-separated list of tracked resource names") + flag.StringVar(&args.nodeLabels, "l", "", "Comma-separated list of node label names to be passed onto metrics") + + log.InitFlags(nil) + flag.Parse() + + if err := mainInternal(&args); err != nil { + log.Error(err.Error()) + os.Exit(1) + } +} + +func mainInternal(args *args) error { + config, err := rest.InClusterConfig() + if err != nil { + return err + } + + kubeClient, err := kubernetes.NewForConfig(config) + if err != nil { + return err + } + + nodeLabels := strings.Split(args.nodeLabels, ",") + metrics.New(nodeLabels) + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + promServer := &http.Server{ + Addr: fmt.Sprintf(":%d", args.port), + ReadHeaderTimeout: time.Minute, + Handler: mux, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var g run.Group + // Signal handler + g.Add(run.SignalHandler(ctx, os.Interrupt, syscall.SIGTERM)) + // Prometheus target + g.Add( + func() error { + log.Infof("Starting Node Resource Exporter on port %d", args.port) + return promServer.ListenAndServe() + }, + func(err error) { + log.Infof("Stopping Node Resource Exporter: %v", err) + if err := promServer.Shutdown(ctx); err != nil { + log.Infof("Error during server shutdown: %v", err) + } + log.Infof("Stopped Node Resource Exporter") + }) + // Resource sampling loop + g.Add( + func() error { + log.Infof("Starting sampling loop") + return startResourceSamplingLoop(ctx, kubeClient, args.namespace, strings.Split(args.resources, ","), nodeLabels) + }, + func(err error) { + log.Infof("Stopping sampling loop: %v", err) + cancel() + log.Infof("Stopped sampling loop") + }) + + return g.Run() +} + +func startResourceSamplingLoop(ctx context.Context, kubeClient *kubernetes.Clientset, namespace string, resources, nodeLabels []string) error { + defer log.Infof("Exited sampling loop") + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + metrics.ReportResourceUsage(ctx, kubeClient, namespace, resources, nodeLabels) + + case <-ctx.Done(): + return ctx.Err() + } + } +} diff --git a/docs/examples/knavigator/test-job.yml b/docs/examples/knavigator/test-job.yml index 2fe0a71..96ed0cc 100644 --- a/docs/examples/knavigator/test-job.yml +++ b/docs/examples/knavigator/test-job.yml @@ -9,7 +9,7 @@ spec: serviceAccountName: knavigator containers: - name: knavigator - image: ghcr.io/nvidia/knavigator:3527c8c + image: ghcr.io/nvidia/knavigator:ff4aeb1 imagePullPolicy: Always command: - /usr/local/bin/knavigator diff --git a/go.mod b/go.mod index 50d0502..bb684d3 100644 --- a/go.mod +++ b/go.mod @@ -1,11 +1,12 @@ module github.com/NVIDIA/knavigator -go 1.21 +go 1.23.3 require ( github.com/maja42/goval v1.4.0 github.com/oklog/run v1.1.0 - github.com/stretchr/testify v1.8.4 + github.com/prometheus/client_golang v1.20.5 + github.com/stretchr/testify v1.9.0 gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.29.3 k8s.io/apimachinery v0.29.3 @@ -15,6 +16,8 @@ require ( ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/go-logr/logr v1.4.1 // indirect @@ -30,20 +33,23 @@ require ( github.com/imdario/mergo v0.3.11 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/net v0.23.0 // indirect - golang.org/x/oauth2 v0.10.0 // indirect - golang.org/x/sys v0.18.0 // indirect - golang.org/x/term v0.18.0 // indirect - golang.org/x/text v0.14.0 // indirect + golang.org/x/net v0.26.0 // indirect + golang.org/x/oauth2 v0.21.0 // indirect + golang.org/x/sys v0.22.0 // indirect + golang.org/x/term v0.21.0 // indirect + golang.org/x/text v0.16.0 // indirect golang.org/x/time v0.3.0 // indirect - google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.33.0 // indirect + google.golang.org/protobuf v1.34.2 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect diff --git a/go.sum b/go.sum index 02bd04b..87fd692 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -18,7 +22,6 @@ github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEe github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= @@ -41,6 +44,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -48,6 +53,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/maja42/goval v1.4.0 h1:tlX0X+GvjKzWW2Q6qzWwL4Av2KV1bLtzxwzgxiiwEPc= @@ -69,6 +76,14 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= @@ -80,8 +95,8 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -90,45 +105,41 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= -golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= -golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= -golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= -golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= -golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/pkg/engine/check_object_task.go b/pkg/engine/check_object_task.go index 597b6f7..49949ca 100644 --- a/pkg/engine/check_object_task.go +++ b/pkg/engine/check_object_task.go @@ -127,7 +127,7 @@ func (task *CheckObjTask) Exec(ctx context.Context) error { func (task *CheckObjTask) checkStates(ctx context.Context, info *ObjInfo, nameMap *utils.SyncMap) error { for _, name := range info.Names { if err := task.checkState(ctx, name, info, nameMap); err != nil { - log.V(4).Infof(err.Error()) + log.V(4).Info(err.Error()) } } @@ -141,7 +141,7 @@ func (task *CheckObjTask) checkStates(ctx context.Context, info *ObjInfo, nameMa func (task *CheckObjTask) checkStateAsync(ctx context.Context, name string, info *ObjInfo, nameMap *utils.SyncMap, done chan struct{}) { if err := task.checkState(ctx, name, info, nameMap); err != nil { - log.V(4).Infof(err.Error()) + log.V(4).Info(err.Error()) return } diff --git a/pkg/engine/configure_task.go b/pkg/engine/configure_task.go index d2133fa..87a2c35 100644 --- a/pkg/engine/configure_task.go +++ b/pkg/engine/configure_task.go @@ -452,7 +452,7 @@ func runCommand(ctx context.Context, exe string, args []string) error { return err } - log.V(4).Infof(stdout.String()) + log.V(4).Info(stdout.String()) return nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 0000000..56bb863 --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + nodeResourceRequests *prometheus.GaugeVec + nodeResourceLimits *prometheus.GaugeVec + nodeResourceOccupancy *prometheus.GaugeVec +) + +func New(nodeLabels []string) { + labels := append([]string{"node", "resource"}, nodeLabels...) + + nodeResourceRequests = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_resource_requests", + Help: "Gauge of node resource requests.", + }, labels) + + nodeResourceLimits = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_resource_limits", + Help: "Gauge of node resource limits.", + }, labels) + + nodeResourceOccupancy = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_resource_occupancy", + Help: "Occupancy percentage of node resource.", + }, labels) +} diff --git a/pkg/metrics/report.go b/pkg/metrics/report.go new file mode 100644 index 0000000..321ca9e --- /dev/null +++ b/pkg/metrics/report.go @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package metrics + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + log "k8s.io/klog/v2" +) + +type resourceList struct { + requests corev1.ResourceList + limits corev1.ResourceList +} + +func ReportResourceUsage(ctx context.Context, client *kubernetes.Clientset, namespace string, resources, nodeLabels []string) { + start := time.Now() + nodeResources, err := getNodeResourceMap(ctx, client, namespace) + if err != nil { + log.Infof("ERROR: %v", err) + return + } + + nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + log.Infof("ERROR: failed to list the nodes: %v", err) + return + } + + empty := &resourceList{ + requests: make(corev1.ResourceList), + limits: make(corev1.ResourceList), + } + + for _, node := range nodes.Items { + nodeLabelValues := make([]string, len(nodeLabels)) + for i, name := range nodeLabels { + nodeLabelValues[i] = node.Labels[name] + } + + list, ok := nodeResources[node.Name] + if !ok { + list = empty + } + + var val float64 + for _, resource := range resources { + labelValues := append([]string{node.Name, resource}, nodeLabelValues...) + // get resource requests + if v, ok := list.requests[corev1.ResourceName(resource)]; ok { + val = v.AsApproximateFloat64() + } else { + val = 0 + } + nodeResourceRequests.WithLabelValues(labelValues...).Set(val) + // get resource usage in percents + if v, ok := node.Status.Allocatable[corev1.ResourceName(resource)]; ok { + if allocatable := v.AsApproximateFloat64(); allocatable > 0 { + occ := val / allocatable + + log.V(4).InfoS("metrics", "node", node.Name, "resource", resource, "allocatable", allocatable, "requests", val, "occupancy", occ) + nodeResourceOccupancy.WithLabelValues(labelValues...).Set(occ * 100.0) + } + } + // get resource limits + if v, ok := list.limits[corev1.ResourceName(resource)]; ok { + val = v.AsApproximateFloat64() + } else { + val = 0 + } + nodeResourceLimits.WithLabelValues(labelValues...).Set(val) + } + } + log.V(4).Infof("Reporting cycle took %s", time.Since(start).String()) +} + +func getNodeResourceMap(ctx context.Context, kubeClient *kubernetes.Clientset, namespace string) (map[string]*resourceList, error) { + pods, err := kubeClient.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list the pods: %v", err) + } + log.V(4).Infof("Found %d pods in %q namespace", len(pods.Items), namespace) + + nodeResources := make(map[string]*resourceList) + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + continue + } + list, ok := nodeResources[pod.Spec.NodeName] + if !ok { + list = &resourceList{ + requests: make(corev1.ResourceList), + limits: make(corev1.ResourceList), + } + nodeResources[pod.Spec.NodeName] = list + } + for _, container := range pod.Spec.Containers { + addResourceList(list.requests, container.Resources.Requests) + addResourceList(list.limits, container.Resources.Limits) + } + } + + log.V(4).Infof("Created resource map for %d nodes", len(nodeResources)) + return nodeResources, nil +} + +func addResourceList(total, addition corev1.ResourceList) { + for resourceName, quantity := range addition { + if curr, found := total[resourceName]; found { + curr.Add(quantity) + total[resourceName] = curr + } else { + total[resourceName] = quantity.DeepCopy() + } + } +} diff --git a/scripts/env.sh b/scripts/env.sh index 4a5c690..c3bb004 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -109,6 +109,12 @@ function deploy_prometheus() { --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=600s + + printGreen Deploying Node Resource Exporter + + helm upgrade --install -n monitoring node-resource-exporter --wait $REPO_HOME/charts/node-resource-exporter + + kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/name=node-resource-exporter --timeout=600s } # Tested workload managers