Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce handling of missing metrics #6

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
## Ignore Binary
nvidia-exporter
15 changes: 15 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module github.com/BugRoger/nvidia-exporter

go 1.13

require (
github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a
github.com/golang/protobuf v1.0.0
github.com/matttproud/golang_protobuf_extensions v1.0.0
github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37
github.com/prometheus/client_golang v0.8.0
github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5
github.com/prometheus/common v0.0.0-20180110214958-89604d197083
github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7
github.com/sirupsen/logrus v1.4.2
)
25 changes: 25 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a h1:BtpsbiV638WQZwhA98cEZw2BsbnQJrbd0BI7tsy0W1c=
github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/golang/protobuf v1.0.0 h1:lsek0oXi8iFE9L+EXARyHIjU5rlWIhhTkjDz3vHhWWQ=
github.com/golang/protobuf v1.0.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/matttproud/golang_protobuf_extensions v1.0.0 h1:YNOwxxSJzSUARoD9KRZLzM9Y858MNGCOACTvCW9TSAc=
github.com/matttproud/golang_protobuf_extensions v1.0.0/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37 h1:5rdLUt9OvtxZzyqcktcDDpu4t7uHWGXU7jsvrG3b8cM=
github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37/go.mod h1:2eu9pRWp8mo84xCg6KswZ+USQHjwgRhNp06sozOdsTY=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.8.0 h1:1921Yw9Gc3iSc4VQh3PIoOqgPCZS7G/4xQNVUp8Mda8=
github.com/prometheus/client_golang v0.8.0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5 h1:cLL6NowurKLMfCeQy4tIeph12XNQWgANCNvdyrOYKV4=
github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/common v0.0.0-20180110214958-89604d197083 h1:BVsJT8+ZbyuL3hypz/HmEiM8h2P6hBQGig4el9/MdjA=
github.com/prometheus/common v0.0.0-20180110214958-89604d197083/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7 h1:hhvfGDVThBnd4kYisSFmYuHYeUhglxcwag7FhVPH9zM=
github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
golang.org/x/sys v0.0.0-20190422165155-953cdadca894 h1:Cz4ceDQGXuKRnVBDTS23GTn/pU5OE2C0WrNTOYK1Uuc=
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
81 changes: 63 additions & 18 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@ package main

import (
"flag"
"fmt"
"log"
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
log "github.com/sirupsen/logrus"
)

const (
namespace = "nvidia"
)
const namespace = "nvidia"

type Exporter struct {
up prometheus.Gauge
Expand All @@ -32,27 +29,46 @@ type Exporter struct {

func main() {
var (
listenAddress = flag.String("web.listen-address", ":9401", "Address to listen on for web interface and telemetry.")
level = flag.String("log.level", "info", "Set the output log level")
listenAddress = flag.String("web.listen-address", "0.0.0.0:9401", "Address to listen on for web interface and telemetry.")
metricsPath = flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics.")
)
flag.Parse()
setLogLevel(*level)

prometheus.MustRegister(NewExporter())

http.Handle("/metrics", promhttp.Handler())
http.Handle(*metricsPath, promhttp.Handler())
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(`<html>
<head><title>NVML Exporter</title></head>
<body>
<h1>NVML Exporter</h1>
<p><a href='` + *metricsPath + `'>Metrics</a></p>
<h2>More information:</h2>
<p><a href="https://github.com/BugRoger/nvidia-exporter">github.com/BugRoger/nvidia-exporter</a></p>
</body>
</html>`))
})
fmt.Println("Starting HTTP server on", *listenAddress)
log.Infof("Starting HTTP server on %s", *listenAddress)
log.Fatal(http.ListenAndServe(*listenAddress, nil))
}

// setLogLevel configures the minimum severity emitted by the logrus logger
// from the textual level name supplied on the command line.
//
// The name is handed to logrus' own ParseLevel instead of a hand-written
// switch, so every level name logrus understands is accepted rather than only
// error/warn/info/debug. When the name cannot be parsed, a warning is logged
// and the level is set to info explicitly, so the warning text stays true
// whatever level the logger had before this call.
func setLogLevel(level string) {
	parsed, err := log.ParseLevel(level)
	if err != nil {
		log.Warnln("Unrecognized minimum log level; using 'info' as default")
		parsed = log.InfoLevel
	}
	log.SetLevel(parsed)
}

func NewExporter() *Exporter {
return &Exporter{
up: prometheus.NewGauge(
Expand Down Expand Up @@ -160,10 +176,21 @@ func NewExporter() *Exporter {
}
}

// checkMetric reports whether value is a usable metric reading.
//
// checkError substitutes -1 for any reading that could not be collected, so
// a negative value marks the metric as missing and the caller should not
// export it. Zero and positive values are valid. NaN compares false against
// zero and is therefore rejected as well.
func checkMetric(value float64) bool {
	return value >= 0
}

func (e *Exporter) Collect(metrics chan<- prometheus.Metric) {
data, err := collectMetrics()
if err != nil {
log.Printf("Failed to collect metrics: %s\n", err)
log.Errorf("Failed to collect metrics: %s", err)
e.up.Set(0)
e.up.Collect(metrics)
return
Expand All @@ -176,15 +203,33 @@ func (e *Exporter) Collect(metrics chan<- prometheus.Metric) {
for i := 0; i < len(data.Devices); i++ {
d := data.Devices[i]
e.deviceInfo.WithLabelValues(d.Index, d.MinorNumber, d.Name, d.UUID).Set(1)
e.fanSpeed.WithLabelValues(d.MinorNumber).Set(d.FanSpeed)
e.memoryTotal.WithLabelValues(d.MinorNumber).Set(d.MemoryTotal)
e.memoryUsed.WithLabelValues(d.MinorNumber).Set(d.MemoryUsed)
e.powerUsage.WithLabelValues(d.MinorNumber).Set(d.PowerUsage)
e.powerUsageAverage.WithLabelValues(d.MinorNumber).Set(d.PowerUsageAverage)
e.temperatures.WithLabelValues(d.MinorNumber).Set(d.Temperature)
e.utilizationGPU.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPU)
e.utilizationGPUAverage.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPUAverage)
e.utilizationMemory.WithLabelValues(d.MinorNumber).Set(d.UtilizationMemory)
if checkMetric(d.FanSpeed) {
e.fanSpeed.WithLabelValues(d.MinorNumber).Set(d.FanSpeed)
}
if checkMetric(d.MemoryTotal) {
e.memoryTotal.WithLabelValues(d.MinorNumber).Set(d.MemoryTotal)
}
if checkMetric(d.MemoryUsed) {
e.memoryUsed.WithLabelValues(d.MinorNumber).Set(d.MemoryUsed)
}
if checkMetric(d.PowerUsage) {
e.powerUsage.WithLabelValues(d.MinorNumber).Set(d.PowerUsage)
}
if checkMetric(d.PowerUsageAverage) {
e.powerUsageAverage.WithLabelValues(d.MinorNumber).Set(d.PowerUsageAverage)
}
if checkMetric(d.Temperature) {
e.temperatures.WithLabelValues(d.MinorNumber).Set(d.Temperature)
}
if checkMetric(d.UtilizationGPU) {
e.utilizationGPU.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPU)
}
if checkMetric(d.UtilizationGPUAverage) {
e.utilizationGPUAverage.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPUAverage)
}
if checkMetric(d.UtilizationMemory) {
e.utilizationMemory.WithLabelValues(d.MinorNumber).Set(d.UtilizationMemory)
}
}

e.deviceCount.Collect(metrics)
Expand Down
91 changes: 49 additions & 42 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@ import (
"time"

"github.com/mindprince/gonvml"
log "github.com/sirupsen/logrus"
)

var (
averageDuration = 10 * time.Second
)
var averageDuration = 10 * time.Second

type Metrics struct {
Version string
Expand All @@ -34,13 +33,16 @@ type Device struct {

func collectMetrics() (*Metrics, error) {
if err := gonvml.Initialize(); err != nil {
log.Errorf("Failed to initialize gonvml.")
// Return out, since this failure to initialize
// will prevent collection.
return nil, err
}
defer gonvml.Shutdown()

version, err := gonvml.SystemDriverVersion()
if err != nil {
return nil, err
log.Warnf("Failed to get SystemDriverVersion.")
}

metrics := &Metrics{
Expand All @@ -49,82 +51,87 @@ func collectMetrics() (*Metrics, error) {

numDevices, err := gonvml.DeviceCount()
if err != nil {
log.Errorf("Failed to get DeviceCount")
// Return out, since this failure to obtain
// device count will prevent collection.
return nil, err
}

for index := 0; index < int(numDevices); index++ {
device, err := gonvml.DeviceHandleByIndex(uint(index))
if err != nil {
log.Errorf("Failed to get DeviceHandleByIndex")
// Return out, since this failure to obtain
// DeviceHandleByIndex will prevent collection.
return nil, err
}

uuid, err := device.UUID()
if err != nil {
log.Errorf("Failed to get deviceUUID")
// Return out, since a failure to get
// this value is likely indicative
// of a larger problem.
return nil, err
}

name, err := device.Name()
if err != nil {
log.Errorf("Failed to get deviceName")
// Return out, since a failure to get
// this value is likely indicative
// of a larger problem.
return nil, err
}

minorNumber, err := device.MinorNumber()
if err != nil {
log.Errorf("Failed to get MinorNumber")
// Return out, since this failure to obtain
// MinorNumber will potentially cause conflicts.
return nil, err
}

temperature, err := device.Temperature()
if err != nil {
return nil, err
}
temperature, temperatureErr := device.Temperature()

powerUsage, err := device.PowerUsage()
if err != nil {
return nil, err
}
powerUsage, powerUsageErr := device.PowerUsage()

powerUsageAverage, err := device.AveragePowerUsage(averageDuration)
if err != nil {
return nil, err
}
powerUsageAverage, powerUsageAverageErr := device.AveragePowerUsage(averageDuration)

fanSpeed, err := device.FanSpeed()
if err != nil {
return nil, err
}
fanSpeed, fanSpeedErr := device.FanSpeed()

memoryTotal, memoryUsed, err := device.MemoryInfo()
if err != nil {
return nil, err
}
memoryTotal, memoryUsed, memoryInfoErr := device.MemoryInfo()

utilizationGPU, utilizationMemory, err := device.UtilizationRates()
if err != nil {
return nil, err
}
utilizationGPU, utilizationMemory, utilizationRatesErr := device.UtilizationRates()

utilizationGPUAverage, err := device.AverageGPUUtilization(averageDuration)
if err != nil {
return nil, err
}
utilizationGPUAverage, utilizationGPUAverageErr := device.AverageGPUUtilization(averageDuration)

metrics.Devices = append(metrics.Devices,
&Device{
Index: strconv.Itoa(index),
MinorNumber: strconv.Itoa(int(minorNumber)),
Name: name,
UUID: uuid,
Temperature: float64(temperature),
PowerUsage: float64(powerUsage),
PowerUsageAverage: float64(powerUsageAverage),
FanSpeed: float64(fanSpeed),
MemoryTotal: float64(memoryTotal),
MemoryUsed: float64(memoryUsed),
UtilizationMemory: float64(utilizationMemory),
UtilizationGPU: float64(utilizationGPU),
UtilizationGPUAverage: float64(utilizationGPUAverage),
Temperature: checkError(temperatureErr, float64(temperature), index, "Temperature"),
PowerUsage: checkError(powerUsageErr, float64(powerUsage), index, "PowerUsage"),
PowerUsageAverage: checkError(powerUsageAverageErr, float64(powerUsageAverage), index, "PowerUsageAverage"),
FanSpeed: checkError(fanSpeedErr, float64(fanSpeed), index, "FanSpeed"),
MemoryTotal: checkError(memoryInfoErr, float64(memoryTotal), index, "MemoryTotal"),
MemoryUsed: checkError(memoryInfoErr, float64(memoryUsed), index, "MemoryUsed"),
UtilizationMemory: checkError(utilizationRatesErr, float64(utilizationMemory), index, "UtilizationMemory"),
UtilizationGPU: checkError(utilizationRatesErr, float64(utilizationGPU), index, "UtilizationGPU"),
UtilizationGPUAverage: checkError(utilizationGPUAverageErr, float64(utilizationGPUAverage), index, "UtilizationGPUAverage"),
})
}

return metrics, nil
}

// checkError converts the outcome of a single metric query into the value
// stored on the Device record.
//
// When err is nil the collected value is passed through unchanged. Otherwise
// the failure is logged at debug level together with the metric name and the
// device index, and the sentinel -1 is returned in place of the reading.
func checkError(err error, value float64, index int, metric string) float64 {
	// Sentinel handed back for a reading that could not be collected.
	const missing = -1

	if err == nil {
		return value
	}

	log.Debugf("Unable to collect metrics for %s for device %d: %s", metric, index, err)
	return missing
}