diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..89b6556 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +## Ignore Binary +nvidia-exporter \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..64bd244 --- /dev/null +++ b/go.mod @@ -0,0 +1,15 @@ +module github.com/BugRoger/nvidia-exporter + +go 1.13 + +require ( + github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a + github.com/golang/protobuf v1.0.0 + github.com/matttproud/golang_protobuf_extensions v1.0.0 + github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37 + github.com/prometheus/client_golang v0.8.0 + github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5 + github.com/prometheus/common v0.0.0-20180110214958-89604d197083 + github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7 + github.com/sirupsen/logrus v1.4.2 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..dee111e --- /dev/null +++ b/go.sum @@ -0,0 +1,25 @@ +github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a h1:BtpsbiV638WQZwhA98cEZw2BsbnQJrbd0BI7tsy0W1c= +github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang/protobuf v1.0.0 h1:lsek0oXi8iFE9L+EXARyHIjU5rlWIhhTkjDz3vHhWWQ= +github.com/golang/protobuf v1.0.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/matttproud/golang_protobuf_extensions v1.0.0 h1:YNOwxxSJzSUARoD9KRZLzM9Y858MNGCOACTvCW9TSAc= +github.com/matttproud/golang_protobuf_extensions v1.0.0/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= +github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37 h1:5rdLUt9OvtxZzyqcktcDDpu4t7uHWGXU7jsvrG3b8cM= +github.com/mindprince/gonvml 
v0.0.0-20180111080136-eea82dc7bb37/go.mod h1:2eu9pRWp8mo84xCg6KswZ+USQHjwgRhNp06sozOdsTY= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v0.8.0 h1:1921Yw9Gc3iSc4VQh3PIoOqgPCZS7G/4xQNVUp8Mda8= +github.com/prometheus/client_golang v0.8.0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= +github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5 h1:cLL6NowurKLMfCeQy4tIeph12XNQWgANCNvdyrOYKV4= +github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= +github.com/prometheus/common v0.0.0-20180110214958-89604d197083 h1:BVsJT8+ZbyuL3hypz/HmEiM8h2P6hBQGig4el9/MdjA= +github.com/prometheus/common v0.0.0-20180110214958-89604d197083/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= +github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7 h1:hhvfGDVThBnd4kYisSFmYuHYeUhglxcwag7FhVPH9zM= +github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= +github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894 h1:Cz4ceDQGXuKRnVBDTS23GTn/pU5OE2C0WrNTOYK1Uuc= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/main.go b/main.go index 7e4b1ad..c7332d2 100644 --- a/main.go +++ b/main.go @@ -2,17 +2,14 @@ package main import ( "flag" - "fmt" - "log" "net/http" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + log "github.com/sirupsen/logrus" ) -const ( - namespace = "nvidia" -) +const 
namespace = "nvidia" type Exporter struct { up prometheus.Gauge @@ -32,27 +29,46 @@ type Exporter struct { func main() { var ( - listenAddress = flag.String("web.listen-address", ":9401", "Address to listen on for web interface and telemetry.") + level = flag.String("log.level", "info", "Set the output log level") + listenAddress = flag.String("web.listen-address", "0.0.0.0:9401", "Address to listen on for web interface and telemetry.") metricsPath = flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics.") ) flag.Parse() + setLogLevel(*level) prometheus.MustRegister(NewExporter()) - http.Handle("/metrics", promhttp.Handler()) + http.Handle(*metricsPath, promhttp.Handler()) http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { w.Write([]byte(` NVML Exporter

NVML Exporter

Metrics

+

More information:

+

github.com/BugRoger/nvidia-exporter

`)) }) - fmt.Println("Starting HTTP server on", *listenAddress) + log.Infof("Starting HTTP server on %s", *listenAddress) log.Fatal(http.ListenAndServe(*listenAddress, nil)) } +func setLogLevel(level string) { + switch level { + case "error": + log.SetLevel(log.ErrorLevel) + case "warn": + log.SetLevel(log.WarnLevel) + case "info": + log.SetLevel(log.InfoLevel) + case "debug": + log.SetLevel(log.DebugLevel) + default: + log.Warnln("Unrecognized minimum log level; using 'info' as default") + } +} + func NewExporter() *Exporter { return &Exporter{ up: prometheus.NewGauge( @@ -160,10 +176,21 @@ func NewExporter() *Exporter { } } +// This function is used to check if metric +// value is valid; we expect nothing less than 0 +// gonvml returns uint data type +func checkMetric(value float64) bool { + if value < 0 { + return false + } else { + return true + } +} + func (e *Exporter) Collect(metrics chan<- prometheus.Metric) { data, err := collectMetrics() if err != nil { - log.Printf("Failed to collect metrics: %s\n", err) + log.Errorf("Failed to collect metrics: %s", err) e.up.Set(0) e.up.Collect(metrics) return @@ -176,15 +203,33 @@ func (e *Exporter) Collect(metrics chan<- prometheus.Metric) { for i := 0; i < len(data.Devices); i++ { d := data.Devices[i] e.deviceInfo.WithLabelValues(d.Index, d.MinorNumber, d.Name, d.UUID).Set(1) - e.fanSpeed.WithLabelValues(d.MinorNumber).Set(d.FanSpeed) - e.memoryTotal.WithLabelValues(d.MinorNumber).Set(d.MemoryTotal) - e.memoryUsed.WithLabelValues(d.MinorNumber).Set(d.MemoryUsed) - e.powerUsage.WithLabelValues(d.MinorNumber).Set(d.PowerUsage) - e.powerUsageAverage.WithLabelValues(d.MinorNumber).Set(d.PowerUsageAverage) - e.temperatures.WithLabelValues(d.MinorNumber).Set(d.Temperature) - e.utilizationGPU.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPU) - e.utilizationGPUAverage.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPUAverage) - e.utilizationMemory.WithLabelValues(d.MinorNumber).Set(d.UtilizationMemory) + if 
checkMetric(d.FanSpeed) { + e.fanSpeed.WithLabelValues(d.MinorNumber).Set(d.FanSpeed) + } + if checkMetric(d.MemoryTotal) { + e.memoryTotal.WithLabelValues(d.MinorNumber).Set(d.MemoryTotal) + } + if checkMetric(d.MemoryUsed) { + e.memoryUsed.WithLabelValues(d.MinorNumber).Set(d.MemoryUsed) + } + if checkMetric(d.PowerUsage) { + e.powerUsage.WithLabelValues(d.MinorNumber).Set(d.PowerUsage) + } + if checkMetric(d.PowerUsageAverage) { + e.powerUsageAverage.WithLabelValues(d.MinorNumber).Set(d.PowerUsageAverage) + } + if checkMetric(d.Temperature) { + e.temperatures.WithLabelValues(d.MinorNumber).Set(d.Temperature) + } + if checkMetric(d.UtilizationGPU) { + e.utilizationGPU.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPU) + } + if checkMetric(d.UtilizationGPUAverage) { + e.utilizationGPUAverage.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPUAverage) + } + if checkMetric(d.UtilizationMemory) { + e.utilizationMemory.WithLabelValues(d.MinorNumber).Set(d.UtilizationMemory) + } } e.deviceCount.Collect(metrics) diff --git a/metrics.go b/metrics.go index da4f9fe..f55e79c 100644 --- a/metrics.go +++ b/metrics.go @@ -5,11 +5,10 @@ import ( "time" "github.com/mindprince/gonvml" + log "github.com/sirupsen/logrus" ) -var ( - averageDuration = 10 * time.Second -) +var averageDuration = 10 * time.Second type Metrics struct { Version string @@ -34,13 +33,16 @@ type Device struct { func collectMetrics() (*Metrics, error) { if err := gonvml.Initialize(); err != nil { + log.Errorf("Failed to initialize gonvml.") + // Return out, since this failure to initialize + // will prevent collection. 
return nil, err } defer gonvml.Shutdown() version, err := gonvml.SystemDriverVersion() if err != nil { - return nil, err + log.Warnf("Failed to get SystemDriverVersion.") } metrics := &Metrics{ @@ -49,64 +51,60 @@ func collectMetrics() (*Metrics, error) { numDevices, err := gonvml.DeviceCount() if err != nil { + log.Errorf("Failed to get DeviceCount") + // Return out, since this failure to obtain + // device count will prevent collection. return nil, err } for index := 0; index < int(numDevices); index++ { device, err := gonvml.DeviceHandleByIndex(uint(index)) if err != nil { + log.Errorf("Failed to get DeviceHandleByIndex") + // Return out, since this failure to obtain + // DeviceHandleByIndex will prevent collection. return nil, err } uuid, err := device.UUID() if err != nil { + log.Errorf("Failed to get deviceUUID") + // Return out, since a failure to obtain + // the device UUID likely indicates + // a problem. return nil, err } name, err := device.Name() if err != nil { + log.Errorf("Failed to get deviceName") + // Return out, since a failure to obtain + // the device name likely indicates + // a problem. return nil, err } minorNumber, err := device.MinorNumber() if err != nil { + log.Errorf("Failed to get MinorNumber") + // Return out, since this failure to obtain + // MinorNumber will potentially cause conflicts. 
return nil, err } - temperature, err := device.Temperature() - if err != nil { - return nil, err - } + temperature, temperatureErr := device.Temperature() - powerUsage, err := device.PowerUsage() - if err != nil { - return nil, err - } + powerUsage, powerUsageErr := device.PowerUsage() - powerUsageAverage, err := device.AveragePowerUsage(averageDuration) - if err != nil { - return nil, err - } + powerUsageAverage, powerUsageAverageErr := device.AveragePowerUsage(averageDuration) - fanSpeed, err := device.FanSpeed() - if err != nil { - return nil, err - } + fanSpeed, fanSpeedErr := device.FanSpeed() - memoryTotal, memoryUsed, err := device.MemoryInfo() - if err != nil { - return nil, err - } + memoryTotal, memoryUsed, memoryInfoErr := device.MemoryInfo() - utilizationGPU, utilizationMemory, err := device.UtilizationRates() - if err != nil { - return nil, err - } + utilizationGPU, utilizationMemory, utilizationRatesErr := device.UtilizationRates() - utilizationGPUAverage, err := device.AverageGPUUtilization(averageDuration) - if err != nil { - return nil, err - } + utilizationGPUAverage, utilizationGPUAverageErr := device.AverageGPUUtilization(averageDuration) metrics.Devices = append(metrics.Devices, &Device{ @@ -114,17 +112,26 @@ func collectMetrics() (*Metrics, error) { MinorNumber: strconv.Itoa(int(minorNumber)), Name: name, UUID: uuid, - Temperature: float64(temperature), - PowerUsage: float64(powerUsage), - PowerUsageAverage: float64(powerUsageAverage), - FanSpeed: float64(fanSpeed), - MemoryTotal: float64(memoryTotal), - MemoryUsed: float64(memoryUsed), - UtilizationMemory: float64(utilizationMemory), - UtilizationGPU: float64(utilizationGPU), - UtilizationGPUAverage: float64(utilizationGPUAverage), + Temperature: checkError(temperatureErr, float64(temperature), index, "Temperature"), + PowerUsage: checkError(powerUsageErr, float64(powerUsage), index, "PowerUsage"), + PowerUsageAverage: checkError(powerUsageAverageErr, float64(powerUsageAverage), index, 
"PowerUsageAverage"), + FanSpeed: checkError(fanSpeedErr, float64(fanSpeed), index, "FanSpeed"), + MemoryTotal: checkError(memoryInfoErr, float64(memoryTotal), index, "MemoryTotal"), + MemoryUsed: checkError(memoryInfoErr, float64(memoryUsed), index, "MemoryUsed"), + UtilizationMemory: checkError(utilizationRatesErr, float64(utilizationMemory), index, "UtilizationMemory"), + UtilizationGPU: checkError(utilizationRatesErr, float64(utilizationGPU), index, "UtilizationGPU"), + UtilizationGPUAverage: checkError(utilizationGPUAverageErr, float64(utilizationGPUAverage), index, "UtilizationGPUAverage"), }) } - return metrics, nil } + +// This function is used to check if error is returned +// if so set float64 to -1 +func checkError(err error, value float64, index int, metric string) float64 { + if err != nil { + log.Debugf("Unable to collect metrics for %s for device %d: %s", metric, index, err) + return -1 + } + return value +}