diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..89b6556
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+## Ignore Binary
+nvidia-exporter
\ No newline at end of file
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..64bd244
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,15 @@
+module github.com/BugRoger/nvidia-exporter
+
+go 1.13
+
+require (
+ github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a
+ github.com/golang/protobuf v1.0.0
+ github.com/matttproud/golang_protobuf_extensions v1.0.0
+ github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37
+ github.com/prometheus/client_golang v0.8.0
+ github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5
+ github.com/prometheus/common v0.0.0-20180110214958-89604d197083
+ github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7
+ github.com/sirupsen/logrus v1.4.2
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..dee111e
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,25 @@
+github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a h1:BtpsbiV638WQZwhA98cEZw2BsbnQJrbd0BI7tsy0W1c=
+github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/golang/protobuf v1.0.0 h1:lsek0oXi8iFE9L+EXARyHIjU5rlWIhhTkjDz3vHhWWQ=
+github.com/golang/protobuf v1.0.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
+github.com/matttproud/golang_protobuf_extensions v1.0.0 h1:YNOwxxSJzSUARoD9KRZLzM9Y858MNGCOACTvCW9TSAc=
+github.com/matttproud/golang_protobuf_extensions v1.0.0/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
+github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37 h1:5rdLUt9OvtxZzyqcktcDDpu4t7uHWGXU7jsvrG3b8cM=
+github.com/mindprince/gonvml v0.0.0-20180111080136-eea82dc7bb37/go.mod h1:2eu9pRWp8mo84xCg6KswZ+USQHjwgRhNp06sozOdsTY=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v0.8.0 h1:1921Yw9Gc3iSc4VQh3PIoOqgPCZS7G/4xQNVUp8Mda8=
+github.com/prometheus/client_golang v0.8.0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
+github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5 h1:cLL6NowurKLMfCeQy4tIeph12XNQWgANCNvdyrOYKV4=
+github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
+github.com/prometheus/common v0.0.0-20180110214958-89604d197083 h1:BVsJT8+ZbyuL3hypz/HmEiM8h2P6hBQGig4el9/MdjA=
+github.com/prometheus/common v0.0.0-20180110214958-89604d197083/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
+github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7 h1:hhvfGDVThBnd4kYisSFmYuHYeUhglxcwag7FhVPH9zM=
+github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
+github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
+github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
+github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+golang.org/x/sys v0.0.0-20190422165155-953cdadca894 h1:Cz4ceDQGXuKRnVBDTS23GTn/pU5OE2C0WrNTOYK1Uuc=
+golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
diff --git a/main.go b/main.go
index 7e4b1ad..c7332d2 100644
--- a/main.go
+++ b/main.go
@@ -2,17 +2,14 @@ package main
import (
"flag"
- "fmt"
- "log"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
+ log "github.com/sirupsen/logrus"
)
-const (
- namespace = "nvidia"
-)
+const namespace = "nvidia"
type Exporter struct {
up prometheus.Gauge
@@ -32,27 +29,46 @@ type Exporter struct {
func main() {
var (
- listenAddress = flag.String("web.listen-address", ":9401", "Address to listen on for web interface and telemetry.")
+ level = flag.String("log.level", "info", "Set the output log level") // value is handed to setLogLevel after flag.Parse
+ listenAddress = flag.String("web.listen-address", "0.0.0.0:9401", "Address to listen on for web interface and telemetry.")
metricsPath = flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics.")
)
flag.Parse()
+ setLogLevel(*level) // apply the requested log level right after the flags are parsed
prometheus.MustRegister(NewExporter())
- http.Handle("/metrics", promhttp.Handler())
+ http.Handle(*metricsPath, promhttp.Handler()) // use the web.telemetry-path flag instead of a hard-coded "/metrics"
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(`
NVML Exporter
NVML Exporter
Metrics
+ More information:
+ github.com/BugRoger/nvidia-exporter
`))
})
- fmt.Println("Starting HTTP server on", *listenAddress)
+ log.Infof("Starting HTTP server on %s", *listenAddress)
log.Fatal(http.ListenAndServe(*listenAddress, nil))
}
+func setLogLevel(level string) { // setLogLevel maps the log.level flag value onto a logrus level
+ switch level {
+ case "error":
+ log.SetLevel(log.ErrorLevel)
+ case "warn":
+ log.SetLevel(log.WarnLevel)
+ case "info":
+ log.SetLevel(log.InfoLevel)
+ case "debug":
+ log.SetLevel(log.DebugLevel)
+ default: // any other value: only a warning is logged; SetLevel is not called, so the current level is kept
+ log.Warnln("Unrecognized minimum log level; using 'info' as default")
+ }
+}
+
func NewExporter() *Exporter {
return &Exporter{
up: prometheus.NewGauge(
@@ -160,10 +176,21 @@ func NewExporter() *Exporter {
}
}
+// checkMetric reports whether value is a usable reading, i.e. not negative.
+// gonvml returns unsigned values, so a negative number here is the -1
+// placeholder that checkError substitutes for a reading that failed to collect.
+func checkMetric(value float64) bool {
+ if value < 0 {
+ return false
+ } else {
+ return true
+ }
+}
+
func (e *Exporter) Collect(metrics chan<- prometheus.Metric) {
data, err := collectMetrics()
if err != nil {
- log.Printf("Failed to collect metrics: %s\n", err)
+ log.Errorf("Failed to collect metrics: %s", err)
e.up.Set(0)
e.up.Collect(metrics)
return
@@ -176,15 +203,33 @@ func (e *Exporter) Collect(metrics chan<- prometheus.Metric) {
for i := 0; i < len(data.Devices); i++ {
d := data.Devices[i]
e.deviceInfo.WithLabelValues(d.Index, d.MinorNumber, d.Name, d.UUID).Set(1)
- e.fanSpeed.WithLabelValues(d.MinorNumber).Set(d.FanSpeed)
- e.memoryTotal.WithLabelValues(d.MinorNumber).Set(d.MemoryTotal)
- e.memoryUsed.WithLabelValues(d.MinorNumber).Set(d.MemoryUsed)
- e.powerUsage.WithLabelValues(d.MinorNumber).Set(d.PowerUsage)
- e.powerUsageAverage.WithLabelValues(d.MinorNumber).Set(d.PowerUsageAverage)
- e.temperatures.WithLabelValues(d.MinorNumber).Set(d.Temperature)
- e.utilizationGPU.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPU)
- e.utilizationGPUAverage.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPUAverage)
- e.utilizationMemory.WithLabelValues(d.MinorNumber).Set(d.UtilizationMemory)
+ if checkMetric(d.FanSpeed) {
+ e.fanSpeed.WithLabelValues(d.MinorNumber).Set(d.FanSpeed)
+ }
+ if checkMetric(d.MemoryTotal) {
+ e.memoryTotal.WithLabelValues(d.MinorNumber).Set(d.MemoryTotal)
+ }
+ if checkMetric(d.MemoryUsed) {
+ e.memoryUsed.WithLabelValues(d.MinorNumber).Set(d.MemoryUsed)
+ }
+ if checkMetric(d.PowerUsage) {
+ e.powerUsage.WithLabelValues(d.MinorNumber).Set(d.PowerUsage)
+ }
+ if checkMetric(d.PowerUsageAverage) {
+ e.powerUsageAverage.WithLabelValues(d.MinorNumber).Set(d.PowerUsageAverage)
+ }
+ if checkMetric(d.Temperature) {
+ e.temperatures.WithLabelValues(d.MinorNumber).Set(d.Temperature)
+ }
+ if checkMetric(d.UtilizationGPU) {
+ e.utilizationGPU.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPU)
+ }
+ if checkMetric(d.UtilizationGPUAverage) {
+ e.utilizationGPUAverage.WithLabelValues(d.MinorNumber).Set(d.UtilizationGPUAverage)
+ }
+ if checkMetric(d.UtilizationMemory) {
+ e.utilizationMemory.WithLabelValues(d.MinorNumber).Set(d.UtilizationMemory)
+ }
}
e.deviceCount.Collect(metrics)
diff --git a/metrics.go b/metrics.go
index da4f9fe..f55e79c 100644
--- a/metrics.go
+++ b/metrics.go
@@ -5,11 +5,10 @@ import (
"time"
"github.com/mindprince/gonvml"
+ log "github.com/sirupsen/logrus"
)
-var (
- averageDuration = 10 * time.Second
-)
+var averageDuration = 10 * time.Second
type Metrics struct {
Version string
@@ -34,13 +33,16 @@ type Device struct {
func collectMetrics() (*Metrics, error) {
if err := gonvml.Initialize(); err != nil {
+ log.Errorf("Failed to initialize gonvml.")
+ // Return out, since this failure to initialize
+ // will prevent collection.
return nil, err
}
defer gonvml.Shutdown()
version, err := gonvml.SystemDriverVersion()
if err != nil {
- return nil, err
+ log.Warnf("Failed to get SystemDriverVersion.")
}
metrics := &Metrics{
@@ -49,64 +51,60 @@ func collectMetrics() (*Metrics, error) {
numDevices, err := gonvml.DeviceCount()
if err != nil {
+ log.Errorf("Failed to get DeviceCount")
+ // Return out, since this failure to obtain
+ // device count will prevent collection.
return nil, err
}
for index := 0; index < int(numDevices); index++ {
device, err := gonvml.DeviceHandleByIndex(uint(index))
if err != nil {
+ log.Errorf("Failed to get DeviceHandleByIndex")
+ // Return out, since this failure to obtain
+ // DeviceHandleByIndex will prevent collection.
return nil, err
}
uuid, err := device.UUID()
if err != nil {
+ log.Errorf("Failed to get deviceUUID")
+ // Return out, since a failure to obtain
+ // the device UUID is likely a sign of
+ // a larger problem.
return nil, err
}
name, err := device.Name()
if err != nil {
+ log.Errorf("Failed to get deviceName")
+ // Return out, since a failure to obtain
+ // the device name is likely a sign of
+ // a larger problem.
return nil, err
}
minorNumber, err := device.MinorNumber()
if err != nil {
+ log.Errorf("Failed to get MinorNumber")
+ // Return out, since a failure to obtain
+ // MinorNumber will potentially cause conflicts.
return nil, err
}
- temperature, err := device.Temperature()
- if err != nil {
- return nil, err
- }
+ temperature, temperatureErr := device.Temperature()
- powerUsage, err := device.PowerUsage()
- if err != nil {
- return nil, err
- }
+ powerUsage, powerUsageErr := device.PowerUsage()
- powerUsageAverage, err := device.AveragePowerUsage(averageDuration)
- if err != nil {
- return nil, err
- }
+ powerUsageAverage, powerUsageAverageErr := device.AveragePowerUsage(averageDuration)
- fanSpeed, err := device.FanSpeed()
- if err != nil {
- return nil, err
- }
+ fanSpeed, fanSpeedErr := device.FanSpeed()
- memoryTotal, memoryUsed, err := device.MemoryInfo()
- if err != nil {
- return nil, err
- }
+ memoryTotal, memoryUsed, memoryInfoErr := device.MemoryInfo()
- utilizationGPU, utilizationMemory, err := device.UtilizationRates()
- if err != nil {
- return nil, err
- }
+ utilizationGPU, utilizationMemory, utilizationRatesErr := device.UtilizationRates()
- utilizationGPUAverage, err := device.AverageGPUUtilization(averageDuration)
- if err != nil {
- return nil, err
- }
+ utilizationGPUAverage, utilizationGPUAverageErr := device.AverageGPUUtilization(averageDuration)
metrics.Devices = append(metrics.Devices,
&Device{
@@ -114,17 +112,26 @@ func collectMetrics() (*Metrics, error) {
MinorNumber: strconv.Itoa(int(minorNumber)),
Name: name,
UUID: uuid,
- Temperature: float64(temperature),
- PowerUsage: float64(powerUsage),
- PowerUsageAverage: float64(powerUsageAverage),
- FanSpeed: float64(fanSpeed),
- MemoryTotal: float64(memoryTotal),
- MemoryUsed: float64(memoryUsed),
- UtilizationMemory: float64(utilizationMemory),
- UtilizationGPU: float64(utilizationGPU),
- UtilizationGPUAverage: float64(utilizationGPUAverage),
+ Temperature: checkError(temperatureErr, float64(temperature), index, "Temperature"),
+ PowerUsage: checkError(powerUsageErr, float64(powerUsage), index, "PowerUsage"),
+ PowerUsageAverage: checkError(powerUsageAverageErr, float64(powerUsageAverage), index, "PowerUsageAverage"),
+ FanSpeed: checkError(fanSpeedErr, float64(fanSpeed), index, "FanSpeed"),
+ MemoryTotal: checkError(memoryInfoErr, float64(memoryTotal), index, "MemoryTotal"),
+ MemoryUsed: checkError(memoryInfoErr, float64(memoryUsed), index, "MemoryUsed"),
+ UtilizationMemory: checkError(utilizationRatesErr, float64(utilizationMemory), index, "UtilizationMemory"),
+ UtilizationGPU: checkError(utilizationRatesErr, float64(utilizationGPU), index, "UtilizationGPU"),
+ UtilizationGPUAverage: checkError(utilizationGPUAverageErr, float64(utilizationGPUAverage), index, "UtilizationGPUAverage"),
})
}
-
return metrics, nil
}
+
+// checkError returns value unchanged when err is nil; otherwise it logs the
+// failure for the given metric and device index at debug level and returns -1.
+func checkError(err error, value float64, index int, metric string) float64 {
+ if err != nil {
+ log.Debugf("Unable to collect metrics for %s for device %d: %s", metric, index, err)
+ return -1
+ }
+ return value
+}