From c92b168ae0f2e57090d4f0485f750ae472913db2 Mon Sep 17 00:00:00 2001 From: Alexander Zhebrak Date: Fri, 4 Aug 2017 13:16:44 +0300 Subject: [PATCH] initial --- .gitignore | 2 ++ README.md | 57 ++++++++++++++++++++++++++++++++++++ nvidia_smi_exporter.go | 66 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 nvidia_smi_exporter.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6d466bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +nvidia_smi_exporter +sample.csv diff --git a/README.md b/README.md new file mode 100644 index 0000000..1dee265 --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +# nvidia_smi_exporter + +nvidia-smi metrics exporter for Prometheus + +## Build +``` +> go build -v nvidia_smi_exporter +``` + +## Run +``` +> ./nvidia_smi_exporter [port] +``` +The default port is 9101. + + +### localhost:9101/metrics +``` +temperature_gpu{gpu="TITAN X (Pascal)[0]"} 41 +utilization_gpu{gpu="TITAN X (Pascal)[0]"} 0 +utilization_memory{gpu="TITAN X (Pascal)[0]"} 0 +memory_total{gpu="TITAN X (Pascal)[0]"} 12189 +memory_free{gpu="TITAN X (Pascal)[0]"} 12189 +memory_used{gpu="TITAN X (Pascal)[0]"} 0 +temperature_gpu{gpu="TITAN X (Pascal)[1]"} 78 +utilization_gpu{gpu="TITAN X (Pascal)[1]"} 95 +utilization_memory{gpu="TITAN X (Pascal)[1]"} 59 +memory_total{gpu="TITAN X (Pascal)[1]"} 12189 +memory_free{gpu="TITAN X (Pascal)[1]"} 1738 +memory_used{gpu="TITAN X (Pascal)[1]"} 10451 +temperature_gpu{gpu="TITAN X (Pascal)[2]"} 83 +utilization_gpu{gpu="TITAN X (Pascal)[2]"} 99 +utilization_memory{gpu="TITAN X (Pascal)[2]"} 82 +memory_total{gpu="TITAN X (Pascal)[2]"} 12189 +memory_free{gpu="TITAN X (Pascal)[2]"} 190 +memory_used{gpu="TITAN X (Pascal)[2]"} 11999 +temperature_gpu{gpu="TITAN X (Pascal)[3]"} 84 +utilization_gpu{gpu="TITAN X (Pascal)[3]"} 97 +utilization_memory{gpu="TITAN X (Pascal)[3]"} 76 +memory_total{gpu="TITAN X (Pascal)[3]"} 12189 
// nvidia-smi metrics exporter for Prometheus.
//
// Every scrape runs
//
//	nvidia-smi --query-gpu=name,index,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv,noheader,nounits
//
// and turns each CSV row into one sample per metric, for example:
//
//	memory_free{gpu="TITAN X (Pascal)[3]"} 536
//	memory_used{gpu="TITAN X (Pascal)[3]"} 11653
//
// Prometheus example config:
//
//	- job_name: "gpu_exporter"
//	  static_configs:
//	  - targets: ['localhost:9101']

// queryFields lists the nvidia-smi columns in the order they are requested:
// name, index, temperature.gpu, utilization.gpu,
// utilization.memory, memory.total, memory.free, memory.used
const queryFields = "name,index,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

// metricNames holds the exported metric name for every column that follows
// name and index in queryFields, in the same order. The dots of the
// nvidia-smi field names are already replaced by underscores here, so the
// rendered output never needs a global search-and-replace that would also
// rewrite dots inside GPU names or sample values.
var metricNames = []string{
	"temperature_gpu", "utilization_gpu",
	"utilization_memory", "memory_total", "memory_free", "memory_used",
}

// labelEscaper escapes the characters that are special inside a quoted label
// value of the Prometheus text exposition format.
var labelEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// renderMetrics converts the raw CSV output of nvidia-smi into the response
// body served to Prometheus.
//
// Each row must contain exactly the columns of queryFields. A row with any
// other number of fields (for instance a diagnostic line printed instead of
// data) yields an error rather than an index-out-of-range panic.
func renderMetrics(out []byte) (string, error) {
	csvReader := csv.NewReader(bytes.NewReader(out))
	csvReader.TrimLeadingSpace = true
	records, err := csvReader.ReadAll()
	if err != nil {
		return "", fmt.Errorf("parsing nvidia-smi output: %s", err)
	}

	want := 2 + len(metricNames)
	var buf bytes.Buffer
	for _, row := range records {
		if len(row) != want {
			return "", fmt.Errorf("unexpected nvidia-smi row %q: got %d fields, want %d", row, len(row), want)
		}
		// The label combines the product name and the device index so that
		// identical cards in one host stay distinguishable.
		gpu := labelEscaper.Replace(fmt.Sprintf("%s[%s]", row[0], row[1]))
		for idx, value := range row[2:] {
			fmt.Fprintf(&buf, "%s{gpu=\"%s\"} %s\n", metricNames[idx], gpu, value)
		}
	}
	return buf.String(), nil
}

// metrics is the HTTP handler behind the scrape endpoint. It runs nvidia-smi,
// renders the result with renderMetrics and writes it to response. Any failure
// is logged and reported to the client as a 500 so that a broken scrape is
// visible instead of looking like an empty but successful one.
func metrics(response http.ResponseWriter, request *http.Request) {
	// Tie the child process to the request: if the client goes away the
	// request context is canceled and nvidia-smi is killed instead of
	// lingering.
	out, err := exec.CommandContext(
		request.Context(),
		"nvidia-smi",
		"--query-gpu="+queryFields,
		"--format=csv,noheader,nounits").Output()
	if err != nil {
		log.Printf("running nvidia-smi: %s", err)
		http.Error(response, "running nvidia-smi: "+err.Error(), http.StatusInternalServerError)
		return
	}

	body, err := renderMetrics(out)
	if err != nil {
		log.Printf("%s", err)
		http.Error(response, err.Error(), http.StatusInternalServerError)
		return
	}

	response.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
	// Fprint, not Fprintf: the body is data and must never be interpreted as
	// a format string.
	fmt.Fprint(response, body)
}

// main starts the exporter. The listening port defaults to 9101 and can be
// overridden with the first command-line argument.
func main() {
	port := ":9101"
	if len(os.Args) > 1 {
		port = ":" + os.Args[1]
	}

	// Register the exact path as well as the original subtree pattern so
	// that a scrape of /metrics is answered directly rather than through a
	// redirect to /metrics/.
	http.HandleFunc("/metrics", metrics)
	http.HandleFunc("/metrics/", metrics)
	if err := http.ListenAndServe(port, nil); err != nil {
		log.Fatal("ListenAndServe: ", err)
	}
}