diff --git a/nvidia_smi_exporter.go b/nvidia_smi_exporter.go index d78f35b..2899ad5 100644 --- a/nvidia_smi_exporter.go +++ b/nvidia_smi_exporter.go @@ -1,66 +1,65 @@ package main import ( - "bytes" - "encoding/csv" - "fmt" - "net/http" - "log" - "os" - "os/exec" - "strings" + "bytes" + "encoding/csv" + "fmt" + "log" + "net/http" + "os" + "os/exec" + "strings" ) - // name, index, temperature.gpu, utilization.gpu, // utilization.memory, memory.total, memory.free, memory.used func metrics(response http.ResponseWriter, request *http.Request) { - out, err := exec.Command( - "nvidia-smi", - "--query-gpu=name,index,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used", - "--format=csv,noheader,nounits").Output() + out, err := exec.Command( + "nvidia-smi", + "--query-gpu=name,index,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,ecc.errors.uncorrected.aggregate.total,ecc.errors.uncorrected.volatile.total", + "--format=csv,noheader,nounits").Output() - if err != nil { - fmt.Printf("%s\n", err) - return - } + if err != nil { + fmt.Printf("%s\n", err) + return + } - csvReader := csv.NewReader(bytes.NewReader(out)) - csvReader.TrimLeadingSpace = true - records, err := csvReader.ReadAll() + csvReader := csv.NewReader(bytes.NewReader(out)) + csvReader.TrimLeadingSpace = true + records, err := csvReader.ReadAll() - if err != nil { - fmt.Printf("%s\n", err) - return - } + if err != nil { + fmt.Printf("%s\n", err) + return + } - metricList := []string { - "temperature.gpu", "utilization.gpu", - "utilization.memory", "memory.total", "memory.free", "memory.used"} + metricList := []string{ + "temperature.gpu", "utilization.gpu", + "utilization.memory", "memory.total", "memory.free", "memory.used", "eccerrorsaggregate.total", "eccuncorrectedvolatile.total"} - result := "" - for _, row := range records { - name := fmt.Sprintf("%s[%s]", row[0], row[1]) - for idx, value := range row[2:] { - result = fmt.Sprintf( - "%s%s{gpu=\"%s\"} %s\n", result, - metricList[idx], name, value) - } - } + result := "" + for _, row := range records { + name := fmt.Sprintf("%s[%s]", row[0], row[1]) + for idx, value := range row[2:] { + result = fmt.Sprintf( + "%s%s{gpu=\"%s\"} %s\n", result, + metricList[idx], name, value) + } + } - fmt.Fprintf(response, strings.Replace(result, ".", "_", -1)) + fmt.Fprintf(response, strings.Replace(result, ".", "_", -1)) } func main() { - addr := ":9101" - if len(os.Args) > 1 { - addr = ":" + os.Args[1] - } + addr := ":9101" + if len(os.Args) > 1 { + addr = ":" + os.Args[1] + } - http.HandleFunc("/metrics/", metrics) - err := http.ListenAndServe(addr, nil) - if err != nil { - log.Fatal("ListenAndServe: ", err) - } + http.HandleFunc("/metrics/", metrics) + err := http.ListenAndServe(addr, nil) + if err != nil { + log.Fatal("ListenAndServe: ", err) + } }