src/go/plugin/go.d/modules/nvidia_smi/collect.go
// SPDX-License-Identifier: GPL-3.0-or-later

package nvidia_smi

import (
	"encoding/xml"
	"errors"
	"fmt"
	"strconv"
	"strings"
)

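// collect runs a single collection cycle: it queries nvidia-smi for GPU
// information and returns the gathered metrics keyed by dimension ID.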
func (nv *NvidiaSmi) collect() (map[string]int64, error) {
	if nv.exec == nil {
		return nil, errors.New("nvidia-smi exec is not initialized")
	}

	mx := make(map[string]int64)

	if err := nv.collectGPUInfo(mx); err != nil {
		return nil, err
	}

	return mx, nil
}

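// collectGPUInfo queries nvidia-smi for GPU info in XML format, parses the
// response, and populates mx. It also tracks which GPUs and MIG devices were
// seen this cycle so that charts are added for new devices and removed for
// devices that have disappeared.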
func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error {
	bs, err := nv.exec.queryGPUInfo()
	if err != nil {
		return fmt.Errorf("error on querying XML GPU info: %v", err)
	}

	info := &gpusInfo{}
	if err := xml.Unmarshal(bs, info); err != nil {
		return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err)
	}

	seenGPU := make(map[string]bool)
	seenMIG := make(map[string]bool)

	for _, gpu := range info.GPUs {
		if !isValidValue(gpu.UUID) {
			continue
		}

		px := "gpu_" + gpu.UUID + "_"

		seenGPU[px] = true

		if !nv.gpus[px] {
			nv.gpus[px] = true
			nv.addGPUXMLCharts(gpu)
		}

		addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
		addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
		if maxBw := calcMaxPCIEBandwidth(gpu); maxBw > 0 {
			rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
			tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
			// utilization is stored as percentage * 100 to keep two decimal places of precision
			mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / maxBw) * 100)
			mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / maxBw) * 100)
		}

addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0)
addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0)
addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0)
addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024) // MiB => bytes
addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024) // MiB => bytes
addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024) // MiB => bytes
addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024) // MiB => bytes
addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0)
addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0)
addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0)
addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0)
addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0)
if gpu.PowerReadings != nil {
addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0)
} else if gpu.GPUPowerReadings != nil {
addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0)
}
addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0)
for i := 0; i < 16; i++ {
s := "P" + strconv.Itoa(i)
mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s)
}
if isValidValue(gpu.MIGMode.CurrentMIG) {
mode := strings.ToLower(gpu.MIGMode.CurrentMIG)
mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled")
mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled")
mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice))
}
		for _, mig := range gpu.MIGDevices.MIGDevice {
			if !isValidValue(mig.GPUInstanceID) {
				continue
			}

			// shadows the outer px: the MIG prefix is built on top of the GPU prefix
			px := "mig_instance_" + mig.GPUInstanceID + "_" + px

			seenMIG[px] = true

			if !nv.migs[px] {
				nv.migs[px] = true
				nv.addMIGDeviceCharts(gpu, mig)
			}

			addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0)
			addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024)         // MiB => bytes
			addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024)         // MiB => bytes
			addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
			addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024)               // MiB => bytes
			addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024)               // MiB => bytes
		}
	}

	// remove charts for GPUs and MIG devices that are no longer reported
	for px := range nv.gpus {
		if !seenGPU[px] {
			delete(nv.gpus, px)
			nv.removeCharts(px)
		}
	}
	for px := range nv.migs {
		if !seenMIG[px] {
			delete(nv.migs, px)
			nv.removeCharts(px)
		}
	}

	return nil
}

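// calcMaxPCIEBandwidth returns the theoretical maximum PCIe bandwidth of the
// GPU link in bytes/s, derived from the maximum link generation and width
// reported by nvidia-smi. It returns 0 if either value is unavailable.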
func calcMaxPCIEBandwidth(gpu gpuInfo) float64 {
	gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen
	width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x")

	if !isValidValue(gen) || !isValidValue(width) {
		return 0
	}

	// https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
	var speed, enc float64
	switch gen {
	case "1":
		speed, enc = 2.5, 1.0/5.0 // Gen1/Gen2 use 8b/10b encoding (20% overhead)
	case "2":
		speed, enc = 5, 1.0/5.0
	case "3":
		speed, enc = 8, 2.0/130.0 // Gen3+ use 128b/130b encoding (~1.5% overhead)
	case "4":
		speed, enc = 16, 2.0/130.0
	case "5":
		speed, enc = 32, 2.0/130.0
	default:
		return 0
	}

	// Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s
	return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes/s
}

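// addMetric parses an nvidia-smi value (optionally followed by a unit,
// e.g. "1024 KiB"), scales it by mul when mul > 0, and stores it in mx.
// Values reported as "N/A" are skipped.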
func addMetric(mx map[string]int64, key, value string, mul int) {
	if !isValidValue(value) {
		return
	}

	value = removeUnits(value)

	v, err := strconv.ParseFloat(value, 64)
	if err != nil {
		return
	}

	if mul > 0 {
		v *= float64(mul)
	}

	mx[key] = int64(v)
}

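// isValidValue reports whether v holds an actual value rather than one of
// nvidia-smi's "not available" placeholders.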
func isValidValue(v string) bool {
	return v != "" && v != "N/A" && v != "[N/A]"
}

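// parseFloat parses an nvidia-smi value after stripping its unit,
// returning 0 on failure.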
func parseFloat(s string) float64 {
	v, _ := strconv.ParseFloat(removeUnits(s), 64)
	return v
}

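// removeUnits strips a trailing unit from an nvidia-smi value
// (e.g. "550 MHz" => "550").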
func removeUnits(s string) string {
	if i := strings.IndexByte(s, ' '); i != -1 {
		s = s[:i]
	}
	return s
}

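// boolToInt converts a bool to the 1/0 value used for chart dimensions.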
func boolToInt(v bool) int64 {
	if v {
		return 1
	}
	return 0
}