netdata/netdata

View on GitHub
src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go

Summary

Maintainability
B
4 hrs
Test Coverage
// SPDX-License-Identifier: GPL-3.0-or-later

package nvidia_smi

import (
    "encoding/xml"
    "fmt"
    "strconv"
    "strings"
)

// collectGPUInfoXML fetches the XML GPU report through nv.exec, decodes it and
// writes per-GPU and per-MIG-instance metrics into mx.
//
// GPUs without a valid UUID and MIG devices without a valid GPU instance ID are
// skipped. Charts are added the first time a GPU/MIG prefix is seen and removed
// once a previously known prefix is absent from the current report.
//
// It returns an error (wrapping the underlying cause) when the query fails or
// the response cannot be unmarshaled.
func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
    bs, err := nv.exec.queryGPUInfoXML()
    if err != nil {
        return fmt.Errorf("error on querying XML GPU info: %w", err)
    }

    info := &xmlInfo{}
    if err := xml.Unmarshal(bs, info); err != nil {
        return fmt.Errorf("error on unmarshaling XML GPU info response: %w", err)
    }

    // Prefixes observed in this report; used below to drop stale charts.
    seenGPU := make(map[string]bool)
    seenMIG := make(map[string]bool)

    for _, gpu := range info.GPUs {
        if !isValidValue(gpu.UUID) {
            continue
        }

        // Every metric key of this GPU starts with this prefix.
        px := "gpu_" + gpu.UUID + "_"

        seenGPU[px] = true

        if !nv.gpus[px] {
            nv.gpus[px] = true
            nv.addGPUXMLCharts(gpu)
        }

        addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
        addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
        // Utilization is only reported when the link's maximum bandwidth is known.
        if maxBW := calcMaxPCIEBandwidth(gpu); maxBW > 0 {
            rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
            tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
            // Percentage of the maximum, scaled by 100 to keep two decimals in an int64.
            mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / maxBW) * 100)
            mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / maxBW) * 100)
        }
        addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
        addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
        addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0)
        addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0)
        addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0)
        addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024)         // MiB => bytes
        addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024)         // MiB => bytes
        addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
        addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024)               // MiB => bytes
        addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024)               // MiB => bytes
        addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0)
        addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0)
        addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0)
        addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0)
        addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0)
        // The power section is decoded from either <power_readings> or
        // <gpu_power_readings>; the former takes precedence when both exist.
        if gpu.PowerReadings != nil {
            addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0)
        } else if gpu.GPUPowerReadings != nil {
            addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0)
        }
        addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0)
        // One 0/1 dimension per performance state P0..P15.
        for i := 0; i < 16; i++ {
            s := "P" + strconv.Itoa(i)
            mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s)
        }
        if isValidValue(gpu.MIGMode.CurrentMIG) {
            mode := strings.ToLower(gpu.MIGMode.CurrentMIG)
            mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled")
            mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled")
            mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice))
        }

        for _, mig := range gpu.MIGDevices.MIGDevice {
            if !isValidValue(mig.GPUInstanceID) {
                continue
            }

            // MIG keys are "mig_instance_<id>_" followed by the parent GPU prefix.
            migPx := "mig_instance_" + mig.GPUInstanceID + "_" + px

            seenMIG[migPx] = true

            if !nv.migs[migPx] {
                nv.migs[migPx] = true
                nv.addMIGDeviceXMLCharts(gpu, mig)
            }

            addMetric(mx, migPx+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0)
            addMetric(mx, migPx+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024)         // MiB => bytes
            addMetric(mx, migPx+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024)         // MiB => bytes
            addMetric(mx, migPx+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
            addMetric(mx, migPx+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024)               // MiB => bytes
            addMetric(mx, migPx+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024)               // MiB => bytes
        }
    }

    // Forget GPUs that disappeared from the report and remove their charts.
    // Deleting map entries while ranging over the map is permitted in Go.
    for px := range nv.gpus {
        if !seenGPU[px] {
            delete(nv.gpus, px)
            nv.removeCharts(px)
        }
    }

    // Same cleanup for MIG instances.
    for px := range nv.migs {
        if !seenMIG[px] {
            delete(nv.migs, px)
            nv.removeCharts(px)
        }
    }

    return nil
}

// calcMaxPCIEBandwidth estimates the maximum PCIe bandwidth of the GPU link in
// bytes per second from its reported maximum link generation and width.
//
// It returns 0 when either value is not valid or the generation is not one of
// "1".."5". The result may be non-positive if the width does not parse to a
// usable number, so callers should check that it is > 0.
func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 {
    link := gpu.PCI.PCIGPULinkInfo
    maxGen := link.PCIEGen.MaxLinkGen
    // The width is reported with a trailing "x" (e.g. "16x"); strip it.
    maxWidth := strings.TrimSuffix(link.LinkWidths.MaxLinkWidth, "x")

    if !(isValidValue(maxGen) && isValidValue(maxWidth)) {
        return 0
    }

    // Fraction of the raw rate lost to line encoding, per
    // https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
    const (
        overheadGen1To2 = 1.0 / 5.0
        overheadGen3To5 = 2.0 / 130.0
    )

    // rate is the per-lane speed used by the formula below; overhead is the
    // encoding loss for that generation.
    var rate, overhead float64
    switch maxGen {
    case "1":
        rate, overhead = 2.5, overheadGen1To2
    case "2":
        rate, overhead = 5, overheadGen1To2
    case "3":
        rate, overhead = 8, overheadGen3To5
    case "4":
        rate, overhead = 16, overheadGen3To5
    case "5":
        rate, overhead = 32, overheadGen3To5
    default:
        return 0
    }

    // Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s
    return (rate*parseFloat(maxWidth)*(1-overhead) - 1) * 1e9 / 8 // Gb/s => bytes
}

// XML decoding targets for the GPU report. All leaf values are kept as the raw
// strings found in the document; conversion to numbers happens in the collector.
type (
    // xmlInfo is the document root; it holds one entry per <gpu> element.
    xmlInfo struct {
        GPUs []xmlGPUInfo `xml:"gpu"`
    }
    // xmlGPUInfo mirrors a single <gpu> element.
    xmlGPUInfo struct {
        ID                  string `xml:"id,attr"` // "id" attribute of <gpu>
        ProductName         string `xml:"product_name"`
        ProductBrand        string `xml:"product_brand"`
        ProductArchitecture string `xml:"product_architecture"`
        UUID                string `xml:"uuid"`
        FanSpeed            string `xml:"fan_speed"`
        PerformanceState    string `xml:"performance_state"`
        MIGMode             struct {
            CurrentMIG string `xml:"current_mig"`
        } `xml:"mig_mode"`
        MIGDevices struct {
            MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"`
        } `xml:"mig_devices"`
        PCI struct {
            TxUtil         string `xml:"tx_util"`
            RxUtil         string `xml:"rx_util"`
            PCIGPULinkInfo struct {
                PCIEGen struct {
                    MaxLinkGen string `xml:"max_link_gen"`
                } `xml:"pcie_gen"`
                LinkWidths struct {
                    MaxLinkWidth string `xml:"max_link_width"`
                } `xml:"link_widths"`
            } `xml:"pci_gpu_link_info"`
        } `xml:"pci"`
        Utilization struct {
            GpuUtil     string `xml:"gpu_util"`
            MemoryUtil  string `xml:"memory_util"`
            EncoderUtil string `xml:"encoder_util"`
            DecoderUtil string `xml:"decoder_util"`
        } `xml:"utilization"`
        FBMemoryUsage struct {
            Total    string `xml:"total"`
            Reserved string `xml:"reserved"`
            Used     string `xml:"used"`
            Free     string `xml:"free"`
        } `xml:"fb_memory_usage"`
        Bar1MemoryUsage struct {
            Total string `xml:"total"`
            Used  string `xml:"used"`
            Free  string `xml:"free"`
        } `xml:"bar1_memory_usage"`
        Temperature struct {
            GpuTemp                string `xml:"gpu_temp"`
            GpuTempMaxThreshold    string `xml:"gpu_temp_max_threshold"`
            GpuTempSlowThreshold   string `xml:"gpu_temp_slow_threshold"`
            GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"`
            GpuTargetTemperature   string `xml:"gpu_target_temperature"`
            MemoryTemp             string `xml:"memory_temp"`
            GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"`
        } `xml:"temperature"`
        Clocks struct {
            GraphicsClock string `xml:"graphics_clock"`
            SmClock       string `xml:"sm_clock"`
            MemClock      string `xml:"mem_clock"`
            VideoClock    string `xml:"video_clock"`
        } `xml:"clocks"`
        // The power section is accepted under either element name; a pointer is
        // used so that an absent element stays nil.
        PowerReadings    *xmlPowerReadings `xml:"power_readings"`
        GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"`
        Voltage          struct {
            GraphicsVolt string `xml:"graphics_volt"`
        } `xml:"voltage"`
        Processes struct {
            // One entry per <process_info> child of <processes>.
            ProcessInfo []struct {
                PID         string `xml:"pid"`
                ProcessName string `xml:"process_name"`
                UsedMemory  string `xml:"used_memory"`
            } `xml:"process_info"`
        } `xml:"processes"`
    }

    // xmlPowerReadings is the shared shape of the <power_readings> and
    // <gpu_power_readings> elements. Only the power draw is decoded; the
    // remaining known fields are kept commented out for reference.
    xmlPowerReadings struct {
        //PowerState         string `xml:"power_state"`
        //PowerManagement    string `xml:"power_management"`
        PowerDraw string `xml:"power_draw"`
        //PowerLimit         string `xml:"power_limit"`
        //DefaultPowerLimit  string `xml:"default_power_limit"`
        //EnforcedPowerLimit string `xml:"enforced_power_limit"`
        //MinPowerLimit      string `xml:"min_power_limit"`
        //MaxPowerLimit      string `xml:"max_power_limit"`
    }

    // xmlMIGDeviceInfo mirrors a single <mig_device> element nested under a
    // GPU's <mig_devices>.
    xmlMIGDeviceInfo struct {
        Index             string `xml:"index"`
        GPUInstanceID     string `xml:"gpu_instance_id"`
        ComputeInstanceID string `xml:"compute_instance_id"`
        DeviceAttributes  struct {
            Shared struct {
                MultiprocessorCount string `xml:"multiprocessor_count"`
                CopyEngineCount     string `xml:"copy_engine_count"`
                EncoderCount        string `xml:"encoder_count"`
                DecoderCount        string `xml:"decoder_count"`
                OFACount            string `xml:"ofa_count"`
                JPGCount            string `xml:"jpg_count"`
            } `xml:"shared"`
        } `xml:"device_attributes"`
        ECCErrorCount struct {
            VolatileCount struct {
                SRAMUncorrectable string `xml:"sram_uncorrectable"`
            } `xml:"volatile_count"`
        } `xml:"ecc_error_count"`
        FBMemoryUsage struct {
            Free     string `xml:"free"`
            Used     string `xml:"used"`
            Reserved string `xml:"reserved"`
        } `xml:"fb_memory_usage"`
        BAR1MemoryUsage struct {
            Free string `xml:"free"`
            Used string `xml:"used"`
        } `xml:"bar1_memory_usage"`
    }
)