firehol/netdata

View on GitHub
src/go/plugin/go.d/modules/nvidia_smi/charts.go

Summary

Maintainability
A
1 hr
Test Coverage
// SPDX-License-Identifier: GPL-3.0-or-later

package nvidia_smi

import (
    "fmt"
    "strings"

    "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module"
)

// Chart priorities for the nvidia_smi charts.
//
// The first constant is module.Priority plus iota (0); every following constant
// repeats that expression implicitly, so each one is exactly one greater than
// the constant above it. The declaration order in this block is therefore the
// relative ordering of the chart priorities: inserting, removing or moving a
// line shifts the values of all constants below it.
//
// NOTE(review): presumably a lower priority places a chart higher on the
// dashboard - confirm against the module package.
const (
    prioGPUPCIBandwidthUsage = module.Priority + iota
    prioGPUPCIBandwidthUtilization
    prioGPUFanSpeed
    prioGPUUtilization
    prioGPUMemUtilization
    prioGPUDecoderUtilization
    prioGPUEncoderUtilization
    prioGPUMIGModeStatus
    prioGPUMIGDevicesCount
    prioGPUFBMemoryUsage
    prioGPUMIGFBMemoryUsage
    prioGPUBAR1MemoryUsage
    prioGPUMIGBAR1MemoryUsage
    prioGPUTemperatureChart
    prioGPUVoltageChart
    prioGPUClockFreq
    prioGPUPowerDraw
    prioGPUPerformanceState
)

var (
    // gpuXMLCharts is the full set of per-GPU chart templates. Chart and
    // dimension IDs still contain a "%s" placeholder at this point;
    // addGPUXMLCharts copies the set, drops the charts whose metric is not
    // available for the GPU and fills the placeholders in.
    //
    // NOTE(review): the voltage template is listed before the temperature
    // template here, while the priority constants order temperature before
    // voltage. Presumably list order does not affect how charts are displayed
    // - confirm.
    gpuXMLCharts = module.Charts{
        gpuPCIBandwidthUsageChartTmpl.Copy(),
        gpuPCIBandwidthUtilizationChartTmpl.Copy(),
        gpuFanSpeedPercChartTmpl.Copy(),
        gpuUtilizationChartTmpl.Copy(),
        gpuMemUtilizationChartTmpl.Copy(),
        gpuDecoderUtilizationChartTmpl.Copy(),
        gpuEncoderUtilizationChartTmpl.Copy(),
        gpuMIGModeCurrentStatusChartTmpl.Copy(),
        gpuMIGDevicesCountChartTmpl.Copy(),
        gpuFrameBufferMemoryUsageChartTmpl.Copy(),
        gpuBAR1MemoryUsageChartTmpl.Copy(),
        gpuVoltageChartTmpl.Copy(),
        gpuTemperatureChartTmpl.Copy(),
        gpuClockFreqChartTmpl.Copy(),
        gpuPowerDrawChartTmpl.Copy(),
        gpuPerformanceStateChartTmpl.Copy(),
    }
    // migDeviceXMLCharts is the set of chart templates created for each MIG
    // device. IDs carry two "%s" placeholders (GPU instance ID, then GPU UUID)
    // that addMIGDeviceCharts fills in on a copy of this set.
    migDeviceXMLCharts = module.Charts{
        migDeviceFrameBufferMemoryUsageChartTmpl.Copy(),
        migDeviceBAR1MemoryUsageChartTmpl.Copy(),
    }
)

var (
    // gpuPCIBandwidthUsageChartTmpl is the per-GPU PCI Express bandwidth usage
    // chart (area chart, units B/s) with an "rx" and a "tx" dimension.
    // The "%s" in the chart ID and dimension IDs is replaced with the GPU UUID
    // by addGPUXMLCharts (lower-cased for the chart ID only).
    gpuPCIBandwidthUsageChartTmpl = module.Chart{
        ID:       "gpu_%s_pcie_bandwidth_usage",
        Title:    "PCI Express Bandwidth Usage",
        Units:    "B/s",
        Fam:      "pcie bandwidth",
        Ctx:      "nvidia_smi.gpu_pcie_bandwidth_usage",
        Type:     module.Area,
        Priority: prioGPUPCIBandwidthUsage,
        Dims: module.Dims{
            {ID: "gpu_%s_pcie_bandwidth_usage_rx", Name: "rx"},
            // Mul: -1 negates tx; presumably so it is drawn mirrored below rx - confirm.
            {ID: "gpu_%s_pcie_bandwidth_usage_tx", Name: "tx", Mul: -1},
        },
    }
    // gpuPCIBandwidthUtilizationChartTmpl is the per-GPU PCI Express bandwidth
    // utilization chart with an "rx" and a "tx" dimension.
    // The "%s" in the chart ID and dimension IDs is replaced with the GPU UUID
    // by addGPUXMLCharts (lower-cased for the chart ID only).
    //
    // NOTE(review): units are spelled "percentage" here while the other
    // utilization charts in this file use "%".
    gpuPCIBandwidthUtilizationChartTmpl = module.Chart{
        ID:       "gpu_%s_pcie_bandwidth_utilization",
        Title:    "PCI Express Bandwidth Utilization",
        Units:    "percentage",
        Fam:      "pcie bandwidth",
        Ctx:      "nvidia_smi.gpu_pcie_bandwidth_utilization",
        Priority: prioGPUPCIBandwidthUtilization,
        Dims: module.Dims{
            // Div: 100 scales the collected value down; presumably the collector
            // stores the percentage multiplied by 100 - confirm against the collector.
            {ID: "gpu_%s_pcie_bandwidth_utilization_rx", Name: "rx", Div: 100},
            {ID: "gpu_%s_pcie_bandwidth_utilization_tx", Name: "tx", Div: 100},
        },
    }
    // gpuFanSpeedPercChartTmpl is the per-GPU fan speed chart (units %), with a
    // single "fan_speed" dimension. addGPUXMLCharts drops it when gpu.FanSpeed
    // is not a valid value, and otherwise replaces "%s" with the GPU UUID.
    gpuFanSpeedPercChartTmpl = module.Chart{
        ID:       "gpu_%s_fan_speed_perc",
        Title:    "Fan speed",
        Units:    "%",
        Fam:      "fan speed",
        Ctx:      "nvidia_smi.gpu_fan_speed_perc",
        Priority: prioGPUFanSpeed,
        Dims: module.Dims{
            {ID: "gpu_%s_fan_speed_perc", Name: "fan_speed"},
        },
    }
    // gpuUtilizationChartTmpl is the per-GPU utilization chart (units %), with
    // a single "gpu" dimension. addGPUXMLCharts drops it when
    // gpu.Utilization.GpuUtil is not a valid value, and otherwise replaces "%s"
    // with the GPU UUID.
    gpuUtilizationChartTmpl = module.Chart{
        ID:       "gpu_%s_gpu_utilization",
        Title:    "GPU utilization",
        Units:    "%",
        Fam:      "gpu utilization",
        Ctx:      "nvidia_smi.gpu_utilization",
        Priority: prioGPUUtilization,
        Dims: module.Dims{
            {ID: "gpu_%s_gpu_utilization", Name: "gpu"},
        },
    }
    // gpuMemUtilizationChartTmpl is the per-GPU memory utilization chart
    // (units %), with a single "memory" dimension. addGPUXMLCharts drops it
    // when gpu.Utilization.MemoryUtil is not a valid value, and otherwise
    // replaces "%s" with the GPU UUID.
    //
    // NOTE(review): the chart ID says "memory_utilization" while the dimension
    // ID says "mem_utilization"; the dimension ID presumably has to match the
    // key produced by the collector - verify before renaming either.
    gpuMemUtilizationChartTmpl = module.Chart{
        ID:       "gpu_%s_memory_utilization",
        Title:    "Memory utilization",
        Units:    "%",
        Fam:      "mem utilization",
        Ctx:      "nvidia_smi.gpu_memory_utilization",
        Priority: prioGPUMemUtilization,
        Dims: module.Dims{
            {ID: "gpu_%s_mem_utilization", Name: "memory"},
        },
    }
    // gpuDecoderUtilizationChartTmpl is the per-GPU decoder utilization chart
    // (units %), with a single "decoder" dimension. addGPUXMLCharts drops it
    // when gpu.Utilization.DecoderUtil is not a valid value, and otherwise
    // replaces "%s" with the GPU UUID.
    gpuDecoderUtilizationChartTmpl = module.Chart{
        ID:       "gpu_%s_decoder_utilization",
        Title:    "Decoder utilization",
        Units:    "%",
        Fam:      "dec utilization",
        Ctx:      "nvidia_smi.gpu_decoder_utilization",
        Priority: prioGPUDecoderUtilization,
        Dims: module.Dims{
            {ID: "gpu_%s_decoder_utilization", Name: "decoder"},
        },
    }
    // gpuEncoderUtilizationChartTmpl is the per-GPU encoder utilization chart
    // (units %), with a single "encoder" dimension. addGPUXMLCharts drops it
    // when gpu.Utilization.EncoderUtil is not a valid value, and otherwise
    // replaces "%s" with the GPU UUID.
    gpuEncoderUtilizationChartTmpl = module.Chart{
        ID:       "gpu_%s_encoder_utilization",
        Title:    "Encoder utilization",
        Units:    "%",
        Fam:      "enc utilization",
        Ctx:      "nvidia_smi.gpu_encoder_utilization",
        Priority: prioGPUEncoderUtilization,
        Dims: module.Dims{
            {ID: "gpu_%s_encoder_utilization", Name: "encoder"},
        },
    }
    // gpuMIGModeCurrentStatusChartTmpl is the per-GPU chart of the current MIG
    // mode, with one dimension per state ("enabled", "disabled").
    // addGPUXMLCharts drops it when gpu.MIGMode.CurrentMIG is not a valid
    // value, and otherwise replaces "%s" with the GPU UUID.
    gpuMIGModeCurrentStatusChartTmpl = module.Chart{
        ID:       "gpu_%s_mig_mode_current_status",
        Title:    "MIG current mode",
        Units:    "status",
        Fam:      "mig",
        Ctx:      "nvidia_smi.gpu_mig_mode_current_status",
        Priority: prioGPUMIGModeStatus,
        Dims: module.Dims{
            {ID: "gpu_%s_mig_current_mode_enabled", Name: "enabled"},
            {ID: "gpu_%s_mig_current_mode_disabled", Name: "disabled"},
        },
    }
    // gpuMIGDevicesCountChartTmpl is the per-GPU chart of the number of MIG
    // devices, with a single "mig" dimension. addGPUXMLCharts drops it together
    // with the MIG mode chart when gpu.MIGMode.CurrentMIG is not a valid value,
    // and otherwise replaces "%s" with the GPU UUID.
    gpuMIGDevicesCountChartTmpl = module.Chart{
        ID:       "gpu_%s_mig_devices_count",
        Title:    "MIG devices",
        Units:    "devices",
        Fam:      "mig",
        Ctx:      "nvidia_smi.gpu_mig_devices_count",
        Priority: prioGPUMIGDevicesCount,
        Dims: module.Dims{
            {ID: "gpu_%s_mig_devices_count", Name: "mig"},
        },
    }
    // gpuFrameBufferMemoryUsageChartTmpl is the per-GPU frame buffer memory
    // usage chart (stacked, units B) with "free", "used" and "reserved"
    // dimensions. The "%s" in the chart ID and dimension IDs is replaced with
    // the GPU UUID by addGPUXMLCharts (lower-cased for the chart ID only).
    gpuFrameBufferMemoryUsageChartTmpl = module.Chart{
        ID:       "gpu_%s_frame_buffer_memory_usage",
        Title:    "Frame buffer memory usage",
        Units:    "B",
        Fam:      "fb mem usage",
        Ctx:      "nvidia_smi.gpu_frame_buffer_memory_usage",
        Type:     module.Stacked,
        Priority: prioGPUFBMemoryUsage,
        Dims: module.Dims{
            {ID: "gpu_%s_frame_buffer_memory_usage_free", Name: "free"},
            {ID: "gpu_%s_frame_buffer_memory_usage_used", Name: "used"},
            {ID: "gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"},
        },
    }
    // gpuBAR1MemoryUsageChartTmpl is the per-GPU BAR1 memory usage chart
    // (stacked, units B) with "free" and "used" dimensions. The "%s" in the
    // chart ID and dimension IDs is replaced with the GPU UUID by
    // addGPUXMLCharts (lower-cased for the chart ID only).
    gpuBAR1MemoryUsageChartTmpl = module.Chart{
        ID:       "gpu_%s_bar1_memory_usage",
        Title:    "BAR1 memory usage",
        Units:    "B",
        Fam:      "bar1 mem usage",
        Ctx:      "nvidia_smi.gpu_bar1_memory_usage",
        Type:     module.Stacked,
        Priority: prioGPUBAR1MemoryUsage,
        Dims: module.Dims{
            {ID: "gpu_%s_bar1_memory_usage_free", Name: "free"},
            {ID: "gpu_%s_bar1_memory_usage_used", Name: "used"},
        },
    }
    // gpuTemperatureChartTmpl is the per-GPU temperature chart (units Celsius)
    // with a single "temperature" dimension. The "%s" in the chart ID and
    // dimension ID is replaced with the GPU UUID by addGPUXMLCharts
    // (lower-cased for the chart ID only).
    gpuTemperatureChartTmpl = module.Chart{
        ID:       "gpu_%s_temperature",
        Title:    "Temperature",
        Units:    "Celsius",
        Fam:      "temperature",
        Ctx:      "nvidia_smi.gpu_temperature",
        Priority: prioGPUTemperatureChart,
        Dims: module.Dims{
            {ID: "gpu_%s_temperature", Name: "temperature"},
        },
    }
    // gpuVoltageChartTmpl is the per-GPU voltage chart (units V) with a single
    // "voltage" dimension. addGPUXMLCharts drops it when
    // gpu.Voltage.GraphicsVolt is not a valid value, and otherwise replaces
    // "%s" with the GPU UUID.
    gpuVoltageChartTmpl = module.Chart{
        ID:       "gpu_%s_voltage",
        Title:    "Voltage",
        Units:    "V",
        Fam:      "voltage",
        Ctx:      "nvidia_smi.gpu_voltage",
        Priority: prioGPUVoltageChart,
        Dims: module.Dims{
            {ID: "gpu_%s_voltage", Name: "voltage", Div: 1000}, // mV => V
        },
    }
    // gpuClockFreqChartTmpl is the per-GPU clock frequency chart (units MHz)
    // with one dimension per clock: "graphics", "video", "sm" and "mem".
    // The "%s" in the chart ID and dimension IDs is replaced with the GPU UUID
    // by addGPUXMLCharts (lower-cased for the chart ID only).
    gpuClockFreqChartTmpl = module.Chart{
        ID:       "gpu_%s_clock_freq",
        Title:    "Clock current frequency",
        Units:    "MHz",
        Fam:      "clocks",
        Ctx:      "nvidia_smi.gpu_clock_freq",
        Priority: prioGPUClockFreq,
        Dims: module.Dims{
            {ID: "gpu_%s_graphics_clock", Name: "graphics"},
            {ID: "gpu_%s_video_clock", Name: "video"},
            {ID: "gpu_%s_sm_clock", Name: "sm"},
            {ID: "gpu_%s_mem_clock", Name: "mem"},
        },
    }
    // gpuPowerDrawChartTmpl is the per-GPU power draw chart (units Watts) with
    // a single "power_draw" dimension. addGPUXMLCharts drops it when neither
    // gpu.PowerReadings nor gpu.GPUPowerReadings carries a valid PowerDraw, and
    // otherwise replaces "%s" with the GPU UUID.
    gpuPowerDrawChartTmpl = module.Chart{
        ID:       "gpu_%s_power_draw",
        Title:    "Power draw",
        Units:    "Watts",
        Fam:      "power draw",
        Ctx:      "nvidia_smi.gpu_power_draw",
        Priority: prioGPUPowerDraw,
        Dims: module.Dims{
            {ID: "gpu_%s_power_draw", Name: "power_draw"},
        },
    }
    // gpuPerformanceStateChartTmpl is the per-GPU performance state chart, with
    // one dimension for each of the sixteen states P0 through P15. The "%s" in
    // the chart ID and dimension IDs is replaced with the GPU UUID by
    // addGPUXMLCharts (lower-cased for the chart ID only).
    //
    // NOTE(review): presumably exactly one dimension is set at a time to mark
    // the active state - confirm against the collector.
    gpuPerformanceStateChartTmpl = module.Chart{
        ID:       "gpu_%s_performance_state",
        Title:    "Performance state",
        Units:    "state",
        Fam:      "performance state",
        Ctx:      "nvidia_smi.gpu_performance_state",
        Priority: prioGPUPerformanceState,
        Dims: module.Dims{
            {ID: "gpu_%s_performance_state_P0", Name: "P0"},
            {ID: "gpu_%s_performance_state_P1", Name: "P1"},
            {ID: "gpu_%s_performance_state_P2", Name: "P2"},
            {ID: "gpu_%s_performance_state_P3", Name: "P3"},
            {ID: "gpu_%s_performance_state_P4", Name: "P4"},
            {ID: "gpu_%s_performance_state_P5", Name: "P5"},
            {ID: "gpu_%s_performance_state_P6", Name: "P6"},
            {ID: "gpu_%s_performance_state_P7", Name: "P7"},
            {ID: "gpu_%s_performance_state_P8", Name: "P8"},
            {ID: "gpu_%s_performance_state_P9", Name: "P9"},
            {ID: "gpu_%s_performance_state_P10", Name: "P10"},
            {ID: "gpu_%s_performance_state_P11", Name: "P11"},
            {ID: "gpu_%s_performance_state_P12", Name: "P12"},
            {ID: "gpu_%s_performance_state_P13", Name: "P13"},
            {ID: "gpu_%s_performance_state_P14", Name: "P14"},
            {ID: "gpu_%s_performance_state_P15", Name: "P15"},
        },
    }
)

// addGPUXMLCharts builds the chart set for a single GPU from gpuXMLCharts and
// registers it with the module.
//
// Charts whose source value is not reported for this GPU (as judged by
// isValidValue) are removed first, while the chart IDs still equal the
// template IDs. After that the "%s" placeholder of every remaining chart ID is
// filled with the lower-cased GPU UUID, and the placeholder of every dimension
// ID with the UUID exactly as reported. A failure to add the charts is logged
// as a warning.
func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) {
    charts := gpuXMLCharts.Copy()

    // dropUnless removes the listed template charts when the metric they
    // depend on is not available. The result of Remove is ignored.
    dropUnless := func(available bool, tmplIDs ...string) {
        if available {
            return
        }
        for _, id := range tmplIDs {
            _ = charts.Remove(id)
        }
    }

    dropUnless(isValidValue(gpu.Utilization.GpuUtil), gpuUtilizationChartTmpl.ID)
    dropUnless(isValidValue(gpu.Utilization.MemoryUtil), gpuMemUtilizationChartTmpl.ID)
    dropUnless(isValidValue(gpu.Utilization.DecoderUtil), gpuDecoderUtilizationChartTmpl.ID)
    dropUnless(isValidValue(gpu.Utilization.EncoderUtil), gpuEncoderUtilizationChartTmpl.ID)
    dropUnless(isValidValue(gpu.MIGMode.CurrentMIG),
        gpuMIGModeCurrentStatusChartTmpl.ID,
        gpuMIGDevicesCountChartTmpl.ID,
    )
    dropUnless(isValidValue(gpu.FanSpeed), gpuFanSpeedPercChartTmpl.ID)

    // Power draw is accepted from either readings section; the chart is kept
    // as soon as one of them is present and holds a valid value.
    hasPowerDraw := (gpu.PowerReadings != nil && isValidValue(gpu.PowerReadings.PowerDraw)) ||
        (gpu.GPUPowerReadings != nil && isValidValue(gpu.GPUPowerReadings.PowerDraw))
    dropUnless(hasPowerDraw, gpuPowerDrawChartTmpl.ID)

    dropUnless(isValidValue(gpu.Voltage.GraphicsVolt), gpuVoltageChartTmpl.ID)

    // Chart IDs use the lower-cased UUID; dimension IDs keep the original case.
    chartUUID := strings.ToLower(gpu.UUID)

    for _, chart := range *charts {
        chart.ID = fmt.Sprintf(chart.ID, chartUUID)
        chart.Labels = []module.Label{
            // csv output has no 'product_brand'
            {Key: "uuid", Value: gpu.UUID},
            {Key: "product_name", Value: gpu.ProductName},
        }
        for _, dim := range chart.Dims {
            dim.ID = fmt.Sprintf(dim.ID, gpu.UUID)
        }
    }

    err := nv.Charts().Add(*charts...)
    if err != nil {
        nv.Warning(err)
    }
}

var (
    // migDeviceFrameBufferMemoryUsageChartTmpl is the per-MIG-device frame
    // buffer memory usage chart (stacked, units B) with "free", "used" and
    // "reserved" dimensions. IDs carry two "%s" placeholders that
    // addMIGDeviceCharts fills with the GPU instance ID and the GPU UUID, in
    // that order (both lower-cased for the chart ID only).
    migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{
        ID:       "mig_instance_%s_gpu_%s_frame_buffer_memory_usage",
        Title:    "MIG Frame buffer memory usage",
        Units:    "B",
        Fam:      "fb mem usage",
        Ctx:      "nvidia_smi.gpu_mig_frame_buffer_memory_usage",
        Type:     module.Stacked,
        Priority: prioGPUMIGFBMemoryUsage,
        Dims: module.Dims{
            {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_free", Name: "free"},
            {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_used", Name: "used"},
            {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"},
        },
    }
    // migDeviceBAR1MemoryUsageChartTmpl is the per-MIG-device BAR1 memory usage
    // chart (stacked, units B) with "free" and "used" dimensions. IDs carry two
    // "%s" placeholders that addMIGDeviceCharts fills with the GPU instance ID
    // and the GPU UUID, in that order (both lower-cased for the chart ID only).
    migDeviceBAR1MemoryUsageChartTmpl = module.Chart{
        ID:       "mig_instance_%s_gpu_%s_bar1_memory_usage",
        Title:    "MIG BAR1 memory usage",
        Units:    "B",
        Fam:      "bar1 mem usage",
        Ctx:      "nvidia_smi.gpu_mig_bar1_memory_usage",
        Type:     module.Stacked,
        Priority: prioGPUMIGBAR1MemoryUsage,
        Dims: module.Dims{
            {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_free", Name: "free"},
            {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_used", Name: "used"},
        },
    }
)

// addMIGDeviceCharts builds the chart set for one MIG device of the given GPU
// from migDeviceXMLCharts and registers it with the module.
//
// The two "%s" placeholders in every chart ID are filled with the lower-cased
// GPU instance ID and the lower-cased GPU UUID; the placeholders in dimension
// IDs receive the same two values in their original case. Each chart is
// labelled with the GPU UUID, the GPU product name and the GPU instance ID.
// A failure to add the charts is logged as a warning.
func (nv *NvidiaSmi) addMIGDeviceCharts(gpu gpuInfo, mig gpuMIGDeviceInfo) {
    charts := migDeviceXMLCharts.Copy()

    // Lower-cased variants are used for chart IDs only.
    chartInstanceID := strings.ToLower(mig.GPUInstanceID)
    chartUUID := strings.ToLower(gpu.UUID)

    for _, chart := range *charts {
        chart.ID = fmt.Sprintf(chart.ID, chartInstanceID, chartUUID)
        chart.Labels = []module.Label{
            {Key: "gpu_uuid", Value: gpu.UUID},
            {Key: "gpu_product_name", Value: gpu.ProductName},
            {Key: "gpu_instance_id", Value: mig.GPUInstanceID},
        }
        for _, dim := range chart.Dims {
            dim.ID = fmt.Sprintf(dim.ID, mig.GPUInstanceID, gpu.UUID)
        }
    }

    err := nv.Charts().Add(*charts...)
    if err != nil {
        nv.Warning(err)
    }
}

// removeCharts flags every registered chart whose ID starts with prefix by
// calling MarkRemove and MarkNotCreated on it.
//
// The prefix is lower-cased before the comparison, matching the lower-cased
// identifiers used when the chart IDs were generated.
func (nv *NvidiaSmi) removeCharts(prefix string) {
    lowered := strings.ToLower(prefix)

    for _, chart := range *nv.Charts() {
        if !strings.HasPrefix(chart.ID, lowered) {
            continue
        }
        chart.MarkRemove()
        chart.MarkNotCreated()
    }
}