github.com/netdata/go.d.plugin@v0.58.1/modules/nvidia_smi/charts.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package nvidia_smi
     4  
     5  import (
     6  	"fmt"
     7  	"strings"
     8  
     9  	"github.com/netdata/go.d.plugin/agent/module"
    10  )
    11  
    12  const (
    13  	prioGPUPCIBandwidthUsage = module.Priority + iota
    14  	prioGPUPCIBandwidthUtilization
    15  	prioGPUFanSpeed
    16  	prioGPUUtilization
    17  	prioGPUMemUtilization
    18  	prioGPUDecoderUtilization
    19  	prioGPUEncoderUtilization
    20  	prioGPUMIGModeStatus
    21  	prioGPUMIGDevicesCount
    22  	prioGPUFBMemoryUsage
    23  	prioGPUMIGFBMemoryUsage
    24  	prioGPUBAR1MemoryUsage
    25  	prioGPUMIGBAR1MemoryUsage
    26  	prioGPUTemperatureChart
    27  	prioGPUVoltageChart
    28  	prioGPUClockFreq
    29  	prioGPUPowerDraw
    30  	prioGPUPerformanceState
    31  )
    32  
    33  var (
    34  	gpuXMLCharts = module.Charts{
    35  		gpuPCIBandwidthUsageChartTmpl.Copy(),
    36  		gpuPCIBandwidthUtilizationChartTmpl.Copy(),
    37  		gpuFanSpeedPercChartTmpl.Copy(),
    38  		gpuUtilizationChartTmpl.Copy(),
    39  		gpuMemUtilizationChartTmpl.Copy(),
    40  		gpuDecoderUtilizationChartTmpl.Copy(),
    41  		gpuEncoderUtilizationChartTmpl.Copy(),
    42  		gpuMIGModeCurrentStatusChartTmpl.Copy(),
    43  		gpuMIGDevicesCountChartTmpl.Copy(),
    44  		gpuFrameBufferMemoryUsageChartTmpl.Copy(),
    45  		gpuBAR1MemoryUsageChartTmpl.Copy(),
    46  		gpuVoltageChartTmpl.Copy(),
    47  		gpuTemperatureChartTmpl.Copy(),
    48  		gpuClockFreqChartTmpl.Copy(),
    49  		gpuPowerDrawChartTmpl.Copy(),
    50  		gpuPerformanceStateChartTmpl.Copy(),
    51  	}
    52  	migDeviceXMLCharts = module.Charts{
    53  		migDeviceFrameBufferMemoryUsageChartTmpl.Copy(),
    54  		migDeviceBAR1MemoryUsageChartTmpl.Copy(),
    55  	}
    56  	gpuCSVCharts = module.Charts{
    57  		gpuFanSpeedPercChartTmpl.Copy(),
    58  		gpuUtilizationChartTmpl.Copy(),
    59  		gpuMemUtilizationChartTmpl.Copy(),
    60  		gpuFrameBufferMemoryUsageChartTmpl.Copy(),
    61  		gpuTemperatureChartTmpl.Copy(),
    62  		gpuClockFreqChartTmpl.Copy(),
    63  		gpuPowerDrawChartTmpl.Copy(),
    64  		gpuPerformanceStateChartTmpl.Copy(),
    65  	}
    66  )
    67  
    68  var (
    69  	gpuPCIBandwidthUsageChartTmpl = module.Chart{
    70  		ID:       "gpu_%s_pcie_bandwidth_usage",
    71  		Title:    "PCI Express Bandwidth Usage",
    72  		Units:    "B/s",
    73  		Fam:      "pcie bandwidth",
    74  		Ctx:      "nvidia_smi.gpu_pcie_bandwidth_usage",
    75  		Type:     module.Area,
    76  		Priority: prioGPUPCIBandwidthUsage,
    77  		Dims: module.Dims{
    78  			{ID: "gpu_%s_pcie_bandwidth_usage_rx", Name: "rx"},
    79  			{ID: "gpu_%s_pcie_bandwidth_usage_tx", Name: "tx", Mul: -1},
    80  		},
    81  	}
    82  	gpuPCIBandwidthUtilizationChartTmpl = module.Chart{
    83  		ID:       "gpu_%s_pcie_bandwidth_utilization",
    84  		Title:    "PCI Express Bandwidth Utilization",
    85  		Units:    "percentage",
    86  		Fam:      "pcie bandwidth",
    87  		Ctx:      "nvidia_smi.gpu_pcie_bandwidth_utilization",
    88  		Priority: prioGPUPCIBandwidthUtilization,
    89  		Dims: module.Dims{
    90  			{ID: "gpu_%s_pcie_bandwidth_utilization_rx", Name: "rx", Div: 100},
    91  			{ID: "gpu_%s_pcie_bandwidth_utilization_tx", Name: "tx", Div: 100},
    92  		},
    93  	}
    94  	gpuFanSpeedPercChartTmpl = module.Chart{
    95  		ID:       "gpu_%s_fan_speed_perc",
    96  		Title:    "Fan speed",
    97  		Units:    "%",
    98  		Fam:      "fan speed",
    99  		Ctx:      "nvidia_smi.gpu_fan_speed_perc",
   100  		Priority: prioGPUFanSpeed,
   101  		Dims: module.Dims{
   102  			{ID: "gpu_%s_fan_speed_perc", Name: "fan_speed"},
   103  		},
   104  	}
   105  	gpuUtilizationChartTmpl = module.Chart{
   106  		ID:       "gpu_%s_gpu_utilization",
   107  		Title:    "GPU utilization",
   108  		Units:    "%",
   109  		Fam:      "gpu utilization",
   110  		Ctx:      "nvidia_smi.gpu_utilization",
   111  		Priority: prioGPUUtilization,
   112  		Dims: module.Dims{
   113  			{ID: "gpu_%s_gpu_utilization", Name: "gpu"},
   114  		},
   115  	}
   116  	gpuMemUtilizationChartTmpl = module.Chart{
   117  		ID:       "gpu_%s_memory_utilization",
   118  		Title:    "Memory utilization",
   119  		Units:    "%",
   120  		Fam:      "mem utilization",
   121  		Ctx:      "nvidia_smi.gpu_memory_utilization",
   122  		Priority: prioGPUMemUtilization,
   123  		Dims: module.Dims{
   124  			{ID: "gpu_%s_mem_utilization", Name: "memory"},
   125  		},
   126  	}
   127  	gpuDecoderUtilizationChartTmpl = module.Chart{
   128  		ID:       "gpu_%s_decoder_utilization",
   129  		Title:    "Decoder utilization",
   130  		Units:    "%",
   131  		Fam:      "dec utilization",
   132  		Ctx:      "nvidia_smi.gpu_decoder_utilization",
   133  		Priority: prioGPUDecoderUtilization,
   134  		Dims: module.Dims{
   135  			{ID: "gpu_%s_decoder_utilization", Name: "decoder"},
   136  		},
   137  	}
   138  	gpuEncoderUtilizationChartTmpl = module.Chart{
   139  		ID:       "gpu_%s_encoder_utilization",
   140  		Title:    "Encoder utilization",
   141  		Units:    "%",
   142  		Fam:      "enc utilization",
   143  		Ctx:      "nvidia_smi.gpu_encoder_utilization",
   144  		Priority: prioGPUEncoderUtilization,
   145  		Dims: module.Dims{
   146  			{ID: "gpu_%s_encoder_utilization", Name: "encoder"},
   147  		},
   148  	}
   149  	gpuMIGModeCurrentStatusChartTmpl = module.Chart{
   150  		ID:       "gpu_%s_mig_mode_current_status",
   151  		Title:    "MIG current mode",
   152  		Units:    "status",
   153  		Fam:      "mig",
   154  		Ctx:      "nvidia_smi.gpu_mig_mode_current_status",
   155  		Priority: prioGPUMIGModeStatus,
   156  		Dims: module.Dims{
   157  			{ID: "gpu_%s_mig_current_mode_enabled", Name: "enabled"},
   158  			{ID: "gpu_%s_mig_current_mode_disabled", Name: "disabled"},
   159  		},
   160  	}
   161  	gpuMIGDevicesCountChartTmpl = module.Chart{
   162  		ID:       "gpu_%s_mig_devices_count",
   163  		Title:    "MIG devices",
   164  		Units:    "devices",
   165  		Fam:      "mig",
   166  		Ctx:      "nvidia_smi.gpu_mig_devices_count",
   167  		Priority: prioGPUMIGDevicesCount,
   168  		Dims: module.Dims{
   169  			{ID: "gpu_%s_mig_devices_count", Name: "mig"},
   170  		},
   171  	}
   172  	gpuFrameBufferMemoryUsageChartTmpl = module.Chart{
   173  		ID:       "gpu_%s_frame_buffer_memory_usage",
   174  		Title:    "Frame buffer memory usage",
   175  		Units:    "B",
   176  		Fam:      "fb mem usage",
   177  		Ctx:      "nvidia_smi.gpu_frame_buffer_memory_usage",
   178  		Type:     module.Stacked,
   179  		Priority: prioGPUFBMemoryUsage,
   180  		Dims: module.Dims{
   181  			{ID: "gpu_%s_frame_buffer_memory_usage_free", Name: "free"},
   182  			{ID: "gpu_%s_frame_buffer_memory_usage_used", Name: "used"},
   183  			{ID: "gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"},
   184  		},
   185  	}
   186  	gpuBAR1MemoryUsageChartTmpl = module.Chart{
   187  		ID:       "gpu_%s_bar1_memory_usage",
   188  		Title:    "BAR1 memory usage",
   189  		Units:    "B",
   190  		Fam:      "bar1 mem usage",
   191  		Ctx:      "nvidia_smi.gpu_bar1_memory_usage",
   192  		Type:     module.Stacked,
   193  		Priority: prioGPUBAR1MemoryUsage,
   194  		Dims: module.Dims{
   195  			{ID: "gpu_%s_bar1_memory_usage_free", Name: "free"},
   196  			{ID: "gpu_%s_bar1_memory_usage_used", Name: "used"},
   197  		},
   198  	}
   199  	gpuTemperatureChartTmpl = module.Chart{
   200  		ID:       "gpu_%s_temperature",
   201  		Title:    "Temperature",
   202  		Units:    "Celsius",
   203  		Fam:      "temperature",
   204  		Ctx:      "nvidia_smi.gpu_temperature",
   205  		Priority: prioGPUTemperatureChart,
   206  		Dims: module.Dims{
   207  			{ID: "gpu_%s_temperature", Name: "temperature"},
   208  		},
   209  	}
   210  	gpuVoltageChartTmpl = module.Chart{
   211  		ID:       "gpu_%s_voltage",
   212  		Title:    "Voltage",
   213  		Units:    "V",
   214  		Fam:      "voltage",
   215  		Ctx:      "nvidia_smi.gpu_voltage",
   216  		Priority: prioGPUVoltageChart,
   217  		Dims: module.Dims{
   218  			{ID: "gpu_%s_voltage", Name: "voltage", Div: 1000}, // mV => V
   219  		},
   220  	}
   221  	gpuClockFreqChartTmpl = module.Chart{
   222  		ID:       "gpu_%s_clock_freq",
   223  		Title:    "Clock current frequency",
   224  		Units:    "MHz",
   225  		Fam:      "clocks",
   226  		Ctx:      "nvidia_smi.gpu_clock_freq",
   227  		Priority: prioGPUClockFreq,
   228  		Dims: module.Dims{
   229  			{ID: "gpu_%s_graphics_clock", Name: "graphics"},
   230  			{ID: "gpu_%s_video_clock", Name: "video"},
   231  			{ID: "gpu_%s_sm_clock", Name: "sm"},
   232  			{ID: "gpu_%s_mem_clock", Name: "mem"},
   233  		},
   234  	}
   235  	gpuPowerDrawChartTmpl = module.Chart{
   236  		ID:       "gpu_%s_power_draw",
   237  		Title:    "Power draw",
   238  		Units:    "Watts",
   239  		Fam:      "power draw",
   240  		Ctx:      "nvidia_smi.gpu_power_draw",
   241  		Priority: prioGPUPowerDraw,
   242  		Dims: module.Dims{
   243  			{ID: "gpu_%s_power_draw", Name: "power_draw"},
   244  		},
   245  	}
   246  	gpuPerformanceStateChartTmpl = module.Chart{
   247  		ID:       "gpu_%s_performance_state",
   248  		Title:    "Performance state",
   249  		Units:    "state",
   250  		Fam:      "performance state",
   251  		Ctx:      "nvidia_smi.gpu_performance_state",
   252  		Priority: prioGPUPerformanceState,
   253  		Dims: module.Dims{
   254  			{ID: "gpu_%s_performance_state_P0", Name: "P0"},
   255  			{ID: "gpu_%s_performance_state_P1", Name: "P1"},
   256  			{ID: "gpu_%s_performance_state_P2", Name: "P2"},
   257  			{ID: "gpu_%s_performance_state_P3", Name: "P3"},
   258  			{ID: "gpu_%s_performance_state_P4", Name: "P4"},
   259  			{ID: "gpu_%s_performance_state_P5", Name: "P5"},
   260  			{ID: "gpu_%s_performance_state_P6", Name: "P6"},
   261  			{ID: "gpu_%s_performance_state_P7", Name: "P7"},
   262  			{ID: "gpu_%s_performance_state_P8", Name: "P8"},
   263  			{ID: "gpu_%s_performance_state_P9", Name: "P9"},
   264  			{ID: "gpu_%s_performance_state_P10", Name: "P10"},
   265  			{ID: "gpu_%s_performance_state_P11", Name: "P11"},
   266  			{ID: "gpu_%s_performance_state_P12", Name: "P12"},
   267  			{ID: "gpu_%s_performance_state_P13", Name: "P13"},
   268  			{ID: "gpu_%s_performance_state_P14", Name: "P14"},
   269  			{ID: "gpu_%s_performance_state_P15", Name: "P15"},
   270  		},
   271  	}
   272  )
   273  
   274  func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
   275  	charts := gpuXMLCharts.Copy()
   276  
   277  	if !isValidValue(gpu.Utilization.GpuUtil) {
   278  		_ = charts.Remove(gpuUtilizationChartTmpl.ID)
   279  	}
   280  	if !isValidValue(gpu.Utilization.MemoryUtil) {
   281  		_ = charts.Remove(gpuMemUtilizationChartTmpl.ID)
   282  	}
   283  	if !isValidValue(gpu.Utilization.DecoderUtil) {
   284  		_ = charts.Remove(gpuDecoderUtilizationChartTmpl.ID)
   285  	}
   286  	if !isValidValue(gpu.Utilization.EncoderUtil) {
   287  		_ = charts.Remove(gpuEncoderUtilizationChartTmpl.ID)
   288  	}
   289  	if !isValidValue(gpu.MIGMode.CurrentMIG) {
   290  		_ = charts.Remove(gpuMIGModeCurrentStatusChartTmpl.ID)
   291  		_ = charts.Remove(gpuMIGDevicesCountChartTmpl.ID)
   292  	}
   293  	if !isValidValue(gpu.FanSpeed) {
   294  		_ = charts.Remove(gpuFanSpeedPercChartTmpl.ID)
   295  	}
   296  	if (gpu.PowerReadings == nil || !isValidValue(gpu.PowerReadings.PowerDraw)) &&
   297  		(gpu.GPUPowerReadings == nil || !isValidValue(gpu.GPUPowerReadings.PowerDraw)) {
   298  		_ = charts.Remove(gpuPowerDrawChartTmpl.ID)
   299  	}
   300  	if !isValidValue(gpu.Voltage.GraphicsVolt) {
   301  		_ = charts.Remove(gpuVoltageChartTmpl.ID)
   302  	}
   303  
   304  	for _, c := range *charts {
   305  		c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID))
   306  		c.Labels = []module.Label{
   307  			// csv output has no 'product_brand'
   308  			{Key: "uuid", Value: gpu.UUID},
   309  			{Key: "product_name", Value: gpu.ProductName},
   310  		}
   311  		for _, d := range c.Dims {
   312  			d.ID = fmt.Sprintf(d.ID, gpu.UUID)
   313  		}
   314  	}
   315  
   316  	if err := nv.Charts().Add(*charts...); err != nil {
   317  		nv.Warning(err)
   318  	}
   319  }
   320  
   321  func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) {
   322  	charts := gpuCSVCharts.Copy()
   323  
   324  	if !isValidValue(gpu.utilizationGPU) {
   325  		_ = charts.Remove(gpuUtilizationChartTmpl.ID)
   326  	}
   327  	if !isValidValue(gpu.utilizationMemory) {
   328  		_ = charts.Remove(gpuMemUtilizationChartTmpl.ID)
   329  	}
   330  	if !isValidValue(gpu.fanSpeed) {
   331  		_ = charts.Remove(gpuFanSpeedPercChartTmpl.ID)
   332  	}
   333  	if !isValidValue(gpu.powerDraw) {
   334  		_ = charts.Remove(gpuPowerDrawChartTmpl.ID)
   335  	}
   336  
   337  	for _, c := range *charts {
   338  		c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.uuid))
   339  		c.Labels = []module.Label{
   340  			{Key: "product_name", Value: gpu.name},
   341  		}
   342  		for _, d := range c.Dims {
   343  			d.ID = fmt.Sprintf(d.ID, gpu.uuid)
   344  		}
   345  	}
   346  
   347  	if err := nv.Charts().Add(*charts...); err != nil {
   348  		nv.Warning(err)
   349  	}
   350  }
   351  
   352  var (
   353  	migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{
   354  		ID:       "mig_instance_%s_gpu_%s_frame_buffer_memory_usage",
   355  		Title:    "MIG Frame buffer memory usage",
   356  		Units:    "B",
   357  		Fam:      "fb mem usage",
   358  		Ctx:      "nvidia_smi.gpu_mig_frame_buffer_memory_usage",
   359  		Type:     module.Stacked,
   360  		Priority: prioGPUMIGFBMemoryUsage,
   361  		Dims: module.Dims{
   362  			{ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_free", Name: "free"},
   363  			{ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_used", Name: "used"},
   364  			{ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"},
   365  		},
   366  	}
   367  	migDeviceBAR1MemoryUsageChartTmpl = module.Chart{
   368  		ID:       "mig_instance_%s_gpu_%s_bar1_memory_usage",
   369  		Title:    "MIG BAR1 memory usage",
   370  		Units:    "B",
   371  		Fam:      "bar1 mem usage",
   372  		Ctx:      "nvidia_smi.gpu_mig_bar1_memory_usage",
   373  		Type:     module.Stacked,
   374  		Priority: prioGPUMIGBAR1MemoryUsage,
   375  		Dims: module.Dims{
   376  			{ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_free", Name: "free"},
   377  			{ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_used", Name: "used"},
   378  		},
   379  	}
   380  )
   381  
   382  func (nv *NvidiaSMI) addMIGDeviceXMLCharts(gpu xmlGPUInfo, mig xmlMIGDeviceInfo) {
   383  	charts := migDeviceXMLCharts.Copy()
   384  
   385  	for _, c := range *charts {
   386  		c.ID = fmt.Sprintf(c.ID, strings.ToLower(mig.GPUInstanceID), strings.ToLower(gpu.UUID))
   387  		c.Labels = []module.Label{
   388  			{Key: "gpu_uuid", Value: gpu.UUID},
   389  			{Key: "gpu_product_name", Value: gpu.ProductName},
   390  			{Key: "gpu_instance_id", Value: mig.GPUInstanceID},
   391  		}
   392  		for _, d := range c.Dims {
   393  			d.ID = fmt.Sprintf(d.ID, mig.GPUInstanceID, gpu.UUID)
   394  		}
   395  	}
   396  
   397  	if err := nv.Charts().Add(*charts...); err != nil {
   398  		nv.Warning(err)
   399  	}
   400  }
   401  
   402  func (nv *NvidiaSMI) removeCharts(prefix string) {
   403  	prefix = strings.ToLower(prefix)
   404  
   405  	for _, c := range *nv.Charts() {
   406  		if strings.HasPrefix(c.ID, prefix) {
   407  			c.MarkRemove()
   408  			c.MarkNotCreated()
   409  		}
   410  	}
   411  }