github.com/netdata/go.d.plugin@v0.58.1/modules/nvidia_smi/charts.go (about) 1 // SPDX-License-Identifier: GPL-3.0-or-later 2 3 package nvidia_smi 4 5 import ( 6 "fmt" 7 "strings" 8 9 "github.com/netdata/go.d.plugin/agent/module" 10 ) 11 12 const ( 13 prioGPUPCIBandwidthUsage = module.Priority + iota 14 prioGPUPCIBandwidthUtilization 15 prioGPUFanSpeed 16 prioGPUUtilization 17 prioGPUMemUtilization 18 prioGPUDecoderUtilization 19 prioGPUEncoderUtilization 20 prioGPUMIGModeStatus 21 prioGPUMIGDevicesCount 22 prioGPUFBMemoryUsage 23 prioGPUMIGFBMemoryUsage 24 prioGPUBAR1MemoryUsage 25 prioGPUMIGBAR1MemoryUsage 26 prioGPUTemperatureChart 27 prioGPUVoltageChart 28 prioGPUClockFreq 29 prioGPUPowerDraw 30 prioGPUPerformanceState 31 ) 32 33 var ( 34 gpuXMLCharts = module.Charts{ 35 gpuPCIBandwidthUsageChartTmpl.Copy(), 36 gpuPCIBandwidthUtilizationChartTmpl.Copy(), 37 gpuFanSpeedPercChartTmpl.Copy(), 38 gpuUtilizationChartTmpl.Copy(), 39 gpuMemUtilizationChartTmpl.Copy(), 40 gpuDecoderUtilizationChartTmpl.Copy(), 41 gpuEncoderUtilizationChartTmpl.Copy(), 42 gpuMIGModeCurrentStatusChartTmpl.Copy(), 43 gpuMIGDevicesCountChartTmpl.Copy(), 44 gpuFrameBufferMemoryUsageChartTmpl.Copy(), 45 gpuBAR1MemoryUsageChartTmpl.Copy(), 46 gpuVoltageChartTmpl.Copy(), 47 gpuTemperatureChartTmpl.Copy(), 48 gpuClockFreqChartTmpl.Copy(), 49 gpuPowerDrawChartTmpl.Copy(), 50 gpuPerformanceStateChartTmpl.Copy(), 51 } 52 migDeviceXMLCharts = module.Charts{ 53 migDeviceFrameBufferMemoryUsageChartTmpl.Copy(), 54 migDeviceBAR1MemoryUsageChartTmpl.Copy(), 55 } 56 gpuCSVCharts = module.Charts{ 57 gpuFanSpeedPercChartTmpl.Copy(), 58 gpuUtilizationChartTmpl.Copy(), 59 gpuMemUtilizationChartTmpl.Copy(), 60 gpuFrameBufferMemoryUsageChartTmpl.Copy(), 61 gpuTemperatureChartTmpl.Copy(), 62 gpuClockFreqChartTmpl.Copy(), 63 gpuPowerDrawChartTmpl.Copy(), 64 gpuPerformanceStateChartTmpl.Copy(), 65 } 66 ) 67 68 var ( 69 gpuPCIBandwidthUsageChartTmpl = module.Chart{ 70 ID: "gpu_%s_pcie_bandwidth_usage", 71 Title: "PCI Express Bandwidth Usage", 72 Units: "B/s", 73 Fam: "pcie bandwidth", 74 Ctx: "nvidia_smi.gpu_pcie_bandwidth_usage", 75 Type: module.Area, 76 Priority: prioGPUPCIBandwidthUsage, 77 Dims: module.Dims{ 78 {ID: "gpu_%s_pcie_bandwidth_usage_rx", Name: "rx"}, 79 {ID: "gpu_%s_pcie_bandwidth_usage_tx", Name: "tx", Mul: -1}, 80 }, 81 } 82 gpuPCIBandwidthUtilizationChartTmpl = module.Chart{ 83 ID: "gpu_%s_pcie_bandwidth_utilization", 84 Title: "PCI Express Bandwidth Utilization", 85 Units: "percentage", 86 Fam: "pcie bandwidth", 87 Ctx: "nvidia_smi.gpu_pcie_bandwidth_utilization", 88 Priority: prioGPUPCIBandwidthUtilization, 89 Dims: module.Dims{ 90 {ID: "gpu_%s_pcie_bandwidth_utilization_rx", Name: "rx", Div: 100}, 91 {ID: "gpu_%s_pcie_bandwidth_utilization_tx", Name: "tx", Div: 100}, 92 }, 93 } 94 gpuFanSpeedPercChartTmpl = module.Chart{ 95 ID: "gpu_%s_fan_speed_perc", 96 Title: "Fan speed", 97 Units: "%", 98 Fam: "fan speed", 99 Ctx: "nvidia_smi.gpu_fan_speed_perc", 100 Priority: prioGPUFanSpeed, 101 Dims: module.Dims{ 102 {ID: "gpu_%s_fan_speed_perc", Name: "fan_speed"}, 103 }, 104 } 105 gpuUtilizationChartTmpl = module.Chart{ 106 ID: "gpu_%s_gpu_utilization", 107 Title: "GPU utilization", 108 Units: "%", 109 Fam: "gpu utilization", 110 Ctx: "nvidia_smi.gpu_utilization", 111 Priority: prioGPUUtilization, 112 Dims: module.Dims{ 113 {ID: "gpu_%s_gpu_utilization", Name: "gpu"}, 114 }, 115 } 116 gpuMemUtilizationChartTmpl = module.Chart{ 117 ID: "gpu_%s_memory_utilization", 118 Title: "Memory utilization", 119 Units: "%", 120 Fam: "mem utilization", 121 Ctx: "nvidia_smi.gpu_memory_utilization", 122 Priority: prioGPUMemUtilization, 123 Dims: module.Dims{ 124 {ID: "gpu_%s_mem_utilization", Name: "memory"}, 125 }, 126 } 127 gpuDecoderUtilizationChartTmpl = module.Chart{ 128 ID: "gpu_%s_decoder_utilization", 129 Title: "Decoder utilization", 130 Units: "%", 131 Fam: "dec utilization", 132 Ctx: "nvidia_smi.gpu_decoder_utilization", 133 Priority: prioGPUDecoderUtilization, 134 Dims: module.Dims{ 135 {ID: "gpu_%s_decoder_utilization", Name: "decoder"}, 136 }, 137 } 138 gpuEncoderUtilizationChartTmpl = module.Chart{ 139 ID: "gpu_%s_encoder_utilization", 140 Title: "Encoder utilization", 141 Units: "%", 142 Fam: "enc utilization", 143 Ctx: "nvidia_smi.gpu_encoder_utilization", 144 Priority: prioGPUEncoderUtilization, 145 Dims: module.Dims{ 146 {ID: "gpu_%s_encoder_utilization", Name: "encoder"}, 147 }, 148 } 149 gpuMIGModeCurrentStatusChartTmpl = module.Chart{ 150 ID: "gpu_%s_mig_mode_current_status", 151 Title: "MIG current mode", 152 Units: "status", 153 Fam: "mig", 154 Ctx: "nvidia_smi.gpu_mig_mode_current_status", 155 Priority: prioGPUMIGModeStatus, 156 Dims: module.Dims{ 157 {ID: "gpu_%s_mig_current_mode_enabled", Name: "enabled"}, 158 {ID: "gpu_%s_mig_current_mode_disabled", Name: "disabled"}, 159 }, 160 } 161 gpuMIGDevicesCountChartTmpl = module.Chart{ 162 ID: "gpu_%s_mig_devices_count", 163 Title: "MIG devices", 164 Units: "devices", 165 Fam: "mig", 166 Ctx: "nvidia_smi.gpu_mig_devices_count", 167 Priority: prioGPUMIGDevicesCount, 168 Dims: module.Dims{ 169 {ID: "gpu_%s_mig_devices_count", Name: "mig"}, 170 }, 171 } 172 gpuFrameBufferMemoryUsageChartTmpl = module.Chart{ 173 ID: "gpu_%s_frame_buffer_memory_usage", 174 Title: "Frame buffer memory usage", 175 Units: "B", 176 Fam: "fb mem usage", 177 Ctx: "nvidia_smi.gpu_frame_buffer_memory_usage", 178 Type: module.Stacked, 179 Priority: prioGPUFBMemoryUsage, 180 Dims: module.Dims{ 181 {ID: "gpu_%s_frame_buffer_memory_usage_free", Name: "free"}, 182 {ID: "gpu_%s_frame_buffer_memory_usage_used", Name: "used"}, 183 {ID: "gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"}, 184 }, 185 } 186 gpuBAR1MemoryUsageChartTmpl = module.Chart{ 187 ID: "gpu_%s_bar1_memory_usage", 188 Title: "BAR1 memory usage", 189 Units: "B", 190 Fam: "bar1 mem usage", 191 Ctx: "nvidia_smi.gpu_bar1_memory_usage", 192 Type: module.Stacked, 193 Priority: prioGPUBAR1MemoryUsage, 194 Dims: module.Dims{ 195 {ID: "gpu_%s_bar1_memory_usage_free", Name: "free"}, 196 {ID: "gpu_%s_bar1_memory_usage_used", Name: "used"}, 197 }, 198 } 199 gpuTemperatureChartTmpl = module.Chart{ 200 ID: "gpu_%s_temperature", 201 Title: "Temperature", 202 Units: "Celsius", 203 Fam: "temperature", 204 Ctx: "nvidia_smi.gpu_temperature", 205 Priority: prioGPUTemperatureChart, 206 Dims: module.Dims{ 207 {ID: "gpu_%s_temperature", Name: "temperature"}, 208 }, 209 } 210 gpuVoltageChartTmpl = module.Chart{ 211 ID: "gpu_%s_voltage", 212 Title: "Voltage", 213 Units: "V", 214 Fam: "voltage", 215 Ctx: "nvidia_smi.gpu_voltage", 216 Priority: prioGPUVoltageChart, 217 Dims: module.Dims{ 218 {ID: "gpu_%s_voltage", Name: "voltage", Div: 1000}, // mV => V 219 }, 220 } 221 gpuClockFreqChartTmpl = module.Chart{ 222 ID: "gpu_%s_clock_freq", 223 Title: "Clock current frequency", 224 Units: "MHz", 225 Fam: "clocks", 226 Ctx: "nvidia_smi.gpu_clock_freq", 227 Priority: prioGPUClockFreq, 228 Dims: module.Dims{ 229 {ID: "gpu_%s_graphics_clock", Name: "graphics"}, 230 {ID: "gpu_%s_video_clock", Name: "video"}, 231 {ID: "gpu_%s_sm_clock", Name: "sm"}, 232 {ID: "gpu_%s_mem_clock", Name: "mem"}, 233 }, 234 } 235 gpuPowerDrawChartTmpl = module.Chart{ 236 ID: "gpu_%s_power_draw", 237 Title: "Power draw", 238 Units: "Watts", 239 Fam: "power draw", 240 Ctx: "nvidia_smi.gpu_power_draw", 241 Priority: prioGPUPowerDraw, 242 Dims: module.Dims{ 243 {ID: "gpu_%s_power_draw", Name: "power_draw"}, 244 }, 245 } 246 gpuPerformanceStateChartTmpl = module.Chart{ 247 ID: "gpu_%s_performance_state", 248 Title: "Performance state", 249 Units: "state", 250 Fam: "performance state", 251 Ctx: "nvidia_smi.gpu_performance_state", 252 Priority: prioGPUPerformanceState, 253 Dims: module.Dims{ 254 {ID: "gpu_%s_performance_state_P0", Name: "P0"}, 255 {ID: "gpu_%s_performance_state_P1", Name: "P1"}, 256 {ID: "gpu_%s_performance_state_P2", Name: "P2"}, 257 {ID: "gpu_%s_performance_state_P3", Name: "P3"}, 258 {ID: "gpu_%s_performance_state_P4", Name: "P4"}, 259 {ID: "gpu_%s_performance_state_P5", Name: "P5"}, 260 {ID: "gpu_%s_performance_state_P6", Name: "P6"}, 261 {ID: "gpu_%s_performance_state_P7", Name: "P7"}, 262 {ID: "gpu_%s_performance_state_P8", Name: "P8"}, 263 {ID: "gpu_%s_performance_state_P9", Name: "P9"}, 264 {ID: "gpu_%s_performance_state_P10", Name: "P10"}, 265 {ID: "gpu_%s_performance_state_P11", Name: "P11"}, 266 {ID: "gpu_%s_performance_state_P12", Name: "P12"}, 267 {ID: "gpu_%s_performance_state_P13", Name: "P13"}, 268 {ID: "gpu_%s_performance_state_P14", Name: "P14"}, 269 {ID: "gpu_%s_performance_state_P15", Name: "P15"}, 270 }, 271 } 272 ) 273 274 func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) { 275 charts := gpuXMLCharts.Copy() 276 277 if !isValidValue(gpu.Utilization.GpuUtil) { 278 _ = charts.Remove(gpuUtilizationChartTmpl.ID) 279 } 280 if !isValidValue(gpu.Utilization.MemoryUtil) { 281 _ = charts.Remove(gpuMemUtilizationChartTmpl.ID) 282 } 283 if !isValidValue(gpu.Utilization.DecoderUtil) { 284 _ = charts.Remove(gpuDecoderUtilizationChartTmpl.ID) 285 } 286 if !isValidValue(gpu.Utilization.EncoderUtil) { 287 _ = charts.Remove(gpuEncoderUtilizationChartTmpl.ID) 288 } 289 if !isValidValue(gpu.MIGMode.CurrentMIG) { 290 _ = charts.Remove(gpuMIGModeCurrentStatusChartTmpl.ID) 291 _ = charts.Remove(gpuMIGDevicesCountChartTmpl.ID) 292 } 293 if !isValidValue(gpu.FanSpeed) { 294 _ = charts.Remove(gpuFanSpeedPercChartTmpl.ID) 295 } 296 if (gpu.PowerReadings == nil || !isValidValue(gpu.PowerReadings.PowerDraw)) && 297 (gpu.GPUPowerReadings == nil || !isValidValue(gpu.GPUPowerReadings.PowerDraw)) { 298 _ = charts.Remove(gpuPowerDrawChartTmpl.ID) 299 } 300 if !isValidValue(gpu.Voltage.GraphicsVolt) { 301 _ = charts.Remove(gpuVoltageChartTmpl.ID) 302 } 303 304 for _, c := range *charts { 305 c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID)) 306 c.Labels = []module.Label{ 307 // csv output has no 'product_brand' 308 {Key: "uuid", Value: gpu.UUID}, 309 {Key: "product_name", Value: gpu.ProductName}, 310 } 311 for _, d := range c.Dims { 312 d.ID = fmt.Sprintf(d.ID, gpu.UUID) 313 } 314 } 315 316 if err := nv.Charts().Add(*charts...); err != nil { 317 nv.Warning(err) 318 } 319 } 320 321 func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) { 322 charts := gpuCSVCharts.Copy() 323 324 if !isValidValue(gpu.utilizationGPU) { 325 _ = charts.Remove(gpuUtilizationChartTmpl.ID) 326 } 327 if !isValidValue(gpu.utilizationMemory) { 328 _ = charts.Remove(gpuMemUtilizationChartTmpl.ID) 329 } 330 if !isValidValue(gpu.fanSpeed) { 331 _ = charts.Remove(gpuFanSpeedPercChartTmpl.ID) 332 } 333 if !isValidValue(gpu.powerDraw) { 334 _ = charts.Remove(gpuPowerDrawChartTmpl.ID) 335 } 336 337 for _, c := range *charts { 338 c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.uuid)) 339 c.Labels = []module.Label{ 340 {Key: "product_name", Value: gpu.name}, 341 } 342 for _, d := range c.Dims { 343 d.ID = fmt.Sprintf(d.ID, gpu.uuid) 344 } 345 } 346 347 if err := nv.Charts().Add(*charts...); err != nil { 348 nv.Warning(err) 349 } 350 } 351 352 var ( 353 migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{ 354 ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage", 355 Title: "MIG Frame buffer memory usage", 356 Units: "B", 357 Fam: "fb mem usage", 358 Ctx: "nvidia_smi.gpu_mig_frame_buffer_memory_usage", 359 Type: module.Stacked, 360 Priority: prioGPUMIGFBMemoryUsage, 361 Dims: module.Dims{ 362 {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_free", Name: "free"}, 363 {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_used", Name: "used"}, 364 {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"}, 365 }, 366 } 367 migDeviceBAR1MemoryUsageChartTmpl = module.Chart{ 368 ID: "mig_instance_%s_gpu_%s_bar1_memory_usage", 369 Title: "MIG BAR1 memory usage", 370 Units: "B", 371 Fam: "bar1 mem usage", 372 Ctx: "nvidia_smi.gpu_mig_bar1_memory_usage", 373 Type: module.Stacked, 374 Priority: prioGPUMIGBAR1MemoryUsage, 375 Dims: module.Dims{ 376 {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_free", Name: "free"}, 377 {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_used", Name: "used"}, 378 }, 379 } 380 ) 381 382 func (nv *NvidiaSMI) addMIGDeviceXMLCharts(gpu xmlGPUInfo, mig xmlMIGDeviceInfo) { 383 charts := migDeviceXMLCharts.Copy() 384 385 for _, c := range *charts { 386 c.ID = fmt.Sprintf(c.ID, strings.ToLower(mig.GPUInstanceID), strings.ToLower(gpu.UUID)) 387 c.Labels = []module.Label{ 388 {Key: "gpu_uuid", Value: gpu.UUID}, 389 {Key: "gpu_product_name", Value: gpu.ProductName}, 390 {Key: "gpu_instance_id", Value: mig.GPUInstanceID}, 391 } 392 for _, d := range c.Dims { 393 d.ID = fmt.Sprintf(d.ID, mig.GPUInstanceID, gpu.UUID) 394 } 395 } 396 397 if err := nv.Charts().Add(*charts...); err != nil { 398 nv.Warning(err) 399 } 400 } 401 402 func (nv *NvidiaSMI) removeCharts(prefix string) { 403 prefix = strings.ToLower(prefix) 404 405 for _, c := range *nv.Charts() { 406 if strings.HasPrefix(c.ID, prefix) { 407 c.MarkRemove() 408 c.MarkNotCreated() 409 } 410 } 411 }