github.com/netdata/go.d.plugin@v0.58.1/modules/nvidia_smi/collect_xml.go (about) 1 // SPDX-License-Identifier: GPL-3.0-or-later 2 3 package nvidia_smi 4 5 import ( 6 "encoding/xml" 7 "fmt" 8 "strconv" 9 "strings" 10 ) 11 12 func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error { 13 bs, err := nv.exec.queryGPUInfoXML() 14 if err != nil { 15 return fmt.Errorf("error on quering XML GPU info: %v", err) 16 } 17 18 info := &xmlInfo{} 19 if err := xml.Unmarshal(bs, info); err != nil { 20 return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err) 21 } 22 23 seenGPU := make(map[string]bool) 24 seenMIG := make(map[string]bool) 25 26 for _, gpu := range info.GPUs { 27 if !isValidValue(gpu.UUID) { 28 continue 29 } 30 31 px := "gpu_" + gpu.UUID + "_" 32 33 seenGPU[px] = true 34 35 if !nv.gpus[px] { 36 nv.gpus[px] = true 37 nv.addGPUXMLCharts(gpu) 38 } 39 40 addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes 41 addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes 42 if max := calcMaxPCIEBandwidth(gpu); max > 0 { 43 rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes 44 tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes 45 mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100) 46 mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100) 47 } 48 addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0) 49 addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0) 50 addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0) 51 addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0) 52 addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0) 53 addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024) // MiB => bytes 54 addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024) // MiB => bytes 55 addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes 56 addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024) // MiB => bytes 57 addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024) // MiB => bytes 58 addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0) 59 addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0) 60 addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0) 61 addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0) 62 addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0) 63 if gpu.PowerReadings != nil { 64 addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0) 65 } else if gpu.GPUPowerReadings != nil { 66 addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0) 67 } 68 addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0) 69 for i := 0; i < 16; i++ { 70 s := "P" + strconv.Itoa(i) 71 mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s) 72 } 73 if isValidValue(gpu.MIGMode.CurrentMIG) { 74 mode := strings.ToLower(gpu.MIGMode.CurrentMIG) 75 mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled") 76 mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled") 77 mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice)) 78 } 79 80 for _, mig := range gpu.MIGDevices.MIGDevice { 81 if !isValidValue(mig.GPUInstanceID) { 82 continue 83 } 84 85 px := "mig_instance_" + mig.GPUInstanceID + "_" + px 86 87 seenMIG[px] = true 88 89 if !nv.migs[px] { 90 nv.migs[px] = true 91 nv.addMIGDeviceXMLCharts(gpu, mig) 92 } 93 94 addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0) 95 addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024) // MiB => bytes 96 addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024) // MiB => bytes 97 addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes 98 addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024) // MiB => bytes 99 addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024) // MiB => bytes 100 } 101 } 102 103 for px := range nv.gpus { 104 if !seenGPU[px] { 105 delete(nv.gpus, px) 106 nv.removeCharts(px) 107 } 108 } 109 110 for px := range nv.migs { 111 if !seenMIG[px] { 112 delete(nv.migs, px) 113 nv.removeCharts(px) 114 } 115 } 116 117 return nil 118 } 119 120 func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 { 121 gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen 122 width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x") 123 124 if !isValidValue(gen) || !isValidValue(width) { 125 return 0 126 } 127 128 // https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance 129 var speed, enc float64 130 switch gen { 131 case "1": 132 speed, enc = 2.5, 1/5 133 case "2": 134 speed, enc = 5, 1/5 135 case "3": 136 speed, enc = 8, 2/130 137 case "4": 138 speed, enc = 16, 2/130 139 case "5": 140 speed, enc = 32, 2/130 141 default: 142 return 0 143 } 144 145 // Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s 146 return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes 147 } 148 149 type ( 150 xmlInfo struct { 151 GPUs []xmlGPUInfo `xml:"gpu"` 152 } 153 xmlGPUInfo struct { 154 ID string `xml:"id,attr"` 155 ProductName string `xml:"product_name"` 156 ProductBrand string `xml:"product_brand"` 157 ProductArchitecture string `xml:"product_architecture"` 158 UUID string `xml:"uuid"` 159 FanSpeed string `xml:"fan_speed"` 160 PerformanceState string `xml:"performance_state"` 161 MIGMode struct { 162 CurrentMIG string `xml:"current_mig"` 163 } `xml:"mig_mode"` 164 MIGDevices struct { 165 MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"` 166 } `xml:"mig_devices"` 167 PCI struct { 168 TxUtil string `xml:"tx_util"` 169 RxUtil string `xml:"rx_util"` 170 PCIGPULinkInfo struct { 171 PCIEGen struct { 172 MaxLinkGen string `xml:"max_link_gen"` 173 } `xml:"pcie_gen"` 174 LinkWidths struct { 175 MaxLinkWidth string `xml:"max_link_width"` 176 } `xml:"link_widths"` 177 } `xml:"pci_gpu_link_info"` 178 } `xml:"pci"` 179 Utilization struct { 180 GpuUtil string `xml:"gpu_util"` 181 MemoryUtil string `xml:"memory_util"` 182 EncoderUtil string `xml:"encoder_util"` 183 DecoderUtil string `xml:"decoder_util"` 184 } `xml:"utilization"` 185 FBMemoryUsage struct { 186 Total string `xml:"total"` 187 Reserved string `xml:"reserved"` 188 Used string `xml:"used"` 189 Free string `xml:"free"` 190 } `xml:"fb_memory_usage"` 191 Bar1MemoryUsage struct { 192 Total string `xml:"total"` 193 Used string `xml:"used"` 194 Free string `xml:"free"` 195 } `xml:"bar1_memory_usage"` 196 Temperature struct { 197 GpuTemp string `xml:"gpu_temp"` 198 GpuTempMaxThreshold string `xml:"gpu_temp_max_threshold"` 199 GpuTempSlowThreshold string `xml:"gpu_temp_slow_threshold"` 200 GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"` 201 GpuTargetTemperature string `xml:"gpu_target_temperature"` 202 MemoryTemp string `xml:"memory_temp"` 203 GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"` 204 } `xml:"temperature"` 205 Clocks struct { 206 GraphicsClock string `xml:"graphics_clock"` 207 SmClock string `xml:"sm_clock"` 208 MemClock string `xml:"mem_clock"` 209 VideoClock string `xml:"video_clock"` 210 } `xml:"clocks"` 211 PowerReadings *xmlPowerReadings `xml:"power_readings"` 212 GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"` 213 Voltage struct { 214 GraphicsVolt string `xml:"graphics_volt"` 215 } `xml:"voltage"` 216 Processes struct { 217 ProcessInfo []struct { 218 PID string `xml:"pid"` 219 ProcessName string `xml:"process_name"` 220 UsedMemory string `xml:"used_memory"` 221 } `sml:"process_info"` 222 } `xml:"processes"` 223 } 224 225 xmlPowerReadings struct { 226 //PowerState string `xml:"power_state"` 227 //PowerManagement string `xml:"power_management"` 228 PowerDraw string `xml:"power_draw"` 229 //PowerLimit string `xml:"power_limit"` 230 //DefaultPowerLimit string `xml:"default_power_limit"` 231 //EnforcedPowerLimit string `xml:"enforced_power_limit"` 232 //MinPowerLimit string `xml:"min_power_limit"` 233 //MaxPowerLimit string `xml:"max_power_limit"` 234 } 235 236 xmlMIGDeviceInfo struct { 237 Index string `xml:"index"` 238 GPUInstanceID string `xml:"gpu_instance_id"` 239 ComputeInstanceID string `xml:"compute_instance_id"` 240 DeviceAttributes struct { 241 Shared struct { 242 MultiprocessorCount string `xml:"multiprocessor_count"` 243 CopyEngineCount string `xml:"copy_engine_count"` 244 EncoderCount string `xml:"encoder_count"` 245 DecoderCount string `xml:"decoder_count"` 246 OFACount string `xml:"ofa_count"` 247 JPGCount string `xml:"jpg_count"` 248 } `xml:"shared"` 249 } `xml:"device_attributes"` 250 ECCErrorCount struct { 251 VolatileCount struct { 252 SRAMUncorrectable string `xml:"sram_uncorrectable"` 253 } `xml:"volatile_count"` 254 } `xml:"ecc_error_count"` 255 FBMemoryUsage struct { 256 Free string `xml:"free"` 257 Used string `xml:"used"` 258 Reserved string `xml:"reserved"` 259 } `xml:"fb_memory_usage"` 260 BAR1MemoryUsage struct { 261 Free string `xml:"free"` 262 Used string `xml:"used"` 263 } `xml:"bar1_memory_usage"` 264 } 265 )