github.com/netdata/go.d.plugin@v0.58.1/modules/nvidia_smi/collect_xml.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package nvidia_smi
     4  
     5  import (
     6  	"encoding/xml"
     7  	"fmt"
     8  	"strconv"
     9  	"strings"
    10  )
    11  
    12  func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
    13  	bs, err := nv.exec.queryGPUInfoXML()
    14  	if err != nil {
    15  		return fmt.Errorf("error on quering XML GPU info: %v", err)
    16  	}
    17  
    18  	info := &xmlInfo{}
    19  	if err := xml.Unmarshal(bs, info); err != nil {
    20  		return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err)
    21  	}
    22  
    23  	seenGPU := make(map[string]bool)
    24  	seenMIG := make(map[string]bool)
    25  
    26  	for _, gpu := range info.GPUs {
    27  		if !isValidValue(gpu.UUID) {
    28  			continue
    29  		}
    30  
    31  		px := "gpu_" + gpu.UUID + "_"
    32  
    33  		seenGPU[px] = true
    34  
    35  		if !nv.gpus[px] {
    36  			nv.gpus[px] = true
    37  			nv.addGPUXMLCharts(gpu)
    38  		}
    39  
    40  		addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
    41  		addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
    42  		if max := calcMaxPCIEBandwidth(gpu); max > 0 {
    43  			rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
    44  			tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
    45  			mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100)
    46  			mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100)
    47  		}
    48  		addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
    49  		addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
    50  		addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0)
    51  		addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0)
    52  		addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0)
    53  		addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024)         // MiB => bytes
    54  		addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024)         // MiB => bytes
    55  		addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
    56  		addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024)               // MiB => bytes
    57  		addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024)               // MiB => bytes
    58  		addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0)
    59  		addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0)
    60  		addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0)
    61  		addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0)
    62  		addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0)
    63  		if gpu.PowerReadings != nil {
    64  			addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0)
    65  		} else if gpu.GPUPowerReadings != nil {
    66  			addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0)
    67  		}
    68  		addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0)
    69  		for i := 0; i < 16; i++ {
    70  			s := "P" + strconv.Itoa(i)
    71  			mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s)
    72  		}
    73  		if isValidValue(gpu.MIGMode.CurrentMIG) {
    74  			mode := strings.ToLower(gpu.MIGMode.CurrentMIG)
    75  			mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled")
    76  			mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled")
    77  			mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice))
    78  		}
    79  
    80  		for _, mig := range gpu.MIGDevices.MIGDevice {
    81  			if !isValidValue(mig.GPUInstanceID) {
    82  				continue
    83  			}
    84  
    85  			px := "mig_instance_" + mig.GPUInstanceID + "_" + px
    86  
    87  			seenMIG[px] = true
    88  
    89  			if !nv.migs[px] {
    90  				nv.migs[px] = true
    91  				nv.addMIGDeviceXMLCharts(gpu, mig)
    92  			}
    93  
    94  			addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0)
    95  			addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024)         // MiB => bytes
    96  			addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024)         // MiB => bytes
    97  			addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
    98  			addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024)               // MiB => bytes
    99  			addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024)               // MiB => bytes
   100  		}
   101  	}
   102  
   103  	for px := range nv.gpus {
   104  		if !seenGPU[px] {
   105  			delete(nv.gpus, px)
   106  			nv.removeCharts(px)
   107  		}
   108  	}
   109  
   110  	for px := range nv.migs {
   111  		if !seenMIG[px] {
   112  			delete(nv.migs, px)
   113  			nv.removeCharts(px)
   114  		}
   115  	}
   116  
   117  	return nil
   118  }
   119  
   120  func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 {
   121  	gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen
   122  	width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x")
   123  
   124  	if !isValidValue(gen) || !isValidValue(width) {
   125  		return 0
   126  	}
   127  
   128  	// https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
   129  	var speed, enc float64
   130  	switch gen {
   131  	case "1":
   132  		speed, enc = 2.5, 1/5
   133  	case "2":
   134  		speed, enc = 5, 1/5
   135  	case "3":
   136  		speed, enc = 8, 2/130
   137  	case "4":
   138  		speed, enc = 16, 2/130
   139  	case "5":
   140  		speed, enc = 32, 2/130
   141  	default:
   142  		return 0
   143  	}
   144  
   145  	// Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s
   146  	return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes
   147  }
   148  
   149  type (
   150  	xmlInfo struct {
   151  		GPUs []xmlGPUInfo `xml:"gpu"`
   152  	}
   153  	xmlGPUInfo struct {
   154  		ID                  string `xml:"id,attr"`
   155  		ProductName         string `xml:"product_name"`
   156  		ProductBrand        string `xml:"product_brand"`
   157  		ProductArchitecture string `xml:"product_architecture"`
   158  		UUID                string `xml:"uuid"`
   159  		FanSpeed            string `xml:"fan_speed"`
   160  		PerformanceState    string `xml:"performance_state"`
   161  		MIGMode             struct {
   162  			CurrentMIG string `xml:"current_mig"`
   163  		} `xml:"mig_mode"`
   164  		MIGDevices struct {
   165  			MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"`
   166  		} `xml:"mig_devices"`
   167  		PCI struct {
   168  			TxUtil         string `xml:"tx_util"`
   169  			RxUtil         string `xml:"rx_util"`
   170  			PCIGPULinkInfo struct {
   171  				PCIEGen struct {
   172  					MaxLinkGen string `xml:"max_link_gen"`
   173  				} `xml:"pcie_gen"`
   174  				LinkWidths struct {
   175  					MaxLinkWidth string `xml:"max_link_width"`
   176  				} `xml:"link_widths"`
   177  			} `xml:"pci_gpu_link_info"`
   178  		} `xml:"pci"`
   179  		Utilization struct {
   180  			GpuUtil     string `xml:"gpu_util"`
   181  			MemoryUtil  string `xml:"memory_util"`
   182  			EncoderUtil string `xml:"encoder_util"`
   183  			DecoderUtil string `xml:"decoder_util"`
   184  		} `xml:"utilization"`
   185  		FBMemoryUsage struct {
   186  			Total    string `xml:"total"`
   187  			Reserved string `xml:"reserved"`
   188  			Used     string `xml:"used"`
   189  			Free     string `xml:"free"`
   190  		} `xml:"fb_memory_usage"`
   191  		Bar1MemoryUsage struct {
   192  			Total string `xml:"total"`
   193  			Used  string `xml:"used"`
   194  			Free  string `xml:"free"`
   195  		} `xml:"bar1_memory_usage"`
   196  		Temperature struct {
   197  			GpuTemp                string `xml:"gpu_temp"`
   198  			GpuTempMaxThreshold    string `xml:"gpu_temp_max_threshold"`
   199  			GpuTempSlowThreshold   string `xml:"gpu_temp_slow_threshold"`
   200  			GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"`
   201  			GpuTargetTemperature   string `xml:"gpu_target_temperature"`
   202  			MemoryTemp             string `xml:"memory_temp"`
   203  			GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"`
   204  		} `xml:"temperature"`
   205  		Clocks struct {
   206  			GraphicsClock string `xml:"graphics_clock"`
   207  			SmClock       string `xml:"sm_clock"`
   208  			MemClock      string `xml:"mem_clock"`
   209  			VideoClock    string `xml:"video_clock"`
   210  		} `xml:"clocks"`
   211  		PowerReadings    *xmlPowerReadings `xml:"power_readings"`
   212  		GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"`
   213  		Voltage          struct {
   214  			GraphicsVolt string `xml:"graphics_volt"`
   215  		} `xml:"voltage"`
   216  		Processes struct {
   217  			ProcessInfo []struct {
   218  				PID         string `xml:"pid"`
   219  				ProcessName string `xml:"process_name"`
   220  				UsedMemory  string `xml:"used_memory"`
   221  			} `sml:"process_info"`
   222  		} `xml:"processes"`
   223  	}
   224  
   225  	xmlPowerReadings struct {
   226  		//PowerState         string `xml:"power_state"`
   227  		//PowerManagement    string `xml:"power_management"`
   228  		PowerDraw string `xml:"power_draw"`
   229  		//PowerLimit         string `xml:"power_limit"`
   230  		//DefaultPowerLimit  string `xml:"default_power_limit"`
   231  		//EnforcedPowerLimit string `xml:"enforced_power_limit"`
   232  		//MinPowerLimit      string `xml:"min_power_limit"`
   233  		//MaxPowerLimit      string `xml:"max_power_limit"`
   234  	}
   235  
   236  	xmlMIGDeviceInfo struct {
   237  		Index             string `xml:"index"`
   238  		GPUInstanceID     string `xml:"gpu_instance_id"`
   239  		ComputeInstanceID string `xml:"compute_instance_id"`
   240  		DeviceAttributes  struct {
   241  			Shared struct {
   242  				MultiprocessorCount string `xml:"multiprocessor_count"`
   243  				CopyEngineCount     string `xml:"copy_engine_count"`
   244  				EncoderCount        string `xml:"encoder_count"`
   245  				DecoderCount        string `xml:"decoder_count"`
   246  				OFACount            string `xml:"ofa_count"`
   247  				JPGCount            string `xml:"jpg_count"`
   248  			} `xml:"shared"`
   249  		} `xml:"device_attributes"`
   250  		ECCErrorCount struct {
   251  			VolatileCount struct {
   252  				SRAMUncorrectable string `xml:"sram_uncorrectable"`
   253  			} `xml:"volatile_count"`
   254  		} `xml:"ecc_error_count"`
   255  		FBMemoryUsage struct {
   256  			Free     string `xml:"free"`
   257  			Used     string `xml:"used"`
   258  			Reserved string `xml:"reserved"`
   259  		} `xml:"fb_memory_usage"`
   260  		BAR1MemoryUsage struct {
   261  			Free string `xml:"free"`
   262  			Used string `xml:"used"`
   263  		} `xml:"bar1_memory_usage"`
   264  	}
   265  )