github.com/netdata/go.d.plugin@v0.58.1/modules/nvidia_smi/collect_csv.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package nvidia_smi
     4  
     5  import (
     6  	"bufio"
     7  	"bytes"
     8  	"encoding/csv"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"regexp"
    13  	"strconv"
    14  	"strings"
    15  )
    16  
    17  // use of property aliases is not implemented ('"<property>" or "<alias>"' in help-query-gpu)
    18  var knownProperties = map[string]bool{
    19  	"uuid":                    true,
    20  	"name":                    true,
    21  	"fan.speed":               true,
    22  	"pstate":                  true,
    23  	"utilization.gpu":         true,
    24  	"utilization.memory":      true,
    25  	"memory.used":             true,
    26  	"memory.free":             true,
    27  	"memory.reserved":         true,
    28  	"temperature.gpu":         true,
    29  	"clocks.current.graphics": true,
    30  	"clocks.current.video":    true,
    31  	"clocks.current.sm":       true,
    32  	"clocks.current.memory":   true,
    33  	"power.draw":              true,
    34  }
    35  
    36  var reHelpProperty = regexp.MustCompile(`"([a-zA-Z_.]+)"`)
    37  
    38  func (nv *NvidiaSMI) collectGPUInfoCSV(mx map[string]int64) error {
    39  	if len(nv.gpuQueryProperties) == 0 {
    40  		bs, err := nv.exec.queryHelpQueryGPU()
    41  		if err != nil {
    42  			return err
    43  		}
    44  
    45  		sc := bufio.NewScanner(bytes.NewBuffer(bs))
    46  
    47  		for sc.Scan() {
    48  			if !strings.HasPrefix(sc.Text(), "\"") {
    49  				continue
    50  			}
    51  			matches := reHelpProperty.FindAllString(sc.Text(), -1)
    52  			if len(matches) == 0 {
    53  				continue
    54  			}
    55  			for _, v := range matches {
    56  				if v = strings.Trim(v, "\""); knownProperties[v] {
    57  					nv.gpuQueryProperties = append(nv.gpuQueryProperties, v)
    58  				}
    59  			}
    60  		}
    61  		nv.Debugf("found query GPU properties: %v", nv.gpuQueryProperties)
    62  	}
    63  
    64  	bs, err := nv.exec.queryGPUInfoCSV(nv.gpuQueryProperties)
    65  	if err != nil {
    66  		return err
    67  	}
    68  
    69  	nv.Debugf("GPU info:\n%s", bs)
    70  
    71  	r := csv.NewReader(bytes.NewBuffer(bs))
    72  	r.Comma = ','
    73  	r.ReuseRecord = true
    74  	r.TrimLeadingSpace = true
    75  
    76  	// skip headers
    77  	if _, err := r.Read(); err != nil && err != io.EOF {
    78  		return err
    79  	}
    80  
    81  	var gpusInfo []csvGPUInfo
    82  	for {
    83  		record, err := r.Read()
    84  		if err != nil {
    85  			if errors.Is(err, io.EOF) {
    86  				break
    87  			}
    88  			return err
    89  		}
    90  
    91  		if len(record) != len(nv.gpuQueryProperties) {
    92  			return fmt.Errorf("record values (%d) != queried properties (%d)", len(record), len(nv.gpuQueryProperties))
    93  		}
    94  
    95  		var gpu csvGPUInfo
    96  		for i, v := range record {
    97  			switch nv.gpuQueryProperties[i] {
    98  			case "uuid":
    99  				gpu.uuid = v
   100  			case "name":
   101  				gpu.name = v
   102  			case "fan.speed":
   103  				gpu.fanSpeed = v
   104  			case "pstate":
   105  				gpu.pstate = v
   106  			case "utilization.gpu":
   107  				gpu.utilizationGPU = v
   108  			case "utilization.memory":
   109  				gpu.utilizationMemory = v
   110  			case "memory.used":
   111  				gpu.memoryUsed = v
   112  			case "memory.free":
   113  				gpu.memoryFree = v
   114  			case "memory.reserved":
   115  				gpu.memoryReserved = v
   116  			case "temperature.gpu":
   117  				gpu.temperatureGPU = v
   118  			case "clocks.current.graphics":
   119  				gpu.clocksCurrentGraphics = v
   120  			case "clocks.current.video":
   121  				gpu.clocksCurrentVideo = v
   122  			case "clocks.current.sm":
   123  				gpu.clocksCurrentSM = v
   124  			case "clocks.current.memory":
   125  				gpu.clocksCurrentMemory = v
   126  			case "power.draw":
   127  				gpu.powerDraw = v
   128  			}
   129  		}
   130  		gpusInfo = append(gpusInfo, gpu)
   131  	}
   132  
   133  	seen := make(map[string]bool)
   134  
   135  	for _, gpu := range gpusInfo {
   136  		if !isValidValue(gpu.uuid) || !isValidValue(gpu.name) {
   137  			continue
   138  		}
   139  
   140  		px := "gpu_" + gpu.uuid + "_"
   141  
   142  		seen[px] = true
   143  
   144  		if !nv.gpus[px] {
   145  			nv.gpus[px] = true
   146  			nv.addGPUCSVCharts(gpu)
   147  		}
   148  
   149  		addMetric(mx, px+"fan_speed_perc", gpu.fanSpeed, 0)
   150  		addMetric(mx, px+"gpu_utilization", gpu.utilizationGPU, 0)
   151  		addMetric(mx, px+"mem_utilization", gpu.utilizationMemory, 0)
   152  		addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.memoryFree, 1024*1024)         // MiB => bytes
   153  		addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.memoryUsed, 1024*1024)         // MiB => bytes
   154  		addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.memoryReserved, 1024*1024) // MiB => bytes
   155  		addMetric(mx, px+"temperature", gpu.temperatureGPU, 0)
   156  		addMetric(mx, px+"graphics_clock", gpu.clocksCurrentGraphics, 0)
   157  		addMetric(mx, px+"video_clock", gpu.clocksCurrentVideo, 0)
   158  		addMetric(mx, px+"sm_clock", gpu.clocksCurrentSM, 0)
   159  		addMetric(mx, px+"mem_clock", gpu.clocksCurrentMemory, 0)
   160  		addMetric(mx, px+"power_draw", gpu.powerDraw, 0)
   161  		for i := 0; i < 16; i++ {
   162  			if s := "P" + strconv.Itoa(i); gpu.pstate == s {
   163  				mx[px+"performance_state_"+s] = 1
   164  			} else {
   165  				mx[px+"performance_state_"+s] = 0
   166  			}
   167  		}
   168  	}
   169  
   170  	for px := range nv.gpus {
   171  		if !seen[px] {
   172  			delete(nv.gpus, px)
   173  			nv.removeCharts(px)
   174  		}
   175  	}
   176  
   177  	return nil
   178  }
   179  
   180  type (
   181  	csvGPUInfo struct {
   182  		uuid                  string
   183  		name                  string
   184  		fanSpeed              string
   185  		pstate                string
   186  		utilizationGPU        string
   187  		utilizationMemory     string
   188  		memoryUsed            string
   189  		memoryFree            string
   190  		memoryReserved        string
   191  		temperatureGPU        string
   192  		clocksCurrentGraphics string
   193  		clocksCurrentVideo    string
   194  		clocksCurrentSM       string
   195  		clocksCurrentMemory   string
   196  		powerDraw             string
   197  	}
   198  )