github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/nvml/client.go (about)

     1  package nvml
     2  
     3  import (
     4  	"fmt"
     5  )
     6  
// DeviceData represents the fields common to every Nvidia device record in
// this package. It is embedded (by pointer) in both FingerprintDeviceData and
// StatsData, and both GetFingerprintData and GetStatsData fill it from the
// per-device info returned by the driver (UUID, Name, MemoryMiB, PowerW and
// BAR1MiB respectively).
//
// Every field except UUID is a pointer.
// NOTE(review): presumably a nil pointer means nvml did not report that value
// for the device, and the MiB / W suffixes are the units — confirm against
// the NvmlDriver implementation.
type DeviceData struct {
	UUID       string
	DeviceName *string
	MemoryMiB  *uint64
	PowerW     *uint
	BAR1MiB    *uint64
}
    15  
// FingerprintDeviceData is a superset of DeviceData, which it embeds by
// pointer. It describes the device-specific fields returned from nvml
// queries during a fingerprinting call: GetFingerprintData copies each of
// the fields below from the identically named field of the driver's
// per-device info.
//
// NOTE(review): the pointer-typed fields are presumably nil when nvml did
// not report the value; units are suggested by the field-name suffixes
// (MBPerS, MHz) — confirm against the NvmlDriver implementation.
type FingerprintDeviceData struct {
	*DeviceData
	PCIBandwidthMBPerS *uint
	CoresClockMHz      *uint
	MemoryClockMHz     *uint
	DisplayState       string
	PersistenceMode    string
	PCIBusID           string
}
    28  
// FingerprintData represents the attributes of the driver and its devices.
// It is the result type of GetFingerprintData: Devices holds one entry per
// device index reported by the driver, and DriverVersion holds the value
// returned by the driver's SystemDriverVersion call.
type FingerprintData struct {
	Devices       []*FingerprintDeviceData
	DriverVersion string
}
    34  
// StatsData is a superset of DeviceData, which it embeds by pointer. It
// represents the statistics data returned for every Nvidia device:
// GetStatsData fills the embedded DeviceData from the driver's device info
// and copies each of the fields below from the identically named field of
// the driver's device status.
//
// NOTE(review): all fields are pointers, presumably nil when nvml did not
// report the value; units are suggested by the field-name suffixes (W, C,
// MiB) — confirm against the NvmlDriver implementation.
type StatsData struct {
	*DeviceData
	PowerUsageW        *uint
	GPUUtilization     *uint
	MemoryUtilization  *uint
	EncoderUtilization *uint
	DecoderUtilization *uint
	TemperatureC       *uint
	UsedMemoryMiB      *uint64
	BAR1UsedMiB        *uint64
	ECCErrorsL1Cache   *uint64
	ECCErrorsL2Cache   *uint64
	ECCErrorsDevice    *uint64
}
    51  
// NvmlClient describes how users would use the nvml library. In this file it
// is implemented by nvmlClient.
type NvmlClient interface {
	// GetFingerprintData returns the driver version together with the
	// fingerprint data of every available Nvidia device.
	GetFingerprintData() (*FingerprintData, error)
	// GetStatsData returns the statistics data of every device, one entry
	// per device.
	GetStatsData() ([]*StatsData, error)
}
    57  
// nvmlClient implements NvmlClient by delegating every query to an
// NvmlDriver.
// Users of this lib are expected to obtain this struct via the NewNvmlClient
// func, which also initializes the driver before handing the client out.
type nvmlClient struct {
	// driver is the NvmlDriver all nvml queries are issued through.
	driver NvmlDriver
}
    63  
    64  // NewNvmlClient function creates new nvmlClient with real
    65  // NvmlDriver implementation. Also, this func initializes NvmlDriver
    66  func NewNvmlClient() (*nvmlClient, error) {
    67  	driver := &nvmlDriver{}
    68  	err := driver.Initialize()
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  	return &nvmlClient{
    73  		driver: driver,
    74  	}, nil
    75  }
    76  
    77  // GetFingerprintData returns FingerprintData for available Nvidia devices
    78  func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
    79  	/*
    80  		nvml fields to be fingerprinted # nvml_library_call
    81  		1  - Driver Version             # nvmlSystemGetDriverVersion
    82  		2  - Product Name               # nvmlDeviceGetName
    83  		3  - GPU UUID                   # nvmlDeviceGetUUID
    84  		4  - Total Memory               # nvmlDeviceGetMemoryInfo
    85  		5  - Power                      # nvmlDeviceGetPowerManagementLimit
    86  		6  - PCIBusID                   # nvmlDeviceGetPciInfo
    87  		7  - BAR1 Memory                # nvmlDeviceGetBAR1MemoryInfo(
    88  		8  - PCI Bandwidth
    89  		9  - Memory, Cores Clock        # nvmlDeviceGetMaxClockInfo
    90  		10 - Display Mode               # nvmlDeviceGetDisplayMode
    91  		11 - Persistence Mode           # nvmlDeviceGetPersistenceMode
    92  	*/
    93  
    94  	// Assumed that this method is called with receiver retrieved from
    95  	// NewNvmlClient
    96  	// because this method handles initialization of NVML library
    97  
    98  	driverVersion, err := c.driver.SystemDriverVersion()
    99  	if err != nil {
   100  		return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err)
   101  	}
   102  
   103  	numDevices, err := c.driver.DeviceCount()
   104  	if err != nil {
   105  		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
   106  	}
   107  
   108  	allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
   109  
   110  	for i := 0; i < int(numDevices); i++ {
   111  		deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
   112  		if err != nil {
   113  			return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err)
   114  		}
   115  
   116  		allNvidiaGPUResources[i] = &FingerprintDeviceData{
   117  			DeviceData: &DeviceData{
   118  				DeviceName: deviceInfo.Name,
   119  				UUID:       deviceInfo.UUID,
   120  				MemoryMiB:  deviceInfo.MemoryMiB,
   121  				PowerW:     deviceInfo.PowerW,
   122  				BAR1MiB:    deviceInfo.BAR1MiB,
   123  			},
   124  			PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
   125  			CoresClockMHz:      deviceInfo.CoresClockMHz,
   126  			MemoryClockMHz:     deviceInfo.MemoryClockMHz,
   127  			DisplayState:       deviceInfo.DisplayState,
   128  			PersistenceMode:    deviceInfo.PersistenceMode,
   129  			PCIBusID:           deviceInfo.PCIBusID,
   130  		}
   131  	}
   132  	return &FingerprintData{
   133  		Devices:       allNvidiaGPUResources,
   134  		DriverVersion: driverVersion,
   135  	}, nil
   136  }
   137  
   138  // GetStatsData returns statistics data for all devices on this machine
   139  func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
   140  	/*
   141  	   nvml fields to be reported to stats api     # nvml_library_call
   142  	   1  - Used Memory                            # nvmlDeviceGetMemoryInfo
   143  	   2  - Utilization of GPU                     # nvmlDeviceGetUtilizationRates
   144  	   3  - Utilization of Memory                  # nvmlDeviceGetUtilizationRates
   145  	   4  - Utilization of Decoder                 # nvmlDeviceGetDecoderUtilization
   146  	   5  - Utilization of Encoder                 # nvmlDeviceGetEncoderUtilization
   147  	   6  - Current GPU Temperature                # nvmlDeviceGetTemperature
   148  	   7  - Power Draw                             # nvmlDeviceGetPowerUsage
   149  	   8  - BAR1 Used memory                       # nvmlDeviceGetBAR1MemoryInfo
   150  	   9  - ECC Errors on requesting L1Cache       # nvmlDeviceGetMemoryErrorCounter
   151  	   10 - ECC Errors on requesting L2Cache       # nvmlDeviceGetMemoryErrorCounter
   152  	   11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
   153  	*/
   154  
   155  	// Assumed that this method is called with receiver retrieved from
   156  	// NewNvmlClient
   157  	// because this method handles initialization of NVML library
   158  
   159  	numDevices, err := c.driver.DeviceCount()
   160  	if err != nil {
   161  		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
   162  	}
   163  
   164  	allNvidiaGPUStats := make([]*StatsData, numDevices)
   165  
   166  	for i := 0; i < int(numDevices); i++ {
   167  		deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
   168  		if err != nil {
   169  			return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err)
   170  		}
   171  
   172  		allNvidiaGPUStats[i] = &StatsData{
   173  			DeviceData: &DeviceData{
   174  				DeviceName: deviceInfo.Name,
   175  				UUID:       deviceInfo.UUID,
   176  				MemoryMiB:  deviceInfo.MemoryMiB,
   177  				PowerW:     deviceInfo.PowerW,
   178  				BAR1MiB:    deviceInfo.BAR1MiB,
   179  			},
   180  			PowerUsageW:        deviceStatus.PowerUsageW,
   181  			GPUUtilization:     deviceStatus.GPUUtilization,
   182  			MemoryUtilization:  deviceStatus.MemoryUtilization,
   183  			EncoderUtilization: deviceStatus.EncoderUtilization,
   184  			DecoderUtilization: deviceStatus.DecoderUtilization,
   185  			TemperatureC:       deviceStatus.TemperatureC,
   186  			UsedMemoryMiB:      deviceStatus.UsedMemoryMiB,
   187  			BAR1UsedMiB:        deviceStatus.BAR1UsedMiB,
   188  			ECCErrorsL1Cache:   deviceStatus.ECCErrorsL1Cache,
   189  			ECCErrorsL2Cache:   deviceStatus.ECCErrorsL2Cache,
   190  			ECCErrorsDevice:    deviceStatus.ECCErrorsDevice,
   191  		}
   192  	}
   193  	return allNvidiaGPUStats, nil
   194  }