github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/nvml/client.go (about) 1 package nvml 2 3 import ( 4 "fmt" 5 ) 6 7 // DeviceData represents common fields for Nvidia device 8 type DeviceData struct { 9 UUID string 10 DeviceName *string 11 MemoryMiB *uint64 12 PowerW *uint 13 BAR1MiB *uint64 14 } 15 16 // FingerprintDeviceData is a superset of DeviceData 17 // it describes device specific fields returned from 18 // nvml queries during fingerprinting call 19 type FingerprintDeviceData struct { 20 *DeviceData 21 PCIBandwidthMBPerS *uint 22 CoresClockMHz *uint 23 MemoryClockMHz *uint 24 DisplayState string 25 PersistenceMode string 26 PCIBusID string 27 } 28 29 // FingerprintData represets attributes of driver/devices 30 type FingerprintData struct { 31 Devices []*FingerprintDeviceData 32 DriverVersion string 33 } 34 35 // StatsData is a superset of DeviceData 36 // it represents statistics data returned for every Nvidia device 37 type StatsData struct { 38 *DeviceData 39 PowerUsageW *uint 40 GPUUtilization *uint 41 MemoryUtilization *uint 42 EncoderUtilization *uint 43 DecoderUtilization *uint 44 TemperatureC *uint 45 UsedMemoryMiB *uint64 46 BAR1UsedMiB *uint64 47 ECCErrorsL1Cache *uint64 48 ECCErrorsL2Cache *uint64 49 ECCErrorsDevice *uint64 50 } 51 52 // NvmlClient describes how users would use nvml library 53 type NvmlClient interface { 54 GetFingerprintData() (*FingerprintData, error) 55 GetStatsData() ([]*StatsData, error) 56 } 57 58 // nvmlClient implements NvmlClient 59 // Users of this lib are expected to use this struct via NewNvmlClient func 60 type nvmlClient struct { 61 driver NvmlDriver 62 } 63 64 // NewNvmlClient function creates new nvmlClient with real 65 // NvmlDriver implementation. Also, this func initializes NvmlDriver 66 func NewNvmlClient() (*nvmlClient, error) { 67 driver := &nvmlDriver{} 68 err := driver.Initialize() 69 if err != nil { 70 return nil, err 71 } 72 return &nvmlClient{ 73 driver: driver, 74 }, nil 75 } 76 77 // GetFingerprintData returns FingerprintData for available Nvidia devices 78 func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { 79 /* 80 nvml fields to be fingerprinted # nvml_library_call 81 1 - Driver Version # nvmlSystemGetDriverVersion 82 2 - Product Name # nvmlDeviceGetName 83 3 - GPU UUID # nvmlDeviceGetUUID 84 4 - Total Memory # nvmlDeviceGetMemoryInfo 85 5 - Power # nvmlDeviceGetPowerManagementLimit 86 6 - PCIBusID # nvmlDeviceGetPciInfo 87 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo( 88 8 - PCI Bandwidth 89 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo 90 10 - Display Mode # nvmlDeviceGetDisplayMode 91 11 - Persistence Mode # nvmlDeviceGetPersistenceMode 92 */ 93 94 // Assumed that this method is called with receiver retrieved from 95 // NewNvmlClient 96 // because this method handles initialization of NVML library 97 98 driverVersion, err := c.driver.SystemDriverVersion() 99 if err != nil { 100 return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) 101 } 102 103 numDevices, err := c.driver.DeviceCount() 104 if err != nil { 105 return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) 106 } 107 108 allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) 109 110 for i := 0; i < int(numDevices); i++ { 111 deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) 112 if err != nil { 113 return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) 114 } 115 116 allNvidiaGPUResources[i] = &FingerprintDeviceData{ 117 DeviceData: &DeviceData{ 118 DeviceName: deviceInfo.Name, 119 UUID: deviceInfo.UUID, 120 MemoryMiB: deviceInfo.MemoryMiB, 121 PowerW: deviceInfo.PowerW, 122 BAR1MiB: deviceInfo.BAR1MiB, 123 }, 124 PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS, 125 CoresClockMHz: deviceInfo.CoresClockMHz, 126 MemoryClockMHz: deviceInfo.MemoryClockMHz, 127 DisplayState: deviceInfo.DisplayState, 128 PersistenceMode: deviceInfo.PersistenceMode, 129 PCIBusID: deviceInfo.PCIBusID, 130 } 131 } 132 return &FingerprintData{ 133 Devices: allNvidiaGPUResources, 134 DriverVersion: driverVersion, 135 }, nil 136 } 137 138 // GetStatsData returns statistics data for all devices on this machine 139 func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { 140 /* 141 nvml fields to be reported to stats api # nvml_library_call 142 1 - Used Memory # nvmlDeviceGetMemoryInfo 143 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates 144 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates 145 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization 146 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization 147 6 - Current GPU Temperature # nvmlDeviceGetTemperature 148 7 - Power Draw # nvmlDeviceGetPowerUsage 149 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo 150 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter 151 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter 152 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter 153 */ 154 155 // Assumed that this method is called with receiver retrieved from 156 // NewNvmlClient 157 // because this method handles initialization of NVML library 158 159 numDevices, err := c.driver.DeviceCount() 160 if err != nil { 161 return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) 162 } 163 164 allNvidiaGPUStats := make([]*StatsData, numDevices) 165 166 for i := 0; i < int(numDevices); i++ { 167 deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) 168 if err != nil { 169 return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) 170 } 171 172 allNvidiaGPUStats[i] = &StatsData{ 173 DeviceData: &DeviceData{ 174 DeviceName: deviceInfo.Name, 175 UUID: deviceInfo.UUID, 176 MemoryMiB: deviceInfo.MemoryMiB, 177 PowerW: deviceInfo.PowerW, 178 BAR1MiB: deviceInfo.BAR1MiB, 179 }, 180 PowerUsageW: deviceStatus.PowerUsageW, 181 GPUUtilization: deviceStatus.GPUUtilization, 182 MemoryUtilization: deviceStatus.MemoryUtilization, 183 EncoderUtilization: deviceStatus.EncoderUtilization, 184 DecoderUtilization: deviceStatus.DecoderUtilization, 185 TemperatureC: deviceStatus.TemperatureC, 186 UsedMemoryMiB: deviceStatus.UsedMemoryMiB, 187 BAR1UsedMiB: deviceStatus.BAR1UsedMiB, 188 ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, 189 ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, 190 ECCErrorsDevice: deviceStatus.ECCErrorsDevice, 191 } 192 } 193 return allNvidiaGPUStats, nil 194 }