github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/stats.go (about) 1 package nvidia 2 3 import ( 4 "context" 5 "time" 6 7 "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" 8 "github.com/hashicorp/nomad/helper" 9 "github.com/hashicorp/nomad/plugins/device" 10 "github.com/hashicorp/nomad/plugins/shared/structs" 11 ) 12 13 const ( 14 // Attribute names for reporting stats output 15 PowerUsageAttr = "Power usage" 16 PowerUsageUnit = "W" 17 PowerUsageDesc = "Power usage for this GPU in watts and " + 18 "its associated circuitry (e.g. memory) / Maximum GPU Power" 19 GPUUtilizationAttr = "GPU utilization" 20 GPUUtilizationUnit = "%" 21 GPUUtilizationDesc = "Percent of time over the past sample period " + 22 "during which one or more kernels were executing on the GPU." 23 MemoryUtilizationAttr = "Memory utilization" 24 MemoryUtilizationUnit = "%" 25 MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period" 26 EncoderUtilizationAttr = "Encoder utilization" 27 EncoderUtilizationUnit = "%" 28 EncoderUtilizationDesc = "Percent of time over the past sample period " + 29 "during which GPU Encoder was used" 30 DecoderUtilizationAttr = "Decoder utilization" 31 DecoderUtilizationUnit = "%" 32 DecoderUtilizationDesc = "Percent of time over the past sample period " + 33 "during which GPU Decoder was used" 34 TemperatureAttr = "Temperature" 35 TemperatureUnit = "C" // Celsius degrees 36 TemperatureDesc = "Temperature of the Unit" 37 MemoryStateAttr = "Memory state" 38 MemoryStateUnit = "MiB" // Mebibytes 39 MemoryStateDesc = "UsedMemory / TotalMemory" 40 BAR1StateAttr = "BAR1 buffer state" 41 BAR1StateUnit = "MiB" // Mebibytes 42 BAR1StateDesc = "UsedBAR1 / TotalBAR1" 43 ECCErrorsL1CacheAttr = "ECC L1 errors" 44 ECCErrorsL1CacheUnit = "#" // number of errors 45 ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device" 46 ECCErrorsL2CacheAttr = "ECC L2 errors" 47 ECCErrorsL2CacheUnit = "#" // number of errors 48 ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device" 49 ECCErrorsDeviceAttr = "ECC memory errors" 50 ECCErrorsDeviceUnit = "#" // number of errors 51 ECCErrorsDeviceDesc = "Requested memory error counter for the device" 52 ) 53 54 // stats is the long running goroutine that streams device statistics 55 func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) { 56 defer close(stats) 57 58 if d.initErr != nil { 59 if d.initErr.Error() != nvml.UnavailableLib.Error() { 60 d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr) 61 stats <- device.NewStatsError(d.initErr) 62 } 63 64 return 65 } 66 67 // Create a timer that will fire immediately for the first detection 68 ticker := time.NewTimer(0) 69 70 for { 71 select { 72 case <-ctx.Done(): 73 return 74 case <-ticker.C: 75 ticker.Reset(interval) 76 } 77 78 d.writeStatsToChannel(stats, time.Now()) 79 } 80 } 81 82 // filterStatsByID accepts list of StatsData and set of IDs 83 // this function would return entries from StatsData with IDs found in the set 84 func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData { 85 var filteredStats []*nvml.StatsData 86 for _, statsItem := range stats { 87 if _, ok := IDs[statsItem.UUID]; ok { 88 filteredStats = append(filteredStats, statsItem) 89 } 90 } 91 return filteredStats 92 } 93 94 // writeStatsToChannel collects StatsData from NVML backend, groups StatsData 95 // by DeviceName attribute, populates DeviceGroupStats structure for every group 96 // and sends data over provided channel 97 func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) { 98 statsData, err := d.nvmlClient.GetStatsData() 99 if err != nil { 100 d.logger.Error("failed to get nvidia stats", "error", err) 101 stats <- &device.StatsResponse{ 102 Error: err, 103 } 104 return 105 } 106 107 // filter only stats from devices that are stored in NvidiaDevice struct 108 d.deviceLock.RLock() 109 statsData = filterStatsByID(statsData, d.devices) 110 d.deviceLock.RUnlock() 111 112 // group stats by DeviceName struct field 113 statsListByDeviceName := make(map[string][]*nvml.StatsData) 114 for _, statsItem := range statsData { 115 deviceName := statsItem.DeviceName 116 if deviceName == nil { 117 // nvml driver was not able to detect device name. This kind 118 // of devices are placed to single group with 'notAvailable' name 119 notAvailableCopy := notAvailable 120 deviceName = ¬AvailableCopy 121 } 122 123 statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem) 124 } 125 126 // place data device.DeviceGroupStats struct for every group of stats 127 deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName)) 128 for groupName, groupStats := range statsListByDeviceName { 129 deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp)) 130 } 131 132 stats <- &device.StatsResponse{ 133 Groups: deviceGroupsStats, 134 } 135 } 136 137 func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue { 138 return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)} 139 } 140 141 // statsForGroup is a helper function that populates device.DeviceGroupStats 142 // for given groupName with groupStats list 143 func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats { 144 instanceStats := make(map[string]*device.DeviceStats) 145 for _, statsItem := range groupStats { 146 instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp) 147 } 148 149 return &device.DeviceGroupStats{ 150 Vendor: vendor, 151 Type: deviceType, 152 Name: groupName, 153 InstanceStats: instanceStats, 154 } 155 } 156 157 // statsForItem is a helper function that populates device.DeviceStats for given 158 // nvml.StatsData 159 func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats { 160 // nvml.StatsData holds pointers to values that can be nil 161 // In case they are nil return stats with 'notAvailable' constant 162 var ( 163 powerUsageStat *structs.StatValue 164 GPUUtilizationStat *structs.StatValue 165 memoryUtilizationStat *structs.StatValue 166 encoderUtilizationStat *structs.StatValue 167 decoderUtilizationStat *structs.StatValue 168 temperatureStat *structs.StatValue 169 memoryStateStat *structs.StatValue 170 BAR1StateStat *structs.StatValue 171 ECCErrorsL1CacheStat *structs.StatValue 172 ECCErrorsL2CacheStat *structs.StatValue 173 ECCErrorsDeviceStat *structs.StatValue 174 ) 175 176 if statsItem.PowerUsageW == nil || statsItem.PowerW == nil { 177 powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc) 178 } else { 179 powerUsageStat = &structs.StatValue{ 180 Unit: PowerUsageUnit, 181 Desc: PowerUsageDesc, 182 IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)), 183 IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW), 184 } 185 } 186 187 if statsItem.GPUUtilization == nil { 188 GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc) 189 } else { 190 GPUUtilizationStat = &structs.StatValue{ 191 Unit: GPUUtilizationUnit, 192 Desc: GPUUtilizationDesc, 193 IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization), 194 } 195 } 196 197 if statsItem.MemoryUtilization == nil { 198 memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc) 199 } else { 200 memoryUtilizationStat = &structs.StatValue{ 201 Unit: MemoryUtilizationUnit, 202 Desc: MemoryUtilizationDesc, 203 IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization), 204 } 205 } 206 207 if statsItem.EncoderUtilization == nil { 208 encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc) 209 } else { 210 encoderUtilizationStat = &structs.StatValue{ 211 Unit: EncoderUtilizationUnit, 212 Desc: EncoderUtilizationDesc, 213 IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization), 214 } 215 } 216 217 if statsItem.DecoderUtilization == nil { 218 decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc) 219 } else { 220 decoderUtilizationStat = &structs.StatValue{ 221 Unit: DecoderUtilizationUnit, 222 Desc: DecoderUtilizationDesc, 223 IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization), 224 } 225 } 226 227 if statsItem.TemperatureC == nil { 228 temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc) 229 } else { 230 temperatureStat = &structs.StatValue{ 231 Unit: TemperatureUnit, 232 Desc: TemperatureDesc, 233 IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC), 234 } 235 } 236 237 if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil { 238 memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc) 239 } else { 240 memoryStateStat = &structs.StatValue{ 241 Unit: MemoryStateUnit, 242 Desc: MemoryStateDesc, 243 IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB), 244 IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB), 245 } 246 } 247 248 if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil { 249 BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc) 250 } else { 251 BAR1StateStat = &structs.StatValue{ 252 Unit: BAR1StateUnit, 253 Desc: BAR1StateDesc, 254 IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB), 255 IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB), 256 } 257 } 258 259 if statsItem.ECCErrorsL1Cache == nil { 260 ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc) 261 } else { 262 ECCErrorsL1CacheStat = &structs.StatValue{ 263 Unit: ECCErrorsL1CacheUnit, 264 Desc: ECCErrorsL1CacheDesc, 265 IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache), 266 } 267 } 268 269 if statsItem.ECCErrorsL2Cache == nil { 270 ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc) 271 } else { 272 ECCErrorsL2CacheStat = &structs.StatValue{ 273 Unit: ECCErrorsL2CacheUnit, 274 Desc: ECCErrorsL2CacheDesc, 275 IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache), 276 } 277 } 278 279 if statsItem.ECCErrorsDevice == nil { 280 ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc) 281 } else { 282 ECCErrorsDeviceStat = &structs.StatValue{ 283 Unit: ECCErrorsDeviceUnit, 284 Desc: ECCErrorsDeviceDesc, 285 IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice), 286 } 287 } 288 return &device.DeviceStats{ 289 Summary: memoryStateStat, 290 Stats: &structs.StatObject{ 291 Attributes: map[string]*structs.StatValue{ 292 PowerUsageAttr: powerUsageStat, 293 GPUUtilizationAttr: GPUUtilizationStat, 294 MemoryUtilizationAttr: memoryUtilizationStat, 295 EncoderUtilizationAttr: encoderUtilizationStat, 296 DecoderUtilizationAttr: decoderUtilizationStat, 297 TemperatureAttr: temperatureStat, 298 MemoryStateAttr: memoryStateStat, 299 BAR1StateAttr: BAR1StateStat, 300 ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat, 301 ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat, 302 ECCErrorsDeviceAttr: ECCErrorsDeviceStat, 303 }, 304 }, 305 Timestamp: timestamp, 306 } 307 } 308 309 func uintToInt64Ptr(u *uint) *int64 { 310 if u == nil { 311 return nil 312 } 313 314 v := int64(*u) 315 return &v 316 } 317 318 func uint64ToInt64Ptr(u *uint64) *int64 { 319 if u == nil { 320 return nil 321 } 322 323 v := int64(*u) 324 return &v 325 }