github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/fingerprint.go (about) 1 package nvidia 2 3 import ( 4 "context" 5 "time" 6 7 "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" 8 "github.com/hashicorp/nomad/helper" 9 "github.com/hashicorp/nomad/plugins/device" 10 "github.com/hashicorp/nomad/plugins/shared/structs" 11 ) 12 13 const ( 14 // Attribute names and units for reporting Fingerprint output 15 MemoryAttr = "memory" 16 PowerAttr = "power" 17 BAR1Attr = "bar1" 18 DriverVersionAttr = "driver_version" 19 CoresClockAttr = "cores_clock" 20 MemoryClockAttr = "memory_clock" 21 PCIBandwidthAttr = "pci_bandwidth" 22 DisplayStateAttr = "display_state" 23 PersistenceModeAttr = "persistence_mode" 24 ) 25 26 // fingerprint is the long running goroutine that detects hardware 27 func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) { 28 defer close(devices) 29 30 if d.initErr != nil { 31 if d.initErr.Error() != nvml.UnavailableLib.Error() { 32 d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr) 33 devices <- device.NewFingerprintError(d.initErr) 34 } 35 36 // Just close the channel to let server know that there are no working 37 // Nvidia GPU units 38 return 39 } 40 41 // Create a timer that will fire immediately for the first detection 42 ticker := time.NewTimer(0) 43 44 for { 45 select { 46 case <-ctx.Done(): 47 return 48 case <-ticker.C: 49 ticker.Reset(d.fingerprintPeriod) 50 } 51 d.writeFingerprintToChannel(devices) 52 } 53 } 54 55 // writeFingerprintToChannel makes nvml call and writes response to channel 56 func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) { 57 fingerprintData, err := d.nvmlClient.GetFingerprintData() 58 if err != nil { 59 d.logger.Error("failed to get fingerprint nvidia devices", "error", err) 60 devices <- device.NewFingerprintError(err) 61 return 62 } 63 64 // ignore devices from fingerprint output 65 fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs) 66 // check if any device health was updated or any device was added to host 67 if !d.fingerprintChanged(fingerprintDevices) { 68 return 69 } 70 71 commonAttributes := map[string]*structs.Attribute{ 72 DriverVersionAttr: { 73 String: helper.StringToPtr(fingerprintData.DriverVersion), 74 }, 75 } 76 77 // Group all FingerprintDevices by DeviceName attribute 78 deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData) 79 for _, device := range fingerprintDevices { 80 deviceName := device.DeviceName 81 if deviceName == nil { 82 // nvml driver was not able to detect device name. This kind 83 // of devices are placed to single group with 'notAvailable' name 84 notAvailableCopy := notAvailable 85 deviceName = ¬AvailableCopy 86 } 87 88 deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device) 89 } 90 91 // Build Fingerprint response with computed groups and send it over the channel 92 deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName)) 93 for groupName, devices := range deviceListByDeviceName { 94 deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes)) 95 } 96 devices <- device.NewFingerprint(deviceGroups...) 97 } 98 99 // ignoreFingerprintedDevices excludes ignored devices from fingerprint output 100 func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData { 101 var result []*nvml.FingerprintDeviceData 102 for _, fingerprintDevice := range deviceData { 103 if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored { 104 result = append(result, fingerprintDevice) 105 } 106 } 107 return result 108 } 109 110 // fingerprintChanged checks if there are any previously unseen nvidia devices located 111 // or any of fingerprinted nvidia devices disappeared since the last fingerprint run. 112 // Also, this func updates device map on NvidiaDevice with the latest data 113 func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool { 114 d.deviceLock.Lock() 115 defer d.deviceLock.Unlock() 116 117 changeDetected := false 118 // check if every device in allDevices is in d.devices 119 for _, device := range allDevices { 120 if _, ok := d.devices[device.UUID]; !ok { 121 changeDetected = true 122 } 123 } 124 125 // check if every device in d.devices is in allDevices 126 fingerprintDeviceMap := make(map[string]struct{}) 127 for _, device := range allDevices { 128 fingerprintDeviceMap[device.UUID] = struct{}{} 129 } 130 for id := range d.devices { 131 if _, ok := fingerprintDeviceMap[id]; !ok { 132 changeDetected = true 133 } 134 } 135 136 d.devices = fingerprintDeviceMap 137 return changeDetected 138 } 139 140 // deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice 141 func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup { 142 // deviceGroup without devices makes no sense -> return nil when no devices are provided 143 if len(deviceList) == 0 { 144 return nil 145 } 146 147 devices := make([]*device.Device, len(deviceList)) 148 for index, dev := range deviceList { 149 devices[index] = &device.Device{ 150 ID: dev.UUID, 151 // all fingerprinted devices are "healthy" for now 152 // to get real health data -> dcgm bindings should be used 153 Healthy: true, 154 HwLocality: &device.DeviceLocality{ 155 PciBusID: dev.PCIBusID, 156 }, 157 } 158 } 159 160 deviceGroup := &device.DeviceGroup{ 161 Vendor: vendor, 162 Type: deviceType, 163 Name: groupName, 164 Devices: devices, 165 // Assumption made that devices with the same DeviceName have the same 166 // attributes like amount of memory, power, bar1memory etc 167 Attributes: attributesFromFingerprintDeviceData(deviceList[0]), 168 } 169 170 // Extend attribute map with common attributes 171 for attributeKey, attributeValue := range commonAttributes { 172 deviceGroup.Attributes[attributeKey] = attributeValue 173 } 174 175 return deviceGroup 176 } 177 178 // attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData 179 // struct to device.DeviceGroup.Attributes format (map[string]string) 180 // this function performs all nil checks for FingerprintDeviceData pointers 181 func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute { 182 attrs := map[string]*structs.Attribute{ 183 DisplayStateAttr: { 184 String: helper.StringToPtr(d.DisplayState), 185 }, 186 PersistenceModeAttr: { 187 String: helper.StringToPtr(d.PersistenceMode), 188 }, 189 } 190 191 if d.MemoryMiB != nil { 192 attrs[MemoryAttr] = &structs.Attribute{ 193 Int: helper.Int64ToPtr(int64(*d.MemoryMiB)), 194 Unit: structs.UnitMiB, 195 } 196 } 197 if d.PowerW != nil { 198 attrs[PowerAttr] = &structs.Attribute{ 199 Int: helper.Int64ToPtr(int64(*d.PowerW)), 200 Unit: structs.UnitW, 201 } 202 } 203 if d.BAR1MiB != nil { 204 attrs[BAR1Attr] = &structs.Attribute{ 205 Int: helper.Int64ToPtr(int64(*d.BAR1MiB)), 206 Unit: structs.UnitMiB, 207 } 208 } 209 if d.CoresClockMHz != nil { 210 attrs[CoresClockAttr] = &structs.Attribute{ 211 Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)), 212 Unit: structs.UnitMHz, 213 } 214 } 215 if d.MemoryClockMHz != nil { 216 attrs[MemoryClockAttr] = &structs.Attribute{ 217 Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)), 218 Unit: structs.UnitMHz, 219 } 220 } 221 if d.PCIBandwidthMBPerS != nil { 222 attrs[PCIBandwidthAttr] = &structs.Attribute{ 223 Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)), 224 Unit: structs.UnitMBPerS, 225 } 226 } 227 228 return attrs 229 }