github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/fingerprint.go (about)

     1  package nvidia
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
     8  	"github.com/hashicorp/nomad/helper"
     9  	"github.com/hashicorp/nomad/plugins/device"
    10  	"github.com/hashicorp/nomad/plugins/shared/structs"
    11  )
    12  
const (
	// Attribute names and units for reporting Fingerprint output.
	// Each constant is a key of a device group's attribute map; the trailing
	// comment states the value kind/unit this file reports it with.
	MemoryAttr          = "memory"           // integer, MiB
	PowerAttr           = "power"            // integer, W
	BAR1Attr            = "bar1"             // integer, MiB
	DriverVersionAttr   = "driver_version"   // string, added to every device group
	CoresClockAttr      = "cores_clock"      // integer, MHz
	MemoryClockAttr     = "memory_clock"     // integer, MHz
	PCIBandwidthAttr    = "pci_bandwidth"    // integer, MB/s
	DisplayStateAttr    = "display_state"    // string
	PersistenceModeAttr = "persistence_mode" // string
)
    25  
    26  // fingerprint is the long running goroutine that detects hardware
    27  func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
    28  	defer close(devices)
    29  
    30  	if d.initErr != nil {
    31  		if d.initErr.Error() != nvml.UnavailableLib.Error() {
    32  			d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr)
    33  			devices <- device.NewFingerprintError(d.initErr)
    34  		}
    35  
    36  		// Just close the channel to let server know that there are no working
    37  		// Nvidia GPU units
    38  		return
    39  	}
    40  
    41  	// Create a timer that will fire immediately for the first detection
    42  	ticker := time.NewTimer(0)
    43  
    44  	for {
    45  		select {
    46  		case <-ctx.Done():
    47  			return
    48  		case <-ticker.C:
    49  			ticker.Reset(d.fingerprintPeriod)
    50  		}
    51  		d.writeFingerprintToChannel(devices)
    52  	}
    53  }
    54  
    55  // writeFingerprintToChannel makes nvml call and writes response to channel
    56  func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
    57  	fingerprintData, err := d.nvmlClient.GetFingerprintData()
    58  	if err != nil {
    59  		d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
    60  		devices <- device.NewFingerprintError(err)
    61  		return
    62  	}
    63  
    64  	// ignore devices from fingerprint output
    65  	fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
    66  	// check if any device health was updated or any device was added to host
    67  	if !d.fingerprintChanged(fingerprintDevices) {
    68  		return
    69  	}
    70  
    71  	commonAttributes := map[string]*structs.Attribute{
    72  		DriverVersionAttr: {
    73  			String: helper.StringToPtr(fingerprintData.DriverVersion),
    74  		},
    75  	}
    76  
    77  	// Group all FingerprintDevices by DeviceName attribute
    78  	deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
    79  	for _, device := range fingerprintDevices {
    80  		deviceName := device.DeviceName
    81  		if deviceName == nil {
    82  			// nvml driver was not able to detect device name. This kind
    83  			// of devices are placed to single group with 'notAvailable' name
    84  			notAvailableCopy := notAvailable
    85  			deviceName = &notAvailableCopy
    86  		}
    87  
    88  		deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
    89  	}
    90  
    91  	// Build Fingerprint response with computed groups and send it over the channel
    92  	deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
    93  	for groupName, devices := range deviceListByDeviceName {
    94  		deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
    95  	}
    96  	devices <- device.NewFingerprint(deviceGroups...)
    97  }
    98  
    99  // ignoreFingerprintedDevices excludes ignored devices from fingerprint output
   100  func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
   101  	var result []*nvml.FingerprintDeviceData
   102  	for _, fingerprintDevice := range deviceData {
   103  		if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
   104  			result = append(result, fingerprintDevice)
   105  		}
   106  	}
   107  	return result
   108  }
   109  
   110  // fingerprintChanged checks if there are any previously unseen nvidia devices located
   111  // or any of fingerprinted nvidia devices disappeared since the last fingerprint run.
   112  // Also, this func updates device map on NvidiaDevice with the latest data
   113  func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
   114  	d.deviceLock.Lock()
   115  	defer d.deviceLock.Unlock()
   116  
   117  	changeDetected := false
   118  	// check if every device in allDevices is in d.devices
   119  	for _, device := range allDevices {
   120  		if _, ok := d.devices[device.UUID]; !ok {
   121  			changeDetected = true
   122  		}
   123  	}
   124  
   125  	// check if every device in d.devices is in allDevices
   126  	fingerprintDeviceMap := make(map[string]struct{})
   127  	for _, device := range allDevices {
   128  		fingerprintDeviceMap[device.UUID] = struct{}{}
   129  	}
   130  	for id := range d.devices {
   131  		if _, ok := fingerprintDeviceMap[id]; !ok {
   132  			changeDetected = true
   133  		}
   134  	}
   135  
   136  	d.devices = fingerprintDeviceMap
   137  	return changeDetected
   138  }
   139  
   140  // deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice
   141  func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup {
   142  	// deviceGroup without devices makes no sense -> return nil when no devices are provided
   143  	if len(deviceList) == 0 {
   144  		return nil
   145  	}
   146  
   147  	devices := make([]*device.Device, len(deviceList))
   148  	for index, dev := range deviceList {
   149  		devices[index] = &device.Device{
   150  			ID: dev.UUID,
   151  			// all fingerprinted devices are "healthy" for now
   152  			// to get real health data -> dcgm bindings should be used
   153  			Healthy: true,
   154  			HwLocality: &device.DeviceLocality{
   155  				PciBusID: dev.PCIBusID,
   156  			},
   157  		}
   158  	}
   159  
   160  	deviceGroup := &device.DeviceGroup{
   161  		Vendor:  vendor,
   162  		Type:    deviceType,
   163  		Name:    groupName,
   164  		Devices: devices,
   165  		// Assumption made that devices with the same DeviceName have the same
   166  		// attributes like amount of memory, power, bar1memory etc
   167  		Attributes: attributesFromFingerprintDeviceData(deviceList[0]),
   168  	}
   169  
   170  	// Extend attribute map with common attributes
   171  	for attributeKey, attributeValue := range commonAttributes {
   172  		deviceGroup.Attributes[attributeKey] = attributeValue
   173  	}
   174  
   175  	return deviceGroup
   176  }
   177  
   178  // attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData
   179  // struct to device.DeviceGroup.Attributes format (map[string]string)
   180  // this function performs all nil checks for FingerprintDeviceData pointers
   181  func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute {
   182  	attrs := map[string]*structs.Attribute{
   183  		DisplayStateAttr: {
   184  			String: helper.StringToPtr(d.DisplayState),
   185  		},
   186  		PersistenceModeAttr: {
   187  			String: helper.StringToPtr(d.PersistenceMode),
   188  		},
   189  	}
   190  
   191  	if d.MemoryMiB != nil {
   192  		attrs[MemoryAttr] = &structs.Attribute{
   193  			Int:  helper.Int64ToPtr(int64(*d.MemoryMiB)),
   194  			Unit: structs.UnitMiB,
   195  		}
   196  	}
   197  	if d.PowerW != nil {
   198  		attrs[PowerAttr] = &structs.Attribute{
   199  			Int:  helper.Int64ToPtr(int64(*d.PowerW)),
   200  			Unit: structs.UnitW,
   201  		}
   202  	}
   203  	if d.BAR1MiB != nil {
   204  		attrs[BAR1Attr] = &structs.Attribute{
   205  			Int:  helper.Int64ToPtr(int64(*d.BAR1MiB)),
   206  			Unit: structs.UnitMiB,
   207  		}
   208  	}
   209  	if d.CoresClockMHz != nil {
   210  		attrs[CoresClockAttr] = &structs.Attribute{
   211  			Int:  helper.Int64ToPtr(int64(*d.CoresClockMHz)),
   212  			Unit: structs.UnitMHz,
   213  		}
   214  	}
   215  	if d.MemoryClockMHz != nil {
   216  		attrs[MemoryClockAttr] = &structs.Attribute{
   217  			Int:  helper.Int64ToPtr(int64(*d.MemoryClockMHz)),
   218  			Unit: structs.UnitMHz,
   219  		}
   220  	}
   221  	if d.PCIBandwidthMBPerS != nil {
   222  		attrs[PCIBandwidthAttr] = &structs.Attribute{
   223  			Int:  helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)),
   224  			Unit: structs.UnitMBPerS,
   225  		}
   226  	}
   227  
   228  	return attrs
   229  }