github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/devices/gpu/nvidia/stats.go (about)

     1  package nvidia
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
     8  	"github.com/hashicorp/nomad/helper"
     9  	"github.com/hashicorp/nomad/plugins/device"
    10  	"github.com/hashicorp/nomad/plugins/shared/structs"
    11  )
    12  
    13  const (
    14  	// Attribute names for reporting stats output
    15  	PowerUsageAttr = "Power usage"
    16  	PowerUsageUnit = "W"
    17  	PowerUsageDesc = "Power usage for this GPU in watts and " +
    18  		"its associated circuitry (e.g. memory) / Maximum GPU Power"
    19  	GPUUtilizationAttr = "GPU utilization"
    20  	GPUUtilizationUnit = "%"
    21  	GPUUtilizationDesc = "Percent of time over the past sample period " +
    22  		"during which one or more kernels were executing on the GPU."
    23  	MemoryUtilizationAttr  = "Memory utilization"
    24  	MemoryUtilizationUnit  = "%"
    25  	MemoryUtilizationDesc  = "Percentage of bandwidth used during the past sample period"
    26  	EncoderUtilizationAttr = "Encoder utilization"
    27  	EncoderUtilizationUnit = "%"
    28  	EncoderUtilizationDesc = "Percent of time over the past sample period " +
    29  		"during which GPU Encoder was used"
    30  	DecoderUtilizationAttr = "Decoder utilization"
    31  	DecoderUtilizationUnit = "%"
    32  	DecoderUtilizationDesc = "Percent of time over the past sample period " +
    33  		"during which GPU Decoder was used"
    34  	TemperatureAttr      = "Temperature"
    35  	TemperatureUnit      = "C" // Celsius degrees
    36  	TemperatureDesc      = "Temperature of the Unit"
    37  	MemoryStateAttr      = "Memory state"
    38  	MemoryStateUnit      = "MiB" // Mebibytes
    39  	MemoryStateDesc      = "UsedMemory / TotalMemory"
    40  	BAR1StateAttr        = "BAR1 buffer state"
    41  	BAR1StateUnit        = "MiB" // Mebibytes
    42  	BAR1StateDesc        = "UsedBAR1 / TotalBAR1"
    43  	ECCErrorsL1CacheAttr = "ECC L1 errors"
    44  	ECCErrorsL1CacheUnit = "#" // number of errors
    45  	ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
    46  	ECCErrorsL2CacheAttr = "ECC L2 errors"
    47  	ECCErrorsL2CacheUnit = "#" // number of errors
    48  	ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
    49  	ECCErrorsDeviceAttr  = "ECC memory errors"
    50  	ECCErrorsDeviceUnit  = "#" // number of errors
    51  	ECCErrorsDeviceDesc  = "Requested memory error counter for the device"
    52  )
    53  
    54  // stats is the long running goroutine that streams device statistics
    55  func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) {
    56  	defer close(stats)
    57  
    58  	if d.initErr != nil {
    59  		if d.initErr.Error() != nvml.UnavailableLib.Error() {
    60  			d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr)
    61  			stats <- device.NewStatsError(d.initErr)
    62  		}
    63  
    64  		return
    65  	}
    66  
    67  	// Create a timer that will fire immediately for the first detection
    68  	ticker := time.NewTimer(0)
    69  
    70  	for {
    71  		select {
    72  		case <-ctx.Done():
    73  			return
    74  		case <-ticker.C:
    75  			ticker.Reset(interval)
    76  		}
    77  
    78  		d.writeStatsToChannel(stats, time.Now())
    79  	}
    80  }
    81  
    82  // filterStatsByID accepts list of StatsData and set of IDs
    83  // this function would return entries from StatsData with IDs found in the set
    84  func filterStatsByID(stats []*nvml.StatsData, IDs map[string]struct{}) []*nvml.StatsData {
    85  	var filteredStats []*nvml.StatsData
    86  	for _, statsItem := range stats {
    87  		if _, ok := IDs[statsItem.UUID]; ok {
    88  			filteredStats = append(filteredStats, statsItem)
    89  		}
    90  	}
    91  	return filteredStats
    92  }
    93  
    94  // writeStatsToChannel collects StatsData from NVML backend, groups StatsData
    95  // by DeviceName attribute, populates DeviceGroupStats structure for every group
    96  // and sends data over provided channel
    97  func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
    98  	statsData, err := d.nvmlClient.GetStatsData()
    99  	if err != nil {
   100  		d.logger.Error("failed to get nvidia stats", "error", err)
   101  		stats <- &device.StatsResponse{
   102  			Error: err,
   103  		}
   104  		return
   105  	}
   106  
   107  	// filter only stats from devices that are stored in NvidiaDevice struct
   108  	d.deviceLock.RLock()
   109  	statsData = filterStatsByID(statsData, d.devices)
   110  	d.deviceLock.RUnlock()
   111  
   112  	// group stats by DeviceName struct field
   113  	statsListByDeviceName := make(map[string][]*nvml.StatsData)
   114  	for _, statsItem := range statsData {
   115  		deviceName := statsItem.DeviceName
   116  		if deviceName == nil {
   117  			// nvml driver was not able to detect device name. This kind
   118  			// of devices are placed to single group with 'notAvailable' name
   119  			notAvailableCopy := notAvailable
   120  			deviceName = &notAvailableCopy
   121  		}
   122  
   123  		statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem)
   124  	}
   125  
   126  	// place data device.DeviceGroupStats struct for every group of stats
   127  	deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName))
   128  	for groupName, groupStats := range statsListByDeviceName {
   129  		deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp))
   130  	}
   131  
   132  	stats <- &device.StatsResponse{
   133  		Groups: deviceGroupsStats,
   134  	}
   135  }
   136  
   137  func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue {
   138  	return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)}
   139  }
   140  
   141  // statsForGroup is a helper function that populates device.DeviceGroupStats
   142  // for given groupName with groupStats list
   143  func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
   144  	instanceStats := make(map[string]*device.DeviceStats)
   145  	for _, statsItem := range groupStats {
   146  		instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp)
   147  	}
   148  
   149  	return &device.DeviceGroupStats{
   150  		Vendor:        vendor,
   151  		Type:          deviceType,
   152  		Name:          groupName,
   153  		InstanceStats: instanceStats,
   154  	}
   155  }
   156  
   157  // statsForItem is a helper function that populates device.DeviceStats for given
   158  // nvml.StatsData
   159  func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
   160  	// nvml.StatsData holds pointers to values that can be nil
   161  	// In case they are nil return stats with 'notAvailable' constant
   162  	var (
   163  		powerUsageStat         *structs.StatValue
   164  		GPUUtilizationStat     *structs.StatValue
   165  		memoryUtilizationStat  *structs.StatValue
   166  		encoderUtilizationStat *structs.StatValue
   167  		decoderUtilizationStat *structs.StatValue
   168  		temperatureStat        *structs.StatValue
   169  		memoryStateStat        *structs.StatValue
   170  		BAR1StateStat          *structs.StatValue
   171  		ECCErrorsL1CacheStat   *structs.StatValue
   172  		ECCErrorsL2CacheStat   *structs.StatValue
   173  		ECCErrorsDeviceStat    *structs.StatValue
   174  	)
   175  
   176  	if statsItem.PowerUsageW == nil || statsItem.PowerW == nil {
   177  		powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
   178  	} else {
   179  		powerUsageStat = &structs.StatValue{
   180  			Unit:              PowerUsageUnit,
   181  			Desc:              PowerUsageDesc,
   182  			IntNumeratorVal:   helper.Int64ToPtr(int64(*statsItem.PowerUsageW)),
   183  			IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW),
   184  		}
   185  	}
   186  
   187  	if statsItem.GPUUtilization == nil {
   188  		GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc)
   189  	} else {
   190  		GPUUtilizationStat = &structs.StatValue{
   191  			Unit:            GPUUtilizationUnit,
   192  			Desc:            GPUUtilizationDesc,
   193  			IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization),
   194  		}
   195  	}
   196  
   197  	if statsItem.MemoryUtilization == nil {
   198  		memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc)
   199  	} else {
   200  		memoryUtilizationStat = &structs.StatValue{
   201  			Unit:            MemoryUtilizationUnit,
   202  			Desc:            MemoryUtilizationDesc,
   203  			IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization),
   204  		}
   205  	}
   206  
   207  	if statsItem.EncoderUtilization == nil {
   208  		encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc)
   209  	} else {
   210  		encoderUtilizationStat = &structs.StatValue{
   211  			Unit:            EncoderUtilizationUnit,
   212  			Desc:            EncoderUtilizationDesc,
   213  			IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization),
   214  		}
   215  	}
   216  
   217  	if statsItem.DecoderUtilization == nil {
   218  		decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc)
   219  	} else {
   220  		decoderUtilizationStat = &structs.StatValue{
   221  			Unit:            DecoderUtilizationUnit,
   222  			Desc:            DecoderUtilizationDesc,
   223  			IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization),
   224  		}
   225  	}
   226  
   227  	if statsItem.TemperatureC == nil {
   228  		temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc)
   229  	} else {
   230  		temperatureStat = &structs.StatValue{
   231  			Unit:            TemperatureUnit,
   232  			Desc:            TemperatureDesc,
   233  			IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC),
   234  		}
   235  	}
   236  
   237  	if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil {
   238  		memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
   239  	} else {
   240  		memoryStateStat = &structs.StatValue{
   241  			Unit:              MemoryStateUnit,
   242  			Desc:              MemoryStateDesc,
   243  			IntNumeratorVal:   uint64ToInt64Ptr(statsItem.UsedMemoryMiB),
   244  			IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB),
   245  		}
   246  	}
   247  
   248  	if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil {
   249  		BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
   250  	} else {
   251  		BAR1StateStat = &structs.StatValue{
   252  			Unit:              BAR1StateUnit,
   253  			Desc:              BAR1StateDesc,
   254  			IntNumeratorVal:   uint64ToInt64Ptr(statsItem.BAR1UsedMiB),
   255  			IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB),
   256  		}
   257  	}
   258  
   259  	if statsItem.ECCErrorsL1Cache == nil {
   260  		ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc)
   261  	} else {
   262  		ECCErrorsL1CacheStat = &structs.StatValue{
   263  			Unit:            ECCErrorsL1CacheUnit,
   264  			Desc:            ECCErrorsL1CacheDesc,
   265  			IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache),
   266  		}
   267  	}
   268  
   269  	if statsItem.ECCErrorsL2Cache == nil {
   270  		ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc)
   271  	} else {
   272  		ECCErrorsL2CacheStat = &structs.StatValue{
   273  			Unit:            ECCErrorsL2CacheUnit,
   274  			Desc:            ECCErrorsL2CacheDesc,
   275  			IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache),
   276  		}
   277  	}
   278  
   279  	if statsItem.ECCErrorsDevice == nil {
   280  		ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc)
   281  	} else {
   282  		ECCErrorsDeviceStat = &structs.StatValue{
   283  			Unit:            ECCErrorsDeviceUnit,
   284  			Desc:            ECCErrorsDeviceDesc,
   285  			IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice),
   286  		}
   287  	}
   288  	return &device.DeviceStats{
   289  		Summary: memoryStateStat,
   290  		Stats: &structs.StatObject{
   291  			Attributes: map[string]*structs.StatValue{
   292  				PowerUsageAttr:         powerUsageStat,
   293  				GPUUtilizationAttr:     GPUUtilizationStat,
   294  				MemoryUtilizationAttr:  memoryUtilizationStat,
   295  				EncoderUtilizationAttr: encoderUtilizationStat,
   296  				DecoderUtilizationAttr: decoderUtilizationStat,
   297  				TemperatureAttr:        temperatureStat,
   298  				MemoryStateAttr:        memoryStateStat,
   299  				BAR1StateAttr:          BAR1StateStat,
   300  				ECCErrorsL1CacheAttr:   ECCErrorsL1CacheStat,
   301  				ECCErrorsL2CacheAttr:   ECCErrorsL2CacheStat,
   302  				ECCErrorsDeviceAttr:    ECCErrorsDeviceStat,
   303  			},
   304  		},
   305  		Timestamp: timestamp,
   306  	}
   307  }
   308  
   309  func uintToInt64Ptr(u *uint) *int64 {
   310  	if u == nil {
   311  		return nil
   312  	}
   313  
   314  	v := int64(*u)
   315  	return &v
   316  }
   317  
   318  func uint64ToInt64Ptr(u *uint64) *int64 {
   319  	if u == nil {
   320  		return nil
   321  	}
   322  
   323  	v := int64(*u)
   324  	return &v
   325  }