github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/plugins/device/device.go (about)

     1  package device
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	multierror "github.com/hashicorp/go-multierror"
     9  	"github.com/hashicorp/nomad/plugins/base"
    10  	"github.com/hashicorp/nomad/plugins/shared/structs"
    11  )
    12  
    13  const (
    14  	// DeviceTypeGPU is a canonical device type for a GPU.
    15  	DeviceTypeGPU = "gpu"
    16  )
    17  
    18  var (
    19  	// ErrPluginDisabled indicates that the device plugin is disabled
    20  	ErrPluginDisabled = fmt.Errorf("device is not enabled")
    21  )
    22  
    23  // DevicePlugin is the interface for a plugin that can expose detected devices
    24  // to Nomad and inform it how to mount them.
    25  type DevicePlugin interface {
    26  	base.BasePlugin
    27  
    28  	// Fingerprint returns a stream of devices that are detected.
    29  	Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error)
    30  
    31  	// Reserve is used to reserve a set of devices and retrieve mount
    32  	// instructions.
    33  	Reserve(deviceIDs []string) (*ContainerReservation, error)
    34  
    35  	// Stats returns a stream of statistics per device collected at the passed
    36  	// interval.
    37  	Stats(ctx context.Context, interval time.Duration) (<-chan *StatsResponse, error)
    38  }
    39  
    40  // FingerprintResponse includes a set of detected devices or an error in the
    41  // process of fingerprinting.
    42  type FingerprintResponse struct {
    43  	// Devices is a set of devices that have been detected.
    44  	Devices []*DeviceGroup
    45  
    46  	// Error is populated when fingerprinting has failed.
    47  	Error error
    48  }
    49  
    50  // NewFingerprint takes a set of device groups and returns a fingerprint
    51  // response
    52  func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse {
    53  	return &FingerprintResponse{
    54  		Devices: devices,
    55  	}
    56  }
    57  
    58  // NewFingerprintError takes an error and returns a fingerprint response
    59  func NewFingerprintError(err error) *FingerprintResponse {
    60  	return &FingerprintResponse{
    61  		Error: err,
    62  	}
    63  }
    64  
    65  // DeviceGroup is a grouping of devices that share a common vendor, device type
    66  // and name.
    67  type DeviceGroup struct {
    68  	// Vendor is the vendor providing the device (nvidia, intel, etc).
    69  	Vendor string
    70  
    71  	// Type is the type of the device (gpu, fpga, etc).
    72  	Type string
    73  
    74  	// Name is the devices model name.
    75  	Name string
    76  
    77  	// Devices is the set of device instances.
    78  	Devices []*Device
    79  
    80  	// Attributes are a set of attributes shared for all the devices.
    81  	Attributes map[string]*structs.Attribute
    82  }
    83  
    84  // Validate validates that the device group is valid
    85  func (d *DeviceGroup) Validate() error {
    86  	var mErr multierror.Error
    87  
    88  	if d.Vendor == "" {
    89  		_ = multierror.Append(&mErr, fmt.Errorf("device vendor must be specified"))
    90  	}
    91  	if d.Type == "" {
    92  		_ = multierror.Append(&mErr, fmt.Errorf("device type must be specified"))
    93  	}
    94  	if d.Name == "" {
    95  		_ = multierror.Append(&mErr, fmt.Errorf("device name must be specified"))
    96  	}
    97  
    98  	for i, dev := range d.Devices {
    99  		if dev == nil {
   100  			_ = multierror.Append(&mErr, fmt.Errorf("device %d is nil", i))
   101  			continue
   102  		}
   103  
   104  		if err := dev.Validate(); err != nil {
   105  			_ = multierror.Append(&mErr, multierror.Prefix(err, fmt.Sprintf("device %d: ", i)))
   106  		}
   107  	}
   108  
   109  	for k, v := range d.Attributes {
   110  		if err := v.Validate(); err != nil {
   111  			_ = multierror.Append(&mErr, fmt.Errorf("device attribute %q invalid: %v", k, err))
   112  		}
   113  	}
   114  
   115  	return mErr.ErrorOrNil()
   116  
   117  }
   118  
   119  // Device is an instance of a particular device.
   120  type Device struct {
   121  	// ID is the identifier for the device.
   122  	ID string
   123  
   124  	// Healthy marks whether the device is healthy and can be used for
   125  	// scheduling.
   126  	Healthy bool
   127  
   128  	// HealthDesc describes why the device may be unhealthy.
   129  	HealthDesc string
   130  
   131  	// HwLocality captures hardware locality information for the device.
   132  	HwLocality *DeviceLocality
   133  }
   134  
   135  // Validate validates that the device is valid
   136  func (d *Device) Validate() error {
   137  	if d.ID == "" {
   138  		return fmt.Errorf("device ID must be specified")
   139  	}
   140  
   141  	return nil
   142  }
   143  
   144  // DeviceLocality captures hardware locality information for a device.
   145  type DeviceLocality struct {
   146  	// PciBusID is the PCI bus ID of the device.
   147  	PciBusID string
   148  }
   149  
   150  // ContainerReservation describes how to mount a device into a container. A
   151  // container is an isolated environment that shares the host's OS.
   152  type ContainerReservation struct {
   153  	// Envs are a set of environment variables to set for the task.
   154  	Envs map[string]string
   155  
   156  	// Mounts are used to mount host volumes into a container that may include
   157  	// libraries, etc.
   158  	Mounts []*Mount
   159  
   160  	// Devices are the set of devices to mount into the container.
   161  	Devices []*DeviceSpec
   162  }
   163  
   164  // Mount is used to mount a host directory into a container.
   165  type Mount struct {
   166  	// TaskPath is the location in the task's file system to mount.
   167  	TaskPath string
   168  
   169  	// HostPath is the host directory path to mount.
   170  	HostPath string
   171  
   172  	// ReadOnly defines whether the mount should be read only to the task.
   173  	ReadOnly bool
   174  }
   175  
   176  // DeviceSpec captures how to mount a device into a container.
   177  type DeviceSpec struct {
   178  	// TaskPath is the location to mount the device in the task's file system.
   179  	TaskPath string
   180  
   181  	// HostPath is the host location of the device.
   182  	HostPath string
   183  
   184  	// CgroupPerms defines the permissions to use when mounting the device.
   185  	CgroupPerms string
   186  }
   187  
   188  // StatsResponse returns statistics for each device group.
   189  type StatsResponse struct {
   190  	// Groups contains statistics for each device group.
   191  	Groups []*DeviceGroupStats
   192  
   193  	// Error is populated when collecting statistics has failed.
   194  	Error error
   195  }
   196  
   197  // NewStatsError takes an error and returns a stats response
   198  func NewStatsError(err error) *StatsResponse {
   199  	return &StatsResponse{
   200  		Error: err,
   201  	}
   202  }
   203  
   204  // DeviceGroupStats contains statistics for each device of a particular
   205  // device group, identified by the vendor, type and name of the device.
   206  type DeviceGroupStats struct {
   207  	Vendor string
   208  	Type   string
   209  	Name   string
   210  
   211  	// InstanceStats is a mapping of each device ID to its statistics.
   212  	InstanceStats map[string]*DeviceStats
   213  }
   214  
   215  // DeviceStats is the statistics for an individual device
   216  type DeviceStats struct {
   217  	// Summary exposes a single summary metric that should be the most
   218  	// informative to users.
   219  	Summary *structs.StatValue
   220  
   221  	// Stats contains the verbose statistics for the device.
   222  	Stats *structs.StatObject
   223  
   224  	// Timestamp is the time the statistics were collected.
   225  	Timestamp time.Time
   226  }