github.com/manicqin/nomad@v0.9.5/plugins/device/device.go (about)

     1  package device
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	multierror "github.com/hashicorp/go-multierror"
     9  	"github.com/hashicorp/nomad/plugins/base"
    10  	"github.com/hashicorp/nomad/plugins/shared/structs"
    11  )
    12  
    13  const (
    14  	// DeviceTypeGPU is a canonical device type for a GPU.
    15  	DeviceTypeGPU = "gpu"
    16  )
    17  
// DevicePlugin is the interface for a plugin that can expose detected devices
// to Nomad and inform it how to mount them. It embeds base.BasePlugin, so an
// implementation must also satisfy that interface.
type DevicePlugin interface {
	base.BasePlugin

	// Fingerprint returns a stream of devices that are detected. Each
	// FingerprintResponse received from the channel carries either a set of
	// device groups or an error.
	// NOTE(review): ctx presumably bounds the lifetime of the stream; confirm
	// against the plugin implementations.
	Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error)

	// Reserve is used to reserve a set of devices and retrieve mount
	// instructions. The devices are identified by their IDs and the
	// instructions are returned as a ContainerReservation.
	Reserve(deviceIDs []string) (*ContainerReservation, error)

	// Stats returns a stream of statistics per device collected at the passed
	// interval. Each StatsResponse received from the channel carries either
	// per-group statistics or an error.
	Stats(ctx context.Context, interval time.Duration) (<-chan *StatsResponse, error)
}
    34  
// FingerprintResponse includes a set of detected devices or an error in the
// process of fingerprinting. NewFingerprint and NewFingerprintError build the
// two variants.
type FingerprintResponse struct {
	// Devices is a set of devices that have been detected, organized into
	// device groups.
	Devices []*DeviceGroup

	// Error is populated when fingerprinting has failed.
	Error error
}
    44  
    45  // NewFingerprint takes a set of device groups and returns a fingerprint
    46  // response
    47  func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse {
    48  	return &FingerprintResponse{
    49  		Devices: devices,
    50  	}
    51  }
    52  
    53  // NewFingerprintError takes an error and returns a fingerprint response
    54  func NewFingerprintError(err error) *FingerprintResponse {
    55  	return &FingerprintResponse{
    56  		Error: err,
    57  	}
    58  }
    59  
// DeviceGroup is a grouping of devices that share a common vendor, device type
// and name. Validate checks that a group is well formed.
type DeviceGroup struct {
	// Vendor is the vendor providing the device (nvidia, intel, etc).
	Vendor string

	// Type is the type of the device (gpu, fpga, etc).
	Type string

	// Name is the device's model name.
	Name string

	// Devices is the set of device instances.
	Devices []*Device

	// Attributes are a set of attributes shared for all the devices, keyed by
	// attribute name.
	Attributes map[string]*structs.Attribute
}
    78  
    79  // Validate validates that the device group is valid
    80  func (d *DeviceGroup) Validate() error {
    81  	var mErr multierror.Error
    82  
    83  	if d.Vendor == "" {
    84  		multierror.Append(&mErr, fmt.Errorf("device vendor must be specified"))
    85  	}
    86  	if d.Type == "" {
    87  		multierror.Append(&mErr, fmt.Errorf("device type must be specified"))
    88  	}
    89  	if d.Name == "" {
    90  		multierror.Append(&mErr, fmt.Errorf("device name must be specified"))
    91  	}
    92  
    93  	for i, dev := range d.Devices {
    94  		if dev == nil {
    95  			multierror.Append(&mErr, fmt.Errorf("device %d is nil", i))
    96  			continue
    97  		}
    98  
    99  		if err := dev.Validate(); err != nil {
   100  			multierror.Append(&mErr, multierror.Prefix(err, fmt.Sprintf("device %d: ", i)))
   101  		}
   102  	}
   103  
   104  	for k, v := range d.Attributes {
   105  		if err := v.Validate(); err != nil {
   106  			multierror.Append(&mErr, fmt.Errorf("device attribute %q invalid: %v", k, err))
   107  		}
   108  	}
   109  
   110  	return mErr.ErrorOrNil()
   111  
   112  }
   113  
// Device is an instance of a particular device. Instances are collected into a
// DeviceGroup.
type Device struct {
	// ID is the identifier for the device. Validate rejects an empty ID.
	ID string

	// Healthy marks whether the device is healthy and can be used for
	// scheduling.
	Healthy bool

	// HealthDesc describes why the device may be unhealthy.
	HealthDesc string

	// HwLocality captures hardware locality information for the device.
	// NOTE(review): presumably nil when no locality information is
	// available; confirm against the plugin implementations.
	HwLocality *DeviceLocality
}
   129  
   130  // Validate validates that the device is valid
   131  func (d *Device) Validate() error {
   132  	if d.ID == "" {
   133  		return fmt.Errorf("device ID must be specified")
   134  	}
   135  
   136  	return nil
   137  }
   138  
// DeviceLocality captures hardware locality information for a device. The PCI
// bus ID is the only locality attribute it holds.
type DeviceLocality struct {
	// PciBusID is the PCI bus ID of the device.
	PciBusID string
}
   144  
// ContainerReservation describes how to mount a device into a container. A
// container is an isolated environment that shares the host's OS. It is the
// value returned by DevicePlugin.Reserve.
type ContainerReservation struct {
	// Envs are a set of environment variables to set for the task, keyed by
	// variable name.
	Envs map[string]string

	// Mounts are used to mount host volumes into a container that may include
	// libraries, etc.
	Mounts []*Mount

	// Devices are the set of devices to mount into the container.
	Devices []*DeviceSpec
}
   158  
// Mount is used to mount a host directory into a container. It pairs a path
// on the host with the path at which it appears inside the task.
type Mount struct {
	// TaskPath is the location in the task's file system to mount.
	TaskPath string

	// HostPath is the host directory path to mount.
	HostPath string

	// ReadOnly defines whether the mount should be read only to the task.
	ReadOnly bool
}
   170  
// DeviceSpec captures how to mount a device into a container. It pairs the
// device's location on the host with its location inside the task.
type DeviceSpec struct {
	// TaskPath is the location to mount the device in the task's file system.
	TaskPath string

	// HostPath is the host location of the device.
	HostPath string

	// CgroupPerms defines the permissions to use when mounting the device.
	// NOTE(review): the accepted format is not defined in this file; confirm
	// against the consumers of this field.
	CgroupPerms string
}
   182  
// StatsResponse returns statistics for each device group, or an error when
// collecting them has failed. NewStatsError builds the error variant.
type StatsResponse struct {
	// Groups contains statistics for each device group.
	Groups []*DeviceGroupStats

	// Error is populated when collecting statistics has failed.
	Error error
}
   191  
   192  // NewStatsError takes an error and returns a stats response
   193  func NewStatsError(err error) *StatsResponse {
   194  	return &StatsResponse{
   195  		Error: err,
   196  	}
   197  }
   198  
// DeviceGroupStats contains statistics for each device of a particular
// device group, identified by the vendor, type and name of the device.
type DeviceGroupStats struct {
	// Vendor, Type and Name identify the device group these statistics
	// belong to.
	Vendor string
	Type   string
	Name   string

	// InstanceStats is a mapping of each device ID to its statistics.
	InstanceStats map[string]*DeviceStats
}
   209  
// DeviceStats is the statistics for an individual device. It is stored per
// device ID in DeviceGroupStats.InstanceStats.
type DeviceStats struct {
	// Summary exposes a single summary metric that should be the most
	// informative to users.
	Summary *structs.StatValue

	// Stats contains the verbose statistics for the device.
	Stats *structs.StatObject

	// Timestamp is the time the statistics were collected.
	Timestamp time.Time
}