github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/plugins/device/device.go (about)

     1  package device
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"github.com/hashicorp/nomad/plugins/base"
     8  )
     9  
const (
	// DeviceTypeGPU is a canonical device type for a GPU. It is the string
	// "gpu", matching the example given for the DeviceGroup.Type field.
	DeviceTypeGPU = "gpu"
)
    14  
// DevicePlugin is the interface for a plugin that can expose detected devices
// to Nomad and inform it how to mount them. It embeds base.BasePlugin, so an
// implementation must provide that interface's methods as well.
type DevicePlugin interface {
	base.BasePlugin

	// Fingerprint returns a stream of devices that are detected. Each
	// FingerprintResponse received from the channel carries either a set of
	// device groups or an error. The returned error is separate from the
	// per-response Error field.
	//
	// NOTE(review): ctx presumably bounds the lifetime of the stream; confirm
	// against the implementations.
	Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error)

	// Reserve is used to reserve a set of devices and retrieve mount
	// instructions. The instructions are returned as a ContainerReservation
	// holding environment variables, mounts and device specs.
	//
	// NOTE(review): deviceIDs presumably correspond to Device.ID values
	// reported by Fingerprint; confirm against callers.
	Reserve(deviceIDs []string) (*ContainerReservation, error)

	// Stats returns a stream of statistics per device. Each StatsResponse
	// received from the channel carries either per-group statistics or an
	// error. The returned error is separate from the per-response Error field.
	//
	// NOTE(review): ctx presumably bounds the lifetime of the stream; confirm
	// against the implementations.
	Stats(ctx context.Context) (<-chan *StatsResponse, error)
}
    30  
// FingerprintResponse includes a set of detected devices or an error in the
// process of fingerprinting. It is the element type of the channel returned
// by DevicePlugin.Fingerprint. NewFingerprint and NewFingerprintError build
// the two variants.
type FingerprintResponse struct {
	// Devices is a set of devices that have been detected, organized into
	// device groups.
	Devices []*DeviceGroup

	// Error is populated when fingerprinting has failed.
	Error error
}
    40  
    41  // NewFingerprint takes a set of device groups and returns a fingerprint
    42  // response
    43  func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse {
    44  	return &FingerprintResponse{
    45  		Devices: devices,
    46  	}
    47  }
    48  
    49  // NewFingerprintError takes an error and returns a fingerprint response
    50  func NewFingerprintError(err error) *FingerprintResponse {
    51  	return &FingerprintResponse{
    52  		Error: err,
    53  	}
    54  }
    55  
// DeviceGroup is a grouping of devices that share a common vendor, device type
// and name.
type DeviceGroup struct {
	// Vendor is the vendor providing the device (nvidia, intel, etc).
	Vendor string

	// Type is the type of the device (gpu, fpga, etc). DeviceTypeGPU is the
	// canonical value for GPUs.
	Type string

	// Name is the device's model name.
	Name string

	// Devices is the set of device instances that belong to this group.
	Devices []*Device

	// Attributes are a set of attributes shared by all the devices in the
	// group, keyed by attribute name.
	Attributes map[string]string
}
    74  
// Device is an instance of a particular device. Instances are reported as
// members of a DeviceGroup.
type Device struct {
	// ID is the identifier for the device.
	ID string

	// Healthy marks whether the device is healthy and can be used for
	// scheduling.
	Healthy bool

	// HealthDesc describes why the device may be unhealthy.
	HealthDesc string

	// HwLocality captures hardware locality information for the device. It is
	// a pointer, so it may be nil.
	HwLocality *DeviceLocality
}
    90  
// DeviceLocality captures hardware locality information for a device. It is
// referenced from Device.HwLocality.
type DeviceLocality struct {
	// PciBusID is the PCI bus ID of the device.
	PciBusID string
}
    96  
// ContainerReservation describes how to mount a device into a container. A
// container is an isolated environment that shares the host's OS. It is the
// result type of DevicePlugin.Reserve.
type ContainerReservation struct {
	// Envs are a set of environment variables to set for the task, keyed by
	// variable name.
	Envs map[string]string

	// Mounts are used to mount host volumes into a container that may include
	// libraries, etc.
	Mounts []*Mount

	// Devices are the set of devices to mount into the container.
	Devices []*DeviceSpec
}
   110  
// Mount is used to mount a host directory into a container. Mounts are listed
// in ContainerReservation.Mounts.
type Mount struct {
	// TaskPath is the location in the task's file system to mount.
	TaskPath string

	// HostPath is the host directory path to mount.
	HostPath string

	// ReadOnly defines whether the mount should be read only to the task.
	ReadOnly bool
}
   122  
// DeviceSpec captures how to mount a device into a container. Device specs
// are listed in ContainerReservation.Devices.
type DeviceSpec struct {
	// TaskPath is the location to mount the device in the task's file system.
	TaskPath string

	// HostPath is the host location of the device.
	HostPath string

	// CgroupPerms defines the permissions to use when mounting the device.
	//
	// NOTE(review): the expected format is not shown here; presumably cgroup
	// device permission letters such as "rwm" — confirm against consumers.
	CgroupPerms string
}
   134  
// StatsResponse returns statistics for each device group. It is the element
// type of the channel returned by DevicePlugin.Stats.
type StatsResponse struct {
	// Groups contains statistics for each device group.
	Groups []*DeviceGroupStats

	// Error is populated when collecting statistics has failed.
	Error error
}
   143  
// DeviceGroupStats contains statistics for each device of a particular
// device group, identified by the vendor, type and name of the device.
type DeviceGroupStats struct {
	// Vendor is the vendor part of the device group's identity.
	Vendor string
	// Type is the device type part of the device group's identity.
	Type   string
	// Name is the name part of the device group's identity.
	Name   string

	// InstanceStats is a mapping of each device ID to its statistics.
	InstanceStats map[string]*DeviceStats
}
   154  
// DeviceStats is the statistics for an individual device. Values of this type
// are stored per device ID in DeviceGroupStats.InstanceStats.
type DeviceStats struct {
	// Summary exposes a single summary metric that should be the most
	// informative to users.
	Summary *StatValue

	// Stats contains the verbose statistics for the device.
	Stats *StatObject

	// Timestamp is the time the statistics were collected.
	Timestamp time.Time
}
   167  
// StatObject is a collection of statistics either exposed at the top
// level or via nested StatObjects. The type is recursive: Nested holds
// further StatObjects, while Attributes holds the values at this level.
type StatObject struct {
	// Nested is a mapping of object name to a nested stats object.
	Nested map[string]*StatObject

	// Attributes is a mapping of statistic name to its value.
	Attributes map[string]*StatValue
}
   177  
// StatValue exposes the values of a particular statistic. The value may be of
// type float, integer, string or boolean. Numeric types can be exposed as a
// single value or as a fraction.
//
// NOTE(review): the numeric fields are plain values rather than pointers, so
// "set" cannot be told apart from zero by the type alone; presumably a zero
// denominator means "not set" — confirm against consumers.
type StatValue struct {
	// FloatNumeratorVal exposes a floating point value. If denominator is set
	// it is assumed to be a fractional value, otherwise it is a scalar.
	// FloatDenominatorVal is the denominator paired with FloatNumeratorVal.
	FloatNumeratorVal   float64
	FloatDenominatorVal float64

	// IntNumeratorVal exposes an int value. If denominator is set it is
	// assumed to be a fractional value, otherwise it is a scalar.
	// IntDenominatorVal is the denominator paired with IntNumeratorVal.
	IntNumeratorVal   int64
	IntDenominatorVal int64

	// StringVal exposes a string value. These are likely annotations.
	StringVal string

	// BoolVal exposes a boolean statistic.
	BoolVal bool

	// Unit gives the unit type: °F, %, MHz, MB, etc.
	Unit string

	// Desc provides a human readable description of the statistic.
	Desc string
}