github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/plugins/device/device.go (about) 1 package device 2 3 import ( 4 "context" 5 "time" 6 7 "github.com/hashicorp/nomad/plugins/base" 8 ) 9 10 const ( 11 // DeviceTypeGPU is a canonical device type for a GPU. 12 DeviceTypeGPU = "gpu" 13 ) 14 15 // DevicePlugin is the interface for a plugin that can expose detected devices 16 // to Nomad and inform it how to mount them. 17 type DevicePlugin interface { 18 base.BasePlugin 19 20 // Fingerprint returns a stream of devices that are detected. 21 Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error) 22 23 // Reserve is used to reserve a set of devices and retrieve mount 24 // instructions. 25 Reserve(deviceIDs []string) (*ContainerReservation, error) 26 27 // Stats returns a stream of statistics per device. 28 Stats(ctx context.Context) (<-chan *StatsResponse, error) 29 } 30 31 // FingerprintResponse includes a set of detected devices or an error in the 32 // process of fingerprinting. 33 type FingerprintResponse struct { 34 // Devices is a set of devices that have been detected. 35 Devices []*DeviceGroup 36 37 // Error is populated when fingerprinting has failed. 38 Error error 39 } 40 41 // NewFingerprint takes a set of device groups and returns a fingerprint 42 // response 43 func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse { 44 return &FingerprintResponse{ 45 Devices: devices, 46 } 47 } 48 49 // NewFingerprintError takes an error and returns a fingerprint response 50 func NewFingerprintError(err error) *FingerprintResponse { 51 return &FingerprintResponse{ 52 Error: err, 53 } 54 } 55 56 // DeviceGroup is a grouping of devices that share a common vendor, device type 57 // and name. 58 type DeviceGroup struct { 59 // Vendor is the vendor providing the device (nvidia, intel, etc). 60 Vendor string 61 62 // Type is the type of the device (gpu, fpga, etc). 63 Type string 64 65 // Name is the devices model name. 66 Name string 67 68 // Devices is the set of device instances. 69 Devices []*Device 70 71 // Attributes are a set of attributes shared for all the devices. 72 Attributes map[string]string 73 } 74 75 // Device is an instance of a particular device. 76 type Device struct { 77 // ID is the identifier for the device. 78 ID string 79 80 // Healthy marks whether the device is healthy and can be used for 81 // scheduling. 82 Healthy bool 83 84 // HealthDesc describes why the device may be unhealthy. 85 HealthDesc string 86 87 // HwLocality captures hardware locality information for the device. 88 HwLocality *DeviceLocality 89 } 90 91 // DeviceLocality captures hardware locality information for a device. 92 type DeviceLocality struct { 93 // PciBusID is the PCI bus ID of the device. 94 PciBusID string 95 } 96 97 // ContainerReservation describes how to mount a device into a container. A 98 // container is an isolated environment that shares the host's OS. 99 type ContainerReservation struct { 100 // Envs are a set of environment variables to set for the task. 101 Envs map[string]string 102 103 // Mounts are used to mount host volumes into a container that may include 104 // libraries, etc. 105 Mounts []*Mount 106 107 // Devices are the set of devices to mount into the container. 108 Devices []*DeviceSpec 109 } 110 111 // Mount is used to mount a host directory into a container. 112 type Mount struct { 113 // TaskPath is the location in the task's file system to mount. 114 TaskPath string 115 116 // HostPath is the host directory path to mount. 117 HostPath string 118 119 // ReadOnly defines whether the mount should be read only to the task. 120 ReadOnly bool 121 } 122 123 // DeviceSpec captures how to mount a device into a container. 124 type DeviceSpec struct { 125 // TaskPath is the location to mount the device in the task's file system. 126 TaskPath string 127 128 // HostPath is the host location of the device. 129 HostPath string 130 131 // CgroupPerms defines the permissions to use when mounting the device. 132 CgroupPerms string 133 } 134 135 // StatsResponse returns statistics for each device group. 136 type StatsResponse struct { 137 // Groups contains statistics for each device group. 138 Groups []*DeviceGroupStats 139 140 // Error is populated when collecting statistics has failed. 141 Error error 142 } 143 144 // DeviceGroupStats contains statistics for each device of a particular 145 // device group, identified by the vendor, type and name of the device. 146 type DeviceGroupStats struct { 147 Vendor string 148 Type string 149 Name string 150 151 // InstanceStats is a mapping of each device ID to its statistics. 152 InstanceStats map[string]*DeviceStats 153 } 154 155 // DeviceStats is the statistics for an individual device 156 type DeviceStats struct { 157 // Summary exposes a single summary metric that should be the most 158 // informative to users. 159 Summary *StatValue 160 161 // Stats contains the verbose statistics for the device. 162 Stats *StatObject 163 164 // Timestamp is the time the statistics were collected. 165 Timestamp time.Time 166 } 167 168 // StatObject is a collection of statistics either exposed at the top 169 // level or via nested StatObjects. 170 type StatObject struct { 171 // Nested is a mapping of object name to a nested stats object. 172 Nested map[string]*StatObject 173 174 // Attributes is a mapping of statistic name to its value. 175 Attributes map[string]*StatValue 176 } 177 178 // StatValue exposes the values of a particular statistic. The value may be of 179 // type float, integer, string or boolean. Numeric types can be exposed as a 180 // single value or as a fraction. 181 type StatValue struct { 182 // FloatNumeratorVal exposes a floating point value. If denominator is set 183 // it is assumed to be a fractional value, otherwise it is a scalar. 184 FloatNumeratorVal float64 185 FloatDenominatorVal float64 186 187 // IntNumeratorVal exposes a int value. If denominator is set it is assumed 188 // to be a fractional value, otherwise it is a scalar. 189 IntNumeratorVal int64 190 IntDenominatorVal int64 191 192 // StringVal exposes a string value. These are likely annotations. 193 StringVal string 194 195 // BoolVal exposes a boolean statistic. 196 BoolVal bool 197 198 // Unit gives the unit type: °F, %, MHz, MB, etc. 199 Unit string 200 201 // Desc provides a human readable description of the statistic. 202 Desc string 203 }