github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/plugins/device/device.go (about) 1 package device 2 3 import ( 4 "context" 5 "fmt" 6 "time" 7 8 multierror "github.com/hashicorp/go-multierror" 9 "github.com/hashicorp/nomad/plugins/base" 10 "github.com/hashicorp/nomad/plugins/shared/structs" 11 ) 12 13 const ( 14 // DeviceTypeGPU is a canonical device type for a GPU. 15 DeviceTypeGPU = "gpu" 16 ) 17 18 // DevicePlugin is the interface for a plugin that can expose detected devices 19 // to Nomad and inform it how to mount them. 20 type DevicePlugin interface { 21 base.BasePlugin 22 23 // Fingerprint returns a stream of devices that are detected. 24 Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error) 25 26 // Reserve is used to reserve a set of devices and retrieve mount 27 // instructions. 28 Reserve(deviceIDs []string) (*ContainerReservation, error) 29 30 // Stats returns a stream of statistics per device collected at the passed 31 // interval. 32 Stats(ctx context.Context, interval time.Duration) (<-chan *StatsResponse, error) 33 } 34 35 // FingerprintResponse includes a set of detected devices or an error in the 36 // process of fingerprinting. 37 type FingerprintResponse struct { 38 // Devices is a set of devices that have been detected. 39 Devices []*DeviceGroup 40 41 // Error is populated when fingerprinting has failed. 42 Error error 43 } 44 45 // NewFingerprint takes a set of device groups and returns a fingerprint 46 // response 47 func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse { 48 return &FingerprintResponse{ 49 Devices: devices, 50 } 51 } 52 53 // NewFingerprintError takes an error and returns a fingerprint response 54 func NewFingerprintError(err error) *FingerprintResponse { 55 return &FingerprintResponse{ 56 Error: err, 57 } 58 } 59 60 // DeviceGroup is a grouping of devices that share a common vendor, device type 61 // and name. 62 type DeviceGroup struct { 63 // Vendor is the vendor providing the device (nvidia, intel, etc). 64 Vendor string 65 66 // Type is the type of the device (gpu, fpga, etc). 67 Type string 68 69 // Name is the devices model name. 70 Name string 71 72 // Devices is the set of device instances. 73 Devices []*Device 74 75 // Attributes are a set of attributes shared for all the devices. 76 Attributes map[string]*structs.Attribute 77 } 78 79 // Validate validates that the device group is valid 80 func (d *DeviceGroup) Validate() error { 81 var mErr multierror.Error 82 83 if d.Vendor == "" { 84 multierror.Append(&mErr, fmt.Errorf("device vendor must be specified")) 85 } 86 if d.Type == "" { 87 multierror.Append(&mErr, fmt.Errorf("device type must be specified")) 88 } 89 if d.Name == "" { 90 multierror.Append(&mErr, fmt.Errorf("device name must be specified")) 91 } 92 93 for i, dev := range d.Devices { 94 if dev == nil { 95 multierror.Append(&mErr, fmt.Errorf("device %d is nil", i)) 96 continue 97 } 98 99 if err := dev.Validate(); err != nil { 100 multierror.Append(&mErr, multierror.Prefix(err, fmt.Sprintf("device %d: ", i))) 101 } 102 } 103 104 for k, v := range d.Attributes { 105 if err := v.Validate(); err != nil { 106 multierror.Append(&mErr, fmt.Errorf("device attribute %q invalid: %v", k, err)) 107 } 108 } 109 110 return mErr.ErrorOrNil() 111 112 } 113 114 // Device is an instance of a particular device. 115 type Device struct { 116 // ID is the identifier for the device. 117 ID string 118 119 // Healthy marks whether the device is healthy and can be used for 120 // scheduling. 121 Healthy bool 122 123 // HealthDesc describes why the device may be unhealthy. 124 HealthDesc string 125 126 // HwLocality captures hardware locality information for the device. 127 HwLocality *DeviceLocality 128 } 129 130 // Validate validates that the device is valid 131 func (d *Device) Validate() error { 132 if d.ID == "" { 133 return fmt.Errorf("device ID must be specified") 134 } 135 136 return nil 137 } 138 139 // DeviceLocality captures hardware locality information for a device. 140 type DeviceLocality struct { 141 // PciBusID is the PCI bus ID of the device. 142 PciBusID string 143 } 144 145 // ContainerReservation describes how to mount a device into a container. A 146 // container is an isolated environment that shares the host's OS. 147 type ContainerReservation struct { 148 // Envs are a set of environment variables to set for the task. 149 Envs map[string]string 150 151 // Mounts are used to mount host volumes into a container that may include 152 // libraries, etc. 153 Mounts []*Mount 154 155 // Devices are the set of devices to mount into the container. 156 Devices []*DeviceSpec 157 } 158 159 // Mount is used to mount a host directory into a container. 160 type Mount struct { 161 // TaskPath is the location in the task's file system to mount. 162 TaskPath string 163 164 // HostPath is the host directory path to mount. 165 HostPath string 166 167 // ReadOnly defines whether the mount should be read only to the task. 168 ReadOnly bool 169 } 170 171 // DeviceSpec captures how to mount a device into a container. 172 type DeviceSpec struct { 173 // TaskPath is the location to mount the device in the task's file system. 174 TaskPath string 175 176 // HostPath is the host location of the device. 177 HostPath string 178 179 // CgroupPerms defines the permissions to use when mounting the device. 180 CgroupPerms string 181 } 182 183 // StatsResponse returns statistics for each device group. 184 type StatsResponse struct { 185 // Groups contains statistics for each device group. 186 Groups []*DeviceGroupStats 187 188 // Error is populated when collecting statistics has failed. 189 Error error 190 } 191 192 // NewStatsError takes an error and returns a stats response 193 func NewStatsError(err error) *StatsResponse { 194 return &StatsResponse{ 195 Error: err, 196 } 197 } 198 199 // DeviceGroupStats contains statistics for each device of a particular 200 // device group, identified by the vendor, type and name of the device. 201 type DeviceGroupStats struct { 202 Vendor string 203 Type string 204 Name string 205 206 // InstanceStats is a mapping of each device ID to its statistics. 207 InstanceStats map[string]*DeviceStats 208 } 209 210 // DeviceStats is the statistics for an individual device 211 type DeviceStats struct { 212 // Summary exposes a single summary metric that should be the most 213 // informative to users. 214 Summary *structs.StatValue 215 216 // Stats contains the verbose statistics for the device. 217 Stats *structs.StatObject 218 219 // Timestamp is the time the statistics were collected. 220 Timestamp time.Time 221 }