github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/plugins/device/device.go (about) 1 package device 2 3 import ( 4 "context" 5 "fmt" 6 "time" 7 8 multierror "github.com/hashicorp/go-multierror" 9 "github.com/hashicorp/nomad/plugins/base" 10 "github.com/hashicorp/nomad/plugins/shared/structs" 11 ) 12 13 const ( 14 // DeviceTypeGPU is a canonical device type for a GPU. 15 DeviceTypeGPU = "gpu" 16 ) 17 18 var ( 19 // ErrPluginDisabled indicates that the device plugin is disabled 20 ErrPluginDisabled = fmt.Errorf("device is not enabled") 21 ) 22 23 // DevicePlugin is the interface for a plugin that can expose detected devices 24 // to Nomad and inform it how to mount them. 25 type DevicePlugin interface { 26 base.BasePlugin 27 28 // Fingerprint returns a stream of devices that are detected. 29 Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error) 30 31 // Reserve is used to reserve a set of devices and retrieve mount 32 // instructions. 33 Reserve(deviceIDs []string) (*ContainerReservation, error) 34 35 // Stats returns a stream of statistics per device collected at the passed 36 // interval. 37 Stats(ctx context.Context, interval time.Duration) (<-chan *StatsResponse, error) 38 } 39 40 // FingerprintResponse includes a set of detected devices or an error in the 41 // process of fingerprinting. 42 type FingerprintResponse struct { 43 // Devices is a set of devices that have been detected. 44 Devices []*DeviceGroup 45 46 // Error is populated when fingerprinting has failed. 47 Error error 48 } 49 50 // NewFingerprint takes a set of device groups and returns a fingerprint 51 // response 52 func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse { 53 return &FingerprintResponse{ 54 Devices: devices, 55 } 56 } 57 58 // NewFingerprintError takes an error and returns a fingerprint response 59 func NewFingerprintError(err error) *FingerprintResponse { 60 return &FingerprintResponse{ 61 Error: err, 62 } 63 } 64 65 // DeviceGroup is a grouping of devices that share a common vendor, device type 66 // and name. 67 type DeviceGroup struct { 68 // Vendor is the vendor providing the device (nvidia, intel, etc). 69 Vendor string 70 71 // Type is the type of the device (gpu, fpga, etc). 72 Type string 73 74 // Name is the devices model name. 75 Name string 76 77 // Devices is the set of device instances. 78 Devices []*Device 79 80 // Attributes are a set of attributes shared for all the devices. 81 Attributes map[string]*structs.Attribute 82 } 83 84 // Validate validates that the device group is valid 85 func (d *DeviceGroup) Validate() error { 86 var mErr multierror.Error 87 88 if d.Vendor == "" { 89 _ = multierror.Append(&mErr, fmt.Errorf("device vendor must be specified")) 90 } 91 if d.Type == "" { 92 _ = multierror.Append(&mErr, fmt.Errorf("device type must be specified")) 93 } 94 if d.Name == "" { 95 _ = multierror.Append(&mErr, fmt.Errorf("device name must be specified")) 96 } 97 98 for i, dev := range d.Devices { 99 if dev == nil { 100 _ = multierror.Append(&mErr, fmt.Errorf("device %d is nil", i)) 101 continue 102 } 103 104 if err := dev.Validate(); err != nil { 105 _ = multierror.Append(&mErr, multierror.Prefix(err, fmt.Sprintf("device %d: ", i))) 106 } 107 } 108 109 for k, v := range d.Attributes { 110 if err := v.Validate(); err != nil { 111 _ = multierror.Append(&mErr, fmt.Errorf("device attribute %q invalid: %v", k, err)) 112 } 113 } 114 115 return mErr.ErrorOrNil() 116 117 } 118 119 // Device is an instance of a particular device. 120 type Device struct { 121 // ID is the identifier for the device. 122 ID string 123 124 // Healthy marks whether the device is healthy and can be used for 125 // scheduling. 126 Healthy bool 127 128 // HealthDesc describes why the device may be unhealthy. 129 HealthDesc string 130 131 // HwLocality captures hardware locality information for the device. 132 HwLocality *DeviceLocality 133 } 134 135 // Validate validates that the device is valid 136 func (d *Device) Validate() error { 137 if d.ID == "" { 138 return fmt.Errorf("device ID must be specified") 139 } 140 141 return nil 142 } 143 144 // DeviceLocality captures hardware locality information for a device. 145 type DeviceLocality struct { 146 // PciBusID is the PCI bus ID of the device. 147 PciBusID string 148 } 149 150 // ContainerReservation describes how to mount a device into a container. A 151 // container is an isolated environment that shares the host's OS. 152 type ContainerReservation struct { 153 // Envs are a set of environment variables to set for the task. 154 Envs map[string]string 155 156 // Mounts are used to mount host volumes into a container that may include 157 // libraries, etc. 158 Mounts []*Mount 159 160 // Devices are the set of devices to mount into the container. 161 Devices []*DeviceSpec 162 } 163 164 // Mount is used to mount a host directory into a container. 165 type Mount struct { 166 // TaskPath is the location in the task's file system to mount. 167 TaskPath string 168 169 // HostPath is the host directory path to mount. 170 HostPath string 171 172 // ReadOnly defines whether the mount should be read only to the task. 173 ReadOnly bool 174 } 175 176 // DeviceSpec captures how to mount a device into a container. 177 type DeviceSpec struct { 178 // TaskPath is the location to mount the device in the task's file system. 179 TaskPath string 180 181 // HostPath is the host location of the device. 182 HostPath string 183 184 // CgroupPerms defines the permissions to use when mounting the device. 185 CgroupPerms string 186 } 187 188 // StatsResponse returns statistics for each device group. 189 type StatsResponse struct { 190 // Groups contains statistics for each device group. 191 Groups []*DeviceGroupStats 192 193 // Error is populated when collecting statistics has failed. 194 Error error 195 } 196 197 // NewStatsError takes an error and returns a stats response 198 func NewStatsError(err error) *StatsResponse { 199 return &StatsResponse{ 200 Error: err, 201 } 202 } 203 204 // DeviceGroupStats contains statistics for each device of a particular 205 // device group, identified by the vendor, type and name of the device. 206 type DeviceGroupStats struct { 207 Vendor string 208 Type string 209 Name string 210 211 // InstanceStats is a mapping of each device ID to its statistics. 212 InstanceStats map[string]*DeviceStats 213 } 214 215 // DeviceStats is the statistics for an individual device 216 type DeviceStats struct { 217 // Summary exposes a single summary metric that should be the most 218 // informative to users. 219 Summary *structs.StatValue 220 221 // Stats contains the verbose statistics for the device. 222 Stats *structs.StatObject 223 224 // Timestamp is the time the statistics were collected. 225 Timestamp time.Time 226 }