github.com/bigcommerce/nomad@v0.9.3-bc/client/devicemanager/manager.go (about) 1 // Package devicemanager is used to manage device plugins 2 package devicemanager 3 4 import ( 5 "context" 6 "fmt" 7 "sync" 8 "time" 9 10 log "github.com/hashicorp/go-hclog" 11 multierror "github.com/hashicorp/go-multierror" 12 plugin "github.com/hashicorp/go-plugin" 13 "github.com/hashicorp/nomad/client/devicemanager/state" 14 "github.com/hashicorp/nomad/client/pluginmanager" 15 "github.com/hashicorp/nomad/helper/pluginutils/loader" 16 "github.com/hashicorp/nomad/nomad/structs" 17 "github.com/hashicorp/nomad/plugins/base" 18 "github.com/hashicorp/nomad/plugins/device" 19 pstructs "github.com/hashicorp/nomad/plugins/shared/structs" 20 ) 21 22 // Manager is the interface used to manage device plugins 23 type Manager interface { 24 pluginmanager.PluginManager 25 26 // Reserve is used to reserve a set of devices 27 Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) 28 29 // AllStats is used to retrieve all the latest statistics for all devices. 30 AllStats() []*device.DeviceGroupStats 31 32 // DeviceStats returns the device statistics for the given device. 33 DeviceStats(d *structs.AllocatedDeviceResource) (*device.DeviceGroupStats, error) 34 } 35 36 // StateStorage is used to persist the device managers state across 37 // agent restarts. 38 type StateStorage interface { 39 // GetDevicePluginState is used to retrieve the device manager's plugin 40 // state. 41 GetDevicePluginState() (*state.PluginState, error) 42 43 // PutDevicePluginState is used to store the device manager's plugin 44 // state. 45 PutDevicePluginState(state *state.PluginState) error 46 } 47 48 // UpdateNodeDevices is a callback for updating the set of devices on a node. 49 type UpdateNodeDevicesFn func(devices []*structs.NodeDeviceResource) 50 51 // StorePluginReattachFn is used to store plugin reattachment configurations. 52 type StorePluginReattachFn func(*plugin.ReattachConfig) error 53 54 // Config is used to configure a device manager 55 type Config struct { 56 // Logger is the logger used by the device manager 57 Logger log.Logger 58 59 // Loader is the plugin loader 60 Loader loader.PluginCatalog 61 62 // PluginConfig is the config passed to the launched plugins 63 PluginConfig *base.AgentConfig 64 65 // Updater is used to update the node when device information changes 66 Updater UpdateNodeDevicesFn 67 68 // StatsInterval is the interval at which to collect statistics 69 StatsInterval time.Duration 70 71 // State is used to manage the device managers state 72 State StateStorage 73 } 74 75 // manager is used to manage a set of device plugins 76 type manager struct { 77 // logger is the logger used by the device manager 78 logger log.Logger 79 80 // state is used to manage the device managers state 81 state StateStorage 82 83 // ctx is used to shutdown the device manager 84 ctx context.Context 85 cancel context.CancelFunc 86 87 // loader is the plugin loader 88 loader loader.PluginCatalog 89 90 // pluginConfig is the config passed to the launched plugins 91 pluginConfig *base.AgentConfig 92 93 // updater is used to update the node when device information changes 94 updater UpdateNodeDevicesFn 95 96 // statsInterval is the duration at which to collect statistics 97 statsInterval time.Duration 98 99 // fingerprintResCh is used to be triggered that there are new devices 100 fingerprintResCh chan struct{} 101 102 // instances is the list of managed devices 103 instances map[loader.PluginID]*instanceManager 104 105 // reattachConfigs stores the plugin reattach configs 106 reattachConfigs map[loader.PluginID]*pstructs.ReattachConfig 107 reattachConfigLock sync.Mutex 108 } 109 110 // New returns a new device manager 111 func New(c *Config) *manager { 112 ctx, cancel := context.WithCancel(context.Background()) 113 return &manager{ 114 logger: c.Logger.Named("device_mgr"), 115 state: c.State, 116 ctx: ctx, 117 cancel: cancel, 118 loader: c.Loader, 119 pluginConfig: c.PluginConfig, 120 updater: c.Updater, 121 instances: make(map[loader.PluginID]*instanceManager), 122 reattachConfigs: make(map[loader.PluginID]*pstructs.ReattachConfig), 123 fingerprintResCh: make(chan struct{}, 1), 124 } 125 } 126 127 // PluginType identifies this manager to the plugin manager and satisfies the PluginManager interface. 128 func (*manager) PluginType() string { return base.PluginTypeDevice } 129 130 // Run starts thed device manager. The manager will shutdown any previously 131 // launched plugin and then begin fingerprinting and stats collection on all new 132 // device plugins. 133 func (m *manager) Run() { 134 // Check if there are any plugins that didn't get cleanly shutdown before 135 // and if there are shut them down. 136 m.cleanupStalePlugins() 137 138 // Get device plugins 139 devices := m.loader.Catalog()[base.PluginTypeDevice] 140 if len(devices) == 0 { 141 m.logger.Debug("exiting since there are no device plugins") 142 m.cancel() 143 return 144 } 145 146 for _, d := range devices { 147 id := loader.PluginInfoID(d) 148 storeFn := func(c *plugin.ReattachConfig) error { 149 id := id 150 return m.storePluginReattachConfig(id, c) 151 } 152 m.instances[id] = newInstanceManager(&instanceManagerConfig{ 153 Logger: m.logger, 154 Ctx: m.ctx, 155 Loader: m.loader, 156 StoreReattach: storeFn, 157 PluginConfig: m.pluginConfig, 158 Id: &id, 159 FingerprintOutCh: m.fingerprintResCh, 160 StatsInterval: m.statsInterval, 161 }) 162 } 163 164 // Now start the fingerprint handler 165 go m.fingerprint() 166 } 167 168 // fingerprint is the main fingerprint loop 169 func (m *manager) fingerprint() { 170 for { 171 select { 172 case <-m.ctx.Done(): 173 return 174 case <-m.fingerprintResCh: 175 } 176 177 // Collect the data 178 var fingerprinted []*device.DeviceGroup 179 for _, i := range m.instances { 180 fingerprinted = append(fingerprinted, i.Devices()...) 181 } 182 183 // Convert and update 184 out := make([]*structs.NodeDeviceResource, len(fingerprinted)) 185 for i, f := range fingerprinted { 186 out[i] = convertDeviceGroup(f) 187 } 188 189 // Call the updater 190 m.updater(out) 191 } 192 } 193 194 // Shutdown cleans up all the plugins 195 func (m *manager) Shutdown() { 196 // Cancel the context to stop any requests 197 m.cancel() 198 199 // Go through and shut everything down 200 for _, i := range m.instances { 201 i.cleanup() 202 } 203 } 204 205 func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} { 206 ctx, cancel := context.WithCancel(ctx) 207 go func() { 208 var wg sync.WaitGroup 209 for i := range m.instances { 210 wg.Add(1) 211 go func(instance *instanceManager) { 212 instance.WaitForFirstFingerprint(ctx) 213 wg.Done() 214 }(m.instances[i]) 215 } 216 wg.Wait() 217 cancel() 218 }() 219 return ctx.Done() 220 } 221 222 // Reserve reserves the given allocated device. If the device is unknown, an 223 // UnknownDeviceErr is returned. 224 func (m *manager) Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) { 225 // Go through each plugin and see if it can reserve the resources 226 for _, i := range m.instances { 227 if !i.HasDevices(d) { 228 continue 229 } 230 231 // We found a match so reserve 232 return i.Reserve(d) 233 } 234 235 return nil, UnknownDeviceErrFromAllocated("failed to reserve devices", d) 236 } 237 238 // AllStats returns statistics for all the devices 239 func (m *manager) AllStats() []*device.DeviceGroupStats { 240 // Go through each plugin and collect stats 241 var stats []*device.DeviceGroupStats 242 for _, i := range m.instances { 243 stats = append(stats, i.AllStats()...) 244 } 245 246 return stats 247 } 248 249 // DeviceStats returns the statistics for the passed devices. If the device is unknown, an 250 // UnknownDeviceErr is returned. 251 func (m *manager) DeviceStats(d *structs.AllocatedDeviceResource) (*device.DeviceGroupStats, error) { 252 // Go through each plugin and see if it has the requested devices 253 for _, i := range m.instances { 254 if !i.HasDevices(d) { 255 continue 256 } 257 258 // We found a match so reserve 259 return i.DeviceStats(d), nil 260 } 261 262 return nil, UnknownDeviceErrFromAllocated("failed to collect statistics", d) 263 } 264 265 // cleanupStalePlugins reads the device managers state and shuts down any 266 // previously launched plugin. 267 func (m *manager) cleanupStalePlugins() error { 268 269 // Read the old plugin state 270 s, err := m.state.GetDevicePluginState() 271 if err != nil { 272 return fmt.Errorf("failed to read plugin state: %v", err) 273 } 274 275 // No state was stored so there is nothing to do. 276 if s == nil { 277 return nil 278 } 279 280 // For each plugin go through and try to shut it down 281 var mErr multierror.Error 282 for name, c := range s.ReattachConfigs { 283 rc, err := pstructs.ReattachConfigToGoPlugin(c) 284 if err != nil { 285 multierror.Append(&mErr, fmt.Errorf("failed to convert reattach config: %v", err)) 286 continue 287 } 288 289 instance, err := m.loader.Reattach(name, base.PluginTypeDevice, rc) 290 if err != nil { 291 multierror.Append(&mErr, fmt.Errorf("failed to reattach to plugin %q: %v", name, err)) 292 continue 293 } 294 295 // Kill the instance 296 instance.Kill() 297 } 298 299 return mErr.ErrorOrNil() 300 } 301 302 // storePluginReattachConfig is used as a callback to the instance managers and 303 // persists thhe plugin reattach configurations. 304 func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error { 305 m.reattachConfigLock.Lock() 306 defer m.reattachConfigLock.Unlock() 307 308 // Store the new reattach config 309 m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c) 310 311 // Persist the state 312 s := &state.PluginState{ 313 ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)), 314 } 315 316 for id, c := range m.reattachConfigs { 317 s.ReattachConfigs[id.Name] = c 318 } 319 320 return m.state.PutDevicePluginState(s) 321 }