github.com/bigcommerce/nomad@v0.9.3-bc/client/pluginmanager/drivermanager/manager.go (about) 1 package drivermanager 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 8 log "github.com/hashicorp/go-hclog" 9 plugin "github.com/hashicorp/go-plugin" 10 "github.com/hashicorp/nomad/client/pluginmanager" 11 "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" 12 "github.com/hashicorp/nomad/helper/pluginutils/loader" 13 "github.com/hashicorp/nomad/nomad/structs" 14 "github.com/hashicorp/nomad/plugins/base" 15 "github.com/hashicorp/nomad/plugins/drivers" 16 pstructs "github.com/hashicorp/nomad/plugins/shared/structs" 17 ) 18 19 // ErrDriverNotFound is returned during Dispense when the requested driver 20 // plugin is not found in the plugin catalog 21 var ErrDriverNotFound = fmt.Errorf("driver not found") 22 23 // Manager is the interface used to manage driver plugins 24 type Manager interface { 25 pluginmanager.PluginManager 26 27 // Dispense returns a drivers.DriverPlugin for the given driver plugin name 28 // handling reattaching to an existing driver if available 29 Dispense(driver string) (drivers.DriverPlugin, error) 30 } 31 32 // TaskExecHandler is function to be called for executing commands in a task 33 type TaskExecHandler func( 34 ctx context.Context, 35 command []string, 36 tty bool, 37 stream drivers.ExecTaskStream) error 38 39 // EventHandler is a callback to be called for a task. 40 // The handler should not block execution. 41 type EventHandler func(*drivers.TaskEvent) 42 43 // TaskEventHandlerFactory returns an event handler for a given allocID/task name 44 type TaskEventHandlerFactory func(allocID, taskName string) EventHandler 45 46 // StateStorage is used to persist the driver managers state across 47 // agent restarts. 48 type StateStorage interface { 49 // GetDevicePluginState is used to retrieve the device manager's plugin 50 // state. 51 GetDriverPluginState() (*state.PluginState, error) 52 53 // PutDevicePluginState is used to store the device manager's plugin 54 // state. 55 PutDriverPluginState(state *state.PluginState) error 56 } 57 58 // UpdateNodeDriverInfoFn is the callback used to update the node from 59 // fingerprinting 60 type UpdateNodeDriverInfoFn func(string, *structs.DriverInfo) 61 62 // StorePluginReattachFn is used to store plugin reattachment configurations. 63 type StorePluginReattachFn func(*plugin.ReattachConfig) error 64 65 // FetchPluginReattachFn is used to retrieve the stored plugin reattachment 66 // configuration. 67 type FetchPluginReattachFn func() (*plugin.ReattachConfig, bool) 68 69 // Config is used to configure a driver manager 70 type Config struct { 71 // Logger is the logger used by the device manager 72 Logger log.Logger 73 74 // Loader is the plugin loader 75 Loader loader.PluginCatalog 76 77 // PluginConfig is the config passed to the launched plugins 78 PluginConfig *base.AgentConfig 79 80 // Updater is used to update the node when driver information changes 81 Updater UpdateNodeDriverInfoFn 82 83 // EventHandlerFactory is used to retrieve a task event handler 84 EventHandlerFactory TaskEventHandlerFactory 85 86 // State is used to manage the device managers state 87 State StateStorage 88 89 // AllowedDrivers if set will only start driver plugins for the given 90 // drivers 91 AllowedDrivers map[string]struct{} 92 93 // BlockedDrivers if set will not allow the given driver plugins to start 94 BlockedDrivers map[string]struct{} 95 } 96 97 // manager is used to manage a set of driver plugins 98 type manager struct { 99 // logger is the logger used by the device manager 100 logger log.Logger 101 102 // state is used to manage the device managers state 103 state StateStorage 104 105 // ctx is used to shutdown the device manager 106 ctx context.Context 107 cancel context.CancelFunc 108 109 // loader is the plugin loader 110 loader loader.PluginCatalog 111 112 // pluginConfig is the config passed to the launched plugins 113 pluginConfig *base.AgentConfig 114 115 // updater is used to update the node when device information changes 116 updater UpdateNodeDriverInfoFn 117 118 // eventHandlerFactory is passed to the instance managers and used to forward 119 // task events 120 eventHandlerFactory TaskEventHandlerFactory 121 122 // instances is the list of managed devices, access is serialized by instanceMu 123 instances map[string]*instanceManager 124 instancesMu sync.RWMutex 125 126 // reattachConfigs stores the plugin reattach configs 127 reattachConfigs map[loader.PluginID]*pstructs.ReattachConfig 128 reattachConfigLock sync.Mutex 129 130 // allows/block lists 131 allowedDrivers map[string]struct{} 132 blockedDrivers map[string]struct{} 133 134 // readyCh is ticked once at the end of Run() 135 readyCh chan struct{} 136 } 137 138 // New returns a new driver manager 139 func New(c *Config) *manager { 140 ctx, cancel := context.WithCancel(context.Background()) 141 return &manager{ 142 logger: c.Logger.Named("driver_mgr"), 143 state: c.State, 144 ctx: ctx, 145 cancel: cancel, 146 loader: c.Loader, 147 pluginConfig: c.PluginConfig, 148 updater: c.Updater, 149 eventHandlerFactory: c.EventHandlerFactory, 150 instances: make(map[string]*instanceManager), 151 reattachConfigs: make(map[loader.PluginID]*pstructs.ReattachConfig), 152 allowedDrivers: c.AllowedDrivers, 153 blockedDrivers: c.BlockedDrivers, 154 readyCh: make(chan struct{}), 155 } 156 } 157 158 // PluginType returns the type of plugin this manager mananges 159 func (*manager) PluginType() string { return base.PluginTypeDriver } 160 161 // Run starts the manager, initializes driver plugins and blocks until Shutdown 162 // is called. 163 func (m *manager) Run() { 164 // Load any previous plugin reattach configuration 165 if err := m.loadReattachConfigs(); err != nil { 166 m.logger.Warn("unable to load driver plugin reattach configs, a driver process may have been leaked", 167 "error", err) 168 } 169 170 // Get driver plugins 171 driversPlugins := m.loader.Catalog()[base.PluginTypeDriver] 172 if len(driversPlugins) == 0 { 173 m.logger.Debug("exiting since there are no driver plugins") 174 m.cancel() 175 return 176 } 177 178 var skippedDrivers []string 179 for _, d := range driversPlugins { 180 id := loader.PluginInfoID(d) 181 if m.isDriverBlocked(id.Name) { 182 skippedDrivers = append(skippedDrivers, id.Name) 183 continue 184 } 185 186 storeFn := func(c *plugin.ReattachConfig) error { 187 return m.storePluginReattachConfig(id, c) 188 } 189 fetchFn := func() (*plugin.ReattachConfig, bool) { 190 return m.fetchPluginReattachConfig(id) 191 } 192 193 instance := newInstanceManager(&instanceManagerConfig{ 194 Logger: m.logger, 195 Ctx: m.ctx, 196 Loader: m.loader, 197 StoreReattach: storeFn, 198 FetchReattach: fetchFn, 199 PluginConfig: m.pluginConfig, 200 ID: &id, 201 UpdateNodeFromDriver: m.updater, 202 EventHandlerFactory: m.eventHandlerFactory, 203 }) 204 205 m.instancesMu.Lock() 206 m.instances[id.Name] = instance 207 m.instancesMu.Unlock() 208 } 209 210 if len(skippedDrivers) > 0 { 211 m.logger.Debug("drivers skipped due to allow/block list", "skipped_drivers", skippedDrivers) 212 } 213 214 // signal ready 215 close(m.readyCh) 216 } 217 218 // Shutdown cleans up all the plugins 219 func (m *manager) Shutdown() { 220 // Cancel the context to stop any requests 221 m.cancel() 222 223 m.instancesMu.RLock() 224 defer m.instancesMu.RUnlock() 225 226 // Go through and shut everything down 227 for _, i := range m.instances { 228 i.cleanup() 229 } 230 } 231 232 func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} { 233 ctx, cancel := context.WithCancel(ctx) 234 go m.waitForFirstFingerprint(ctx, cancel) 235 return ctx.Done() 236 } 237 238 func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.CancelFunc) { 239 defer cancel() 240 // We don't want to start initial fingerprint wait until Run loop has 241 // finished 242 select { 243 case <-m.readyCh: 244 case <-ctx.Done(): 245 // parent context canceled or timedout 246 return 247 case <-m.ctx.Done(): 248 // shutdown called 249 return 250 } 251 252 var mu sync.Mutex 253 driversByStatus := map[drivers.HealthState][]string{} 254 255 var wg sync.WaitGroup 256 257 recordDriver := func(name string, lastHeath drivers.HealthState) { 258 mu.Lock() 259 defer mu.Unlock() 260 261 updated := append(driversByStatus[lastHeath], name) 262 driversByStatus[lastHeath] = updated 263 } 264 265 // loop through instances and wait for each to finish initial fingerprint 266 m.instancesMu.RLock() 267 for n, i := range m.instances { 268 wg.Add(1) 269 go func(name string, instance *instanceManager) { 270 defer wg.Done() 271 instance.WaitForFirstFingerprint(ctx) 272 recordDriver(name, instance.getLastHealth()) 273 }(n, i) 274 } 275 m.instancesMu.RUnlock() 276 wg.Wait() 277 278 m.logger.Debug("detected drivers", "drivers", driversByStatus) 279 } 280 281 func (m *manager) loadReattachConfigs() error { 282 m.reattachConfigLock.Lock() 283 defer m.reattachConfigLock.Unlock() 284 285 s, err := m.state.GetDriverPluginState() 286 if err != nil { 287 return err 288 } 289 290 if s != nil { 291 for name, c := range s.ReattachConfigs { 292 if m.isDriverBlocked(name) { 293 m.logger.Warn("reattach config for driver plugin found but driver is blocked due to allow/block list, killing plugin", 294 "driver", name) 295 m.shutdownBlockedDriver(name, c) 296 continue 297 } 298 299 id := loader.PluginID{ 300 PluginType: base.PluginTypeDriver, 301 Name: name, 302 } 303 304 m.reattachConfigs[id] = c 305 } 306 } 307 return nil 308 } 309 310 // shutdownBlockedDriver is used to forcefully shutdown a running driver plugin 311 // when it has been blocked due to allow/block lists 312 func (m *manager) shutdownBlockedDriver(name string, reattach *pstructs.ReattachConfig) { 313 c, err := pstructs.ReattachConfigToGoPlugin(reattach) 314 if err != nil { 315 m.logger.Warn("failed to reattach and kill blocked driver plugin", 316 "driver", name, "error", err) 317 return 318 319 } 320 pluginInstance, err := m.loader.Reattach(name, base.PluginTypeDriver, c) 321 if err != nil { 322 m.logger.Warn("failed to reattach and kill blocked driver plugin", 323 "driver", name, "error", err) 324 return 325 } 326 327 if !pluginInstance.Exited() { 328 pluginInstance.Kill() 329 } 330 } 331 332 // storePluginReattachConfig is used as a callback to the instance managers and 333 // persists thhe plugin reattach configurations. 334 func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error { 335 m.reattachConfigLock.Lock() 336 defer m.reattachConfigLock.Unlock() 337 338 if c == nil { 339 delete(m.reattachConfigs, id) 340 } else { 341 // Store the new reattach config 342 m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c) 343 } 344 // Persist the state 345 s := &state.PluginState{ 346 ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)), 347 } 348 349 for id, c := range m.reattachConfigs { 350 s.ReattachConfigs[id.Name] = c 351 } 352 353 return m.state.PutDriverPluginState(s) 354 } 355 356 // fetchPluginReattachConfig is used as a callback to the instance managers and 357 // retrieves the plugin reattach config. If it has not been stored it will 358 // return nil 359 func (m *manager) fetchPluginReattachConfig(id loader.PluginID) (*plugin.ReattachConfig, bool) { 360 m.reattachConfigLock.Lock() 361 defer m.reattachConfigLock.Unlock() 362 363 if cfg, ok := m.reattachConfigs[id]; ok { 364 c, err := pstructs.ReattachConfigToGoPlugin(cfg) 365 if err != nil { 366 m.logger.Warn("failed to read plugin reattach config", "config", cfg, "error", err) 367 delete(m.reattachConfigs, id) 368 return nil, false 369 } 370 return c, true 371 } 372 return nil, false 373 } 374 375 func (m *manager) Dispense(d string) (drivers.DriverPlugin, error) { 376 m.instancesMu.RLock() 377 defer m.instancesMu.RUnlock() 378 if instance, ok := m.instances[d]; ok { 379 return instance.dispense() 380 } 381 382 return nil, ErrDriverNotFound 383 } 384 385 func (m *manager) isDriverBlocked(name string) bool { 386 // Block drivers that are not in the allowed list if it is set. 387 if _, ok := m.allowedDrivers[name]; len(m.allowedDrivers) > 0 && !ok { 388 return true 389 } 390 391 // Block drivers that are in the blocked list 392 if _, ok := m.blockedDrivers[name]; ok { 393 return true 394 } 395 return false 396 }