github.com/smithx10/nomad@v0.9.1-rc1/client/pluginmanager/drivermanager/manager.go (about) 1 package drivermanager 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 8 log "github.com/hashicorp/go-hclog" 9 plugin "github.com/hashicorp/go-plugin" 10 "github.com/hashicorp/nomad/client/pluginmanager" 11 "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" 12 "github.com/hashicorp/nomad/helper/pluginutils/loader" 13 "github.com/hashicorp/nomad/nomad/structs" 14 "github.com/hashicorp/nomad/plugins/base" 15 "github.com/hashicorp/nomad/plugins/drivers" 16 pstructs "github.com/hashicorp/nomad/plugins/shared/structs" 17 ) 18 19 // ErrDriverNotFound is returned during Dispense when the requested driver 20 // plugin is not found in the plugin catalog 21 var ErrDriverNotFound = fmt.Errorf("driver not found") 22 23 // Manager is the interface used to manage driver plugins 24 type Manager interface { 25 pluginmanager.PluginManager 26 27 // Dispense returns a drivers.DriverPlugin for the given driver plugin name 28 // handling reattaching to an existing driver if available 29 Dispense(driver string) (drivers.DriverPlugin, error) 30 } 31 32 // EventHandler is a callback to be called for a task. 33 // The handler should not block execution. 34 type EventHandler func(*drivers.TaskEvent) 35 36 // TaskEventHandlerFactory returns an event handler for a given allocID/task name 37 type TaskEventHandlerFactory func(allocID, taskName string) EventHandler 38 39 // StateStorage is used to persist the driver managers state across 40 // agent restarts. 41 type StateStorage interface { 42 // GetDevicePluginState is used to retrieve the device manager's plugin 43 // state. 44 GetDriverPluginState() (*state.PluginState, error) 45 46 // PutDevicePluginState is used to store the device manager's plugin 47 // state. 48 PutDriverPluginState(state *state.PluginState) error 49 } 50 51 // UpdateNodeDriverInfoFn is the callback used to update the node from 52 // fingerprinting 53 type UpdateNodeDriverInfoFn func(string, *structs.DriverInfo) 54 55 // StorePluginReattachFn is used to store plugin reattachment configurations. 56 type StorePluginReattachFn func(*plugin.ReattachConfig) error 57 58 // FetchPluginReattachFn is used to retrieve the stored plugin reattachment 59 // configuration. 60 type FetchPluginReattachFn func() (*plugin.ReattachConfig, bool) 61 62 // Config is used to configure a driver manager 63 type Config struct { 64 // Logger is the logger used by the device manager 65 Logger log.Logger 66 67 // Loader is the plugin loader 68 Loader loader.PluginCatalog 69 70 // PluginConfig is the config passed to the launched plugins 71 PluginConfig *base.AgentConfig 72 73 // Updater is used to update the node when driver information changes 74 Updater UpdateNodeDriverInfoFn 75 76 // EventHandlerFactory is used to retrieve a task event handler 77 EventHandlerFactory TaskEventHandlerFactory 78 79 // State is used to manage the device managers state 80 State StateStorage 81 82 // AllowedDrivers if set will only start driver plugins for the given 83 // drivers 84 AllowedDrivers map[string]struct{} 85 86 // BlockedDrivers if set will not allow the given driver plugins to start 87 BlockedDrivers map[string]struct{} 88 } 89 90 // manager is used to manage a set of driver plugins 91 type manager struct { 92 // logger is the logger used by the device manager 93 logger log.Logger 94 95 // state is used to manage the device managers state 96 state StateStorage 97 98 // ctx is used to shutdown the device manager 99 ctx context.Context 100 cancel context.CancelFunc 101 102 // loader is the plugin loader 103 loader loader.PluginCatalog 104 105 // pluginConfig is the config passed to the launched plugins 106 pluginConfig *base.AgentConfig 107 108 // updater is used to update the node when device information changes 109 updater UpdateNodeDriverInfoFn 110 111 // eventHandlerFactory is passed to the instance managers and used to forward 112 // task events 113 eventHandlerFactory TaskEventHandlerFactory 114 115 // instances is the list of managed devices, access is serialized by instanceMu 116 instances map[string]*instanceManager 117 instancesMu sync.RWMutex 118 119 // reattachConfigs stores the plugin reattach configs 120 reattachConfigs map[loader.PluginID]*pstructs.ReattachConfig 121 reattachConfigLock sync.Mutex 122 123 // allows/block lists 124 allowedDrivers map[string]struct{} 125 blockedDrivers map[string]struct{} 126 127 // readyCh is ticked once at the end of Run() 128 readyCh chan struct{} 129 } 130 131 // New returns a new driver manager 132 func New(c *Config) *manager { 133 ctx, cancel := context.WithCancel(context.Background()) 134 return &manager{ 135 logger: c.Logger.Named("driver_mgr"), 136 state: c.State, 137 ctx: ctx, 138 cancel: cancel, 139 loader: c.Loader, 140 pluginConfig: c.PluginConfig, 141 updater: c.Updater, 142 eventHandlerFactory: c.EventHandlerFactory, 143 instances: make(map[string]*instanceManager), 144 reattachConfigs: make(map[loader.PluginID]*pstructs.ReattachConfig), 145 allowedDrivers: c.AllowedDrivers, 146 blockedDrivers: c.BlockedDrivers, 147 readyCh: make(chan struct{}), 148 } 149 } 150 151 // PluginType returns the type of plugin this manager mananges 152 func (*manager) PluginType() string { return base.PluginTypeDriver } 153 154 // Run starts the manager, initializes driver plugins and blocks until Shutdown 155 // is called. 156 func (m *manager) Run() { 157 // Load any previous plugin reattach configuration 158 if err := m.loadReattachConfigs(); err != nil { 159 m.logger.Warn("unable to load driver plugin reattach configs, a driver process may have been leaked", 160 "error", err) 161 } 162 163 // Get driver plugins 164 driversPlugins := m.loader.Catalog()[base.PluginTypeDriver] 165 if len(driversPlugins) == 0 { 166 m.logger.Debug("exiting since there are no driver plugins") 167 m.cancel() 168 return 169 } 170 171 var skippedDrivers []string 172 for _, d := range driversPlugins { 173 id := loader.PluginInfoID(d) 174 if m.isDriverBlocked(id.Name) { 175 skippedDrivers = append(skippedDrivers, id.Name) 176 continue 177 } 178 179 storeFn := func(c *plugin.ReattachConfig) error { 180 return m.storePluginReattachConfig(id, c) 181 } 182 fetchFn := func() (*plugin.ReattachConfig, bool) { 183 return m.fetchPluginReattachConfig(id) 184 } 185 186 instance := newInstanceManager(&instanceManagerConfig{ 187 Logger: m.logger, 188 Ctx: m.ctx, 189 Loader: m.loader, 190 StoreReattach: storeFn, 191 FetchReattach: fetchFn, 192 PluginConfig: m.pluginConfig, 193 ID: &id, 194 UpdateNodeFromDriver: m.updater, 195 EventHandlerFactory: m.eventHandlerFactory, 196 }) 197 198 m.instancesMu.Lock() 199 m.instances[id.Name] = instance 200 m.instancesMu.Unlock() 201 } 202 203 if len(skippedDrivers) > 0 { 204 m.logger.Debug("drivers skipped due to allow/block list", "skipped_drivers", skippedDrivers) 205 } 206 207 // signal ready 208 close(m.readyCh) 209 } 210 211 // Shutdown cleans up all the plugins 212 func (m *manager) Shutdown() { 213 // Cancel the context to stop any requests 214 m.cancel() 215 216 m.instancesMu.RLock() 217 defer m.instancesMu.RUnlock() 218 219 // Go through and shut everything down 220 for _, i := range m.instances { 221 i.cleanup() 222 } 223 } 224 225 func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} { 226 ctx, cancel := context.WithCancel(ctx) 227 go m.waitForFirstFingerprint(ctx, cancel) 228 return ctx.Done() 229 } 230 231 func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.CancelFunc) { 232 defer cancel() 233 // We don't want to start initial fingerprint wait until Run loop has 234 // finished 235 select { 236 case <-m.readyCh: 237 case <-ctx.Done(): 238 // parent context canceled or timedout 239 return 240 case <-m.ctx.Done(): 241 // shutdown called 242 return 243 } 244 245 var mu sync.Mutex 246 driversByStatus := map[drivers.HealthState][]string{} 247 248 var wg sync.WaitGroup 249 250 recordDriver := func(name string, lastHeath drivers.HealthState) { 251 mu.Lock() 252 defer mu.Unlock() 253 254 updated := append(driversByStatus[lastHeath], name) 255 driversByStatus[lastHeath] = updated 256 } 257 258 // loop through instances and wait for each to finish initial fingerprint 259 m.instancesMu.RLock() 260 for n, i := range m.instances { 261 wg.Add(1) 262 go func(name string, instance *instanceManager) { 263 defer wg.Done() 264 instance.WaitForFirstFingerprint(ctx) 265 recordDriver(name, instance.getLastHealth()) 266 }(n, i) 267 } 268 m.instancesMu.RUnlock() 269 wg.Wait() 270 271 m.logger.Debug("detected drivers", "drivers", driversByStatus) 272 } 273 274 func (m *manager) loadReattachConfigs() error { 275 m.reattachConfigLock.Lock() 276 defer m.reattachConfigLock.Unlock() 277 278 s, err := m.state.GetDriverPluginState() 279 if err != nil { 280 return err 281 } 282 283 if s != nil { 284 for name, c := range s.ReattachConfigs { 285 if m.isDriverBlocked(name) { 286 m.logger.Warn("reattach config for driver plugin found but driver is blocked due to allow/block list, killing plugin", 287 "driver", name) 288 m.shutdownBlockedDriver(name, c) 289 continue 290 } 291 292 id := loader.PluginID{ 293 PluginType: base.PluginTypeDriver, 294 Name: name, 295 } 296 297 m.reattachConfigs[id] = c 298 } 299 } 300 return nil 301 } 302 303 // shutdownBlockedDriver is used to forcefully shutdown a running driver plugin 304 // when it has been blocked due to allow/block lists 305 func (m *manager) shutdownBlockedDriver(name string, reattach *pstructs.ReattachConfig) { 306 c, err := pstructs.ReattachConfigToGoPlugin(reattach) 307 if err != nil { 308 m.logger.Warn("failed to reattach and kill blocked driver plugin", 309 "driver", name, "error", err) 310 return 311 312 } 313 pluginInstance, err := m.loader.Reattach(name, base.PluginTypeDriver, c) 314 if err != nil { 315 m.logger.Warn("failed to reattach and kill blocked driver plugin", 316 "driver", name, "error", err) 317 return 318 } 319 320 if !pluginInstance.Exited() { 321 pluginInstance.Kill() 322 } 323 } 324 325 // storePluginReattachConfig is used as a callback to the instance managers and 326 // persists thhe plugin reattach configurations. 327 func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error { 328 m.reattachConfigLock.Lock() 329 defer m.reattachConfigLock.Unlock() 330 331 if c == nil { 332 delete(m.reattachConfigs, id) 333 } else { 334 // Store the new reattach config 335 m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c) 336 } 337 // Persist the state 338 s := &state.PluginState{ 339 ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)), 340 } 341 342 for id, c := range m.reattachConfigs { 343 s.ReattachConfigs[id.Name] = c 344 } 345 346 return m.state.PutDriverPluginState(s) 347 } 348 349 // fetchPluginReattachConfig is used as a callback to the instance managers and 350 // retrieves the plugin reattach config. If it has not been stored it will 351 // return nil 352 func (m *manager) fetchPluginReattachConfig(id loader.PluginID) (*plugin.ReattachConfig, bool) { 353 m.reattachConfigLock.Lock() 354 defer m.reattachConfigLock.Unlock() 355 356 if cfg, ok := m.reattachConfigs[id]; ok { 357 c, err := pstructs.ReattachConfigToGoPlugin(cfg) 358 if err != nil { 359 m.logger.Warn("failed to read plugin reattach config", "config", cfg, "error", err) 360 delete(m.reattachConfigs, id) 361 return nil, false 362 } 363 return c, true 364 } 365 return nil, false 366 } 367 368 func (m *manager) Dispense(d string) (drivers.DriverPlugin, error) { 369 m.instancesMu.RLock() 370 defer m.instancesMu.RUnlock() 371 if instance, ok := m.instances[d]; ok { 372 return instance.dispense() 373 } 374 375 return nil, ErrDriverNotFound 376 } 377 378 func (m *manager) isDriverBlocked(name string) bool { 379 // Block drivers that are not in the allowed list if it is set. 380 if _, ok := m.allowedDrivers[name]; len(m.allowedDrivers) > 0 && !ok { 381 return true 382 } 383 384 // Block drivers that are in the blocked list 385 if _, ok := m.blockedDrivers[name]; ok { 386 return true 387 } 388 return false 389 }