github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/pluginmanager/drivermanager/instance.go (about) 1 package drivermanager 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 log "github.com/hashicorp/go-hclog" 10 "github.com/hashicorp/nomad/helper/pluginutils/loader" 11 "github.com/hashicorp/nomad/helper/pluginutils/singleton" 12 "github.com/hashicorp/nomad/nomad/structs" 13 "github.com/hashicorp/nomad/plugins/base" 14 bstructs "github.com/hashicorp/nomad/plugins/base/structs" 15 "github.com/hashicorp/nomad/plugins/drivers" 16 ) 17 18 const ( 19 // driverFPBackoffBaseline is the baseline time for exponential backoff while 20 // fingerprinting a driver. 21 driverFPBackoffBaseline = 5 * time.Second 22 23 // driverFPBackoffLimit is the limit of the exponential backoff for fingerprinting 24 // a driver. 25 driverFPBackoffLimit = 2 * time.Minute 26 ) 27 28 // instanceManagerConfig configures a driver instance manager 29 type instanceManagerConfig struct { 30 // Logger is the logger used by the driver instance manager 31 Logger log.Logger 32 33 // Ctx is used to shutdown the driver instance manager 34 Ctx context.Context 35 36 // Loader is the plugin loader 37 Loader loader.PluginCatalog 38 39 // StoreReattach is used to store a plugins reattach config 40 StoreReattach StorePluginReattachFn 41 42 // FetchReattach is used to retrieve a plugin's reattach config 43 FetchReattach FetchPluginReattachFn 44 45 // PluginConfig is the config passed to the launched plugins 46 PluginConfig *base.AgentConfig 47 48 // ID is the ID of the plugin being managed 49 ID *loader.PluginID 50 51 // updateNodeFromDriver is the callback used to update the node from fingerprinting 52 UpdateNodeFromDriver UpdateNodeDriverInfoFn 53 54 // EventHandlerFactory is used to fetch a task event handler 55 EventHandlerFactory TaskEventHandlerFactory 56 } 57 58 // instanceManager is used to manage a single driver plugin 59 type instanceManager struct { 60 // logger is the logger used by the driver instance manager 61 logger log.Logger 62 63 // ctx is used to shutdown the driver manager 64 ctx context.Context 65 66 // cancel is used to shutdown management of this driver plugin 67 cancel context.CancelFunc 68 69 // loader is the plugin loader 70 loader loader.PluginCatalog 71 72 // storeReattach is used to store a plugins reattach config 73 storeReattach StorePluginReattachFn 74 75 // fetchReattach is used to retrieve a plugin's reattach config 76 fetchReattach FetchPluginReattachFn 77 78 // pluginConfig is the config passed to the launched plugins 79 pluginConfig *base.AgentConfig 80 81 // id is the ID of the plugin being managed 82 id *loader.PluginID 83 84 // plugin is the plugin instance being managed 85 plugin loader.PluginInstance 86 87 // driver is the driver plugin being managed 88 driver drivers.DriverPlugin 89 90 // pluginLock locks access to the driver and plugin 91 pluginLock sync.Mutex 92 93 // shutdownLock is used to serialize attempts to shutdown 94 shutdownLock sync.Mutex 95 96 // updateNodeFromDriver is the callback used to update the node from fingerprinting 97 updateNodeFromDriver UpdateNodeDriverInfoFn 98 99 // eventHandlerFactory is used to fetch a handler for a task event 100 eventHandlerFactory TaskEventHandlerFactory 101 102 // firstFingerprintCh is used to trigger that we have successfully 103 // fingerprinted once. It is used to gate launching the stats collection. 104 firstFingerprintCh chan struct{} 105 hasFingerprinted bool 106 107 // lastHealthState is the last known health fingerprinted by the manager 108 lastHealthState drivers.HealthState 109 lastHealthStateMu sync.Mutex 110 } 111 112 // newInstanceManager returns a new driver instance manager. It is expected that 113 // the context passed in the configuration is cancelled in order to shutdown 114 // launched goroutines. 115 func newInstanceManager(c *instanceManagerConfig) *instanceManager { 116 117 ctx, cancel := context.WithCancel(c.Ctx) 118 i := &instanceManager{ 119 logger: c.Logger.With("driver", c.ID.Name), 120 ctx: ctx, 121 cancel: cancel, 122 loader: c.Loader, 123 storeReattach: c.StoreReattach, 124 fetchReattach: c.FetchReattach, 125 pluginConfig: c.PluginConfig, 126 id: c.ID, 127 updateNodeFromDriver: c.UpdateNodeFromDriver, 128 eventHandlerFactory: c.EventHandlerFactory, 129 firstFingerprintCh: make(chan struct{}), 130 } 131 132 go i.run() 133 return i 134 } 135 136 // WaitForFirstFingerprint waits until either the plugin fingerprints, the 137 // passed context is done, or the plugin instance manager is shutdown. 138 func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) { 139 select { 140 case <-i.ctx.Done(): 141 case <-ctx.Done(): 142 case <-i.firstFingerprintCh: 143 } 144 } 145 146 // run is a long lived goroutine that starts the fingerprinting and stats 147 // collection goroutine and then shutsdown the plugin on exit. 148 func (i *instanceManager) run() { 149 // Dispense once to ensure we are given a valid plugin 150 if _, err := i.dispense(); err != nil { 151 i.logger.Error("dispensing initial plugin failed", "error", err) 152 return 153 } 154 155 // Create a waitgroup to block on shutdown for all created goroutines to 156 // exit 157 var wg sync.WaitGroup 158 159 // Start the fingerprinter 160 wg.Add(1) 161 go func() { 162 i.fingerprint() 163 wg.Done() 164 }() 165 166 // Start event handler 167 wg.Add(1) 168 go func() { 169 i.handleEvents() 170 wg.Done() 171 }() 172 173 // Do a final cleanup 174 wg.Wait() 175 i.cleanup() 176 } 177 178 // dispense is used to dispense a plugin. 179 func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) { 180 i.pluginLock.Lock() 181 defer i.pluginLock.Unlock() 182 183 // See if we already have a running instance 184 if i.plugin != nil && !i.plugin.Exited() { 185 return i.driver, nil 186 } 187 188 var pluginInstance loader.PluginInstance 189 dispenseFn := func() (loader.PluginInstance, error) { 190 return i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger) 191 } 192 193 if reattach, ok := i.fetchReattach(); ok { 194 // Reattach to existing plugin 195 pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach) 196 197 // If reattachment fails, get a new plugin instance 198 if err != nil { 199 i.logger.Warn("failed to reattach to plugin, starting new instance", "err", err) 200 pluginInstance, err = dispenseFn() 201 } 202 } else { 203 // Get an instance of the plugin 204 pluginInstance, err = dispenseFn() 205 } 206 207 if err != nil { 208 // Retry as the error just indicates the singleton has exited 209 if err == singleton.SingletonPluginExited { 210 pluginInstance, err = dispenseFn() 211 } 212 213 // If we still have an error there is a real problem 214 if err != nil { 215 return nil, fmt.Errorf("failed to start plugin: %v", err) 216 } 217 } 218 219 // Convert to a driver plugin 220 driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin) 221 if !ok { 222 pluginInstance.Kill() 223 return nil, fmt.Errorf("plugin loaded does not implement the driver interface") 224 } 225 226 // Store the plugin and driver 227 i.plugin = pluginInstance 228 i.driver = driver 229 230 // Store the reattach config 231 if c, ok := pluginInstance.ReattachConfig(); ok { 232 if err := i.storeReattach(c); err != nil { 233 i.logger.Error("error storing driver plugin reattach config", "error", err) 234 } 235 } 236 237 return driver, nil 238 } 239 240 // cleanup shutsdown the plugin 241 func (i *instanceManager) cleanup() { 242 i.shutdownLock.Lock() 243 i.pluginLock.Lock() 244 defer i.pluginLock.Unlock() 245 defer i.shutdownLock.Unlock() 246 247 if i.plugin == nil { 248 return 249 } 250 251 if internalPlugin, ok := i.plugin.Plugin().(drivers.InternalDriverPlugin); ok { 252 internalPlugin.Shutdown() 253 } 254 255 if !i.plugin.Exited() { 256 i.plugin.Kill() 257 if err := i.storeReattach(nil); err != nil { 258 i.logger.Warn("error clearing plugin reattach config from state store", "error", err) 259 } 260 } 261 262 i.cancel() 263 } 264 265 // dispenseFingerprintCh dispenses a driver and makes a Fingerprint RPC call 266 // to the driver. The fingerprint chan is returned along with the cancel func 267 // for the context used in the RPC. This cancel func should always be called 268 // when the caller is finished with the channel. 269 func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) { 270 driver, err := i.dispense() 271 if err != nil { 272 return nil, nil, err 273 } 274 275 ctx, cancel := context.WithCancel(i.ctx) 276 fingerCh, err := driver.Fingerprint(ctx) 277 if err != nil { 278 cancel() 279 return nil, nil, err 280 } 281 282 return fingerCh, cancel, nil 283 } 284 285 // fingerprint is the main loop for fingerprinting. 286 func (i *instanceManager) fingerprint() { 287 fpChan, cancel, err := i.dispenseFingerprintCh() 288 if err != nil { 289 i.logger.Error("failed to dispense driver plugin", "error", err) 290 } 291 292 // backoff and retry used if the RPC is closed by the other end 293 var backoff time.Duration 294 var retry int 295 for { 296 if backoff > 0 { 297 select { 298 case <-time.After(backoff): 299 case <-i.ctx.Done(): 300 cancel() 301 return 302 } 303 } 304 305 select { 306 case <-i.ctx.Done(): 307 cancel() 308 return 309 case fp, ok := <-fpChan: 310 if ok { 311 if fp.Err == nil { 312 i.handleFingerprint(fp) 313 } else { 314 i.logger.Warn("received fingerprint error from driver", "error", fp.Err) 315 i.handleFingerprintError() 316 } 317 continue 318 } 319 320 // if the channel is closed attempt to open a new one 321 newFpChan, newCancel, err := i.dispenseFingerprintCh() 322 if err != nil { 323 i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry) 324 i.handleFingerprintError() 325 326 // Calculate the new backoff 327 backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline 328 if backoff > driverFPBackoffLimit { 329 backoff = driverFPBackoffLimit 330 } 331 // Increment retry counter 332 retry++ 333 continue 334 } 335 cancel() 336 fpChan = newFpChan 337 cancel = newCancel 338 339 // Reset backoff 340 backoff = 0 341 retry = 0 342 } 343 } 344 } 345 346 // handleFingerprintError is called when an error occurred while fingerprinting 347 // and will set the driver to unhealthy 348 func (i *instanceManager) handleFingerprintError() { 349 di := &structs.DriverInfo{ 350 Healthy: false, 351 HealthDescription: "failed to fingerprint driver", 352 UpdateTime: time.Now(), 353 } 354 i.updateNodeFromDriver(i.id.Name, di) 355 } 356 357 // handleFingerprint updates the node with the current fingerprint status 358 func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) { 359 attrs := make(map[string]string, len(fp.Attributes)) 360 for key, attr := range fp.Attributes { 361 attrs[key] = attr.GoString() 362 } 363 di := &structs.DriverInfo{ 364 Attributes: attrs, 365 Detected: fp.Health != drivers.HealthStateUndetected, 366 Healthy: fp.Health == drivers.HealthStateHealthy, 367 HealthDescription: fp.HealthDescription, 368 UpdateTime: time.Now(), 369 } 370 i.updateNodeFromDriver(i.id.Name, di) 371 372 // log detected/undetected state changes after the initial fingerprint 373 i.lastHealthStateMu.Lock() 374 if i.hasFingerprinted { 375 if i.lastHealthState != fp.Health { 376 i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription) 377 } 378 } 379 i.lastHealthState = fp.Health 380 i.lastHealthStateMu.Unlock() 381 382 // if this is the first fingerprint, mark that we have received it 383 if !i.hasFingerprinted { 384 i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription) 385 close(i.firstFingerprintCh) 386 i.hasFingerprinted = true 387 } 388 } 389 390 // getLastHealth returns the most recent HealthState from fingerprinting 391 func (i *instanceManager) getLastHealth() drivers.HealthState { 392 i.lastHealthStateMu.Lock() 393 defer i.lastHealthStateMu.Unlock() 394 return i.lastHealthState 395 } 396 397 // dispenseTaskEventsCh dispenses a driver plugin and makes a TaskEvents RPC. 398 // The TaskEvent chan and cancel func for the RPC is return. The cancel func must 399 // be called by the caller to properly cleanup the context 400 func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) { 401 driver, err := i.dispense() 402 if err != nil { 403 return nil, nil, err 404 } 405 406 ctx, cancel := context.WithCancel(i.ctx) 407 eventsCh, err := driver.TaskEvents(ctx) 408 if err != nil { 409 cancel() 410 return nil, nil, err 411 } 412 413 return eventsCh, cancel, nil 414 } 415 416 // handleEvents is the main loop that receives task events from the driver 417 func (i *instanceManager) handleEvents() { 418 eventsCh, cancel, err := i.dispenseTaskEventsCh() 419 if err != nil { 420 i.logger.Error("failed to dispense driver", "error", err) 421 } 422 423 var backoff time.Duration 424 var retry int 425 for { 426 if backoff > 0 { 427 select { 428 case <-time.After(backoff): 429 case <-i.ctx.Done(): 430 cancel() 431 return 432 } 433 } 434 435 select { 436 case <-i.ctx.Done(): 437 cancel() 438 return 439 case ev, ok := <-eventsCh: 440 if ok { 441 i.handleEvent(ev) 442 continue 443 } 444 445 // if the channel is closed attempt to open a new one 446 newEventsChan, newCancel, err := i.dispenseTaskEventsCh() 447 if err != nil { 448 i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry) 449 450 // Calculate the new backoff 451 backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline 452 if backoff > driverFPBackoffLimit { 453 backoff = driverFPBackoffLimit 454 } 455 retry++ 456 continue 457 } 458 cancel() 459 eventsCh = newEventsChan 460 cancel = newCancel 461 462 // Reset backoff 463 backoff = 0 464 retry = 0 465 } 466 } 467 } 468 469 // handleEvent looks up the event handler(s) for the event and runs them 470 func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) { 471 // Do not emit that the plugin is shutdown 472 if ev.Err != nil && ev.Err == bstructs.ErrPluginShutdown { 473 return 474 } 475 476 if handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName); handler != nil { 477 i.logger.Trace("task event received", "event", ev) 478 handler(ev) 479 return 480 } 481 482 i.logger.Warn("no handler registered for event", "event", ev) 483 }