// Origin: github.com/bigcommerce/nomad@v0.9.3-bc/client/pluginmanager/drivermanager/instance.go

package drivermanager

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/helper/pluginutils/loader"
	"github.com/hashicorp/nomad/helper/pluginutils/singleton"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/base"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// driverFPBackoffBaseline is the baseline time for exponential backoff while
	// fingerprinting a driver. The task event loop in this file uses the same
	// baseline when retrying.
	driverFPBackoffBaseline = 5 * time.Second

	// driverFPBackoffLimit is the limit of the exponential backoff for fingerprinting
	// a driver. The task event loop in this file uses the same limit.
	driverFPBackoffLimit = 2 * time.Minute
)

// instanceManagerConfig configures a driver instance manager. Its fields are
// copied into the instanceManager by newInstanceManager.
type instanceManagerConfig struct {
	// Logger is the logger used by the driver instance manager
	Logger log.Logger

	// Ctx is used to shutdown the driver instance manager
	Ctx context.Context

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// StoreReattach is used to store a plugin's reattach config
	StoreReattach StorePluginReattachFn

	// FetchReattach is used to retrieve a plugin's reattach config
	FetchReattach FetchPluginReattachFn

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// ID is the ID of the plugin being managed
	ID *loader.PluginID

	// UpdateNodeFromDriver is the callback used to update the node from fingerprinting
	UpdateNodeFromDriver UpdateNodeDriverInfoFn

	// EventHandlerFactory is used to fetch a task event handler
	EventHandlerFactory TaskEventHandlerFactory
}

// instanceManager is used to manage a single driver plugin
type instanceManager struct {
	// logger is the logger used by the driver instance manager
	logger log.Logger

	// ctx is used to shutdown the driver manager. It is derived from the
	// configured Ctx in newInstanceManager.
	ctx context.Context

	// cancel is used to shutdown management of this driver plugin
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// storeReattach is used to store a plugin's reattach config
	storeReattach StorePluginReattachFn

	// fetchReattach is used to retrieve a plugin's reattach config
	fetchReattach FetchPluginReattachFn

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// id is the ID of the plugin being managed
	id *loader.PluginID

	// plugin is the plugin instance being managed
	plugin loader.PluginInstance

	// driver is the driver plugin being managed
	driver drivers.DriverPlugin

	// pluginLock locks access to the driver and plugin
	pluginLock sync.Mutex

	// shutdownLock is used to serialize attempts to shutdown
	shutdownLock sync.Mutex

	// updateNodeFromDriver is the callback used to update the node from fingerprinting
	updateNodeFromDriver UpdateNodeDriverInfoFn

	// eventHandlerFactory is used to fetch a handler for a task event
	eventHandlerFactory TaskEventHandlerFactory

	// firstFingerprintCh is used to trigger that we have successfully
	// fingerprinted once. It is used to gate launching the stats collection.
	// It is closed by handleFingerprint and unblocks WaitForFirstFingerprint.
	firstFingerprintCh chan struct{}

	// hasFingerprinted records whether a fingerprint has been handled yet; it
	// is set by handleFingerprint when firstFingerprintCh is closed.
	hasFingerprinted bool

	// lastHealthState is the last known health fingerprinted by the manager.
	// lastHealthStateMu guards it (see getLastHealth and handleFingerprint).
	lastHealthState   drivers.HealthState
	lastHealthStateMu sync.Mutex
}

// newInstanceManager returns a new driver instance manager. It is expected that
// the context passed in the configuration is cancelled in order to shutdown
// launched goroutines.
115 func newInstanceManager(c *instanceManagerConfig) *instanceManager { 116 117 ctx, cancel := context.WithCancel(c.Ctx) 118 i := &instanceManager{ 119 logger: c.Logger.With("driver", c.ID.Name), 120 ctx: ctx, 121 cancel: cancel, 122 loader: c.Loader, 123 storeReattach: c.StoreReattach, 124 fetchReattach: c.FetchReattach, 125 pluginConfig: c.PluginConfig, 126 id: c.ID, 127 updateNodeFromDriver: c.UpdateNodeFromDriver, 128 eventHandlerFactory: c.EventHandlerFactory, 129 firstFingerprintCh: make(chan struct{}), 130 } 131 132 go i.run() 133 return i 134 } 135 136 // WaitForFirstFingerprint waits until either the plugin fingerprints, the 137 // passed context is done, or the plugin instance manager is shutdown. 138 func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) { 139 select { 140 case <-i.ctx.Done(): 141 case <-ctx.Done(): 142 case <-i.firstFingerprintCh: 143 } 144 } 145 146 // run is a long lived goroutine that starts the fingerprinting and stats 147 // collection goroutine and then shutsdown the plugin on exit. 148 func (i *instanceManager) run() { 149 // Dispense once to ensure we are given a valid plugin 150 if _, err := i.dispense(); err != nil { 151 i.logger.Error("dispensing initial plugin failed", "error", err) 152 return 153 } 154 155 // Create a waitgroup to block on shutdown for all created goroutines to 156 // exit 157 var wg sync.WaitGroup 158 159 // Start the fingerprinter 160 wg.Add(1) 161 go func() { 162 i.fingerprint() 163 wg.Done() 164 }() 165 166 // Start event handler 167 wg.Add(1) 168 go func() { 169 i.handleEvents() 170 wg.Done() 171 }() 172 173 // Do a final cleanup 174 wg.Wait() 175 i.cleanup() 176 } 177 178 // dispense is used to dispense a plugin. 
179 func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) { 180 i.pluginLock.Lock() 181 defer i.pluginLock.Unlock() 182 183 // See if we already have a running instance 184 if i.plugin != nil && !i.plugin.Exited() { 185 return i.driver, nil 186 } 187 188 var pluginInstance loader.PluginInstance 189 190 if reattach, ok := i.fetchReattach(); ok { 191 // Reattach to existing plugin 192 pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach) 193 } else { 194 // Get an instance of the plugin 195 pluginInstance, err = i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger) 196 } 197 if err != nil { 198 // Retry as the error just indicates the singleton has exited 199 if err == singleton.SingletonPluginExited { 200 pluginInstance, err = i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger) 201 } 202 203 // If we still have an error there is a real problem 204 if err != nil { 205 return nil, fmt.Errorf("failed to start plugin: %v", err) 206 } 207 } 208 209 // Convert to a driver plugin 210 driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin) 211 if !ok { 212 pluginInstance.Kill() 213 return nil, fmt.Errorf("plugin loaded does not implement the driver interface") 214 } 215 216 // Store the plugin and driver 217 i.plugin = pluginInstance 218 i.driver = driver 219 220 // Store the reattach config 221 if c, ok := pluginInstance.ReattachConfig(); ok { 222 if err := i.storeReattach(c); err != nil { 223 i.logger.Error("error storing driver plugin reattach config", "error", err) 224 } 225 } 226 227 return driver, nil 228 } 229 230 // cleanup shutsdown the plugin 231 func (i *instanceManager) cleanup() { 232 i.shutdownLock.Lock() 233 i.pluginLock.Lock() 234 defer i.pluginLock.Unlock() 235 defer i.shutdownLock.Unlock() 236 237 if i.plugin == nil { 238 return 239 } 240 241 if internalPlugin, ok := i.plugin.Plugin().(drivers.InternalDriverPlugin); ok { 242 internalPlugin.Shutdown() 243 } 244 245 if 
!i.plugin.Exited() { 246 i.plugin.Kill() 247 if err := i.storeReattach(nil); err != nil { 248 i.logger.Warn("error clearing plugin reattach config from state store", "error", err) 249 } 250 } 251 252 i.cancel() 253 } 254 255 // dispenseFingerprintCh dispenses a driver and makes a Fingerprint RPC call 256 // to the driver. The fingerprint chan is returned along with the cancel func 257 // for the context used in the RPC. This cancel func should always be called 258 // when the caller is finished with the channel. 259 func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) { 260 driver, err := i.dispense() 261 if err != nil { 262 return nil, nil, err 263 } 264 265 ctx, cancel := context.WithCancel(i.ctx) 266 fingerCh, err := driver.Fingerprint(ctx) 267 if err != nil { 268 cancel() 269 return nil, nil, err 270 } 271 272 return fingerCh, cancel, nil 273 } 274 275 // fingerprint is the main loop for fingerprinting. 276 func (i *instanceManager) fingerprint() { 277 fpChan, cancel, err := i.dispenseFingerprintCh() 278 if err != nil { 279 i.logger.Error("failed to dispense driver plugin", "error", err) 280 } 281 282 // backoff and retry used if the RPC is closed by the other end 283 var backoff time.Duration 284 var retry int 285 for { 286 if backoff > 0 { 287 select { 288 case <-time.After(backoff): 289 case <-i.ctx.Done(): 290 cancel() 291 return 292 } 293 } 294 295 select { 296 case <-i.ctx.Done(): 297 cancel() 298 return 299 case fp, ok := <-fpChan: 300 if ok { 301 if fp.Err == nil { 302 i.handleFingerprint(fp) 303 } else { 304 i.logger.Warn("received fingerprint error from driver", "error", fp.Err) 305 i.handleFingerprintError() 306 } 307 continue 308 } 309 310 // if the channel is closed attempt to open a new one 311 newFpChan, newCancel, err := i.dispenseFingerprintCh() 312 if err != nil { 313 i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry) 314 i.handleFingerprintError() 315 316 // 
Calculate the new backoff 317 backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline 318 if backoff > driverFPBackoffLimit { 319 backoff = driverFPBackoffLimit 320 } 321 // Increment retry counter 322 retry++ 323 continue 324 } 325 cancel() 326 fpChan = newFpChan 327 cancel = newCancel 328 329 // Reset backoff 330 backoff = 0 331 retry = 0 332 } 333 } 334 } 335 336 // handleFingerprintError is called when an error occurred while fingerprinting 337 // and will set the driver to unhealthy 338 func (i *instanceManager) handleFingerprintError() { 339 di := &structs.DriverInfo{ 340 Healthy: false, 341 HealthDescription: "failed to fingerprint driver", 342 UpdateTime: time.Now(), 343 } 344 i.updateNodeFromDriver(i.id.Name, di) 345 } 346 347 // handleFingerprint updates the node with the current fingerprint status 348 func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) { 349 attrs := make(map[string]string, len(fp.Attributes)) 350 for key, attr := range fp.Attributes { 351 attrs[key] = attr.GoString() 352 } 353 di := &structs.DriverInfo{ 354 Attributes: attrs, 355 Detected: fp.Health != drivers.HealthStateUndetected, 356 Healthy: fp.Health == drivers.HealthStateHealthy, 357 HealthDescription: fp.HealthDescription, 358 UpdateTime: time.Now(), 359 } 360 i.updateNodeFromDriver(i.id.Name, di) 361 362 // log detected/undetected state changes after the initial fingerprint 363 i.lastHealthStateMu.Lock() 364 if i.hasFingerprinted { 365 if i.lastHealthState != fp.Health { 366 i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription) 367 } 368 } 369 i.lastHealthState = fp.Health 370 i.lastHealthStateMu.Unlock() 371 372 // if this is the first fingerprint, mark that we have received it 373 if !i.hasFingerprinted { 374 i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription) 375 close(i.firstFingerprintCh) 376 i.hasFingerprinted = 
true 377 } 378 } 379 380 // getLastHealth returns the most recent HealthState from fingerprinting 381 func (i *instanceManager) getLastHealth() drivers.HealthState { 382 i.lastHealthStateMu.Lock() 383 defer i.lastHealthStateMu.Unlock() 384 return i.lastHealthState 385 } 386 387 // dispenseTaskEventsCh dispenses a driver plugin and makes a TaskEvents RPC. 388 // The TaskEvent chan and cancel func for the RPC is return. The cancel func must 389 // be called by the caller to properly cleanup the context 390 func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) { 391 driver, err := i.dispense() 392 if err != nil { 393 return nil, nil, err 394 } 395 396 ctx, cancel := context.WithCancel(i.ctx) 397 eventsCh, err := driver.TaskEvents(ctx) 398 if err != nil { 399 cancel() 400 return nil, nil, err 401 } 402 403 return eventsCh, cancel, nil 404 } 405 406 // handleEvents is the main loop that receives task events from the driver 407 func (i *instanceManager) handleEvents() { 408 eventsCh, cancel, err := i.dispenseTaskEventsCh() 409 if err != nil { 410 i.logger.Error("failed to dispense driver", "error", err) 411 } 412 413 var backoff time.Duration 414 var retry int 415 for { 416 if backoff > 0 { 417 select { 418 case <-time.After(backoff): 419 case <-i.ctx.Done(): 420 cancel() 421 return 422 } 423 } 424 425 select { 426 case <-i.ctx.Done(): 427 cancel() 428 return 429 case ev, ok := <-eventsCh: 430 if ok { 431 i.handleEvent(ev) 432 continue 433 } 434 435 // if the channel is closed attempt to open a new one 436 newEventsChan, newCancel, err := i.dispenseTaskEventsCh() 437 if err != nil { 438 i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry) 439 440 // Calculate the new backoff 441 backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline 442 if backoff > driverFPBackoffLimit { 443 backoff = driverFPBackoffLimit 444 } 445 retry++ 446 continue 447 } 448 cancel() 449 eventsCh = 
newEventsChan 450 cancel = newCancel 451 452 // Reset backoff 453 backoff = 0 454 retry = 0 455 } 456 } 457 } 458 459 // handleEvent looks up the event handler(s) for the event and runs them 460 func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) { 461 // Do not emit that the plugin is shutdown 462 if ev.Err != nil && ev.Err == bstructs.ErrPluginShutdown { 463 return 464 } 465 466 if handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName); handler != nil { 467 i.logger.Trace("task event received", "event", ev) 468 handler(ev) 469 return 470 } 471 472 i.logger.Warn("no handler registered for event", "event", ev) 473 }