github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/pluginmanager/drivermanager/instance.go (about) 1 package drivermanager 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 log "github.com/hashicorp/go-hclog" 10 "github.com/hashicorp/nomad/helper/pluginutils/loader" 11 "github.com/hashicorp/nomad/helper/pluginutils/singleton" 12 "github.com/hashicorp/nomad/nomad/structs" 13 "github.com/hashicorp/nomad/plugins/base" 14 bstructs "github.com/hashicorp/nomad/plugins/base/structs" 15 "github.com/hashicorp/nomad/plugins/drivers" 16 ) 17 18 const ( 19 // driverFPBackoffBaseline is the baseline time for exponential backoff while 20 // fingerprinting a driver. 21 driverFPBackoffBaseline = 5 * time.Second 22 23 // driverFPBackoffLimit is the limit of the exponential backoff for fingerprinting 24 // a driver. 25 driverFPBackoffLimit = 2 * time.Minute 26 ) 27 28 // instanceManagerConfig configures a driver instance manager 29 type instanceManagerConfig struct { 30 // Logger is the logger used by the driver instance manager 31 Logger log.Logger 32 33 // Ctx is used to shutdown the driver instance manager 34 Ctx context.Context 35 36 // Loader is the plugin loader 37 Loader loader.PluginCatalog 38 39 // StoreReattach is used to store a plugins reattach config 40 StoreReattach StorePluginReattachFn 41 42 // FetchReattach is used to retrieve a plugin's reattach config 43 FetchReattach FetchPluginReattachFn 44 45 // PluginConfig is the config passed to the launched plugins 46 PluginConfig *base.AgentConfig 47 48 // ID is the ID of the plugin being managed 49 ID *loader.PluginID 50 51 // updateNodeFromDriver is the callback used to update the node from fingerprinting 52 UpdateNodeFromDriver UpdateNodeDriverInfoFn 53 54 // EventHandlerFactory is used to fetch a task event handler 55 EventHandlerFactory TaskEventHandlerFactory 56 } 57 58 // instanceManager is used to manage a single driver plugin 59 type instanceManager struct { 60 // logger is the logger used by the driver instance manager 61 logger log.Logger 62 63 // ctx is used to shutdown the driver manager 64 ctx context.Context 65 66 // cancel is used to shutdown management of this driver plugin 67 cancel context.CancelFunc 68 69 // loader is the plugin loader 70 loader loader.PluginCatalog 71 72 // storeReattach is used to store a plugins reattach config 73 storeReattach StorePluginReattachFn 74 75 // fetchReattach is used to retrieve a plugin's reattach config 76 fetchReattach FetchPluginReattachFn 77 78 // pluginConfig is the config passed to the launched plugins 79 pluginConfig *base.AgentConfig 80 81 // id is the ID of the plugin being managed 82 id *loader.PluginID 83 84 // plugin is the plugin instance being managed 85 plugin loader.PluginInstance 86 87 // driver is the driver plugin being managed 88 driver drivers.DriverPlugin 89 90 // pluginLock locks access to the driver and plugin 91 pluginLock sync.Mutex 92 93 // shutdownLock is used to serialize attempts to shutdown 94 shutdownLock sync.Mutex 95 96 // updateNodeFromDriver is the callback used to update the node from fingerprinting 97 updateNodeFromDriver UpdateNodeDriverInfoFn 98 99 // eventHandlerFactory is used to fetch a handler for a task event 100 eventHandlerFactory TaskEventHandlerFactory 101 102 // firstFingerprintCh is used to trigger that we have successfully 103 // fingerprinted once. It is used to gate launching the stats collection. 104 firstFingerprintCh chan struct{} 105 hasFingerprinted bool 106 107 // lastHealthState is the last known health fingerprinted by the manager 108 lastHealthState drivers.HealthState 109 lastHealthStateMu sync.Mutex 110 } 111 112 // newInstanceManager returns a new driver instance manager. It is expected that 113 // the context passed in the configuration is cancelled in order to shutdown 114 // launched goroutines. 115 func newInstanceManager(c *instanceManagerConfig) *instanceManager { 116 117 ctx, cancel := context.WithCancel(c.Ctx) 118 i := &instanceManager{ 119 logger: c.Logger.With("driver", c.ID.Name), 120 ctx: ctx, 121 cancel: cancel, 122 loader: c.Loader, 123 storeReattach: c.StoreReattach, 124 fetchReattach: c.FetchReattach, 125 pluginConfig: c.PluginConfig, 126 id: c.ID, 127 updateNodeFromDriver: c.UpdateNodeFromDriver, 128 eventHandlerFactory: c.EventHandlerFactory, 129 firstFingerprintCh: make(chan struct{}), 130 } 131 132 go i.run() 133 return i 134 } 135 136 // WaitForFirstFingerprint waits until either the plugin fingerprints, the 137 // passed context is done, or the plugin instance manager is shutdown. 138 func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) { 139 select { 140 case <-i.ctx.Done(): 141 case <-ctx.Done(): 142 case <-i.firstFingerprintCh: 143 } 144 } 145 146 // run is a long lived goroutine that starts the fingerprinting and stats 147 // collection goroutine and then shutsdown the plugin on exit. 148 func (i *instanceManager) run() { 149 // Dispense once to ensure we are given a valid plugin 150 if _, err := i.dispense(); err != nil { 151 i.logger.Error("dispensing initial plugin failed", "error", err) 152 return 153 } 154 155 // Create a waitgroup to block on shutdown for all created goroutines to 156 // exit 157 var wg sync.WaitGroup 158 159 // Start the fingerprinter 160 wg.Add(1) 161 go func() { 162 i.fingerprint() 163 wg.Done() 164 }() 165 166 // Start event handler 167 wg.Add(1) 168 go func() { 169 i.handleEvents() 170 wg.Done() 171 }() 172 173 // Do a final cleanup 174 wg.Wait() 175 i.cleanup() 176 } 177 178 // dispense is used to dispense a plugin. 179 func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) { 180 i.pluginLock.Lock() 181 defer i.pluginLock.Unlock() 182 183 // See if we already have a running instance 184 if i.plugin != nil && !i.plugin.Exited() { 185 return i.driver, nil 186 } 187 188 var pluginInstance loader.PluginInstance 189 dispenseFn := func() (loader.PluginInstance, error) { 190 return i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger) 191 } 192 193 if reattach, ok := i.fetchReattach(); ok { 194 // Reattach to existing plugin 195 pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach) 196 197 // If reattachment fails, get a new plugin instance 198 if err != nil { 199 i.logger.Warn("failed to reattach to plugin, starting new instance", "err", err) 200 pluginInstance, err = dispenseFn() 201 } 202 } else { 203 // Get an instance of the plugin 204 pluginInstance, err = dispenseFn() 205 } 206 207 if err != nil { 208 // Retry as the error just indicates the singleton has exited 209 if err == singleton.SingletonPluginExited { 210 pluginInstance, err = dispenseFn() 211 } 212 213 // If we still have an error there is a real problem 214 if err != nil { 215 return nil, fmt.Errorf("failed to start plugin: %v", err) 216 } 217 } 218 219 // Convert to a driver plugin 220 driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin) 221 if !ok { 222 pluginInstance.Kill() 223 return nil, fmt.Errorf("plugin loaded does not implement the driver interface") 224 } 225 226 // Store the plugin and driver 227 i.plugin = pluginInstance 228 i.driver = driver 229 230 // Store the reattach config 231 if c, ok := pluginInstance.ReattachConfig(); ok { 232 if err := i.storeReattach(c); err != nil { 233 i.logger.Error("error storing driver plugin reattach config", "error", err) 234 } 235 } 236 237 return driver, nil 238 } 239 240 // cleanup shutsdown the plugin 241 func (i *instanceManager) cleanup() { 242 i.shutdownLock.Lock() 243 i.pluginLock.Lock() 244 defer i.pluginLock.Unlock() 245 defer i.shutdownLock.Unlock() 246 247 if i.plugin == nil { 248 return 249 } 250 251 if !i.plugin.Exited() { 252 i.plugin.Kill() 253 if err := i.storeReattach(nil); err != nil { 254 i.logger.Warn("error clearing plugin reattach config from state store", "error", err) 255 } 256 } 257 258 i.cancel() 259 } 260 261 // dispenseFingerprintCh dispenses a driver and makes a Fingerprint RPC call 262 // to the driver. The fingerprint chan is returned along with the cancel func 263 // for the context used in the RPC. This cancel func should always be called 264 // when the caller is finished with the channel. 265 func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) { 266 driver, err := i.dispense() 267 if err != nil { 268 return nil, nil, err 269 } 270 271 ctx, cancel := context.WithCancel(i.ctx) 272 fingerCh, err := driver.Fingerprint(ctx) 273 if err != nil { 274 cancel() 275 return nil, nil, err 276 } 277 278 return fingerCh, cancel, nil 279 } 280 281 // fingerprint is the main loop for fingerprinting. 282 func (i *instanceManager) fingerprint() { 283 fpChan, cancel, err := i.dispenseFingerprintCh() 284 if err != nil { 285 i.logger.Error("failed to dispense driver plugin", "error", err) 286 } 287 288 // backoff and retry used if the RPC is closed by the other end 289 var backoff time.Duration 290 var retry int 291 for { 292 if backoff > 0 { 293 select { 294 case <-time.After(backoff): 295 case <-i.ctx.Done(): 296 cancel() 297 return 298 } 299 } 300 301 select { 302 case <-i.ctx.Done(): 303 cancel() 304 return 305 case fp, ok := <-fpChan: 306 if ok { 307 if fp.Err == nil { 308 i.handleFingerprint(fp) 309 } else { 310 i.logger.Warn("received fingerprint error from driver", "error", fp.Err) 311 i.handleFingerprintError() 312 } 313 continue 314 } 315 316 // avoid fingerprinting again if ctx and fpChan both close 317 if i.ctx.Err() != nil { 318 cancel() 319 return 320 } 321 322 // if the channel is closed attempt to open a new one 323 newFpChan, newCancel, err := i.dispenseFingerprintCh() 324 if err != nil { 325 i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry) 326 i.handleFingerprintError() 327 328 // Calculate the new backoff 329 backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline 330 if backoff > driverFPBackoffLimit { 331 backoff = driverFPBackoffLimit 332 } 333 // Increment retry counter 334 retry++ 335 continue 336 } 337 cancel() 338 fpChan = newFpChan 339 cancel = newCancel 340 341 // Reset backoff 342 backoff = 0 343 retry = 0 344 } 345 } 346 } 347 348 // handleFingerprintError is called when an error occurred while fingerprinting 349 // and will set the driver to unhealthy 350 func (i *instanceManager) handleFingerprintError() { 351 di := &structs.DriverInfo{ 352 Healthy: false, 353 HealthDescription: "failed to fingerprint driver", 354 UpdateTime: time.Now(), 355 } 356 i.updateNodeFromDriver(i.id.Name, di) 357 } 358 359 // handleFingerprint updates the node with the current fingerprint status 360 func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) { 361 attrs := make(map[string]string, len(fp.Attributes)) 362 for key, attr := range fp.Attributes { 363 attrs[key] = attr.GoString() 364 } 365 di := &structs.DriverInfo{ 366 Attributes: attrs, 367 Detected: fp.Health != drivers.HealthStateUndetected, 368 Healthy: fp.Health == drivers.HealthStateHealthy, 369 HealthDescription: fp.HealthDescription, 370 UpdateTime: time.Now(), 371 } 372 i.updateNodeFromDriver(i.id.Name, di) 373 374 // log detected/undetected state changes after the initial fingerprint 375 i.lastHealthStateMu.Lock() 376 if i.hasFingerprinted { 377 if i.lastHealthState != fp.Health { 378 i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription) 379 } 380 } 381 i.lastHealthState = fp.Health 382 i.lastHealthStateMu.Unlock() 383 384 // if this is the first fingerprint, mark that we have received it 385 if !i.hasFingerprinted { 386 i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription) 387 close(i.firstFingerprintCh) 388 i.hasFingerprinted = true 389 } 390 } 391 392 // getLastHealth returns the most recent HealthState from fingerprinting 393 func (i *instanceManager) getLastHealth() drivers.HealthState { 394 i.lastHealthStateMu.Lock() 395 defer i.lastHealthStateMu.Unlock() 396 return i.lastHealthState 397 } 398 399 // dispenseTaskEventsCh dispenses a driver plugin and makes a TaskEvents RPC. 400 // The TaskEvent chan and cancel func for the RPC is return. The cancel func must 401 // be called by the caller to properly cleanup the context 402 func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) { 403 driver, err := i.dispense() 404 if err != nil { 405 return nil, nil, err 406 } 407 408 ctx, cancel := context.WithCancel(i.ctx) 409 eventsCh, err := driver.TaskEvents(ctx) 410 if err != nil { 411 cancel() 412 return nil, nil, err 413 } 414 415 return eventsCh, cancel, nil 416 } 417 418 // handleEvents is the main loop that receives task events from the driver 419 func (i *instanceManager) handleEvents() { 420 eventsCh, cancel, err := i.dispenseTaskEventsCh() 421 if err != nil { 422 i.logger.Error("failed to dispense driver", "error", err) 423 } 424 425 var backoff time.Duration 426 var retry int 427 for { 428 if backoff > 0 { 429 select { 430 case <-time.After(backoff): 431 case <-i.ctx.Done(): 432 cancel() 433 return 434 } 435 } 436 437 select { 438 case <-i.ctx.Done(): 439 cancel() 440 return 441 case ev, ok := <-eventsCh: 442 if ok { 443 i.handleEvent(ev) 444 continue 445 } 446 447 // if the channel is closed attempt to open a new one 448 newEventsChan, newCancel, err := i.dispenseTaskEventsCh() 449 if err != nil { 450 i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry) 451 452 // Calculate the new backoff 453 backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline 454 if backoff > driverFPBackoffLimit { 455 backoff = driverFPBackoffLimit 456 } 457 retry++ 458 continue 459 } 460 cancel() 461 eventsCh = newEventsChan 462 cancel = newCancel 463 464 // Reset backoff 465 backoff = 0 466 retry = 0 467 } 468 } 469 } 470 471 // handleEvent looks up the event handler(s) for the event and runs them 472 func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) { 473 // Do not emit that the plugin is shutdown 474 if ev.Err != nil && ev.Err == bstructs.ErrPluginShutdown { 475 return 476 } 477 478 if handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName); handler != nil { 479 i.logger.Trace("task event received", "event", ev) 480 handler(ev) 481 return 482 } 483 484 i.logger.Warn("no handler registered for event", "event", ev) 485 }