github.com/bigcommerce/nomad@v0.9.3-bc/client/pluginmanager/drivermanager/instance.go (about)

     1  package drivermanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    11  	"github.com/hashicorp/nomad/helper/pluginutils/singleton"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  	"github.com/hashicorp/nomad/plugins/base"
    14  	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
    15  	"github.com/hashicorp/nomad/plugins/drivers"
    16  )
    17  
const (
	// driverFPBackoffBaseline is the baseline time for exponential backoff while
	// fingerprinting a driver. The task event loop reuses the same baseline
	// when it has to re-open its event stream.
	driverFPBackoffBaseline = 5 * time.Second

	// driverFPBackoffLimit is the limit of the exponential backoff for fingerprinting
	// a driver. The task event loop is capped by the same limit.
	driverFPBackoffLimit = 2 * time.Minute
)
    27  
// instanceManagerConfig configures a driver instance manager. Its fields are
// copied into the instanceManager created by newInstanceManager.
type instanceManagerConfig struct {
	// Logger is the logger used by the driver instance manager
	Logger log.Logger

	// Ctx is used to shutdown the driver instance manager
	Ctx context.Context

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// StoreReattach is used to store a plugin's reattach config
	StoreReattach StorePluginReattachFn

	// FetchReattach is used to retrieve a plugin's reattach config
	FetchReattach FetchPluginReattachFn

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// ID is the ID of the plugin being managed
	ID *loader.PluginID

	// UpdateNodeFromDriver is the callback used to update the node from fingerprinting
	UpdateNodeFromDriver UpdateNodeDriverInfoFn

	// EventHandlerFactory is used to fetch a task event handler
	EventHandlerFactory TaskEventHandlerFactory
}
    57  
// instanceManager is used to manage a single driver plugin
type instanceManager struct {
	// logger is the logger used by the driver instance manager
	logger log.Logger

	// ctx is used to shutdown the driver manager
	ctx context.Context

	// cancel is used to shutdown management of this driver plugin
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// storeReattach is used to store a plugin's reattach config
	storeReattach StorePluginReattachFn

	// fetchReattach is used to retrieve a plugin's reattach config
	fetchReattach FetchPluginReattachFn

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// id is the ID of the plugin being managed
	id *loader.PluginID

	// plugin is the plugin instance being managed
	plugin loader.PluginInstance

	// driver is the driver plugin being managed
	driver drivers.DriverPlugin

	// pluginLock locks access to the driver and plugin
	pluginLock sync.Mutex

	// shutdownLock is used to serialize attempts to shutdown
	shutdownLock sync.Mutex

	// updateNodeFromDriver is the callback used to update the node from fingerprinting
	updateNodeFromDriver UpdateNodeDriverInfoFn

	// eventHandlerFactory is used to fetch a handler for a task event
	eventHandlerFactory TaskEventHandlerFactory

	// firstFingerprintCh is used to trigger that we have successfully
	// fingerprinted once. It is used to gate launching the stats collection.
	// It is closed by handleFingerprint when the first fingerprint is handled.
	firstFingerprintCh chan struct{}

	// hasFingerprinted records that the first fingerprint has been handled
	// and firstFingerprintCh has been closed. It is only read and written by
	// handleFingerprint.
	hasFingerprinted bool

	// lastHealthState is the last known health fingerprinted by the manager
	lastHealthState drivers.HealthState

	// lastHealthStateMu is held while lastHealthState is read or written
	lastHealthStateMu sync.Mutex
}
   111  
   112  // newInstanceManager returns a new driver instance manager. It is expected that
   113  // the context passed in the configuration is cancelled in order to shutdown
   114  // launched goroutines.
   115  func newInstanceManager(c *instanceManagerConfig) *instanceManager {
   116  
   117  	ctx, cancel := context.WithCancel(c.Ctx)
   118  	i := &instanceManager{
   119  		logger:               c.Logger.With("driver", c.ID.Name),
   120  		ctx:                  ctx,
   121  		cancel:               cancel,
   122  		loader:               c.Loader,
   123  		storeReattach:        c.StoreReattach,
   124  		fetchReattach:        c.FetchReattach,
   125  		pluginConfig:         c.PluginConfig,
   126  		id:                   c.ID,
   127  		updateNodeFromDriver: c.UpdateNodeFromDriver,
   128  		eventHandlerFactory:  c.EventHandlerFactory,
   129  		firstFingerprintCh:   make(chan struct{}),
   130  	}
   131  
   132  	go i.run()
   133  	return i
   134  }
   135  
   136  // WaitForFirstFingerprint waits until either the plugin fingerprints, the
   137  // passed context is done, or the plugin instance manager is shutdown.
   138  func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) {
   139  	select {
   140  	case <-i.ctx.Done():
   141  	case <-ctx.Done():
   142  	case <-i.firstFingerprintCh:
   143  	}
   144  }
   145  
   146  // run is a long lived goroutine that starts the fingerprinting and stats
   147  // collection goroutine and then shutsdown the plugin on exit.
   148  func (i *instanceManager) run() {
   149  	// Dispense once to ensure we are given a valid plugin
   150  	if _, err := i.dispense(); err != nil {
   151  		i.logger.Error("dispensing initial plugin failed", "error", err)
   152  		return
   153  	}
   154  
   155  	// Create a waitgroup to block on shutdown for all created goroutines to
   156  	// exit
   157  	var wg sync.WaitGroup
   158  
   159  	// Start the fingerprinter
   160  	wg.Add(1)
   161  	go func() {
   162  		i.fingerprint()
   163  		wg.Done()
   164  	}()
   165  
   166  	// Start event handler
   167  	wg.Add(1)
   168  	go func() {
   169  		i.handleEvents()
   170  		wg.Done()
   171  	}()
   172  
   173  	// Do a final cleanup
   174  	wg.Wait()
   175  	i.cleanup()
   176  }
   177  
   178  // dispense is used to dispense a plugin.
   179  func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) {
   180  	i.pluginLock.Lock()
   181  	defer i.pluginLock.Unlock()
   182  
   183  	// See if we already have a running instance
   184  	if i.plugin != nil && !i.plugin.Exited() {
   185  		return i.driver, nil
   186  	}
   187  
   188  	var pluginInstance loader.PluginInstance
   189  
   190  	if reattach, ok := i.fetchReattach(); ok {
   191  		// Reattach to existing plugin
   192  		pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach)
   193  	} else {
   194  		// Get an instance of the plugin
   195  		pluginInstance, err = i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
   196  	}
   197  	if err != nil {
   198  		// Retry as the error just indicates the singleton has exited
   199  		if err == singleton.SingletonPluginExited {
   200  			pluginInstance, err = i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
   201  		}
   202  
   203  		// If we still have an error there is a real problem
   204  		if err != nil {
   205  			return nil, fmt.Errorf("failed to start plugin: %v", err)
   206  		}
   207  	}
   208  
   209  	// Convert to a driver plugin
   210  	driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin)
   211  	if !ok {
   212  		pluginInstance.Kill()
   213  		return nil, fmt.Errorf("plugin loaded does not implement the driver interface")
   214  	}
   215  
   216  	// Store the plugin and driver
   217  	i.plugin = pluginInstance
   218  	i.driver = driver
   219  
   220  	// Store the reattach config
   221  	if c, ok := pluginInstance.ReattachConfig(); ok {
   222  		if err := i.storeReattach(c); err != nil {
   223  			i.logger.Error("error storing driver plugin reattach config", "error", err)
   224  		}
   225  	}
   226  
   227  	return driver, nil
   228  }
   229  
   230  // cleanup shutsdown the plugin
   231  func (i *instanceManager) cleanup() {
   232  	i.shutdownLock.Lock()
   233  	i.pluginLock.Lock()
   234  	defer i.pluginLock.Unlock()
   235  	defer i.shutdownLock.Unlock()
   236  
   237  	if i.plugin == nil {
   238  		return
   239  	}
   240  
   241  	if internalPlugin, ok := i.plugin.Plugin().(drivers.InternalDriverPlugin); ok {
   242  		internalPlugin.Shutdown()
   243  	}
   244  
   245  	if !i.plugin.Exited() {
   246  		i.plugin.Kill()
   247  		if err := i.storeReattach(nil); err != nil {
   248  			i.logger.Warn("error clearing plugin reattach config from state store", "error", err)
   249  		}
   250  	}
   251  
   252  	i.cancel()
   253  }
   254  
   255  // dispenseFingerprintCh dispenses a driver and makes a Fingerprint RPC call
   256  // to the driver. The fingerprint chan is returned along with the cancel func
   257  // for the context used in the RPC. This cancel func should always be called
   258  // when the caller is finished with the channel.
   259  func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) {
   260  	driver, err := i.dispense()
   261  	if err != nil {
   262  		return nil, nil, err
   263  	}
   264  
   265  	ctx, cancel := context.WithCancel(i.ctx)
   266  	fingerCh, err := driver.Fingerprint(ctx)
   267  	if err != nil {
   268  		cancel()
   269  		return nil, nil, err
   270  	}
   271  
   272  	return fingerCh, cancel, nil
   273  }
   274  
   275  // fingerprint is the main loop for fingerprinting.
   276  func (i *instanceManager) fingerprint() {
   277  	fpChan, cancel, err := i.dispenseFingerprintCh()
   278  	if err != nil {
   279  		i.logger.Error("failed to dispense driver plugin", "error", err)
   280  	}
   281  
   282  	// backoff and retry used if the RPC is closed by the other end
   283  	var backoff time.Duration
   284  	var retry int
   285  	for {
   286  		if backoff > 0 {
   287  			select {
   288  			case <-time.After(backoff):
   289  			case <-i.ctx.Done():
   290  				cancel()
   291  				return
   292  			}
   293  		}
   294  
   295  		select {
   296  		case <-i.ctx.Done():
   297  			cancel()
   298  			return
   299  		case fp, ok := <-fpChan:
   300  			if ok {
   301  				if fp.Err == nil {
   302  					i.handleFingerprint(fp)
   303  				} else {
   304  					i.logger.Warn("received fingerprint error from driver", "error", fp.Err)
   305  					i.handleFingerprintError()
   306  				}
   307  				continue
   308  			}
   309  
   310  			// if the channel is closed attempt to open a new one
   311  			newFpChan, newCancel, err := i.dispenseFingerprintCh()
   312  			if err != nil {
   313  				i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry)
   314  				i.handleFingerprintError()
   315  
   316  				// Calculate the new backoff
   317  				backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
   318  				if backoff > driverFPBackoffLimit {
   319  					backoff = driverFPBackoffLimit
   320  				}
   321  				// Increment retry counter
   322  				retry++
   323  				continue
   324  			}
   325  			cancel()
   326  			fpChan = newFpChan
   327  			cancel = newCancel
   328  
   329  			// Reset backoff
   330  			backoff = 0
   331  			retry = 0
   332  		}
   333  	}
   334  }
   335  
   336  // handleFingerprintError is called when an error occurred while fingerprinting
   337  // and will set the driver to unhealthy
   338  func (i *instanceManager) handleFingerprintError() {
   339  	di := &structs.DriverInfo{
   340  		Healthy:           false,
   341  		HealthDescription: "failed to fingerprint driver",
   342  		UpdateTime:        time.Now(),
   343  	}
   344  	i.updateNodeFromDriver(i.id.Name, di)
   345  }
   346  
   347  // handleFingerprint updates the node with the current fingerprint status
   348  func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) {
   349  	attrs := make(map[string]string, len(fp.Attributes))
   350  	for key, attr := range fp.Attributes {
   351  		attrs[key] = attr.GoString()
   352  	}
   353  	di := &structs.DriverInfo{
   354  		Attributes:        attrs,
   355  		Detected:          fp.Health != drivers.HealthStateUndetected,
   356  		Healthy:           fp.Health == drivers.HealthStateHealthy,
   357  		HealthDescription: fp.HealthDescription,
   358  		UpdateTime:        time.Now(),
   359  	}
   360  	i.updateNodeFromDriver(i.id.Name, di)
   361  
   362  	// log detected/undetected state changes after the initial fingerprint
   363  	i.lastHealthStateMu.Lock()
   364  	if i.hasFingerprinted {
   365  		if i.lastHealthState != fp.Health {
   366  			i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription)
   367  		}
   368  	}
   369  	i.lastHealthState = fp.Health
   370  	i.lastHealthStateMu.Unlock()
   371  
   372  	// if this is the first fingerprint, mark that we have received it
   373  	if !i.hasFingerprinted {
   374  		i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription)
   375  		close(i.firstFingerprintCh)
   376  		i.hasFingerprinted = true
   377  	}
   378  }
   379  
   380  // getLastHealth returns the most recent HealthState from fingerprinting
   381  func (i *instanceManager) getLastHealth() drivers.HealthState {
   382  	i.lastHealthStateMu.Lock()
   383  	defer i.lastHealthStateMu.Unlock()
   384  	return i.lastHealthState
   385  }
   386  
   387  // dispenseTaskEventsCh dispenses a driver plugin and makes a TaskEvents RPC.
   388  // The TaskEvent chan and cancel func for the RPC is return. The cancel func must
   389  // be called by the caller to properly cleanup the context
   390  func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) {
   391  	driver, err := i.dispense()
   392  	if err != nil {
   393  		return nil, nil, err
   394  	}
   395  
   396  	ctx, cancel := context.WithCancel(i.ctx)
   397  	eventsCh, err := driver.TaskEvents(ctx)
   398  	if err != nil {
   399  		cancel()
   400  		return nil, nil, err
   401  	}
   402  
   403  	return eventsCh, cancel, nil
   404  }
   405  
   406  // handleEvents is the main loop that receives task events from the driver
   407  func (i *instanceManager) handleEvents() {
   408  	eventsCh, cancel, err := i.dispenseTaskEventsCh()
   409  	if err != nil {
   410  		i.logger.Error("failed to dispense driver", "error", err)
   411  	}
   412  
   413  	var backoff time.Duration
   414  	var retry int
   415  	for {
   416  		if backoff > 0 {
   417  			select {
   418  			case <-time.After(backoff):
   419  			case <-i.ctx.Done():
   420  				cancel()
   421  				return
   422  			}
   423  		}
   424  
   425  		select {
   426  		case <-i.ctx.Done():
   427  			cancel()
   428  			return
   429  		case ev, ok := <-eventsCh:
   430  			if ok {
   431  				i.handleEvent(ev)
   432  				continue
   433  			}
   434  
   435  			// if the channel is closed attempt to open a new one
   436  			newEventsChan, newCancel, err := i.dispenseTaskEventsCh()
   437  			if err != nil {
   438  				i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry)
   439  
   440  				// Calculate the new backoff
   441  				backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
   442  				if backoff > driverFPBackoffLimit {
   443  					backoff = driverFPBackoffLimit
   444  				}
   445  				retry++
   446  				continue
   447  			}
   448  			cancel()
   449  			eventsCh = newEventsChan
   450  			cancel = newCancel
   451  
   452  			// Reset backoff
   453  			backoff = 0
   454  			retry = 0
   455  		}
   456  	}
   457  }
   458  
   459  // handleEvent looks up the event handler(s) for the event and runs them
   460  func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) {
   461  	// Do not emit that the plugin is shutdown
   462  	if ev.Err != nil && ev.Err == bstructs.ErrPluginShutdown {
   463  		return
   464  	}
   465  
   466  	if handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName); handler != nil {
   467  		i.logger.Trace("task event received", "event", ev)
   468  		handler(ev)
   469  		return
   470  	}
   471  
   472  	i.logger.Warn("no handler registered for event", "event", ev)
   473  }