github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/pluginmanager/drivermanager/instance.go (about)

     1  package drivermanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    11  	"github.com/hashicorp/nomad/helper/pluginutils/singleton"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  	"github.com/hashicorp/nomad/plugins/base"
    14  	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
    15  	"github.com/hashicorp/nomad/plugins/drivers"
    16  )
    17  
const (
	// driverFPBackoffBaseline is the baseline time for exponential backoff while
	// fingerprinting a driver. The same baseline is used when retrying the
	// task events stream in handleEvents.
	driverFPBackoffBaseline = 5 * time.Second

	// driverFPBackoffLimit is the upper bound of the exponential backoff used
	// when fingerprinting a driver (and when retrying the task events stream).
	driverFPBackoffLimit = 2 * time.Minute
)
    27  
// instanceManagerConfig configures a driver instance manager. Its fields are
// copied into the instanceManager by newInstanceManager.
type instanceManagerConfig struct {
	// Logger is the logger used by the driver instance manager
	Logger log.Logger

	// Ctx is used to shutdown the driver instance manager
	Ctx context.Context

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// StoreReattach is used to store a plugin's reattach config
	StoreReattach StorePluginReattachFn

	// FetchReattach is used to retrieve a plugin's reattach config
	FetchReattach FetchPluginReattachFn

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// ID is the ID of the plugin being managed
	ID *loader.PluginID

	// UpdateNodeFromDriver is the callback used to update the node from fingerprinting
	UpdateNodeFromDriver UpdateNodeDriverInfoFn

	// EventHandlerFactory is used to fetch a task event handler
	EventHandlerFactory TaskEventHandlerFactory
}
    57  
// instanceManager is used to manage a single driver plugin. It dispenses the
// plugin, runs the fingerprint and task event loops, and kills the plugin on
// cleanup.
type instanceManager struct {
	// logger is the logger used by the driver instance manager
	logger log.Logger

	// ctx is used to shutdown the driver manager. It is derived from the
	// context supplied in instanceManagerConfig.
	ctx context.Context

	// cancel is used to shutdown management of this driver plugin
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// storeReattach is used to store a plugin's reattach config
	storeReattach StorePluginReattachFn

	// fetchReattach is used to retrieve a plugin's reattach config
	fetchReattach FetchPluginReattachFn

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// id is the ID of the plugin being managed
	id *loader.PluginID

	// plugin is the plugin instance being managed
	plugin loader.PluginInstance

	// driver is the driver plugin being managed
	driver drivers.DriverPlugin

	// pluginLock locks access to the driver and plugin
	pluginLock sync.Mutex

	// shutdownLock is used to serialize attempts to shutdown
	shutdownLock sync.Mutex

	// updateNodeFromDriver is the callback used to update the node from fingerprinting
	updateNodeFromDriver UpdateNodeDriverInfoFn

	// eventHandlerFactory is used to fetch a handler for a task event
	eventHandlerFactory TaskEventHandlerFactory

	// firstFingerprintCh is used to trigger that we have successfully
	// fingerprinted once. It is used to gate launching the stats collection.
	// hasFingerprinted records that the first fingerprint was handled and the
	// channel has been closed; it is set in handleFingerprint.
	firstFingerprintCh chan struct{}
	hasFingerprinted   bool

	// lastHealthState is the last known health fingerprinted by the manager.
	// lastHealthStateMu guards reads and writes of lastHealthState.
	lastHealthState   drivers.HealthState
	lastHealthStateMu sync.Mutex
}
   111  
   112  // newInstanceManager returns a new driver instance manager. It is expected that
   113  // the context passed in the configuration is cancelled in order to shutdown
   114  // launched goroutines.
   115  func newInstanceManager(c *instanceManagerConfig) *instanceManager {
   116  
   117  	ctx, cancel := context.WithCancel(c.Ctx)
   118  	i := &instanceManager{
   119  		logger:               c.Logger.With("driver", c.ID.Name),
   120  		ctx:                  ctx,
   121  		cancel:               cancel,
   122  		loader:               c.Loader,
   123  		storeReattach:        c.StoreReattach,
   124  		fetchReattach:        c.FetchReattach,
   125  		pluginConfig:         c.PluginConfig,
   126  		id:                   c.ID,
   127  		updateNodeFromDriver: c.UpdateNodeFromDriver,
   128  		eventHandlerFactory:  c.EventHandlerFactory,
   129  		firstFingerprintCh:   make(chan struct{}),
   130  	}
   131  
   132  	go i.run()
   133  	return i
   134  }
   135  
   136  // WaitForFirstFingerprint waits until either the plugin fingerprints, the
   137  // passed context is done, or the plugin instance manager is shutdown.
   138  func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) {
   139  	select {
   140  	case <-i.ctx.Done():
   141  	case <-ctx.Done():
   142  	case <-i.firstFingerprintCh:
   143  	}
   144  }
   145  
   146  // run is a long lived goroutine that starts the fingerprinting and stats
   147  // collection goroutine and then shutsdown the plugin on exit.
   148  func (i *instanceManager) run() {
   149  	// Dispense once to ensure we are given a valid plugin
   150  	if _, err := i.dispense(); err != nil {
   151  		i.logger.Error("dispensing initial plugin failed", "error", err)
   152  		return
   153  	}
   154  
   155  	// Create a waitgroup to block on shutdown for all created goroutines to
   156  	// exit
   157  	var wg sync.WaitGroup
   158  
   159  	// Start the fingerprinter
   160  	wg.Add(1)
   161  	go func() {
   162  		i.fingerprint()
   163  		wg.Done()
   164  	}()
   165  
   166  	// Start event handler
   167  	wg.Add(1)
   168  	go func() {
   169  		i.handleEvents()
   170  		wg.Done()
   171  	}()
   172  
   173  	// Do a final cleanup
   174  	wg.Wait()
   175  	i.cleanup()
   176  }
   177  
   178  // dispense is used to dispense a plugin.
   179  func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) {
   180  	i.pluginLock.Lock()
   181  	defer i.pluginLock.Unlock()
   182  
   183  	// See if we already have a running instance
   184  	if i.plugin != nil && !i.plugin.Exited() {
   185  		return i.driver, nil
   186  	}
   187  
   188  	var pluginInstance loader.PluginInstance
   189  	dispenseFn := func() (loader.PluginInstance, error) {
   190  		return i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
   191  	}
   192  
   193  	if reattach, ok := i.fetchReattach(); ok {
   194  		// Reattach to existing plugin
   195  		pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach)
   196  
   197  		// If reattachment fails, get a new plugin instance
   198  		if err != nil {
   199  			i.logger.Warn("failed to reattach to plugin, starting new instance", "err", err)
   200  			pluginInstance, err = dispenseFn()
   201  		}
   202  	} else {
   203  		// Get an instance of the plugin
   204  		pluginInstance, err = dispenseFn()
   205  	}
   206  
   207  	if err != nil {
   208  		// Retry as the error just indicates the singleton has exited
   209  		if err == singleton.SingletonPluginExited {
   210  			pluginInstance, err = dispenseFn()
   211  		}
   212  
   213  		// If we still have an error there is a real problem
   214  		if err != nil {
   215  			return nil, fmt.Errorf("failed to start plugin: %v", err)
   216  		}
   217  	}
   218  
   219  	// Convert to a driver plugin
   220  	driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin)
   221  	if !ok {
   222  		pluginInstance.Kill()
   223  		return nil, fmt.Errorf("plugin loaded does not implement the driver interface")
   224  	}
   225  
   226  	// Store the plugin and driver
   227  	i.plugin = pluginInstance
   228  	i.driver = driver
   229  
   230  	// Store the reattach config
   231  	if c, ok := pluginInstance.ReattachConfig(); ok {
   232  		if err := i.storeReattach(c); err != nil {
   233  			i.logger.Error("error storing driver plugin reattach config", "error", err)
   234  		}
   235  	}
   236  
   237  	return driver, nil
   238  }
   239  
   240  // cleanup shutsdown the plugin
   241  func (i *instanceManager) cleanup() {
   242  	i.shutdownLock.Lock()
   243  	i.pluginLock.Lock()
   244  	defer i.pluginLock.Unlock()
   245  	defer i.shutdownLock.Unlock()
   246  
   247  	if i.plugin == nil {
   248  		return
   249  	}
   250  
   251  	if !i.plugin.Exited() {
   252  		i.plugin.Kill()
   253  		if err := i.storeReattach(nil); err != nil {
   254  			i.logger.Warn("error clearing plugin reattach config from state store", "error", err)
   255  		}
   256  	}
   257  
   258  	i.cancel()
   259  }
   260  
   261  // dispenseFingerprintCh dispenses a driver and makes a Fingerprint RPC call
   262  // to the driver. The fingerprint chan is returned along with the cancel func
   263  // for the context used in the RPC. This cancel func should always be called
   264  // when the caller is finished with the channel.
   265  func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) {
   266  	driver, err := i.dispense()
   267  	if err != nil {
   268  		return nil, nil, err
   269  	}
   270  
   271  	ctx, cancel := context.WithCancel(i.ctx)
   272  	fingerCh, err := driver.Fingerprint(ctx)
   273  	if err != nil {
   274  		cancel()
   275  		return nil, nil, err
   276  	}
   277  
   278  	return fingerCh, cancel, nil
   279  }
   280  
   281  // fingerprint is the main loop for fingerprinting.
   282  func (i *instanceManager) fingerprint() {
   283  	fpChan, cancel, err := i.dispenseFingerprintCh()
   284  	if err != nil {
   285  		i.logger.Error("failed to dispense driver plugin", "error", err)
   286  	}
   287  
   288  	// backoff and retry used if the RPC is closed by the other end
   289  	var backoff time.Duration
   290  	var retry int
   291  	for {
   292  		if backoff > 0 {
   293  			select {
   294  			case <-time.After(backoff):
   295  			case <-i.ctx.Done():
   296  				cancel()
   297  				return
   298  			}
   299  		}
   300  
   301  		select {
   302  		case <-i.ctx.Done():
   303  			cancel()
   304  			return
   305  		case fp, ok := <-fpChan:
   306  			if ok {
   307  				if fp.Err == nil {
   308  					i.handleFingerprint(fp)
   309  				} else {
   310  					i.logger.Warn("received fingerprint error from driver", "error", fp.Err)
   311  					i.handleFingerprintError()
   312  				}
   313  				continue
   314  			}
   315  
   316  			// avoid fingerprinting again if ctx and fpChan both close
   317  			if i.ctx.Err() != nil {
   318  				cancel()
   319  				return
   320  			}
   321  
   322  			// if the channel is closed attempt to open a new one
   323  			newFpChan, newCancel, err := i.dispenseFingerprintCh()
   324  			if err != nil {
   325  				i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry)
   326  				i.handleFingerprintError()
   327  
   328  				// Calculate the new backoff
   329  				backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
   330  				if backoff > driverFPBackoffLimit {
   331  					backoff = driverFPBackoffLimit
   332  				}
   333  				// Increment retry counter
   334  				retry++
   335  				continue
   336  			}
   337  			cancel()
   338  			fpChan = newFpChan
   339  			cancel = newCancel
   340  
   341  			// Reset backoff
   342  			backoff = 0
   343  			retry = 0
   344  		}
   345  	}
   346  }
   347  
   348  // handleFingerprintError is called when an error occurred while fingerprinting
   349  // and will set the driver to unhealthy
   350  func (i *instanceManager) handleFingerprintError() {
   351  	di := &structs.DriverInfo{
   352  		Healthy:           false,
   353  		HealthDescription: "failed to fingerprint driver",
   354  		UpdateTime:        time.Now(),
   355  	}
   356  	i.updateNodeFromDriver(i.id.Name, di)
   357  }
   358  
   359  // handleFingerprint updates the node with the current fingerprint status
   360  func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) {
   361  	attrs := make(map[string]string, len(fp.Attributes))
   362  	for key, attr := range fp.Attributes {
   363  		attrs[key] = attr.GoString()
   364  	}
   365  	di := &structs.DriverInfo{
   366  		Attributes:        attrs,
   367  		Detected:          fp.Health != drivers.HealthStateUndetected,
   368  		Healthy:           fp.Health == drivers.HealthStateHealthy,
   369  		HealthDescription: fp.HealthDescription,
   370  		UpdateTime:        time.Now(),
   371  	}
   372  	i.updateNodeFromDriver(i.id.Name, di)
   373  
   374  	// log detected/undetected state changes after the initial fingerprint
   375  	i.lastHealthStateMu.Lock()
   376  	if i.hasFingerprinted {
   377  		if i.lastHealthState != fp.Health {
   378  			i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription)
   379  		}
   380  	}
   381  	i.lastHealthState = fp.Health
   382  	i.lastHealthStateMu.Unlock()
   383  
   384  	// if this is the first fingerprint, mark that we have received it
   385  	if !i.hasFingerprinted {
   386  		i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription)
   387  		close(i.firstFingerprintCh)
   388  		i.hasFingerprinted = true
   389  	}
   390  }
   391  
   392  // getLastHealth returns the most recent HealthState from fingerprinting
   393  func (i *instanceManager) getLastHealth() drivers.HealthState {
   394  	i.lastHealthStateMu.Lock()
   395  	defer i.lastHealthStateMu.Unlock()
   396  	return i.lastHealthState
   397  }
   398  
   399  // dispenseTaskEventsCh dispenses a driver plugin and makes a TaskEvents RPC.
   400  // The TaskEvent chan and cancel func for the RPC is return. The cancel func must
   401  // be called by the caller to properly cleanup the context
   402  func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) {
   403  	driver, err := i.dispense()
   404  	if err != nil {
   405  		return nil, nil, err
   406  	}
   407  
   408  	ctx, cancel := context.WithCancel(i.ctx)
   409  	eventsCh, err := driver.TaskEvents(ctx)
   410  	if err != nil {
   411  		cancel()
   412  		return nil, nil, err
   413  	}
   414  
   415  	return eventsCh, cancel, nil
   416  }
   417  
   418  // handleEvents is the main loop that receives task events from the driver
   419  func (i *instanceManager) handleEvents() {
   420  	eventsCh, cancel, err := i.dispenseTaskEventsCh()
   421  	if err != nil {
   422  		i.logger.Error("failed to dispense driver", "error", err)
   423  	}
   424  
   425  	var backoff time.Duration
   426  	var retry int
   427  	for {
   428  		if backoff > 0 {
   429  			select {
   430  			case <-time.After(backoff):
   431  			case <-i.ctx.Done():
   432  				cancel()
   433  				return
   434  			}
   435  		}
   436  
   437  		select {
   438  		case <-i.ctx.Done():
   439  			cancel()
   440  			return
   441  		case ev, ok := <-eventsCh:
   442  			if ok {
   443  				i.handleEvent(ev)
   444  				continue
   445  			}
   446  
   447  			// if the channel is closed attempt to open a new one
   448  			newEventsChan, newCancel, err := i.dispenseTaskEventsCh()
   449  			if err != nil {
   450  				i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry)
   451  
   452  				// Calculate the new backoff
   453  				backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
   454  				if backoff > driverFPBackoffLimit {
   455  					backoff = driverFPBackoffLimit
   456  				}
   457  				retry++
   458  				continue
   459  			}
   460  			cancel()
   461  			eventsCh = newEventsChan
   462  			cancel = newCancel
   463  
   464  			// Reset backoff
   465  			backoff = 0
   466  			retry = 0
   467  		}
   468  	}
   469  }
   470  
   471  // handleEvent looks up the event handler(s) for the event and runs them
   472  func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) {
   473  	// Do not emit that the plugin is shutdown
   474  	if ev.Err != nil && ev.Err == bstructs.ErrPluginShutdown {
   475  		return
   476  	}
   477  
   478  	if handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName); handler != nil {
   479  		i.logger.Trace("task event received", "event", ev)
   480  		handler(ev)
   481  		return
   482  	}
   483  
   484  	i.logger.Warn("no handler registered for event", "event", ev)
   485  }