github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/pluginmanager/drivermanager/instance.go (about)

     1  package drivermanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    11  	"github.com/hashicorp/nomad/helper/pluginutils/singleton"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  	"github.com/hashicorp/nomad/plugins/base"
    14  	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
    15  	"github.com/hashicorp/nomad/plugins/drivers"
    16  )
    17  
    18  const (
    19  	// driverFPBackoffBaseline is the baseline time for exponential backoff while
    20  	// fingerprinting a driver.
    21  	driverFPBackoffBaseline = 5 * time.Second
    22  
    23  	// driverFPBackoffLimit is the limit of the exponential backoff for fingerprinting
    24  	// a driver.
    25  	driverFPBackoffLimit = 2 * time.Minute
    26  )
    27  
// instanceManagerConfig configures a driver instance manager. Every field is
// copied into the instanceManager by newInstanceManager.
type instanceManagerConfig struct {
	// Logger is the logger used by the driver instance manager. The manager
	// derives its own logger from it, tagged with the driver name.
	Logger log.Logger

	// Ctx is used to shutdown the driver instance manager. The manager derives
	// its own cancellable context from it.
	Ctx context.Context

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// StoreReattach is used to store a plugin's reattach config
	StoreReattach StorePluginReattachFn

	// FetchReattach is used to retrieve a plugin's reattach config
	FetchReattach FetchPluginReattachFn

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// ID is the ID of the plugin being managed
	ID *loader.PluginID

	// UpdateNodeFromDriver is the callback used to update the node from fingerprinting
	UpdateNodeFromDriver UpdateNodeDriverInfoFn

	// EventHandlerFactory is used to fetch a task event handler
	EventHandlerFactory TaskEventHandlerFactory
}
    57  
// instanceManager is used to manage a single driver plugin. It dispenses the
// plugin, consumes its fingerprint and task event streams, and shuts the
// plugin down once those loops have exited.
type instanceManager struct {
	// logger is the logger used by the driver instance manager
	logger log.Logger

	// ctx is used to shutdown the driver manager. It is derived from the
	// context supplied in the configuration.
	ctx context.Context

	// cancel is used to shutdown management of this driver plugin. It cancels
	// ctx and is invoked at the end of cleanup.
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// storeReattach is used to store a plugin's reattach config
	storeReattach StorePluginReattachFn

	// fetchReattach is used to retrieve a plugin's reattach config
	fetchReattach FetchPluginReattachFn

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// id is the ID of the plugin being managed
	id *loader.PluginID

	// plugin is the plugin instance being managed
	plugin loader.PluginInstance

	// driver is the driver plugin being managed
	driver drivers.DriverPlugin

	// pluginLock locks access to the driver and plugin. It is held for the
	// whole of dispense and cleanup.
	pluginLock sync.Mutex

	// shutdownLock is used to serialize attempts to shutdown. cleanup acquires
	// it before pluginLock.
	shutdownLock sync.Mutex

	// updateNodeFromDriver is the callback used to update the node from fingerprinting
	updateNodeFromDriver UpdateNodeDriverInfoFn

	// eventHandlerFactory is used to fetch a handler for a task event
	eventHandlerFactory TaskEventHandlerFactory

	// firstFingerprintCh is used to trigger that we have successfully
	// fingerprinted once. It is used to gate launching the stats collection.
	// It is closed by handleFingerprint on the first successful fingerprint.
	firstFingerprintCh chan struct{}
	// hasFingerprinted records that firstFingerprintCh has been closed. It is
	// only read and written by handleFingerprint.
	hasFingerprinted bool

	// lastHealthState is the last known health fingerprinted by the manager
	lastHealthState drivers.HealthState
	// lastHealthStateMu guards lastHealthState.
	lastHealthStateMu sync.Mutex
}
   111  
   112  // newInstanceManager returns a new driver instance manager. It is expected that
   113  // the context passed in the configuration is cancelled in order to shutdown
   114  // launched goroutines.
   115  func newInstanceManager(c *instanceManagerConfig) *instanceManager {
   116  
   117  	ctx, cancel := context.WithCancel(c.Ctx)
   118  	i := &instanceManager{
   119  		logger:               c.Logger.With("driver", c.ID.Name),
   120  		ctx:                  ctx,
   121  		cancel:               cancel,
   122  		loader:               c.Loader,
   123  		storeReattach:        c.StoreReattach,
   124  		fetchReattach:        c.FetchReattach,
   125  		pluginConfig:         c.PluginConfig,
   126  		id:                   c.ID,
   127  		updateNodeFromDriver: c.UpdateNodeFromDriver,
   128  		eventHandlerFactory:  c.EventHandlerFactory,
   129  		firstFingerprintCh:   make(chan struct{}),
   130  	}
   131  
   132  	go i.run()
   133  	return i
   134  }
   135  
   136  // WaitForFirstFingerprint waits until either the plugin fingerprints, the
   137  // passed context is done, or the plugin instance manager is shutdown.
   138  func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) {
   139  	select {
   140  	case <-i.ctx.Done():
   141  	case <-ctx.Done():
   142  	case <-i.firstFingerprintCh:
   143  	}
   144  }
   145  
   146  // run is a long lived goroutine that starts the fingerprinting and stats
   147  // collection goroutine and then shutsdown the plugin on exit.
   148  func (i *instanceManager) run() {
   149  	// Dispense once to ensure we are given a valid plugin
   150  	if _, err := i.dispense(); err != nil {
   151  		i.logger.Error("dispensing initial plugin failed", "error", err)
   152  		return
   153  	}
   154  
   155  	// Create a waitgroup to block on shutdown for all created goroutines to
   156  	// exit
   157  	var wg sync.WaitGroup
   158  
   159  	// Start the fingerprinter
   160  	wg.Add(1)
   161  	go func() {
   162  		i.fingerprint()
   163  		wg.Done()
   164  	}()
   165  
   166  	// Start event handler
   167  	wg.Add(1)
   168  	go func() {
   169  		i.handleEvents()
   170  		wg.Done()
   171  	}()
   172  
   173  	// Do a final cleanup
   174  	wg.Wait()
   175  	i.cleanup()
   176  }
   177  
   178  // dispense is used to dispense a plugin.
   179  func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) {
   180  	i.pluginLock.Lock()
   181  	defer i.pluginLock.Unlock()
   182  
   183  	// See if we already have a running instance
   184  	if i.plugin != nil && !i.plugin.Exited() {
   185  		return i.driver, nil
   186  	}
   187  
   188  	var pluginInstance loader.PluginInstance
   189  	dispenseFn := func() (loader.PluginInstance, error) {
   190  		return i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
   191  	}
   192  
   193  	if reattach, ok := i.fetchReattach(); ok {
   194  		// Reattach to existing plugin
   195  		pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach)
   196  
   197  		// If reattachment fails, get a new plugin instance
   198  		if err != nil {
   199  			i.logger.Warn("failed to reattach to plugin, starting new instance", "err", err)
   200  			pluginInstance, err = dispenseFn()
   201  		}
   202  	} else {
   203  		// Get an instance of the plugin
   204  		pluginInstance, err = dispenseFn()
   205  	}
   206  
   207  	if err != nil {
   208  		// Retry as the error just indicates the singleton has exited
   209  		if err == singleton.SingletonPluginExited {
   210  			pluginInstance, err = dispenseFn()
   211  		}
   212  
   213  		// If we still have an error there is a real problem
   214  		if err != nil {
   215  			return nil, fmt.Errorf("failed to start plugin: %v", err)
   216  		}
   217  	}
   218  
   219  	// Convert to a driver plugin
   220  	driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin)
   221  	if !ok {
   222  		pluginInstance.Kill()
   223  		return nil, fmt.Errorf("plugin loaded does not implement the driver interface")
   224  	}
   225  
   226  	// Store the plugin and driver
   227  	i.plugin = pluginInstance
   228  	i.driver = driver
   229  
   230  	// Store the reattach config
   231  	if c, ok := pluginInstance.ReattachConfig(); ok {
   232  		if err := i.storeReattach(c); err != nil {
   233  			i.logger.Error("error storing driver plugin reattach config", "error", err)
   234  		}
   235  	}
   236  
   237  	return driver, nil
   238  }
   239  
   240  // cleanup shutsdown the plugin
   241  func (i *instanceManager) cleanup() {
   242  	i.shutdownLock.Lock()
   243  	i.pluginLock.Lock()
   244  	defer i.pluginLock.Unlock()
   245  	defer i.shutdownLock.Unlock()
   246  
   247  	if i.plugin == nil {
   248  		return
   249  	}
   250  
   251  	if internalPlugin, ok := i.plugin.Plugin().(drivers.InternalDriverPlugin); ok {
   252  		internalPlugin.Shutdown()
   253  	}
   254  
   255  	if !i.plugin.Exited() {
   256  		i.plugin.Kill()
   257  		if err := i.storeReattach(nil); err != nil {
   258  			i.logger.Warn("error clearing plugin reattach config from state store", "error", err)
   259  		}
   260  	}
   261  
   262  	i.cancel()
   263  }
   264  
   265  // dispenseFingerprintCh dispenses a driver and makes a Fingerprint RPC call
   266  // to the driver. The fingerprint chan is returned along with the cancel func
   267  // for the context used in the RPC. This cancel func should always be called
   268  // when the caller is finished with the channel.
   269  func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) {
   270  	driver, err := i.dispense()
   271  	if err != nil {
   272  		return nil, nil, err
   273  	}
   274  
   275  	ctx, cancel := context.WithCancel(i.ctx)
   276  	fingerCh, err := driver.Fingerprint(ctx)
   277  	if err != nil {
   278  		cancel()
   279  		return nil, nil, err
   280  	}
   281  
   282  	return fingerCh, cancel, nil
   283  }
   284  
   285  // fingerprint is the main loop for fingerprinting.
   286  func (i *instanceManager) fingerprint() {
   287  	fpChan, cancel, err := i.dispenseFingerprintCh()
   288  	if err != nil {
   289  		i.logger.Error("failed to dispense driver plugin", "error", err)
   290  	}
   291  
   292  	// backoff and retry used if the RPC is closed by the other end
   293  	var backoff time.Duration
   294  	var retry int
   295  	for {
   296  		if backoff > 0 {
   297  			select {
   298  			case <-time.After(backoff):
   299  			case <-i.ctx.Done():
   300  				cancel()
   301  				return
   302  			}
   303  		}
   304  
   305  		select {
   306  		case <-i.ctx.Done():
   307  			cancel()
   308  			return
   309  		case fp, ok := <-fpChan:
   310  			if ok {
   311  				if fp.Err == nil {
   312  					i.handleFingerprint(fp)
   313  				} else {
   314  					i.logger.Warn("received fingerprint error from driver", "error", fp.Err)
   315  					i.handleFingerprintError()
   316  				}
   317  				continue
   318  			}
   319  
   320  			// if the channel is closed attempt to open a new one
   321  			newFpChan, newCancel, err := i.dispenseFingerprintCh()
   322  			if err != nil {
   323  				i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry)
   324  				i.handleFingerprintError()
   325  
   326  				// Calculate the new backoff
   327  				backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
   328  				if backoff > driverFPBackoffLimit {
   329  					backoff = driverFPBackoffLimit
   330  				}
   331  				// Increment retry counter
   332  				retry++
   333  				continue
   334  			}
   335  			cancel()
   336  			fpChan = newFpChan
   337  			cancel = newCancel
   338  
   339  			// Reset backoff
   340  			backoff = 0
   341  			retry = 0
   342  		}
   343  	}
   344  }
   345  
   346  // handleFingerprintError is called when an error occurred while fingerprinting
   347  // and will set the driver to unhealthy
   348  func (i *instanceManager) handleFingerprintError() {
   349  	di := &structs.DriverInfo{
   350  		Healthy:           false,
   351  		HealthDescription: "failed to fingerprint driver",
   352  		UpdateTime:        time.Now(),
   353  	}
   354  	i.updateNodeFromDriver(i.id.Name, di)
   355  }
   356  
   357  // handleFingerprint updates the node with the current fingerprint status
   358  func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) {
   359  	attrs := make(map[string]string, len(fp.Attributes))
   360  	for key, attr := range fp.Attributes {
   361  		attrs[key] = attr.GoString()
   362  	}
   363  	di := &structs.DriverInfo{
   364  		Attributes:        attrs,
   365  		Detected:          fp.Health != drivers.HealthStateUndetected,
   366  		Healthy:           fp.Health == drivers.HealthStateHealthy,
   367  		HealthDescription: fp.HealthDescription,
   368  		UpdateTime:        time.Now(),
   369  	}
   370  	i.updateNodeFromDriver(i.id.Name, di)
   371  
   372  	// log detected/undetected state changes after the initial fingerprint
   373  	i.lastHealthStateMu.Lock()
   374  	if i.hasFingerprinted {
   375  		if i.lastHealthState != fp.Health {
   376  			i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription)
   377  		}
   378  	}
   379  	i.lastHealthState = fp.Health
   380  	i.lastHealthStateMu.Unlock()
   381  
   382  	// if this is the first fingerprint, mark that we have received it
   383  	if !i.hasFingerprinted {
   384  		i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription)
   385  		close(i.firstFingerprintCh)
   386  		i.hasFingerprinted = true
   387  	}
   388  }
   389  
   390  // getLastHealth returns the most recent HealthState from fingerprinting
   391  func (i *instanceManager) getLastHealth() drivers.HealthState {
   392  	i.lastHealthStateMu.Lock()
   393  	defer i.lastHealthStateMu.Unlock()
   394  	return i.lastHealthState
   395  }
   396  
   397  // dispenseTaskEventsCh dispenses a driver plugin and makes a TaskEvents RPC.
   398  // The TaskEvent chan and cancel func for the RPC is return. The cancel func must
   399  // be called by the caller to properly cleanup the context
   400  func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) {
   401  	driver, err := i.dispense()
   402  	if err != nil {
   403  		return nil, nil, err
   404  	}
   405  
   406  	ctx, cancel := context.WithCancel(i.ctx)
   407  	eventsCh, err := driver.TaskEvents(ctx)
   408  	if err != nil {
   409  		cancel()
   410  		return nil, nil, err
   411  	}
   412  
   413  	return eventsCh, cancel, nil
   414  }
   415  
   416  // handleEvents is the main loop that receives task events from the driver
   417  func (i *instanceManager) handleEvents() {
   418  	eventsCh, cancel, err := i.dispenseTaskEventsCh()
   419  	if err != nil {
   420  		i.logger.Error("failed to dispense driver", "error", err)
   421  	}
   422  
   423  	var backoff time.Duration
   424  	var retry int
   425  	for {
   426  		if backoff > 0 {
   427  			select {
   428  			case <-time.After(backoff):
   429  			case <-i.ctx.Done():
   430  				cancel()
   431  				return
   432  			}
   433  		}
   434  
   435  		select {
   436  		case <-i.ctx.Done():
   437  			cancel()
   438  			return
   439  		case ev, ok := <-eventsCh:
   440  			if ok {
   441  				i.handleEvent(ev)
   442  				continue
   443  			}
   444  
   445  			// if the channel is closed attempt to open a new one
   446  			newEventsChan, newCancel, err := i.dispenseTaskEventsCh()
   447  			if err != nil {
   448  				i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry)
   449  
   450  				// Calculate the new backoff
   451  				backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
   452  				if backoff > driverFPBackoffLimit {
   453  					backoff = driverFPBackoffLimit
   454  				}
   455  				retry++
   456  				continue
   457  			}
   458  			cancel()
   459  			eventsCh = newEventsChan
   460  			cancel = newCancel
   461  
   462  			// Reset backoff
   463  			backoff = 0
   464  			retry = 0
   465  		}
   466  	}
   467  }
   468  
   469  // handleEvent looks up the event handler(s) for the event and runs them
   470  func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) {
   471  	// Do not emit that the plugin is shutdown
   472  	if ev.Err != nil && ev.Err == bstructs.ErrPluginShutdown {
   473  		return
   474  	}
   475  
   476  	if handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName); handler != nil {
   477  		i.logger.Trace("task event received", "event", ev)
   478  		handler(ev)
   479  		return
   480  	}
   481  
   482  	i.logger.Warn("no handler registered for event", "event", ev)
   483  }