github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/devicemanager/instance.go (about)

     1  package devicemanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	multierror "github.com/hashicorp/go-multierror"
    11  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    12  	"github.com/hashicorp/nomad/helper/pluginutils/singleton"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  	"github.com/hashicorp/nomad/plugins/base"
    15  	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
    16  	"github.com/hashicorp/nomad/plugins/device"
    17  )
    18  
const (
	// statsBackoffBaseline is the baseline time for exponential backoff while
	// collecting device stats. It is the delay used for the first retry after
	// a stats error; every further consecutive error multiplies the delay by
	// four.
	statsBackoffBaseline = 5 * time.Second

	// statsBackoffLimit is the limit of the exponential backoff for collecting
	// device statistics. Computed delays above this value are clamped to it.
	statsBackoffLimit = 30 * time.Minute
)
    28  
// instanceManagerConfig configures a device instance manager. Every field is
// copied into the instanceManager by newInstanceManager.
type instanceManagerConfig struct {
	// Logger is the logger used by the device instance manager. The manager
	// derives a child logger from it tagged with the plugin name.
	Logger log.Logger

	// Ctx is used to shutdown the device instance manager. The manager wraps
	// it in its own cancellable context.
	Ctx context.Context

	// Loader is the plugin loader used to dispense the device plugin.
	Loader loader.PluginCatalog

	// StoreReattach is used to store a plugin's reattach config. It is called
	// with nil when the plugin is killed during cleanup.
	StoreReattach StorePluginReattachFn

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// Id is the ID of the plugin being managed
	Id *loader.PluginID

	// FingerprintOutCh is used to emit new fingerprinted devices. Sends on it
	// are non-blocking, so a notification is dropped when no receiver is ready
	// and the channel has no free buffer space.
	FingerprintOutCh chan<- struct{}

	// StatsInterval is the interval at which we collect statistics.
	StatsInterval time.Duration
}
    55  
// instanceManager is used to manage a single device plugin. It owns the
// goroutines that fingerprint the plugin's devices and collect their
// statistics, and it kills the plugin once those goroutines have exited.
type instanceManager struct {
	// logger is the logger used by the device instance manager
	logger log.Logger

	// ctx is used to shutdown the device manager. It is derived from the
	// context supplied in the configuration.
	ctx context.Context

	// cancel is used to shutdown management of this device plugin. It cancels
	// ctx and is invoked when dispensing or fingerprinting fails.
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// storeReattach is used to store a plugin's reattach config
	storeReattach StorePluginReattachFn

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// id is the ID of the plugin being managed
	id *loader.PluginID

	// fingerprintOutCh is used to emit new fingerprinted devices
	fingerprintOutCh chan<- struct{}

	// plugin is the plugin instance being managed
	plugin loader.PluginInstance

	// device is the device plugin being managed
	device device.DevicePlugin

	// pluginLock locks access to the device and plugin. It is held while
	// dispensing and while killing the plugin.
	pluginLock sync.Mutex

	// shutdownLock is used to serialize attempts to shutdown
	shutdownLock sync.Mutex

	// devices is the set of fingerprinted devices. It is guarded by
	// deviceLock.
	devices    []*device.DeviceGroup
	deviceLock sync.RWMutex

	// statsInterval is the interval at which we collect statistics.
	statsInterval time.Duration

	// deviceStats is the set of statistics objects per devices. It is guarded
	// by deviceStatsLock.
	deviceStats     []*device.DeviceGroupStats
	deviceStatsLock sync.RWMutex

	// firstFingerprintCh is used to trigger that we have successfully
	// fingerprinted once. It is used to gate launching the stats collection.
	// The channel is closed, never sent on. hasFingerprinted records that the
	// close has happened and is read and written under deviceLock.
	firstFingerprintCh chan struct{}
	hasFingerprinted   bool
}
   110  
   111  // newInstanceManager returns a new device instance manager. It is expected that
   112  // the context passed in the configuration is cancelled in order to shutdown
   113  // launched goroutines.
   114  func newInstanceManager(c *instanceManagerConfig) *instanceManager {
   115  
   116  	ctx, cancel := context.WithCancel(c.Ctx)
   117  	i := &instanceManager{
   118  		logger:             c.Logger.With("plugin", c.Id.Name),
   119  		ctx:                ctx,
   120  		cancel:             cancel,
   121  		loader:             c.Loader,
   122  		storeReattach:      c.StoreReattach,
   123  		pluginConfig:       c.PluginConfig,
   124  		id:                 c.Id,
   125  		fingerprintOutCh:   c.FingerprintOutCh,
   126  		statsInterval:      c.StatsInterval,
   127  		firstFingerprintCh: make(chan struct{}),
   128  	}
   129  
   130  	go i.run()
   131  	return i
   132  }
   133  
   134  // HasDevices returns if the instance is managing the passed devices
   135  func (i *instanceManager) HasDevices(d *structs.AllocatedDeviceResource) bool {
   136  	i.deviceLock.RLock()
   137  	defer i.deviceLock.RUnlock()
   138  
   139  OUTER:
   140  	for _, dev := range i.devices {
   141  		if dev.Name != d.Name || dev.Type != d.Type || dev.Vendor != d.Vendor {
   142  			continue
   143  		}
   144  
   145  		// Check that we have all the requested devices
   146  		ids := make(map[string]struct{}, len(dev.Devices))
   147  		for _, inst := range dev.Devices {
   148  			ids[inst.ID] = struct{}{}
   149  		}
   150  
   151  		for _, reqID := range d.DeviceIDs {
   152  			if _, ok := ids[reqID]; !ok {
   153  				continue OUTER
   154  			}
   155  		}
   156  
   157  		return true
   158  	}
   159  
   160  	return false
   161  }
   162  
   163  // AllStats returns all the device statistics returned by the device plugin.
   164  func (i *instanceManager) AllStats() []*device.DeviceGroupStats {
   165  	i.deviceStatsLock.RLock()
   166  	defer i.deviceStatsLock.RUnlock()
   167  	return i.deviceStats
   168  }
   169  
   170  // DeviceStats returns the device statistics for the request devices.
   171  func (i *instanceManager) DeviceStats(d *structs.AllocatedDeviceResource) *device.DeviceGroupStats {
   172  	i.deviceStatsLock.RLock()
   173  	defer i.deviceStatsLock.RUnlock()
   174  
   175  	// Find the device in question and then gather the instance statistics we
   176  	// are interested in
   177  	for _, group := range i.deviceStats {
   178  		if group.Vendor != d.Vendor || group.Type != d.Type || group.Name != d.Name {
   179  			continue
   180  		}
   181  
   182  		// We found the group we want so now grab the instance stats
   183  		out := &device.DeviceGroupStats{
   184  			Vendor:        d.Vendor,
   185  			Type:          d.Type,
   186  			Name:          d.Name,
   187  			InstanceStats: make(map[string]*device.DeviceStats, len(d.DeviceIDs)),
   188  		}
   189  
   190  		for _, id := range d.DeviceIDs {
   191  			out.InstanceStats[id] = group.InstanceStats[id]
   192  		}
   193  
   194  		return out
   195  	}
   196  
   197  	return nil
   198  }
   199  
   200  // Reserve reserves the given devices
   201  func (i *instanceManager) Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) {
   202  	// Get a device plugin
   203  	devicePlugin, err := i.dispense()
   204  	if err != nil {
   205  		i.logger.Error("dispensing plugin failed", "error", err)
   206  		return nil, err
   207  	}
   208  
   209  	// Send the reserve request
   210  	return devicePlugin.Reserve(d.DeviceIDs)
   211  }
   212  
   213  // Devices returns the detected devices.
   214  func (i *instanceManager) Devices() []*device.DeviceGroup {
   215  	i.deviceLock.RLock()
   216  	defer i.deviceLock.RUnlock()
   217  	return i.devices
   218  }
   219  
   220  // WaitForFirstFingerprint waits until either the plugin fingerprints, the
   221  // passed context is done, or the plugin instance manager is shutdown.
   222  func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) {
   223  	select {
   224  	case <-i.ctx.Done():
   225  	case <-ctx.Done():
   226  	case <-i.firstFingerprintCh:
   227  	}
   228  }
   229  
   230  // run is a long lived goroutine that starts the fingerprinting and stats
   231  // collection goroutine and then shutsdown the plugin on exit.
   232  func (i *instanceManager) run() {
   233  	// Dispense once to ensure we are given a valid plugin
   234  	if _, err := i.dispense(); err != nil {
   235  		i.logger.Error("dispensing initial plugin failed", "error", err)
   236  		return
   237  	}
   238  
   239  	// Create a waitgroup to block on shutdown for all created goroutines to
   240  	// exit
   241  	var wg sync.WaitGroup
   242  
   243  	// Start the fingerprinter
   244  	wg.Add(1)
   245  	go func() {
   246  		i.fingerprint()
   247  		wg.Done()
   248  	}()
   249  
   250  	// Wait for a valid result before starting stats collection
   251  	select {
   252  	case <-i.ctx.Done():
   253  		goto DONE
   254  	case <-i.firstFingerprintCh:
   255  	}
   256  
   257  	// Start stats
   258  	wg.Add(1)
   259  	go func() {
   260  		i.collectStats()
   261  		wg.Done()
   262  	}()
   263  
   264  	// Do a final cleanup
   265  DONE:
   266  	wg.Wait()
   267  	i.cleanup()
   268  }
   269  
   270  // dispense is used to dispense a plugin.
   271  func (i *instanceManager) dispense() (plugin device.DevicePlugin, err error) {
   272  	i.pluginLock.Lock()
   273  	defer i.pluginLock.Unlock()
   274  
   275  	// See if we already have a running instance
   276  	if i.plugin != nil && !i.plugin.Exited() {
   277  		return i.device, nil
   278  	}
   279  
   280  	// Get an instance of the plugin
   281  	pluginInstance, err := i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
   282  	if err != nil {
   283  		// Retry as the error just indicates the singleton has exited
   284  		if err == singleton.SingletonPluginExited {
   285  			pluginInstance, err = i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
   286  		}
   287  
   288  		// If we still have an error there is a real problem
   289  		if err != nil {
   290  			return nil, fmt.Errorf("failed to start plugin: %v", err)
   291  		}
   292  	}
   293  
   294  	// Convert to a fingerprint plugin
   295  	device, ok := pluginInstance.Plugin().(device.DevicePlugin)
   296  	if !ok {
   297  		pluginInstance.Kill()
   298  		return nil, fmt.Errorf("plugin loaded does not implement the driver interface")
   299  	}
   300  
   301  	// Store the plugin and device
   302  	i.plugin = pluginInstance
   303  	i.device = device
   304  
   305  	// Store the reattach config
   306  	if c, ok := pluginInstance.ReattachConfig(); ok {
   307  		i.storeReattach(c)
   308  	}
   309  
   310  	return device, nil
   311  }
   312  
   313  // cleanup shutsdown the plugin
   314  func (i *instanceManager) cleanup() {
   315  	i.shutdownLock.Lock()
   316  	i.pluginLock.Lock()
   317  	defer i.pluginLock.Unlock()
   318  	defer i.shutdownLock.Unlock()
   319  
   320  	if i.plugin != nil && !i.plugin.Exited() {
   321  		i.plugin.Kill()
   322  		i.storeReattach(nil)
   323  	}
   324  }
   325  
   326  // fingerprint is a long lived routine used to fingerprint the device
   327  func (i *instanceManager) fingerprint() {
   328  START:
   329  	// Get a device plugin
   330  	devicePlugin, err := i.dispense()
   331  	if err != nil {
   332  		i.logger.Error("dispensing plugin failed", "error", err)
   333  		i.cancel()
   334  		return
   335  	}
   336  
   337  	// Start fingerprinting
   338  	fingerprintCh, err := devicePlugin.Fingerprint(i.ctx)
   339  	if err != nil {
   340  		i.logger.Error("fingerprinting failed", "error", err)
   341  		i.handleFingerprintError()
   342  		return
   343  	}
   344  
   345  	var fresp *device.FingerprintResponse
   346  	var ok bool
   347  	for {
   348  		select {
   349  		case <-i.ctx.Done():
   350  			return
   351  		case fresp, ok = <-fingerprintCh:
   352  		}
   353  
   354  		if !ok {
   355  			i.logger.Trace("exiting since fingerprinting gracefully shutdown")
   356  			i.handleFingerprintError()
   357  			return
   358  		}
   359  
   360  		// Guard against error by the plugin
   361  		if fresp == nil {
   362  			continue
   363  		}
   364  
   365  		// Handle any errors
   366  		if fresp.Error != nil {
   367  			if fresp.Error == bstructs.ErrPluginShutdown {
   368  				i.logger.Error("plugin exited unexpectedly")
   369  				goto START
   370  			}
   371  
   372  			i.logger.Error("fingerprinting returned an error", "error", err)
   373  			i.handleFingerprintError()
   374  			return
   375  		}
   376  
   377  		if err := i.handleFingerprint(fresp); err != nil {
   378  			// Cancel the context so we cleanup all goroutines
   379  			i.logger.Error("returned devices failed fingerprinting", "error", err)
   380  			i.handleFingerprintError()
   381  		}
   382  	}
   383  }
   384  
   385  // handleFingerprintError exits the manager and shutsdown the plugin.
   386  func (i *instanceManager) handleFingerprintError() {
   387  	// Clear out the devices and trigger a node update
   388  	i.deviceLock.Lock()
   389  	defer i.deviceLock.Unlock()
   390  
   391  	// If we have fingerprinted before clear it out
   392  	if i.hasFingerprinted {
   393  		// Store the new devices
   394  		i.devices = nil
   395  
   396  		// Trigger that the we have new devices
   397  		select {
   398  		case i.fingerprintOutCh <- struct{}{}:
   399  		default:
   400  		}
   401  	}
   402  
   403  	// Cancel the context so we cleanup all goroutines
   404  	i.cancel()
   405  
   406  	return
   407  }
   408  
   409  // handleFingerprint stores the new devices and triggers the fingerprint output
   410  // channel. An error is returned if the passed devices don't pass validation.
   411  func (i *instanceManager) handleFingerprint(f *device.FingerprintResponse) error {
   412  	// If no devices are returned then there is nothing to do.
   413  	if f.Devices == nil {
   414  		return nil
   415  	}
   416  
   417  	// Validate the received devices
   418  	var validationErr multierror.Error
   419  	for i, d := range f.Devices {
   420  		if err := d.Validate(); err != nil {
   421  			multierror.Append(&validationErr, multierror.Prefix(err, fmt.Sprintf("device group %d: ", i)))
   422  		}
   423  	}
   424  
   425  	if err := validationErr.ErrorOrNil(); err != nil {
   426  		return err
   427  	}
   428  
   429  	i.deviceLock.Lock()
   430  	defer i.deviceLock.Unlock()
   431  
   432  	// Store the new devices
   433  	i.devices = f.Devices
   434  
   435  	// Mark that we have received data
   436  	if !i.hasFingerprinted {
   437  		close(i.firstFingerprintCh)
   438  		i.hasFingerprinted = true
   439  	}
   440  
   441  	// Trigger that we have data to pull
   442  	select {
   443  	case i.fingerprintOutCh <- struct{}{}:
   444  	default:
   445  	}
   446  
   447  	return nil
   448  }
   449  
   450  // collectStats is a long lived goroutine for collecting device statistics. It
   451  // handles errors by backing off exponentially and retrying.
   452  func (i *instanceManager) collectStats() {
   453  	attempt := 0
   454  
   455  START:
   456  	// Get a device plugin
   457  	devicePlugin, err := i.dispense()
   458  	if err != nil {
   459  		i.logger.Error("dispensing plugin failed", "error", err)
   460  		i.cancel()
   461  		return
   462  	}
   463  
   464  	// Start stats collection
   465  	statsCh, err := devicePlugin.Stats(i.ctx, i.statsInterval)
   466  	if err != nil {
   467  		i.logger.Error("stats collection failed", "error", err)
   468  		return
   469  	}
   470  
   471  	var sresp *device.StatsResponse
   472  	var ok bool
   473  	for {
   474  		select {
   475  		case <-i.ctx.Done():
   476  			return
   477  		case sresp, ok = <-statsCh:
   478  		}
   479  
   480  		if !ok {
   481  			i.logger.Trace("exiting since stats gracefully shutdown")
   482  			return
   483  		}
   484  
   485  		// Guard against error by the plugin
   486  		if sresp == nil {
   487  			continue
   488  		}
   489  
   490  		// Handle any errors
   491  		if sresp.Error != nil {
   492  			if sresp.Error == bstructs.ErrPluginShutdown {
   493  				i.logger.Error("plugin exited unexpectedly")
   494  				goto START
   495  			}
   496  
   497  			// Retry with an exponential backoff
   498  			backoff := (1 << (2 * uint64(attempt))) * statsBackoffBaseline
   499  			if backoff > statsBackoffLimit {
   500  				backoff = statsBackoffLimit
   501  			}
   502  			attempt++
   503  
   504  			i.logger.Error("stats returned an error", "error", err, "retry", backoff)
   505  
   506  			select {
   507  			case <-i.ctx.Done():
   508  				return
   509  			case <-time.After(backoff):
   510  				goto START
   511  			}
   512  		}
   513  
   514  		// Reset the attempt since we got statistics
   515  		attempt = 0
   516  
   517  		// Store the new stats
   518  		if sresp.Groups != nil {
   519  			i.deviceStatsLock.Lock()
   520  			i.deviceStats = sresp.Groups
   521  			i.deviceStatsLock.Unlock()
   522  		}
   523  	}
   524  }