github.com/bigcommerce/nomad@v0.9.3-bc/client/pluginmanager/drivermanager/manager.go (about)

     1  package drivermanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  
     8  	log "github.com/hashicorp/go-hclog"
     9  	plugin "github.com/hashicorp/go-plugin"
    10  	"github.com/hashicorp/nomad/client/pluginmanager"
    11  	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
    12  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  	"github.com/hashicorp/nomad/plugins/base"
    15  	"github.com/hashicorp/nomad/plugins/drivers"
    16  	pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
    17  )
    18  
// ErrDriverNotFound is returned by Dispense when no instance manager is
// registered under the requested driver name.
var ErrDriverNotFound = fmt.Errorf("driver not found")
    22  
// Manager is the interface used to manage driver plugins. It embeds
// pluginmanager.PluginManager and adds the ability to dispense drivers by
// name.
type Manager interface {
	pluginmanager.PluginManager

	// Dispense returns a drivers.DriverPlugin for the given driver plugin
	// name, handling reattaching to an existing driver if available.
	Dispense(driver string) (drivers.DriverPlugin, error)
}
    31  
// TaskExecHandler is a function to be called for executing commands in a
// task. It receives the command to run, whether a tty was requested and the
// stream used for the exec session, and returns an error on failure.
type TaskExecHandler func(
	ctx context.Context,
	command []string,
	tty bool,
	stream drivers.ExecTaskStream) error
    38  
// EventHandler is a callback invoked with a task event.
// Implementations should not block execution.
type EventHandler func(*drivers.TaskEvent)
    42  
// TaskEventHandlerFactory returns the EventHandler to use for the task
// identified by the given allocation ID and task name.
type TaskEventHandlerFactory func(allocID, taskName string) EventHandler
    45  
// StateStorage is used to persist the driver manager's state across
// agent restarts.
type StateStorage interface {
	// GetDriverPluginState is used to retrieve the driver manager's plugin
	// state. Callers in this file tolerate a nil state with a nil error.
	GetDriverPluginState() (*state.PluginState, error)

	// PutDriverPluginState is used to store the driver manager's plugin
	// state.
	PutDriverPluginState(state *state.PluginState) error
}
    57  
// UpdateNodeDriverInfoFn is the callback used to update the node from
// fingerprinting. The string argument is presumably the driver name and the
// DriverInfo its latest fingerprint result -- confirm against the instance
// manager that invokes it.
type UpdateNodeDriverInfoFn func(string, *structs.DriverInfo)
    61  
// StorePluginReattachFn is used to store a plugin reattachment configuration.
// The implementation supplied by the manager treats a nil config as a request
// to remove the stored entry.
type StorePluginReattachFn func(*plugin.ReattachConfig) error
    64  
// FetchPluginReattachFn is used to retrieve the stored plugin reattachment
// configuration. The boolean result reports whether a usable configuration
// was found.
type FetchPluginReattachFn func() (*plugin.ReattachConfig, bool)
    68  
// Config is used to configure a driver manager
type Config struct {
	// Logger is the logger used by the driver manager. It must be non-nil:
	// New calls Named on it.
	Logger log.Logger

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// Updater is used to update the node when driver information changes
	Updater UpdateNodeDriverInfoFn

	// EventHandlerFactory is used to retrieve a task event handler
	EventHandlerFactory TaskEventHandlerFactory

	// State is used to persist and restore the driver manager's state
	State StateStorage

	// AllowedDrivers, if non-empty, restricts the manager to starting only
	// the named driver plugins; every other driver is treated as blocked.
	AllowedDrivers map[string]struct{}

	// BlockedDrivers lists driver plugins that must never be started.
	BlockedDrivers map[string]struct{}
}
    96  
// manager is used to manage a set of driver plugins
type manager struct {
	// logger is the logger used by the driver manager
	logger log.Logger

	// state is used to persist and restore the driver manager's state
	state StateStorage

	// ctx is used to shutdown the driver manager; cancel cancels it
	ctx    context.Context
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// updater is used to update the node when driver information changes
	updater UpdateNodeDriverInfoFn

	// eventHandlerFactory is passed to the instance managers and used to forward
	// task events
	eventHandlerFactory TaskEventHandlerFactory

	// instances maps a driver name to its instance manager; access is
	// guarded by instancesMu
	instances   map[string]*instanceManager
	instancesMu sync.RWMutex

	// reattachConfigs stores the plugin reattach configs keyed by plugin ID;
	// access is guarded by reattachConfigLock
	reattachConfigs    map[loader.PluginID]*pstructs.ReattachConfig
	reattachConfigLock sync.Mutex

	// allow/block lists consulted by isDriverBlocked
	allowedDrivers map[string]struct{}
	blockedDrivers map[string]struct{}

	// readyCh is closed at the end of Run() once the instance managers have
	// been created. It is left open when Run exits early because the catalog
	// holds no driver plugins.
	readyCh chan struct{}
}
   137  
   138  // New returns a new driver manager
   139  func New(c *Config) *manager {
   140  	ctx, cancel := context.WithCancel(context.Background())
   141  	return &manager{
   142  		logger:              c.Logger.Named("driver_mgr"),
   143  		state:               c.State,
   144  		ctx:                 ctx,
   145  		cancel:              cancel,
   146  		loader:              c.Loader,
   147  		pluginConfig:        c.PluginConfig,
   148  		updater:             c.Updater,
   149  		eventHandlerFactory: c.EventHandlerFactory,
   150  		instances:           make(map[string]*instanceManager),
   151  		reattachConfigs:     make(map[loader.PluginID]*pstructs.ReattachConfig),
   152  		allowedDrivers:      c.AllowedDrivers,
   153  		blockedDrivers:      c.BlockedDrivers,
   154  		readyCh:             make(chan struct{}),
   155  	}
   156  }
   157  
   158  // PluginType returns the type of plugin this manager mananges
   159  func (*manager) PluginType() string { return base.PluginTypeDriver }
   160  
   161  // Run starts the manager, initializes driver plugins and blocks until Shutdown
   162  // is called.
   163  func (m *manager) Run() {
   164  	// Load any previous plugin reattach configuration
   165  	if err := m.loadReattachConfigs(); err != nil {
   166  		m.logger.Warn("unable to load driver plugin reattach configs, a driver process may have been leaked",
   167  			"error", err)
   168  	}
   169  
   170  	// Get driver plugins
   171  	driversPlugins := m.loader.Catalog()[base.PluginTypeDriver]
   172  	if len(driversPlugins) == 0 {
   173  		m.logger.Debug("exiting since there are no driver plugins")
   174  		m.cancel()
   175  		return
   176  	}
   177  
   178  	var skippedDrivers []string
   179  	for _, d := range driversPlugins {
   180  		id := loader.PluginInfoID(d)
   181  		if m.isDriverBlocked(id.Name) {
   182  			skippedDrivers = append(skippedDrivers, id.Name)
   183  			continue
   184  		}
   185  
   186  		storeFn := func(c *plugin.ReattachConfig) error {
   187  			return m.storePluginReattachConfig(id, c)
   188  		}
   189  		fetchFn := func() (*plugin.ReattachConfig, bool) {
   190  			return m.fetchPluginReattachConfig(id)
   191  		}
   192  
   193  		instance := newInstanceManager(&instanceManagerConfig{
   194  			Logger:               m.logger,
   195  			Ctx:                  m.ctx,
   196  			Loader:               m.loader,
   197  			StoreReattach:        storeFn,
   198  			FetchReattach:        fetchFn,
   199  			PluginConfig:         m.pluginConfig,
   200  			ID:                   &id,
   201  			UpdateNodeFromDriver: m.updater,
   202  			EventHandlerFactory:  m.eventHandlerFactory,
   203  		})
   204  
   205  		m.instancesMu.Lock()
   206  		m.instances[id.Name] = instance
   207  		m.instancesMu.Unlock()
   208  	}
   209  
   210  	if len(skippedDrivers) > 0 {
   211  		m.logger.Debug("drivers skipped due to allow/block list", "skipped_drivers", skippedDrivers)
   212  	}
   213  
   214  	// signal ready
   215  	close(m.readyCh)
   216  }
   217  
   218  // Shutdown cleans up all the plugins
   219  func (m *manager) Shutdown() {
   220  	// Cancel the context to stop any requests
   221  	m.cancel()
   222  
   223  	m.instancesMu.RLock()
   224  	defer m.instancesMu.RUnlock()
   225  
   226  	// Go through and shut everything down
   227  	for _, i := range m.instances {
   228  		i.cleanup()
   229  	}
   230  }
   231  
   232  func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} {
   233  	ctx, cancel := context.WithCancel(ctx)
   234  	go m.waitForFirstFingerprint(ctx, cancel)
   235  	return ctx.Done()
   236  }
   237  
   238  func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.CancelFunc) {
   239  	defer cancel()
   240  	// We don't want to start initial fingerprint wait until Run loop has
   241  	// finished
   242  	select {
   243  	case <-m.readyCh:
   244  	case <-ctx.Done():
   245  		// parent context canceled or timedout
   246  		return
   247  	case <-m.ctx.Done():
   248  		// shutdown called
   249  		return
   250  	}
   251  
   252  	var mu sync.Mutex
   253  	driversByStatus := map[drivers.HealthState][]string{}
   254  
   255  	var wg sync.WaitGroup
   256  
   257  	recordDriver := func(name string, lastHeath drivers.HealthState) {
   258  		mu.Lock()
   259  		defer mu.Unlock()
   260  
   261  		updated := append(driversByStatus[lastHeath], name)
   262  		driversByStatus[lastHeath] = updated
   263  	}
   264  
   265  	// loop through instances and wait for each to finish initial fingerprint
   266  	m.instancesMu.RLock()
   267  	for n, i := range m.instances {
   268  		wg.Add(1)
   269  		go func(name string, instance *instanceManager) {
   270  			defer wg.Done()
   271  			instance.WaitForFirstFingerprint(ctx)
   272  			recordDriver(name, instance.getLastHealth())
   273  		}(n, i)
   274  	}
   275  	m.instancesMu.RUnlock()
   276  	wg.Wait()
   277  
   278  	m.logger.Debug("detected drivers", "drivers", driversByStatus)
   279  }
   280  
   281  func (m *manager) loadReattachConfigs() error {
   282  	m.reattachConfigLock.Lock()
   283  	defer m.reattachConfigLock.Unlock()
   284  
   285  	s, err := m.state.GetDriverPluginState()
   286  	if err != nil {
   287  		return err
   288  	}
   289  
   290  	if s != nil {
   291  		for name, c := range s.ReattachConfigs {
   292  			if m.isDriverBlocked(name) {
   293  				m.logger.Warn("reattach config for driver plugin found but driver is blocked due to allow/block list, killing plugin",
   294  					"driver", name)
   295  				m.shutdownBlockedDriver(name, c)
   296  				continue
   297  			}
   298  
   299  			id := loader.PluginID{
   300  				PluginType: base.PluginTypeDriver,
   301  				Name:       name,
   302  			}
   303  
   304  			m.reattachConfigs[id] = c
   305  		}
   306  	}
   307  	return nil
   308  }
   309  
   310  // shutdownBlockedDriver is used to forcefully shutdown a running driver plugin
   311  // when it has been blocked due to allow/block lists
   312  func (m *manager) shutdownBlockedDriver(name string, reattach *pstructs.ReattachConfig) {
   313  	c, err := pstructs.ReattachConfigToGoPlugin(reattach)
   314  	if err != nil {
   315  		m.logger.Warn("failed to reattach and kill blocked driver plugin",
   316  			"driver", name, "error", err)
   317  		return
   318  
   319  	}
   320  	pluginInstance, err := m.loader.Reattach(name, base.PluginTypeDriver, c)
   321  	if err != nil {
   322  		m.logger.Warn("failed to reattach and kill blocked driver plugin",
   323  			"driver", name, "error", err)
   324  		return
   325  	}
   326  
   327  	if !pluginInstance.Exited() {
   328  		pluginInstance.Kill()
   329  	}
   330  }
   331  
   332  // storePluginReattachConfig is used as a callback to the instance managers and
   333  // persists thhe plugin reattach configurations.
   334  func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error {
   335  	m.reattachConfigLock.Lock()
   336  	defer m.reattachConfigLock.Unlock()
   337  
   338  	if c == nil {
   339  		delete(m.reattachConfigs, id)
   340  	} else {
   341  		// Store the new reattach config
   342  		m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c)
   343  	}
   344  	// Persist the state
   345  	s := &state.PluginState{
   346  		ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)),
   347  	}
   348  
   349  	for id, c := range m.reattachConfigs {
   350  		s.ReattachConfigs[id.Name] = c
   351  	}
   352  
   353  	return m.state.PutDriverPluginState(s)
   354  }
   355  
   356  // fetchPluginReattachConfig is used as a callback to the instance managers and
   357  // retrieves the plugin reattach config. If it has not been stored it will
   358  // return nil
   359  func (m *manager) fetchPluginReattachConfig(id loader.PluginID) (*plugin.ReattachConfig, bool) {
   360  	m.reattachConfigLock.Lock()
   361  	defer m.reattachConfigLock.Unlock()
   362  
   363  	if cfg, ok := m.reattachConfigs[id]; ok {
   364  		c, err := pstructs.ReattachConfigToGoPlugin(cfg)
   365  		if err != nil {
   366  			m.logger.Warn("failed to read plugin reattach config", "config", cfg, "error", err)
   367  			delete(m.reattachConfigs, id)
   368  			return nil, false
   369  		}
   370  		return c, true
   371  	}
   372  	return nil, false
   373  }
   374  
   375  func (m *manager) Dispense(d string) (drivers.DriverPlugin, error) {
   376  	m.instancesMu.RLock()
   377  	defer m.instancesMu.RUnlock()
   378  	if instance, ok := m.instances[d]; ok {
   379  		return instance.dispense()
   380  	}
   381  
   382  	return nil, ErrDriverNotFound
   383  }
   384  
   385  func (m *manager) isDriverBlocked(name string) bool {
   386  	// Block drivers that are not in the allowed list if it is set.
   387  	if _, ok := m.allowedDrivers[name]; len(m.allowedDrivers) > 0 && !ok {
   388  		return true
   389  	}
   390  
   391  	// Block drivers that are in the blocked list
   392  	if _, ok := m.blockedDrivers[name]; ok {
   393  		return true
   394  	}
   395  	return false
   396  }