github.com/smithx10/nomad@v0.9.1-rc1/client/pluginmanager/drivermanager/manager.go (about)

     1  package drivermanager
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  
     8  	log "github.com/hashicorp/go-hclog"
     9  	plugin "github.com/hashicorp/go-plugin"
    10  	"github.com/hashicorp/nomad/client/pluginmanager"
    11  	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
    12  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  	"github.com/hashicorp/nomad/plugins/base"
    15  	"github.com/hashicorp/nomad/plugins/drivers"
    16  	pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
    17  )
    18  
// ErrDriverNotFound is returned by Dispense when the manager has no instance
// registered under the requested driver name. Run only registers drivers that
// are present in the plugin catalog and not excluded by the allow/block
// lists, so a blocked driver yields this error as well.
var ErrDriverNotFound = fmt.Errorf("driver not found")
    22  
// Manager is the interface used to manage driver plugins. It embeds
// pluginmanager.PluginManager and adds the ability to dispense a driver by
// name.
type Manager interface {
	pluginmanager.PluginManager

	// Dispense returns a drivers.DriverPlugin for the given driver plugin name
	// handling reattaching to an existing driver if available. The manager
	// implementation in this file returns ErrDriverNotFound when no driver is
	// registered under that name.
	Dispense(driver string) (drivers.DriverPlugin, error)
}
    31  
// EventHandler is a callback that receives a *drivers.TaskEvent for a task.
// The handler should not block execution.
type EventHandler func(*drivers.TaskEvent)
    35  
// TaskEventHandlerFactory returns the EventHandler to use for the task
// identified by the given allocation ID and task name.
type TaskEventHandlerFactory func(allocID, taskName string) EventHandler
    38  
// StateStorage is used to persist the driver manager's state across
// agent restarts.
type StateStorage interface {
	// GetDriverPluginState is used to retrieve the driver manager's plugin
	// state. The manager treats a nil state with a nil error as "nothing
	// persisted yet".
	GetDriverPluginState() (*state.PluginState, error)

	// PutDriverPluginState is used to store the driver manager's plugin
	// state.
	PutDriverPluginState(state *state.PluginState) error
}
    50  
// UpdateNodeDriverInfoFn is the callback used to update the node from
// fingerprinting. It is handed to each instance manager as
// UpdateNodeFromDriver.
// NOTE(review): the string argument is presumably the driver name — confirm
// against the instance manager, which is not visible in this file.
type UpdateNodeDriverInfoFn func(string, *structs.DriverInfo)
    54  
// StorePluginReattachFn is used to store plugin reattachment configurations.
// The implementation supplied by the manager in this file treats a nil config
// as a request to remove the stored entry.
type StorePluginReattachFn func(*plugin.ReattachConfig) error
    57  
// FetchPluginReattachFn is used to retrieve the stored plugin reattachment
// configuration. The boolean result reports whether a usable configuration
// was found; the manager's implementation returns (nil, false) otherwise.
type FetchPluginReattachFn func() (*plugin.ReattachConfig, bool)
    61  
// Config is used to configure a driver manager
type Config struct {
	// Logger is the logger used by the driver manager. New derives a
	// sub-logger named "driver_mgr" from it.
	Logger log.Logger

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// Updater is used to update the node when driver information changes
	Updater UpdateNodeDriverInfoFn

	// EventHandlerFactory is used to retrieve a task event handler
	EventHandlerFactory TaskEventHandlerFactory

	// State is used to manage the driver manager's state
	State StateStorage

	// AllowedDrivers, if non-empty, restricts the manager to starting only the
	// driver plugins named in the set. An empty or nil set allows every
	// driver.
	AllowedDrivers map[string]struct{}

	// BlockedDrivers, if set, prevents the named driver plugins from
	// starting. A driver listed here is blocked even if it also appears in
	// AllowedDrivers.
	BlockedDrivers map[string]struct{}
}
    89  
// manager is used to manage a set of driver plugins
type manager struct {
	// logger is the logger used by the driver manager
	logger log.Logger

	// state is used to manage the driver manager's state
	state StateStorage

	// ctx is used to shutdown the driver manager; cancel cancels it and is
	// called by Shutdown and by Run when there are no driver plugins.
	ctx    context.Context
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// updater is used to update the node when driver information changes
	updater UpdateNodeDriverInfoFn

	// eventHandlerFactory is passed to the instance managers and used to forward
	// task events
	eventHandlerFactory TaskEventHandlerFactory

	// instances maps a driver name to its instance manager, access is
	// serialized by instancesMu
	instances   map[string]*instanceManager
	instancesMu sync.RWMutex

	// reattachConfigs stores the plugin reattach configs keyed by plugin ID,
	// access is serialized by reattachConfigLock
	reattachConfigs    map[loader.PluginID]*pstructs.ReattachConfig
	reattachConfigLock sync.Mutex

	// allow/block lists consulted by isDriverBlocked
	allowedDrivers map[string]struct{}
	blockedDrivers map[string]struct{}

	// readyCh is closed once at the end of Run(). It is not closed when Run
	// exits early because there are no driver plugins; in that case ctx is
	// cancelled instead.
	readyCh chan struct{}
}
   130  
   131  // New returns a new driver manager
   132  func New(c *Config) *manager {
   133  	ctx, cancel := context.WithCancel(context.Background())
   134  	return &manager{
   135  		logger:              c.Logger.Named("driver_mgr"),
   136  		state:               c.State,
   137  		ctx:                 ctx,
   138  		cancel:              cancel,
   139  		loader:              c.Loader,
   140  		pluginConfig:        c.PluginConfig,
   141  		updater:             c.Updater,
   142  		eventHandlerFactory: c.EventHandlerFactory,
   143  		instances:           make(map[string]*instanceManager),
   144  		reattachConfigs:     make(map[loader.PluginID]*pstructs.ReattachConfig),
   145  		allowedDrivers:      c.AllowedDrivers,
   146  		blockedDrivers:      c.BlockedDrivers,
   147  		readyCh:             make(chan struct{}),
   148  	}
   149  }
   150  
   151  // PluginType returns the type of plugin this manager mananges
   152  func (*manager) PluginType() string { return base.PluginTypeDriver }
   153  
   154  // Run starts the manager, initializes driver plugins and blocks until Shutdown
   155  // is called.
   156  func (m *manager) Run() {
   157  	// Load any previous plugin reattach configuration
   158  	if err := m.loadReattachConfigs(); err != nil {
   159  		m.logger.Warn("unable to load driver plugin reattach configs, a driver process may have been leaked",
   160  			"error", err)
   161  	}
   162  
   163  	// Get driver plugins
   164  	driversPlugins := m.loader.Catalog()[base.PluginTypeDriver]
   165  	if len(driversPlugins) == 0 {
   166  		m.logger.Debug("exiting since there are no driver plugins")
   167  		m.cancel()
   168  		return
   169  	}
   170  
   171  	var skippedDrivers []string
   172  	for _, d := range driversPlugins {
   173  		id := loader.PluginInfoID(d)
   174  		if m.isDriverBlocked(id.Name) {
   175  			skippedDrivers = append(skippedDrivers, id.Name)
   176  			continue
   177  		}
   178  
   179  		storeFn := func(c *plugin.ReattachConfig) error {
   180  			return m.storePluginReattachConfig(id, c)
   181  		}
   182  		fetchFn := func() (*plugin.ReattachConfig, bool) {
   183  			return m.fetchPluginReattachConfig(id)
   184  		}
   185  
   186  		instance := newInstanceManager(&instanceManagerConfig{
   187  			Logger:               m.logger,
   188  			Ctx:                  m.ctx,
   189  			Loader:               m.loader,
   190  			StoreReattach:        storeFn,
   191  			FetchReattach:        fetchFn,
   192  			PluginConfig:         m.pluginConfig,
   193  			ID:                   &id,
   194  			UpdateNodeFromDriver: m.updater,
   195  			EventHandlerFactory:  m.eventHandlerFactory,
   196  		})
   197  
   198  		m.instancesMu.Lock()
   199  		m.instances[id.Name] = instance
   200  		m.instancesMu.Unlock()
   201  	}
   202  
   203  	if len(skippedDrivers) > 0 {
   204  		m.logger.Debug("drivers skipped due to allow/block list", "skipped_drivers", skippedDrivers)
   205  	}
   206  
   207  	// signal ready
   208  	close(m.readyCh)
   209  }
   210  
   211  // Shutdown cleans up all the plugins
   212  func (m *manager) Shutdown() {
   213  	// Cancel the context to stop any requests
   214  	m.cancel()
   215  
   216  	m.instancesMu.RLock()
   217  	defer m.instancesMu.RUnlock()
   218  
   219  	// Go through and shut everything down
   220  	for _, i := range m.instances {
   221  		i.cleanup()
   222  	}
   223  }
   224  
   225  func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} {
   226  	ctx, cancel := context.WithCancel(ctx)
   227  	go m.waitForFirstFingerprint(ctx, cancel)
   228  	return ctx.Done()
   229  }
   230  
   231  func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.CancelFunc) {
   232  	defer cancel()
   233  	// We don't want to start initial fingerprint wait until Run loop has
   234  	// finished
   235  	select {
   236  	case <-m.readyCh:
   237  	case <-ctx.Done():
   238  		// parent context canceled or timedout
   239  		return
   240  	case <-m.ctx.Done():
   241  		// shutdown called
   242  		return
   243  	}
   244  
   245  	var mu sync.Mutex
   246  	driversByStatus := map[drivers.HealthState][]string{}
   247  
   248  	var wg sync.WaitGroup
   249  
   250  	recordDriver := func(name string, lastHeath drivers.HealthState) {
   251  		mu.Lock()
   252  		defer mu.Unlock()
   253  
   254  		updated := append(driversByStatus[lastHeath], name)
   255  		driversByStatus[lastHeath] = updated
   256  	}
   257  
   258  	// loop through instances and wait for each to finish initial fingerprint
   259  	m.instancesMu.RLock()
   260  	for n, i := range m.instances {
   261  		wg.Add(1)
   262  		go func(name string, instance *instanceManager) {
   263  			defer wg.Done()
   264  			instance.WaitForFirstFingerprint(ctx)
   265  			recordDriver(name, instance.getLastHealth())
   266  		}(n, i)
   267  	}
   268  	m.instancesMu.RUnlock()
   269  	wg.Wait()
   270  
   271  	m.logger.Debug("detected drivers", "drivers", driversByStatus)
   272  }
   273  
   274  func (m *manager) loadReattachConfigs() error {
   275  	m.reattachConfigLock.Lock()
   276  	defer m.reattachConfigLock.Unlock()
   277  
   278  	s, err := m.state.GetDriverPluginState()
   279  	if err != nil {
   280  		return err
   281  	}
   282  
   283  	if s != nil {
   284  		for name, c := range s.ReattachConfigs {
   285  			if m.isDriverBlocked(name) {
   286  				m.logger.Warn("reattach config for driver plugin found but driver is blocked due to allow/block list, killing plugin",
   287  					"driver", name)
   288  				m.shutdownBlockedDriver(name, c)
   289  				continue
   290  			}
   291  
   292  			id := loader.PluginID{
   293  				PluginType: base.PluginTypeDriver,
   294  				Name:       name,
   295  			}
   296  
   297  			m.reattachConfigs[id] = c
   298  		}
   299  	}
   300  	return nil
   301  }
   302  
   303  // shutdownBlockedDriver is used to forcefully shutdown a running driver plugin
   304  // when it has been blocked due to allow/block lists
   305  func (m *manager) shutdownBlockedDriver(name string, reattach *pstructs.ReattachConfig) {
   306  	c, err := pstructs.ReattachConfigToGoPlugin(reattach)
   307  	if err != nil {
   308  		m.logger.Warn("failed to reattach and kill blocked driver plugin",
   309  			"driver", name, "error", err)
   310  		return
   311  
   312  	}
   313  	pluginInstance, err := m.loader.Reattach(name, base.PluginTypeDriver, c)
   314  	if err != nil {
   315  		m.logger.Warn("failed to reattach and kill blocked driver plugin",
   316  			"driver", name, "error", err)
   317  		return
   318  	}
   319  
   320  	if !pluginInstance.Exited() {
   321  		pluginInstance.Kill()
   322  	}
   323  }
   324  
   325  // storePluginReattachConfig is used as a callback to the instance managers and
   326  // persists thhe plugin reattach configurations.
   327  func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error {
   328  	m.reattachConfigLock.Lock()
   329  	defer m.reattachConfigLock.Unlock()
   330  
   331  	if c == nil {
   332  		delete(m.reattachConfigs, id)
   333  	} else {
   334  		// Store the new reattach config
   335  		m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c)
   336  	}
   337  	// Persist the state
   338  	s := &state.PluginState{
   339  		ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)),
   340  	}
   341  
   342  	for id, c := range m.reattachConfigs {
   343  		s.ReattachConfigs[id.Name] = c
   344  	}
   345  
   346  	return m.state.PutDriverPluginState(s)
   347  }
   348  
   349  // fetchPluginReattachConfig is used as a callback to the instance managers and
   350  // retrieves the plugin reattach config. If it has not been stored it will
   351  // return nil
   352  func (m *manager) fetchPluginReattachConfig(id loader.PluginID) (*plugin.ReattachConfig, bool) {
   353  	m.reattachConfigLock.Lock()
   354  	defer m.reattachConfigLock.Unlock()
   355  
   356  	if cfg, ok := m.reattachConfigs[id]; ok {
   357  		c, err := pstructs.ReattachConfigToGoPlugin(cfg)
   358  		if err != nil {
   359  			m.logger.Warn("failed to read plugin reattach config", "config", cfg, "error", err)
   360  			delete(m.reattachConfigs, id)
   361  			return nil, false
   362  		}
   363  		return c, true
   364  	}
   365  	return nil, false
   366  }
   367  
   368  func (m *manager) Dispense(d string) (drivers.DriverPlugin, error) {
   369  	m.instancesMu.RLock()
   370  	defer m.instancesMu.RUnlock()
   371  	if instance, ok := m.instances[d]; ok {
   372  		return instance.dispense()
   373  	}
   374  
   375  	return nil, ErrDriverNotFound
   376  }
   377  
   378  func (m *manager) isDriverBlocked(name string) bool {
   379  	// Block drivers that are not in the allowed list if it is set.
   380  	if _, ok := m.allowedDrivers[name]; len(m.allowedDrivers) > 0 && !ok {
   381  		return true
   382  	}
   383  
   384  	// Block drivers that are in the blocked list
   385  	if _, ok := m.blockedDrivers[name]; ok {
   386  		return true
   387  	}
   388  	return false
   389  }