github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/devicemanager/manager.go (about)

     1  // Package devicemanager is used to manage device plugins
     2  package devicemanager
     3  
     4  import (
     5  	"context"
     6  	"fmt"
     7  	"sync"
     8  	"time"
     9  
    10  	log "github.com/hashicorp/go-hclog"
    11  	multierror "github.com/hashicorp/go-multierror"
    12  	plugin "github.com/hashicorp/go-plugin"
    13  	"github.com/hashicorp/nomad/client/devicemanager/state"
    14  	"github.com/hashicorp/nomad/client/pluginmanager"
    15  	"github.com/hashicorp/nomad/helper/pluginutils/loader"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  	"github.com/hashicorp/nomad/plugins/base"
    18  	"github.com/hashicorp/nomad/plugins/device"
    19  	pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
    20  )
    21  
// Manager is the interface used to manage device plugins. It embeds
// pluginmanager.PluginManager and adds device reservation and statistics
// retrieval on top of it.
type Manager interface {
	pluginmanager.PluginManager

	// Reserve is used to reserve a set of devices. It returns the container
	// reservation for the allocated devices, or an error.
	Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error)

	// AllStats is used to retrieve all the latest statistics for all devices.
	AllStats() []*device.DeviceGroupStats

	// DeviceStats returns the device statistics for the given device.
	DeviceStats(d *structs.AllocatedDeviceResource) (*device.DeviceGroupStats, error)
}
    35  
// StateStorage is used to persist the device manager's state across
// agent restarts.
type StateStorage interface {
	// GetDevicePluginState is used to retrieve the device manager's plugin
	// state.
	GetDevicePluginState() (*state.PluginState, error)

	// PutDevicePluginState is used to store the device manager's plugin
	// state.
	PutDevicePluginState(state *state.PluginState) error
}
    47  
// UpdateNodeDevicesFn is a callback for updating the set of devices on a node.
type UpdateNodeDevicesFn func(devices []*structs.NodeDeviceResource)
    50  
// StorePluginReattachFn is a callback used to store a plugin's reattachment
// configuration. It returns an error if the configuration could not be stored.
type StorePluginReattachFn func(*plugin.ReattachConfig) error
    53  
// Config is used to configure a device manager. New copies each field into
// the manager it builds.
type Config struct {
	// Logger is the logger used by the device manager
	Logger log.Logger

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// Updater is used to update the node when device information changes
	Updater UpdateNodeDevicesFn

	// StatsInterval is the interval at which to collect statistics
	StatsInterval time.Duration

	// State is used to manage the device manager's state
	State StateStorage
}
    74  
// manager is used to manage a set of device plugins
type manager struct {
	// logger is the logger used by the device manager
	logger log.Logger

	// state is used to manage the device manager's state
	state StateStorage

	// ctx is used to shutdown the device manager; cancel cancels ctx.
	ctx    context.Context
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// updater is used to update the node when device information changes
	updater UpdateNodeDevicesFn

	// statsInterval is the duration at which to collect statistics
	statsInterval time.Duration

	// fingerprintResCh is signaled when there are new devices. It is handed
	// to every instance manager as its FingerprintOutCh and drained by the
	// fingerprint loop.
	fingerprintResCh chan struct{}

	// instances holds the managed device plugin instances keyed by plugin ID
	instances map[loader.PluginID]*instanceManager

	// reattachConfigs stores the plugin reattach configs. It is read and
	// written while holding reattachConfigLock.
	reattachConfigs    map[loader.PluginID]*pstructs.ReattachConfig
	reattachConfigLock sync.Mutex
}
   109  
   110  // New returns a new device manager
   111  func New(c *Config) *manager {
   112  	ctx, cancel := context.WithCancel(context.Background())
   113  	return &manager{
   114  		logger:           c.Logger.Named("device_mgr"),
   115  		state:            c.State,
   116  		ctx:              ctx,
   117  		cancel:           cancel,
   118  		loader:           c.Loader,
   119  		pluginConfig:     c.PluginConfig,
   120  		updater:          c.Updater,
   121  		statsInterval:    c.StatsInterval,
   122  		instances:        make(map[loader.PluginID]*instanceManager),
   123  		reattachConfigs:  make(map[loader.PluginID]*pstructs.ReattachConfig),
   124  		fingerprintResCh: make(chan struct{}, 1),
   125  	}
   126  }
   127  
   128  // PluginType identifies this manager to the plugin manager and satisfies the PluginManager interface.
   129  func (*manager) PluginType() string { return base.PluginTypeDevice }
   130  
   131  // Run starts the device manager. The manager will shutdown any previously
   132  // launched plugin and then begin fingerprinting and stats collection on all new
   133  // device plugins.
   134  func (m *manager) Run() {
   135  	// Check if there are any plugins that didn't get cleanly shutdown before
   136  	// and if there are shut them down.
   137  	m.cleanupStalePlugins()
   138  
   139  	// Get device plugins
   140  	devices := m.loader.Catalog()[base.PluginTypeDevice]
   141  	if len(devices) == 0 {
   142  		m.logger.Debug("exiting since there are no device plugins")
   143  		m.cancel()
   144  		return
   145  	}
   146  
   147  	for _, d := range devices {
   148  		id := loader.PluginInfoID(d)
   149  		storeFn := func(c *plugin.ReattachConfig) error {
   150  			id := id
   151  			return m.storePluginReattachConfig(id, c)
   152  		}
   153  		m.instances[id] = newInstanceManager(&instanceManagerConfig{
   154  			Logger:           m.logger,
   155  			Ctx:              m.ctx,
   156  			Loader:           m.loader,
   157  			StoreReattach:    storeFn,
   158  			PluginConfig:     m.pluginConfig,
   159  			Id:               &id,
   160  			FingerprintOutCh: m.fingerprintResCh,
   161  			StatsInterval:    m.statsInterval,
   162  		})
   163  	}
   164  
   165  	// Now start the fingerprint handler
   166  	go m.fingerprint()
   167  }
   168  
   169  // fingerprint is the main fingerprint loop
   170  func (m *manager) fingerprint() {
   171  	for {
   172  		select {
   173  		case <-m.ctx.Done():
   174  			return
   175  		case <-m.fingerprintResCh:
   176  		}
   177  
   178  		// Collect the data
   179  		var fingerprinted []*device.DeviceGroup
   180  		for _, i := range m.instances {
   181  			fingerprinted = append(fingerprinted, i.Devices()...)
   182  		}
   183  
   184  		// Convert and update
   185  		out := make([]*structs.NodeDeviceResource, len(fingerprinted))
   186  		for i, f := range fingerprinted {
   187  			out[i] = convertDeviceGroup(f)
   188  		}
   189  
   190  		// Call the updater
   191  		m.updater(out)
   192  	}
   193  }
   194  
   195  // Shutdown cleans up all the plugins
   196  func (m *manager) Shutdown() {
   197  	// Cancel the context to stop any requests
   198  	m.cancel()
   199  
   200  	// Go through and shut everything down
   201  	for _, i := range m.instances {
   202  		i.cleanup()
   203  	}
   204  }
   205  
   206  func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} {
   207  	ctx, cancel := context.WithCancel(ctx)
   208  	go func() {
   209  		var wg sync.WaitGroup
   210  		for i := range m.instances {
   211  			wg.Add(1)
   212  			go func(instance *instanceManager) {
   213  				instance.WaitForFirstFingerprint(ctx)
   214  				wg.Done()
   215  			}(m.instances[i])
   216  		}
   217  		wg.Wait()
   218  		cancel()
   219  	}()
   220  	return ctx.Done()
   221  }
   222  
   223  // Reserve reserves the given allocated device. If the device is unknown, an
   224  // UnknownDeviceErr is returned.
   225  func (m *manager) Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) {
   226  	// Go through each plugin and see if it can reserve the resources
   227  	for _, i := range m.instances {
   228  		if !i.HasDevices(d) {
   229  			continue
   230  		}
   231  
   232  		// We found a match so reserve
   233  		return i.Reserve(d)
   234  	}
   235  
   236  	return nil, UnknownDeviceErrFromAllocated("failed to reserve devices", d)
   237  }
   238  
   239  // AllStats returns statistics for all the devices
   240  func (m *manager) AllStats() []*device.DeviceGroupStats {
   241  	// Go through each plugin and collect stats
   242  	var stats []*device.DeviceGroupStats
   243  	for _, i := range m.instances {
   244  		stats = append(stats, i.AllStats()...)
   245  	}
   246  
   247  	return stats
   248  }
   249  
   250  // DeviceStats returns the statistics for the passed devices. If the device is unknown, an
   251  // UnknownDeviceErr is returned.
   252  func (m *manager) DeviceStats(d *structs.AllocatedDeviceResource) (*device.DeviceGroupStats, error) {
   253  	// Go through each plugin and see if it has the requested devices
   254  	for _, i := range m.instances {
   255  		if !i.HasDevices(d) {
   256  			continue
   257  		}
   258  
   259  		// We found a match so reserve
   260  		return i.DeviceStats(d), nil
   261  	}
   262  
   263  	return nil, UnknownDeviceErrFromAllocated("failed to collect statistics", d)
   264  }
   265  
   266  // cleanupStalePlugins reads the device managers state and shuts down any
   267  // previously launched plugin.
   268  func (m *manager) cleanupStalePlugins() error {
   269  
   270  	// Read the old plugin state
   271  	s, err := m.state.GetDevicePluginState()
   272  	if err != nil {
   273  		return fmt.Errorf("failed to read plugin state: %v", err)
   274  	}
   275  
   276  	// No state was stored so there is nothing to do.
   277  	if s == nil {
   278  		return nil
   279  	}
   280  
   281  	// For each plugin go through and try to shut it down
   282  	var mErr multierror.Error
   283  	for name, c := range s.ReattachConfigs {
   284  		rc, err := pstructs.ReattachConfigToGoPlugin(c)
   285  		if err != nil {
   286  			multierror.Append(&mErr, fmt.Errorf("failed to convert reattach config: %v", err))
   287  			continue
   288  		}
   289  
   290  		instance, err := m.loader.Reattach(name, base.PluginTypeDevice, rc)
   291  		if err != nil {
   292  			multierror.Append(&mErr, fmt.Errorf("failed to reattach to plugin %q: %v", name, err))
   293  			continue
   294  		}
   295  
   296  		// Kill the instance
   297  		instance.Kill()
   298  	}
   299  
   300  	return mErr.ErrorOrNil()
   301  }
   302  
   303  // storePluginReattachConfig is used as a callback to the instance managers and
   304  // persists thhe plugin reattach configurations.
   305  func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error {
   306  	m.reattachConfigLock.Lock()
   307  	defer m.reattachConfigLock.Unlock()
   308  
   309  	// Store the new reattach config
   310  	m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c)
   311  
   312  	// Persist the state
   313  	s := &state.PluginState{
   314  		ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)),
   315  	}
   316  
   317  	for id, c := range m.reattachConfigs {
   318  		s.ReattachConfigs[id.Name] = c
   319  	}
   320  
   321  	return m.state.PutDevicePluginState(s)
   322  }