
     1  package client
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"net"
     7  	"net/rpc"
     8  	"os"
     9  	"path/filepath"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"time"
    16  	metrics ""
    17  	consulapi ""
    18  	""
    19  	hclog ""
    20  	multierror ""
    21  	""
    22  	""
    23  	""
    24  	arstate ""
    25  	""
    26  	""
    27  	consulApi ""
    28  	""
    29  	""
    30  	""
    31  	""
    32  	""
    33  	""
    34  	""
    35  	""
    36  	""
    37  	cstructs ""
    38  	""
    39  	""
    40  	""
    41  	""
    42  	hstats ""
    43  	""
    44  	""
    45  	""
    46  	nconfig ""
    47  	""
    48  	""
    49  	""
    50  	vaultapi ""
    51  	""
    52  	""
    53  )
    55  const (
    56  	// clientRPCCache controls how long we keep an idle connection
    57  	// open to a server
    58  	clientRPCCache = 5 * time.Minute
    60  	// clientMaxStreams controls how many idle streams we keep
    61  	// open to a server
    62  	clientMaxStreams = 2
    64  	// datacenterQueryLimit searches through up to this many adjacent
    65  	// datacenters looking for the Nomad server service.
    66  	datacenterQueryLimit = 9
    68  	// registerRetryIntv is minimum interval on which we retry
    69  	// registration. We pick a value between this and 2x this.
    70  	registerRetryIntv = 15 * time.Second
    72  	// getAllocRetryIntv is minimum interval on which we retry
    73  	// to fetch allocations. We pick a value between this and 2x this.
    74  	getAllocRetryIntv = 30 * time.Second
    76  	// devModeRetryIntv is the retry interval used for development
    77  	devModeRetryIntv = time.Second
    79  	// stateSnapshotIntv is how often the client snapshots state
    80  	stateSnapshotIntv = 60 * time.Second
    82  	// initialHeartbeatStagger is used to stagger the interval between
    83  	// starting and the initial heartbeat. After the initial heartbeat,
    84  	// we switch to using the TTL specified by the servers.
    85  	initialHeartbeatStagger = 10 * time.Second
    87  	// nodeUpdateRetryIntv is how often the client checks for updates to the
    88  	// node attributes or meta map.
    89  	nodeUpdateRetryIntv = 5 * time.Second
    91  	// allocSyncIntv is the batching period of allocation updates before they
    92  	// are synced with the server.
    93  	allocSyncIntv = 200 * time.Millisecond
    95  	// allocSyncRetryIntv is the interval on which we retry updating
    96  	// the status of the allocation
    97  	allocSyncRetryIntv = 5 * time.Second
    99  	// defaultConnectSidecarImage is the image set in the node meta by default
   100  	// to be used by Consul Connect sidecar tasks
   101  	// Update sidecar_task.html when updating this.
   102  	defaultConnectSidecarImage = "envoyproxy/envoy:v1.11.2@sha256:a7769160c9c1a55bb8d07a3b71ce5d64f72b1f665f10d81aa1581bc3cf850d09"
   104  	// defaultConnectLogLevel is the log level set in the node meta by default
   105  	// to be used by Consul Connect sidecar tasks
   106  	defaultConnectLogLevel = "info"
   107  )
   109  var (
   110  	// grace period to allow for batch fingerprint processing
   111  	batchFirstFingerprintsProcessingGrace = batchFirstFingerprintsTimeout + 5*time.Second
   112  )
   114  // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
   115  // Client
   116  type ClientStatsReporter interface {
   117  	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
   118  	// If it does not exist an error is reported.
   119  	GetAllocStats(allocID string) (interfaces.AllocStatsReporter, error)
   121  	// LatestHostStats returns the latest resource usage stats for the host
   122  	LatestHostStats() *stats.HostStats
   123  }
   125  // AllocRunner is the interface implemented by the core alloc runner.
   126  //TODO Create via factory to allow testing Client with mock AllocRunners.
   127  type AllocRunner interface {
   128  	Alloc() *structs.Allocation
   129  	AllocState() *arstate.State
   130  	Destroy()
   131  	Shutdown()
   132  	GetAllocDir() *allocdir.AllocDir
   133  	IsDestroyed() bool
   134  	IsMigrating() bool
   135  	IsWaiting() bool
   136  	Listener() *cstructs.AllocListener
   137  	Restore() error
   138  	Run()
   139  	StatsReporter() interfaces.AllocStatsReporter
   140  	Update(*structs.Allocation)
   141  	WaitCh() <-chan struct{}
   142  	DestroyCh() <-chan struct{}
   143  	ShutdownCh() <-chan struct{}
   144  	Signal(taskName, signal string) error
   145  	GetTaskEventHandler(taskName string) drivermanager.EventHandler
   146  	PersistState() error
   148  	RestartTask(taskName string, taskEvent *structs.TaskEvent) error
   149  	RestartAll(taskEvent *structs.TaskEvent) error
   151  	GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler
   152  	GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error)
   153  }
// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
type Client struct {
	// config is the configuration the client was created with; start is the
	// time at which the client was created.
	config *config.Config
	start  time.Time

	// stateDB is used to efficiently store client state.
	stateDB state.StateDB

	// configCopy is a copy of config that should be passed to alloc-runners.
	// It is accessed while holding configLock.
	configCopy *config.Config
	configLock sync.RWMutex

	// logger is the client's logger; rpcLogger is the sub-logger named
	// "rpc" derived from it.
	logger    hclog.InterceptLogger
	rpcLogger hclog.Logger

	// connPool is the pool of connections to the servers.
	connPool *pool.ConnPool

	// tlsWrap is used to wrap outbound connections using TLS. It should be
	// accessed using the lock.
	tlsWrap     tlsutil.RegionWrapper
	tlsWrapLock sync.RWMutex

	// servers is the list of nomad servers
	servers *servers.Manager

	// heartbeat related times for tracking how often to heartbeat
	heartbeatTTL    time.Duration
	haveHeartbeated bool
	heartbeatLock   sync.Mutex
	heartbeatStop   *heartbeatStop

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// triggerNodeUpdate triggers the client to mark the Node as changed and
	// update it.
	triggerNodeUpdate chan struct{}

	// triggerEmitNodeEvent sends an event and triggers the client to update the
	// server for the node event
	triggerEmitNodeEvent chan *structs.NodeEvent

	// rpcRetryCh is closed when an event occurs, such as server discovery or
	// a successful RPC, after which a retry should happen. Access should
	// only occur via the getter method.
	rpcRetryCh   chan struct{}
	rpcRetryLock sync.Mutex

	// allocs maps alloc IDs to their AllocRunner. This map includes all
	// AllocRunners - running and GC'd - until the server GCs them.
	allocs    map[string]AllocRunner
	allocLock sync.RWMutex

	// invalidAllocs is a map that tracks allocations that failed because
	// the client couldn't initialize alloc or task runners for them. This
	// can happen due to driver errors.
	invalidAllocs     map[string]struct{}
	invalidAllocsLock sync.Mutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulService is Nomad's custom Consul client for managing services
	// and checks.
	consulService consulApi.ConsulServiceAPI

	// consulCatalog is the subset of Consul's Catalog API Nomad uses.
	consulCatalog consul.CatalogAPI

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector

	// shutdown is true when the Client has been shutdown. Must hold
	// shutdownLock to access.
	shutdown bool

	// shutdownCh is closed to signal the Client is shutting down.
	shutdownCh chan struct{}

	shutdownLock sync.Mutex

	// shutdownGroup are goroutines that exit when shutdownCh is closed.
	// Shutdown() blocks on Wait() after closing shutdownCh.
	shutdownGroup group

	// tokensClient is Nomad Client's custom Consul client for requesting Consul
	// Service Identity tokens through Nomad Server.
	tokensClient consulApi.ServiceIdentityAPI

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// garbageCollector is used to garbage collect terminal allocations present
	// in the node automatically
	garbageCollector *AllocGarbageCollector

	// clientACLResolver holds the ACL resolution state
	clientACLResolver

	// rpcServer is used to serve RPCs by the local agent.
	rpcServer     *rpc.Server
	endpoints     rpcEndpoints
	streamingRpcs *structs.StreamingRpcRegistry

	// pluginManagers is the set of PluginManagers registered by the client
	pluginManagers *pluginmanager.PluginGroup

	// csimanager is responsible for managing csi plugins.
	csimanager csimanager.Manager

	// devicemanager is responsible for managing device plugins.
	devicemanager devicemanager.Manager

	// drivermanager is responsible for managing driver plugins
	drivermanager drivermanager.Manager

	// baseLabels are used when emitting tagged metrics. All client metrics will
	// have these tags, and optionally more.
	baseLabels []metrics.Label

	// batchNodeUpdates is used to batch initial updates to the node
	batchNodeUpdates *batchNodeUpdates

	// fpInitialized chan is closed when the first batch of fingerprints are
	// applied to the node and the server is updated
	fpInitialized chan struct{}

	// serversContactedCh is closed when GetClientAllocs and runAllocs have
	// successfully run once.
	serversContactedCh   chan struct{}
	serversContactedOnce sync.Once

	// dynamicRegistry provides access to plugins that are dynamically registered
	// with a nomad client. Currently only used for CSI.
	dynamicRegistry dynamicplugins.Registry
}
   294  var (
   295  	// noServersErr is returned by the RPC method when the client has no
   296  	// configured servers. This is used to trigger Consul discovery if
   297  	// enabled.
   298  	noServersErr = errors.New("no servers")
   299  )
   301  // NewClient is used to create a new client from the given configuration
   302  func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService consulApi.ConsulServiceAPI) (*Client, error) {
   303  	// Create the tls wrapper
   304  	var tlsWrap tlsutil.RegionWrapper
   305  	if cfg.TLSConfig.EnableRPC {
   306  		tw, err := tlsutil.NewTLSConfiguration(cfg.TLSConfig, true, true)
   307  		if err != nil {
   308  			return nil, err
   309  		}
   310  		tlsWrap, err = tw.OutgoingTLSWrapper()
   311  		if err != nil {
   312  			return nil, err
   313  		}
   314  	}
   316  	if cfg.StateDBFactory == nil {
   317  		cfg.StateDBFactory = state.GetStateDBFactory(cfg.DevMode)
   318  	}
   320  	// Create the logger
   321  	logger := cfg.Logger.ResetNamedIntercept("client")
   323  	// Create the client
   324  	c := &Client{
   325  		config:               cfg,
   326  		consulCatalog:        consulCatalog,
   327  		consulService:        consulService,
   328  		start:                time.Now(),
   329  		connPool:             pool.NewPool(logger, clientRPCCache, clientMaxStreams, tlsWrap),
   330  		tlsWrap:              tlsWrap,
   331  		streamingRpcs:        structs.NewStreamingRpcRegistry(),
   332  		logger:               logger,
   333  		rpcLogger:            logger.Named("rpc"),
   334  		allocs:               make(map[string]AllocRunner),
   335  		allocUpdates:         make(chan *structs.Allocation, 64),
   336  		shutdownCh:           make(chan struct{}),
   337  		triggerDiscoveryCh:   make(chan struct{}),
   338  		triggerNodeUpdate:    make(chan struct{}, 8),
   339  		triggerEmitNodeEvent: make(chan *structs.NodeEvent, 8),
   340  		fpInitialized:        make(chan struct{}),
   341  		invalidAllocs:        make(map[string]struct{}),
   342  		serversContactedCh:   make(chan struct{}),
   343  		serversContactedOnce: sync.Once{},
   344  	}
   346  	c.batchNodeUpdates = newBatchNodeUpdates(
   347  		c.updateNodeFromDriver,
   348  		c.updateNodeFromDevices,
   349  		c.updateNodeFromCSI,
   350  	)
   352  	// Initialize the server manager
   353  	c.servers = servers.New(c.logger, c.shutdownCh, c)
   355  	// Start server manager rebalancing go routine
   356  	go c.servers.Start()
   358  	// initialize the client
   359  	if err := c.init(); err != nil {
   360  		return nil, fmt.Errorf("failed to initialize client: %v", err)
   361  	}
   363  	// initialize the dynamic registry (needs to happen after init)
   364  	c.dynamicRegistry =
   365  		dynamicplugins.NewRegistry(c.stateDB, map[string]dynamicplugins.PluginDispenser{
   366  			dynamicplugins.PluginTypeCSIController: func(info *dynamicplugins.PluginInfo) (interface{}, error) {
   367  				return csi.NewClient(info.ConnectionInfo.SocketPath, logger.Named("csi_client").With("", info.Name, "plugin.type", "controller"))
   368  			},
   369  			dynamicplugins.PluginTypeCSINode: func(info *dynamicplugins.PluginInfo) (interface{}, error) {
   370  				return csi.NewClient(info.ConnectionInfo.SocketPath, logger.Named("csi_client").With("", info.Name, "plugin.type", "client"))
   371  			}, // TODO(tgross): refactor these dispenser constructors into csimanager to tidy it up
   372  		})
   374  	// Setup the clients RPC server
   375  	c.setupClientRpc()
   377  	// Initialize the ACL state
   378  	if err := c.clientACLResolver.init(); err != nil {
   379  		return nil, fmt.Errorf("failed to initialize ACL state: %v", err)
   380  	}
   382  	// Setup the node
   383  	if err := c.setupNode(); err != nil {
   384  		return nil, fmt.Errorf("node setup failed: %v", err)
   385  	}
   387  	// Store the config copy before restoring state but after it has been
   388  	// initialized.
   389  	c.configLock.Lock()
   390  	c.configCopy = c.config.Copy()
   391  	c.configLock.Unlock()
   393  	fingerprintManager := NewFingerprintManager(
   394  		c.configCopy.PluginSingletonLoader, c.GetConfig, c.configCopy.Node,
   395  		c.shutdownCh, c.updateNodeFromFingerprint, c.logger)
   397  	c.pluginManagers = pluginmanager.New(c.logger)
   399  	// Fingerprint the node and scan for drivers
   400  	if err := fingerprintManager.Run(); err != nil {
   401  		return nil, fmt.Errorf("fingerprinting failed: %v", err)
   402  	}
   404  	// Build the white/blacklists of drivers.
   405  	allowlistDrivers := cfg.ReadStringListToMap("driver.whitelist")
   406  	blocklistDrivers := cfg.ReadStringListToMap("driver.blacklist")
   408  	// Setup the csi manager
   409  	csiConfig := &csimanager.Config{
   410  		Logger:                c.logger,
   411  		DynamicRegistry:       c.dynamicRegistry,
   412  		UpdateNodeCSIInfoFunc: c.batchNodeUpdates.updateNodeFromCSI,
   413  		TriggerNodeEvent:      c.triggerNodeEvent,
   414  	}
   415  	csiManager := csimanager.New(csiConfig)
   416  	c.csimanager = csiManager
   417  	c.pluginManagers.RegisterAndRun(csiManager.PluginManager())
   419  	// Setup the driver manager
   420  	driverConfig := &drivermanager.Config{
   421  		Logger:              c.logger,
   422  		Loader:              c.configCopy.PluginSingletonLoader,
   423  		PluginConfig:        c.configCopy.NomadPluginConfig(),
   424  		Updater:             c.batchNodeUpdates.updateNodeFromDriver,
   425  		EventHandlerFactory: c.GetTaskEventHandler,
   426  		State:               c.stateDB,
   427  		AllowedDrivers:      allowlistDrivers,
   428  		BlockedDrivers:      blocklistDrivers,
   429  	}
   430  	drvManager := drivermanager.New(driverConfig)
   431  	c.drivermanager = drvManager
   432  	c.pluginManagers.RegisterAndRun(drvManager)
   434  	// Setup the device manager
   435  	devConfig := &devicemanager.Config{
   436  		Logger:        c.logger,
   437  		Loader:        c.configCopy.PluginSingletonLoader,
   438  		PluginConfig:  c.configCopy.NomadPluginConfig(),
   439  		Updater:       c.batchNodeUpdates.updateNodeFromDevices,
   440  		StatsInterval: c.configCopy.StatsCollectionInterval,
   441  		State:         c.stateDB,
   442  	}
   443  	devManager := devicemanager.New(devConfig)
   444  	c.devicemanager = devManager
   445  	c.pluginManagers.RegisterAndRun(devManager)
   447  	// Batching of initial fingerprints is done to reduce the number of node
   448  	// updates sent to the server on startup. This is the first RPC to the servers
   449  	go c.batchFirstFingerprints()
   451  	// create heartbeatStop. We go after the first attempt to connect to the server, so
   452  	// that our grace period for connection goes for the full time
   453  	c.heartbeatStop = newHeartbeatStop(c.getAllocRunner, batchFirstFingerprintsTimeout, logger, c.shutdownCh)
   455  	// Watch for disconnection, and heartbeatStopAllocs configured to have a maximum
   456  	// lifetime when out of touch with the server
   457  	go
   459  	// Add the stats collector
   460  	statsCollector := stats.NewHostStatsCollector(c.logger, c.config.AllocDir, c.devicemanager.AllStats)
   461  	c.hostStatsCollector = statsCollector
   463  	// Add the garbage collector
   464  	gcConfig := &GCConfig{
   465  		MaxAllocs:           cfg.GCMaxAllocs,
   466  		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
   467  		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
   468  		Interval:            cfg.GCInterval,
   469  		ParallelDestroys:    cfg.GCParallelDestroys,
   470  		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
   471  	}
   472  	c.garbageCollector = NewAllocGarbageCollector(c.logger, statsCollector, c, gcConfig)
   473  	go c.garbageCollector.Run()
   475  	// Set the preconfigured list of static servers
   476  	c.configLock.RLock()
   477  	if len(c.configCopy.Servers) > 0 {
   478  		if _, err := c.setServersImpl(c.configCopy.Servers, true); err != nil {
   479  			logger.Warn("none of the configured servers are valid", "error", err)
   480  		}
   481  	}
   482  	c.configLock.RUnlock()
   484  	// Setup Consul discovery if enabled
   485  	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
   486  		c.shutdownGroup.Go(c.consulDiscovery)
   487  		if c.servers.NumServers() == 0 {
   488  			// No configured servers; trigger discovery manually
   489  			c.triggerDiscoveryCh <- struct{}{}
   490  		}
   491  	}
   493  	if err := c.setupConsulTokenClient(); err != nil {
   494  		return nil, errors.Wrap(err, "failed to setup consul tokens client")
   495  	}
   497  	// Setup the vault client for token and secret renewals
   498  	if err := c.setupVaultClient(); err != nil {
   499  		return nil, fmt.Errorf("failed to setup vault client: %v", err)
   500  	}
   502  	// wait until drivers are healthy before restoring or registering with servers
   503  	select {
   504  	case <-c.Ready():
   505  	case <-time.After(batchFirstFingerprintsProcessingGrace):
   506  		logger.Warn("batch fingerprint operation timed out; proceeding to register with fingerprinted plugins so far")
   507  	}
   509  	// Register and then start heartbeating to the servers.
   510  	c.shutdownGroup.Go(c.registerAndHeartbeat)
   512  	// Restore the state
   513  	if err := c.restoreState(); err != nil {
   514  		logger.Error("failed to restore state", "error", err)
   515  		logger.Error("Nomad is unable to start due to corrupt state. "+
   516  			"The safest way to proceed is to manually stop running task processes "+
   517  			"and remove Nomad's state and alloc directories before "+
   518  			"restarting. Lost allocations will be rescheduled.",
   519  			"state_dir", c.config.StateDir, "alloc_dir", c.config.AllocDir)
   520  		logger.Error("Corrupt state is often caused by a bug. Please " +
   521  			"report as much information as possible to " +
   522  			"")
   523  		return nil, fmt.Errorf("failed to restore state")
   524  	}
   526  	// Begin periodic snapshotting of state.
   527  	c.shutdownGroup.Go(c.periodicSnapshot)
   529  	// Begin syncing allocations to the server
   530  	c.shutdownGroup.Go(c.allocSync)
   532  	// Start the client! Don't use the shutdownGroup as run handles
   533  	// shutdowns manually to prevent updates from being applied during
   534  	// shutdown.
   535  	go
   537  	// Start collecting stats
   538  	c.shutdownGroup.Go(c.emitStats)
   540  	c.logger.Info("started client", "node_id", c.NodeID())
   541  	return c, nil
   542  }
   544  // Ready returns a chan that is closed when the client is fully initialized
   545  func (c *Client) Ready() <-chan struct{} {
   546  	return c.fpInitialized
   547  }
   549  // init is used to initialize the client and perform any setup
   550  // needed before we begin starting its various components.
   551  func (c *Client) init() error {
   552  	// Ensure the state dir exists if we have one
   553  	if c.config.StateDir != "" {
   554  		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
   555  			return fmt.Errorf("failed creating state dir: %s", err)
   556  		}
   558  	} else {
   559  		// Otherwise make a temp directory to use.
   560  		p, err := ioutil.TempDir("", "NomadClient")
   561  		if err != nil {
   562  			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
   563  		}
   565  		p, err = filepath.EvalSymlinks(p)
   566  		if err != nil {
   567  			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
   568  		}
   570  		c.config.StateDir = p
   571  	}
   572  	c.logger.Info("using state directory", "state_dir", c.config.StateDir)
   574  	// Open the state database
   575  	db, err := c.config.StateDBFactory(c.logger, c.config.StateDir)
   576  	if err != nil {
   577  		return fmt.Errorf("failed to open state database: %v", err)
   578  	}
   580  	// Upgrade the state database
   581  	if err := db.Upgrade(); err != nil {
   582  		// Upgrade only returns an error on critical persistence
   583  		// failures in which an operator should intervene before the
   584  		// node is accessible. Upgrade drops and logs corrupt state it
   585  		// encounters, so failing to start the agent should be extremely
   586  		// rare.
   587  		return fmt.Errorf("failed to upgrade state database: %v", err)
   588  	}
   590  	c.stateDB = db
   592  	// Ensure the alloc dir exists if we have one
   593  	if c.config.AllocDir != "" {
   594  		if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil {
   595  			return fmt.Errorf("failed creating alloc dir: %s", err)
   596  		}
   597  	} else {
   598  		// Otherwise make a temp directory to use.
   599  		p, err := ioutil.TempDir("", "NomadClient")
   600  		if err != nil {
   601  			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
   602  		}
   604  		p, err = filepath.EvalSymlinks(p)
   605  		if err != nil {
   606  			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
   607  		}
   609  		// Change the permissions to have the execute bit
   610  		if err := os.Chmod(p, 0711); err != nil {
   611  			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
   612  		}
   614  		c.config.AllocDir = p
   615  	}
   617  	c.logger.Info("using alloc directory", "alloc_dir", c.config.AllocDir)
   618  	return nil
   619  }
   621  // reloadTLSConnections allows a client to reload its TLS configuration on the
   622  // fly
   623  func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error {
   624  	var tlsWrap tlsutil.RegionWrapper
   625  	if newConfig != nil && newConfig.EnableRPC {
   626  		tw, err := tlsutil.NewTLSConfiguration(newConfig, true, true)
   627  		if err != nil {
   628  			return err
   629  		}
   631  		twWrap, err := tw.OutgoingTLSWrapper()
   632  		if err != nil {
   633  			return err
   634  		}
   635  		tlsWrap = twWrap
   636  	}
   638  	// Store the new tls wrapper.
   639  	c.tlsWrapLock.Lock()
   640  	c.tlsWrap = tlsWrap
   641  	c.tlsWrapLock.Unlock()
   643  	// Keep the client configuration up to date as we use configuration values to
   644  	// decide on what type of connections to accept
   645  	c.configLock.Lock()
   646  	c.config.TLSConfig = newConfig
   647  	c.configLock.Unlock()
   649  	c.connPool.ReloadTLS(tlsWrap)
   651  	return nil
   652  }
   654  // Reload allows a client to reload its configuration on the fly
   655  func (c *Client) Reload(newConfig *config.Config) error {
   656  	shouldReloadTLS, err := tlsutil.ShouldReloadRPCConnections(c.config.TLSConfig, newConfig.TLSConfig)
   657  	if err != nil {
   658  		c.logger.Error("error parsing TLS configuration", "error", err)
   659  		return err
   660  	}
   662  	if shouldReloadTLS {
   663  		return c.reloadTLSConnections(newConfig.TLSConfig)
   664  	}
   666  	return nil
   667  }
   669  // Leave is used to prepare the client to leave the cluster
   670  func (c *Client) Leave() error {
   671  	// TODO
   672  	return nil
   673  }
   675  // GetConfig returns the config of the client
   676  func (c *Client) GetConfig() *config.Config {
   677  	c.configLock.Lock()
   678  	defer c.configLock.Unlock()
   679  	return c.configCopy
   680  }
   682  // Datacenter returns the datacenter for the given client
   683  func (c *Client) Datacenter() string {
   684  	return c.config.Node.Datacenter
   685  }
   687  // Region returns the region for the given client
   688  func (c *Client) Region() string {
   689  	return c.config.Region
   690  }
   692  // NodeID returns the node ID for the given client
   693  func (c *Client) NodeID() string {
   694  	return c.config.Node.ID
   695  }
   697  // secretNodeID returns the secret node ID for the given client
   698  func (c *Client) secretNodeID() string {
   699  	return c.config.Node.SecretID
   700  }
   702  // AuthToken returns the ACL token for client RPC authentication
   703  func (c *Client) AuthToken() string {
   704  	return c.config.Node.Token
   705  }
   707  // RPCMajorVersion returns the structs.ApiMajorVersion supported by the
   708  // client.
   709  func (c *Client) RPCMajorVersion() int {
   710  	return structs.ApiMajorVersion
   711  }
   713  // RPCMinorVersion returns the structs.ApiMinorVersion supported by the
   714  // client.
   715  func (c *Client) RPCMinorVersion() int {
   716  	return structs.ApiMinorVersion
   717  }
   719  // Shutdown is used to tear down the client
   720  func (c *Client) Shutdown() error {
   721  	c.shutdownLock.Lock()
   722  	defer c.shutdownLock.Unlock()
   724  	if c.shutdown {
   725  		c.logger.Info("already shutdown")
   726  		return nil
   727  	}
   728  	c.logger.Info("shutting down")
   730  	// Stop renewing tokens and secrets
   731  	if c.vaultClient != nil {
   732  		c.vaultClient.Stop()
   733  	}
   735  	// Stop Garbage collector
   736  	c.garbageCollector.Stop()
   738  	arGroup := group{}
   739  	if c.config.DevMode {
   740  		// In DevMode destroy all the running allocations.
   741  		for _, ar := range c.getAllocRunners() {
   742  			ar.Destroy()
   743  			arGroup.AddCh(ar.DestroyCh())
   744  		}
   745  	} else {
   746  		// In normal mode call shutdown
   747  		for _, ar := range c.getAllocRunners() {
   748  			ar.Shutdown()
   749  			arGroup.AddCh(ar.ShutdownCh())
   750  		}
   751  	}
   752  	arGroup.Wait()
   754  	// Shutdown the plugin managers
   755  	c.pluginManagers.Shutdown()
   757  	c.shutdown = true
   758  	close(c.shutdownCh)
   760  	// Must close connection pool to unblock alloc watcher
   761  	c.connPool.Shutdown()
   763  	// Wait for goroutines to stop
   764  	c.shutdownGroup.Wait()
   766  	// One final save state
   767  	c.saveState()
   768  	return c.stateDB.Close()
   769  }
   771  // Stats is used to return statistics for debugging and insight
   772  // for various sub-systems
   773  func (c *Client) Stats() map[string]map[string]string {
   774  	c.heartbeatLock.Lock()
   775  	defer c.heartbeatLock.Unlock()
   776  	stats := map[string]map[string]string{
   777  		"client": {
   778  			"node_id":         c.NodeID(),
   779  			"known_servers":   strings.Join(c.GetServers(), ","),
   780  			"num_allocations": strconv.Itoa(c.NumAllocs()),
   781  			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat())),
   782  			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
   783  		},
   784  		"runtime": hstats.RuntimeStats(),
   785  	}
   786  	return stats
   787  }
   789  // GetAlloc returns an allocation or an error.
   790  func (c *Client) GetAlloc(allocID string) (*structs.Allocation, error) {
   791  	ar, err := c.getAllocRunner(allocID)
   792  	if err != nil {
   793  		return nil, err
   794  	}
   796  	return ar.Alloc(), nil
   797  }
   799  // SignalAllocation sends a signal to the tasks within an allocation.
   800  // If the provided task is empty, then every allocation will be signalled.
   801  // If a task is provided, then only an exactly matching task will be signalled.
   802  func (c *Client) SignalAllocation(allocID, task, signal string) error {
   803  	ar, err := c.getAllocRunner(allocID)
   804  	if err != nil {
   805  		return err
   806  	}
   808  	return ar.Signal(task, signal)
   809  }
   811  // CollectAllocation garbage collects a single allocation on a node. Returns
   812  // true if alloc was found and garbage collected; otherwise false.
   813  func (c *Client) CollectAllocation(allocID string) bool {
   814  	return c.garbageCollector.Collect(allocID)
   815  }
   817  // CollectAllAllocs garbage collects all allocations on a node in the terminal
   818  // state
   819  func (c *Client) CollectAllAllocs() {
   820  	c.garbageCollector.CollectAll()
   821  }
   823  func (c *Client) RestartAllocation(allocID, taskName string) error {
   824  	ar, err := c.getAllocRunner(allocID)
   825  	if err != nil {
   826  		return err
   827  	}
   829  	event := structs.NewTaskEvent(structs.TaskRestartSignal).
   830  		SetRestartReason("User requested restart")
   832  	if taskName != "" {
   833  		return ar.RestartTask(taskName, event)
   834  	}
   836  	return ar.RestartAll(event)
   837  }
   839  // Node returns the locally registered node
   840  func (c *Client) Node() *structs.Node {
   841  	c.configLock.RLock()
   842  	defer c.configLock.RUnlock()
   843  	return c.configCopy.Node
   844  }
   846  // getAllocRunner returns an AllocRunner or an UnknownAllocation error if the
   847  // client has no runner for the given alloc ID.
   848  func (c *Client) getAllocRunner(allocID string) (AllocRunner, error) {
   849  	c.allocLock.RLock()
   850  	defer c.allocLock.RUnlock()
   852  	ar, ok := c.allocs[allocID]
   853  	if !ok {
   854  		return nil, structs.NewErrUnknownAllocation(allocID)
   855  	}
   857  	return ar, nil
   858  }
   860  // StatsReporter exposes the various APIs related resource usage of a Nomad
   861  // client
   862  func (c *Client) StatsReporter() ClientStatsReporter {
   863  	return c
   864  }
   866  func (c *Client) GetAllocStats(allocID string) (interfaces.AllocStatsReporter, error) {
   867  	ar, err := c.getAllocRunner(allocID)
   868  	if err != nil {
   869  		return nil, err
   870  	}
   871  	return ar.StatsReporter(), nil
   872  }
// LatestHostStats returns all the stats related to a Nomad client, as reported
// by the host stats collector's Stats method.
func (c *Client) LatestHostStats() *stats.HostStats {
	return c.hostStatsCollector.Stats()
}
   879  func (c *Client) LatestDeviceResourceStats(devices []*structs.AllocatedDeviceResource) []*device.DeviceGroupStats {
   880  	return c.computeAllocatedDeviceGroupStats(devices, c.LatestHostStats().DeviceStats)
   881  }
   883  func (c *Client) computeAllocatedDeviceGroupStats(devices []*structs.AllocatedDeviceResource, hostDeviceGroupStats []*device.DeviceGroupStats) []*device.DeviceGroupStats {
   884  	// basic optimization for the usual case
   885  	if len(devices) == 0 || len(hostDeviceGroupStats) == 0 {
   886  		return nil
   887  	}
   889  	// Build an index of allocated devices
   890  	adIdx := map[structs.DeviceIdTuple][]string{}
   892  	total := 0
   893  	for _, ds := range devices {
   894  		adIdx[*ds.ID()] = ds.DeviceIDs
   895  		total += len(ds.DeviceIDs)
   896  	}
   898  	// Collect allocated device stats from host stats
   899  	result := make([]*device.DeviceGroupStats, 0, len(adIdx))
   901  	for _, dg := range hostDeviceGroupStats {
   902  		k := structs.DeviceIdTuple{
   903  			Vendor: dg.Vendor,
   904  			Type:   dg.Type,
   905  			Name:   dg.Name,
   906  		}
   908  		allocatedDeviceIDs, ok := adIdx[k]
   909  		if !ok {
   910  			continue
   911  		}
   913  		rdgStats := &device.DeviceGroupStats{
   914  			Vendor:        dg.Vendor,
   915  			Type:          dg.Type,
   916  			Name:          dg.Name,
   917  			InstanceStats: map[string]*device.DeviceStats{},
   918  		}
   920  		for _, adID := range allocatedDeviceIDs {
   921  			deviceStats, ok := dg.InstanceStats[adID]
   922  			if !ok || deviceStats == nil {
   923  				c.logger.Warn("device not found in stats", "device_id", adID, "device_group_id", k)
   924  				continue
   925  			}
   927  			rdgStats.InstanceStats[adID] = deviceStats
   928  		}
   929  		result = append(result, rdgStats)
   930  	}
   932  	return result
   933  }
   935  // ValidateMigrateToken verifies that a token is for a specific client and
   936  // allocation, and has been created by a trusted party that has privileged
   937  // knowledge of the client's secret identifier
   938  func (c *Client) ValidateMigrateToken(allocID, migrateToken string) bool {
   939  	if !c.config.ACLEnabled {
   940  		return true
   941  	}
   943  	return structs.CompareMigrateToken(allocID, c.secretNodeID(), migrateToken)
   944  }
   946  // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
   947  func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
   948  	ar, err := c.getAllocRunner(allocID)
   949  	if err != nil {
   950  		return nil, err
   951  	}
   952  	return ar.GetAllocDir(), nil
   953  }
   955  // GetAllocState returns a copy of an allocation's state on this client. It
   956  // returns either an AllocState or an unknown allocation error.
   957  func (c *Client) GetAllocState(allocID string) (*arstate.State, error) {
   958  	ar, err := c.getAllocRunner(allocID)
   959  	if err != nil {
   960  		return nil, err
   961  	}
   963  	return ar.AllocState(), nil
   964  }
   966  // GetServers returns the list of nomad servers this client is aware of.
   967  func (c *Client) GetServers() []string {
   968  	endpoints := c.servers.GetServers()
   969  	res := make([]string, len(endpoints))
   970  	for i := range endpoints {
   971  		res[i] = endpoints[i].String()
   972  	}
   973  	sort.Strings(res)
   974  	return res
   975  }
// SetServers sets a new list of nomad servers to connect to. As long as one
// server is resolvable no error is returned. It calls setServersImpl with
// force set to false and returns its result: the number of servers that were
// set, and an error.
func (c *Client) SetServers(in []string) (int, error) {
	return c.setServersImpl(in, false)
}
   983  // setServersImpl sets a new list of nomad servers to connect to. If force is
   984  // set, we add the server to the internal serverlist even if the server could not
   985  // be pinged. An error is returned if no endpoints were valid when non-forcing.
   986  //
   987  // Force should be used when setting the servers from the initial configuration
   988  // since the server may be starting up in parallel and initial pings may fail.
   989  func (c *Client) setServersImpl(in []string, force bool) (int, error) {
   990  	var mu sync.Mutex
   991  	var wg sync.WaitGroup
   992  	var merr multierror.Error
   994  	endpoints := make([]*servers.Server, 0, len(in))
   995  	wg.Add(len(in))
   997  	for _, s := range in {
   998  		go func(srv string) {
   999  			defer wg.Done()
  1000  			addr, err := resolveServer(srv)
  1001  			if err != nil {
  1002  				mu.Lock()
  1003  				c.logger.Debug("ignoring server due to resolution error", "error", err, "server", srv)
  1004  				merr.Errors = append(merr.Errors, err)
  1005  				mu.Unlock()
  1006  				return
  1007  			}
  1009  			// Try to ping to check if it is a real server
  1010  			if err := c.Ping(addr); err != nil {
  1011  				mu.Lock()
  1012  				merr.Errors = append(merr.Errors, fmt.Errorf("Server at address %s failed ping: %v", addr, err))
  1013  				mu.Unlock()
  1015  				// If we are forcing the setting of the servers, inject it to
  1016  				// the serverlist even if we can't ping immediately.
  1017  				if !force {
  1018  					return
  1019  				}
  1020  			}
  1022  			mu.Lock()
  1023  			endpoints = append(endpoints, &servers.Server{Addr: addr})
  1024  			mu.Unlock()
  1025  		}(s)
  1026  	}
  1028  	wg.Wait()
  1030  	// Only return errors if no servers are valid
  1031  	if len(endpoints) == 0 {
  1032  		if len(merr.Errors) > 0 {
  1033  			return 0, merr.ErrorOrNil()
  1034  		}
  1035  		return 0, noServersErr
  1036  	}
  1038  	c.servers.SetServers(endpoints)
  1039  	return len(endpoints), nil
  1040  }
// restoreState is used to restore our state from the data dir
// If there are errors restoring a specific allocation it is marked
// as failed whenever possible.
//
// Nothing is restored in dev mode. The only error returned is a failure to
// read the allocations from the state DB; problems with individual
// allocations are logged and that allocation is skipped.
func (c *Client) restoreState() error {
	if c.config.DevMode {
		return nil
	}

	//XXX REMOVED! make a note in backward compat / upgrading doc
	// COMPAT: Remove in 0.7.0
	// 0.6.0 transitioned from individual state files to a single bolt-db.
	// The upgrade path is to:
	// Check if old state exists
	//   If so, restore from that and delete old state
	// Restore using state database

	// Restore allocations
	allocs, allocErrs, err := c.stateDB.GetAllAllocations()
	if err != nil {
		return err
	}

	// Allocations that could not be read are only logged for now.
	for allocID, err := range allocErrs {
		c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
		//TODO Cleanup
		// Try to clean up alloc dir
		// Remove boltdb entries?
		// Send to server with clientstatus=failed
	}

	// Load each alloc back
	for _, alloc := range allocs {

		// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
		// See hasLocalState for details.  Skipping suspicious allocs
		// now.  If allocs should be run, they will be started when the client
		// gets allocs from servers.
		if !c.hasLocalState(alloc) {
			c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
			continue
		}

		//XXX On Restore we give up on watching previous allocs because
		//    we need the local AllocRunners initialized first. We could
		//    add a second loop to initialize just the alloc watcher.
		prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
		prevAllocMigrator := allocwatcher.NoopPrevAlloc{}

		// The config copy is read under the config read lock while the
		// alloc runner configuration is assembled.
		c.configLock.RLock()
		arConf := &allocrunner.Config{
			Alloc:               alloc,
			Logger:              c.logger,
			ClientConfig:        c.configCopy,
			StateDB:             c.stateDB,
			StateUpdater:        c,
			DeviceStatsReporter: c,
			Consul:              c.consulService,
			ConsulSI:            c.tokensClient,
			Vault:               c.vaultClient,
			PrevAllocWatcher:    prevAllocWatcher,
			PrevAllocMigrator:   prevAllocMigrator,
			DynamicRegistry:     c.dynamicRegistry,
			CSIManager:          c.csimanager,
			DeviceManager:       c.devicemanager,
			DriverManager:       c.drivermanager,
			ServersContactedCh:  c.serversContactedCh,
			RPCClient:           c,
		}
		c.configLock.RUnlock()

		// If the runner cannot be created, record the alloc as invalid and
		// queue a failed update for it (see handleInvalidAllocs).
		ar, err := allocrunner.NewAllocRunner(arConf)
		if err != nil {
			c.logger.Error("error running alloc", "error", err, "alloc_id", alloc.ID)
			c.handleInvalidAllocs(alloc, err)
			continue
		}

		// Restore state
		if err := ar.Restore(); err != nil {
			c.logger.Error("error restoring alloc", "error", err, "alloc_id", alloc.ID)
			// Override the status of the alloc to failed
			ar.SetClientStatus(structs.AllocClientStatusFailed)
			// Destroy the alloc runner since this is a failed restore
			ar.Destroy()
			continue
		}

		// Maybe mark the alloc for halt on missing server heartbeats.
		// Allocs handled here are not added to c.allocs, so the loop at the
		// end of this function does not start them.
		if c.heartbeatStop.shouldStop(alloc) {
			err = c.heartbeatStop.stopAlloc(alloc.ID)
			if err != nil {
				c.logger.Error("error stopping alloc", "error", err, "alloc_id", alloc.ID)
			}
			continue
		}

		//XXX is this locking necessary?
		c.allocLock.Lock()
		c.allocs[alloc.ID] = ar
		c.allocLock.Unlock()

		c.heartbeatStop.allocHook(alloc)
	}

	// Start every alloc runner registered in c.allocs, each in its own
	// goroutine. Allocs skipped above were never registered.
	c.allocLock.Lock()
	for _, ar := range c.allocs {
		go ar.Run()
	}
	c.allocLock.Unlock()
	return nil
}
  1155  // hasLocalState returns true if we have any other associated state
  1156  // with alloc beyond the task itself
  1157  //
  1158  // Useful for detecting if a potentially completed alloc got resurrected
// after AR was destroyed.  In such cases, re-running the alloc can lead to
// unexpected reruns and may lead to process and task exhaustion on the node.
  1161  //
  1162  // The heuristic used here is an alloc is suspect if we see no other information
  1163  // and no other task/status info is found.
  1164  //
// Also, an alloc without any client state will not be restored correctly; there will
// be no task processes to reattach to, etc.  In such cases, the client should
// wait until it gets allocs from the server to launch them.
  1168  //
  1169  // See:
  1170  //  *
  1171  //  *
  1172  //
  1173  // COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
  1174  func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
  1175  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1176  	if tg == nil {
  1177  		// corrupt alloc?!
  1178  		return false
  1179  	}
  1181  	for _, task := range tg.Tasks {
  1182  		ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
  1183  		if ls != nil || tr != nil {
  1184  			return true
  1185  		}
  1186  	}
  1188  	return false
  1189  }
  1191  func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
  1192  	c.invalidAllocsLock.Lock()
  1193  	c.invalidAllocs[alloc.ID] = struct{}{}
  1194  	c.invalidAllocsLock.Unlock()
  1196  	// Mark alloc as failed so server can handle this
  1197  	failed := makeFailedAlloc(alloc, err)
  1198  	select {
  1199  	case c.allocUpdates <- failed:
  1200  	case <-c.shutdownCh:
  1201  	}
  1202  }
  1204  // saveState is used to snapshot our state into the data dir.
  1205  func (c *Client) saveState() error {
  1206  	var wg sync.WaitGroup
  1207  	var l sync.Mutex
  1208  	var mErr multierror.Error
  1209  	runners := c.getAllocRunners()
  1210  	wg.Add(len(runners))
  1212  	for id, ar := range runners {
  1213  		go func(id string, ar AllocRunner) {
  1214  			err := ar.PersistState()
  1215  			if err != nil {
  1216  				c.logger.Error("error saving alloc state", "error", err, "alloc_id", id)
  1217  				l.Lock()
  1218  				multierror.Append(&mErr, err)
  1219  				l.Unlock()
  1220  			}
  1221  			wg.Done()
  1222  		}(id, ar)
  1223  	}
  1225  	wg.Wait()
  1226  	return mErr.ErrorOrNil()
  1227  }
  1229  // getAllocRunners returns a snapshot of the current set of alloc runners.
  1230  func (c *Client) getAllocRunners() map[string]AllocRunner {
  1231  	c.allocLock.RLock()
  1232  	defer c.allocLock.RUnlock()
  1233  	runners := make(map[string]AllocRunner, len(c.allocs))
  1234  	for id, ar := range c.allocs {
  1235  		runners[id] = ar
  1236  	}
  1237  	return runners
  1238  }
  1240  // NumAllocs returns the number of un-GC'd allocs this client has. Used to
  1241  // fulfill the AllocCounter interface for the GC.
  1242  func (c *Client) NumAllocs() int {
  1243  	n := 0
  1244  	c.allocLock.RLock()
  1245  	for _, a := range c.allocs {
  1246  		if !a.IsDestroyed() {
  1247  			n++
  1248  		}
  1249  	}
  1250  	c.allocLock.RUnlock()
  1251  	return n
  1252  }
  1254  // nodeID restores, or generates if necessary, a unique node ID and SecretID.
  1255  // The node ID is, if available, a persistent unique ID.  The secret ID is a
  1256  // high-entropy random UUID.
  1257  func (c *Client) nodeID() (id, secret string, err error) {
  1258  	var hostID string
  1259  	hostInfo, err := host.Info()
  1260  	if !c.config.NoHostUUID && err == nil {
  1261  		if hashed, ok := helper.HashUUID(hostInfo.HostID); ok {
  1262  			hostID = hashed
  1263  		}
  1264  	}
  1266  	if hostID == "" {
  1267  		// Generate a random hostID if no constant ID is available on
  1268  		// this platform.
  1269  		hostID = uuid.Generate()
  1270  	}
  1272  	// Do not persist in dev mode
  1273  	if c.config.DevMode {
  1274  		return hostID, uuid.Generate(), nil
  1275  	}
  1277  	// Attempt to read existing ID
  1278  	idPath := filepath.Join(c.config.StateDir, "client-id")
  1279  	idBuf, err := ioutil.ReadFile(idPath)
  1280  	if err != nil && !os.IsNotExist(err) {
  1281  		return "", "", err
  1282  	}
  1284  	// Attempt to read existing secret ID
  1285  	secretPath := filepath.Join(c.config.StateDir, "secret-id") // TODO: override this from command args
  1286  	secretBuf, err := ioutil.ReadFile(secretPath)
  1287  	if err != nil && !os.IsNotExist(err) {
  1288  		return "", "", err
  1289  	}
  1291  	// Use existing ID if any
  1292  	if len(idBuf) != 0 {
  1293  		id = strings.ToLower(string(idBuf))
  1294  	} else {
  1295  		id = hostID
  1297  		// Persist the ID
  1298  		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
  1299  			return "", "", err
  1300  		}
  1301  	}
  1303  	if len(secretBuf) != 0 {
  1304  		secret = string(secretBuf)
  1305  	} else {
  1306  		// Generate new ID
  1307  		secret = uuid.Generate()
  1309  		// Persist the ID
  1310  		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
  1311  			return "", "", err
  1312  		}
  1313  	}
  1315  	return id, secret, nil
  1316  }
  1318  // setupNode is used to setup the initial node
  1319  func (c *Client) setupNode() error {
  1320  	node := c.config.Node
  1321  	if node == nil {
  1322  		node = &structs.Node{}
  1323  		c.config.Node = node
  1324  	}
  1325  	// Generate an ID and secret for the node
  1326  	id, secretID, err := c.nodeID()
  1327  	if err != nil {
  1328  		return fmt.Errorf("node ID setup failed: %v", err)
  1329  	}
  1331  	node.ID = id
  1332  	node.SecretID = secretID
  1333  	if node.Attributes == nil {
  1334  		node.Attributes = make(map[string]string)
  1335  	}
  1336  	if node.Links == nil {
  1337  		node.Links = make(map[string]string)
  1338  	}
  1339  	if node.Drivers == nil {
  1340  		node.Drivers = make(map[string]*structs.DriverInfo)
  1341  	}
  1342  	if node.CSIControllerPlugins == nil {
  1343  		node.CSIControllerPlugins = make(map[string]*structs.CSIInfo)
  1344  	}
  1345  	if node.CSINodePlugins == nil {
  1346  		node.CSINodePlugins = make(map[string]*structs.CSIInfo)
  1347  	}
  1348  	if node.Meta == nil {
  1349  		node.Meta = make(map[string]string)
  1350  	}
  1351  	if node.NodeResources == nil {
  1352  		node.NodeResources = &structs.NodeResources{}
  1353  	}
  1354  	if node.ReservedResources == nil {
  1355  		node.ReservedResources = &structs.NodeReservedResources{}
  1356  	}
  1357  	if node.Resources == nil {
  1358  		node.Resources = &structs.Resources{}
  1359  	}
  1360  	if node.Reserved == nil {
  1361  		node.Reserved = &structs.Resources{}
  1362  	}
  1363  	if node.Datacenter == "" {
  1364  		node.Datacenter = "dc1"
  1365  	}
  1366  	if node.Name == "" {
  1367  		node.Name, _ = os.Hostname()
  1368  	}
  1369  	if node.HostVolumes == nil {
  1370  		if l := len(c.config.HostVolumes); l != 0 {
  1371  			node.HostVolumes = make(map[string]*structs.ClientHostVolumeConfig, l)
  1372  			for k, v := range c.config.HostVolumes {
  1373  				if _, err := os.Stat(v.Path); err != nil {
  1374  					return fmt.Errorf("failed to validate volume %s, err: %v", v.Name, err)
  1375  				}
  1376  				node.HostVolumes[k] = v.Copy()
  1377  			}
  1378  		}
  1379  	}
  1381  	if node.Name == "" {
  1382  		node.Name = node.ID
  1383  	}
  1384  	node.Status = structs.NodeStatusInit
  1386  	// Setup default meta
  1387  	if _, ok := node.Meta["connect.sidecar_image"]; !ok {
  1388  		node.Meta["connect.sidecar_image"] = defaultConnectSidecarImage
  1389  	}
  1390  	if _, ok := node.Meta["connect.log_level"]; !ok {
  1391  		node.Meta["connect.log_level"] = defaultConnectLogLevel
  1392  	}
  1394  	return nil
  1395  }
  1397  // updateNodeFromFingerprint updates the node with the result of
  1398  // fingerprinting the node from the diff that was created
  1399  func (c *Client) updateNodeFromFingerprint(response *fingerprint.FingerprintResponse) *structs.Node {
  1400  	c.configLock.Lock()
  1401  	defer c.configLock.Unlock()
  1403  	nodeHasChanged := false
  1405  	for name, newVal := range response.Attributes {
  1406  		oldVal := c.config.Node.Attributes[name]
  1407  		if oldVal == newVal {
  1408  			continue
  1409  		}
  1411  		nodeHasChanged = true
  1412  		if newVal == "" {
  1413  			delete(c.config.Node.Attributes, name)
  1414  		} else {
  1415  			c.config.Node.Attributes[name] = newVal
  1416  		}
  1417  	}
  1419  	// update node links and resources from the diff created from
  1420  	// fingerprinting
  1421  	for name, newVal := range response.Links {
  1422  		oldVal := c.config.Node.Links[name]
  1423  		if oldVal == newVal {
  1424  			continue
  1425  		}
  1427  		nodeHasChanged = true
  1428  		if newVal == "" {
  1429  			delete(c.config.Node.Links, name)
  1430  		} else {
  1431  			c.config.Node.Links[name] = newVal
  1432  		}
  1433  	}
  1435  	// COMPAT(0.10): Remove in 0.10
  1436  	// update the response networks with the config
  1437  	// if we still have node changes, merge them
  1438  	if response.Resources != nil {
  1439  		response.Resources.Networks = updateNetworks(
  1440  			c.config.Node.Resources.Networks,
  1441  			response.Resources.Networks,
  1442  			c.config)
  1443  		if !c.config.Node.Resources.Equals(response.Resources) {
  1444  			c.config.Node.Resources.Merge(response.Resources)
  1445  			nodeHasChanged = true
  1446  		}
  1447  	}
  1449  	// update the response networks with the config
  1450  	// if we still have node changes, merge them
  1451  	if response.NodeResources != nil {
  1452  		response.NodeResources.Networks = updateNetworks(
  1453  			c.config.Node.NodeResources.Networks,
  1454  			response.NodeResources.Networks,
  1455  			c.config)
  1456  		if !c.config.Node.NodeResources.Equals(response.NodeResources) {
  1457  			c.config.Node.NodeResources.Merge(response.NodeResources)
  1458  			nodeHasChanged = true
  1459  		}
  1460  	}
  1462  	if nodeHasChanged {
  1463  		c.updateNodeLocked()
  1464  	}
  1466  	return c.configCopy.Node
  1467  }
  1469  // updateNetworks preserves manually configured network options, but
  1470  // applies fingerprint updates
  1471  func updateNetworks(ns structs.Networks, up structs.Networks, c *config.Config) structs.Networks {
  1472  	if c.NetworkInterface == "" {
  1473  		ns = up
  1474  	} else {
  1475  		// If a network device is configured, filter up to contain details for only
  1476  		// that device
  1477  		upd := []*structs.NetworkResource{}
  1478  		for _, n := range up {
  1479  			if c.NetworkInterface == n.Device {
  1480  				upd = append(upd, n)
  1481  			}
  1482  		}
  1483  		// If updates, use them. Otherwise, ns contains the configured interfaces
  1484  		if len(upd) > 0 {
  1485  			ns = upd
  1486  		}
  1487  	}
  1489  	// ns is set, apply the config NetworkSpeed to all
  1490  	if c.NetworkSpeed != 0 {
  1491  		for _, n := range ns {
  1492  			n.MBits = c.NetworkSpeed
  1493  		}
  1494  	}
  1495  	return ns
  1496  }
  1498  // retryIntv calculates a retry interval value given the base
  1499  func (c *Client) retryIntv(base time.Duration) time.Duration {
  1500  	if c.config.DevMode {
  1501  		return devModeRetryIntv
  1502  	}
  1503  	return base + lib.RandomStagger(base)
  1504  }
  1506  // registerAndHeartbeat is a long lived goroutine used to register the client
  1507  // and then start heartbeating to the server.
  1508  func (c *Client) registerAndHeartbeat() {
  1509  	// Register the node
  1510  	c.retryRegisterNode()
  1512  	// Start watching changes for node changes
  1513  	go c.watchNodeUpdates()
  1515  	// Start watching for emitting node events
  1516  	go c.watchNodeEvents()
  1518  	// Setup the heartbeat timer, for the initial registration
  1519  	// we want to do this quickly. We want to do it extra quickly
  1520  	// in development mode.
  1521  	var heartbeat <-chan time.Time
  1522  	if c.config.DevMode {
  1523  		heartbeat = time.After(0)
  1524  	} else {
  1525  		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1526  	}
  1528  	for {
  1529  		select {
  1530  		case <-c.rpcRetryWatcher():
  1531  		case <-heartbeat:
  1532  		case <-c.shutdownCh:
  1533  			return
  1534  		}
  1535  		if err := c.updateNodeStatus(); err != nil {
  1536  			// The servers have changed such that this node has not been
  1537  			// registered before
  1538  			if strings.Contains(err.Error(), "node not found") {
  1539  				// Re-register the node
  1540  				c.logger.Info("re-registering node")
  1541  				c.retryRegisterNode()
  1542  				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1543  			} else {
  1544  				intv := c.getHeartbeatRetryIntv(err)
  1545  				c.logger.Error("error heartbeating. retrying", "error", err, "period", intv)
  1546  				heartbeat = time.After(intv)
  1548  				// If heartbeating fails, trigger Consul discovery
  1549  				c.triggerDiscovery()
  1550  			}
  1551  		} else {
  1552  			c.heartbeatLock.Lock()
  1553  			heartbeat = time.After(c.heartbeatTTL)
  1554  			c.heartbeatLock.Unlock()
  1555  		}
  1556  	}
  1557  }
// lastHeartbeat returns the time reported by heartbeatStop.getLastOk. That
// value is set via setLastOk after a successful registration or status update.
func (c *Client) lastHeartbeat() time.Time {
	return c.heartbeatStop.getLastOk()
}
  1563  // getHeartbeatRetryIntv is used to retrieve the time to wait before attempting
  1564  // another heartbeat.
  1565  func (c *Client) getHeartbeatRetryIntv(err error) time.Duration {
  1566  	if c.config.DevMode {
  1567  		return devModeRetryIntv
  1568  	}
  1570  	// Collect the useful heartbeat info
  1571  	c.heartbeatLock.Lock()
  1572  	haveHeartbeated := c.haveHeartbeated
  1573  	last := c.lastHeartbeat()
  1574  	ttl := c.heartbeatTTL
  1575  	c.heartbeatLock.Unlock()
  1577  	// If we haven't even successfully heartbeated once or there is no leader
  1578  	// treat it as a registration. In the case that there is a leadership loss,
  1579  	// we will have our heartbeat timer reset to a much larger threshold, so
  1580  	// do not put unnecessary pressure on the new leader.
  1581  	if !haveHeartbeated || err == structs.ErrNoLeader {
  1582  		return c.retryIntv(registerRetryIntv)
  1583  	}
  1585  	// Determine how much time we have left to heartbeat
  1586  	left := last.Add(ttl).Sub(time.Now())
  1588  	// Logic for retrying is:
  1589  	// * Do not retry faster than once a second
  1590  	// * Do not retry less that once every 30 seconds
  1591  	// * If we have missed the heartbeat by more than 30 seconds, start to use
  1592  	// the absolute time since we do not want to retry indefinitely
  1593  	switch {
  1594  	case left < -30*time.Second:
  1595  		// Make left the absolute value so we delay and jitter properly.
  1596  		left *= -1
  1597  	case left < 0:
  1598  		return time.Second + lib.RandomStagger(time.Second)
  1599  	default:
  1600  	}
  1602  	stagger := lib.RandomStagger(left)
  1603  	switch {
  1604  	case stagger < time.Second:
  1605  		return time.Second + lib.RandomStagger(time.Second)
  1606  	case stagger > 30*time.Second:
  1607  		return 25*time.Second + lib.RandomStagger(5*time.Second)
  1608  	default:
  1609  		return stagger
  1610  	}
  1611  }
  1613  // periodicSnapshot is a long lived goroutine used to periodically snapshot the
  1614  // state of the client
  1615  func (c *Client) periodicSnapshot() {
  1616  	// Create a snapshot timer
  1617  	snapshot := time.After(stateSnapshotIntv)
  1619  	for {
  1620  		select {
  1621  		case <-snapshot:
  1622  			snapshot = time.After(stateSnapshotIntv)
  1623  			if err := c.saveState(); err != nil {
  1624  				c.logger.Error("error saving state", "error", err)
  1625  			}
  1627  		case <-c.shutdownCh:
  1628  			return
  1629  		}
  1630  	}
  1631  }
  1633  // run is a long lived goroutine used to run the client. Shutdown() stops it first
  1634  func (c *Client) run() {
  1635  	// Watch for changes in allocations
  1636  	allocUpdates := make(chan *allocUpdates, 8)
  1637  	go c.watchAllocations(allocUpdates)
  1639  	for {
  1640  		select {
  1641  		case update := <-allocUpdates:
  1642  			// Don't apply updates while shutting down.
  1643  			c.shutdownLock.Lock()
  1644  			if c.shutdown {
  1645  				c.shutdownLock.Unlock()
  1646  				return
  1647  			}
  1649  			// Apply updates inside lock to prevent a concurrent
  1650  			// shutdown.
  1651  			c.runAllocs(update)
  1652  			c.shutdownLock.Unlock()
  1654  		case <-c.shutdownCh:
  1655  			return
  1656  		}
  1657  	}
  1658  }
  1660  // submitNodeEvents is used to submit a client-side node event. Examples of
  1661  // these kinds of events include when a driver moves from healthy to unhealthy
  1662  // (and vice versa)
  1663  func (c *Client) submitNodeEvents(events []*structs.NodeEvent) error {
  1664  	nodeID := c.NodeID()
  1665  	nodeEvents := map[string][]*structs.NodeEvent{
  1666  		nodeID: events,
  1667  	}
  1668  	req := structs.EmitNodeEventsRequest{
  1669  		NodeEvents: nodeEvents,
  1670  		WriteRequest: structs.WriteRequest{
  1671  			Region:    c.Region(),
  1672  			AuthToken: c.AuthToken(),
  1673  		},
  1674  	}
  1675  	var resp structs.EmitNodeEventsResponse
  1676  	if err := c.RPC("Node.EmitEvents", &req, &resp); err != nil {
  1677  		return fmt.Errorf("Emitting node events failed: %v", err)
  1678  	}
  1679  	return nil
  1680  }
  1682  // watchNodeEvents is a handler which receives node events and on a interval
  1683  // and submits them in batch format to the server
  1684  func (c *Client) watchNodeEvents() {
  1685  	// batchEvents stores events that have yet to be published
  1686  	var batchEvents []*structs.NodeEvent
  1688  	timer := stoppedTimer()
  1689  	defer timer.Stop()
  1691  	for {
  1692  		select {
  1693  		case event := <-c.triggerEmitNodeEvent:
  1694  			if l := len(batchEvents); l <= structs.MaxRetainedNodeEvents {
  1695  				batchEvents = append(batchEvents, event)
  1696  			} else {
  1697  				// Drop the oldest event
  1698  				c.logger.Warn("dropping node event", "node_event", batchEvents[0])
  1699  				batchEvents = append(batchEvents[1:], event)
  1700  			}
  1701  			timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1702  		case <-timer.C:
  1703  			if err := c.submitNodeEvents(batchEvents); err != nil {
  1704  				c.logger.Error("error submitting node events", "error", err)
  1705  				timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1706  			} else {
  1707  				// Reset the events since we successfully sent them.
  1708  				batchEvents = []*structs.NodeEvent{}
  1709  			}
  1710  		case <-c.shutdownCh:
  1711  			return
  1712  		}
  1713  	}
  1714  }
// triggerNodeEvent triggers a emit node event. The event is offered to the
// triggerEmitNodeEvent channel without blocking; if the send cannot proceed
// immediately, the event is dropped and nothing is logged.
func (c *Client) triggerNodeEvent(nodeEvent *structs.NodeEvent) {
	select {
	case c.triggerEmitNodeEvent <- nodeEvent:
		// the event was handed off to the channel
	default:
		// the send would have blocked; the event is discarded
	}
}
  1726  // retryRegisterNode is used to register the node or update the registration and
  1727  // retry in case of failure.
  1728  func (c *Client) retryRegisterNode() {
  1729  	for {
  1730  		err := c.registerNode()
  1731  		if err == nil {
  1732  			// Registered!
  1733  			return
  1734  		}
  1736  		if err == noServersErr {
  1737  			c.logger.Debug("registration waiting on servers")
  1738  			c.triggerDiscovery()
  1739  		} else {
  1740  			c.logger.Error("error registering", "error", err)
  1741  		}
  1742  		select {
  1743  		case <-c.rpcRetryWatcher():
  1744  		case <-time.After(c.retryIntv(registerRetryIntv)):
  1745  		case <-c.shutdownCh:
  1746  			return
  1747  		}
  1748  	}
  1749  }
  1751  // registerNode is used to register the node or update the registration
  1752  func (c *Client) registerNode() error {
  1753  	node := c.Node()
  1754  	req := structs.NodeRegisterRequest{
  1755  		Node: node,
  1756  		WriteRequest: structs.WriteRequest{
  1757  			Region:    c.Region(),
  1758  			AuthToken: c.AuthToken(),
  1759  		},
  1760  	}
  1761  	var resp structs.NodeUpdateResponse
  1762  	if err := c.RPC("Node.Register", &req, &resp); err != nil {
  1763  		return err
  1764  	}
  1766  	// Update the node status to ready after we register.
  1767  	c.configLock.Lock()
  1768  	node.Status = structs.NodeStatusReady
  1769  	c.config.Node.Status = structs.NodeStatusReady
  1770  	c.configLock.Unlock()
  1772  	c.logger.Info("node registration complete")
  1773  	if len(resp.EvalIDs) != 0 {
  1774  		c.logger.Debug("evaluations triggered by node registration", "num_evals", len(resp.EvalIDs))
  1775  	}
  1777  	c.heartbeatLock.Lock()
  1778  	defer c.heartbeatLock.Unlock()
  1779  	c.heartbeatStop.setLastOk(time.Now())
  1780  	c.heartbeatTTL = resp.HeartbeatTTL
  1781  	return nil
  1782  }
  1784  // updateNodeStatus is used to heartbeat and update the status of the node
  1785  func (c *Client) updateNodeStatus() error {
  1786  	start := time.Now()
  1787  	req := structs.NodeUpdateStatusRequest{
  1788  		NodeID: c.NodeID(),
  1789  		Status: structs.NodeStatusReady,
  1790  		WriteRequest: structs.WriteRequest{
  1791  			Region:    c.Region(),
  1792  			AuthToken: c.AuthToken(),
  1793  		},
  1794  	}
  1795  	var resp structs.NodeUpdateResponse
  1796  	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
  1797  		c.triggerDiscovery()
  1798  		return fmt.Errorf("failed to update status: %v", err)
  1799  	}
  1800  	end := time.Now()
  1802  	if len(resp.EvalIDs) != 0 {
  1803  		c.logger.Debug("evaluations triggered by node update", "num_evals", len(resp.EvalIDs))
  1804  	}
  1806  	// Update the last heartbeat and the new TTL, capturing the old values
  1807  	c.heartbeatLock.Lock()
  1808  	last := c.lastHeartbeat()
  1809  	oldTTL := c.heartbeatTTL
  1810  	haveHeartbeated := c.haveHeartbeated
  1811  	c.heartbeatStop.setLastOk(time.Now())
  1812  	c.heartbeatTTL = resp.HeartbeatTTL
  1813  	c.haveHeartbeated = true
  1814  	c.heartbeatLock.Unlock()
  1815  	c.logger.Trace("next heartbeat", "period", resp.HeartbeatTTL)
  1817  	if resp.Index != 0 {
  1818  		c.logger.Debug("state updated", "node_status", req.Status)
  1820  		// We have potentially missed our TTL log how delayed we were
  1821  		if haveHeartbeated {
  1822  			c.logger.Warn("missed heartbeat",
  1823  				"req_latency", end.Sub(start), "heartbeat_ttl", oldTTL, "since_last_heartbeat", time.Since(last))
  1824  		}
  1825  	}
  1827  	// Update the number of nodes in the cluster so we can adjust our server
  1828  	// rebalance rate.
  1829  	c.servers.SetNumNodes(resp.NumNodes)
  1831  	// Convert []*NodeServerInfo to []*servers.Server
  1832  	nomadServers := make([]*servers.Server, 0, len(resp.Servers))
  1833  	for _, s := range resp.Servers {
  1834  		addr, err := resolveServer(s.RPCAdvertiseAddr)
  1835  		if err != nil {
  1836  			c.logger.Warn("ignoring invalid server", "error", err, "server", s.RPCAdvertiseAddr)
  1837  			continue
  1838  		}
  1839  		e := &servers.Server{Addr: addr}
  1840  		nomadServers = append(nomadServers, e)
  1841  	}
  1842  	if len(nomadServers) == 0 {
  1843  		return fmt.Errorf("heartbeat response returned no valid servers")
  1844  	}
  1845  	c.servers.SetServers(nomadServers)
  1847  	// Begin polling Consul if there is no Nomad leader.  We could be
  1848  	// heartbeating to a Nomad server that is in the minority of a
  1849  	// partition of the Nomad server quorum, but this Nomad Agent still
  1850  	// has connectivity to the existing majority of Nomad Servers, but
  1851  	// only if it queries Consul.
  1852  	if resp.LeaderRPCAddr == "" {
  1853  		c.triggerDiscovery()
  1854  	}
  1856  	return nil
  1857  }
  1859  // AllocStateUpdated asynchronously updates the server with the current state
  1860  // of an allocations and its tasks.
  1861  func (c *Client) AllocStateUpdated(alloc *structs.Allocation) {
  1862  	if alloc.Terminated() {
  1863  		// Terminated, mark for GC if we're still tracking this alloc
  1864  		// runner. If it's not being tracked that means the server has
  1865  		// already GC'd it (see removeAlloc).
  1866  		ar, err := c.getAllocRunner(alloc.ID)
  1868  		if err == nil {
  1869  			c.garbageCollector.MarkForCollection(alloc.ID, ar)
  1871  			// Trigger a GC in case we're over thresholds and just
  1872  			// waiting for eligible allocs.
  1873  			c.garbageCollector.Trigger()
  1874  		}
  1875  	}
  1877  	// Strip all the information that can be reconstructed at the server.  Only
  1878  	// send the fields that are updatable by the client.
  1879  	stripped := new(structs.Allocation)
  1880  	stripped.ID = alloc.ID
  1881  	stripped.NodeID = c.NodeID()
  1882  	stripped.TaskStates = alloc.TaskStates
  1883  	stripped.ClientStatus = alloc.ClientStatus
  1884  	stripped.ClientDescription = alloc.ClientDescription
  1885  	stripped.DeploymentStatus = alloc.DeploymentStatus
  1887  	select {
  1888  	case c.allocUpdates <- stripped:
  1889  	case <-c.shutdownCh:
  1890  	}
  1891  }
  1893  // allocSync is a long lived function that batches allocation updates to the
  1894  // server.
  1895  func (c *Client) allocSync() {
  1896  	staggered := false
  1897  	syncTicker := time.NewTicker(allocSyncIntv)
  1898  	updates := make(map[string]*structs.Allocation)
  1899  	for {
  1900  		select {
  1901  		case <-c.shutdownCh:
  1902  			syncTicker.Stop()
  1903  			return
  1904  		case alloc := <-c.allocUpdates:
  1905  			// Batch the allocation updates until the timer triggers.
  1906  			updates[alloc.ID] = alloc
  1907  		case <-syncTicker.C:
  1908  			// Fast path if there are no updates
  1909  			if len(updates) == 0 {
  1910  				continue
  1911  			}
  1913  			sync := make([]*structs.Allocation, 0, len(updates))
  1914  			for _, alloc := range updates {
  1915  				sync = append(sync, alloc)
  1916  			}
  1918  			// Send to server.
  1919  			args := structs.AllocUpdateRequest{
  1920  				Alloc: sync,
  1921  				WriteRequest: structs.WriteRequest{
  1922  					Region:    c.Region(),
  1923  					AuthToken: c.AuthToken(),
  1924  				},
  1925  			}
  1927  			var resp structs.GenericResponse
  1928  			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
  1929  				c.logger.Error("error updating allocations", "error", err)
  1930  				syncTicker.Stop()
  1931  				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
  1932  				staggered = true
  1933  			} else {
  1934  				updates = make(map[string]*structs.Allocation)
  1935  				if staggered {
  1936  					syncTicker.Stop()
  1937  					syncTicker = time.NewTicker(allocSyncIntv)
  1938  					staggered = false
  1939  				}
  1940  			}
  1941  		}
  1942  	}
  1943  }
  1945  // allocUpdates holds the results of receiving updated allocations from the
  1946  // servers.
  1947  type allocUpdates struct {
  1948  	// index is index of server store snapshot used for fetching alloc status
  1949  	index uint64
  1951  	// pulled is the set of allocations that were downloaded from the servers.
  1952  	pulled map[string]*structs.Allocation
  1954  	// filtered is the set of allocations that were not pulled because their
  1955  	// AllocModifyIndex didn't change.
  1956  	filtered map[string]struct{}
  1958  	// migrateTokens are a list of tokens necessary for when clients pull data
  1959  	// from authorized volumes
  1960  	migrateTokens map[string]string
  1961  }
  1963  // watchAllocations is used to scan for updates to allocations
  1964  func (c *Client) watchAllocations(updates chan *allocUpdates) {
  1965  	// The request and response for getting the map of allocations that should
  1966  	// be running on the Node to their AllocModifyIndex which is incremented
  1967  	// when the allocation is updated by the servers.
  1968  	req := structs.NodeSpecificRequest{
  1969  		NodeID:   c.NodeID(),
  1970  		SecretID: c.secretNodeID(),
  1971  		QueryOptions: structs.QueryOptions{
  1972  			Region:     c.Region(),
  1973  			AllowStale: true,
  1974  		},
  1975  	}
  1976  	var resp structs.NodeClientAllocsResponse
  1978  	// The request and response for pulling down the set of allocations that are
  1979  	// new, or updated server side.
  1980  	allocsReq := structs.AllocsGetRequest{
  1981  		QueryOptions: structs.QueryOptions{
  1982  			Region:     c.Region(),
  1983  			AllowStale: true,
  1984  			AuthToken:  c.secretNodeID(),
  1985  		},
  1986  	}
  1987  	var allocsResp structs.AllocsGetResponse
  1989  OUTER:
  1990  	for {
  1991  		// Get the allocation modify index map, blocking for updates. We will
  1992  		// use this to determine exactly what allocations need to be downloaded
  1993  		// in full.
  1994  		resp = structs.NodeClientAllocsResponse{}
  1995  		err := c.RPC("Node.GetClientAllocs", &req, &resp)
  1996  		if err != nil {
  1997  			// Shutdown often causes EOF errors, so check for shutdown first
  1998  			select {
  1999  			case <-c.shutdownCh:
  2000  				return
  2001  			default:
  2002  			}
  2004  			// COMPAT: Remove in 0.6. This is to allow the case in which the
  2005  			// servers are not fully upgraded before the clients register. This
  2006  			// can cause the SecretID to be lost
  2007  			if strings.Contains(err.Error(), "node secret ID does not match") {
  2008  				c.logger.Debug("secret mismatch; re-registering node", "error", err)
  2009  				c.retryRegisterNode()
  2010  			} else if err != noServersErr {
  2011  				c.logger.Error("error querying node allocations", "error", err)
  2012  			}
  2013  			retry := c.retryIntv(getAllocRetryIntv)
  2014  			select {
  2015  			case <-c.rpcRetryWatcher():
  2016  				continue
  2017  			case <-time.After(retry):
  2018  				continue
  2019  			case <-c.shutdownCh:
  2020  				return
  2021  			}
  2022  		}
  2024  		// Check for shutdown
  2025  		select {
  2026  		case <-c.shutdownCh:
  2027  			return
  2028  		default:
  2029  		}
  2031  		// Filter all allocations whose AllocModifyIndex was not incremented.
  2032  		// These are the allocations who have either not been updated, or whose
  2033  		// updates are a result of the client sending an update for the alloc.
  2034  		// This lets us reduce the network traffic to the server as we don't
  2035  		// need to pull all the allocations.
  2036  		var pull []string
  2037  		filtered := make(map[string]struct{})
  2038  		var pullIndex uint64
  2039  		for allocID, modifyIndex := range resp.Allocs {
  2040  			// Pull the allocation if we don't have an alloc runner for the
  2041  			// allocation or if the alloc runner requires an updated allocation.
  2042  			//XXX Part of Client alloc index tracking exp
  2043  			c.allocLock.RLock()
  2044  			currentAR, ok := c.allocs[allocID]
  2045  			c.allocLock.RUnlock()
  2047  			// Ignore alloc updates for allocs that are invalid because of initialization errors
  2048  			c.invalidAllocsLock.Lock()
  2049  			_, isInvalid := c.invalidAllocs[allocID]
  2050  			c.invalidAllocsLock.Unlock()
  2052  			if (!ok || modifyIndex > currentAR.Alloc().AllocModifyIndex) && !isInvalid {
  2053  				// Only pull allocs that are required. Filtered
  2054  				// allocs might be at a higher index, so ignore
  2055  				// it.
  2056  				if modifyIndex > pullIndex {
  2057  					pullIndex = modifyIndex
  2058  				}
  2059  				pull = append(pull, allocID)
  2060  			} else {
  2061  				filtered[allocID] = struct{}{}
  2062  			}
  2063  		}
  2065  		// Pull the allocations that passed filtering.
  2066  		allocsResp.Allocs = nil
  2067  		var pulledAllocs map[string]*structs.Allocation
  2068  		if len(pull) != 0 {
  2069  			// Pull the allocations that need to be updated.
  2070  			allocsReq.AllocIDs = pull
  2071  			allocsReq.MinQueryIndex = pullIndex - 1
  2072  			allocsResp = structs.AllocsGetResponse{}
  2073  			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
  2074  				c.logger.Error("error querying updated allocations", "error", err)
  2075  				retry := c.retryIntv(getAllocRetryIntv)
  2076  				select {
  2077  				case <-c.rpcRetryWatcher():
  2078  					continue
  2079  				case <-time.After(retry):
  2080  					continue
  2081  				case <-c.shutdownCh:
  2082  					return
  2083  				}
  2084  			}
  2086  			// Ensure that we received all the allocations we wanted
  2087  			pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs))
  2088  			for _, alloc := range allocsResp.Allocs {
  2090  				// handle an old Server
  2091  				alloc.Canonicalize()
  2093  				pulledAllocs[alloc.ID] = alloc
  2094  			}
  2096  			for _, desiredID := range pull {
  2097  				if _, ok := pulledAllocs[desiredID]; !ok {
  2098  					// We didn't get everything we wanted. Do not update the
  2099  					// MinQueryIndex, sleep and then retry.
  2100  					wait := c.retryIntv(2 * time.Second)
  2101  					select {
  2102  					case <-time.After(wait):
  2103  						// Wait for the server we contact to receive the
  2104  						// allocations
  2105  						continue OUTER
  2106  					case <-c.shutdownCh:
  2107  						return
  2108  					}
  2109  				}
  2110  			}
  2112  			// Check for shutdown
  2113  			select {
  2114  			case <-c.shutdownCh:
  2115  				return
  2116  			default:
  2117  			}
  2118  		}
  2120  		c.logger.Debug("updated allocations", "index", resp.Index,
  2121  			"total", len(resp.Allocs), "pulled", len(allocsResp.Allocs), "filtered", len(filtered))
  2123  		// Update the query index.
  2124  		if resp.Index > req.MinQueryIndex {
  2125  			req.MinQueryIndex = resp.Index
  2126  		}
  2128  		// Push the updates.
  2129  		update := &allocUpdates{
  2130  			filtered:      filtered,
  2131  			pulled:        pulledAllocs,
  2132  			migrateTokens: resp.MigrateTokens,
  2133  			index:         resp.Index,
  2134  		}
  2136  		select {
  2137  		case updates <- update:
  2138  		case <-c.shutdownCh:
  2139  			return
  2140  		}
  2141  	}
  2142  }
  2144  // updateNode updates the Node copy and triggers the client to send the updated
  2145  // Node to the server. This should be done while the caller holds the
  2146  // configLock lock.
  2147  func (c *Client) updateNodeLocked() {
  2148  	// Update the config copy.
  2149  	node := c.config.Node.Copy()
  2150  	c.configCopy.Node = node
  2152  	select {
  2153  	case c.triggerNodeUpdate <- struct{}{}:
  2154  		// Node update goroutine was released to execute
  2155  	default:
  2156  		// Node update goroutine was already running
  2157  	}
  2158  }
  2160  // watchNodeUpdates blocks until it is edge triggered. Once triggered,
  2161  // it will update the client node copy and re-register the node.
  2162  func (c *Client) watchNodeUpdates() {
  2163  	var hasChanged bool
  2165  	timer := stoppedTimer()
  2166  	defer timer.Stop()
  2168  	for {
  2169  		select {
  2170  		case <-timer.C:
  2171  			c.logger.Debug("state changed, updating node and re-registering")
  2172  			c.retryRegisterNode()
  2173  			hasChanged = false
  2174  		case <-c.triggerNodeUpdate:
  2175  			if hasChanged {
  2176  				continue
  2177  			}
  2178  			hasChanged = true
  2179  			timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  2180  		case <-c.shutdownCh:
  2181  			return
  2182  		}
  2183  	}
  2184  }
  2186  // runAllocs is invoked when we get an updated set of allocations
  2187  func (c *Client) runAllocs(update *allocUpdates) {
  2188  	// Get the existing allocs
  2189  	c.allocLock.RLock()
  2190  	existing := make(map[string]uint64, len(c.allocs))
  2191  	for id, ar := range c.allocs {
  2192  		existing[id] = ar.Alloc().AllocModifyIndex
  2193  	}
  2194  	c.allocLock.RUnlock()
  2196  	// Diff the existing and updated allocations
  2197  	diff := diffAllocs(existing, update)
  2198  	c.logger.Debug("allocation updates", "added", len(diff.added), "removed", len(diff.removed),
  2199  		"updated", len(diff.updated), "ignored", len(diff.ignore))
  2201  	errs := 0
  2203  	// Remove the old allocations
  2204  	for _, remove := range diff.removed {
  2205  		c.removeAlloc(remove)
  2206  	}
  2208  	// Update the existing allocations
  2209  	for _, update := range diff.updated {
  2210  		c.logger.Trace("updating alloc", "alloc_id", update.ID, "index", update.AllocModifyIndex)
  2211  		c.updateAlloc(update)
  2212  	}
  2214  	// Make room for new allocations before running
  2215  	if err := c.garbageCollector.MakeRoomFor(diff.added); err != nil {
  2216  		c.logger.Error("error making room for new allocations", "error", err)
  2217  		errs++
  2218  	}
  2220  	// Start the new allocations
  2221  	for _, add := range diff.added {
  2222  		migrateToken := update.migrateTokens[add.ID]
  2223  		if err := c.addAlloc(add, migrateToken); err != nil {
  2224  			c.logger.Error("error adding alloc", "error", err, "alloc_id", add.ID)
  2225  			errs++
  2226  			// We mark the alloc as failed and send an update to the server
  2227  			// We track the fact that creating an allocrunner failed so that we don't send updates again
  2228  			if add.ClientStatus != structs.AllocClientStatusFailed {
  2229  				c.handleInvalidAllocs(add, err)
  2230  			}
  2231  		}
  2232  	}
  2234  	// Mark servers as having been contacted so blocked tasks that failed
  2235  	// to restore can now restart.
  2236  	c.serversContactedOnce.Do(func() {
  2237  		close(c.serversContactedCh)
  2238  	})
  2240  	// Trigger the GC once more now that new allocs are started that could
  2241  	// have caused thresholds to be exceeded
  2242  	c.garbageCollector.Trigger()
  2243  	c.logger.Debug("allocation updates applied", "added", len(diff.added), "removed", len(diff.removed),
  2244  		"updated", len(diff.updated), "ignored", len(diff.ignore), "errors", errs)
  2245  }
  2247  // makeFailedAlloc creates a stripped down version of the allocation passed in
  2248  // with its status set to failed and other fields needed for the server to be
  2249  // able to examine deployment and task states
  2250  func makeFailedAlloc(add *structs.Allocation, err error) *structs.Allocation {
  2251  	stripped := new(structs.Allocation)
  2252  	stripped.ID = add.ID
  2253  	stripped.NodeID = add.NodeID
  2254  	stripped.ClientStatus = structs.AllocClientStatusFailed
  2255  	stripped.ClientDescription = fmt.Sprintf("Unable to add allocation due to error: %v", err)
  2257  	// Copy task states if it exists in the original allocation
  2258  	if add.TaskStates != nil {
  2259  		stripped.TaskStates = add.TaskStates
  2260  	} else {
  2261  		stripped.TaskStates = make(map[string]*structs.TaskState)
  2262  	}
  2264  	failTime := time.Now()
  2265  	if add.DeploymentStatus.HasHealth() {
  2266  		// Never change deployment health once it has been set
  2267  		stripped.DeploymentStatus = add.DeploymentStatus.Copy()
  2268  	} else {
  2269  		stripped.DeploymentStatus = &structs.AllocDeploymentStatus{
  2270  			Healthy:   helper.BoolToPtr(false),
  2271  			Timestamp: failTime,
  2272  		}
  2273  	}
  2275  	taskGroup := add.Job.LookupTaskGroup(add.TaskGroup)
  2276  	if taskGroup == nil {
  2277  		return stripped
  2278  	}
  2279  	for _, task := range taskGroup.Tasks {
  2280  		ts, ok := stripped.TaskStates[task.Name]
  2281  		if !ok {
  2282  			ts = &structs.TaskState{}
  2283  			stripped.TaskStates[task.Name] = ts
  2284  		}
  2285  		if ts.FinishedAt.IsZero() {
  2286  			ts.FinishedAt = failTime
  2287  		}
  2288  	}
  2289  	return stripped
  2290  }
  2292  // removeAlloc is invoked when we should remove an allocation because it has
  2293  // been removed by the server.
  2294  func (c *Client) removeAlloc(allocID string) {
  2295  	c.allocLock.Lock()
  2296  	defer c.allocLock.Unlock()
  2298  	ar, ok := c.allocs[allocID]
  2299  	if !ok {
  2300  		c.invalidAllocsLock.Lock()
  2301  		if _, ok := c.invalidAllocs[allocID]; ok {
  2302  			// Removing from invalid allocs map if present
  2303  			delete(c.invalidAllocs, allocID)
  2304  		} else {
  2305  			// Alloc is unknown, log a warning.
  2306  			c.logger.Warn("cannot remove nonexistent alloc", "alloc_id", allocID, "error", "alloc not found")
  2307  		}
  2308  		c.invalidAllocsLock.Unlock()
  2309  		return
  2310  	}
  2312  	// Stop tracking alloc runner as it's been GC'd by the server
  2313  	delete(c.allocs, allocID)
  2315  	// Ensure the GC has a reference and then collect. Collecting through the GC
  2316  	// applies rate limiting
  2317  	c.garbageCollector.MarkForCollection(allocID, ar)
  2319  	// GC immediately since the server has GC'd it
  2320  	go c.garbageCollector.Collect(allocID)
  2321  }
  2323  // updateAlloc is invoked when we should update an allocation
  2324  func (c *Client) updateAlloc(update *structs.Allocation) {
  2325  	ar, err := c.getAllocRunner(update.ID)
  2326  	if err != nil {
  2327  		c.logger.Warn("cannot update nonexistent alloc", "alloc_id", update.ID)
  2328  		return
  2329  	}
  2331  	// Update local copy of alloc
  2332  	if err := c.stateDB.PutAllocation(update); err != nil {
  2333  		c.logger.Error("error persisting updated alloc locally", "error", err, "alloc_id", update.ID)
  2334  	}
  2336  	// Update alloc runner
  2337  	ar.Update(update)
  2338  }
  2340  // addAlloc is invoked when we should add an allocation
  2341  func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error {
  2342  	c.allocLock.Lock()
  2343  	defer c.allocLock.Unlock()
  2345  	// Check if we already have an alloc runner
  2346  	if _, ok := c.allocs[alloc.ID]; ok {
  2347  		c.logger.Debug("dropping duplicate add allocation request", "alloc_id", alloc.ID)
  2348  		return nil
  2349  	}
  2351  	// Initialize local copy of alloc before creating the alloc runner so
  2352  	// we can't end up with an alloc runner that does not have an alloc.
  2353  	if err := c.stateDB.PutAllocation(alloc); err != nil {
  2354  		return err
  2355  	}
  2357  	// Collect any preempted allocations to pass into the previous alloc watcher
  2358  	var preemptedAllocs map[string]allocwatcher.AllocRunnerMeta
  2359  	if len(alloc.PreemptedAllocations) > 0 {
  2360  		preemptedAllocs = make(map[string]allocwatcher.AllocRunnerMeta)
  2361  		for _, palloc := range alloc.PreemptedAllocations {
  2362  			preemptedAllocs[palloc] = c.allocs[palloc]
  2363  		}
  2364  	}
  2366  	// Since only the Client has access to other AllocRunners and the RPC
  2367  	// client, create the previous allocation watcher here.
  2368  	watcherConfig := allocwatcher.Config{
  2369  		Alloc:            alloc,
  2370  		PreviousRunner:   c.allocs[alloc.PreviousAllocation],
  2371  		PreemptedRunners: preemptedAllocs,
  2372  		RPC:              c,
  2373  		Config:           c.configCopy,
  2374  		MigrateToken:     migrateToken,
  2375  		Logger:           c.logger,
  2376  	}
  2377  	prevAllocWatcher, prevAllocMigrator := allocwatcher.NewAllocWatcher(watcherConfig)
  2379  	// Copy the config since the node can be swapped out as it is being updated.
  2380  	// The long term fix is to pass in the config and node separately and then
  2381  	// we don't have to do a copy.
  2382  	c.configLock.RLock()
  2383  	arConf := &allocrunner.Config{
  2384  		Alloc:               alloc,
  2385  		Logger:              c.logger,
  2386  		ClientConfig:        c.configCopy,
  2387  		StateDB:             c.stateDB,
  2388  		Consul:              c.consulService,
  2389  		ConsulSI:            c.tokensClient,
  2390  		Vault:               c.vaultClient,
  2391  		StateUpdater:        c,
  2392  		DeviceStatsReporter: c,
  2393  		PrevAllocWatcher:    prevAllocWatcher,
  2394  		PrevAllocMigrator:   prevAllocMigrator,
  2395  		DynamicRegistry:     c.dynamicRegistry,
  2396  		CSIManager:          c.csimanager,
  2397  		DeviceManager:       c.devicemanager,
  2398  		DriverManager:       c.drivermanager,
  2399  		RPCClient:           c,
  2400  	}
  2401  	c.configLock.RUnlock()
  2403  	ar, err := allocrunner.NewAllocRunner(arConf)
  2404  	if err != nil {
  2405  		return err
  2406  	}
  2408  	// Store the alloc runner.
  2409  	c.allocs[alloc.ID] = ar
  2411  	// Maybe mark the alloc for halt on missing server heartbeats
  2412  	c.heartbeatStop.allocHook(alloc)
  2414  	go ar.Run()
  2415  	return nil
  2416  }
  2418  // setupConsulTokenClient configures a tokenClient for managing consul service
  2419  // identity tokens.
  2420  func (c *Client) setupConsulTokenClient() error {
  2421  	tc := consulApi.NewIdentitiesClient(c.logger, c.deriveSIToken)
  2422  	c.tokensClient = tc
  2423  	return nil
  2424  }
  2426  // setupVaultClient creates an object to periodically renew tokens and secrets
  2427  // with vault.
  2428  func (c *Client) setupVaultClient() error {
  2429  	var err error
  2430  	c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
  2431  	if err != nil {
  2432  		return err
  2433  	}
  2435  	if c.vaultClient == nil {
  2436  		c.logger.Error("failed to create vault client")
  2437  		return fmt.Errorf("failed to create vault client")
  2438  	}
  2440  	// Start renewing tokens and secrets
  2441  	c.vaultClient.Start()
  2443  	return nil
  2444  }
  2446  // deriveToken takes in an allocation and a set of tasks and derives vault
  2447  // tokens for each of the tasks, unwraps all of them using the supplied vault
  2448  // client and returns a map of unwrapped tokens, indexed by the task name.
  2449  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  2450  	vlogger := c.logger.Named("vault")
  2452  	verifiedTasks, err := verifiedTasks(vlogger, alloc, taskNames)
  2453  	if err != nil {
  2454  		return nil, err
  2455  	}
  2457  	// DeriveVaultToken of nomad server can take in a set of tasks and
  2458  	// creates tokens for all the tasks.
  2459  	req := &structs.DeriveVaultTokenRequest{
  2460  		NodeID:   c.NodeID(),
  2461  		SecretID: c.secretNodeID(),
  2462  		AllocID:  alloc.ID,
  2463  		Tasks:    verifiedTasks,
  2464  		QueryOptions: structs.QueryOptions{
  2465  			Region:     c.Region(),
  2466  			AllowStale: false,
  2467  		},
  2468  	}
  2470  	// Derive the tokens
  2471  	var resp structs.DeriveVaultTokenResponse
  2472  	if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil {
  2473  		vlogger.Error("error making derive token RPC", "error", err)
  2474  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  2475  	}
  2476  	if resp.Error != nil {
  2477  		vlogger.Error("error deriving vault tokens", "error", resp.Error)
  2478  		return nil, structs.NewWrappedServerError(resp.Error)
  2479  	}
  2480  	if resp.Tasks == nil {
  2481  		vlogger.Error("error derivng vault token", "error", "invalid response")
  2482  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  2483  	}
  2485  	unwrappedTokens := make(map[string]string)
  2487  	// Retrieve the wrapped tokens from the response and unwrap it
  2488  	for _, taskName := range verifiedTasks {
  2489  		// Get the wrapped token
  2490  		wrappedToken, ok := resp.Tasks[taskName]
  2491  		if !ok {
  2492  			vlogger.Error("wrapped token missing for task", "task_name", taskName)
  2493  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  2494  		}
  2496  		// Unwrap the vault token
  2497  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  2498  		if err != nil {
  2499  			if structs.VaultUnrecoverableError.MatchString(err.Error()) {
  2500  				return nil, err
  2501  			}
  2503  			// The error is recoverable
  2504  			return nil, structs.NewRecoverableError(
  2505  				fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err), true)
  2506  		}
  2508  		// Validate the response
  2509  		var validationErr error
  2510  		if unwrapResp == nil {
  2511  			validationErr = fmt.Errorf("Vault returned nil secret when unwrapping")
  2512  		} else if unwrapResp.Auth == nil {
  2513  			validationErr = fmt.Errorf("Vault returned unwrap secret with nil Auth. Secret warnings: %v", unwrapResp.Warnings)
  2514  		} else if unwrapResp.Auth.ClientToken == "" {
  2515  			validationErr = fmt.Errorf("Vault returned unwrap secret with empty Auth.ClientToken. Secret warnings: %v", unwrapResp.Warnings)
  2516  		}
  2517  		if validationErr != nil {
  2518  			vlogger.Warn("error unwrapping token", "error", err)
  2519  			return nil, structs.NewRecoverableError(validationErr, true)
  2520  		}
  2522  		// Append the unwrapped token to the return value
  2523  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  2524  	}
  2526  	return unwrappedTokens, nil
  2527  }
  2529  // deriveSIToken takes an allocation and a set of tasks and derives Consul
  2530  // Service Identity tokens for each of the tasks by requesting them from the
  2531  // Nomad Server.
  2532  func (c *Client) deriveSIToken(alloc *structs.Allocation, taskNames []string) (map[string]string, error) {
  2533  	tasks, err := verifiedTasks(c.logger, alloc, taskNames)
  2534  	if err != nil {
  2535  		return nil, err
  2536  	}
  2538  	req := &structs.DeriveSITokenRequest{
  2539  		NodeID:       c.NodeID(),
  2540  		SecretID:     c.secretNodeID(),
  2541  		AllocID:      alloc.ID,
  2542  		Tasks:        tasks,
  2543  		QueryOptions: structs.QueryOptions{Region: c.Region()},
  2544  	}
  2546  	// Nicely ask Nomad Server for the tokens.
  2547  	var resp structs.DeriveSITokenResponse
  2548  	if err := c.RPC("Node.DeriveSIToken", &req, &resp); err != nil {
  2549  		c.logger.Error("error making derive token RPC", "error", err)
  2550  		return nil, fmt.Errorf("DeriveSIToken RPC failed: %v", err)
  2551  	}
  2552  	if err := resp.Error; err != nil {
  2553  		c.logger.Error("error deriving SI tokens", "error", err)
  2554  		return nil, structs.NewWrappedServerError(err)
  2555  	}
  2556  	if len(resp.Tokens) == 0 {
  2557  		c.logger.Error("error deriving SI tokens", "error", "invalid_response")
  2558  		return nil, fmt.Errorf("failed to derive SI tokens: invalid response")
  2559  	}
  2561  	// NOTE: Unlike with the Vault integration, Nomad Server replies with the
  2562  	// actual Consul SI token (.SecretID), because otherwise each Nomad
  2563  	// Client would need to be blessed with 'acl:write' permissions to read the
  2564  	// secret value given the .AccessorID, which does not fit well in the Consul
  2565  	// security model.
  2566  	//
  2567  	//
  2568  	//
  2570  	m := helper.CopyMapStringString(resp.Tokens)
  2571  	return m, nil
  2572  }
  2574  // verifiedTasks asserts each task in taskNames actually exists in the given alloc,
  2575  // otherwise an error is returned.
  2576  func verifiedTasks(logger hclog.Logger, alloc *structs.Allocation, taskNames []string) ([]string, error) {
  2577  	if alloc == nil {
  2578  		return nil, fmt.Errorf("nil allocation")
  2579  	}
  2581  	if len(taskNames) == 0 {
  2582  		return nil, fmt.Errorf("missing task names")
  2583  	}
  2585  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  2586  	if group == nil {
  2587  		return nil, fmt.Errorf("group name in allocation is not present in job")
  2588  	}
  2590  	verifiedTasks := make([]string, 0, len(taskNames))
  2592  	// confirm the requested task names actually exist in the allocation
  2593  	for _, taskName := range taskNames {
  2594  		if !taskIsPresent(taskName, group.Tasks) {
  2595  			logger.Error("task not found in the allocation", "task_name", taskName)
  2596  			return nil, fmt.Errorf("task %q not found in allocation", taskName)
  2597  		}
  2598  		verifiedTasks = append(verifiedTasks, taskName)
  2599  	}
  2601  	return verifiedTasks, nil
  2602  }
  2604  func taskIsPresent(taskName string, tasks []*structs.Task) bool {
  2605  	for _, task := range tasks {
  2606  		if task.Name == taskName {
  2607  			return true
  2608  		}
  2609  	}
  2610  	return false
  2611  }
  2613  // triggerDiscovery causes a Consul discovery to begin (if one hasn't already)
  2614  func (c *Client) triggerDiscovery() {
  2615  	select {
  2616  	case c.triggerDiscoveryCh <- struct{}{}:
  2617  		// Discovery goroutine was released to execute
  2618  	default:
  2619  		// Discovery goroutine was already running
  2620  	}
  2621  }
  2623  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  2624  // It's intended to be started in a goroutine. See triggerDiscovery() for
  2625  // causing consul discovery from other code locations.
  2626  func (c *Client) consulDiscovery() {
  2627  	for {
  2628  		select {
  2629  		case <-c.triggerDiscoveryCh:
  2630  			if err := c.consulDiscoveryImpl(); err != nil {
  2631  				c.logger.Error("error discovering nomad servers", "error", err)
  2632  			}
  2633  		case <-c.shutdownCh:
  2634  			return
  2635  		}
  2636  	}
  2637  }
  2639  func (c *Client) consulDiscoveryImpl() error {
  2640  	consulLogger := c.logger.Named("consul")
  2642  	dcs, err := c.consulCatalog.Datacenters()
  2643  	if err != nil {
  2644  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  2645  	}
  2646  	if len(dcs) > 2 {
  2647  		// Query the local DC first, then shuffle the
  2648  		// remaining DCs.  Future heartbeats will cause Nomad
  2649  		// Clients to fixate on their local datacenter so
  2650  		// it's okay to talk with remote DCs.  If the no
  2651  		// Nomad servers are available within
  2652  		// datacenterQueryLimit, the next heartbeat will pick
  2653  		// a new set of servers so it's okay.
  2654  		shuffleStrings(dcs[1:])
  2655  		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
  2656  	}
  2658  	// Query for servers in this client's region only
  2659  	region := c.Region()
  2660  	rpcargs := structs.GenericRequest{
  2661  		QueryOptions: structs.QueryOptions{
  2662  			Region: region,
  2663  		},
  2664  	}
  2666  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  2667  	var mErr multierror.Error
  2668  	var nomadServers servers.Servers
  2669  	consulLogger.Debug("bootstrap contacting Consul DCs", "consul_dcs", dcs)
  2670  DISCOLOOP:
  2671  	for _, dc := range dcs {
  2672  		consulOpts := &consulapi.QueryOptions{
  2673  			AllowStale: true,
  2674  			Datacenter: dc,
  2675  			Near:       "_agent",
  2676  			WaitTime:   consul.DefaultQueryWaitDuration,
  2677  		}
  2678  		consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  2679  		if err != nil {
  2680  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  2681  			continue
  2682  		}
  2684  		for _, s := range consulServices {
  2685  			port := strconv.Itoa(s.ServicePort)
  2686  			addrstr := s.ServiceAddress
  2687  			if addrstr == "" {
  2688  				addrstr = s.Address
  2689  			}
  2690  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  2691  			if err != nil {
  2692  				mErr.Errors = append(mErr.Errors, err)
  2693  				continue
  2694  			}
  2695  			var peers []string
  2696  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  2697  				mErr.Errors = append(mErr.Errors, err)
  2698  				continue
  2699  			}
  2701  			// Successfully received the Server peers list of the correct
  2702  			// region
  2703  			for _, p := range peers {
  2704  				addr, err := net.ResolveTCPAddr("tcp", p)
  2705  				if err != nil {
  2706  					mErr.Errors = append(mErr.Errors, err)
  2707  				}
  2708  				srv := &servers.Server{Addr: addr}
  2709  				nomadServers = append(nomadServers, srv)
  2710  			}
  2711  			if len(nomadServers) > 0 {
  2712  				break DISCOLOOP
  2713  			}
  2714  		}
  2715  	}
  2716  	if len(nomadServers) == 0 {
  2717  		if len(mErr.Errors) > 0 {
  2718  			return mErr.ErrorOrNil()
  2719  		}
  2720  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  2721  	}
  2723  	consulLogger.Info("discovered following servers", "servers", nomadServers)
  2725  	// Fire the retry trigger if we have updated the set of servers.
  2726  	if c.servers.SetServers(nomadServers) {
  2727  		// Start rebalancing
  2728  		c.servers.RebalanceServers()
  2730  		// Notify waiting rpc calls. If a goroutine just failed an RPC call and
  2731  		// isn't receiving on this chan yet they'll still retry eventually.
  2732  		// This is a shortcircuit for the longer retry intervals.
  2733  		c.fireRpcRetryWatcher()
  2734  	}
  2736  	return nil
  2737  }
  2739  // emitStats collects host resource usage stats periodically
  2740  func (c *Client) emitStats() {
  2741  	// Determining NodeClass to be emitted
  2742  	var emittedNodeClass string
  2743  	if emittedNodeClass = c.Node().NodeClass; emittedNodeClass == "" {
  2744  		emittedNodeClass = "none"
  2745  	}
  2747  	// Assign labels directly before emitting stats so the information expected
  2748  	// is ready
  2749  	c.baseLabels = []metrics.Label{
  2750  		{Name: "node_id", Value: c.NodeID()},
  2751  		{Name: "datacenter", Value: c.Datacenter()},
  2752  		{Name: "node_class", Value: emittedNodeClass},
  2753  	}
  2755  	// Start collecting host stats right away and then keep collecting every
  2756  	// collection interval
  2757  	next := time.NewTimer(0)
  2758  	defer next.Stop()
  2759  	for {
  2760  		select {
  2761  		case <-next.C:
  2762  			err := c.hostStatsCollector.Collect()
  2763  			next.Reset(c.config.StatsCollectionInterval)
  2764  			if err != nil {
  2765  				c.logger.Warn("error fetching host resource usage stats", "error", err)
  2766  			} else {
  2767  				// Publish Node metrics if operator has opted in
  2768  				if c.config.PublishNodeMetrics {
  2769  					c.emitHostStats()
  2770  				}
  2771  			}
  2773  			c.emitClientMetrics()
  2774  		case <-c.shutdownCh:
  2775  			return
  2776  		}
  2777  	}
  2778  }
  2780  // setGaugeForMemoryStats proxies metrics for memory specific statistics
  2781  func (c *Client) setGaugeForMemoryStats(nodeID string, hStats *stats.HostStats, baseLabels []metrics.Label) {
  2782  	if !c.config.DisableTaggedMetrics {
  2783  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "total"}, float32(hStats.Memory.Total), baseLabels)
  2784  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "available"}, float32(hStats.Memory.Available), baseLabels)
  2785  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "used"}, float32(hStats.Memory.Used), baseLabels)
  2786  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "free"}, float32(hStats.Memory.Free), baseLabels)
  2787  	}
  2789  	if c.config.BackwardsCompatibleMetrics {
  2790  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  2791  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  2792  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  2793  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  2794  	}
  2795  }
  2797  // setGaugeForCPUStats proxies metrics for CPU specific statistics
  2798  func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats, baseLabels []metrics.Label) {
  2799  	for _, cpu := range hStats.CPU {
  2800  		if !c.config.DisableTaggedMetrics {
  2801  			labels := append(baseLabels, metrics.Label{
  2802  				Name:  "cpu",
  2803  				Value: cpu.CPU,
  2804  			})
  2806  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels)
  2807  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels)
  2808  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels)
  2809  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels)
  2810  		}
  2812  		if c.config.BackwardsCompatibleMetrics {
  2813  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  2814  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  2815  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2816  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2817  		}
  2818  	}
  2819  }
  2821  // setGaugeForDiskStats proxies metrics for disk specific statistics
  2822  func (c *Client) setGaugeForDiskStats(nodeID string, hStats *stats.HostStats, baseLabels []metrics.Label) {
  2823  	for _, disk := range hStats.DiskStats {
  2824  		if !c.config.DisableTaggedMetrics {
  2825  			labels := append(baseLabels, metrics.Label{
  2826  				Name:  "disk",
  2827  				Value: disk.Device,
  2828  			})
  2830  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "size"}, float32(disk.Size), labels)
  2831  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used"}, float32(disk.Used), labels)
  2832  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "available"}, float32(disk.Available), labels)
  2833  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used_percent"}, float32(disk.UsedPercent), labels)
  2834  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "inodes_percent"}, float32(disk.InodesUsedPercent), labels)
  2835  		}
  2837  		if c.config.BackwardsCompatibleMetrics {
  2838  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2839  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2840  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2841  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2842  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2843  		}
  2844  	}
  2845  }
  2847  // setGaugeForAllocationStats proxies metrics for allocation specific statistics
  2848  func (c *Client) setGaugeForAllocationStats(nodeID string) {
  2849  	c.configLock.RLock()
  2850  	node := c.configCopy.Node
  2851  	c.configLock.RUnlock()
  2852  	total := node.NodeResources
  2853  	res := node.ReservedResources
  2854  	allocated := c.getAllocatedResources(node)
  2856  	// Emit allocated
  2857  	if !c.config.DisableTaggedMetrics {
  2858  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "memory"}, float32(allocated.Flattened.Memory.MemoryMB), c.baseLabels)
  2859  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "disk"}, float32(allocated.Shared.DiskMB), c.baseLabels)
  2860  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "cpu"}, float32(allocated.Flattened.Cpu.CpuShares), c.baseLabels)
  2861  	}
  2863  	if c.config.BackwardsCompatibleMetrics {
  2864  		metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.Flattened.Memory.MemoryMB))
  2865  		metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.Shared.DiskMB))
  2866  		metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.Flattened.Cpu.CpuShares))
  2867  	}
  2869  	for _, n := range allocated.Flattened.Networks {
  2870  		if !c.config.DisableTaggedMetrics {
  2871  			labels := append(c.baseLabels, metrics.Label{
  2872  				Name:  "device",
  2873  				Value: n.Device,
  2874  			})
  2875  			metrics.SetGaugeWithLabels([]string{"client", "allocated", "network"}, float32(n.MBits), labels)
  2876  		}
  2878  		if c.config.BackwardsCompatibleMetrics {
  2879  			metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits))
  2880  		}
  2881  	}
  2883  	// Emit unallocated
  2884  	unallocatedMem := total.Memory.MemoryMB - res.Memory.MemoryMB - allocated.Flattened.Memory.MemoryMB
  2885  	unallocatedDisk := total.Disk.DiskMB - res.Disk.DiskMB - allocated.Shared.DiskMB
  2886  	unallocatedCpu := total.Cpu.CpuShares - res.Cpu.CpuShares - allocated.Flattened.Cpu.CpuShares
  2888  	if !c.config.DisableTaggedMetrics {
  2889  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), c.baseLabels)
  2890  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), c.baseLabels)
  2891  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), c.baseLabels)
  2892  	}
  2894  	if c.config.BackwardsCompatibleMetrics {
  2895  		metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem))
  2896  		metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk))
  2897  		metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu))
  2898  	}
  2900  	totalComparable := total.Comparable()
  2901  	for _, n := range totalComparable.Flattened.Networks {
  2902  		// Determined the used resources
  2903  		var usedMbits int
  2904  		totalIdx := allocated.Flattened.Networks.NetIndex(n)
  2905  		if totalIdx != -1 {
  2906  			usedMbits = allocated.Flattened.Networks[totalIdx].MBits
  2907  		}
  2909  		unallocatedMbits := n.MBits - usedMbits
  2910  		if !c.config.DisableTaggedMetrics {
  2911  			labels := append(c.baseLabels, metrics.Label{
  2912  				Name:  "device",
  2913  				Value: n.Device,
  2914  			})
  2915  			metrics.SetGaugeWithLabels([]string{"client", "unallocated", "network"}, float32(unallocatedMbits), labels)
  2916  		}
  2918  		if c.config.BackwardsCompatibleMetrics {
  2919  			metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits))
  2920  		}
  2921  	}
  2922  }
  2924  // No labels are required so we emit with only a key/value syntax
  2925  func (c *Client) setGaugeForUptime(hStats *stats.HostStats, baseLabels []metrics.Label) {
  2926  	if !c.config.DisableTaggedMetrics {
  2927  		metrics.SetGaugeWithLabels([]string{"client", "uptime"}, float32(hStats.Uptime), baseLabels)
  2928  	}
  2929  	if c.config.BackwardsCompatibleMetrics {
  2930  		metrics.SetGauge([]string{"client", "uptime"}, float32(hStats.Uptime))
  2931  	}
  2932  }
  2934  // emitHostStats pushes host resource usage stats to remote metrics collection sinks
  2935  func (c *Client) emitHostStats() {
  2936  	nodeID := c.NodeID()
  2937  	hStats := c.hostStatsCollector.Stats()
  2939  	c.configLock.RLock()
  2940  	nodeStatus := c.configCopy.Node.Status
  2941  	nodeEligibility := c.configCopy.Node.SchedulingEligibility
  2942  	c.configLock.RUnlock()
  2944  	labels := append(c.baseLabels,
  2945  		metrics.Label{Name: "node_status", Value: nodeStatus},
  2946  		metrics.Label{Name: "node_scheduling_eligibility", Value: nodeEligibility},
  2947  	)
  2949  	c.setGaugeForMemoryStats(nodeID, hStats, labels)
  2950  	c.setGaugeForUptime(hStats, labels)
  2951  	c.setGaugeForCPUStats(nodeID, hStats, labels)
  2952  	c.setGaugeForDiskStats(nodeID, hStats, labels)
  2953  }
  2955  // emitClientMetrics emits lower volume client metrics
  2956  func (c *Client) emitClientMetrics() {
  2957  	nodeID := c.NodeID()
  2959  	c.setGaugeForAllocationStats(nodeID)
  2961  	// Emit allocation metrics
  2962  	blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
  2963  	for _, ar := range c.getAllocRunners() {
  2964  		switch ar.AllocState().ClientStatus {
  2965  		case structs.AllocClientStatusPending:
  2966  			switch {
  2967  			case ar.IsWaiting():
  2968  				blocked++
  2969  			case ar.IsMigrating():
  2970  				migrating++
  2971  			default:
  2972  				pending++
  2973  			}
  2974  		case structs.AllocClientStatusRunning:
  2975  			running++
  2976  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  2977  			terminal++
  2978  		}
  2979  	}
  2981  	if !c.config.DisableTaggedMetrics {
  2982  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "migrating"}, float32(migrating), c.baseLabels)
  2983  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "blocked"}, float32(blocked), c.baseLabels)
  2984  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "pending"}, float32(pending), c.baseLabels)
  2985  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "running"}, float32(running), c.baseLabels)
  2986  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "terminal"}, float32(terminal), c.baseLabels)
  2987  	}
  2989  	if c.config.BackwardsCompatibleMetrics {
  2990  		metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating))
  2991  		metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked))
  2992  		metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending))
  2993  		metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running))
  2994  		metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal))
  2995  	}
  2996  }
  2998  func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.ComparableResources {
  2999  	// Unfortunately the allocs only have IP so we need to match them to the
  3000  	// device
  3001  	cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks))
  3002  	for _, n := range selfNode.NodeResources.Networks {
  3003  		_, ipnet, err := net.ParseCIDR(n.CIDR)
  3004  		if err != nil {
  3005  			continue
  3006  		}
  3007  		cidrToDevice[ipnet] = n.Device
  3008  	}
  3010  	// Sum the allocated resources
  3011  	var allocated structs.ComparableResources
  3012  	allocatedDeviceMbits := make(map[string]int)
  3013  	for _, ar := range c.getAllocRunners() {
  3014  		alloc := ar.Alloc()
  3015  		if alloc.ServerTerminalStatus() || ar.AllocState().ClientTerminalStatus() {
  3016  			continue
  3017  		}
  3019  		// Add the resources
  3020  		// COMPAT(0.11): Just use the allocated resources
  3021  		allocated.Add(alloc.ComparableResources())
  3023  		// Add the used network
  3024  		if alloc.AllocatedResources != nil {
  3025  			for _, tr := range alloc.AllocatedResources.Tasks {
  3026  				for _, allocatedNetwork := range tr.Networks {
  3027  					for cidr, dev := range cidrToDevice {
  3028  						ip := net.ParseIP(allocatedNetwork.IP)
  3029  						if cidr.Contains(ip) {
  3030  							allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  3031  							break
  3032  						}
  3033  					}
  3034  				}
  3035  			}
  3036  		} else if alloc.Resources != nil {
  3037  			for _, allocatedNetwork := range alloc.Resources.Networks {
  3038  				for cidr, dev := range cidrToDevice {
  3039  					ip := net.ParseIP(allocatedNetwork.IP)
  3040  					if cidr.Contains(ip) {
  3041  						allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  3042  						break
  3043  					}
  3044  				}
  3045  			}
  3046  		}
  3047  	}
  3049  	// Clear the networks
  3050  	allocated.Flattened.Networks = nil
  3051  	for dev, speed := range allocatedDeviceMbits {
  3052  		net := &structs.NetworkResource{
  3053  			Device: dev,
  3054  			MBits:  speed,
  3055  		}
  3056  		allocated.Flattened.Networks = append(allocated.Flattened.Networks, net)
  3057  	}
  3059  	return &allocated
  3060  }
  3062  // GetTaskEventHandler returns an event handler for the given allocID and task name
  3063  func (c *Client) GetTaskEventHandler(allocID, taskName string) drivermanager.EventHandler {
  3064  	c.allocLock.RLock()
  3065  	defer c.allocLock.RUnlock()
  3066  	if ar, ok := c.allocs[allocID]; ok {
  3067  		return ar.GetTaskEventHandler(taskName)
  3068  	}
  3069  	return nil
  3070  }
  3072  // group wraps a func() in a goroutine and provides a way to block until it
  3073  // exits. Inspired by
  3074  type group struct {
  3075  	wg sync.WaitGroup
  3076  }
  3078  // Go starts f in a goroutine and must be called before Wait.
  3079  func (g *group) Go(f func()) {
  3080  	g.wg.Add(1)
  3081  	go func() {
  3082  		defer g.wg.Done()
  3083  		f()
  3084  	}()
  3085  }
  3087  func (c *group) AddCh(ch <-chan struct{}) {
  3088  	c.Go(func() {
  3089  		<-ch
  3090  	})
  3091  }
  3093  // Wait for all goroutines to exit. Must be called after all calls to Go
  3094  // complete.
  3095  func (g *group) Wait() {
  3096  	g.wg.Wait()
  3097  }