github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/client/client.go (about)

     1  package client
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"net"
     8  	"net/rpc"
     9  	"os"
    10  	"path/filepath"
    11  	"sort"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/armon/go-metrics"
    18  	consulapi "github.com/hashicorp/consul/api"
    19  	"github.com/hashicorp/consul/lib"
    20  	"github.com/hashicorp/go-hclog"
    21  	"github.com/hashicorp/go-multierror"
    22  	"github.com/hashicorp/nomad/client/allocdir"
    23  	"github.com/hashicorp/nomad/client/allocrunner"
    24  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    25  	arstate "github.com/hashicorp/nomad/client/allocrunner/state"
    26  	"github.com/hashicorp/nomad/client/allocwatcher"
    27  	"github.com/hashicorp/nomad/client/config"
    28  	consulApi "github.com/hashicorp/nomad/client/consul"
    29  	"github.com/hashicorp/nomad/client/devicemanager"
    30  	"github.com/hashicorp/nomad/client/fingerprint"
    31  	"github.com/hashicorp/nomad/client/pluginmanager"
    32  	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
    33  	"github.com/hashicorp/nomad/client/servers"
    34  	"github.com/hashicorp/nomad/client/state"
    35  	"github.com/hashicorp/nomad/client/stats"
    36  	cstructs "github.com/hashicorp/nomad/client/structs"
    37  	"github.com/hashicorp/nomad/client/vaultclient"
    38  	"github.com/hashicorp/nomad/command/agent/consul"
    39  	"github.com/hashicorp/nomad/helper"
    40  	"github.com/hashicorp/nomad/helper/pool"
    41  	hstats "github.com/hashicorp/nomad/helper/stats"
    42  	"github.com/hashicorp/nomad/helper/tlsutil"
    43  	"github.com/hashicorp/nomad/helper/uuid"
    44  	"github.com/hashicorp/nomad/nomad/structs"
    45  	nconfig "github.com/hashicorp/nomad/nomad/structs/config"
    46  	"github.com/hashicorp/nomad/plugins/device"
    47  	"github.com/hashicorp/nomad/plugins/drivers"
    48  	vaultapi "github.com/hashicorp/vault/api"
    49  	"github.com/shirou/gopsutil/host"
    50  )
    51  
// Timing and sizing constants used by the client.
const (
	// clientRPCCache controls how long we keep an idle connection
	// open to a server.
	clientRPCCache = 5 * time.Minute

	// clientMaxStreams controls how many idle streams we keep
	// open to a server.
	clientMaxStreams = 2

	// datacenterQueryLimit searches through up to this many adjacent
	// datacenters looking for the Nomad server service.
	datacenterQueryLimit = 9

	// registerRetryIntv is the minimum interval on which we retry
	// registration. We pick a value between this and 2x this.
	registerRetryIntv = 15 * time.Second

	// getAllocRetryIntv is the minimum interval on which we retry
	// fetching allocations. We pick a value between this and 2x this.
	getAllocRetryIntv = 30 * time.Second

	// devModeRetryIntv is the retry interval used for development.
	devModeRetryIntv = time.Second

	// stateSnapshotIntv is how often the client snapshots state.
	stateSnapshotIntv = 60 * time.Second

	// initialHeartbeatStagger is used to stagger the interval between
	// starting and the initial heartbeat. After the initial heartbeat,
	// we switch to using the TTL specified by the servers.
	initialHeartbeatStagger = 10 * time.Second

	// nodeUpdateRetryIntv is how often the client checks for updates to the
	// node attributes or meta map.
	nodeUpdateRetryIntv = 5 * time.Second

	// allocSyncIntv is the batching period of allocation updates before they
	// are synced with the server.
	allocSyncIntv = 200 * time.Millisecond

	// allocSyncRetryIntv is the interval on which we retry updating
	// the status of the allocation.
	allocSyncRetryIntv = 5 * time.Second
)
    96  
var (
	// batchFirstFingerprintsProcessingGrace is the grace period allowed for
	// batch fingerprint processing: the batch timeout plus five seconds.
	// NewClient waits at most this long for Ready() before proceeding.
	batchFirstFingerprintsProcessingGrace = batchFirstFingerprintsTimeout + 5*time.Second
)
   101  
// ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
// Client. *Client implements it (see Client.StatsReporter).
type ClientStatsReporter interface {
	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
	// If it does not exist an error is reported.
	GetAllocStats(allocID string) (interfaces.AllocStatsReporter, error)

	// LatestHostStats returns the latest resource usage stats for the host.
	LatestHostStats() *stats.HostStats
}
   112  
// AllocRunner is the interface implemented by the core alloc runner. The
// Client stores one AllocRunner per allocation ID in its allocs map.
// TODO: Create via factory to allow testing Client with mock AllocRunners.
type AllocRunner interface {
	Alloc() *structs.Allocation
	AllocState() *arstate.State
	// Destroy and DestroyCh are used by Client.Shutdown in dev mode;
	// Shutdown and ShutdownCh are used otherwise.
	Destroy()
	Shutdown()
	// GetAllocDir backs Client.GetAllocFS.
	GetAllocDir() *allocdir.AllocDir
	IsDestroyed() bool
	IsMigrating() bool
	IsWaiting() bool
	Listener() *cstructs.AllocListener
	Restore() error
	Run()
	// StatsReporter backs Client.GetAllocStats.
	StatsReporter() interfaces.AllocStatsReporter
	Update(*structs.Allocation)
	WaitCh() <-chan struct{}
	DestroyCh() <-chan struct{}
	ShutdownCh() <-chan struct{}
	// Signal backs Client.SignalAllocation.
	Signal(taskName, signal string) error
	GetTaskEventHandler(taskName string) drivermanager.EventHandler

	// RestartTask and RestartAll back Client.RestartAllocation, which picks
	// between them based on whether a task name was supplied.
	RestartTask(taskName string, taskEvent *structs.TaskEvent) error
	RestartAll(taskEvent *structs.TaskEvent) error

	GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler
	GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error)
}
   141  
// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
type Client struct {
	// config is the configuration the client was created with. NewClient
	// records the creation time in start.
	config *config.Config
	start  time.Time

	// stateDB is used to efficiently store client state.
	stateDB state.StateDB

	// configCopy is a copy that should be passed to alloc-runners.
	// configLock guards it (and writes to config during TLS reloads).
	configCopy *config.Config
	configLock sync.RWMutex

	logger    hclog.Logger
	rpcLogger hclog.Logger

	connPool *pool.ConnPool

	// tlsWrap is used to wrap outbound connections using TLS. It should be
	// accessed using the lock.
	tlsWrap     tlsutil.RegionWrapper
	tlsWrapLock sync.RWMutex

	// servers is the list of nomad servers
	servers *servers.Manager

	// heartbeat related times for tracking how often to heartbeat
	lastHeartbeat   time.Time
	heartbeatTTL    time.Duration
	haveHeartbeated bool
	heartbeatLock   sync.Mutex

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// triggerNodeUpdate triggers the client to mark the Node as changed and
	// update it.
	triggerNodeUpdate chan struct{}

	// triggerEmitNodeEvent sends an event and triggers the client to update the
	// server for the node event
	triggerEmitNodeEvent chan *structs.NodeEvent

	// rpcRetryCh is closed when an event occurs, such as server discovery or
	// a successful RPC, after which a retry should happen. Access should only
	// occur via the getter method.
	rpcRetryCh   chan struct{}
	rpcRetryLock sync.Mutex

	// allocs maps alloc IDs to their AllocRunner. This map includes all
	// AllocRunners - running and GC'd - until the server GCs them.
	allocs    map[string]AllocRunner
	allocLock sync.RWMutex

	// invalidAllocs is a map that tracks allocations that failed because
	// the client couldn't initialize alloc or task runners for it. This can
	// happen due to driver errors
	invalidAllocs     map[string]struct{}
	invalidAllocsLock sync.Mutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulService is Nomad's custom Consul client for managing services
	// and checks.
	consulService consulApi.ConsulServiceAPI

	// consulCatalog is the subset of Consul's Catalog API Nomad uses.
	consulCatalog consul.CatalogAPI

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector

	// shutdown is true when the Client has been shutdown. Must hold
	// shutdownLock to access.
	shutdown bool

	// shutdownCh is closed to signal the Client is shutting down.
	shutdownCh chan struct{}

	shutdownLock sync.Mutex

	// shutdownGroup are goroutines that exit when shutdownCh is closed.
	// Shutdown() blocks on Wait() after closing shutdownCh.
	shutdownGroup group

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// garbageCollector is used to garbage collect terminal allocations present
	// in the node automatically
	garbageCollector *AllocGarbageCollector

	// clientACLResolver holds the ACL resolution state
	clientACLResolver

	// rpcServer is used to serve RPCs by the local agent.
	rpcServer     *rpc.Server
	endpoints     rpcEndpoints
	streamingRpcs *structs.StreamingRpcRegistry

	// pluginManagers is the set of PluginManagers registered by the client
	pluginManagers *pluginmanager.PluginGroup

	// devicemanager is responsible for managing device plugins.
	devicemanager devicemanager.Manager

	// drivermanager is responsible for managing driver plugins
	drivermanager drivermanager.Manager

	// baseLabels are used when emitting tagged metrics. All client metrics will
	// have these tags, and optionally more.
	baseLabels []metrics.Label

	// batchNodeUpdates is used to batch initial updates to the node
	batchNodeUpdates *batchNodeUpdates

	// fpInitialized chan is closed when the first batch of fingerprints are
	// applied to the node and the server is updated. It is exposed through
	// Ready().
	fpInitialized chan struct{}

	// serversContactedCh is closed when GetClientAllocs and runAllocs have
	// successfully run once.
	serversContactedCh   chan struct{}
	serversContactedOnce sync.Once
}
   269  
var (
	// noServersErr is returned by the RPC method when the client has no
	// configured servers. This is used to trigger Consul discovery if
	// enabled.
	// NOTE(review): Go convention would name this errNoServers; it is left
	// as-is because references outside this view cannot be updated here.
	noServersErr = errors.New("no servers")
)
   276  
// NewClient is used to create a new client from the given configuration.
//
// It builds the outgoing TLS wrapper (when RPC TLS is enabled), constructs the
// Client, starts the server manager, initializes on-disk state, sets up the
// RPC server, ACL state and node, fingerprints the node, starts the driver and
// device plugin managers, the host stats collector and the garbage collector,
// applies the statically configured servers, optionally starts Consul
// discovery, sets up the Vault client, waits (bounded by
// batchFirstFingerprintsProcessingGrace) for the first fingerprint batch,
// and finally starts registration/heartbeating, restores state and launches
// the long-running background goroutines.
//
// cfg is mutated: StateDBFactory is defaulted here when nil.
//
// NOTE(review): several goroutines are started before later steps that can
// fail; the error returns below do not stop them within this function.
func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService consulApi.ConsulServiceAPI) (*Client, error) {
	// Create the tls wrapper; it stays nil when RPC TLS is disabled.
	var tlsWrap tlsutil.RegionWrapper
	if cfg.TLSConfig.EnableRPC {
		tw, err := tlsutil.NewTLSConfiguration(cfg.TLSConfig, true, true)
		if err != nil {
			return nil, err
		}
		tlsWrap, err = tw.OutgoingTLSWrapper()
		if err != nil {
			return nil, err
		}
	}

	// Default the state database factory based on dev mode when the caller
	// did not supply one.
	if cfg.StateDBFactory == nil {
		cfg.StateDBFactory = state.GetStateDBFactory(cfg.DevMode)
	}

	// Create the logger
	logger := cfg.Logger.ResetNamed("client")

	// Create the client
	c := &Client{
		config:               cfg,
		consulCatalog:        consulCatalog,
		consulService:        consulService,
		start:                time.Now(),
		connPool:             pool.NewPool(logger, clientRPCCache, clientMaxStreams, tlsWrap),
		tlsWrap:              tlsWrap,
		streamingRpcs:        structs.NewStreamingRpcRegistry(),
		logger:               logger,
		rpcLogger:            logger.Named("rpc"),
		allocs:               make(map[string]AllocRunner),
		allocUpdates:         make(chan *structs.Allocation, 64),
		shutdownCh:           make(chan struct{}),
		triggerDiscoveryCh:   make(chan struct{}),
		triggerNodeUpdate:    make(chan struct{}, 8),
		triggerEmitNodeEvent: make(chan *structs.NodeEvent, 8),
		fpInitialized:        make(chan struct{}),
		invalidAllocs:        make(map[string]struct{}),
		serversContactedCh:   make(chan struct{}),
		serversContactedOnce: sync.Once{},
	}

	// Driver and device updates are routed through the batcher, which is
	// handed the client's own update callbacks.
	c.batchNodeUpdates = newBatchNodeUpdates(
		c.updateNodeFromDriver,
		c.updateNodeFromDevices,
	)

	// Initialize the server manager
	c.servers = servers.New(c.logger, c.shutdownCh, c)

	// Start server manager rebalancing go routine
	go c.servers.Start()

	// Initialize the client
	if err := c.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize client: %v", err)
	}

	// Setup the clients RPC server
	c.setupClientRpc()

	// Initialize the ACL state
	if err := c.clientACLResolver.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize ACL state: %v", err)
	}

	// Setup the node
	if err := c.setupNode(); err != nil {
		return nil, fmt.Errorf("node setup failed: %v", err)
	}

	// Store the config copy before restoring state but after it has been
	// initialized.
	c.configLock.Lock()
	c.configCopy = c.config.Copy()
	c.configLock.Unlock()

	fingerprintManager := NewFingerprintManager(
		c.configCopy.PluginSingletonLoader, c.GetConfig, c.configCopy.Node,
		c.shutdownCh, c.updateNodeFromFingerprint, c.logger)

	c.pluginManagers = pluginmanager.New(c.logger)

	// Fingerprint the node and scan for drivers
	if err := fingerprintManager.Run(); err != nil {
		return nil, fmt.Errorf("fingerprinting failed: %v", err)
	}

	// Build the driver allow/block lists from the "driver.whitelist" and
	// "driver.blacklist" options.
	allowlistDrivers := cfg.ReadStringListToMap("driver.whitelist")
	blocklistDrivers := cfg.ReadStringListToMap("driver.blacklist")

	// Setup the driver manager
	driverConfig := &drivermanager.Config{
		Logger:              c.logger,
		Loader:              c.configCopy.PluginSingletonLoader,
		PluginConfig:        c.configCopy.NomadPluginConfig(),
		Updater:             c.batchNodeUpdates.updateNodeFromDriver,
		EventHandlerFactory: c.GetTaskEventHandler,
		State:               c.stateDB,
		AllowedDrivers:      allowlistDrivers,
		BlockedDrivers:      blocklistDrivers,
	}
	drvManager := drivermanager.New(driverConfig)
	c.drivermanager = drvManager
	c.pluginManagers.RegisterAndRun(drvManager)

	// Setup the device manager
	devConfig := &devicemanager.Config{
		Logger:        c.logger,
		Loader:        c.configCopy.PluginSingletonLoader,
		PluginConfig:  c.configCopy.NomadPluginConfig(),
		Updater:       c.batchNodeUpdates.updateNodeFromDevices,
		StatsInterval: c.configCopy.StatsCollectionInterval,
		State:         c.stateDB,
	}
	devManager := devicemanager.New(devConfig)
	c.devicemanager = devManager
	c.pluginManagers.RegisterAndRun(devManager)

	// Batching of initial fingerprints is done to reduce the number of node
	// updates sent to the server on startup.
	go c.batchFirstFingerprints()

	// Add the stats collector
	statsCollector := stats.NewHostStatsCollector(c.logger, c.config.AllocDir, c.devicemanager.AllStats)
	c.hostStatsCollector = statsCollector

	// Add the garbage collector
	gcConfig := &GCConfig{
		MaxAllocs:           cfg.GCMaxAllocs,
		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
		Interval:            cfg.GCInterval,
		ParallelDestroys:    cfg.GCParallelDestroys,
		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
	}
	c.garbageCollector = NewAllocGarbageCollector(c.logger, statsCollector, c, gcConfig)
	go c.garbageCollector.Run()

	// Set the preconfigured list of static servers. A failure is only
	// logged; startup continues.
	c.configLock.RLock()
	if len(c.configCopy.Servers) > 0 {
		if _, err := c.setServersImpl(c.configCopy.Servers, true); err != nil {
			logger.Warn("none of the configured servers are valid", "error", err)
		}
	}
	c.configLock.RUnlock()

	// Setup Consul discovery if enabled
	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
		c.shutdownGroup.Go(c.consulDiscovery)
		if c.servers.NumServers() == 0 {
			// No configured servers; trigger discovery manually.
			// triggerDiscoveryCh is unbuffered, so this send blocks until a
			// receiver takes it (presumably the consulDiscovery goroutine
			// started just above).
			c.triggerDiscoveryCh <- struct{}{}
		}
	}

	// Setup the vault client for token and secret renewals
	if err := c.setupVaultClient(); err != nil {
		return nil, fmt.Errorf("failed to setup vault client: %v", err)
	}

	// wait until drivers are healthy before restoring or registering with servers
	select {
	case <-c.Ready():
	case <-time.After(batchFirstFingerprintsProcessingGrace):
		logger.Warn("batch fingerprint operation timed out; proceeding to register with fingerprinted plugins so far")
	}

	// Register and then start heartbeating to the servers.
	c.shutdownGroup.Go(c.registerAndHeartbeat)

	// Restore the state. The underlying error is logged here; the returned
	// error carries only a generic message.
	if err := c.restoreState(); err != nil {
		logger.Error("failed to restore state", "error", err)
		logger.Error("Nomad is unable to start due to corrupt state. "+
			"The safest way to proceed is to manually stop running task processes "+
			"and remove Nomad's state and alloc directories before "+
			"restarting. Lost allocations will be rescheduled.",
			"state_dir", c.config.StateDir, "alloc_dir", c.config.AllocDir)
		logger.Error("Corrupt state is often caused by a bug. Please " +
			"report as much information as possible to " +
			"https://github.com/hashicorp/nomad/issues")
		return nil, fmt.Errorf("failed to restore state")
	}

	// Begin periodic snapshotting of state.
	c.shutdownGroup.Go(c.periodicSnapshot)

	// Begin syncing allocations to the server
	c.shutdownGroup.Go(c.allocSync)

	// Start the client! Don't use the shutdownGroup as run handles
	// shutdowns manually to prevent updates from being applied during
	// shutdown.
	go c.run()

	// Start collecting stats
	c.shutdownGroup.Go(c.emitStats)

	c.logger.Info("started client", "node_id", c.NodeID())
	return c, nil
}
   484  
// Ready returns the client's fpInitialized channel, which is closed once the
// first batch of fingerprints has been applied to the node and the server has
// been updated. NewClient waits on it (with a timeout) before registering.
func (c *Client) Ready() <-chan struct{} {
	return c.fpInitialized
}
   489  
   490  // init is used to initialize the client and perform any setup
   491  // needed before we begin starting its various components.
   492  func (c *Client) init() error {
   493  	// Ensure the state dir exists if we have one
   494  	if c.config.StateDir != "" {
   495  		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
   496  			return fmt.Errorf("failed creating state dir: %s", err)
   497  		}
   498  
   499  	} else {
   500  		// Otherwise make a temp directory to use.
   501  		p, err := ioutil.TempDir("", "NomadClient")
   502  		if err != nil {
   503  			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
   504  		}
   505  
   506  		p, err = filepath.EvalSymlinks(p)
   507  		if err != nil {
   508  			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
   509  		}
   510  
   511  		c.config.StateDir = p
   512  	}
   513  	c.logger.Info("using state directory", "state_dir", c.config.StateDir)
   514  
   515  	// Open the state database
   516  	db, err := c.config.StateDBFactory(c.logger, c.config.StateDir)
   517  	if err != nil {
   518  		return fmt.Errorf("failed to open state database: %v", err)
   519  	}
   520  
   521  	// Upgrade the state database
   522  	if err := db.Upgrade(); err != nil {
   523  		// Upgrade only returns an error on critical persistence
   524  		// failures in which an operator should intervene before the
   525  		// node is accessible. Upgrade drops and logs corrupt state it
   526  		// encounters, so failing to start the agent should be extremely
   527  		// rare.
   528  		return fmt.Errorf("failed to upgrade state database: %v", err)
   529  	}
   530  
   531  	c.stateDB = db
   532  
   533  	// Ensure the alloc dir exists if we have one
   534  	if c.config.AllocDir != "" {
   535  		if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil {
   536  			return fmt.Errorf("failed creating alloc dir: %s", err)
   537  		}
   538  	} else {
   539  		// Otherwise make a temp directory to use.
   540  		p, err := ioutil.TempDir("", "NomadClient")
   541  		if err != nil {
   542  			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
   543  		}
   544  
   545  		p, err = filepath.EvalSymlinks(p)
   546  		if err != nil {
   547  			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
   548  		}
   549  
   550  		// Change the permissions to have the execute bit
   551  		if err := os.Chmod(p, 0711); err != nil {
   552  			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
   553  		}
   554  
   555  		c.config.AllocDir = p
   556  	}
   557  
   558  	c.logger.Info("using alloc directory", "alloc_dir", c.config.AllocDir)
   559  	return nil
   560  }
   561  
   562  // reloadTLSConnections allows a client to reload its TLS configuration on the
   563  // fly
   564  func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error {
   565  	var tlsWrap tlsutil.RegionWrapper
   566  	if newConfig != nil && newConfig.EnableRPC {
   567  		tw, err := tlsutil.NewTLSConfiguration(newConfig, true, true)
   568  		if err != nil {
   569  			return err
   570  		}
   571  
   572  		twWrap, err := tw.OutgoingTLSWrapper()
   573  		if err != nil {
   574  			return err
   575  		}
   576  		tlsWrap = twWrap
   577  	}
   578  
   579  	// Store the new tls wrapper.
   580  	c.tlsWrapLock.Lock()
   581  	c.tlsWrap = tlsWrap
   582  	c.tlsWrapLock.Unlock()
   583  
   584  	// Keep the client configuration up to date as we use configuration values to
   585  	// decide on what type of connections to accept
   586  	c.configLock.Lock()
   587  	c.config.TLSConfig = newConfig
   588  	c.configLock.Unlock()
   589  
   590  	c.connPool.ReloadTLS(tlsWrap)
   591  
   592  	return nil
   593  }
   594  
   595  // Reload allows a client to reload its configuration on the fly
   596  func (c *Client) Reload(newConfig *config.Config) error {
   597  	shouldReloadTLS, err := tlsutil.ShouldReloadRPCConnections(c.config.TLSConfig, newConfig.TLSConfig)
   598  	if err != nil {
   599  		c.logger.Error("error parsing TLS configuration", "error", err)
   600  		return err
   601  	}
   602  
   603  	if shouldReloadTLS {
   604  		return c.reloadTLSConnections(newConfig.TLSConfig)
   605  	}
   606  
   607  	return nil
   608  }
   609  
// Leave is used to prepare the client to leave the cluster. It is currently
// an unimplemented stub that always returns nil.
func (c *Client) Leave() error {
	// TODO
	return nil
}
   615  
   616  // GetConfig returns the config of the client
   617  func (c *Client) GetConfig() *config.Config {
   618  	c.configLock.Lock()
   619  	defer c.configLock.Unlock()
   620  	return c.configCopy
   621  }
   622  
// Datacenter returns the datacenter of the node in the client's config.
// The value is read without taking configLock.
func (c *Client) Datacenter() string {
	return c.config.Node.Datacenter
}
   627  
// Region returns the region from the client's config.
// The value is read without taking configLock.
func (c *Client) Region() string {
	return c.config.Region
}
   632  
// NodeID returns the ID of the node in the client's config.
// The value is read without taking configLock.
func (c *Client) NodeID() string {
	return c.config.Node.ID
}
   637  
// secretNodeID returns the secret ID of the node in the client's config.
// It is used by ValidateMigrateToken when comparing migrate tokens.
func (c *Client) secretNodeID() string {
	return c.config.Node.SecretID
}
   642  
// RPCMajorVersion returns the structs.ApiMajorVersion supported by the
// client. The value is a package-level constant and does not depend on the
// receiver's state.
func (c *Client) RPCMajorVersion() int {
	return structs.ApiMajorVersion
}
   648  
// RPCMinorVersion returns the structs.ApiMinorVersion supported by the
// client. The value is a package-level constant and does not depend on the
// receiver's state.
func (c *Client) RPCMinorVersion() int {
	return structs.ApiMinorVersion
}
   654  
   655  // Shutdown is used to tear down the client
   656  func (c *Client) Shutdown() error {
   657  	c.shutdownLock.Lock()
   658  	defer c.shutdownLock.Unlock()
   659  
   660  	if c.shutdown {
   661  		c.logger.Info("already shutdown")
   662  		return nil
   663  	}
   664  	c.logger.Info("shutting down")
   665  
   666  	// Stop renewing tokens and secrets
   667  	if c.vaultClient != nil {
   668  		c.vaultClient.Stop()
   669  	}
   670  
   671  	// Stop Garbage collector
   672  	c.garbageCollector.Stop()
   673  
   674  	arGroup := group{}
   675  	if c.config.DevMode {
   676  		// In DevMode destroy all the running allocations.
   677  		for _, ar := range c.getAllocRunners() {
   678  			ar.Destroy()
   679  			arGroup.AddCh(ar.DestroyCh())
   680  		}
   681  	} else {
   682  		// In normal mode call shutdown
   683  		for _, ar := range c.getAllocRunners() {
   684  			ar.Shutdown()
   685  			arGroup.AddCh(ar.ShutdownCh())
   686  		}
   687  	}
   688  	arGroup.Wait()
   689  
   690  	// Shutdown the plugin managers
   691  	c.pluginManagers.Shutdown()
   692  
   693  	c.shutdown = true
   694  	close(c.shutdownCh)
   695  
   696  	// Must close connection pool to unblock alloc watcher
   697  	c.connPool.Shutdown()
   698  
   699  	// Wait for goroutines to stop
   700  	c.shutdownGroup.Wait()
   701  
   702  	// One final save state
   703  	c.saveState()
   704  	return c.stateDB.Close()
   705  }
   706  
   707  // Stats is used to return statistics for debugging and insight
   708  // for various sub-systems
   709  func (c *Client) Stats() map[string]map[string]string {
   710  	c.heartbeatLock.Lock()
   711  	defer c.heartbeatLock.Unlock()
   712  	stats := map[string]map[string]string{
   713  		"client": {
   714  			"node_id":         c.NodeID(),
   715  			"known_servers":   strings.Join(c.GetServers(), ","),
   716  			"num_allocations": strconv.Itoa(c.NumAllocs()),
   717  			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
   718  			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
   719  		},
   720  		"runtime": hstats.RuntimeStats(),
   721  	}
   722  	return stats
   723  }
   724  
   725  // SignalAllocation sends a signal to the tasks within an allocation.
   726  // If the provided task is empty, then every allocation will be signalled.
   727  // If a task is provided, then only an exactly matching task will be signalled.
   728  func (c *Client) SignalAllocation(allocID, task, signal string) error {
   729  	ar, err := c.getAllocRunner(allocID)
   730  	if err != nil {
   731  		return err
   732  	}
   733  
   734  	return ar.Signal(task, signal)
   735  }
   736  
// CollectAllocation garbage collects a single allocation on a node by
// delegating to the client's garbage collector. Returns true if alloc was
// found and garbage collected; otherwise false.
func (c *Client) CollectAllocation(allocID string) bool {
	return c.garbageCollector.Collect(allocID)
}
   742  
// CollectAllAllocs garbage collects all allocations on a node in the terminal
// state by delegating to the client's garbage collector.
func (c *Client) CollectAllAllocs() {
	c.garbageCollector.CollectAll()
}
   748  
   749  func (c *Client) RestartAllocation(allocID, taskName string) error {
   750  	ar, err := c.getAllocRunner(allocID)
   751  	if err != nil {
   752  		return err
   753  	}
   754  
   755  	event := structs.NewTaskEvent(structs.TaskRestartSignal).
   756  		SetRestartReason("User requested restart")
   757  
   758  	if taskName != "" {
   759  		return ar.RestartTask(taskName, event)
   760  	}
   761  
   762  	return ar.RestartAll(event)
   763  }
   764  
// Node returns the locally registered node, taken from the client's config
// copy while holding the config read lock. The node pointer itself is
// returned, not a copy.
func (c *Client) Node() *structs.Node {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	return c.configCopy.Node
}
   771  
   772  func (c *Client) getAllocRunner(allocID string) (AllocRunner, error) {
   773  	c.allocLock.RLock()
   774  	defer c.allocLock.RUnlock()
   775  
   776  	ar, ok := c.allocs[allocID]
   777  	if !ok {
   778  		return nil, structs.NewErrUnknownAllocation(allocID)
   779  	}
   780  
   781  	return ar, nil
   782  }
   783  
// StatsReporter exposes the various APIs related to resource usage of a Nomad
// client. The Client itself is returned as the ClientStatsReporter.
func (c *Client) StatsReporter() ClientStatsReporter {
	return c
}
   789  
   790  func (c *Client) GetAllocStats(allocID string) (interfaces.AllocStatsReporter, error) {
   791  	ar, err := c.getAllocRunner(allocID)
   792  	if err != nil {
   793  		return nil, err
   794  	}
   795  	return ar.StatsReporter(), nil
   796  }
   797  
// LatestHostStats returns the stats currently held by the client's host stats
// collector.
func (c *Client) LatestHostStats() *stats.HostStats {
	return c.hostStatsCollector.Stats()
}
   802  
   803  func (c *Client) LatestDeviceResourceStats(devices []*structs.AllocatedDeviceResource) []*device.DeviceGroupStats {
   804  	return c.computeAllocatedDeviceGroupStats(devices, c.LatestHostStats().DeviceStats)
   805  }
   806  
   807  func (c *Client) computeAllocatedDeviceGroupStats(devices []*structs.AllocatedDeviceResource, hostDeviceGroupStats []*device.DeviceGroupStats) []*device.DeviceGroupStats {
   808  	// basic optimization for the usual case
   809  	if len(devices) == 0 || len(hostDeviceGroupStats) == 0 {
   810  		return nil
   811  	}
   812  
   813  	// Build an index of allocated devices
   814  	adIdx := map[structs.DeviceIdTuple][]string{}
   815  
   816  	total := 0
   817  	for _, ds := range devices {
   818  		adIdx[*ds.ID()] = ds.DeviceIDs
   819  		total += len(ds.DeviceIDs)
   820  	}
   821  
   822  	// Collect allocated device stats from host stats
   823  	result := make([]*device.DeviceGroupStats, 0, len(adIdx))
   824  
   825  	for _, dg := range hostDeviceGroupStats {
   826  		k := structs.DeviceIdTuple{
   827  			Vendor: dg.Vendor,
   828  			Type:   dg.Type,
   829  			Name:   dg.Name,
   830  		}
   831  
   832  		allocatedDeviceIDs, ok := adIdx[k]
   833  		if !ok {
   834  			continue
   835  		}
   836  
   837  		rdgStats := &device.DeviceGroupStats{
   838  			Vendor:        dg.Vendor,
   839  			Type:          dg.Type,
   840  			Name:          dg.Name,
   841  			InstanceStats: map[string]*device.DeviceStats{},
   842  		}
   843  
   844  		for _, adID := range allocatedDeviceIDs {
   845  			deviceStats, ok := dg.InstanceStats[adID]
   846  			if !ok || deviceStats == nil {
   847  				c.logger.Warn("device not found in stats", "device_id", adID, "device_group_id", k)
   848  				continue
   849  			}
   850  
   851  			rdgStats.InstanceStats[adID] = deviceStats
   852  		}
   853  		result = append(result, rdgStats)
   854  	}
   855  
   856  	return result
   857  }
   858  
   859  // ValidateMigrateToken verifies that a token is for a specific client and
   860  // allocation, and has been created by a trusted party that has privileged
   861  // knowledge of the client's secret identifier
   862  func (c *Client) ValidateMigrateToken(allocID, migrateToken string) bool {
   863  	if !c.config.ACLEnabled {
   864  		return true
   865  	}
   866  
   867  	return structs.CompareMigrateToken(allocID, c.secretNodeID(), migrateToken)
   868  }
   869  
   870  // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
   871  func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
   872  	ar, err := c.getAllocRunner(allocID)
   873  	if err != nil {
   874  		return nil, err
   875  	}
   876  
   877  	return ar.GetAllocDir(), nil
   878  }
   879  
   880  // GetAllocState returns a copy of an allocation's state on this client. It
   881  // returns either an AllocState or an unknown allocation error.
   882  func (c *Client) GetAllocState(allocID string) (*arstate.State, error) {
   883  	ar, err := c.getAllocRunner(allocID)
   884  	if err != nil {
   885  		return nil, err
   886  	}
   887  
   888  	return ar.AllocState(), nil
   889  }
   890  
   891  // GetServers returns the list of nomad servers this client is aware of.
   892  func (c *Client) GetServers() []string {
   893  	endpoints := c.servers.GetServers()
   894  	res := make([]string, len(endpoints))
   895  	for i := range endpoints {
   896  		res[i] = endpoints[i].String()
   897  	}
   898  	sort.Strings(res)
   899  	return res
   900  }
   901  
   902  // SetServers sets a new list of nomad servers to connect to. As long as one
   903  // server is resolvable no error is returned.
   904  func (c *Client) SetServers(in []string) (int, error) {
   905  	return c.setServersImpl(in, false)
   906  }
   907  
   908  // setServersImpl sets a new list of nomad servers to connect to. If force is
   909  // set, we add the server to the internal serverlist even if the server could not
   910  // be pinged. An error is returned if no endpoints were valid when non-forcing.
   911  //
   912  // Force should be used when setting the servers from the initial configuration
   913  // since the server may be starting up in parallel and initial pings may fail.
   914  func (c *Client) setServersImpl(in []string, force bool) (int, error) {
   915  	var mu sync.Mutex
   916  	var wg sync.WaitGroup
   917  	var merr multierror.Error
   918  
   919  	endpoints := make([]*servers.Server, 0, len(in))
   920  	wg.Add(len(in))
   921  
   922  	for _, s := range in {
   923  		go func(srv string) {
   924  			defer wg.Done()
   925  			addr, err := resolveServer(srv)
   926  			if err != nil {
   927  				mu.Lock()
   928  				c.logger.Debug("ignoring server due to resolution error", "error", err, "server", srv)
   929  				merr.Errors = append(merr.Errors, err)
   930  				mu.Unlock()
   931  				return
   932  			}
   933  
   934  			// Try to ping to check if it is a real server
   935  			if err := c.Ping(addr); err != nil {
   936  				mu.Lock()
   937  				merr.Errors = append(merr.Errors, fmt.Errorf("Server at address %s failed ping: %v", addr, err))
   938  				mu.Unlock()
   939  
   940  				// If we are forcing the setting of the servers, inject it to
   941  				// the serverlist even if we can't ping immediately.
   942  				if !force {
   943  					return
   944  				}
   945  			}
   946  
   947  			mu.Lock()
   948  			endpoints = append(endpoints, &servers.Server{Addr: addr})
   949  			mu.Unlock()
   950  		}(s)
   951  	}
   952  
   953  	wg.Wait()
   954  
   955  	// Only return errors if no servers are valid
   956  	if len(endpoints) == 0 {
   957  		if len(merr.Errors) > 0 {
   958  			return 0, merr.ErrorOrNil()
   959  		}
   960  		return 0, noServersErr
   961  	}
   962  
   963  	c.servers.SetServers(endpoints)
   964  	return len(endpoints), nil
   965  }
   966  
   967  // restoreState is used to restore our state from the data dir
   968  // If there are errors restoring a specific allocation it is marked
   969  // as failed whenever possible.
   970  func (c *Client) restoreState() error {
   971  	if c.config.DevMode {
   972  		return nil
   973  	}
   974  
   975  	//XXX REMOVED! make a note in backward compat / upgrading doc
   976  	// COMPAT: Remove in 0.7.0
   977  	// 0.6.0 transitioned from individual state files to a single bolt-db.
   978  	// The upgrade path is to:
   979  	// Check if old state exists
   980  	//   If so, restore from that and delete old state
   981  	// Restore using state database
   982  
   983  	// Restore allocations
   984  	allocs, allocErrs, err := c.stateDB.GetAllAllocations()
   985  	if err != nil {
   986  		return err
   987  	}
   988  
   989  	for allocID, err := range allocErrs {
   990  		c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
   991  		//TODO Cleanup
   992  		// Try to clean up alloc dir
   993  		// Remove boltdb entries?
   994  		// Send to server with clientstatus=failed
   995  	}
   996  
   997  	// Load each alloc back
   998  	for _, alloc := range allocs {
   999  
  1000  		//XXX On Restore we give up on watching previous allocs because
  1001  		//    we need the local AllocRunners initialized first. We could
  1002  		//    add a second loop to initialize just the alloc watcher.
  1003  		prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
  1004  		prevAllocMigrator := allocwatcher.NoopPrevAlloc{}
  1005  
  1006  		c.configLock.RLock()
  1007  		arConf := &allocrunner.Config{
  1008  			Alloc:               alloc,
  1009  			Logger:              c.logger,
  1010  			ClientConfig:        c.configCopy,
  1011  			StateDB:             c.stateDB,
  1012  			StateUpdater:        c,
  1013  			DeviceStatsReporter: c,
  1014  			Consul:              c.consulService,
  1015  			Vault:               c.vaultClient,
  1016  			PrevAllocWatcher:    prevAllocWatcher,
  1017  			PrevAllocMigrator:   prevAllocMigrator,
  1018  			DeviceManager:       c.devicemanager,
  1019  			DriverManager:       c.drivermanager,
  1020  			ServersContactedCh:  c.serversContactedCh,
  1021  		}
  1022  		c.configLock.RUnlock()
  1023  
  1024  		ar, err := allocrunner.NewAllocRunner(arConf)
  1025  		if err != nil {
  1026  			c.logger.Error("error running alloc", "error", err, "alloc_id", alloc.ID)
  1027  			c.handleInvalidAllocs(alloc, err)
  1028  			continue
  1029  		}
  1030  
  1031  		// Restore state
  1032  		if err := ar.Restore(); err != nil {
  1033  			c.logger.Error("error restoring alloc", "error", err, "alloc_id", alloc.ID)
  1034  			// Override the status of the alloc to failed
  1035  			ar.SetClientStatus(structs.AllocClientStatusFailed)
  1036  			// Destroy the alloc runner since this is a failed restore
  1037  			ar.Destroy()
  1038  			continue
  1039  		}
  1040  
  1041  		//XXX is this locking necessary?
  1042  		c.allocLock.Lock()
  1043  		c.allocs[alloc.ID] = ar
  1044  		c.allocLock.Unlock()
  1045  	}
  1046  
  1047  	// All allocs restored successfully, run them!
  1048  	c.allocLock.Lock()
  1049  	for _, ar := range c.allocs {
  1050  		go ar.Run()
  1051  	}
  1052  	c.allocLock.Unlock()
  1053  	return nil
  1054  }
  1055  
  1056  func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
  1057  	c.invalidAllocsLock.Lock()
  1058  	c.invalidAllocs[alloc.ID] = struct{}{}
  1059  	c.invalidAllocsLock.Unlock()
  1060  
  1061  	// Mark alloc as failed so server can handle this
  1062  	failed := makeFailedAlloc(alloc, err)
  1063  	select {
  1064  	case c.allocUpdates <- failed:
  1065  	case <-c.shutdownCh:
  1066  	}
  1067  }
  1068  
  1069  // saveState is used to snapshot our state into the data dir.
  1070  func (c *Client) saveState() error {
  1071  	var wg sync.WaitGroup
  1072  	var l sync.Mutex
  1073  	var mErr multierror.Error
  1074  	runners := c.getAllocRunners()
  1075  	wg.Add(len(runners))
  1076  
  1077  	for id, ar := range runners {
  1078  		go func(id string, ar AllocRunner) {
  1079  			err := c.stateDB.PutAllocation(ar.Alloc())
  1080  			if err != nil {
  1081  				c.logger.Error("error saving alloc state", "error", err, "alloc_id", id)
  1082  				l.Lock()
  1083  				multierror.Append(&mErr, err)
  1084  				l.Unlock()
  1085  			}
  1086  			wg.Done()
  1087  		}(id, ar)
  1088  	}
  1089  
  1090  	wg.Wait()
  1091  	return mErr.ErrorOrNil()
  1092  }
  1093  
  1094  // getAllocRunners returns a snapshot of the current set of alloc runners.
  1095  func (c *Client) getAllocRunners() map[string]AllocRunner {
  1096  	c.allocLock.RLock()
  1097  	defer c.allocLock.RUnlock()
  1098  	runners := make(map[string]AllocRunner, len(c.allocs))
  1099  	for id, ar := range c.allocs {
  1100  		runners[id] = ar
  1101  	}
  1102  	return runners
  1103  }
  1104  
  1105  // NumAllocs returns the number of un-GC'd allocs this client has. Used to
  1106  // fulfill the AllocCounter interface for the GC.
  1107  func (c *Client) NumAllocs() int {
  1108  	n := 0
  1109  	c.allocLock.RLock()
  1110  	for _, a := range c.allocs {
  1111  		if !a.IsDestroyed() {
  1112  			n++
  1113  		}
  1114  	}
  1115  	c.allocLock.RUnlock()
  1116  	return n
  1117  }
  1118  
  1119  // nodeID restores, or generates if necessary, a unique node ID and SecretID.
  1120  // The node ID is, if available, a persistent unique ID.  The secret ID is a
  1121  // high-entropy random UUID.
  1122  func (c *Client) nodeID() (id, secret string, err error) {
  1123  	var hostID string
  1124  	hostInfo, err := host.Info()
  1125  	if !c.config.NoHostUUID && err == nil {
  1126  		if hashed, ok := helper.HashUUID(hostInfo.HostID); ok {
  1127  			hostID = hashed
  1128  		}
  1129  	}
  1130  
  1131  	if hostID == "" {
  1132  		// Generate a random hostID if no constant ID is available on
  1133  		// this platform.
  1134  		hostID = uuid.Generate()
  1135  	}
  1136  
  1137  	// Do not persist in dev mode
  1138  	if c.config.DevMode {
  1139  		return hostID, uuid.Generate(), nil
  1140  	}
  1141  
  1142  	// Attempt to read existing ID
  1143  	idPath := filepath.Join(c.config.StateDir, "client-id")
  1144  	idBuf, err := ioutil.ReadFile(idPath)
  1145  	if err != nil && !os.IsNotExist(err) {
  1146  		return "", "", err
  1147  	}
  1148  
  1149  	// Attempt to read existing secret ID
  1150  	secretPath := filepath.Join(c.config.StateDir, "secret-id")
  1151  	secretBuf, err := ioutil.ReadFile(secretPath)
  1152  	if err != nil && !os.IsNotExist(err) {
  1153  		return "", "", err
  1154  	}
  1155  
  1156  	// Use existing ID if any
  1157  	if len(idBuf) != 0 {
  1158  		id = strings.ToLower(string(idBuf))
  1159  	} else {
  1160  		id = hostID
  1161  
  1162  		// Persist the ID
  1163  		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
  1164  			return "", "", err
  1165  		}
  1166  	}
  1167  
  1168  	if len(secretBuf) != 0 {
  1169  		secret = string(secretBuf)
  1170  	} else {
  1171  		// Generate new ID
  1172  		secret = uuid.Generate()
  1173  
  1174  		// Persist the ID
  1175  		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
  1176  			return "", "", err
  1177  		}
  1178  	}
  1179  
  1180  	return id, secret, nil
  1181  }
  1182  
  1183  // setupNode is used to setup the initial node
  1184  func (c *Client) setupNode() error {
  1185  	node := c.config.Node
  1186  	if node == nil {
  1187  		node = &structs.Node{}
  1188  		c.config.Node = node
  1189  	}
  1190  	// Generate an ID and secret for the node
  1191  	id, secretID, err := c.nodeID()
  1192  	if err != nil {
  1193  		return fmt.Errorf("node ID setup failed: %v", err)
  1194  	}
  1195  
  1196  	node.ID = id
  1197  	node.SecretID = secretID
  1198  	if node.Attributes == nil {
  1199  		node.Attributes = make(map[string]string)
  1200  	}
  1201  	if node.Links == nil {
  1202  		node.Links = make(map[string]string)
  1203  	}
  1204  	if node.Drivers == nil {
  1205  		node.Drivers = make(map[string]*structs.DriverInfo)
  1206  	}
  1207  	if node.Meta == nil {
  1208  		node.Meta = make(map[string]string)
  1209  	}
  1210  	if node.NodeResources == nil {
  1211  		node.NodeResources = &structs.NodeResources{}
  1212  	}
  1213  	if node.ReservedResources == nil {
  1214  		node.ReservedResources = &structs.NodeReservedResources{}
  1215  	}
  1216  	if node.Resources == nil {
  1217  		node.Resources = &structs.Resources{}
  1218  	}
  1219  	if node.Reserved == nil {
  1220  		node.Reserved = &structs.Resources{}
  1221  	}
  1222  	if node.Datacenter == "" {
  1223  		node.Datacenter = "dc1"
  1224  	}
  1225  	if node.Name == "" {
  1226  		node.Name, _ = os.Hostname()
  1227  	}
  1228  	if node.Name == "" {
  1229  		node.Name = node.ID
  1230  	}
  1231  	node.Status = structs.NodeStatusInit
  1232  	return nil
  1233  }
  1234  
  1235  // updateNodeFromFingerprint updates the node with the result of
  1236  // fingerprinting the node from the diff that was created
  1237  func (c *Client) updateNodeFromFingerprint(response *fingerprint.FingerprintResponse) *structs.Node {
  1238  	c.configLock.Lock()
  1239  	defer c.configLock.Unlock()
  1240  
  1241  	nodeHasChanged := false
  1242  
  1243  	for name, newVal := range response.Attributes {
  1244  		oldVal := c.config.Node.Attributes[name]
  1245  		if oldVal == newVal {
  1246  			continue
  1247  		}
  1248  
  1249  		nodeHasChanged = true
  1250  		if newVal == "" {
  1251  			delete(c.config.Node.Attributes, name)
  1252  		} else {
  1253  			c.config.Node.Attributes[name] = newVal
  1254  		}
  1255  	}
  1256  
  1257  	// update node links and resources from the diff created from
  1258  	// fingerprinting
  1259  	for name, newVal := range response.Links {
  1260  		oldVal := c.config.Node.Links[name]
  1261  		if oldVal == newVal {
  1262  			continue
  1263  		}
  1264  
  1265  		nodeHasChanged = true
  1266  		if newVal == "" {
  1267  			delete(c.config.Node.Links, name)
  1268  		} else {
  1269  			c.config.Node.Links[name] = newVal
  1270  		}
  1271  	}
  1272  
  1273  	// COMPAT(0.10): Remove in 0.10
  1274  	// update the response networks with the config
  1275  	// if we still have node changes, merge them
  1276  	if response.Resources != nil {
  1277  		response.Resources.Networks = updateNetworks(
  1278  			c.config.Node.Resources.Networks,
  1279  			response.Resources.Networks,
  1280  			c.config)
  1281  		if !c.config.Node.Resources.Equals(response.Resources) {
  1282  			c.config.Node.Resources.Merge(response.Resources)
  1283  			nodeHasChanged = true
  1284  		}
  1285  	}
  1286  
  1287  	// update the response networks with the config
  1288  	// if we still have node changes, merge them
  1289  	if response.NodeResources != nil {
  1290  		response.NodeResources.Networks = updateNetworks(
  1291  			c.config.Node.NodeResources.Networks,
  1292  			response.NodeResources.Networks,
  1293  			c.config)
  1294  		if !c.config.Node.NodeResources.Equals(response.NodeResources) {
  1295  			c.config.Node.NodeResources.Merge(response.NodeResources)
  1296  			nodeHasChanged = true
  1297  		}
  1298  	}
  1299  
  1300  	if nodeHasChanged {
  1301  		c.updateNodeLocked()
  1302  	}
  1303  
  1304  	return c.configCopy.Node
  1305  }
  1306  
  1307  // updateNetworks preserves manually configured network options, but
  1308  // applies fingerprint updates
  1309  func updateNetworks(ns structs.Networks, up structs.Networks, c *config.Config) structs.Networks {
  1310  	if c.NetworkInterface == "" {
  1311  		ns = up
  1312  	} else {
  1313  		// If a network device is configured, filter up to contain details for only
  1314  		// that device
  1315  		upd := []*structs.NetworkResource{}
  1316  		for _, n := range up {
  1317  			if c.NetworkInterface == n.Device {
  1318  				upd = append(upd, n)
  1319  			}
  1320  		}
  1321  		// If updates, use them. Otherwise, ns contains the configured interfaces
  1322  		if len(upd) > 0 {
  1323  			ns = upd
  1324  		}
  1325  	}
  1326  
  1327  	// ns is set, apply the config NetworkSpeed to all
  1328  	if c.NetworkSpeed != 0 {
  1329  		for _, n := range ns {
  1330  			n.MBits = c.NetworkSpeed
  1331  		}
  1332  	}
  1333  	return ns
  1334  }
  1335  
  1336  // retryIntv calculates a retry interval value given the base
  1337  func (c *Client) retryIntv(base time.Duration) time.Duration {
  1338  	if c.config.DevMode {
  1339  		return devModeRetryIntv
  1340  	}
  1341  	return base + lib.RandomStagger(base)
  1342  }
  1343  
  1344  // registerAndHeartbeat is a long lived goroutine used to register the client
  1345  // and then start heartbeating to the server.
  1346  func (c *Client) registerAndHeartbeat() {
  1347  	// Register the node
  1348  	c.retryRegisterNode()
  1349  
  1350  	// Start watching changes for node changes
  1351  	go c.watchNodeUpdates()
  1352  
  1353  	// Start watching for emitting node events
  1354  	go c.watchNodeEvents()
  1355  
  1356  	// Setup the heartbeat timer, for the initial registration
  1357  	// we want to do this quickly. We want to do it extra quickly
  1358  	// in development mode.
  1359  	var heartbeat <-chan time.Time
  1360  	if c.config.DevMode {
  1361  		heartbeat = time.After(0)
  1362  	} else {
  1363  		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1364  	}
  1365  
  1366  	for {
  1367  		select {
  1368  		case <-c.rpcRetryWatcher():
  1369  		case <-heartbeat:
  1370  		case <-c.shutdownCh:
  1371  			return
  1372  		}
  1373  		if err := c.updateNodeStatus(); err != nil {
  1374  			// The servers have changed such that this node has not been
  1375  			// registered before
  1376  			if strings.Contains(err.Error(), "node not found") {
  1377  				// Re-register the node
  1378  				c.logger.Info("re-registering node")
  1379  				c.retryRegisterNode()
  1380  				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1381  			} else {
  1382  				intv := c.getHeartbeatRetryIntv(err)
  1383  				c.logger.Error("error heartbeating. retrying", "error", err, "period", intv)
  1384  				heartbeat = time.After(intv)
  1385  
  1386  				// If heartbeating fails, trigger Consul discovery
  1387  				c.triggerDiscovery()
  1388  			}
  1389  		} else {
  1390  			c.heartbeatLock.Lock()
  1391  			heartbeat = time.After(c.heartbeatTTL)
  1392  			c.heartbeatLock.Unlock()
  1393  		}
  1394  	}
  1395  }
  1396  
  1397  // getHeartbeatRetryIntv is used to retrieve the time to wait before attempting
  1398  // another heartbeat.
  1399  func (c *Client) getHeartbeatRetryIntv(err error) time.Duration {
  1400  	if c.config.DevMode {
  1401  		return devModeRetryIntv
  1402  	}
  1403  
  1404  	// Collect the useful heartbeat info
  1405  	c.heartbeatLock.Lock()
  1406  	haveHeartbeated := c.haveHeartbeated
  1407  	last := c.lastHeartbeat
  1408  	ttl := c.heartbeatTTL
  1409  	c.heartbeatLock.Unlock()
  1410  
  1411  	// If we haven't even successfully heartbeated once or there is no leader
  1412  	// treat it as a registration. In the case that there is a leadership loss,
  1413  	// we will have our heartbeat timer reset to a much larger threshold, so
  1414  	// do not put unnecessary pressure on the new leader.
  1415  	if !haveHeartbeated || err == structs.ErrNoLeader {
  1416  		return c.retryIntv(registerRetryIntv)
  1417  	}
  1418  
  1419  	// Determine how much time we have left to heartbeat
  1420  	left := last.Add(ttl).Sub(time.Now())
  1421  
  1422  	// Logic for retrying is:
  1423  	// * Do not retry faster than once a second
  1424  	// * Do not retry less that once every 30 seconds
  1425  	// * If we have missed the heartbeat by more than 30 seconds, start to use
  1426  	// the absolute time since we do not want to retry indefinitely
  1427  	switch {
  1428  	case left < -30*time.Second:
  1429  		// Make left the absolute value so we delay and jitter properly.
  1430  		left *= -1
  1431  	case left < 0:
  1432  		return time.Second + lib.RandomStagger(time.Second)
  1433  	default:
  1434  	}
  1435  
  1436  	stagger := lib.RandomStagger(left)
  1437  	switch {
  1438  	case stagger < time.Second:
  1439  		return time.Second + lib.RandomStagger(time.Second)
  1440  	case stagger > 30*time.Second:
  1441  		return 25*time.Second + lib.RandomStagger(5*time.Second)
  1442  	default:
  1443  		return stagger
  1444  	}
  1445  }
  1446  
  1447  // periodicSnapshot is a long lived goroutine used to periodically snapshot the
  1448  // state of the client
  1449  func (c *Client) periodicSnapshot() {
  1450  	// Create a snapshot timer
  1451  	snapshot := time.After(stateSnapshotIntv)
  1452  
  1453  	for {
  1454  		select {
  1455  		case <-snapshot:
  1456  			snapshot = time.After(stateSnapshotIntv)
  1457  			if err := c.saveState(); err != nil {
  1458  				c.logger.Error("error saving state", "error", err)
  1459  			}
  1460  
  1461  		case <-c.shutdownCh:
  1462  			return
  1463  		}
  1464  	}
  1465  }
  1466  
  1467  // run is a long lived goroutine used to run the client. Shutdown() stops it first
  1468  func (c *Client) run() {
  1469  	// Watch for changes in allocations
  1470  	allocUpdates := make(chan *allocUpdates, 8)
  1471  	go c.watchAllocations(allocUpdates)
  1472  
  1473  	for {
  1474  		select {
  1475  		case update := <-allocUpdates:
  1476  			// Don't apply updates while shutting down.
  1477  			c.shutdownLock.Lock()
  1478  			if c.shutdown {
  1479  				c.shutdownLock.Unlock()
  1480  				return
  1481  			}
  1482  
  1483  			// Apply updates inside lock to prevent a concurrent
  1484  			// shutdown.
  1485  			c.runAllocs(update)
  1486  			c.shutdownLock.Unlock()
  1487  
  1488  		case <-c.shutdownCh:
  1489  			return
  1490  		}
  1491  	}
  1492  }
  1493  
  1494  // submitNodeEvents is used to submit a client-side node event. Examples of
  1495  // these kinds of events include when a driver moves from healthy to unhealthy
  1496  // (and vice versa)
  1497  func (c *Client) submitNodeEvents(events []*structs.NodeEvent) error {
  1498  	nodeID := c.NodeID()
  1499  	nodeEvents := map[string][]*structs.NodeEvent{
  1500  		nodeID: events,
  1501  	}
  1502  	req := structs.EmitNodeEventsRequest{
  1503  		NodeEvents:   nodeEvents,
  1504  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1505  	}
  1506  	var resp structs.EmitNodeEventsResponse
  1507  	if err := c.RPC("Node.EmitEvents", &req, &resp); err != nil {
  1508  		return fmt.Errorf("Emitting node events failed: %v", err)
  1509  	}
  1510  	return nil
  1511  }
  1512  
  1513  // watchNodeEvents is a handler which receives node events and on a interval
  1514  // and submits them in batch format to the server
  1515  func (c *Client) watchNodeEvents() {
  1516  	// batchEvents stores events that have yet to be published
  1517  	var batchEvents []*structs.NodeEvent
  1518  
  1519  	timer := stoppedTimer()
  1520  	defer timer.Stop()
  1521  
  1522  	for {
  1523  		select {
  1524  		case event := <-c.triggerEmitNodeEvent:
  1525  			if l := len(batchEvents); l <= structs.MaxRetainedNodeEvents {
  1526  				batchEvents = append(batchEvents, event)
  1527  			} else {
  1528  				// Drop the oldest event
  1529  				c.logger.Warn("dropping node event", "node_event", batchEvents[0])
  1530  				batchEvents = append(batchEvents[1:], event)
  1531  			}
  1532  			timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1533  		case <-timer.C:
  1534  			if err := c.submitNodeEvents(batchEvents); err != nil {
  1535  				c.logger.Error("error submitting node events", "error", err)
  1536  				timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1537  			} else {
  1538  				// Reset the events since we successfully sent them.
  1539  				batchEvents = []*structs.NodeEvent{}
  1540  			}
  1541  		case <-c.shutdownCh:
  1542  			return
  1543  		}
  1544  	}
  1545  }
  1546  
// triggerNodeEvent offers nodeEvent to the node event channel without
// blocking. If the send cannot proceed immediately the event is dropped.
func (c *Client) triggerNodeEvent(nodeEvent *structs.NodeEvent) {
	select {
	case c.triggerEmitNodeEvent <- nodeEvent:
		// the event was handed over to the channel
	default:
		// the send would have blocked; the event is not delivered
	}
}
  1556  
  1557  // retryRegisterNode is used to register the node or update the registration and
  1558  // retry in case of failure.
  1559  func (c *Client) retryRegisterNode() {
  1560  	for {
  1561  		err := c.registerNode()
  1562  		if err == nil {
  1563  			// Registered!
  1564  			return
  1565  		}
  1566  
  1567  		if err == noServersErr {
  1568  			c.logger.Debug("registration waiting on servers")
  1569  			c.triggerDiscovery()
  1570  		} else {
  1571  			c.logger.Error("error registering", "error", err)
  1572  		}
  1573  		select {
  1574  		case <-c.rpcRetryWatcher():
  1575  		case <-time.After(c.retryIntv(registerRetryIntv)):
  1576  		case <-c.shutdownCh:
  1577  			return
  1578  		}
  1579  	}
  1580  }
  1581  
  1582  // registerNode is used to register the node or update the registration
  1583  func (c *Client) registerNode() error {
  1584  	node := c.Node()
  1585  	req := structs.NodeRegisterRequest{
  1586  		Node:         node,
  1587  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1588  	}
  1589  	var resp structs.NodeUpdateResponse
  1590  	if err := c.RPC("Node.Register", &req, &resp); err != nil {
  1591  		return err
  1592  	}
  1593  
  1594  	// Update the node status to ready after we register.
  1595  	c.configLock.Lock()
  1596  	node.Status = structs.NodeStatusReady
  1597  	c.config.Node.Status = structs.NodeStatusReady
  1598  	c.configLock.Unlock()
  1599  
  1600  	c.logger.Info("node registration complete")
  1601  	if len(resp.EvalIDs) != 0 {
  1602  		c.logger.Debug("evaluations triggered by node registration", "num_evals", len(resp.EvalIDs))
  1603  	}
  1604  
  1605  	c.heartbeatLock.Lock()
  1606  	defer c.heartbeatLock.Unlock()
  1607  	c.lastHeartbeat = time.Now()
  1608  	c.heartbeatTTL = resp.HeartbeatTTL
  1609  	return nil
  1610  }
  1611  
  1612  // updateNodeStatus is used to heartbeat and update the status of the node
  1613  func (c *Client) updateNodeStatus() error {
  1614  	start := time.Now()
  1615  	req := structs.NodeUpdateStatusRequest{
  1616  		NodeID:       c.NodeID(),
  1617  		Status:       structs.NodeStatusReady,
  1618  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1619  	}
  1620  	var resp structs.NodeUpdateResponse
  1621  	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
  1622  		c.triggerDiscovery()
  1623  		return fmt.Errorf("failed to update status: %v", err)
  1624  	}
  1625  	end := time.Now()
  1626  
  1627  	if len(resp.EvalIDs) != 0 {
  1628  		c.logger.Debug("evaluations triggered by node update", "num_evals", len(resp.EvalIDs))
  1629  	}
  1630  
  1631  	// Update the last heartbeat and the new TTL, capturing the old values
  1632  	c.heartbeatLock.Lock()
  1633  	last := c.lastHeartbeat
  1634  	oldTTL := c.heartbeatTTL
  1635  	haveHeartbeated := c.haveHeartbeated
  1636  	c.lastHeartbeat = time.Now()
  1637  	c.heartbeatTTL = resp.HeartbeatTTL
  1638  	c.haveHeartbeated = true
  1639  	c.heartbeatLock.Unlock()
  1640  	c.logger.Trace("next heartbeat", "period", resp.HeartbeatTTL)
  1641  
  1642  	if resp.Index != 0 {
  1643  		c.logger.Debug("state updated", "node_status", req.Status)
  1644  
  1645  		// We have potentially missed our TTL log how delayed we were
  1646  		if haveHeartbeated {
  1647  			c.logger.Warn("missed heartbeat",
  1648  				"req_latency", end.Sub(start), "heartbeat_ttl", oldTTL, "since_last_heartbeat", time.Since(last))
  1649  		}
  1650  	}
  1651  
  1652  	// Update the number of nodes in the cluster so we can adjust our server
  1653  	// rebalance rate.
  1654  	c.servers.SetNumNodes(resp.NumNodes)
  1655  
  1656  	// Convert []*NodeServerInfo to []*servers.Server
  1657  	nomadServers := make([]*servers.Server, 0, len(resp.Servers))
  1658  	for _, s := range resp.Servers {
  1659  		addr, err := resolveServer(s.RPCAdvertiseAddr)
  1660  		if err != nil {
  1661  			c.logger.Warn("ignoring invalid server", "error", err, "server", s.RPCAdvertiseAddr)
  1662  			continue
  1663  		}
  1664  		e := &servers.Server{Addr: addr}
  1665  		nomadServers = append(nomadServers, e)
  1666  	}
  1667  	if len(nomadServers) == 0 {
  1668  		return fmt.Errorf("heartbeat response returned no valid servers")
  1669  	}
  1670  	c.servers.SetServers(nomadServers)
  1671  
  1672  	// Begin polling Consul if there is no Nomad leader.  We could be
  1673  	// heartbeating to a Nomad server that is in the minority of a
  1674  	// partition of the Nomad server quorum, but this Nomad Agent still
  1675  	// has connectivity to the existing majority of Nomad Servers, but
  1676  	// only if it queries Consul.
  1677  	if resp.LeaderRPCAddr == "" {
  1678  		c.triggerDiscovery()
  1679  	}
  1680  
  1681  	return nil
  1682  }
  1683  
  1684  // AllocStateUpdated asynchronously updates the server with the current state
  1685  // of an allocations and its tasks.
  1686  func (c *Client) AllocStateUpdated(alloc *structs.Allocation) {
  1687  	if alloc.Terminated() {
  1688  		// Terminated, mark for GC if we're still tracking this alloc
  1689  		// runner. If it's not being tracked that means the server has
  1690  		// already GC'd it (see removeAlloc).
  1691  		ar, err := c.getAllocRunner(alloc.ID)
  1692  
  1693  		if err == nil {
  1694  			c.garbageCollector.MarkForCollection(alloc.ID, ar)
  1695  
  1696  			// Trigger a GC in case we're over thresholds and just
  1697  			// waiting for eligible allocs.
  1698  			c.garbageCollector.Trigger()
  1699  		}
  1700  	}
  1701  
  1702  	// Strip all the information that can be reconstructed at the server.  Only
  1703  	// send the fields that are updatable by the client.
  1704  	stripped := new(structs.Allocation)
  1705  	stripped.ID = alloc.ID
  1706  	stripped.NodeID = c.NodeID()
  1707  	stripped.TaskStates = alloc.TaskStates
  1708  	stripped.ClientStatus = alloc.ClientStatus
  1709  	stripped.ClientDescription = alloc.ClientDescription
  1710  	stripped.DeploymentStatus = alloc.DeploymentStatus
  1711  
  1712  	select {
  1713  	case c.allocUpdates <- stripped:
  1714  	case <-c.shutdownCh:
  1715  	}
  1716  }
  1717  
  1718  // allocSync is a long lived function that batches allocation updates to the
  1719  // server.
  1720  func (c *Client) allocSync() {
  1721  	staggered := false
  1722  	syncTicker := time.NewTicker(allocSyncIntv)
  1723  	updates := make(map[string]*structs.Allocation)
  1724  	for {
  1725  		select {
  1726  		case <-c.shutdownCh:
  1727  			syncTicker.Stop()
  1728  			return
  1729  		case alloc := <-c.allocUpdates:
  1730  			// Batch the allocation updates until the timer triggers.
  1731  			updates[alloc.ID] = alloc
  1732  		case <-syncTicker.C:
  1733  			// Fast path if there are no updates
  1734  			if len(updates) == 0 {
  1735  				continue
  1736  			}
  1737  
  1738  			sync := make([]*structs.Allocation, 0, len(updates))
  1739  			for _, alloc := range updates {
  1740  				sync = append(sync, alloc)
  1741  			}
  1742  
  1743  			// Send to server.
  1744  			args := structs.AllocUpdateRequest{
  1745  				Alloc:        sync,
  1746  				WriteRequest: structs.WriteRequest{Region: c.Region()},
  1747  			}
  1748  
  1749  			var resp structs.GenericResponse
  1750  			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
  1751  				c.logger.Error("error updating allocations", "error", err)
  1752  				syncTicker.Stop()
  1753  				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
  1754  				staggered = true
  1755  			} else {
  1756  				updates = make(map[string]*structs.Allocation)
  1757  				if staggered {
  1758  					syncTicker.Stop()
  1759  					syncTicker = time.NewTicker(allocSyncIntv)
  1760  					staggered = false
  1761  				}
  1762  			}
  1763  		}
  1764  	}
  1765  }
  1766  
// allocUpdates holds the results of receiving updated allocations from the
// servers. One value is produced per successful pass of watchAllocations and
// consumed by runAllocs.
type allocUpdates struct {
	// pulled is the set of allocations that were downloaded from the servers,
	// keyed by allocation ID.
	pulled map[string]*structs.Allocation

	// filtered is the set of allocation IDs that were not pulled, either
	// because their AllocModifyIndex didn't change or because the alloc is
	// tracked as invalid by the client.
	filtered map[string]struct{}

	// migrateTokens are the tokens necessary for when clients pull data
	// from authorized volumes. They are looked up by allocation ID.
	migrateTokens map[string]string
}
  1781  
// watchAllocations is used to scan for updates to allocations. It loops until
// shutdown: each pass asks the servers for the map of alloc ID to
// AllocModifyIndex for this node, pulls in full only the allocations that are
// new or newer than what the local alloc runner holds, and sends the result
// on the updates channel.
func (c *Client) watchAllocations(updates chan *allocUpdates) {
	// The request and response for getting the map of allocations that should
	// be running on the Node to their AllocModifyIndex which is incremented
	// when the allocation is updated by the servers.
	req := structs.NodeSpecificRequest{
		NodeID:   c.NodeID(),
		SecretID: c.secretNodeID(),
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}
	var resp structs.NodeClientAllocsResponse

	// The request and response for pulling down the set of allocations that are
	// new, or updated server side.
	allocsReq := structs.AllocsGetRequest{
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}
	var allocsResp structs.AllocsGetResponse

OUTER:
	for {
		// Get the allocation modify index map, blocking for updates. We will
		// use this to determine exactly what allocations need to be downloaded
		// in full. The response is reset first so nothing from the previous
		// pass leaks into this one.
		resp = structs.NodeClientAllocsResponse{}
		err := c.RPC("Node.GetClientAllocs", &req, &resp)
		if err != nil {
			// Shutdown often causes EOF errors, so check for shutdown first
			select {
			case <-c.shutdownCh:
				return
			default:
			}

			// COMPAT: Remove in 0.6. This is to allow the case in which the
			// servers are not fully upgraded before the clients register. This
			// can cause the SecretID to be lost
			if strings.Contains(err.Error(), "node secret ID does not match") {
				c.logger.Debug("secret mismatch; re-registering node", "error", err)
				c.retryRegisterNode()
			} else if err != noServersErr {
				// noServersErr is deliberately not logged here.
				c.logger.Error("error querying node allocations", "error", err)
			}

			// Wait for the retry interval, an RPC retry notification, or
			// shutdown. select is not a loop, so each continue below starts
			// the next pass of the enclosing for loop.
			retry := c.retryIntv(getAllocRetryIntv)
			select {
			case <-c.rpcRetryWatcher():
				continue
			case <-time.After(retry):
				continue
			case <-c.shutdownCh:
				return
			}
		}

		// Check for shutdown
		select {
		case <-c.shutdownCh:
			return
		default:
		}

		// Filter all allocations whose AllocModifyIndex was not incremented.
		// These are the allocations who have either not been updated, or whose
		// updates are a result of the client sending an update for the alloc.
		// This lets us reduce the network traffic to the server as we don't
		// need to pull all the allocations.
		var pull []string
		filtered := make(map[string]struct{})
		// pullIndex tracks the highest modify index among the allocs in pull.
		var pullIndex uint64
		for allocID, modifyIndex := range resp.Allocs {
			// Pull the allocation if we don't have an alloc runner for the
			// allocation or if the alloc runner requires an updated allocation.
			//XXX Part of Client alloc index tracking exp
			c.allocLock.RLock()
			currentAR, ok := c.allocs[allocID]
			c.allocLock.RUnlock()

			// Ignore alloc updates for allocs that are invalid because of initialization errors
			c.invalidAllocsLock.Lock()
			_, isInvalid := c.invalidAllocs[allocID]
			c.invalidAllocsLock.Unlock()

			// currentAR is only dereferenced when ok is true because of the
			// short-circuiting ||.
			if (!ok || modifyIndex > currentAR.Alloc().AllocModifyIndex) && !isInvalid {
				// Only pull allocs that are required. Filtered
				// allocs might be at a higher index, so ignore
				// it.
				if modifyIndex > pullIndex {
					pullIndex = modifyIndex
				}
				pull = append(pull, allocID)
			} else {
				filtered[allocID] = struct{}{}
			}
		}

		// Pull the allocations that passed filtering. The previous pass's
		// allocations are cleared so the "pulled" count logged below is zero
		// when nothing is pulled.
		allocsResp.Allocs = nil
		var pulledAllocs map[string]*structs.Allocation
		if len(pull) != 0 {
			// Pull the allocations that need to be updated.
			allocsReq.AllocIDs = pull
			// NOTE(review): presumably the -1 lets the query return as soon
			// as the contacted server has reached pullIndex — confirm against
			// the blocking query semantics of Alloc.GetAllocs.
			allocsReq.MinQueryIndex = pullIndex - 1
			allocsResp = structs.AllocsGetResponse{}
			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
				c.logger.Error("error querying updated allocations", "error", err)
				// Same wait-then-retry as above; req.MinQueryIndex has not
				// been advanced yet, so the next pass repeats this index.
				retry := c.retryIntv(getAllocRetryIntv)
				select {
				case <-c.rpcRetryWatcher():
					continue
				case <-time.After(retry):
					continue
				case <-c.shutdownCh:
					return
				}
			}

			// Ensure that we received all the allocations we wanted
			pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs))
			for _, alloc := range allocsResp.Allocs {
				pulledAllocs[alloc.ID] = alloc
			}

			for _, desiredID := range pull {
				if _, ok := pulledAllocs[desiredID]; !ok {
					// We didn't get everything we wanted. Do not update the
					// MinQueryIndex, sleep and then retry.
					wait := c.retryIntv(2 * time.Second)
					select {
					case <-time.After(wait):
						// Wait for the server we contact to receive the
						// allocations. The label is required here because a
						// bare continue would only advance this inner loop.
						continue OUTER
					case <-c.shutdownCh:
						return
					}
				}
			}

			// Check for shutdown
			select {
			case <-c.shutdownCh:
				return
			default:
			}
		}

		c.logger.Debug("updated allocations", "index", resp.Index,
			"total", len(resp.Allocs), "pulled", len(allocsResp.Allocs), "filtered", len(filtered))

		// Update the query index. It only ever moves forward.
		if resp.Index > req.MinQueryIndex {
			req.MinQueryIndex = resp.Index
		}

		// Push the updates. pulled is a nil map when nothing was pulled.
		update := &allocUpdates{
			filtered:      filtered,
			pulled:        pulledAllocs,
			migrateTokens: resp.MigrateTokens,
		}

		// Block until the consumer takes the update or the client shuts down.
		select {
		case updates <- update:
		case <-c.shutdownCh:
			return
		}
	}
}
  1956  
  1957  // updateNode updates the Node copy and triggers the client to send the updated
  1958  // Node to the server. This should be done while the caller holds the
  1959  // configLock lock.
  1960  func (c *Client) updateNodeLocked() {
  1961  	// Update the config copy.
  1962  	node := c.config.Node.Copy()
  1963  	c.configCopy.Node = node
  1964  
  1965  	select {
  1966  	case c.triggerNodeUpdate <- struct{}{}:
  1967  		// Node update goroutine was released to execute
  1968  	default:
  1969  		// Node update goroutine was already running
  1970  	}
  1971  }
  1972  
  1973  // watchNodeUpdates blocks until it is edge triggered. Once triggered,
  1974  // it will update the client node copy and re-register the node.
  1975  func (c *Client) watchNodeUpdates() {
  1976  	var hasChanged bool
  1977  
  1978  	timer := stoppedTimer()
  1979  	defer timer.Stop()
  1980  
  1981  	for {
  1982  		select {
  1983  		case <-timer.C:
  1984  			c.logger.Debug("state changed, updating node and re-registering")
  1985  			c.retryRegisterNode()
  1986  			hasChanged = false
  1987  		case <-c.triggerNodeUpdate:
  1988  			if hasChanged {
  1989  				continue
  1990  			}
  1991  			hasChanged = true
  1992  			timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1993  		case <-c.shutdownCh:
  1994  			return
  1995  		}
  1996  	}
  1997  }
  1998  
  1999  // runAllocs is invoked when we get an updated set of allocations
  2000  func (c *Client) runAllocs(update *allocUpdates) {
  2001  	// Get the existing allocs
  2002  	c.allocLock.RLock()
  2003  	existing := make(map[string]uint64, len(c.allocs))
  2004  	for id, ar := range c.allocs {
  2005  		existing[id] = ar.Alloc().AllocModifyIndex
  2006  	}
  2007  	c.allocLock.RUnlock()
  2008  
  2009  	// Diff the existing and updated allocations
  2010  	diff := diffAllocs(existing, update)
  2011  	c.logger.Debug("allocation updates", "added", len(diff.added), "removed", len(diff.removed),
  2012  		"updated", len(diff.updated), "ignored", len(diff.ignore))
  2013  
  2014  	errs := 0
  2015  
  2016  	// Remove the old allocations
  2017  	for _, remove := range diff.removed {
  2018  		c.removeAlloc(remove)
  2019  	}
  2020  
  2021  	// Update the existing allocations
  2022  	for _, update := range diff.updated {
  2023  		c.logger.Trace("updating alloc", "alloc_id", update.ID, "index", update.AllocModifyIndex)
  2024  		c.updateAlloc(update)
  2025  	}
  2026  
  2027  	// Make room for new allocations before running
  2028  	if err := c.garbageCollector.MakeRoomFor(diff.added); err != nil {
  2029  		c.logger.Error("error making room for new allocations", "error", err)
  2030  		errs++
  2031  	}
  2032  
  2033  	// Start the new allocations
  2034  	for _, add := range diff.added {
  2035  		migrateToken := update.migrateTokens[add.ID]
  2036  		if err := c.addAlloc(add, migrateToken); err != nil {
  2037  			c.logger.Error("error adding alloc", "error", err, "alloc_id", add.ID)
  2038  			errs++
  2039  			// We mark the alloc as failed and send an update to the server
  2040  			// We track the fact that creating an allocrunner failed so that we don't send updates again
  2041  			if add.ClientStatus != structs.AllocClientStatusFailed {
  2042  				c.handleInvalidAllocs(add, err)
  2043  			}
  2044  		}
  2045  	}
  2046  
  2047  	// Mark servers as having been contacted so blocked tasks that failed
  2048  	// to restore can now restart.
  2049  	c.serversContactedOnce.Do(func() {
  2050  		close(c.serversContactedCh)
  2051  	})
  2052  
  2053  	// Trigger the GC once more now that new allocs are started that could
  2054  	// have caused thresholds to be exceeded
  2055  	c.garbageCollector.Trigger()
  2056  	c.logger.Debug("allocation updates applied", "added", len(diff.added), "removed", len(diff.removed),
  2057  		"updated", len(diff.updated), "ignored", len(diff.ignore), "errors", errs)
  2058  }
  2059  
  2060  // makeFailedAlloc creates a stripped down version of the allocation passed in
  2061  // with its status set to failed and other fields needed for the server to be
  2062  // able to examine deployment and task states
  2063  func makeFailedAlloc(add *structs.Allocation, err error) *structs.Allocation {
  2064  	stripped := new(structs.Allocation)
  2065  	stripped.ID = add.ID
  2066  	stripped.NodeID = add.NodeID
  2067  	stripped.ClientStatus = structs.AllocClientStatusFailed
  2068  	stripped.ClientDescription = fmt.Sprintf("Unable to add allocation due to error: %v", err)
  2069  
  2070  	// Copy task states if it exists in the original allocation
  2071  	if add.TaskStates != nil {
  2072  		stripped.TaskStates = add.TaskStates
  2073  	} else {
  2074  		stripped.TaskStates = make(map[string]*structs.TaskState)
  2075  	}
  2076  
  2077  	failTime := time.Now()
  2078  	if add.DeploymentStatus.HasHealth() {
  2079  		// Never change deployment health once it has been set
  2080  		stripped.DeploymentStatus = add.DeploymentStatus.Copy()
  2081  	} else {
  2082  		stripped.DeploymentStatus = &structs.AllocDeploymentStatus{
  2083  			Healthy:   helper.BoolToPtr(false),
  2084  			Timestamp: failTime,
  2085  		}
  2086  	}
  2087  
  2088  	taskGroup := add.Job.LookupTaskGroup(add.TaskGroup)
  2089  	if taskGroup == nil {
  2090  		return stripped
  2091  	}
  2092  	for _, task := range taskGroup.Tasks {
  2093  		ts, ok := stripped.TaskStates[task.Name]
  2094  		if !ok {
  2095  			ts = &structs.TaskState{}
  2096  			stripped.TaskStates[task.Name] = ts
  2097  		}
  2098  		if ts.FinishedAt.IsZero() {
  2099  			ts.FinishedAt = failTime
  2100  		}
  2101  	}
  2102  	return stripped
  2103  }
  2104  
  2105  // removeAlloc is invoked when we should remove an allocation because it has
  2106  // been removed by the server.
  2107  func (c *Client) removeAlloc(allocID string) {
  2108  	c.allocLock.Lock()
  2109  	defer c.allocLock.Unlock()
  2110  
  2111  	ar, ok := c.allocs[allocID]
  2112  	if !ok {
  2113  		c.invalidAllocsLock.Lock()
  2114  		if _, ok := c.invalidAllocs[allocID]; ok {
  2115  			// Removing from invalid allocs map if present
  2116  			delete(c.invalidAllocs, allocID)
  2117  		} else {
  2118  			// Alloc is unknown, log a warning.
  2119  			c.logger.Warn("cannot remove nonexistent alloc", "alloc_id", allocID, "error", "alloc not found")
  2120  		}
  2121  		c.invalidAllocsLock.Unlock()
  2122  		return
  2123  	}
  2124  
  2125  	// Stop tracking alloc runner as it's been GC'd by the server
  2126  	delete(c.allocs, allocID)
  2127  
  2128  	// Ensure the GC has a reference and then collect. Collecting through the GC
  2129  	// applies rate limiting
  2130  	c.garbageCollector.MarkForCollection(allocID, ar)
  2131  
  2132  	// GC immediately since the server has GC'd it
  2133  	go c.garbageCollector.Collect(allocID)
  2134  }
  2135  
  2136  // updateAlloc is invoked when we should update an allocation
  2137  func (c *Client) updateAlloc(update *structs.Allocation) {
  2138  	ar, err := c.getAllocRunner(update.ID)
  2139  	if err != nil {
  2140  		c.logger.Warn("cannot update nonexistent alloc", "alloc_id", update.ID)
  2141  		return
  2142  	}
  2143  
  2144  	// Update local copy of alloc
  2145  	if err := c.stateDB.PutAllocation(update); err != nil {
  2146  		c.logger.Error("error persisting updated alloc locally", "error", err, "alloc_id", update.ID)
  2147  	}
  2148  
  2149  	// Update alloc runner
  2150  	ar.Update(update)
  2151  }
  2152  
  2153  // addAlloc is invoked when we should add an allocation
  2154  func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error {
  2155  	c.allocLock.Lock()
  2156  	defer c.allocLock.Unlock()
  2157  
  2158  	// Check if we already have an alloc runner
  2159  	if _, ok := c.allocs[alloc.ID]; ok {
  2160  		c.logger.Debug("dropping duplicate add allocation request", "alloc_id", alloc.ID)
  2161  		return nil
  2162  	}
  2163  
  2164  	// Initialize local copy of alloc before creating the alloc runner so
  2165  	// we can't end up with an alloc runner that does not have an alloc.
  2166  	if err := c.stateDB.PutAllocation(alloc); err != nil {
  2167  		return err
  2168  	}
  2169  
  2170  	// Collect any preempted allocations to pass into the previous alloc watcher
  2171  	var preemptedAllocs map[string]allocwatcher.AllocRunnerMeta
  2172  	if len(alloc.PreemptedAllocations) > 0 {
  2173  		preemptedAllocs = make(map[string]allocwatcher.AllocRunnerMeta)
  2174  		for _, palloc := range alloc.PreemptedAllocations {
  2175  			preemptedAllocs[palloc] = c.allocs[palloc]
  2176  		}
  2177  	}
  2178  
  2179  	// Since only the Client has access to other AllocRunners and the RPC
  2180  	// client, create the previous allocation watcher here.
  2181  	watcherConfig := allocwatcher.Config{
  2182  		Alloc:            alloc,
  2183  		PreviousRunner:   c.allocs[alloc.PreviousAllocation],
  2184  		PreemptedRunners: preemptedAllocs,
  2185  		RPC:              c,
  2186  		Config:           c.configCopy,
  2187  		MigrateToken:     migrateToken,
  2188  		Logger:           c.logger,
  2189  	}
  2190  	prevAllocWatcher, prevAllocMigrator := allocwatcher.NewAllocWatcher(watcherConfig)
  2191  
  2192  	// Copy the config since the node can be swapped out as it is being updated.
  2193  	// The long term fix is to pass in the config and node separately and then
  2194  	// we don't have to do a copy.
  2195  	c.configLock.RLock()
  2196  	arConf := &allocrunner.Config{
  2197  		Alloc:               alloc,
  2198  		Logger:              c.logger,
  2199  		ClientConfig:        c.configCopy,
  2200  		StateDB:             c.stateDB,
  2201  		Consul:              c.consulService,
  2202  		Vault:               c.vaultClient,
  2203  		StateUpdater:        c,
  2204  		DeviceStatsReporter: c,
  2205  		PrevAllocWatcher:    prevAllocWatcher,
  2206  		PrevAllocMigrator:   prevAllocMigrator,
  2207  		DeviceManager:       c.devicemanager,
  2208  		DriverManager:       c.drivermanager,
  2209  	}
  2210  	c.configLock.RUnlock()
  2211  
  2212  	ar, err := allocrunner.NewAllocRunner(arConf)
  2213  	if err != nil {
  2214  		return err
  2215  	}
  2216  
  2217  	// Store the alloc runner.
  2218  	c.allocs[alloc.ID] = ar
  2219  
  2220  	go ar.Run()
  2221  	return nil
  2222  }
  2223  
  2224  // setupVaultClient creates an object to periodically renew tokens and secrets
  2225  // with vault.
  2226  func (c *Client) setupVaultClient() error {
  2227  	var err error
  2228  	c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
  2229  	if err != nil {
  2230  		return err
  2231  	}
  2232  
  2233  	if c.vaultClient == nil {
  2234  		c.logger.Error("failed to create vault client")
  2235  		return fmt.Errorf("failed to create vault client")
  2236  	}
  2237  
  2238  	// Start renewing tokens and secrets
  2239  	c.vaultClient.Start()
  2240  
  2241  	return nil
  2242  }
  2243  
  2244  // deriveToken takes in an allocation and a set of tasks and derives vault
  2245  // tokens for each of the tasks, unwraps all of them using the supplied vault
  2246  // client and returns a map of unwrapped tokens, indexed by the task name.
  2247  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  2248  	vlogger := c.logger.Named("vault")
  2249  	if alloc == nil {
  2250  		return nil, fmt.Errorf("nil allocation")
  2251  	}
  2252  
  2253  	if taskNames == nil || len(taskNames) == 0 {
  2254  		return nil, fmt.Errorf("missing task names")
  2255  	}
  2256  
  2257  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  2258  	if group == nil {
  2259  		return nil, fmt.Errorf("group name in allocation is not present in job")
  2260  	}
  2261  
  2262  	verifiedTasks := []string{}
  2263  	// Check if the given task names actually exist in the allocation
  2264  	for _, taskName := range taskNames {
  2265  		found := false
  2266  		for _, task := range group.Tasks {
  2267  			if task.Name == taskName {
  2268  				found = true
  2269  			}
  2270  		}
  2271  		if !found {
  2272  			vlogger.Error("task not found in the allocation", "task_name", taskName)
  2273  			return nil, fmt.Errorf("task %q not found in the allocation", taskName)
  2274  		}
  2275  		verifiedTasks = append(verifiedTasks, taskName)
  2276  	}
  2277  
  2278  	// DeriveVaultToken of nomad server can take in a set of tasks and
  2279  	// creates tokens for all the tasks.
  2280  	req := &structs.DeriveVaultTokenRequest{
  2281  		NodeID:   c.NodeID(),
  2282  		SecretID: c.secretNodeID(),
  2283  		AllocID:  alloc.ID,
  2284  		Tasks:    verifiedTasks,
  2285  		QueryOptions: structs.QueryOptions{
  2286  			Region:     c.Region(),
  2287  			AllowStale: false,
  2288  		},
  2289  	}
  2290  
  2291  	// Derive the tokens
  2292  	var resp structs.DeriveVaultTokenResponse
  2293  	if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil {
  2294  		vlogger.Error("error making derive token RPC", "error", err)
  2295  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  2296  	}
  2297  	if resp.Error != nil {
  2298  		vlogger.Error("error deriving vault tokens", "error", resp.Error)
  2299  		return nil, structs.NewWrappedServerError(resp.Error)
  2300  	}
  2301  	if resp.Tasks == nil {
  2302  		vlogger.Error("error derivng vault token", "error", "invalid response")
  2303  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  2304  	}
  2305  
  2306  	unwrappedTokens := make(map[string]string)
  2307  
  2308  	// Retrieve the wrapped tokens from the response and unwrap it
  2309  	for _, taskName := range verifiedTasks {
  2310  		// Get the wrapped token
  2311  		wrappedToken, ok := resp.Tasks[taskName]
  2312  		if !ok {
  2313  			vlogger.Error("wrapped token missing for task", "task_name", taskName)
  2314  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  2315  		}
  2316  
  2317  		// Unwrap the vault token
  2318  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  2319  		if err != nil {
  2320  			if structs.VaultUnrecoverableError.MatchString(err.Error()) {
  2321  				return nil, err
  2322  			}
  2323  
  2324  			// The error is recoverable
  2325  			return nil, structs.NewRecoverableError(
  2326  				fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err), true)
  2327  		}
  2328  
  2329  		// Validate the response
  2330  		var validationErr error
  2331  		if unwrapResp == nil {
  2332  			validationErr = fmt.Errorf("Vault returned nil secret when unwrapping")
  2333  		} else if unwrapResp.Auth == nil {
  2334  			validationErr = fmt.Errorf("Vault returned unwrap secret with nil Auth. Secret warnings: %v", unwrapResp.Warnings)
  2335  		} else if unwrapResp.Auth.ClientToken == "" {
  2336  			validationErr = fmt.Errorf("Vault returned unwrap secret with empty Auth.ClientToken. Secret warnings: %v", unwrapResp.Warnings)
  2337  		}
  2338  		if validationErr != nil {
  2339  			vlogger.Warn("error unwrapping token", "error", err)
  2340  			return nil, structs.NewRecoverableError(validationErr, true)
  2341  		}
  2342  
  2343  		// Append the unwrapped token to the return value
  2344  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  2345  	}
  2346  
  2347  	return unwrappedTokens, nil
  2348  }
  2349  
  2350  // triggerDiscovery causes a Consul discovery to begin (if one hasn't already)
  2351  func (c *Client) triggerDiscovery() {
  2352  	select {
  2353  	case c.triggerDiscoveryCh <- struct{}{}:
  2354  		// Discovery goroutine was released to execute
  2355  	default:
  2356  		// Discovery goroutine was already running
  2357  	}
  2358  }
  2359  
  2360  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  2361  // It's intended to be started in a goroutine. See triggerDiscovery() for
  2362  // causing consul discovery from other code locations.
  2363  func (c *Client) consulDiscovery() {
  2364  	for {
  2365  		select {
  2366  		case <-c.triggerDiscoveryCh:
  2367  			if err := c.consulDiscoveryImpl(); err != nil {
  2368  				c.logger.Error("error discovering nomad servers", "error", err)
  2369  			}
  2370  		case <-c.shutdownCh:
  2371  			return
  2372  		}
  2373  	}
  2374  }
  2375  
  2376  func (c *Client) consulDiscoveryImpl() error {
  2377  	consulLogger := c.logger.Named("consul")
  2378  
  2379  	dcs, err := c.consulCatalog.Datacenters()
  2380  	if err != nil {
  2381  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  2382  	}
  2383  	if len(dcs) > 2 {
  2384  		// Query the local DC first, then shuffle the
  2385  		// remaining DCs.  Future heartbeats will cause Nomad
  2386  		// Clients to fixate on their local datacenter so
  2387  		// it's okay to talk with remote DCs.  If the no
  2388  		// Nomad servers are available within
  2389  		// datacenterQueryLimit, the next heartbeat will pick
  2390  		// a new set of servers so it's okay.
  2391  		shuffleStrings(dcs[1:])
  2392  		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
  2393  	}
  2394  
  2395  	// Query for servers in this client's region only
  2396  	region := c.Region()
  2397  	rpcargs := structs.GenericRequest{
  2398  		QueryOptions: structs.QueryOptions{
  2399  			Region: region,
  2400  		},
  2401  	}
  2402  
  2403  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  2404  	var mErr multierror.Error
  2405  	var nomadServers servers.Servers
  2406  	consulLogger.Debug("bootstrap contacting Consul DCs", "consul_dcs", dcs)
  2407  DISCOLOOP:
  2408  	for _, dc := range dcs {
  2409  		consulOpts := &consulapi.QueryOptions{
  2410  			AllowStale: true,
  2411  			Datacenter: dc,
  2412  			Near:       "_agent",
  2413  			WaitTime:   consul.DefaultQueryWaitDuration,
  2414  		}
  2415  		consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  2416  		if err != nil {
  2417  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  2418  			continue
  2419  		}
  2420  
  2421  		for _, s := range consulServices {
  2422  			port := strconv.Itoa(s.ServicePort)
  2423  			addrstr := s.ServiceAddress
  2424  			if addrstr == "" {
  2425  				addrstr = s.Address
  2426  			}
  2427  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  2428  			if err != nil {
  2429  				mErr.Errors = append(mErr.Errors, err)
  2430  				continue
  2431  			}
  2432  			var peers []string
  2433  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  2434  				mErr.Errors = append(mErr.Errors, err)
  2435  				continue
  2436  			}
  2437  
  2438  			// Successfully received the Server peers list of the correct
  2439  			// region
  2440  			for _, p := range peers {
  2441  				addr, err := net.ResolveTCPAddr("tcp", p)
  2442  				if err != nil {
  2443  					mErr.Errors = append(mErr.Errors, err)
  2444  				}
  2445  				srv := &servers.Server{Addr: addr}
  2446  				nomadServers = append(nomadServers, srv)
  2447  			}
  2448  			if len(nomadServers) > 0 {
  2449  				break DISCOLOOP
  2450  			}
  2451  		}
  2452  	}
  2453  	if len(nomadServers) == 0 {
  2454  		if len(mErr.Errors) > 0 {
  2455  			return mErr.ErrorOrNil()
  2456  		}
  2457  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  2458  	}
  2459  
  2460  	consulLogger.Info("discovered following servers", "servers", nomadServers)
  2461  
  2462  	// Fire the retry trigger if we have updated the set of servers.
  2463  	if c.servers.SetServers(nomadServers) {
  2464  		// Start rebalancing
  2465  		c.servers.RebalanceServers()
  2466  
  2467  		// Notify waiting rpc calls. If a goroutine just failed an RPC call and
  2468  		// isn't receiving on this chan yet they'll still retry eventually.
  2469  		// This is a shortcircuit for the longer retry intervals.
  2470  		c.fireRpcRetryWatcher()
  2471  	}
  2472  
  2473  	return nil
  2474  }
  2475  
  2476  // emitStats collects host resource usage stats periodically
  2477  func (c *Client) emitStats() {
  2478  	// Determining NodeClass to be emitted
  2479  	var emittedNodeClass string
  2480  	if emittedNodeClass = c.Node().NodeClass; emittedNodeClass == "" {
  2481  		emittedNodeClass = "none"
  2482  	}
  2483  
  2484  	// Assign labels directly before emitting stats so the information expected
  2485  	// is ready
  2486  	c.baseLabels = []metrics.Label{
  2487  		{Name: "node_id", Value: c.NodeID()},
  2488  		{Name: "datacenter", Value: c.Datacenter()},
  2489  		{Name: "node_class", Value: emittedNodeClass},
  2490  	}
  2491  
  2492  	// Start collecting host stats right away and then keep collecting every
  2493  	// collection interval
  2494  	next := time.NewTimer(0)
  2495  	defer next.Stop()
  2496  	for {
  2497  		select {
  2498  		case <-next.C:
  2499  			err := c.hostStatsCollector.Collect()
  2500  			next.Reset(c.config.StatsCollectionInterval)
  2501  			if err != nil {
  2502  				c.logger.Warn("error fetching host resource usage stats", "error", err)
  2503  				continue
  2504  			}
  2505  
  2506  			// Publish Node metrics if operator has opted in
  2507  			if c.config.PublishNodeMetrics {
  2508  				c.emitHostStats()
  2509  			}
  2510  
  2511  			c.emitClientMetrics()
  2512  		case <-c.shutdownCh:
  2513  			return
  2514  		}
  2515  	}
  2516  }
  2517  
  2518  // setGaugeForMemoryStats proxies metrics for memory specific statistics
  2519  func (c *Client) setGaugeForMemoryStats(nodeID string, hStats *stats.HostStats) {
  2520  	if !c.config.DisableTaggedMetrics {
  2521  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "total"}, float32(hStats.Memory.Total), c.baseLabels)
  2522  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "available"}, float32(hStats.Memory.Available), c.baseLabels)
  2523  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "used"}, float32(hStats.Memory.Used), c.baseLabels)
  2524  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "free"}, float32(hStats.Memory.Free), c.baseLabels)
  2525  	}
  2526  
  2527  	if c.config.BackwardsCompatibleMetrics {
  2528  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  2529  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  2530  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  2531  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  2532  	}
  2533  }
  2534  
  2535  // setGaugeForCPUStats proxies metrics for CPU specific statistics
  2536  func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats) {
  2537  	for _, cpu := range hStats.CPU {
  2538  		if !c.config.DisableTaggedMetrics {
  2539  			labels := append(c.baseLabels, metrics.Label{
  2540  				Name:  "cpu",
  2541  				Value: cpu.CPU,
  2542  			})
  2543  
  2544  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels)
  2545  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels)
  2546  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels)
  2547  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels)
  2548  		}
  2549  
  2550  		if c.config.BackwardsCompatibleMetrics {
  2551  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  2552  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  2553  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2554  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2555  		}
  2556  	}
  2557  }
  2558  
  2559  // setGaugeForDiskStats proxies metrics for disk specific statistics
  2560  func (c *Client) setGaugeForDiskStats(nodeID string, hStats *stats.HostStats) {
  2561  	for _, disk := range hStats.DiskStats {
  2562  		if !c.config.DisableTaggedMetrics {
  2563  			labels := append(c.baseLabels, metrics.Label{
  2564  				Name:  "disk",
  2565  				Value: disk.Device,
  2566  			})
  2567  
  2568  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "size"}, float32(disk.Size), labels)
  2569  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used"}, float32(disk.Used), labels)
  2570  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "available"}, float32(disk.Available), labels)
  2571  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used_percent"}, float32(disk.UsedPercent), labels)
  2572  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "inodes_percent"}, float32(disk.InodesUsedPercent), labels)
  2573  		}
  2574  
  2575  		if c.config.BackwardsCompatibleMetrics {
  2576  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2577  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2578  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2579  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2580  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2581  		}
  2582  	}
  2583  }
  2584  
  2585  // setGaugeForAllocationStats proxies metrics for allocation specific statistics
  2586  func (c *Client) setGaugeForAllocationStats(nodeID string) {
  2587  	c.configLock.RLock()
  2588  	node := c.configCopy.Node
  2589  	c.configLock.RUnlock()
  2590  	total := node.NodeResources
  2591  	res := node.ReservedResources
  2592  	allocated := c.getAllocatedResources(node)
  2593  
  2594  	// Emit allocated
  2595  	if !c.config.DisableTaggedMetrics {
  2596  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "memory"}, float32(allocated.Flattened.Memory.MemoryMB), c.baseLabels)
  2597  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "disk"}, float32(allocated.Shared.DiskMB), c.baseLabels)
  2598  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "cpu"}, float32(allocated.Flattened.Cpu.CpuShares), c.baseLabels)
  2599  	}
  2600  
  2601  	if c.config.BackwardsCompatibleMetrics {
  2602  		metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.Flattened.Memory.MemoryMB))
  2603  		metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.Shared.DiskMB))
  2604  		metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.Flattened.Cpu.CpuShares))
  2605  	}
  2606  
  2607  	for _, n := range allocated.Flattened.Networks {
  2608  		if !c.config.DisableTaggedMetrics {
  2609  			labels := append(c.baseLabels, metrics.Label{
  2610  				Name:  "device",
  2611  				Value: n.Device,
  2612  			})
  2613  			metrics.SetGaugeWithLabels([]string{"client", "allocated", "network"}, float32(n.MBits), labels)
  2614  		}
  2615  
  2616  		if c.config.BackwardsCompatibleMetrics {
  2617  			metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits))
  2618  		}
  2619  	}
  2620  
  2621  	// Emit unallocated
  2622  	unallocatedMem := total.Memory.MemoryMB - res.Memory.MemoryMB - allocated.Flattened.Memory.MemoryMB
  2623  	unallocatedDisk := total.Disk.DiskMB - res.Disk.DiskMB - allocated.Shared.DiskMB
  2624  	unallocatedCpu := total.Cpu.CpuShares - res.Cpu.CpuShares - allocated.Flattened.Cpu.CpuShares
  2625  
  2626  	if !c.config.DisableTaggedMetrics {
  2627  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), c.baseLabels)
  2628  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), c.baseLabels)
  2629  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), c.baseLabels)
  2630  	}
  2631  
  2632  	if c.config.BackwardsCompatibleMetrics {
  2633  		metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem))
  2634  		metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk))
  2635  		metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu))
  2636  	}
  2637  
  2638  	totalComparable := total.Comparable()
  2639  	for _, n := range totalComparable.Flattened.Networks {
  2640  		// Determined the used resources
  2641  		var usedMbits int
  2642  		totalIdx := allocated.Flattened.Networks.NetIndex(n)
  2643  		if totalIdx != -1 {
  2644  			usedMbits = allocated.Flattened.Networks[totalIdx].MBits
  2645  		}
  2646  
  2647  		unallocatedMbits := n.MBits - usedMbits
  2648  		if !c.config.DisableTaggedMetrics {
  2649  			labels := append(c.baseLabels, metrics.Label{
  2650  				Name:  "device",
  2651  				Value: n.Device,
  2652  			})
  2653  			metrics.SetGaugeWithLabels([]string{"client", "unallocated", "network"}, float32(unallocatedMbits), labels)
  2654  		}
  2655  
  2656  		if c.config.BackwardsCompatibleMetrics {
  2657  			metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits))
  2658  		}
  2659  	}
  2660  }
  2661  
  2662  // No labels are required so we emit with only a key/value syntax
  2663  func (c *Client) setGaugeForUptime(hStats *stats.HostStats) {
  2664  	if !c.config.DisableTaggedMetrics {
  2665  		metrics.SetGaugeWithLabels([]string{"client", "uptime"}, float32(hStats.Uptime), c.baseLabels)
  2666  	}
  2667  	if c.config.BackwardsCompatibleMetrics {
  2668  		metrics.SetGauge([]string{"client", "uptime"}, float32(hStats.Uptime))
  2669  	}
  2670  }
  2671  
  2672  // emitHostStats pushes host resource usage stats to remote metrics collection sinks
  2673  func (c *Client) emitHostStats() {
  2674  	nodeID := c.NodeID()
  2675  	hStats := c.hostStatsCollector.Stats()
  2676  
  2677  	c.setGaugeForMemoryStats(nodeID, hStats)
  2678  	c.setGaugeForUptime(hStats)
  2679  	c.setGaugeForCPUStats(nodeID, hStats)
  2680  	c.setGaugeForDiskStats(nodeID, hStats)
  2681  }
  2682  
  2683  // emitClientMetrics emits lower volume client metrics
  2684  func (c *Client) emitClientMetrics() {
  2685  	nodeID := c.NodeID()
  2686  
  2687  	c.setGaugeForAllocationStats(nodeID)
  2688  
  2689  	// Emit allocation metrics
  2690  	blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
  2691  	for _, ar := range c.getAllocRunners() {
  2692  		switch ar.AllocState().ClientStatus {
  2693  		case structs.AllocClientStatusPending:
  2694  			switch {
  2695  			case ar.IsWaiting():
  2696  				blocked++
  2697  			case ar.IsMigrating():
  2698  				migrating++
  2699  			default:
  2700  				pending++
  2701  			}
  2702  		case structs.AllocClientStatusRunning:
  2703  			running++
  2704  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  2705  			terminal++
  2706  		}
  2707  	}
  2708  
  2709  	if !c.config.DisableTaggedMetrics {
  2710  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "migrating"}, float32(migrating), c.baseLabels)
  2711  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "blocked"}, float32(blocked), c.baseLabels)
  2712  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "pending"}, float32(pending), c.baseLabels)
  2713  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "running"}, float32(running), c.baseLabels)
  2714  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "terminal"}, float32(terminal), c.baseLabels)
  2715  	}
  2716  
  2717  	if c.config.BackwardsCompatibleMetrics {
  2718  		metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating))
  2719  		metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked))
  2720  		metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending))
  2721  		metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running))
  2722  		metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal))
  2723  	}
  2724  }
  2725  
  2726  func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.ComparableResources {
  2727  	// Unfortunately the allocs only have IP so we need to match them to the
  2728  	// device
  2729  	cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks))
  2730  	for _, n := range selfNode.NodeResources.Networks {
  2731  		_, ipnet, err := net.ParseCIDR(n.CIDR)
  2732  		if err != nil {
  2733  			continue
  2734  		}
  2735  		cidrToDevice[ipnet] = n.Device
  2736  	}
  2737  
  2738  	// Sum the allocated resources
  2739  	var allocated structs.ComparableResources
  2740  	allocatedDeviceMbits := make(map[string]int)
  2741  	for _, ar := range c.getAllocRunners() {
  2742  		alloc := ar.Alloc()
  2743  		if alloc.ServerTerminalStatus() || ar.AllocState().ClientTerminalStatus() {
  2744  			continue
  2745  		}
  2746  
  2747  		// Add the resources
  2748  		// COMPAT(0.11): Just use the allocated resources
  2749  		allocated.Add(alloc.ComparableResources())
  2750  
  2751  		// Add the used network
  2752  		if alloc.AllocatedResources != nil {
  2753  			for _, tr := range alloc.AllocatedResources.Tasks {
  2754  				for _, allocatedNetwork := range tr.Networks {
  2755  					for cidr, dev := range cidrToDevice {
  2756  						ip := net.ParseIP(allocatedNetwork.IP)
  2757  						if cidr.Contains(ip) {
  2758  							allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  2759  							break
  2760  						}
  2761  					}
  2762  				}
  2763  			}
  2764  		} else if alloc.Resources != nil {
  2765  			for _, allocatedNetwork := range alloc.Resources.Networks {
  2766  				for cidr, dev := range cidrToDevice {
  2767  					ip := net.ParseIP(allocatedNetwork.IP)
  2768  					if cidr.Contains(ip) {
  2769  						allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  2770  						break
  2771  					}
  2772  				}
  2773  			}
  2774  		}
  2775  	}
  2776  
  2777  	// Clear the networks
  2778  	allocated.Flattened.Networks = nil
  2779  	for dev, speed := range allocatedDeviceMbits {
  2780  		net := &structs.NetworkResource{
  2781  			Device: dev,
  2782  			MBits:  speed,
  2783  		}
  2784  		allocated.Flattened.Networks = append(allocated.Flattened.Networks, net)
  2785  	}
  2786  
  2787  	return &allocated
  2788  }
  2789  
  2790  // GetTaskEventHandler returns an event handler for the given allocID and task name
  2791  func (c *Client) GetTaskEventHandler(allocID, taskName string) drivermanager.EventHandler {
  2792  	c.allocLock.RLock()
  2793  	defer c.allocLock.RUnlock()
  2794  	if ar, ok := c.allocs[allocID]; ok {
  2795  		return ar.GetTaskEventHandler(taskName)
  2796  	}
  2797  	return nil
  2798  }
  2799  
  2800  // group wraps a func() in a goroutine and provides a way to block until it
  2801  // exits. Inspired by https://godoc.org/golang.org/x/sync/errgroup
  2802  type group struct {
  2803  	wg sync.WaitGroup
  2804  }
  2805  
  2806  // Go starts f in a goroutine and must be called before Wait.
  2807  func (g *group) Go(f func()) {
  2808  	g.wg.Add(1)
  2809  	go func() {
  2810  		defer g.wg.Done()
  2811  		f()
  2812  	}()
  2813  }
  2814  
  2815  func (c *group) AddCh(ch <-chan struct{}) {
  2816  	c.Go(func() {
  2817  		<-ch
  2818  	})
  2819  }
  2820  
  2821  // Wait for all goroutines to exit. Must be called after all calls to Go
  2822  // complete.
  2823  func (g *group) Wait() {
  2824  	g.wg.Wait()
  2825  }