github.com/djenriquez/nomad-1@v0.8.1/client/client.go (about)

     1  package client
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"log"
     8  	"net"
     9  	"net/rpc"
    10  	"os"
    11  	"path/filepath"
    12  	"sort"
    13  	"strconv"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	metrics "github.com/armon/go-metrics"
    19  	"github.com/boltdb/bolt"
    20  	consulapi "github.com/hashicorp/consul/api"
    21  	"github.com/hashicorp/consul/lib"
    22  	multierror "github.com/hashicorp/go-multierror"
    23  	"github.com/hashicorp/nomad/client/allocdir"
    24  	"github.com/hashicorp/nomad/client/config"
    25  	"github.com/hashicorp/nomad/client/servers"
    26  	"github.com/hashicorp/nomad/client/stats"
    27  	cstructs "github.com/hashicorp/nomad/client/structs"
    28  	"github.com/hashicorp/nomad/client/vaultclient"
    29  	"github.com/hashicorp/nomad/command/agent/consul"
    30  	"github.com/hashicorp/nomad/helper"
    31  	"github.com/hashicorp/nomad/helper/pool"
    32  	hstats "github.com/hashicorp/nomad/helper/stats"
    33  	"github.com/hashicorp/nomad/helper/tlsutil"
    34  	"github.com/hashicorp/nomad/helper/uuid"
    35  	"github.com/hashicorp/nomad/nomad/structs"
    36  	nconfig "github.com/hashicorp/nomad/nomad/structs/config"
    37  	vaultapi "github.com/hashicorp/vault/api"
    38  	"github.com/shirou/gopsutil/host"
    39  )
    40  
const (
	// clientRPCCache controls how long we keep an idle connection
	// open to a server.
	clientRPCCache = 5 * time.Minute

	// clientMaxStreams controls how many idle streams we keep
	// open to a server.
	clientMaxStreams = 2

	// datacenterQueryLimit is the maximum number of adjacent datacenters
	// searched when looking for the Nomad server service.
	datacenterQueryLimit = 9

	// registerRetryIntv is the minimum interval on which we retry
	// registration. We pick a value between this and 2x this.
	registerRetryIntv = 15 * time.Second

	// getAllocRetryIntv is the minimum interval on which we retry
	// fetching allocations. We pick a value between this and 2x this.
	getAllocRetryIntv = 30 * time.Second

	// devModeRetryIntv is the retry interval used for development.
	devModeRetryIntv = time.Second

	// stateSnapshotIntv is how often the client snapshots state.
	stateSnapshotIntv = 60 * time.Second

	// initialHeartbeatStagger is used to stagger the interval between
	// starting and the initial heartbeat. After the initial heartbeat,
	// we switch to using the TTL specified by the servers.
	initialHeartbeatStagger = 10 * time.Second

	// nodeUpdateRetryIntv is how often the client checks for updates to the
	// node attributes or meta map.
	nodeUpdateRetryIntv = 5 * time.Second

	// allocSyncIntv is the batching period of allocation updates before they
	// are synced with the server.
	allocSyncIntv = 200 * time.Millisecond

	// allocSyncRetryIntv is the interval on which we retry updating
	// the status of the allocation.
	allocSyncRetryIntv = 5 * time.Second
)
    85  
// ClientStatsReporter exposes the APIs related to resource usage of a Nomad
// Client: per-allocation stats reporters and the latest host-level stats.
type ClientStatsReporter interface {
	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
	// If the allocation does not exist an error is returned.
	GetAllocStats(allocID string) (AllocStatsReporter, error)

	// LatestHostStats returns the latest resource usage stats for the host.
	LatestHostStats() *stats.HostStats
}
    96  
// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
type Client struct {
	// config is the configuration the client was created with; start is the
	// time at which the client was constructed.
	config *config.Config
	start  time.Time

	// stateDB is used to efficiently store client state.
	stateDB *bolt.DB

	// configCopy is a copy that should be passed to alloc-runners.
	// configLock guards access to the configuration.
	configCopy *config.Config
	configLock sync.RWMutex

	// logger is the logger the client writes its messages to.
	logger *log.Logger

	// connPool is the pool of connections used to talk to servers.
	connPool *pool.ConnPool

	// tlsWrap is used to wrap outbound connections using TLS. It should be
	// accessed using the lock.
	tlsWrap     tlsutil.RegionWrapper
	tlsWrapLock sync.RWMutex

	// servers is the list of nomad servers
	servers *servers.Manager

	// heartbeat related times for tracking how often to heartbeat
	lastHeartbeat   time.Time
	heartbeatTTL    time.Duration
	haveHeartbeated bool
	heartbeatLock   sync.Mutex

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// triggerNodeUpdate triggers the client to mark the Node as changed and
	// update it.
	triggerNodeUpdate chan struct{}

	// triggerEmitNodeEvent sends an event and triggers the client to update the
	// server for the node event
	triggerEmitNodeEvent chan *structs.NodeEvent

	// rpcRetryCh is closed when an event occurs, such as server discovery or
	// a successful RPC, such that a retry should happen. Access should only
	// occur via the getter method.
	rpcRetryCh   chan struct{}
	rpcRetryLock sync.Mutex

	// allocs maps alloc IDs to their AllocRunner. This map includes all
	// AllocRunners - running and GC'd - until the server GCs them.
	allocs    map[string]*AllocRunner
	allocLock sync.RWMutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulService is Nomad's custom Consul client for managing services
	// and checks.
	consulService ConsulServiceAPI

	// consulCatalog is the subset of Consul's Catalog API Nomad uses.
	consulCatalog consul.CatalogAPI

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector

	// shutdown records that Shutdown has completed its teardown; shutdownCh
	// is closed at that point. shutdownLock serializes calls to Shutdown.
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// garbageCollector is used to garbage collect terminal allocations present
	// in the node automatically
	garbageCollector *AllocGarbageCollector

	// clientACLResolver holds the ACL resolution state
	clientACLResolver

	// rpcServer is used to serve RPCs by the local agent.
	rpcServer     *rpc.Server
	endpoints     rpcEndpoints
	streamingRpcs *structs.StreamingRpcRegistry

	// baseLabels are used when emitting tagged metrics. All client metrics will
	// have these tags, and optionally more.
	baseLabels []metrics.Label
}
   187  
var (
	// noServersErr is returned by the RPC method when the client has no
	// configured servers. This is used to trigger Consul discovery if
	// enabled. It is also returned when setting servers yields no usable
	// endpoint and no more specific error was recorded.
	noServersErr = errors.New("no servers")
)
   194  
// NewClient is used to create a new client from the given configuration.
//
// It builds the Client, initializes its on-disk state, sets up the node,
// runs fingerprinting, seeds the server list, restores any persisted
// allocations, and finally launches the client's background goroutines.
// A nil client and a non-nil error are returned if any mandatory step fails.
//
// NOTE(review): the error returns that follow a successful c.init() do not
// close c.stateDB, and those that follow `go c.garbageCollector.Run()` do
// not stop the garbage collector -- confirm whether callers rely on process
// exit to release them.
func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService ConsulServiceAPI, logger *log.Logger) (*Client, error) {
	// Create the tls wrapper; it stays nil when RPC TLS is disabled.
	var tlsWrap tlsutil.RegionWrapper
	if cfg.TLSConfig.EnableRPC {
		tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper()
		if err != nil {
			return nil, err
		}
		tlsWrap = tw
	}

	// Create the client
	c := &Client{
		config:               cfg,
		consulCatalog:        consulCatalog,
		consulService:        consulService,
		start:                time.Now(),
		connPool:             pool.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap),
		tlsWrap:              tlsWrap,
		streamingRpcs:        structs.NewStreamingRpcRegistry(),
		logger:               logger,
		allocs:               make(map[string]*AllocRunner),
		allocUpdates:         make(chan *structs.Allocation, 64),
		shutdownCh:           make(chan struct{}),
		triggerDiscoveryCh:   make(chan struct{}),
		triggerNodeUpdate:    make(chan struct{}, 8),
		triggerEmitNodeEvent: make(chan *structs.NodeEvent, 8),
	}

	// Initialize the server manager
	c.servers = servers.New(c.logger, c.shutdownCh, c)

	// Initialize the client (state dir, state database, alloc dir)
	if err := c.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize client: %v", err)
	}

	// Setup the clients RPC server
	c.setupClientRpc()

	// Initialize the ACL state
	if err := c.clientACLResolver.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize ACL state: %v", err)
	}

	// Add the stats collector
	statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir)
	c.hostStatsCollector = statsCollector

	// Add the garbage collector
	gcConfig := &GCConfig{
		MaxAllocs:           cfg.GCMaxAllocs,
		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
		Interval:            cfg.GCInterval,
		ParallelDestroys:    cfg.GCParallelDestroys,
		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
	}
	c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig)
	go c.garbageCollector.Run()

	// Setup the node
	if err := c.setupNode(); err != nil {
		return nil, fmt.Errorf("node setup failed: %v", err)
	}

	// Store the config copy before restoring state but after it has been
	// initialized.
	c.configLock.Lock()
	c.configCopy = c.config.Copy()
	c.configLock.Unlock()

	fingerprintManager := NewFingerprintManager(c.GetConfig, c.configCopy.Node,
		c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromDriver,
		c.logger)

	// Fingerprint the node and scan for drivers
	if err := fingerprintManager.Run(); err != nil {
		return nil, fmt.Errorf("fingerprinting failed: %v", err)
	}

	// Setup the reserved resources
	c.reservePorts()

	// Set the preconfigured list of static servers. The servers are forced
	// into the list, so a failure here is only logged.
	c.configLock.RLock()
	if len(c.configCopy.Servers) > 0 {
		if err := c.setServersImpl(c.configCopy.Servers, true); err != nil {
			logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
		}
	}
	c.configLock.RUnlock()

	// Setup Consul discovery if enabled
	// NOTE(review): configCopy is read here without holding configLock.
	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
		go c.consulDiscovery()
		if c.servers.NumServers() == 0 {
			// No configured servers; trigger discovery manually. The channel
			// is unbuffered, so this send waits for a receiver.
			c.triggerDiscoveryCh <- struct{}{}
		}
	}

	// Setup the vault client for token and secret renewals
	if err := c.setupVaultClient(); err != nil {
		return nil, fmt.Errorf("failed to setup vault client: %v", err)
	}

	// Restore the state. The detailed error is only logged; callers receive
	// a generic "failed to restore state" error.
	if err := c.restoreState(); err != nil {
		logger.Printf("[ERR] client: failed to restore state: %v", err)
		logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. "+
			"The safest way to proceed is to manually stop running task processes "+
			"and remove Nomad's state (%q) and alloc (%q) directories before "+
			"restarting. Lost allocations will be rescheduled.",
			c.config.StateDir, c.config.AllocDir)
		logger.Printf("[ERR] client: Corrupt state is often caused by a bug. Please " +
			"report as much information as possible to " +
			"https://github.com/hashicorp/nomad/issues")
		return nil, fmt.Errorf("failed to restore state")
	}

	// Register and then start heartbeating to the servers.
	go c.registerAndHeartbeat()

	// Begin periodic snapshotting of state.
	go c.periodicSnapshot()

	// Begin syncing allocations to the server
	go c.allocSync()

	// Start the client!
	go c.run()

	// Start collecting stats
	go c.emitStats()

	c.logger.Printf("[INFO] client: Node ID %q", c.NodeID())
	return c, nil
}
   335  
   336  // init is used to initialize the client and perform any setup
   337  // needed before we begin starting its various components.
   338  func (c *Client) init() error {
   339  	// Ensure the state dir exists if we have one
   340  	if c.config.StateDir != "" {
   341  		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
   342  			return fmt.Errorf("failed creating state dir: %s", err)
   343  		}
   344  
   345  	} else {
   346  		// Otherwise make a temp directory to use.
   347  		p, err := ioutil.TempDir("", "NomadClient")
   348  		if err != nil {
   349  			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
   350  		}
   351  
   352  		p, err = filepath.EvalSymlinks(p)
   353  		if err != nil {
   354  			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
   355  		}
   356  
   357  		c.config.StateDir = p
   358  	}
   359  	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)
   360  
   361  	// Create or open the state database
   362  	db, err := bolt.Open(filepath.Join(c.config.StateDir, "state.db"), 0600, nil)
   363  	if err != nil {
   364  		return fmt.Errorf("failed to create state database: %v", err)
   365  	}
   366  	c.stateDB = db
   367  
   368  	// Ensure the alloc dir exists if we have one
   369  	if c.config.AllocDir != "" {
   370  		if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil {
   371  			return fmt.Errorf("failed creating alloc dir: %s", err)
   372  		}
   373  	} else {
   374  		// Otherwise make a temp directory to use.
   375  		p, err := ioutil.TempDir("", "NomadClient")
   376  		if err != nil {
   377  			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
   378  		}
   379  
   380  		p, err = filepath.EvalSymlinks(p)
   381  		if err != nil {
   382  			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
   383  		}
   384  
   385  		// Change the permissions to have the execute bit
   386  		if err := os.Chmod(p, 0711); err != nil {
   387  			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
   388  		}
   389  
   390  		c.config.AllocDir = p
   391  	}
   392  
   393  	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
   394  	return nil
   395  }
   396  
   397  // reloadTLSConnections allows a client to reload its TLS configuration on the
   398  // fly
   399  func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error {
   400  	var tlsWrap tlsutil.RegionWrapper
   401  	if newConfig != nil && newConfig.EnableRPC {
   402  		tw, err := tlsutil.NewTLSConfiguration(newConfig).OutgoingTLSWrapper()
   403  		if err != nil {
   404  			return err
   405  		}
   406  		tlsWrap = tw
   407  	}
   408  
   409  	// Store the new tls wrapper.
   410  	c.tlsWrapLock.Lock()
   411  	c.tlsWrap = tlsWrap
   412  	c.tlsWrapLock.Unlock()
   413  
   414  	// Keep the client configuration up to date as we use configuration values to
   415  	// decide on what type of connections to accept
   416  	c.configLock.Lock()
   417  	c.config.TLSConfig = newConfig
   418  	c.configLock.Unlock()
   419  
   420  	c.connPool.ReloadTLS(tlsWrap)
   421  
   422  	return nil
   423  }
   424  
   425  // Reload allows a client to reload its configuration on the fly
   426  func (c *Client) Reload(newConfig *config.Config) error {
   427  	return c.reloadTLSConnections(newConfig.TLSConfig)
   428  }
   429  
   430  // Leave is used to prepare the client to leave the cluster
   431  func (c *Client) Leave() error {
   432  	// TODO
   433  	return nil
   434  }
   435  
   436  // GetConfig returns the config of the client
   437  func (c *Client) GetConfig() *config.Config {
   438  	c.configLock.Lock()
   439  	defer c.configLock.Unlock()
   440  	return c.configCopy
   441  }
   442  
   443  // Datacenter returns the datacenter for the given client
   444  func (c *Client) Datacenter() string {
   445  	return c.config.Node.Datacenter
   446  }
   447  
   448  // Region returns the region for the given client
   449  func (c *Client) Region() string {
   450  	return c.config.Region
   451  }
   452  
   453  // NodeID returns the node ID for the given client
   454  func (c *Client) NodeID() string {
   455  	return c.config.Node.ID
   456  }
   457  
   458  // secretNodeID returns the secret node ID for the given client
   459  func (c *Client) secretNodeID() string {
   460  	return c.config.Node.SecretID
   461  }
   462  
   463  // RPCMajorVersion returns the structs.ApiMajorVersion supported by the
   464  // client.
   465  func (c *Client) RPCMajorVersion() int {
   466  	return structs.ApiMajorVersion
   467  }
   468  
   469  // RPCMinorVersion returns the structs.ApiMinorVersion supported by the
   470  // client.
   471  func (c *Client) RPCMinorVersion() int {
   472  	return structs.ApiMinorVersion
   473  }
   474  
   475  // Shutdown is used to tear down the client
   476  func (c *Client) Shutdown() error {
   477  	c.logger.Printf("[INFO] client: shutting down")
   478  	c.shutdownLock.Lock()
   479  	defer c.shutdownLock.Unlock()
   480  
   481  	if c.shutdown {
   482  		return nil
   483  	}
   484  
   485  	// Defer closing the database
   486  	defer func() {
   487  		if err := c.stateDB.Close(); err != nil {
   488  			c.logger.Printf("[ERR] client: failed to close state database on shutdown: %v", err)
   489  		}
   490  	}()
   491  
   492  	// Stop renewing tokens and secrets
   493  	if c.vaultClient != nil {
   494  		c.vaultClient.Stop()
   495  	}
   496  
   497  	// Stop Garbage collector
   498  	c.garbageCollector.Stop()
   499  
   500  	// Destroy all the running allocations.
   501  	if c.config.DevMode {
   502  		for _, ar := range c.getAllocRunners() {
   503  			ar.Destroy()
   504  			<-ar.WaitCh()
   505  		}
   506  	}
   507  
   508  	c.shutdown = true
   509  	close(c.shutdownCh)
   510  	c.connPool.Shutdown()
   511  	return c.saveState()
   512  }
   513  
   514  // Stats is used to return statistics for debugging and insight
   515  // for various sub-systems
   516  func (c *Client) Stats() map[string]map[string]string {
   517  	c.heartbeatLock.Lock()
   518  	defer c.heartbeatLock.Unlock()
   519  	stats := map[string]map[string]string{
   520  		"client": {
   521  			"node_id":         c.NodeID(),
   522  			"known_servers":   strings.Join(c.GetServers(), ","),
   523  			"num_allocations": strconv.Itoa(c.NumAllocs()),
   524  			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
   525  			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
   526  		},
   527  		"runtime": hstats.RuntimeStats(),
   528  	}
   529  	return stats
   530  }
   531  
   532  // CollectAllocation garbage collects a single allocation on a node. Returns
   533  // true if alloc was found and garbage collected; otherwise false.
   534  func (c *Client) CollectAllocation(allocID string) bool {
   535  	return c.garbageCollector.Collect(allocID)
   536  }
   537  
   538  // CollectAllAllocs garbage collects all allocations on a node in the terminal
   539  // state
   540  func (c *Client) CollectAllAllocs() {
   541  	c.garbageCollector.CollectAll()
   542  }
   543  
   544  // Node returns the locally registered node
   545  func (c *Client) Node() *structs.Node {
   546  	c.configLock.RLock()
   547  	defer c.configLock.RUnlock()
   548  	return c.configCopy.Node
   549  }
   550  
   551  // StatsReporter exposes the various APIs related resource usage of a Nomad
   552  // client
   553  func (c *Client) StatsReporter() ClientStatsReporter {
   554  	return c
   555  }
   556  
   557  func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) {
   558  	c.allocLock.RLock()
   559  	defer c.allocLock.RUnlock()
   560  	ar, ok := c.allocs[allocID]
   561  	if !ok {
   562  		return nil, structs.NewErrUnknownAllocation(allocID)
   563  	}
   564  	return ar.StatsReporter(), nil
   565  }
   566  
   567  // HostStats returns all the stats related to a Nomad client
   568  func (c *Client) LatestHostStats() *stats.HostStats {
   569  	return c.hostStatsCollector.Stats()
   570  }
   571  
   572  // ValidateMigrateToken verifies that a token is for a specific client and
   573  // allocation, and has been created by a trusted party that has privileged
   574  // knowledge of the client's secret identifier
   575  func (c *Client) ValidateMigrateToken(allocID, migrateToken string) bool {
   576  	if !c.config.ACLEnabled {
   577  		return true
   578  	}
   579  
   580  	return structs.CompareMigrateToken(allocID, c.secretNodeID(), migrateToken)
   581  }
   582  
   583  // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
   584  func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
   585  	c.allocLock.RLock()
   586  	defer c.allocLock.RUnlock()
   587  
   588  	ar, ok := c.allocs[allocID]
   589  	if !ok {
   590  		return nil, structs.NewErrUnknownAllocation(allocID)
   591  	}
   592  	return ar.GetAllocDir(), nil
   593  }
   594  
   595  // GetClientAlloc returns the allocation from the client
   596  func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
   597  	all := c.allAllocs()
   598  	alloc, ok := all[allocID]
   599  	if !ok {
   600  		return nil, structs.NewErrUnknownAllocation(allocID)
   601  	}
   602  	return alloc, nil
   603  }
   604  
   605  // GetServers returns the list of nomad servers this client is aware of.
   606  func (c *Client) GetServers() []string {
   607  	endpoints := c.servers.GetServers()
   608  	res := make([]string, len(endpoints))
   609  	for i := range endpoints {
   610  		res[i] = endpoints[i].String()
   611  	}
   612  	sort.Strings(res)
   613  	return res
   614  }
   615  
   616  // SetServers sets a new list of nomad servers to connect to. As long as one
   617  // server is resolvable no error is returned.
   618  func (c *Client) SetServers(in []string) error {
   619  	return c.setServersImpl(in, false)
   620  }
   621  
   622  // setServersImpl sets a new list of nomad servers to connect to. If force is
   623  // set, we add the server to the internal serverlist even if the server could not
   624  // be pinged. An error is returned if no endpoints were valid when non-forcing.
   625  //
   626  // Force should be used when setting the servers from the initial configuration
   627  // since the server may be starting up in parallel and initial pings may fail.
   628  func (c *Client) setServersImpl(in []string, force bool) error {
   629  	var mu sync.Mutex
   630  	var wg sync.WaitGroup
   631  	var merr multierror.Error
   632  
   633  	endpoints := make([]*servers.Server, 0, len(in))
   634  	wg.Add(len(in))
   635  
   636  	for _, s := range in {
   637  		go func(srv string) {
   638  			defer wg.Done()
   639  			addr, err := resolveServer(srv)
   640  			if err != nil {
   641  				c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", srv, err)
   642  				merr.Errors = append(merr.Errors, err)
   643  				return
   644  			}
   645  
   646  			// Try to ping to check if it is a real server
   647  			if err := c.Ping(addr); err != nil {
   648  				merr.Errors = append(merr.Errors, fmt.Errorf("Server at address %s failed ping: %v", addr, err))
   649  
   650  				// If we are forcing the setting of the servers, inject it to
   651  				// the serverlist even if we can't ping immediately.
   652  				if !force {
   653  					return
   654  				}
   655  			}
   656  
   657  			mu.Lock()
   658  			endpoints = append(endpoints, &servers.Server{Addr: addr})
   659  			mu.Unlock()
   660  		}(s)
   661  	}
   662  
   663  	wg.Wait()
   664  
   665  	// Only return errors if no servers are valid
   666  	if len(endpoints) == 0 {
   667  		if len(merr.Errors) > 0 {
   668  			return merr.ErrorOrNil()
   669  		}
   670  		return noServersErr
   671  	}
   672  
   673  	c.servers.SetServers(endpoints)
   674  	return nil
   675  }
   676  
   677  // restoreState is used to restore our state from the data dir
   678  func (c *Client) restoreState() error {
   679  	if c.config.DevMode {
   680  		return nil
   681  	}
   682  
   683  	// COMPAT: Remove in 0.7.0
   684  	// 0.6.0 transitioned from individual state files to a single bolt-db.
   685  	// The upgrade path is to:
   686  	// Check if old state exists
   687  	//   If so, restore from that and delete old state
   688  	// Restore using state database
   689  
   690  	// Allocs holds the IDs of the allocations being restored
   691  	var allocs []string
   692  
   693  	// Upgrading tracks whether this is a pre 0.6.0 upgrade path
   694  	var upgrading bool
   695  
   696  	// Scan the directory
   697  	allocDir := filepath.Join(c.config.StateDir, "alloc")
   698  	list, err := ioutil.ReadDir(allocDir)
   699  	if err != nil && !os.IsNotExist(err) {
   700  		return fmt.Errorf("failed to list alloc state: %v", err)
   701  	} else if err == nil && len(list) != 0 {
   702  		upgrading = true
   703  		for _, entry := range list {
   704  			allocs = append(allocs, entry.Name())
   705  		}
   706  	} else {
   707  		// Normal path
   708  		err := c.stateDB.View(func(tx *bolt.Tx) error {
   709  			allocs, err = getAllAllocationIDs(tx)
   710  			if err != nil {
   711  				return fmt.Errorf("failed to list allocations: %v", err)
   712  			}
   713  			return nil
   714  		})
   715  		if err != nil {
   716  			return err
   717  		}
   718  	}
   719  
   720  	// Load each alloc back
   721  	var mErr multierror.Error
   722  	for _, id := range allocs {
   723  		alloc := &structs.Allocation{ID: id}
   724  
   725  		// don't worry about blocking/migrating when restoring
   726  		watcher := noopPrevAlloc{}
   727  
   728  		c.configLock.RLock()
   729  		ar := NewAllocRunner(c.logger, c.configCopy.Copy(), c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, watcher)
   730  		c.configLock.RUnlock()
   731  
   732  		c.allocLock.Lock()
   733  		c.allocs[id] = ar
   734  		c.allocLock.Unlock()
   735  
   736  		if err := ar.RestoreState(); err != nil {
   737  			c.logger.Printf("[ERR] client: failed to restore state for alloc %q: %v", id, err)
   738  			mErr.Errors = append(mErr.Errors, err)
   739  		} else {
   740  			go ar.Run()
   741  
   742  			if upgrading {
   743  				if err := ar.SaveState(); err != nil {
   744  					c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", id, err)
   745  				}
   746  			}
   747  		}
   748  	}
   749  
   750  	// Delete all the entries
   751  	if upgrading {
   752  		if err := os.RemoveAll(allocDir); err != nil {
   753  			mErr.Errors = append(mErr.Errors, err)
   754  		}
   755  	}
   756  
   757  	return mErr.ErrorOrNil()
   758  }
   759  
   760  // saveState is used to snapshot our state into the data dir.
   761  func (c *Client) saveState() error {
   762  	if c.config.DevMode {
   763  		return nil
   764  	}
   765  
   766  	var wg sync.WaitGroup
   767  	var l sync.Mutex
   768  	var mErr multierror.Error
   769  	runners := c.getAllocRunners()
   770  	wg.Add(len(runners))
   771  
   772  	for id, ar := range runners {
   773  		go func(id string, ar *AllocRunner) {
   774  			err := ar.SaveState()
   775  			if err != nil {
   776  				c.logger.Printf("[ERR] client: failed to save state for alloc %q: %v", id, err)
   777  				l.Lock()
   778  				multierror.Append(&mErr, err)
   779  				l.Unlock()
   780  			}
   781  			wg.Done()
   782  		}(id, ar)
   783  	}
   784  
   785  	wg.Wait()
   786  	return mErr.ErrorOrNil()
   787  }
   788  
   789  // getAllocRunners returns a snapshot of the current set of alloc runners.
   790  func (c *Client) getAllocRunners() map[string]*AllocRunner {
   791  	c.allocLock.RLock()
   792  	defer c.allocLock.RUnlock()
   793  	runners := make(map[string]*AllocRunner, len(c.allocs))
   794  	for id, ar := range c.allocs {
   795  		runners[id] = ar
   796  	}
   797  	return runners
   798  }
   799  
   800  // NumAllocs returns the number of un-GC'd allocs this client has. Used to
   801  // fulfill the AllocCounter interface for the GC.
   802  func (c *Client) NumAllocs() int {
   803  	n := 0
   804  	c.allocLock.RLock()
   805  	for _, a := range c.allocs {
   806  		if !a.IsDestroyed() {
   807  			n++
   808  		}
   809  	}
   810  	c.allocLock.RUnlock()
   811  	return n
   812  }
   813  
   814  // nodeID restores, or generates if necessary, a unique node ID and SecretID.
   815  // The node ID is, if available, a persistent unique ID.  The secret ID is a
   816  // high-entropy random UUID.
   817  func (c *Client) nodeID() (id, secret string, err error) {
   818  	var hostID string
   819  	hostInfo, err := host.Info()
   820  	if !c.config.NoHostUUID && err == nil {
   821  		if hashed, ok := helper.HashUUID(hostInfo.HostID); ok {
   822  			hostID = hashed
   823  		}
   824  	}
   825  
   826  	if hostID == "" {
   827  		// Generate a random hostID if no constant ID is available on
   828  		// this platform.
   829  		hostID = uuid.Generate()
   830  	}
   831  
   832  	// Do not persist in dev mode
   833  	if c.config.DevMode {
   834  		return hostID, uuid.Generate(), nil
   835  	}
   836  
   837  	// Attempt to read existing ID
   838  	idPath := filepath.Join(c.config.StateDir, "client-id")
   839  	idBuf, err := ioutil.ReadFile(idPath)
   840  	if err != nil && !os.IsNotExist(err) {
   841  		return "", "", err
   842  	}
   843  
   844  	// Attempt to read existing secret ID
   845  	secretPath := filepath.Join(c.config.StateDir, "secret-id")
   846  	secretBuf, err := ioutil.ReadFile(secretPath)
   847  	if err != nil && !os.IsNotExist(err) {
   848  		return "", "", err
   849  	}
   850  
   851  	// Use existing ID if any
   852  	if len(idBuf) != 0 {
   853  		id = strings.ToLower(string(idBuf))
   854  	} else {
   855  		id = hostID
   856  
   857  		// Persist the ID
   858  		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
   859  			return "", "", err
   860  		}
   861  	}
   862  
   863  	if len(secretBuf) != 0 {
   864  		secret = string(secretBuf)
   865  	} else {
   866  		// Generate new ID
   867  		secret = uuid.Generate()
   868  
   869  		// Persist the ID
   870  		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
   871  			return "", "", err
   872  		}
   873  	}
   874  
   875  	return id, secret, nil
   876  }
   877  
   878  // setupNode is used to setup the initial node
   879  func (c *Client) setupNode() error {
   880  	node := c.config.Node
   881  	if node == nil {
   882  		node = &structs.Node{}
   883  		c.config.Node = node
   884  	}
   885  	// Generate an ID and secret for the node
   886  	id, secretID, err := c.nodeID()
   887  	if err != nil {
   888  		return fmt.Errorf("node ID setup failed: %v", err)
   889  	}
   890  
   891  	node.ID = id
   892  	node.SecretID = secretID
   893  	if node.Attributes == nil {
   894  		node.Attributes = make(map[string]string)
   895  	}
   896  	if node.Links == nil {
   897  		node.Links = make(map[string]string)
   898  	}
   899  	if node.Drivers == nil {
   900  		node.Drivers = make(map[string]*structs.DriverInfo)
   901  	}
   902  	if node.Meta == nil {
   903  		node.Meta = make(map[string]string)
   904  	}
   905  	if node.Resources == nil {
   906  		node.Resources = &structs.Resources{}
   907  	}
   908  	if node.Reserved == nil {
   909  		node.Reserved = &structs.Resources{}
   910  	}
   911  	if node.Datacenter == "" {
   912  		node.Datacenter = "dc1"
   913  	}
   914  	if node.Name == "" {
   915  		node.Name, _ = os.Hostname()
   916  	}
   917  	if node.Name == "" {
   918  		node.Name = node.ID
   919  	}
   920  	node.Status = structs.NodeStatusInit
   921  	return nil
   922  }
   923  
   924  // reservePorts is used to reserve ports on the fingerprinted network devices.
   925  func (c *Client) reservePorts() {
   926  	c.configLock.RLock()
   927  	defer c.configLock.RUnlock()
   928  	global := c.config.GloballyReservedPorts
   929  	if len(global) == 0 {
   930  		return
   931  	}
   932  
   933  	node := c.config.Node
   934  	networks := node.Resources.Networks
   935  	reservedIndex := make(map[string]*structs.NetworkResource, len(networks))
   936  	for _, resNet := range node.Reserved.Networks {
   937  		reservedIndex[resNet.IP] = resNet
   938  	}
   939  
   940  	// Go through each network device and reserve ports on it.
   941  	for _, net := range networks {
   942  		res, ok := reservedIndex[net.IP]
   943  		if !ok {
   944  			res = net.Copy()
   945  			res.MBits = 0
   946  			reservedIndex[net.IP] = res
   947  		}
   948  
   949  		for _, portVal := range global {
   950  			p := structs.Port{Value: portVal}
   951  			res.ReservedPorts = append(res.ReservedPorts, p)
   952  		}
   953  	}
   954  
   955  	// Clear the reserved networks.
   956  	if node.Reserved == nil {
   957  		node.Reserved = new(structs.Resources)
   958  	} else {
   959  		node.Reserved.Networks = nil
   960  	}
   961  
   962  	// Restore the reserved networks
   963  	for _, net := range reservedIndex {
   964  		node.Reserved.Networks = append(node.Reserved.Networks, net)
   965  	}
   966  
   967  	// Make the changes available to the config copy.
   968  	c.configCopy = c.config.Copy()
   969  }
   970  
   971  // updateNodeFromFingerprint updates the node with the result of
   972  // fingerprinting the node from the diff that was created
   973  func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintResponse) *structs.Node {
   974  	c.configLock.Lock()
   975  	defer c.configLock.Unlock()
   976  
   977  	nodeHasChanged := false
   978  
   979  	for name, newVal := range response.Attributes {
   980  		oldVal := c.config.Node.Attributes[name]
   981  		if oldVal == newVal {
   982  			continue
   983  		}
   984  
   985  		nodeHasChanged = true
   986  		if newVal == "" {
   987  			delete(c.config.Node.Attributes, name)
   988  		} else {
   989  			c.config.Node.Attributes[name] = newVal
   990  		}
   991  	}
   992  
   993  	// update node links and resources from the diff created from
   994  	// fingerprinting
   995  	for name, newVal := range response.Links {
   996  		oldVal := c.config.Node.Links[name]
   997  		if oldVal == newVal {
   998  			continue
   999  		}
  1000  
  1001  		nodeHasChanged = true
  1002  		if newVal == "" {
  1003  			delete(c.config.Node.Links, name)
  1004  		} else {
  1005  			c.config.Node.Links[name] = newVal
  1006  		}
  1007  	}
  1008  
  1009  	if response.Resources != nil && !resourcesAreEqual(c.config.Node.Resources, response.Resources) {
  1010  		nodeHasChanged = true
  1011  		c.config.Node.Resources.Merge(response.Resources)
  1012  	}
  1013  
  1014  	if nodeHasChanged {
  1015  		c.updateNodeLocked()
  1016  	}
  1017  
  1018  	return c.configCopy.Node
  1019  }
  1020  
// updateNodeFromDriver receives either a fingerprint of the driver or its
// health (either may be nil) and merges this into the single DriverInfo
// object stored for the driver on the node. Fingerprint attributes are also
// written to the node's attributes, together with the legacy
// "driver.<name>" attribute. When anything changed, the driver's UpdateTime
// is bumped, the node copy is refreshed and a node update is triggered. The
// node from the config copy is returned.
func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs.DriverInfo) *structs.Node {
	c.configLock.Lock()
	defer c.configLock.Unlock()

	var hasChanged bool

	// hadDriver is captured before the fingerprint is applied, so the health
	// handling below also sees "no driver yet" when the fingerprint branch
	// has just stored one in this same call.
	hadDriver := c.config.Node.Drivers[name] != nil
	if fingerprint != nil {
		if !hadDriver {
			// If the driver info has not yet been set, do that here
			hasChanged = true
			c.config.Node.Drivers[name] = fingerprint
			for attrName, newVal := range fingerprint.Attributes {
				c.config.Node.Attributes[attrName] = newVal
			}
		} else {
			// The driver info has already been set, fix it up
			if c.config.Node.Drivers[name].Detected != fingerprint.Detected {
				hasChanged = true
				c.config.Node.Drivers[name].Detected = fingerprint.Detected
			}

			// NOTE(review): the old value is read from the driver's own
			// Attributes while the new value is written to the node's
			// Attributes; the driver's Attributes are not updated here.
			// Confirm this asymmetry is intended.
			for attrName, newVal := range fingerprint.Attributes {
				oldVal := c.config.Node.Drivers[name].Attributes[attrName]
				if oldVal == newVal {
					continue
				}

				hasChanged = true
				if newVal == "" {
					// An empty value removes the attribute from the node.
					delete(c.config.Node.Attributes, attrName)
				} else {
					c.config.Node.Attributes[attrName] = newVal
				}
			}
		}

		// COMPAT Remove in Nomad 0.10
		// We maintain the driver enabled attribute until all drivers expose
		// their attributes as DriverInfo
		driverName := fmt.Sprintf("driver.%s", name)
		if fingerprint.Detected {
			c.config.Node.Attributes[driverName] = "1"
		} else {
			delete(c.config.Node.Attributes, driverName)
		}
	}

	if health != nil {
		if !hadDriver {
			hasChanged = true
			// The fingerprint branch above may have stored the driver in
			// this call; merge into it in that case, otherwise the health
			// info becomes the driver info.
			if info, ok := c.config.Node.Drivers[name]; !ok {
				c.config.Node.Drivers[name] = health
			} else {
				info.MergeHealthCheck(health)
			}
		} else {
			oldVal := c.config.Node.Drivers[name]
			if health.HealthCheckEquals(oldVal) {
				// Make sure we accurately reflect the last time a health check has been
				// performed for the driver.
				oldVal.UpdateTime = health.UpdateTime
			} else {
				hasChanged = true

				// Only emit an event if the health status has changed after node
				// initial startup (the health description will not get populated until
				// a health check has run; the initial status is equal to whether the
				// node is detected or not).
				if health.Healthy != oldVal.Healthy && health.HealthDescription != "" {
					event := &structs.NodeEvent{
						Subsystem: "Driver",
						Message:   health.HealthDescription,
						Timestamp: time.Now(),
						Details:   map[string]string{"driver": name},
					}
					c.triggerNodeEvent(event)
				}

				// Update the node with the latest information
				c.config.Node.Drivers[name].MergeHealthCheck(health)
			}
		}
	}

	if hasChanged {
		// Record when the driver info last changed and publish the new node.
		c.config.Node.Drivers[name].UpdateTime = time.Now()
		c.updateNodeLocked()
	}

	return c.configCopy.Node
}
  1115  
  1116  // resourcesAreEqual is a temporary function to compare whether resources are
  1117  // equal. We can use this until we change fingerprinters to set pointers on a
  1118  // return type.
  1119  func resourcesAreEqual(first, second *structs.Resources) bool {
  1120  	if first.CPU != second.CPU {
  1121  		return false
  1122  	}
  1123  	if first.MemoryMB != second.MemoryMB {
  1124  		return false
  1125  	}
  1126  	if first.DiskMB != second.DiskMB {
  1127  		return false
  1128  	}
  1129  	if first.IOPS != second.IOPS {
  1130  		return false
  1131  	}
  1132  	if len(first.Networks) != len(second.Networks) {
  1133  		return false
  1134  	}
  1135  	for i, e := range first.Networks {
  1136  		if len(second.Networks) < i {
  1137  			return false
  1138  		}
  1139  		f := second.Networks[i]
  1140  		if !e.Equals(f) {
  1141  			return false
  1142  		}
  1143  	}
  1144  	return true
  1145  }
  1146  
  1147  // retryIntv calculates a retry interval value given the base
  1148  func (c *Client) retryIntv(base time.Duration) time.Duration {
  1149  	if c.config.DevMode {
  1150  		return devModeRetryIntv
  1151  	}
  1152  	return base + lib.RandomStagger(base)
  1153  }
  1154  
  1155  // registerAndHeartbeat is a long lived goroutine used to register the client
  1156  // and then start heartbeating to the server.
  1157  func (c *Client) registerAndHeartbeat() {
  1158  	// Register the node
  1159  	c.retryRegisterNode()
  1160  
  1161  	// Start watching changes for node changes
  1162  	go c.watchNodeUpdates()
  1163  
  1164  	// Start watching for emitting node events
  1165  	go c.watchNodeEvents()
  1166  
  1167  	// Setup the heartbeat timer, for the initial registration
  1168  	// we want to do this quickly. We want to do it extra quickly
  1169  	// in development mode.
  1170  	var heartbeat <-chan time.Time
  1171  	if c.config.DevMode {
  1172  		heartbeat = time.After(0)
  1173  	} else {
  1174  		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1175  	}
  1176  
  1177  	for {
  1178  		select {
  1179  		case <-c.rpcRetryWatcher():
  1180  		case <-heartbeat:
  1181  		case <-c.shutdownCh:
  1182  			return
  1183  		}
  1184  
  1185  		if err := c.updateNodeStatus(); err != nil {
  1186  			// The servers have changed such that this node has not been
  1187  			// registered before
  1188  			if strings.Contains(err.Error(), "node not found") {
  1189  				// Re-register the node
  1190  				c.logger.Printf("[INFO] client: re-registering node")
  1191  				c.retryRegisterNode()
  1192  				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1193  			} else {
  1194  				intv := c.getHeartbeatRetryIntv(err)
  1195  				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
  1196  				heartbeat = time.After(intv)
  1197  
  1198  				// If heartbeating fails, trigger Consul discovery
  1199  				c.triggerDiscovery()
  1200  			}
  1201  		} else {
  1202  			c.heartbeatLock.Lock()
  1203  			heartbeat = time.After(c.heartbeatTTL)
  1204  			c.heartbeatLock.Unlock()
  1205  		}
  1206  	}
  1207  }
  1208  
  1209  // getHeartbeatRetryIntv is used to retrieve the time to wait before attempting
  1210  // another heartbeat.
  1211  func (c *Client) getHeartbeatRetryIntv(err error) time.Duration {
  1212  	if c.config.DevMode {
  1213  		return devModeRetryIntv
  1214  	}
  1215  
  1216  	// Collect the useful heartbeat info
  1217  	c.heartbeatLock.Lock()
  1218  	haveHeartbeated := c.haveHeartbeated
  1219  	last := c.lastHeartbeat
  1220  	ttl := c.heartbeatTTL
  1221  	c.heartbeatLock.Unlock()
  1222  
  1223  	// If we haven't even successfully heartbeated once or there is no leader
  1224  	// treat it as a registration. In the case that there is a leadership loss,
  1225  	// we will have our heartbeat timer reset to a much larger threshold, so
  1226  	// do not put unnecessary pressure on the new leader.
  1227  	if !haveHeartbeated || err == structs.ErrNoLeader {
  1228  		return c.retryIntv(registerRetryIntv)
  1229  	}
  1230  
  1231  	// Determine how much time we have left to heartbeat
  1232  	left := last.Add(ttl).Sub(time.Now())
  1233  
  1234  	// Logic for retrying is:
  1235  	// * Do not retry faster than once a second
  1236  	// * Do not retry less that once every 30 seconds
  1237  	// * If we have missed the heartbeat by more than 30 seconds, start to use
  1238  	// the absolute time since we do not want to retry indefinitely
  1239  	switch {
  1240  	case left < -30*time.Second:
  1241  		// Make left the absolute value so we delay and jitter properly.
  1242  		left *= -1
  1243  	case left < 0:
  1244  		return time.Second + lib.RandomStagger(time.Second)
  1245  	default:
  1246  	}
  1247  
  1248  	stagger := lib.RandomStagger(left)
  1249  	switch {
  1250  	case stagger < time.Second:
  1251  		return time.Second + lib.RandomStagger(time.Second)
  1252  	case stagger > 30*time.Second:
  1253  		return 25*time.Second + lib.RandomStagger(5*time.Second)
  1254  	default:
  1255  		return stagger
  1256  	}
  1257  }
  1258  
  1259  // periodicSnapshot is a long lived goroutine used to periodically snapshot the
  1260  // state of the client
  1261  func (c *Client) periodicSnapshot() {
  1262  	// Create a snapshot timer
  1263  	snapshot := time.After(stateSnapshotIntv)
  1264  
  1265  	for {
  1266  		select {
  1267  		case <-snapshot:
  1268  			snapshot = time.After(stateSnapshotIntv)
  1269  			if err := c.saveState(); err != nil {
  1270  				c.logger.Printf("[ERR] client: failed to save state: %v", err)
  1271  			}
  1272  
  1273  		case <-c.shutdownCh:
  1274  			return
  1275  		}
  1276  	}
  1277  }
  1278  
  1279  // run is a long lived goroutine used to run the client
  1280  func (c *Client) run() {
  1281  	// Watch for changes in allocations
  1282  	allocUpdates := make(chan *allocUpdates, 8)
  1283  	go c.watchAllocations(allocUpdates)
  1284  
  1285  	for {
  1286  		select {
  1287  		case update := <-allocUpdates:
  1288  			c.runAllocs(update)
  1289  
  1290  		case <-c.shutdownCh:
  1291  			return
  1292  		}
  1293  	}
  1294  }
  1295  
  1296  // submitNodeEvents is used to submit a client-side node event. Examples of
  1297  // these kinds of events include when a driver moves from healthy to unhealthy
  1298  // (and vice versa)
  1299  func (c *Client) submitNodeEvents(events []*structs.NodeEvent) error {
  1300  	nodeID := c.NodeID()
  1301  	nodeEvents := map[string][]*structs.NodeEvent{
  1302  		nodeID: events,
  1303  	}
  1304  	req := structs.EmitNodeEventsRequest{
  1305  		NodeEvents:   nodeEvents,
  1306  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1307  	}
  1308  	var resp structs.EmitNodeEventsResponse
  1309  	if err := c.RPC("Node.EmitEvents", &req, &resp); err != nil {
  1310  		return fmt.Errorf("Emitting node events failed: %v", err)
  1311  	}
  1312  	return nil
  1313  }
  1314  
  1315  // watchNodeEvents is a handler which receives node events and on a interval
  1316  // and submits them in batch format to the server
  1317  func (c *Client) watchNodeEvents() {
  1318  	// batchEvents stores events that have yet to be published
  1319  	var batchEvents []*structs.NodeEvent
  1320  
  1321  	// Create and drain the timer
  1322  	timer := time.NewTimer(0)
  1323  	timer.Stop()
  1324  	select {
  1325  	case <-timer.C:
  1326  	default:
  1327  	}
  1328  	defer timer.Stop()
  1329  
  1330  	for {
  1331  		select {
  1332  		case event := <-c.triggerEmitNodeEvent:
  1333  			if l := len(batchEvents); l <= structs.MaxRetainedNodeEvents {
  1334  				batchEvents = append(batchEvents, event)
  1335  			} else {
  1336  				// Drop the oldest event
  1337  				c.logger.Printf("[WARN] client: dropping node event: %v", batchEvents[0])
  1338  				batchEvents = append(batchEvents[1:], event)
  1339  			}
  1340  			timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1341  		case <-timer.C:
  1342  			if err := c.submitNodeEvents(batchEvents); err != nil {
  1343  				c.logger.Printf("[ERR] client: submitting node events failed: %v", err)
  1344  				timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1345  			} else {
  1346  				// Reset the events since we successfully sent them.
  1347  				batchEvents = []*structs.NodeEvent{}
  1348  			}
  1349  		case <-c.shutdownCh:
  1350  			return
  1351  		}
  1352  	}
  1353  }
  1354  
  1355  // triggerNodeEvent triggers a emit node event
  1356  func (c *Client) triggerNodeEvent(nodeEvent *structs.NodeEvent) {
  1357  	select {
  1358  	case c.triggerEmitNodeEvent <- nodeEvent:
  1359  		// emit node event goroutine was released to execute
  1360  	default:
  1361  		// emit node event goroutine was already running
  1362  	}
  1363  }
  1364  
  1365  // retryRegisterNode is used to register the node or update the registration and
  1366  // retry in case of failure.
  1367  func (c *Client) retryRegisterNode() {
  1368  	for {
  1369  		err := c.registerNode()
  1370  		if err == nil {
  1371  			// Registered!
  1372  			return
  1373  		}
  1374  
  1375  		if err == noServersErr {
  1376  			c.logger.Print("[DEBUG] client: registration waiting on servers")
  1377  			c.triggerDiscovery()
  1378  		} else {
  1379  			c.logger.Printf("[ERR] client: registration failure: %v", err)
  1380  		}
  1381  		select {
  1382  		case <-c.rpcRetryWatcher():
  1383  		case <-time.After(c.retryIntv(registerRetryIntv)):
  1384  		case <-c.shutdownCh:
  1385  			return
  1386  		}
  1387  	}
  1388  }
  1389  
  1390  // registerNode is used to register the node or update the registration
  1391  func (c *Client) registerNode() error {
  1392  	node := c.Node()
  1393  	req := structs.NodeRegisterRequest{
  1394  		Node:         node,
  1395  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1396  	}
  1397  	var resp structs.NodeUpdateResponse
  1398  	if err := c.RPC("Node.Register", &req, &resp); err != nil {
  1399  		return err
  1400  	}
  1401  
  1402  	// Update the node status to ready after we register.
  1403  	c.configLock.Lock()
  1404  	node.Status = structs.NodeStatusReady
  1405  	c.config.Node.Status = structs.NodeStatusReady
  1406  	c.configLock.Unlock()
  1407  
  1408  	c.logger.Printf("[INFO] client: node registration complete")
  1409  	if len(resp.EvalIDs) != 0 {
  1410  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs))
  1411  	}
  1412  
  1413  	c.heartbeatLock.Lock()
  1414  	defer c.heartbeatLock.Unlock()
  1415  	c.lastHeartbeat = time.Now()
  1416  	c.heartbeatTTL = resp.HeartbeatTTL
  1417  	return nil
  1418  }
  1419  
  1420  // updateNodeStatus is used to heartbeat and update the status of the node
  1421  func (c *Client) updateNodeStatus() error {
  1422  	start := time.Now()
  1423  	req := structs.NodeUpdateStatusRequest{
  1424  		NodeID:       c.NodeID(),
  1425  		Status:       structs.NodeStatusReady,
  1426  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1427  	}
  1428  	var resp structs.NodeUpdateResponse
  1429  	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
  1430  		c.triggerDiscovery()
  1431  		return fmt.Errorf("failed to update status: %v", err)
  1432  	}
  1433  	end := time.Now()
  1434  
  1435  	if len(resp.EvalIDs) != 0 {
  1436  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs))
  1437  	}
  1438  
  1439  	// Update the last heartbeat and the new TTL, capturing the old values
  1440  	c.heartbeatLock.Lock()
  1441  	last := c.lastHeartbeat
  1442  	oldTTL := c.heartbeatTTL
  1443  	haveHeartbeated := c.haveHeartbeated
  1444  	c.lastHeartbeat = time.Now()
  1445  	c.heartbeatTTL = resp.HeartbeatTTL
  1446  	c.haveHeartbeated = true
  1447  	c.heartbeatLock.Unlock()
  1448  	c.logger.Printf("[TRACE] client: next heartbeat in %v", resp.HeartbeatTTL)
  1449  
  1450  	if resp.Index != 0 {
  1451  		c.logger.Printf("[DEBUG] client: state updated to %s", req.Status)
  1452  
  1453  		// We have potentially missed our TTL log how delayed we were
  1454  		if haveHeartbeated {
  1455  			c.logger.Printf("[WARN] client: heartbeat missed (request took %v). Heartbeat TTL was %v and heartbeated after %v",
  1456  				end.Sub(start), oldTTL, time.Since(last))
  1457  		}
  1458  	}
  1459  
  1460  	// Update the number of nodes in the cluster so we can adjust our server
  1461  	// rebalance rate.
  1462  	c.servers.SetNumNodes(resp.NumNodes)
  1463  
  1464  	// Convert []*NodeServerInfo to []*servers.Server
  1465  	nomadServers := make([]*servers.Server, 0, len(resp.Servers))
  1466  	for _, s := range resp.Servers {
  1467  		addr, err := resolveServer(s.RPCAdvertiseAddr)
  1468  		if err != nil {
  1469  			c.logger.Printf("[WARN] client: ignoring invalid server %q: %v", s.RPCAdvertiseAddr, err)
  1470  			continue
  1471  		}
  1472  		e := &servers.Server{DC: s.Datacenter, Addr: addr}
  1473  		nomadServers = append(nomadServers, e)
  1474  	}
  1475  	if len(nomadServers) == 0 {
  1476  		return fmt.Errorf("heartbeat response returned no valid servers")
  1477  	}
  1478  	c.servers.SetServers(nomadServers)
  1479  
  1480  	// Begin polling Consul if there is no Nomad leader.  We could be
  1481  	// heartbeating to a Nomad server that is in the minority of a
  1482  	// partition of the Nomad server quorum, but this Nomad Agent still
  1483  	// has connectivity to the existing majority of Nomad Servers, but
  1484  	// only if it queries Consul.
  1485  	if resp.LeaderRPCAddr == "" {
  1486  		c.triggerDiscovery()
  1487  	}
  1488  
  1489  	return nil
  1490  }
  1491  
  1492  // updateAllocStatus is used to update the status of an allocation
  1493  func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
  1494  	if alloc.Terminated() {
  1495  		// Terminated, mark for GC if we're still tracking this alloc
  1496  		// runner. If it's not being tracked that means the server has
  1497  		// already GC'd it (see removeAlloc).
  1498  		c.allocLock.RLock()
  1499  		ar, ok := c.allocs[alloc.ID]
  1500  		c.allocLock.RUnlock()
  1501  
  1502  		if ok {
  1503  			c.garbageCollector.MarkForCollection(ar)
  1504  
  1505  			// Trigger a GC in case we're over thresholds and just
  1506  			// waiting for eligible allocs.
  1507  			c.garbageCollector.Trigger()
  1508  		}
  1509  	}
  1510  
  1511  	// Strip all the information that can be reconstructed at the server.  Only
  1512  	// send the fields that are updatable by the client.
  1513  	stripped := new(structs.Allocation)
  1514  	stripped.ID = alloc.ID
  1515  	stripped.NodeID = c.NodeID()
  1516  	stripped.TaskStates = alloc.TaskStates
  1517  	stripped.ClientStatus = alloc.ClientStatus
  1518  	stripped.ClientDescription = alloc.ClientDescription
  1519  	stripped.DeploymentStatus = alloc.DeploymentStatus
  1520  
  1521  	select {
  1522  	case c.allocUpdates <- stripped:
  1523  	case <-c.shutdownCh:
  1524  	}
  1525  }
  1526  
  1527  // allocSync is a long lived function that batches allocation updates to the
  1528  // server.
  1529  func (c *Client) allocSync() {
  1530  	staggered := false
  1531  	syncTicker := time.NewTicker(allocSyncIntv)
  1532  	updates := make(map[string]*structs.Allocation)
  1533  	for {
  1534  		select {
  1535  		case <-c.shutdownCh:
  1536  			syncTicker.Stop()
  1537  			return
  1538  		case alloc := <-c.allocUpdates:
  1539  			// Batch the allocation updates until the timer triggers.
  1540  			updates[alloc.ID] = alloc
  1541  		case <-syncTicker.C:
  1542  			// Fast path if there are no updates
  1543  			if len(updates) == 0 {
  1544  				continue
  1545  			}
  1546  
  1547  			sync := make([]*structs.Allocation, 0, len(updates))
  1548  			for _, alloc := range updates {
  1549  				sync = append(sync, alloc)
  1550  			}
  1551  
  1552  			// Send to server.
  1553  			args := structs.AllocUpdateRequest{
  1554  				Alloc:        sync,
  1555  				WriteRequest: structs.WriteRequest{Region: c.Region()},
  1556  			}
  1557  
  1558  			var resp structs.GenericResponse
  1559  			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
  1560  				c.logger.Printf("[ERR] client: failed to update allocations: %v", err)
  1561  				syncTicker.Stop()
  1562  				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
  1563  				staggered = true
  1564  			} else {
  1565  				updates = make(map[string]*structs.Allocation)
  1566  				if staggered {
  1567  					syncTicker.Stop()
  1568  					syncTicker = time.NewTicker(allocSyncIntv)
  1569  					staggered = false
  1570  				}
  1571  			}
  1572  		}
  1573  	}
  1574  }
  1575  
// allocUpdates holds the results of receiving updated allocations from the
// servers.
type allocUpdates struct {
	// pulled is the set of allocations that were downloaded from the servers,
	// keyed by allocation ID.
	pulled map[string]*structs.Allocation

	// filtered is the set of allocation IDs that were not pulled because their
	// AllocModifyIndex didn't change.
	filtered map[string]struct{}

	// migrateTokens is a map of tokens necessary for when clients pull data
	// from authorized volumes.
	// NOTE(review): presumably keyed by allocation ID — confirm against the
	// server response.
	migrateTokens map[string]string
}
  1590  
// watchAllocations is used to scan for updates to allocations. It runs until
// the client shuts down and works in two steps per iteration: it first asks
// the servers for the map of this node's allocation IDs to their modify
// index, then downloads in full only those allocations that are new or need
// updating. The result of every iteration is sent on the updates channel.
func (c *Client) watchAllocations(updates chan *allocUpdates) {
	// The request and response for getting the map of allocations that should
	// be running on the Node to their AllocModifyIndex which is incremented
	// when the allocation is updated by the servers.
	req := structs.NodeSpecificRequest{
		NodeID:   c.NodeID(),
		SecretID: c.secretNodeID(),
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}
	var resp structs.NodeClientAllocsResponse

	// The request and response for pulling down the set of allocations that are
	// new, or updated server side.
	allocsReq := structs.AllocsGetRequest{
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}
	var allocsResp structs.AllocsGetResponse

OUTER:
	for {
		// Get the allocation modify index map, blocking for updates. We will
		// use this to determine exactly what allocations need to be downloaded
		// in full.
		resp = structs.NodeClientAllocsResponse{}
		err := c.RPC("Node.GetClientAllocs", &req, &resp)
		if err != nil {
			// Shutdown often causes EOF errors, so check for shutdown first
			select {
			case <-c.shutdownCh:
				return
			default:
			}

			// COMPAT: Remove in 0.6. This is to allow the case in which the
			// servers are not fully upgraded before the clients register. This
			// can cause the SecretID to be lost
			if strings.Contains(err.Error(), "node secret ID does not match") {
				c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err)
				c.retryRegisterNode()
			} else if err != noServersErr {
				c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err)
			}

			// Wait for a retry interval or an RPC retry signal before
			// starting the next iteration of the outer loop.
			retry := c.retryIntv(getAllocRetryIntv)
			select {
			case <-c.rpcRetryWatcher():
				continue
			case <-time.After(retry):
				continue
			case <-c.shutdownCh:
				return
			}
		}

		// Check for shutdown
		select {
		case <-c.shutdownCh:
			return
		default:
		}

		// Filter all allocations whose AllocModifyIndex was not incremented.
		// These are the allocations who have either not been updated, or whose
		// updates are a result of the client sending an update for the alloc.
		// This lets us reduce the network traffic to the server as we don't
		// need to pull all the allocations.
		var pull []string
		filtered := make(map[string]struct{})
		runners := c.getAllocRunners()
		var pullIndex uint64
		for allocID, modifyIndex := range resp.Allocs {
			// Pull the allocation if we don't have an alloc runner for the
			// allocation or if the alloc runner requires an updated allocation.
			runner, ok := runners[allocID]

			if !ok || runner.shouldUpdate(modifyIndex) {
				// Only pull allocs that are required. Filtered
				// allocs might be at a higher index, so ignore
				// it.
				if modifyIndex > pullIndex {
					pullIndex = modifyIndex
				}
				pull = append(pull, allocID)
			} else {
				filtered[allocID] = struct{}{}
			}
		}

		// Pull the allocations that passed filtering.
		allocsResp.Allocs = nil
		var pulledAllocs map[string]*structs.Allocation
		if len(pull) != 0 {
			// Pull the allocations that need to be updated.
			allocsReq.AllocIDs = pull
			// NOTE(review): pullIndex is unsigned, so this wraps around if
			// every pulled allocation reports a modify index of 0 — confirm
			// the servers never return 0 here.
			allocsReq.MinQueryIndex = pullIndex - 1
			allocsResp = structs.AllocsGetResponse{}
			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
				c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err)
				// As above, wait and then restart the outer loop.
				retry := c.retryIntv(getAllocRetryIntv)
				select {
				case <-c.rpcRetryWatcher():
					continue
				case <-time.After(retry):
					continue
				case <-c.shutdownCh:
					return
				}
			}

			// Ensure that we received all the allocations we wanted
			pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs))
			for _, alloc := range allocsResp.Allocs {
				pulledAllocs[alloc.ID] = alloc
			}

			for _, desiredID := range pull {
				if _, ok := pulledAllocs[desiredID]; !ok {
					// We didn't get everything we wanted. Do not update the
					// MinQueryIndex, sleep and then retry.
					wait := c.retryIntv(2 * time.Second)
					select {
					case <-time.After(wait):
						// Wait for the server we contact to receive the
						// allocations
						continue OUTER
					case <-c.shutdownCh:
						return
					}
				}
			}

			// Check for shutdown
			select {
			case <-c.shutdownCh:
				return
			default:
			}
		}

		c.logger.Printf("[DEBUG] client: updated allocations at index %d (total %d) (pulled %d) (filtered %d)",
			resp.Index, len(resp.Allocs), len(allocsResp.Allocs), len(filtered))

		// Update the query index, never moving it backwards.
		if resp.Index > req.MinQueryIndex {
			req.MinQueryIndex = resp.Index
		}

		// Push the updates.
		update := &allocUpdates{
			filtered:      filtered,
			pulled:        pulledAllocs,
			migrateTokens: resp.MigrateTokens,
		}
		select {
		case updates <- update:
		case <-c.shutdownCh:
			return
		}
	}
}
  1757  
  1758  // updateNode updates the Node copy and triggers the client to send the updated
  1759  // Node to the server. This should be done while the caller holds the
  1760  // configLock lock.
  1761  func (c *Client) updateNodeLocked() {
  1762  	// Update the config copy.
  1763  	node := c.config.Node.Copy()
  1764  	c.configCopy.Node = node
  1765  
  1766  	select {
  1767  	case c.triggerNodeUpdate <- struct{}{}:
  1768  		// Node update goroutine was released to execute
  1769  	default:
  1770  		// Node update goroutine was already running
  1771  	}
  1772  }
  1773  
  1774  // watchNodeUpdates blocks until it is edge triggered. Once triggered,
  1775  // it will update the client node copy and re-register the node.
  1776  func (c *Client) watchNodeUpdates() {
  1777  	var hasChanged bool
  1778  	timer := time.NewTimer(c.retryIntv(nodeUpdateRetryIntv))
  1779  	defer timer.Stop()
  1780  
  1781  	for {
  1782  		select {
  1783  		case <-timer.C:
  1784  			c.logger.Printf("[DEBUG] client: state changed, updating node and re-registering.")
  1785  			c.retryRegisterNode()
  1786  			hasChanged = false
  1787  		case <-c.triggerNodeUpdate:
  1788  			if hasChanged {
  1789  				continue
  1790  			}
  1791  			hasChanged = true
  1792  			timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
  1793  		case <-c.shutdownCh:
  1794  			return
  1795  		}
  1796  	}
  1797  }
  1798  
  1799  // runAllocs is invoked when we get an updated set of allocations
  1800  func (c *Client) runAllocs(update *allocUpdates) {
  1801  	// Get the existing allocs
  1802  	c.allocLock.RLock()
  1803  	exist := make([]*structs.Allocation, 0, len(c.allocs))
  1804  	for _, ar := range c.allocs {
  1805  		exist = append(exist, ar.alloc)
  1806  	}
  1807  	c.allocLock.RUnlock()
  1808  
  1809  	// Diff the existing and updated allocations
  1810  	diff := diffAllocs(exist, update)
  1811  	c.logger.Printf("[DEBUG] client: %#v", diff)
  1812  
  1813  	// Remove the old allocations
  1814  	for _, remove := range diff.removed {
  1815  		c.removeAlloc(remove)
  1816  	}
  1817  
  1818  	// Update the existing allocations
  1819  	for _, update := range diff.updated {
  1820  		if err := c.updateAlloc(update.exist, update.updated); err != nil {
  1821  			c.logger.Printf("[ERR] client: failed to update alloc %q: %v",
  1822  				update.exist.ID, err)
  1823  		}
  1824  	}
  1825  
  1826  	// Make room for new allocations before running
  1827  	if err := c.garbageCollector.MakeRoomFor(diff.added); err != nil {
  1828  		c.logger.Printf("[ERR] client: error making room for new allocations: %v", err)
  1829  	}
  1830  
  1831  	// Start the new allocations
  1832  	for _, add := range diff.added {
  1833  		migrateToken := update.migrateTokens[add.ID]
  1834  		if err := c.addAlloc(add, migrateToken); err != nil {
  1835  			c.logger.Printf("[ERR] client: failed to add alloc '%s': %v",
  1836  				add.ID, err)
  1837  		}
  1838  	}
  1839  
  1840  	// Trigger the GC once more now that new allocs are started that could
  1841  	// have caused thresholds to be exceeded
  1842  	c.garbageCollector.Trigger()
  1843  }
  1844  
  1845  // removeAlloc is invoked when we should remove an allocation because it has
  1846  // been removed by the server.
  1847  func (c *Client) removeAlloc(alloc *structs.Allocation) {
  1848  	c.allocLock.Lock()
  1849  	ar, ok := c.allocs[alloc.ID]
  1850  	if !ok {
  1851  		c.allocLock.Unlock()
  1852  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID)
  1853  		return
  1854  	}
  1855  
  1856  	// Stop tracking alloc runner as it's been GC'd by the server
  1857  	delete(c.allocs, alloc.ID)
  1858  	c.allocLock.Unlock()
  1859  
  1860  	// Ensure the GC has a reference and then collect. Collecting through the GC
  1861  	// applies rate limiting
  1862  	c.garbageCollector.MarkForCollection(ar)
  1863  
  1864  	// GC immediately since the server has GC'd it
  1865  	go c.garbageCollector.Collect(alloc.ID)
  1866  }
  1867  
  1868  // updateAlloc is invoked when we should update an allocation
  1869  func (c *Client) updateAlloc(exist, update *structs.Allocation) error {
  1870  	c.allocLock.RLock()
  1871  	ar, ok := c.allocs[exist.ID]
  1872  	c.allocLock.RUnlock()
  1873  	if !ok {
  1874  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID)
  1875  		return nil
  1876  	}
  1877  
  1878  	ar.Update(update)
  1879  	return nil
  1880  }
  1881  
  1882  // addAlloc is invoked when we should add an allocation
  1883  func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error {
  1884  	// Check if we already have an alloc runner
  1885  	c.allocLock.Lock()
  1886  	defer c.allocLock.Unlock()
  1887  	if _, ok := c.allocs[alloc.ID]; ok {
  1888  		c.logger.Printf("[DEBUG]: client: dropping duplicate add allocation request: %q", alloc.ID)
  1889  		return nil
  1890  	}
  1891  
  1892  	// get the previous alloc runner - if one exists - for the
  1893  	// blocking/migrating watcher
  1894  	var prevAR *AllocRunner
  1895  	if alloc.PreviousAllocation != "" {
  1896  		prevAR = c.allocs[alloc.PreviousAllocation]
  1897  	}
  1898  
  1899  	c.configLock.RLock()
  1900  	prevAlloc := newAllocWatcher(alloc, prevAR, c, c.configCopy, c.logger, migrateToken)
  1901  
  1902  	// Copy the config since the node can be swapped out as it is being updated.
  1903  	// The long term fix is to pass in the config and node separately and then
  1904  	// we don't have to do a copy.
  1905  	ar := NewAllocRunner(c.logger, c.configCopy.Copy(), c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, prevAlloc)
  1906  	c.configLock.RUnlock()
  1907  
  1908  	// Store the alloc runner.
  1909  	c.allocs[alloc.ID] = ar
  1910  
  1911  	if err := ar.SaveState(); err != nil {
  1912  		c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", alloc.ID, err)
  1913  	}
  1914  
  1915  	go ar.Run()
  1916  	return nil
  1917  }
  1918  
  1919  // setupVaultClient creates an object to periodically renew tokens and secrets
  1920  // with vault.
  1921  func (c *Client) setupVaultClient() error {
  1922  	var err error
  1923  	c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
  1924  	if err != nil {
  1925  		return err
  1926  	}
  1927  
  1928  	if c.vaultClient == nil {
  1929  		c.logger.Printf("[ERR] client: failed to create vault client")
  1930  		return fmt.Errorf("failed to create vault client")
  1931  	}
  1932  
  1933  	// Start renewing tokens and secrets
  1934  	c.vaultClient.Start()
  1935  
  1936  	return nil
  1937  }
  1938  
  1939  // deriveToken takes in an allocation and a set of tasks and derives vault
  1940  // tokens for each of the tasks, unwraps all of them using the supplied vault
  1941  // client and returns a map of unwrapped tokens, indexed by the task name.
  1942  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  1943  	if alloc == nil {
  1944  		return nil, fmt.Errorf("nil allocation")
  1945  	}
  1946  
  1947  	if taskNames == nil || len(taskNames) == 0 {
  1948  		return nil, fmt.Errorf("missing task names")
  1949  	}
  1950  
  1951  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1952  	if group == nil {
  1953  		return nil, fmt.Errorf("group name in allocation is not present in job")
  1954  	}
  1955  
  1956  	verifiedTasks := []string{}
  1957  	// Check if the given task names actually exist in the allocation
  1958  	for _, taskName := range taskNames {
  1959  		found := false
  1960  		for _, task := range group.Tasks {
  1961  			if task.Name == taskName {
  1962  				found = true
  1963  			}
  1964  		}
  1965  		if !found {
  1966  			c.logger.Printf("[ERR] task %q not found in the allocation", taskName)
  1967  			return nil, fmt.Errorf("task %q not found in the allocation", taskName)
  1968  		}
  1969  		verifiedTasks = append(verifiedTasks, taskName)
  1970  	}
  1971  
  1972  	// DeriveVaultToken of nomad server can take in a set of tasks and
  1973  	// creates tokens for all the tasks.
  1974  	req := &structs.DeriveVaultTokenRequest{
  1975  		NodeID:   c.NodeID(),
  1976  		SecretID: c.secretNodeID(),
  1977  		AllocID:  alloc.ID,
  1978  		Tasks:    verifiedTasks,
  1979  		QueryOptions: structs.QueryOptions{
  1980  			Region:     c.Region(),
  1981  			AllowStale: false,
  1982  		},
  1983  	}
  1984  
  1985  	// Derive the tokens
  1986  	var resp structs.DeriveVaultTokenResponse
  1987  	if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil {
  1988  		c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err)
  1989  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  1990  	}
  1991  	if resp.Error != nil {
  1992  		c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error)
  1993  		return nil, structs.NewWrappedServerError(resp.Error)
  1994  	}
  1995  	if resp.Tasks == nil {
  1996  		c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response")
  1997  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  1998  	}
  1999  
  2000  	unwrappedTokens := make(map[string]string)
  2001  
  2002  	// Retrieve the wrapped tokens from the response and unwrap it
  2003  	for _, taskName := range verifiedTasks {
  2004  		// Get the wrapped token
  2005  		wrappedToken, ok := resp.Tasks[taskName]
  2006  		if !ok {
  2007  			c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName)
  2008  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  2009  		}
  2010  
  2011  		// Unwrap the vault token
  2012  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  2013  		if err != nil {
  2014  			if structs.VaultUnrecoverableError.MatchString(err.Error()) {
  2015  				return nil, err
  2016  			}
  2017  
  2018  			// The error is recoverable
  2019  			return nil, structs.NewRecoverableError(
  2020  				fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err), true)
  2021  		}
  2022  
  2023  		// Validate the response
  2024  		var validationErr error
  2025  		if unwrapResp == nil {
  2026  			validationErr = fmt.Errorf("Vault returned nil secret when unwrapping")
  2027  		} else if unwrapResp.Auth == nil {
  2028  			validationErr = fmt.Errorf("Vault returned unwrap secret with nil Auth. Secret warnings: %v", unwrapResp.Warnings)
  2029  		} else if unwrapResp.Auth.ClientToken == "" {
  2030  			validationErr = fmt.Errorf("Vault returned unwrap secret with empty Auth.ClientToken. Secret warnings: %v", unwrapResp.Warnings)
  2031  		}
  2032  		if validationErr != nil {
  2033  			c.logger.Printf("[WARN] client.vault: failed to unwrap token: %v", err)
  2034  			return nil, structs.NewRecoverableError(validationErr, true)
  2035  		}
  2036  
  2037  		// Append the unwrapped token to the return value
  2038  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  2039  	}
  2040  
  2041  	return unwrappedTokens, nil
  2042  }
  2043  
  2044  // triggerDiscovery causes a Consul discovery to begin (if one hasn't already)
  2045  func (c *Client) triggerDiscovery() {
  2046  	select {
  2047  	case c.triggerDiscoveryCh <- struct{}{}:
  2048  		// Discovery goroutine was released to execute
  2049  	default:
  2050  		// Discovery goroutine was already running
  2051  	}
  2052  }
  2053  
  2054  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  2055  // It's intended to be started in a goroutine. See triggerDiscovery() for
  2056  // causing consul discovery from other code locations.
  2057  func (c *Client) consulDiscovery() {
  2058  	for {
  2059  		select {
  2060  		case <-c.triggerDiscoveryCh:
  2061  			if err := c.consulDiscoveryImpl(); err != nil {
  2062  				c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err)
  2063  			}
  2064  		case <-c.shutdownCh:
  2065  			return
  2066  		}
  2067  	}
  2068  }
  2069  
  2070  func (c *Client) consulDiscoveryImpl() error {
  2071  	// Acquire heartbeat lock to prevent heartbeat from running
  2072  	// concurrently with discovery. Concurrent execution is safe, however
  2073  	// discovery is usually triggered when heartbeating has failed so
  2074  	// there's no point in allowing it.
  2075  	c.heartbeatLock.Lock()
  2076  	defer c.heartbeatLock.Unlock()
  2077  
  2078  	dcs, err := c.consulCatalog.Datacenters()
  2079  	if err != nil {
  2080  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  2081  	}
  2082  	if len(dcs) > 2 {
  2083  		// Query the local DC first, then shuffle the
  2084  		// remaining DCs.  Future heartbeats will cause Nomad
  2085  		// Clients to fixate on their local datacenter so
  2086  		// it's okay to talk with remote DCs.  If the no
  2087  		// Nomad servers are available within
  2088  		// datacenterQueryLimit, the next heartbeat will pick
  2089  		// a new set of servers so it's okay.
  2090  		shuffleStrings(dcs[1:])
  2091  		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
  2092  	}
  2093  
  2094  	// Query for servers in this client's region only
  2095  	region := c.Region()
  2096  	rpcargs := structs.GenericRequest{
  2097  		QueryOptions: structs.QueryOptions{
  2098  			Region: region,
  2099  		},
  2100  	}
  2101  
  2102  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  2103  	var mErr multierror.Error
  2104  	var nomadServers servers.Servers
  2105  	c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs)
  2106  DISCOLOOP:
  2107  	for _, dc := range dcs {
  2108  		consulOpts := &consulapi.QueryOptions{
  2109  			AllowStale: true,
  2110  			Datacenter: dc,
  2111  			Near:       "_agent",
  2112  			WaitTime:   consul.DefaultQueryWaitDuration,
  2113  		}
  2114  		consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  2115  		if err != nil {
  2116  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  2117  			continue
  2118  		}
  2119  
  2120  		for _, s := range consulServices {
  2121  			port := strconv.Itoa(s.ServicePort)
  2122  			addrstr := s.ServiceAddress
  2123  			if addrstr == "" {
  2124  				addrstr = s.Address
  2125  			}
  2126  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  2127  			if err != nil {
  2128  				mErr.Errors = append(mErr.Errors, err)
  2129  				continue
  2130  			}
  2131  			var peers []string
  2132  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  2133  				mErr.Errors = append(mErr.Errors, err)
  2134  				continue
  2135  			}
  2136  
  2137  			// Successfully received the Server peers list of the correct
  2138  			// region
  2139  			for _, p := range peers {
  2140  				addr, err := net.ResolveTCPAddr("tcp", p)
  2141  				if err != nil {
  2142  					mErr.Errors = append(mErr.Errors, err)
  2143  				}
  2144  				srv := &servers.Server{Addr: addr}
  2145  				nomadServers = append(nomadServers, srv)
  2146  			}
  2147  			if len(nomadServers) > 0 {
  2148  				break DISCOLOOP
  2149  			}
  2150  		}
  2151  	}
  2152  	if len(nomadServers) == 0 {
  2153  		if len(mErr.Errors) > 0 {
  2154  			return mErr.ErrorOrNil()
  2155  		}
  2156  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  2157  	}
  2158  
  2159  	c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", nomadServers)
  2160  
  2161  	// Fire the retry trigger if we have updated the set of servers.
  2162  	if c.servers.SetServers(nomadServers) {
  2163  		// Start rebalancing
  2164  		c.servers.RebalanceServers()
  2165  
  2166  		// Notify waiting rpc calls. If a goroutine just failed an RPC call and
  2167  		// isn't receiving on this chan yet they'll still retry eventually.
  2168  		// This is a shortcircuit for the longer retry intervals.
  2169  		c.fireRpcRetryWatcher()
  2170  	}
  2171  
  2172  	return nil
  2173  }
  2174  
  2175  // emitStats collects host resource usage stats periodically
  2176  func (c *Client) emitStats() {
  2177  	// Assign labels directly before emitting stats so the information expected
  2178  	// is ready
  2179  	c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}}
  2180  
  2181  	// Start collecting host stats right away and then keep collecting every
  2182  	// collection interval
  2183  	next := time.NewTimer(0)
  2184  	defer next.Stop()
  2185  	for {
  2186  		select {
  2187  		case <-next.C:
  2188  			err := c.hostStatsCollector.Collect()
  2189  			next.Reset(c.config.StatsCollectionInterval)
  2190  			if err != nil {
  2191  				c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err)
  2192  				continue
  2193  			}
  2194  
  2195  			// Publish Node metrics if operator has opted in
  2196  			if c.config.PublishNodeMetrics {
  2197  				c.emitHostStats()
  2198  			}
  2199  
  2200  			c.emitClientMetrics()
  2201  		case <-c.shutdownCh:
  2202  			return
  2203  		}
  2204  	}
  2205  }
  2206  
  2207  // setGaugeForMemoryStats proxies metrics for memory specific statistics
  2208  func (c *Client) setGaugeForMemoryStats(nodeID string, hStats *stats.HostStats) {
  2209  	if !c.config.DisableTaggedMetrics {
  2210  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "total"}, float32(hStats.Memory.Total), c.baseLabels)
  2211  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "available"}, float32(hStats.Memory.Available), c.baseLabels)
  2212  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "used"}, float32(hStats.Memory.Used), c.baseLabels)
  2213  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "free"}, float32(hStats.Memory.Free), c.baseLabels)
  2214  	}
  2215  
  2216  	if c.config.BackwardsCompatibleMetrics {
  2217  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  2218  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  2219  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  2220  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  2221  	}
  2222  }
  2223  
  2224  // setGaugeForCPUStats proxies metrics for CPU specific statistics
  2225  func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats) {
  2226  	for _, cpu := range hStats.CPU {
  2227  		if !c.config.DisableTaggedMetrics {
  2228  			labels := append(c.baseLabels, metrics.Label{
  2229  				Name:  "cpu",
  2230  				Value: cpu.CPU,
  2231  			})
  2232  
  2233  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels)
  2234  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels)
  2235  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels)
  2236  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels)
  2237  		}
  2238  
  2239  		if c.config.BackwardsCompatibleMetrics {
  2240  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  2241  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  2242  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2243  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2244  		}
  2245  	}
  2246  }
  2247  
  2248  // setGaugeForDiskStats proxies metrics for disk specific statistics
  2249  func (c *Client) setGaugeForDiskStats(nodeID string, hStats *stats.HostStats) {
  2250  	for _, disk := range hStats.DiskStats {
  2251  		if !c.config.DisableTaggedMetrics {
  2252  			labels := append(c.baseLabels, metrics.Label{
  2253  				Name:  "disk",
  2254  				Value: disk.Device,
  2255  			})
  2256  
  2257  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "size"}, float32(disk.Size), labels)
  2258  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used"}, float32(disk.Used), labels)
  2259  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "available"}, float32(disk.Available), labels)
  2260  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used_percent"}, float32(disk.UsedPercent), labels)
  2261  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "inodes_percent"}, float32(disk.InodesUsedPercent), labels)
  2262  		}
  2263  
  2264  		if c.config.BackwardsCompatibleMetrics {
  2265  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2266  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2267  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2268  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2269  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2270  		}
  2271  	}
  2272  }
  2273  
  2274  // setGaugeForAllocationStats proxies metrics for allocation specific statistics
  2275  func (c *Client) setGaugeForAllocationStats(nodeID string) {
  2276  	c.configLock.RLock()
  2277  	node := c.configCopy.Node
  2278  	c.configLock.RUnlock()
  2279  	total := node.Resources
  2280  	res := node.Reserved
  2281  	allocated := c.getAllocatedResources(node)
  2282  
  2283  	// Emit allocated
  2284  	if !c.config.DisableTaggedMetrics {
  2285  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "memory"}, float32(allocated.MemoryMB), c.baseLabels)
  2286  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "disk"}, float32(allocated.DiskMB), c.baseLabels)
  2287  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "cpu"}, float32(allocated.CPU), c.baseLabels)
  2288  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "iops"}, float32(allocated.IOPS), c.baseLabels)
  2289  	}
  2290  
  2291  	if c.config.BackwardsCompatibleMetrics {
  2292  		metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.MemoryMB))
  2293  		metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.DiskMB))
  2294  		metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.CPU))
  2295  		metrics.SetGauge([]string{"client", "allocated", "iops", nodeID}, float32(allocated.IOPS))
  2296  	}
  2297  
  2298  	for _, n := range allocated.Networks {
  2299  		if !c.config.DisableTaggedMetrics {
  2300  			labels := append(c.baseLabels, metrics.Label{
  2301  				Name:  "device",
  2302  				Value: n.Device,
  2303  			})
  2304  			metrics.SetGaugeWithLabels([]string{"client", "allocated", "network"}, float32(n.MBits), labels)
  2305  		}
  2306  
  2307  		if c.config.BackwardsCompatibleMetrics {
  2308  			metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits))
  2309  		}
  2310  	}
  2311  
  2312  	// Emit unallocated
  2313  	unallocatedMem := total.MemoryMB - res.MemoryMB - allocated.MemoryMB
  2314  	unallocatedDisk := total.DiskMB - res.DiskMB - allocated.DiskMB
  2315  	unallocatedCpu := total.CPU - res.CPU - allocated.CPU
  2316  	unallocatedIops := total.IOPS - res.IOPS - allocated.IOPS
  2317  
  2318  	if !c.config.DisableTaggedMetrics {
  2319  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), c.baseLabels)
  2320  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), c.baseLabels)
  2321  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), c.baseLabels)
  2322  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "iops"}, float32(unallocatedIops), c.baseLabels)
  2323  	}
  2324  
  2325  	if c.config.BackwardsCompatibleMetrics {
  2326  		metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem))
  2327  		metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk))
  2328  		metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu))
  2329  		metrics.SetGauge([]string{"client", "unallocated", "iops", nodeID}, float32(unallocatedIops))
  2330  	}
  2331  
  2332  	for _, n := range allocated.Networks {
  2333  		totalIdx := total.NetIndex(n)
  2334  		if totalIdx != -1 {
  2335  			continue
  2336  		}
  2337  
  2338  		totalMbits := total.Networks[totalIdx].MBits
  2339  		unallocatedMbits := totalMbits - n.MBits
  2340  
  2341  		if !c.config.DisableTaggedMetrics {
  2342  			labels := append(c.baseLabels, metrics.Label{
  2343  				Name:  "device",
  2344  				Value: n.Device,
  2345  			})
  2346  			metrics.SetGaugeWithLabels([]string{"client", "unallocated", "network"}, float32(unallocatedMbits), labels)
  2347  		}
  2348  
  2349  		if c.config.BackwardsCompatibleMetrics {
  2350  			metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits))
  2351  		}
  2352  	}
  2353  }
  2354  
  2355  // No labels are required so we emit with only a key/value syntax
  2356  func (c *Client) setGaugeForUptime(hStats *stats.HostStats) {
  2357  	if !c.config.DisableTaggedMetrics {
  2358  		metrics.SetGaugeWithLabels([]string{"client", "uptime"}, float32(hStats.Uptime), c.baseLabels)
  2359  	}
  2360  	if c.config.BackwardsCompatibleMetrics {
  2361  		metrics.SetGauge([]string{"client", "uptime"}, float32(hStats.Uptime))
  2362  	}
  2363  }
  2364  
  2365  // emitHostStats pushes host resource usage stats to remote metrics collection sinks
  2366  func (c *Client) emitHostStats() {
  2367  	nodeID := c.NodeID()
  2368  	hStats := c.hostStatsCollector.Stats()
  2369  
  2370  	c.setGaugeForMemoryStats(nodeID, hStats)
  2371  	c.setGaugeForUptime(hStats)
  2372  	c.setGaugeForCPUStats(nodeID, hStats)
  2373  	c.setGaugeForDiskStats(nodeID, hStats)
  2374  }
  2375  
  2376  // emitClientMetrics emits lower volume client metrics
  2377  func (c *Client) emitClientMetrics() {
  2378  	nodeID := c.NodeID()
  2379  
  2380  	c.setGaugeForAllocationStats(nodeID)
  2381  
  2382  	// Emit allocation metrics
  2383  	blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
  2384  	for _, ar := range c.getAllocRunners() {
  2385  		switch ar.Alloc().ClientStatus {
  2386  		case structs.AllocClientStatusPending:
  2387  			switch {
  2388  			case ar.IsWaiting():
  2389  				blocked++
  2390  			case ar.IsMigrating():
  2391  				migrating++
  2392  			default:
  2393  				pending++
  2394  			}
  2395  		case structs.AllocClientStatusRunning:
  2396  			running++
  2397  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  2398  			terminal++
  2399  		}
  2400  	}
  2401  
  2402  	if !c.config.DisableTaggedMetrics {
  2403  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "migrating"}, float32(migrating), c.baseLabels)
  2404  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "blocked"}, float32(blocked), c.baseLabels)
  2405  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "pending"}, float32(pending), c.baseLabels)
  2406  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "running"}, float32(running), c.baseLabels)
  2407  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "terminal"}, float32(terminal), c.baseLabels)
  2408  	}
  2409  
  2410  	if c.config.BackwardsCompatibleMetrics {
  2411  		metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating))
  2412  		metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked))
  2413  		metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending))
  2414  		metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running))
  2415  		metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal))
  2416  	}
  2417  }
  2418  
  2419  func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resources {
  2420  	// Unfortunately the allocs only have IP so we need to match them to the
  2421  	// device
  2422  	cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks))
  2423  	for _, n := range selfNode.Resources.Networks {
  2424  		_, ipnet, err := net.ParseCIDR(n.CIDR)
  2425  		if err != nil {
  2426  			continue
  2427  		}
  2428  		cidrToDevice[ipnet] = n.Device
  2429  	}
  2430  
  2431  	// Sum the allocated resources
  2432  	allocs := c.allAllocs()
  2433  	var allocated structs.Resources
  2434  	allocatedDeviceMbits := make(map[string]int)
  2435  	for _, alloc := range allocs {
  2436  		if !alloc.TerminalStatus() {
  2437  			allocated.Add(alloc.Resources)
  2438  			for _, allocatedNetwork := range alloc.Resources.Networks {
  2439  				for cidr, dev := range cidrToDevice {
  2440  					ip := net.ParseIP(allocatedNetwork.IP)
  2441  					if cidr.Contains(ip) {
  2442  						allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  2443  						break
  2444  					}
  2445  				}
  2446  			}
  2447  		}
  2448  	}
  2449  
  2450  	// Clear the networks
  2451  	allocated.Networks = nil
  2452  	for dev, speed := range allocatedDeviceMbits {
  2453  		net := &structs.NetworkResource{
  2454  			Device: dev,
  2455  			MBits:  speed,
  2456  		}
  2457  		allocated.Networks = append(allocated.Networks, net)
  2458  	}
  2459  
  2460  	return &allocated
  2461  }
  2462  
  2463  // allAllocs returns all the allocations managed by the client
  2464  func (c *Client) allAllocs() map[string]*structs.Allocation {
  2465  	ars := c.getAllocRunners()
  2466  	allocs := make(map[string]*structs.Allocation, len(ars))
  2467  	for _, ar := range c.getAllocRunners() {
  2468  		a := ar.Alloc()
  2469  		allocs[a.ID] = a
  2470  	}
  2471  	return allocs
  2472  }