github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/client.go

github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/client.go (about)

     1  package client
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"log"
     8  	"net"
     9  	"os"
    10  	"path/filepath"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	metrics "github.com/armon/go-metrics"
    17  	"github.com/boltdb/bolt"
    18  	consulapi "github.com/hashicorp/consul/api"
    19  	"github.com/hashicorp/consul/lib"
    20  	multierror "github.com/hashicorp/go-multierror"
    21  	"github.com/hashicorp/nomad/client/allocdir"
    22  	"github.com/hashicorp/nomad/client/config"
    23  	"github.com/hashicorp/nomad/client/driver"
    24  	"github.com/hashicorp/nomad/client/fingerprint"
    25  	"github.com/hashicorp/nomad/client/stats"
    26  	"github.com/hashicorp/nomad/client/vaultclient"
    27  	"github.com/hashicorp/nomad/command/agent/consul"
    28  	"github.com/hashicorp/nomad/helper"
    29  	"github.com/hashicorp/nomad/helper/tlsutil"
    30  	"github.com/hashicorp/nomad/helper/uuid"
    31  	"github.com/hashicorp/nomad/nomad"
    32  	"github.com/hashicorp/nomad/nomad/structs"
    33  	vaultapi "github.com/hashicorp/vault/api"
    34  	"github.com/mitchellh/hashstructure"
    35  	"github.com/shirou/gopsutil/host"
    36  )
    37  
const (
	// clientRPCCache controls how long we keep an idle connection
	// open to a server
	clientRPCCache = 5 * time.Minute

	// clientMaxStreams controls how many idle streams we keep
	// open to a server
	clientMaxStreams = 2

	// datacenterQueryLimit searches through up to this many adjacent
	// datacenters looking for the Nomad server service.
	datacenterQueryLimit = 9

	// registerRetryIntv is the minimum interval on which we retry
	// registration. We pick a value between this and 2x this.
	registerRetryIntv = 15 * time.Second

	// getAllocRetryIntv is the minimum interval on which we retry
	// to fetch allocations. We pick a value between this and 2x this.
	getAllocRetryIntv = 30 * time.Second

	// devModeRetryIntv is the retry interval used for development
	devModeRetryIntv = time.Second

	// stateSnapshotIntv is how often the client snapshots state
	stateSnapshotIntv = 60 * time.Second

	// initialHeartbeatStagger is used to stagger the interval between
	// starting and the initial heartbeat. After the initial heartbeat,
	// we switch to using the TTL specified by the servers.
	initialHeartbeatStagger = 10 * time.Second

	// nodeUpdateRetryIntv is how often the client checks for updates to the
	// node attributes or meta map.
	nodeUpdateRetryIntv = 5 * time.Second

	// allocSyncIntv is the batching period of allocation updates before they
	// are synced with the server.
	allocSyncIntv = 200 * time.Millisecond

	// allocSyncRetryIntv is the interval on which we retry updating
	// the status of the allocation
	allocSyncRetryIntv = 5 * time.Second
)
    82  
// ClientStatsReporter exposes the APIs related to the resource usage of a
// Nomad Client. Client itself satisfies it (see StatsReporter).
type ClientStatsReporter interface {
	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
	// If the allocation does not exist an error is returned.
	GetAllocStats(allocID string) (AllocStatsReporter, error)

	// LatestHostStats returns the latest resource usage stats for the host.
	LatestHostStats() *stats.HostStats
}
    93  
// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
type Client struct {
	// config is the configuration handed to NewClient; start records the
	// time at which NewClient constructed the client.
	config *config.Config
	start  time.Time

	// stateDB is used to efficiently store client state. It is opened by
	// init and closed by Shutdown.
	stateDB *bolt.DB

	// configCopy is a copy that should be passed to alloc-runners.
	// configLock is held while the configuration or its copy is accessed.
	configCopy *config.Config
	configLock sync.RWMutex

	logger *log.Logger

	// connPool is the connection pool used to issue RPCs to servers.
	connPool *nomad.ConnPool

	// servers is the (optionally prioritized) list of nomad servers
	servers *serverlist

	// heartbeat related times for tracking how often to heartbeat
	lastHeartbeat   time.Time
	heartbeatTTL    time.Duration
	haveHeartbeated bool
	heartbeatLock   sync.Mutex

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// serversDiscoveredCh will be ticked whenever Consul discovery completes
	// successfully
	serversDiscoveredCh chan struct{}

	// allocs maps alloc IDs to their AllocRunner. This map includes all
	// AllocRunners - running and GC'd - until the server GCs them.
	allocs    map[string]*AllocRunner
	allocLock sync.RWMutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulService is Nomad's custom Consul client for managing services
	// and checks.
	consulService ConsulServiceAPI

	// consulCatalog is the subset of Consul's Catalog API Nomad uses.
	consulCatalog consul.CatalogAPI

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector

	// shutdown is set and shutdownCh is closed by Shutdown, both while
	// holding shutdownLock.
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// garbageCollector is used to garbage collect terminal allocations present
	// in the node automatically
	garbageCollector *AllocGarbageCollector

	// clientACLResolver holds the ACL resolution state
	clientACLResolver

	// baseLabels are used when emitting tagged metrics. All client metrics will
	// have these tags, and optionally more.
	baseLabels []metrics.Label
}
   164  
   165  var (
   166  	// noServersErr is returned by the RPC method when the client has no
   167  	// configured servers. This is used to trigger Consul discovery if
   168  	// enabled.
   169  	noServersErr = errors.New("no servers")
   170  )
   171  
   172  // NewClient is used to create a new client from the given configuration
   173  func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService ConsulServiceAPI, logger *log.Logger) (*Client, error) {
   174  	// Create the tls wrapper
   175  	var tlsWrap tlsutil.RegionWrapper
   176  	if cfg.TLSConfig.EnableRPC {
   177  		tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper()
   178  		if err != nil {
   179  			return nil, err
   180  		}
   181  		tlsWrap = tw
   182  	}
   183  
   184  	// Create the client
   185  	c := &Client{
   186  		config:              cfg,
   187  		consulCatalog:       consulCatalog,
   188  		consulService:       consulService,
   189  		start:               time.Now(),
   190  		connPool:            nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap),
   191  		logger:              logger,
   192  		allocs:              make(map[string]*AllocRunner),
   193  		allocUpdates:        make(chan *structs.Allocation, 64),
   194  		shutdownCh:          make(chan struct{}),
   195  		servers:             newServerList(),
   196  		triggerDiscoveryCh:  make(chan struct{}),
   197  		serversDiscoveredCh: make(chan struct{}),
   198  	}
   199  
   200  	// Initialize the client
   201  	if err := c.init(); err != nil {
   202  		return nil, fmt.Errorf("failed to initialize client: %v", err)
   203  	}
   204  
   205  	// Initialize the ACL state
   206  	if err := c.clientACLResolver.init(); err != nil {
   207  		return nil, fmt.Errorf("failed to initialize ACL state: %v", err)
   208  	}
   209  
   210  	// Add the stats collector
   211  	statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir)
   212  	c.hostStatsCollector = statsCollector
   213  
   214  	// Add the garbage collector
   215  	gcConfig := &GCConfig{
   216  		MaxAllocs:           cfg.GCMaxAllocs,
   217  		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
   218  		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
   219  		Interval:            cfg.GCInterval,
   220  		ParallelDestroys:    cfg.GCParallelDestroys,
   221  		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
   222  	}
   223  	c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig)
   224  	go c.garbageCollector.Run()
   225  
   226  	// Setup the node
   227  	if err := c.setupNode(); err != nil {
   228  		return nil, fmt.Errorf("node setup failed: %v", err)
   229  	}
   230  
   231  	// Fingerprint the node
   232  	if err := c.fingerprint(); err != nil {
   233  		return nil, fmt.Errorf("fingerprinting failed: %v", err)
   234  	}
   235  
   236  	// Scan for drivers
   237  	if err := c.setupDrivers(); err != nil {
   238  		return nil, fmt.Errorf("driver setup failed: %v", err)
   239  	}
   240  
   241  	// Setup the reserved resources
   242  	c.reservePorts()
   243  
   244  	// Store the config copy before restoring state but after it has been
   245  	// initialized.
   246  	c.configLock.Lock()
   247  	c.configCopy = c.config.Copy()
   248  	c.configLock.Unlock()
   249  
   250  	// Set the preconfigured list of static servers
   251  	c.configLock.RLock()
   252  	if len(c.configCopy.Servers) > 0 {
   253  		if err := c.SetServers(c.configCopy.Servers); err != nil {
   254  			logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
   255  		}
   256  	}
   257  	c.configLock.RUnlock()
   258  
   259  	// Setup Consul discovery if enabled
   260  	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
   261  		go c.consulDiscovery()
   262  		if len(c.servers.all()) == 0 {
   263  			// No configured servers; trigger discovery manually
   264  			c.triggerDiscoveryCh <- struct{}{}
   265  		}
   266  	}
   267  
   268  	// Setup the vault client for token and secret renewals
   269  	if err := c.setupVaultClient(); err != nil {
   270  		return nil, fmt.Errorf("failed to setup vault client: %v", err)
   271  	}
   272  
   273  	// Restore the state
   274  	if err := c.restoreState(); err != nil {
   275  		logger.Printf("[ERR] client: failed to restore state: %v", err)
   276  		logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. "+
   277  			"The safest way to proceed is to manually stop running task processes "+
   278  			"and remove Nomad's state (%q) and alloc (%q) directories before "+
   279  			"restarting. Lost allocations will be rescheduled.",
   280  			c.config.StateDir, c.config.AllocDir)
   281  		logger.Printf("[ERR] client: Corrupt state is often caused by a bug. Please " +
   282  			"report as much information as possible to " +
   283  			"https://github.com/hashicorp/nomad/issues")
   284  		return nil, fmt.Errorf("failed to restore state")
   285  	}
   286  
   287  	// Register and then start heartbeating to the servers.
   288  	go c.registerAndHeartbeat()
   289  
   290  	// Begin periodic snapshotting of state.
   291  	go c.periodicSnapshot()
   292  
   293  	// Begin syncing allocations to the server
   294  	go c.allocSync()
   295  
   296  	// Start the client!
   297  	go c.run()
   298  
   299  	// Start collecting stats
   300  	go c.emitStats()
   301  
   302  	c.logger.Printf("[INFO] client: Node ID %q", c.NodeID())
   303  	return c, nil
   304  }
   305  
   306  // init is used to initialize the client and perform any setup
   307  // needed before we begin starting its various components.
   308  func (c *Client) init() error {
   309  	// Ensure the state dir exists if we have one
   310  	if c.config.StateDir != "" {
   311  		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
   312  			return fmt.Errorf("failed creating state dir: %s", err)
   313  		}
   314  
   315  	} else {
   316  		// Othewise make a temp directory to use.
   317  		p, err := ioutil.TempDir("", "NomadClient")
   318  		if err != nil {
   319  			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
   320  		}
   321  
   322  		p, err = filepath.EvalSymlinks(p)
   323  		if err != nil {
   324  			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
   325  		}
   326  
   327  		c.config.StateDir = p
   328  	}
   329  	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)
   330  
   331  	// Create or open the state database
   332  	db, err := bolt.Open(filepath.Join(c.config.StateDir, "state.db"), 0600, nil)
   333  	if err != nil {
   334  		return fmt.Errorf("failed to create state database: %v", err)
   335  	}
   336  	c.stateDB = db
   337  
   338  	// Ensure the alloc dir exists if we have one
   339  	if c.config.AllocDir != "" {
   340  		if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil {
   341  			return fmt.Errorf("failed creating alloc dir: %s", err)
   342  		}
   343  	} else {
   344  		// Othewise make a temp directory to use.
   345  		p, err := ioutil.TempDir("", "NomadClient")
   346  		if err != nil {
   347  			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
   348  		}
   349  
   350  		p, err = filepath.EvalSymlinks(p)
   351  		if err != nil {
   352  			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
   353  		}
   354  
   355  		// Change the permissions to have the execute bit
   356  		if err := os.Chmod(p, 0711); err != nil {
   357  			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
   358  		}
   359  
   360  		c.config.AllocDir = p
   361  	}
   362  
   363  	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
   364  	return nil
   365  }
   366  
   367  // Leave is used to prepare the client to leave the cluster
   368  func (c *Client) Leave() error {
   369  	// TODO
   370  	return nil
   371  }
   372  
   373  // GetConfig returns the config of the client for testing purposes only
   374  func (c *Client) GetConfig() *config.Config {
   375  	return c.config
   376  }
   377  
   378  // Datacenter returns the datacenter for the given client
   379  func (c *Client) Datacenter() string {
   380  	return c.config.Node.Datacenter
   381  }
   382  
   383  // Region returns the region for the given client
   384  func (c *Client) Region() string {
   385  	return c.config.Region
   386  }
   387  
   388  // NodeID returns the node ID for the given client
   389  func (c *Client) NodeID() string {
   390  	return c.config.Node.ID
   391  }
   392  
   393  // secretNodeID returns the secret node ID for the given client
   394  func (c *Client) secretNodeID() string {
   395  	return c.config.Node.SecretID
   396  }
   397  
   398  // RPCMajorVersion returns the structs.ApiMajorVersion supported by the
   399  // client.
   400  func (c *Client) RPCMajorVersion() int {
   401  	return structs.ApiMajorVersion
   402  }
   403  
   404  // RPCMinorVersion returns the structs.ApiMinorVersion supported by the
   405  // client.
   406  func (c *Client) RPCMinorVersion() int {
   407  	return structs.ApiMinorVersion
   408  }
   409  
   410  // Shutdown is used to tear down the client
   411  func (c *Client) Shutdown() error {
   412  	c.logger.Printf("[INFO] client: shutting down")
   413  	c.shutdownLock.Lock()
   414  	defer c.shutdownLock.Unlock()
   415  
   416  	if c.shutdown {
   417  		return nil
   418  	}
   419  
   420  	// Defer closing the database
   421  	defer func() {
   422  		if err := c.stateDB.Close(); err != nil {
   423  			c.logger.Printf("[ERR] client: failed to close state database on shutdown: %v", err)
   424  		}
   425  	}()
   426  
   427  	// Stop renewing tokens and secrets
   428  	if c.vaultClient != nil {
   429  		c.vaultClient.Stop()
   430  	}
   431  
   432  	// Stop Garbage collector
   433  	c.garbageCollector.Stop()
   434  
   435  	// Destroy all the running allocations.
   436  	if c.config.DevMode {
   437  		for _, ar := range c.getAllocRunners() {
   438  			ar.Destroy()
   439  			<-ar.WaitCh()
   440  		}
   441  	}
   442  
   443  	c.shutdown = true
   444  	close(c.shutdownCh)
   445  	c.connPool.Shutdown()
   446  	return c.saveState()
   447  }
   448  
   449  // RPC is used to forward an RPC call to a nomad server, or fail if no servers.
   450  func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
   451  	// Invoke the RPCHandler if it exists
   452  	if c.config.RPCHandler != nil {
   453  		return c.config.RPCHandler.RPC(method, args, reply)
   454  	}
   455  
   456  	servers := c.servers.all()
   457  	if len(servers) == 0 {
   458  		return noServersErr
   459  	}
   460  
   461  	var mErr multierror.Error
   462  	for _, s := range servers {
   463  		// Make the RPC request
   464  		if err := c.connPool.RPC(c.Region(), s.addr, c.RPCMajorVersion(), method, args, reply); err != nil {
   465  			errmsg := fmt.Errorf("RPC failed to server %s: %v", s.addr, err)
   466  			mErr.Errors = append(mErr.Errors, errmsg)
   467  			c.logger.Printf("[DEBUG] client: %v", errmsg)
   468  			c.servers.failed(s)
   469  			continue
   470  		}
   471  		c.servers.good(s)
   472  		return nil
   473  	}
   474  
   475  	return mErr.ErrorOrNil()
   476  }
   477  
   478  // Stats is used to return statistics for debugging and insight
   479  // for various sub-systems
   480  func (c *Client) Stats() map[string]map[string]string {
   481  	c.heartbeatLock.Lock()
   482  	defer c.heartbeatLock.Unlock()
   483  	stats := map[string]map[string]string{
   484  		"client": {
   485  			"node_id":         c.NodeID(),
   486  			"known_servers":   c.servers.all().String(),
   487  			"num_allocations": strconv.Itoa(c.NumAllocs()),
   488  			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
   489  			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
   490  		},
   491  		"runtime": nomad.RuntimeStats(),
   492  	}
   493  	return stats
   494  }
   495  
   496  // CollectAllocation garbage collects a single allocation on a node. Returns
   497  // true if alloc was found and garbage collected; otherwise false.
   498  func (c *Client) CollectAllocation(allocID string) bool {
   499  	return c.garbageCollector.Collect(allocID)
   500  }
   501  
   502  // CollectAllAllocs garbage collects all allocations on a node in the terminal
   503  // state
   504  func (c *Client) CollectAllAllocs() {
   505  	c.garbageCollector.CollectAll()
   506  }
   507  
   508  // Node returns the locally registered node
   509  func (c *Client) Node() *structs.Node {
   510  	c.configLock.RLock()
   511  	defer c.configLock.RUnlock()
   512  	return c.configCopy.Node
   513  }
   514  
   515  // StatsReporter exposes the various APIs related resource usage of a Nomad
   516  // client
   517  func (c *Client) StatsReporter() ClientStatsReporter {
   518  	return c
   519  }
   520  
   521  func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) {
   522  	c.allocLock.RLock()
   523  	defer c.allocLock.RUnlock()
   524  	ar, ok := c.allocs[allocID]
   525  	if !ok {
   526  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   527  	}
   528  	return ar.StatsReporter(), nil
   529  }
   530  
   531  // HostStats returns all the stats related to a Nomad client
   532  func (c *Client) LatestHostStats() *stats.HostStats {
   533  	return c.hostStatsCollector.Stats()
   534  }
   535  
   536  // ValidateMigrateToken verifies that a token is for a specific client and
   537  // allocation, and has been created by a trusted party that has privileged
   538  // knowledge of the client's secret identifier
   539  func (c *Client) ValidateMigrateToken(allocID, migrateToken string) bool {
   540  	if !c.config.ACLEnabled {
   541  		return true
   542  	}
   543  
   544  	return nomad.CompareMigrateToken(allocID, c.secretNodeID(), migrateToken)
   545  }
   546  
   547  // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
   548  func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
   549  	c.allocLock.RLock()
   550  	defer c.allocLock.RUnlock()
   551  
   552  	ar, ok := c.allocs[allocID]
   553  	if !ok {
   554  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   555  	}
   556  	return ar.GetAllocDir(), nil
   557  }
   558  
   559  // GetClientAlloc returns the allocation from the client
   560  func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
   561  	all := c.allAllocs()
   562  	alloc, ok := all[allocID]
   563  	if !ok {
   564  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   565  	}
   566  	return alloc, nil
   567  }
   568  
   569  // GetServers returns the list of nomad servers this client is aware of.
   570  func (c *Client) GetServers() []string {
   571  	endpoints := c.servers.all()
   572  	res := make([]string, len(endpoints))
   573  	for i := range endpoints {
   574  		res[i] = endpoints[i].addr.String()
   575  	}
   576  	return res
   577  }
   578  
   579  // SetServers sets a new list of nomad servers to connect to. As long as one
   580  // server is resolvable no error is returned.
   581  func (c *Client) SetServers(servers []string) error {
   582  	endpoints := make([]*endpoint, 0, len(servers))
   583  	var merr multierror.Error
   584  	for _, s := range servers {
   585  		addr, err := resolveServer(s)
   586  		if err != nil {
   587  			c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", s, err)
   588  			merr.Errors = append(merr.Errors, err)
   589  			continue
   590  		}
   591  
   592  		// Valid endpoint, append it without a priority as this API
   593  		// doesn't support different priorities for different servers
   594  		endpoints = append(endpoints, &endpoint{name: s, addr: addr})
   595  	}
   596  
   597  	// Only return errors if no servers are valid
   598  	if len(endpoints) == 0 {
   599  		if len(merr.Errors) > 0 {
   600  			return merr.ErrorOrNil()
   601  		}
   602  		return noServersErr
   603  	}
   604  
   605  	c.servers.set(endpoints)
   606  	return nil
   607  }
   608  
// restoreState is used to restore our state from the data dir. It is a no-op
// in dev mode. For every known allocation ID an AllocRunner is created,
// registered in c.allocs, asked to restore its own state and, if that
// succeeds, started. Errors from individual allocations are collected and
// returned together.
func (c *Client) restoreState() error {
	if c.config.DevMode {
		return nil
	}

	// COMPAT: Remove in 0.7.0
	// 0.6.0 transitioned from individual state files to a single bolt-db.
	// The upgrade path is to:
	// Check if old state exists
	//   If so, restore from that and delete old state
	// Restore using state database

	// Allocs holds the IDs of the allocations being restored
	var allocs []string

	// Upgrading tracks whether this is a pre 0.6.0 upgrade path
	var upgrading bool

	// Scan the directory
	allocDir := filepath.Join(c.config.StateDir, "alloc")
	list, err := ioutil.ReadDir(allocDir)
	if err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to list alloc state: %v", err)
	} else if err == nil && len(list) != 0 {
		// Old-style state exists: each directory entry is named after an
		// allocation ID.
		upgrading = true
		for _, entry := range list {
			allocs = append(allocs, entry.Name())
		}
	} else {
		// Normal path
		// Note: the err assigned inside the closure is the one declared by
		// the ReadDir call above; the err declared by this statement is not
		// yet in scope within its own right-hand side.
		err := c.stateDB.View(func(tx *bolt.Tx) error {
			allocs, err = getAllAllocationIDs(tx)
			if err != nil {
				return fmt.Errorf("failed to list allocations: %v", err)
			}
			return nil
		})
		if err != nil {
			return err
		}
	}

	// Load each alloc back
	var mErr multierror.Error
	for _, id := range allocs {
		// Only the ID is known at this point; the runner's RestoreState call
		// below is what loads the persisted state.
		alloc := &structs.Allocation{ID: id}

		// don't worry about blocking/migrating when restoring
		watcher := noopPrevAlloc{}

		c.configLock.RLock()
		ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, watcher)
		c.configLock.RUnlock()

		// The runner is registered before its state is restored, so it stays
		// in c.allocs even if RestoreState fails below.
		c.allocLock.Lock()
		c.allocs[id] = ar
		c.allocLock.Unlock()

		if err := ar.RestoreState(); err != nil {
			c.logger.Printf("[ERR] client: failed to restore state for alloc %q: %v", id, err)
			mErr.Errors = append(mErr.Errors, err)
		} else {
			go ar.Run()

			// When upgrading, save the state right away; a failure here is
			// only logged.
			if upgrading {
				if err := ar.SaveState(); err != nil {
					c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", id, err)
				}
			}
		}
	}

	// Delete all the entries
	// NOTE(review): the old state directory is removed even when some
	// allocations failed to restore above - confirm this is intended.
	if upgrading {
		if err := os.RemoveAll(allocDir); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	return mErr.ErrorOrNil()
}
   691  
   692  // saveState is used to snapshot our state into the data dir.
   693  func (c *Client) saveState() error {
   694  	if c.config.DevMode {
   695  		return nil
   696  	}
   697  
   698  	var wg sync.WaitGroup
   699  	var l sync.Mutex
   700  	var mErr multierror.Error
   701  	runners := c.getAllocRunners()
   702  	wg.Add(len(runners))
   703  
   704  	for id, ar := range runners {
   705  		go func(id string, ar *AllocRunner) {
   706  			err := ar.SaveState()
   707  			if err != nil {
   708  				c.logger.Printf("[ERR] client: failed to save state for alloc %q: %v", id, err)
   709  				l.Lock()
   710  				multierror.Append(&mErr, err)
   711  				l.Unlock()
   712  			}
   713  			wg.Done()
   714  		}(id, ar)
   715  	}
   716  
   717  	wg.Wait()
   718  	return mErr.ErrorOrNil()
   719  }
   720  
   721  // getAllocRunners returns a snapshot of the current set of alloc runners.
   722  func (c *Client) getAllocRunners() map[string]*AllocRunner {
   723  	c.allocLock.RLock()
   724  	defer c.allocLock.RUnlock()
   725  	runners := make(map[string]*AllocRunner, len(c.allocs))
   726  	for id, ar := range c.allocs {
   727  		runners[id] = ar
   728  	}
   729  	return runners
   730  }
   731  
   732  // NumAllocs returns the number of un-GC'd allocs this client has. Used to
   733  // fulfill the AllocCounter interface for the GC.
   734  func (c *Client) NumAllocs() int {
   735  	n := 0
   736  	c.allocLock.RLock()
   737  	for _, a := range c.allocs {
   738  		if !a.IsDestroyed() {
   739  			n++
   740  		}
   741  	}
   742  	c.allocLock.RUnlock()
   743  	return n
   744  }
   745  
   746  // nodeID restores, or generates if necessary, a unique node ID and SecretID.
   747  // The node ID is, if available, a persistent unique ID.  The secret ID is a
   748  // high-entropy random UUID.
   749  func (c *Client) nodeID() (id, secret string, err error) {
   750  	var hostID string
   751  	hostInfo, err := host.Info()
   752  	if !c.config.NoHostUUID && err == nil {
   753  		if hashed, ok := helper.HashUUID(hostInfo.HostID); ok {
   754  			hostID = hashed
   755  		}
   756  	}
   757  
   758  	if hostID == "" {
   759  		// Generate a random hostID if no constant ID is available on
   760  		// this platform.
   761  		hostID = uuid.Generate()
   762  	}
   763  
   764  	// Do not persist in dev mode
   765  	if c.config.DevMode {
   766  		return hostID, uuid.Generate(), nil
   767  	}
   768  
   769  	// Attempt to read existing ID
   770  	idPath := filepath.Join(c.config.StateDir, "client-id")
   771  	idBuf, err := ioutil.ReadFile(idPath)
   772  	if err != nil && !os.IsNotExist(err) {
   773  		return "", "", err
   774  	}
   775  
   776  	// Attempt to read existing secret ID
   777  	secretPath := filepath.Join(c.config.StateDir, "secret-id")
   778  	secretBuf, err := ioutil.ReadFile(secretPath)
   779  	if err != nil && !os.IsNotExist(err) {
   780  		return "", "", err
   781  	}
   782  
   783  	// Use existing ID if any
   784  	if len(idBuf) != 0 {
   785  		id = strings.ToLower(string(idBuf))
   786  	} else {
   787  		id = hostID
   788  
   789  		// Persist the ID
   790  		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
   791  			return "", "", err
   792  		}
   793  	}
   794  
   795  	if len(secretBuf) != 0 {
   796  		secret = string(secretBuf)
   797  	} else {
   798  		// Generate new ID
   799  		secret = uuid.Generate()
   800  
   801  		// Persist the ID
   802  		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
   803  			return "", "", err
   804  		}
   805  	}
   806  
   807  	return id, secret, nil
   808  }
   809  
   810  // setupNode is used to setup the initial node
   811  func (c *Client) setupNode() error {
   812  	node := c.config.Node
   813  	if node == nil {
   814  		node = &structs.Node{}
   815  		c.config.Node = node
   816  	}
   817  	// Generate an ID and secret for the node
   818  	id, secretID, err := c.nodeID()
   819  	if err != nil {
   820  		return fmt.Errorf("node ID setup failed: %v", err)
   821  	}
   822  
   823  	node.ID = id
   824  	node.SecretID = secretID
   825  	if node.Attributes == nil {
   826  		node.Attributes = make(map[string]string)
   827  	}
   828  	if node.Links == nil {
   829  		node.Links = make(map[string]string)
   830  	}
   831  	if node.Meta == nil {
   832  		node.Meta = make(map[string]string)
   833  	}
   834  	if node.Resources == nil {
   835  		node.Resources = &structs.Resources{}
   836  	}
   837  	if node.Reserved == nil {
   838  		node.Reserved = &structs.Resources{}
   839  	}
   840  	if node.Datacenter == "" {
   841  		node.Datacenter = "dc1"
   842  	}
   843  	if node.Name == "" {
   844  		node.Name, _ = os.Hostname()
   845  	}
   846  	if node.Name == "" {
   847  		node.Name = node.ID
   848  	}
   849  	node.Status = structs.NodeStatusInit
   850  	return nil
   851  }
   852  
   853  // reservePorts is used to reserve ports on the fingerprinted network devices.
   854  func (c *Client) reservePorts() {
   855  	c.configLock.RLock()
   856  	defer c.configLock.RUnlock()
   857  	global := c.config.GloballyReservedPorts
   858  	if len(global) == 0 {
   859  		return
   860  	}
   861  
   862  	node := c.config.Node
   863  	networks := node.Resources.Networks
   864  	reservedIndex := make(map[string]*structs.NetworkResource, len(networks))
   865  	for _, resNet := range node.Reserved.Networks {
   866  		reservedIndex[resNet.IP] = resNet
   867  	}
   868  
   869  	// Go through each network device and reserve ports on it.
   870  	for _, net := range networks {
   871  		res, ok := reservedIndex[net.IP]
   872  		if !ok {
   873  			res = net.Copy()
   874  			res.MBits = 0
   875  			reservedIndex[net.IP] = res
   876  		}
   877  
   878  		for _, portVal := range global {
   879  			p := structs.Port{Value: portVal}
   880  			res.ReservedPorts = append(res.ReservedPorts, p)
   881  		}
   882  	}
   883  
   884  	// Clear the reserved networks.
   885  	if node.Reserved == nil {
   886  		node.Reserved = new(structs.Resources)
   887  	} else {
   888  		node.Reserved.Networks = nil
   889  	}
   890  
   891  	// Restore the reserved networks
   892  	for _, net := range reservedIndex {
   893  		node.Reserved.Networks = append(node.Reserved.Networks, net)
   894  	}
   895  }
   896  
   897  // fingerprint is used to fingerprint the client and setup the node
   898  func (c *Client) fingerprint() error {
   899  	whitelist := c.config.ReadStringListToMap("fingerprint.whitelist")
   900  	whitelistEnabled := len(whitelist) > 0
   901  	blacklist := c.config.ReadStringListToMap("fingerprint.blacklist")
   902  
   903  	c.logger.Printf("[DEBUG] client: built-in fingerprints: %v", fingerprint.BuiltinFingerprints())
   904  
   905  	var applied []string
   906  	var skipped []string
   907  	for _, name := range fingerprint.BuiltinFingerprints() {
   908  		// Skip modules that are not in the whitelist if it is enabled.
   909  		if _, ok := whitelist[name]; whitelistEnabled && !ok {
   910  			skipped = append(skipped, name)
   911  			continue
   912  		}
   913  		// Skip modules that are in the blacklist
   914  		if _, ok := blacklist[name]; ok {
   915  			skipped = append(skipped, name)
   916  			continue
   917  		}
   918  		f, err := fingerprint.NewFingerprint(name, c.logger)
   919  		if err != nil {
   920  			return err
   921  		}
   922  
   923  		c.configLock.Lock()
   924  		applies, err := f.Fingerprint(c.config, c.config.Node)
   925  		c.configLock.Unlock()
   926  		if err != nil {
   927  			return err
   928  		}
   929  		if applies {
   930  			applied = append(applied, name)
   931  		}
   932  		p, period := f.Periodic()
   933  		if p {
   934  			// TODO: If more periodic fingerprinters are added, then
   935  			// fingerprintPeriodic should be used to handle all the periodic
   936  			// fingerprinters by using a priority queue.
   937  			go c.fingerprintPeriodic(name, f, period)
   938  		}
   939  	}
   940  	c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied)
   941  	if len(skipped) != 0 {
   942  		c.logger.Printf("[DEBUG] client: fingerprint modules skipped due to white/blacklist: %v", skipped)
   943  	}
   944  	return nil
   945  }
   946  
   947  // fingerprintPeriodic runs a fingerprinter at the specified duration.
   948  func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) {
   949  	c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d)
   950  	for {
   951  		select {
   952  		case <-time.After(d):
   953  			c.configLock.Lock()
   954  			if _, err := f.Fingerprint(c.config, c.config.Node); err != nil {
   955  				c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err)
   956  			}
   957  			c.configLock.Unlock()
   958  		case <-c.shutdownCh:
   959  			return
   960  		}
   961  	}
   962  }
   963  
   964  // setupDrivers is used to find the available drivers
   965  func (c *Client) setupDrivers() error {
   966  	// Build the white/blacklists of drivers.
   967  	whitelist := c.config.ReadStringListToMap("driver.whitelist")
   968  	whitelistEnabled := len(whitelist) > 0
   969  	blacklist := c.config.ReadStringListToMap("driver.blacklist")
   970  
   971  	var avail []string
   972  	var skipped []string
   973  	driverCtx := driver.NewDriverContext("", "", c.config, c.config.Node, c.logger, nil)
   974  	for name := range driver.BuiltinDrivers {
   975  		// Skip fingerprinting drivers that are not in the whitelist if it is
   976  		// enabled.
   977  		if _, ok := whitelist[name]; whitelistEnabled && !ok {
   978  			skipped = append(skipped, name)
   979  			continue
   980  		}
   981  		// Skip fingerprinting drivers that are in the blacklist
   982  		if _, ok := blacklist[name]; ok {
   983  			skipped = append(skipped, name)
   984  			continue
   985  		}
   986  
   987  		d, err := driver.NewDriver(name, driverCtx)
   988  		if err != nil {
   989  			return err
   990  		}
   991  		c.configLock.Lock()
   992  		applies, err := d.Fingerprint(c.config, c.config.Node)
   993  		c.configLock.Unlock()
   994  		if err != nil {
   995  			return err
   996  		}
   997  		if applies {
   998  			avail = append(avail, name)
   999  		}
  1000  
  1001  		p, period := d.Periodic()
  1002  		if p {
  1003  			go c.fingerprintPeriodic(name, d, period)
  1004  		}
  1005  
  1006  	}
  1007  
  1008  	c.logger.Printf("[DEBUG] client: available drivers %v", avail)
  1009  
  1010  	if len(skipped) != 0 {
  1011  		c.logger.Printf("[DEBUG] client: drivers skipped due to white/blacklist: %v", skipped)
  1012  	}
  1013  
  1014  	return nil
  1015  }
  1016  
  1017  // retryIntv calculates a retry interval value given the base
  1018  func (c *Client) retryIntv(base time.Duration) time.Duration {
  1019  	if c.config.DevMode {
  1020  		return devModeRetryIntv
  1021  	}
  1022  	return base + lib.RandomStagger(base)
  1023  }
  1024  
  1025  // registerAndHeartbeat is a long lived goroutine used to register the client
  1026  // and then start heartbeatng to the server.
  1027  func (c *Client) registerAndHeartbeat() {
  1028  	// Before registering capture the hashes of the Node's attribute and
  1029  	// metadata maps. The hashes may be out of date with what registers but this
  1030  	// is okay since the loop checking for node updates will detect this and
  1031  	// reregister. This is necessary to avoid races between the periodic
  1032  	// fingerprinters and the node registering.
  1033  	attrHash, metaHash, err := nodeMapHashes(c.Node())
  1034  	if err != nil {
  1035  		c.logger.Printf("[ERR] client: failed to determine initial node hashes. May result in stale node being registered: %v", err)
  1036  	}
  1037  
  1038  	// Register the node
  1039  	c.retryRegisterNode()
  1040  
  1041  	// Start watching changes for node changes
  1042  	go c.watchNodeUpdates(attrHash, metaHash)
  1043  
  1044  	// Setup the heartbeat timer, for the initial registration
  1045  	// we want to do this quickly. We want to do it extra quickly
  1046  	// in development mode.
  1047  	var heartbeat <-chan time.Time
  1048  	if c.config.DevMode {
  1049  		heartbeat = time.After(0)
  1050  	} else {
  1051  		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1052  	}
  1053  
  1054  	for {
  1055  		select {
  1056  		case <-c.serversDiscoveredCh:
  1057  		case <-heartbeat:
  1058  		case <-c.shutdownCh:
  1059  			return
  1060  		}
  1061  
  1062  		if err := c.updateNodeStatus(); err != nil {
  1063  			// The servers have changed such that this node has not been
  1064  			// registered before
  1065  			if strings.Contains(err.Error(), "node not found") {
  1066  				// Re-register the node
  1067  				c.logger.Printf("[INFO] client: re-registering node")
  1068  				c.retryRegisterNode()
  1069  				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1070  			} else {
  1071  				intv := c.retryIntv(registerRetryIntv)
  1072  				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
  1073  				heartbeat = time.After(intv)
  1074  
  1075  				// if heartbeating fails, trigger Consul discovery
  1076  				c.triggerDiscovery()
  1077  			}
  1078  		} else {
  1079  			c.heartbeatLock.Lock()
  1080  			heartbeat = time.After(c.heartbeatTTL)
  1081  			c.heartbeatLock.Unlock()
  1082  		}
  1083  	}
  1084  }
  1085  
  1086  // periodicSnapshot is a long lived goroutine used to periodically snapshot the
  1087  // state of the client
  1088  func (c *Client) periodicSnapshot() {
  1089  	// Create a snapshot timer
  1090  	snapshot := time.After(stateSnapshotIntv)
  1091  
  1092  	for {
  1093  		select {
  1094  		case <-snapshot:
  1095  			snapshot = time.After(stateSnapshotIntv)
  1096  			if err := c.saveState(); err != nil {
  1097  				c.logger.Printf("[ERR] client: failed to save state: %v", err)
  1098  			}
  1099  
  1100  		case <-c.shutdownCh:
  1101  			return
  1102  		}
  1103  	}
  1104  }
  1105  
  1106  // run is a long lived goroutine used to run the client
  1107  func (c *Client) run() {
  1108  	// Watch for changes in allocations
  1109  	allocUpdates := make(chan *allocUpdates, 8)
  1110  	go c.watchAllocations(allocUpdates)
  1111  
  1112  	for {
  1113  		select {
  1114  		case update := <-allocUpdates:
  1115  			c.runAllocs(update)
  1116  
  1117  		case <-c.shutdownCh:
  1118  			return
  1119  		}
  1120  	}
  1121  }
  1122  
  1123  // nodeMapHashes returns the hashes of the passed Node's attribute and metadata
  1124  // maps.
  1125  func nodeMapHashes(node *structs.Node) (attrHash, metaHash uint64, err error) {
  1126  	attrHash, err = hashstructure.Hash(node.Attributes, nil)
  1127  	if err != nil {
  1128  		return 0, 0, fmt.Errorf("unable to calculate node attributes hash: %v", err)
  1129  	}
  1130  	// Calculate node meta map hash
  1131  	metaHash, err = hashstructure.Hash(node.Meta, nil)
  1132  	if err != nil {
  1133  		return 0, 0, fmt.Errorf("unable to calculate node meta hash: %v", err)
  1134  	}
  1135  	return attrHash, metaHash, nil
  1136  }
  1137  
  1138  // hasNodeChanged calculates a hash for the node attributes- and meta map.
  1139  // The new hash values are compared against the old (passed-in) hash values to
  1140  // determine if the node properties have changed. It returns the new hash values
  1141  // in case they are different from the old hash values.
  1142  func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) {
  1143  	c.configLock.RLock()
  1144  	defer c.configLock.RUnlock()
  1145  
  1146  	// Check if the Node that is being updated by fingerprinters has changed.
  1147  	newAttrHash, newMetaHash, err := nodeMapHashes(c.config.Node)
  1148  	if err != nil {
  1149  		c.logger.Printf("[DEBUG] client: unable to calculate node hashes: %v", err)
  1150  	}
  1151  	if newAttrHash != oldAttrHash || newMetaHash != oldMetaHash {
  1152  		return true, newAttrHash, newMetaHash
  1153  	}
  1154  	return false, oldAttrHash, oldMetaHash
  1155  }
  1156  
  1157  // retryRegisterNode is used to register the node or update the registration and
  1158  // retry in case of failure.
  1159  func (c *Client) retryRegisterNode() {
  1160  	for {
  1161  		err := c.registerNode()
  1162  		if err == nil {
  1163  			// Registered!
  1164  			return
  1165  		}
  1166  
  1167  		if err == noServersErr {
  1168  			c.logger.Print("[DEBUG] client: registration waiting on servers")
  1169  			c.triggerDiscovery()
  1170  		} else {
  1171  			c.logger.Printf("[ERR] client: registration failure: %v", err)
  1172  		}
  1173  		select {
  1174  		case <-c.serversDiscoveredCh:
  1175  		case <-time.After(c.retryIntv(registerRetryIntv)):
  1176  		case <-c.shutdownCh:
  1177  			return
  1178  		}
  1179  	}
  1180  }
  1181  
  1182  // registerNode is used to register the node or update the registration
  1183  func (c *Client) registerNode() error {
  1184  	node := c.Node()
  1185  	req := structs.NodeRegisterRequest{
  1186  		Node:         node,
  1187  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1188  	}
  1189  	var resp structs.NodeUpdateResponse
  1190  	if err := c.RPC("Node.Register", &req, &resp); err != nil {
  1191  		return err
  1192  	}
  1193  
  1194  	// Update the node status to ready after we register.
  1195  	c.configLock.Lock()
  1196  	node.Status = structs.NodeStatusReady
  1197  	c.configLock.Unlock()
  1198  
  1199  	c.logger.Printf("[INFO] client: node registration complete")
  1200  	if len(resp.EvalIDs) != 0 {
  1201  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs))
  1202  	}
  1203  
  1204  	c.heartbeatLock.Lock()
  1205  	defer c.heartbeatLock.Unlock()
  1206  	c.lastHeartbeat = time.Now()
  1207  	c.heartbeatTTL = resp.HeartbeatTTL
  1208  	return nil
  1209  }
  1210  
  1211  // updateNodeStatus is used to heartbeat and update the status of the node
  1212  func (c *Client) updateNodeStatus() error {
  1213  	start := time.Now()
  1214  	req := structs.NodeUpdateStatusRequest{
  1215  		NodeID:       c.NodeID(),
  1216  		Status:       structs.NodeStatusReady,
  1217  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1218  	}
  1219  	var resp structs.NodeUpdateResponse
  1220  	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
  1221  		c.triggerDiscovery()
  1222  		return fmt.Errorf("failed to update status: %v", err)
  1223  	}
  1224  	end := time.Now()
  1225  
  1226  	if len(resp.EvalIDs) != 0 {
  1227  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs))
  1228  	}
  1229  
  1230  	// Update the last heartbeat and the new TTL, capturing the old values
  1231  	c.heartbeatLock.Lock()
  1232  	last := c.lastHeartbeat
  1233  	oldTTL := c.heartbeatTTL
  1234  	haveHeartbeated := c.haveHeartbeated
  1235  	c.lastHeartbeat = time.Now()
  1236  	c.heartbeatTTL = resp.HeartbeatTTL
  1237  	c.haveHeartbeated = true
  1238  	c.heartbeatLock.Unlock()
  1239  	c.logger.Printf("[TRACE] client: next heartbeat in %v", resp.HeartbeatTTL)
  1240  
  1241  	if resp.Index != 0 {
  1242  		c.logger.Printf("[DEBUG] client: state updated to %s", req.Status)
  1243  
  1244  		// We have potentially missed our TTL log how delayed we were
  1245  		if haveHeartbeated {
  1246  			c.logger.Printf("[WARN] client: heartbeat missed (request took %v). Heartbeat TTL was %v and heartbeated after %v",
  1247  				end.Sub(start), oldTTL, time.Since(last))
  1248  		}
  1249  	}
  1250  
  1251  	// Convert []*NodeServerInfo to []*endpoints
  1252  	localdc := c.Datacenter()
  1253  	servers := make(endpoints, 0, len(resp.Servers))
  1254  	for _, s := range resp.Servers {
  1255  		addr, err := resolveServer(s.RPCAdvertiseAddr)
  1256  		if err != nil {
  1257  			c.logger.Printf("[WARN] client: ignoring invalid server %q: %v", s.RPCAdvertiseAddr, err)
  1258  			continue
  1259  		}
  1260  		e := endpoint{name: s.RPCAdvertiseAddr, addr: addr}
  1261  		if s.Datacenter != localdc {
  1262  			// server is non-local; de-prioritize
  1263  			e.priority = 1
  1264  		}
  1265  		servers = append(servers, &e)
  1266  	}
  1267  	if len(servers) == 0 {
  1268  		return fmt.Errorf("server returned no valid servers")
  1269  	}
  1270  	c.servers.set(servers)
  1271  
  1272  	// Begin polling Consul if there is no Nomad leader.  We could be
  1273  	// heartbeating to a Nomad server that is in the minority of a
  1274  	// partition of the Nomad server quorum, but this Nomad Agent still
  1275  	// has connectivity to the existing majority of Nomad Servers, but
  1276  	// only if it queries Consul.
  1277  	if resp.LeaderRPCAddr == "" {
  1278  		c.triggerDiscovery()
  1279  	}
  1280  
  1281  	return nil
  1282  }
  1283  
  1284  // updateAllocStatus is used to update the status of an allocation
  1285  func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
  1286  	if alloc.Terminated() {
  1287  		// Terminated, mark for GC if we're still tracking this alloc
  1288  		// runner. If it's not being tracked that means the server has
  1289  		// already GC'd it (see removeAlloc).
  1290  		c.allocLock.RLock()
  1291  		ar, ok := c.allocs[alloc.ID]
  1292  		c.allocLock.RUnlock()
  1293  
  1294  		if ok {
  1295  			c.garbageCollector.MarkForCollection(ar)
  1296  
  1297  			// Trigger a GC in case we're over thresholds and just
  1298  			// waiting for eligible allocs.
  1299  			c.garbageCollector.Trigger()
  1300  		}
  1301  	}
  1302  
  1303  	// Strip all the information that can be reconstructed at the server.  Only
  1304  	// send the fields that are updatable by the client.
  1305  	stripped := new(structs.Allocation)
  1306  	stripped.ID = alloc.ID
  1307  	stripped.NodeID = c.NodeID()
  1308  	stripped.TaskStates = alloc.TaskStates
  1309  	stripped.ClientStatus = alloc.ClientStatus
  1310  	stripped.ClientDescription = alloc.ClientDescription
  1311  	stripped.DeploymentStatus = alloc.DeploymentStatus
  1312  
  1313  	select {
  1314  	case c.allocUpdates <- stripped:
  1315  	case <-c.shutdownCh:
  1316  	}
  1317  }
  1318  
  1319  // allocSync is a long lived function that batches allocation updates to the
  1320  // server.
  1321  func (c *Client) allocSync() {
  1322  	staggered := false
  1323  	syncTicker := time.NewTicker(allocSyncIntv)
  1324  	updates := make(map[string]*structs.Allocation)
  1325  	for {
  1326  		select {
  1327  		case <-c.shutdownCh:
  1328  			syncTicker.Stop()
  1329  			return
  1330  		case alloc := <-c.allocUpdates:
  1331  			// Batch the allocation updates until the timer triggers.
  1332  			updates[alloc.ID] = alloc
  1333  		case <-syncTicker.C:
  1334  			// Fast path if there are no updates
  1335  			if len(updates) == 0 {
  1336  				continue
  1337  			}
  1338  
  1339  			sync := make([]*structs.Allocation, 0, len(updates))
  1340  			for _, alloc := range updates {
  1341  				sync = append(sync, alloc)
  1342  			}
  1343  
  1344  			// Send to server.
  1345  			args := structs.AllocUpdateRequest{
  1346  				Alloc:        sync,
  1347  				WriteRequest: structs.WriteRequest{Region: c.Region()},
  1348  			}
  1349  
  1350  			var resp structs.GenericResponse
  1351  			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
  1352  				c.logger.Printf("[ERR] client: failed to update allocations: %v", err)
  1353  				syncTicker.Stop()
  1354  				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
  1355  				staggered = true
  1356  			} else {
  1357  				updates = make(map[string]*structs.Allocation)
  1358  				if staggered {
  1359  					syncTicker.Stop()
  1360  					syncTicker = time.NewTicker(allocSyncIntv)
  1361  					staggered = false
  1362  				}
  1363  			}
  1364  		}
  1365  	}
  1366  }
  1367  
// allocUpdates holds the results of receiving updated allocations from the
// servers. watchAllocations builds one per round and runAllocs consumes it.
type allocUpdates struct {
	// pulled is the set of allocations that were downloaded from the servers,
	// keyed by allocation ID.
	pulled map[string]*structs.Allocation

	// filtered is the set of allocation IDs that were not pulled because
	// their AllocModifyIndex didn't change.
	filtered map[string]struct{}

	// migrateTokens holds the tokens necessary for when clients pull data
	// from authorized volumes. It is looked up by allocation ID.
	migrateTokens map[string]string
}
  1382  
  1383  // watchAllocations is used to scan for updates to allocations
  1384  func (c *Client) watchAllocations(updates chan *allocUpdates) {
  1385  	// The request and response for getting the map of allocations that should
  1386  	// be running on the Node to their AllocModifyIndex which is incremented
  1387  	// when the allocation is updated by the servers.
  1388  	req := structs.NodeSpecificRequest{
  1389  		NodeID:   c.NodeID(),
  1390  		SecretID: c.secretNodeID(),
  1391  		QueryOptions: structs.QueryOptions{
  1392  			Region:     c.Region(),
  1393  			AllowStale: true,
  1394  		},
  1395  	}
  1396  	var resp structs.NodeClientAllocsResponse
  1397  
  1398  	// The request and response for pulling down the set of allocations that are
  1399  	// new, or updated server side.
  1400  	allocsReq := structs.AllocsGetRequest{
  1401  		QueryOptions: structs.QueryOptions{
  1402  			Region:     c.Region(),
  1403  			AllowStale: true,
  1404  		},
  1405  	}
  1406  	var allocsResp structs.AllocsGetResponse
  1407  
  1408  OUTER:
  1409  	for {
  1410  		// Get the allocation modify index map, blocking for updates. We will
  1411  		// use this to determine exactly what allocations need to be downloaded
  1412  		// in full.
  1413  		resp = structs.NodeClientAllocsResponse{}
  1414  		err := c.RPC("Node.GetClientAllocs", &req, &resp)
  1415  		if err != nil {
  1416  			// Shutdown often causes EOF errors, so check for shutdown first
  1417  			select {
  1418  			case <-c.shutdownCh:
  1419  				return
  1420  			default:
  1421  			}
  1422  
  1423  			// COMPAT: Remove in 0.6. This is to allow the case in which the
  1424  			// servers are not fully upgraded before the clients register. This
  1425  			// can cause the SecretID to be lost
  1426  			if strings.Contains(err.Error(), "node secret ID does not match") {
  1427  				c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err)
  1428  				c.retryRegisterNode()
  1429  			} else if err != noServersErr {
  1430  				c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err)
  1431  			}
  1432  			retry := c.retryIntv(getAllocRetryIntv)
  1433  			select {
  1434  			case <-c.serversDiscoveredCh:
  1435  				continue
  1436  			case <-time.After(retry):
  1437  				continue
  1438  			case <-c.shutdownCh:
  1439  				return
  1440  			}
  1441  		}
  1442  
  1443  		// Check for shutdown
  1444  		select {
  1445  		case <-c.shutdownCh:
  1446  			return
  1447  		default:
  1448  		}
  1449  
  1450  		// Filter all allocations whose AllocModifyIndex was not incremented.
  1451  		// These are the allocations who have either not been updated, or whose
  1452  		// updates are a result of the client sending an update for the alloc.
  1453  		// This lets us reduce the network traffic to the server as we don't
  1454  		// need to pull all the allocations.
  1455  		var pull []string
  1456  		filtered := make(map[string]struct{})
  1457  		runners := c.getAllocRunners()
  1458  		var pullIndex uint64
  1459  		for allocID, modifyIndex := range resp.Allocs {
  1460  			// Pull the allocation if we don't have an alloc runner for the
  1461  			// allocation or if the alloc runner requires an updated allocation.
  1462  			runner, ok := runners[allocID]
  1463  
  1464  			if !ok || runner.shouldUpdate(modifyIndex) {
  1465  				// Only pull allocs that are required. Filtered
  1466  				// allocs might be at a higher index, so ignore
  1467  				// it.
  1468  				if modifyIndex > pullIndex {
  1469  					pullIndex = modifyIndex
  1470  				}
  1471  				pull = append(pull, allocID)
  1472  			} else {
  1473  				filtered[allocID] = struct{}{}
  1474  			}
  1475  		}
  1476  
  1477  		// Pull the allocations that passed filtering.
  1478  		allocsResp.Allocs = nil
  1479  		var pulledAllocs map[string]*structs.Allocation
  1480  		if len(pull) != 0 {
  1481  			// Pull the allocations that need to be updated.
  1482  			allocsReq.AllocIDs = pull
  1483  			allocsReq.MinQueryIndex = pullIndex - 1
  1484  			allocsResp = structs.AllocsGetResponse{}
  1485  			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
  1486  				c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err)
  1487  				retry := c.retryIntv(getAllocRetryIntv)
  1488  				select {
  1489  				case <-c.serversDiscoveredCh:
  1490  					continue
  1491  				case <-time.After(retry):
  1492  					continue
  1493  				case <-c.shutdownCh:
  1494  					return
  1495  				}
  1496  			}
  1497  
  1498  			// Ensure that we received all the allocations we wanted
  1499  			pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs))
  1500  			for _, alloc := range allocsResp.Allocs {
  1501  				pulledAllocs[alloc.ID] = alloc
  1502  			}
  1503  
  1504  			for _, desiredID := range pull {
  1505  				if _, ok := pulledAllocs[desiredID]; !ok {
  1506  					// We didn't get everything we wanted. Do not update the
  1507  					// MinQueryIndex, sleep and then retry.
  1508  					wait := c.retryIntv(2 * time.Second)
  1509  					select {
  1510  					case <-time.After(wait):
  1511  						// Wait for the server we contact to receive the
  1512  						// allocations
  1513  						continue OUTER
  1514  					case <-c.shutdownCh:
  1515  						return
  1516  					}
  1517  				}
  1518  			}
  1519  
  1520  			// Check for shutdown
  1521  			select {
  1522  			case <-c.shutdownCh:
  1523  				return
  1524  			default:
  1525  			}
  1526  		}
  1527  
  1528  		c.logger.Printf("[DEBUG] client: updated allocations at index %d (total %d) (pulled %d) (filtered %d)",
  1529  			resp.Index, len(resp.Allocs), len(allocsResp.Allocs), len(filtered))
  1530  
  1531  		// Update the query index.
  1532  		if resp.Index > req.MinQueryIndex {
  1533  			req.MinQueryIndex = resp.Index
  1534  		}
  1535  
  1536  		// Push the updates.
  1537  		update := &allocUpdates{
  1538  			filtered:      filtered,
  1539  			pulled:        pulledAllocs,
  1540  			migrateTokens: resp.MigrateTokens,
  1541  		}
  1542  		select {
  1543  		case updates <- update:
  1544  		case <-c.shutdownCh:
  1545  			return
  1546  		}
  1547  	}
  1548  }
  1549  
  1550  // watchNodeUpdates periodically checks for changes to the node attributes or
  1551  // meta map. The passed hashes are the initial hash values for the attribute and
  1552  // metadata of the node respectively.
  1553  func (c *Client) watchNodeUpdates(attrHash, metaHash uint64) {
  1554  	c.logger.Printf("[DEBUG] client: periodically checking for node changes at duration %v", nodeUpdateRetryIntv)
  1555  
  1556  	var changed bool
  1557  	for {
  1558  		select {
  1559  		case <-time.After(c.retryIntv(nodeUpdateRetryIntv)):
  1560  			changed, attrHash, metaHash = c.hasNodeChanged(attrHash, metaHash)
  1561  			if changed {
  1562  				c.logger.Printf("[DEBUG] client: state changed, updating node.")
  1563  
  1564  				// Update the config copy.
  1565  				c.configLock.Lock()
  1566  				node := c.config.Node.Copy()
  1567  				c.configCopy.Node = node
  1568  				c.configLock.Unlock()
  1569  
  1570  				c.retryRegisterNode()
  1571  			}
  1572  		case <-c.shutdownCh:
  1573  			return
  1574  		}
  1575  	}
  1576  }
  1577  
  1578  // runAllocs is invoked when we get an updated set of allocations
  1579  func (c *Client) runAllocs(update *allocUpdates) {
  1580  	// Get the existing allocs
  1581  	c.allocLock.RLock()
  1582  	exist := make([]*structs.Allocation, 0, len(c.allocs))
  1583  	for _, ar := range c.allocs {
  1584  		exist = append(exist, ar.alloc)
  1585  	}
  1586  	c.allocLock.RUnlock()
  1587  
  1588  	// Diff the existing and updated allocations
  1589  	diff := diffAllocs(exist, update)
  1590  	c.logger.Printf("[DEBUG] client: %#v", diff)
  1591  
  1592  	// Remove the old allocations
  1593  	for _, remove := range diff.removed {
  1594  		c.removeAlloc(remove)
  1595  	}
  1596  
  1597  	// Update the existing allocations
  1598  	for _, update := range diff.updated {
  1599  		if err := c.updateAlloc(update.exist, update.updated); err != nil {
  1600  			c.logger.Printf("[ERR] client: failed to update alloc %q: %v",
  1601  				update.exist.ID, err)
  1602  		}
  1603  	}
  1604  
  1605  	// Make room for new allocations before running
  1606  	if err := c.garbageCollector.MakeRoomFor(diff.added); err != nil {
  1607  		c.logger.Printf("[ERR] client: error making room for new allocations: %v", err)
  1608  	}
  1609  
  1610  	// Start the new allocations
  1611  	for _, add := range diff.added {
  1612  		migrateToken := update.migrateTokens[add.ID]
  1613  		if err := c.addAlloc(add, migrateToken); err != nil {
  1614  			c.logger.Printf("[ERR] client: failed to add alloc '%s': %v",
  1615  				add.ID, err)
  1616  		}
  1617  	}
  1618  
  1619  	// Trigger the GC once more now that new allocs are started that could
  1620  	// have caused thesholds to be exceeded
  1621  	c.garbageCollector.Trigger()
  1622  }
  1623  
  1624  // removeAlloc is invoked when we should remove an allocation because it has
  1625  // been removed by the server.
  1626  func (c *Client) removeAlloc(alloc *structs.Allocation) {
  1627  	c.allocLock.Lock()
  1628  	ar, ok := c.allocs[alloc.ID]
  1629  	if !ok {
  1630  		c.allocLock.Unlock()
  1631  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID)
  1632  		return
  1633  	}
  1634  
  1635  	// Stop tracking alloc runner as it's been GC'd by the server
  1636  	delete(c.allocs, alloc.ID)
  1637  	c.allocLock.Unlock()
  1638  
  1639  	// Ensure the GC has a reference and then collect. Collecting through the GC
  1640  	// applies rate limiting
  1641  	c.garbageCollector.MarkForCollection(ar)
  1642  
  1643  	// GC immediately since the server has GC'd it
  1644  	go c.garbageCollector.Collect(alloc.ID)
  1645  }
  1646  
  1647  // updateAlloc is invoked when we should update an allocation
  1648  func (c *Client) updateAlloc(exist, update *structs.Allocation) error {
  1649  	c.allocLock.RLock()
  1650  	ar, ok := c.allocs[exist.ID]
  1651  	c.allocLock.RUnlock()
  1652  	if !ok {
  1653  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID)
  1654  		return nil
  1655  	}
  1656  
  1657  	ar.Update(update)
  1658  	return nil
  1659  }
  1660  
  1661  // addAlloc is invoked when we should add an allocation
  1662  func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error {
  1663  	// Check if we already have an alloc runner
  1664  	c.allocLock.Lock()
  1665  	defer c.allocLock.Unlock()
  1666  	if _, ok := c.allocs[alloc.ID]; ok {
  1667  		c.logger.Printf("[DEBUG]: client: dropping duplicate add allocation request: %q", alloc.ID)
  1668  		return nil
  1669  	}
  1670  
  1671  	// get the previous alloc runner - if one exists - for the
  1672  	// blocking/migrating watcher
  1673  	var prevAR *AllocRunner
  1674  	if alloc.PreviousAllocation != "" {
  1675  		prevAR = c.allocs[alloc.PreviousAllocation]
  1676  	}
  1677  
  1678  	c.configLock.RLock()
  1679  	prevAlloc := newAllocWatcher(alloc, prevAR, c, c.configCopy, c.logger, migrateToken)
  1680  
  1681  	ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, prevAlloc)
  1682  	c.configLock.RUnlock()
  1683  
  1684  	// Store the alloc runner.
  1685  	c.allocs[alloc.ID] = ar
  1686  
  1687  	if err := ar.SaveState(); err != nil {
  1688  		c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", alloc.ID, err)
  1689  	}
  1690  
  1691  	go ar.Run()
  1692  	return nil
  1693  }
  1694  
  1695  // setupVaultClient creates an object to periodically renew tokens and secrets
  1696  // with vault.
  1697  func (c *Client) setupVaultClient() error {
  1698  	var err error
  1699  	c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
  1700  	if err != nil {
  1701  		return err
  1702  	}
  1703  
  1704  	if c.vaultClient == nil {
  1705  		c.logger.Printf("[ERR] client: failed to create vault client")
  1706  		return fmt.Errorf("failed to create vault client")
  1707  	}
  1708  
  1709  	// Start renewing tokens and secrets
  1710  	c.vaultClient.Start()
  1711  
  1712  	return nil
  1713  }
  1714  
  1715  // deriveToken takes in an allocation and a set of tasks and derives vault
  1716  // tokens for each of the tasks, unwraps all of them using the supplied vault
  1717  // client and returns a map of unwrapped tokens, indexed by the task name.
  1718  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  1719  	if alloc == nil {
  1720  		return nil, fmt.Errorf("nil allocation")
  1721  	}
  1722  
  1723  	if taskNames == nil || len(taskNames) == 0 {
  1724  		return nil, fmt.Errorf("missing task names")
  1725  	}
  1726  
  1727  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1728  	if group == nil {
  1729  		return nil, fmt.Errorf("group name in allocation is not present in job")
  1730  	}
  1731  
  1732  	verifiedTasks := []string{}
  1733  	// Check if the given task names actually exist in the allocation
  1734  	for _, taskName := range taskNames {
  1735  		found := false
  1736  		for _, task := range group.Tasks {
  1737  			if task.Name == taskName {
  1738  				found = true
  1739  			}
  1740  		}
  1741  		if !found {
  1742  			c.logger.Printf("[ERR] task %q not found in the allocation", taskName)
  1743  			return nil, fmt.Errorf("task %q not found in the allocaition", taskName)
  1744  		}
  1745  		verifiedTasks = append(verifiedTasks, taskName)
  1746  	}
  1747  
  1748  	// DeriveVaultToken of nomad server can take in a set of tasks and
  1749  	// creates tokens for all the tasks.
  1750  	req := &structs.DeriveVaultTokenRequest{
  1751  		NodeID:   c.NodeID(),
  1752  		SecretID: c.secretNodeID(),
  1753  		AllocID:  alloc.ID,
  1754  		Tasks:    verifiedTasks,
  1755  		QueryOptions: structs.QueryOptions{
  1756  			Region:     c.Region(),
  1757  			AllowStale: false,
  1758  		},
  1759  	}
  1760  
  1761  	// Derive the tokens
  1762  	var resp structs.DeriveVaultTokenResponse
  1763  	if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil {
  1764  		c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err)
  1765  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  1766  	}
  1767  	if resp.Error != nil {
  1768  		c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error)
  1769  		return nil, resp.Error
  1770  	}
  1771  	if resp.Tasks == nil {
  1772  		c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response")
  1773  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  1774  	}
  1775  
  1776  	unwrappedTokens := make(map[string]string)
  1777  
  1778  	// Retrieve the wrapped tokens from the response and unwrap it
  1779  	for _, taskName := range verifiedTasks {
  1780  		// Get the wrapped token
  1781  		wrappedToken, ok := resp.Tasks[taskName]
  1782  		if !ok {
  1783  			c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName)
  1784  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  1785  		}
  1786  
  1787  		// Unwrap the vault token
  1788  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  1789  		if err != nil {
  1790  			return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err)
  1791  		}
  1792  		if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" {
  1793  			return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName)
  1794  		}
  1795  
  1796  		// Append the unwrapped token to the return value
  1797  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  1798  	}
  1799  
  1800  	return unwrappedTokens, nil
  1801  }
  1802  
  1803  // triggerDiscovery causes a Consul discovery to begin (if one hasn't alread)
  1804  func (c *Client) triggerDiscovery() {
  1805  	select {
  1806  	case c.triggerDiscoveryCh <- struct{}{}:
  1807  		// Discovery goroutine was released to execute
  1808  	default:
  1809  		// Discovery goroutine was already running
  1810  	}
  1811  }
  1812  
  1813  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  1814  // It's intended to be started in a goroutine. See triggerDiscovery() for
  1815  // causing consul discovery from other code locations.
  1816  func (c *Client) consulDiscovery() {
  1817  	for {
  1818  		select {
  1819  		case <-c.triggerDiscoveryCh:
  1820  			if err := c.consulDiscoveryImpl(); err != nil {
  1821  				c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err)
  1822  			}
  1823  		case <-c.shutdownCh:
  1824  			return
  1825  		}
  1826  	}
  1827  }
  1828  
  1829  func (c *Client) consulDiscoveryImpl() error {
  1830  	// Acquire heartbeat lock to prevent heartbeat from running
  1831  	// concurrently with discovery. Concurrent execution is safe, however
  1832  	// discovery is usually triggered when heartbeating has failed so
  1833  	// there's no point in allowing it.
  1834  	c.heartbeatLock.Lock()
  1835  	defer c.heartbeatLock.Unlock()
  1836  
  1837  	dcs, err := c.consulCatalog.Datacenters()
  1838  	if err != nil {
  1839  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  1840  	}
  1841  	if len(dcs) > 2 {
  1842  		// Query the local DC first, then shuffle the
  1843  		// remaining DCs.  Future heartbeats will cause Nomad
  1844  		// Clients to fixate on their local datacenter so
  1845  		// it's okay to talk with remote DCs.  If the no
  1846  		// Nomad servers are available within
  1847  		// datacenterQueryLimit, the next heartbeat will pick
  1848  		// a new set of servers so it's okay.
  1849  		shuffleStrings(dcs[1:])
  1850  		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
  1851  	}
  1852  
  1853  	// Query for servers in this client's region only
  1854  	region := c.Region()
  1855  	rpcargs := structs.GenericRequest{
  1856  		QueryOptions: structs.QueryOptions{
  1857  			Region: region,
  1858  		},
  1859  	}
  1860  
  1861  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  1862  	var mErr multierror.Error
  1863  	var servers endpoints
  1864  	c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs)
  1865  DISCOLOOP:
  1866  	for _, dc := range dcs {
  1867  		consulOpts := &consulapi.QueryOptions{
  1868  			AllowStale: true,
  1869  			Datacenter: dc,
  1870  			Near:       "_agent",
  1871  			WaitTime:   consul.DefaultQueryWaitDuration,
  1872  		}
  1873  		consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  1874  		if err != nil {
  1875  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  1876  			continue
  1877  		}
  1878  
  1879  		for _, s := range consulServices {
  1880  			port := strconv.Itoa(s.ServicePort)
  1881  			addrstr := s.ServiceAddress
  1882  			if addrstr == "" {
  1883  				addrstr = s.Address
  1884  			}
  1885  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  1886  			if err != nil {
  1887  				mErr.Errors = append(mErr.Errors, err)
  1888  				continue
  1889  			}
  1890  			var peers []string
  1891  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  1892  				mErr.Errors = append(mErr.Errors, err)
  1893  				continue
  1894  			}
  1895  
  1896  			// Successfully received the Server peers list of the correct
  1897  			// region
  1898  			for _, p := range peers {
  1899  				addr, err := net.ResolveTCPAddr("tcp", p)
  1900  				if err != nil {
  1901  					mErr.Errors = append(mErr.Errors, err)
  1902  				}
  1903  				servers = append(servers, &endpoint{name: p, addr: addr})
  1904  			}
  1905  			if len(servers) > 0 {
  1906  				break DISCOLOOP
  1907  			}
  1908  		}
  1909  	}
  1910  	if len(servers) == 0 {
  1911  		if len(mErr.Errors) > 0 {
  1912  			return mErr.ErrorOrNil()
  1913  		}
  1914  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  1915  	}
  1916  
  1917  	c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", servers)
  1918  	c.servers.set(servers)
  1919  
  1920  	// Notify waiting rpc calls. If a goroutine just failed an RPC call and
  1921  	// isn't receiving on this chan yet they'll still retry eventually.
  1922  	// This is a shortcircuit for the longer retry intervals.
  1923  	for {
  1924  		select {
  1925  		case c.serversDiscoveredCh <- struct{}{}:
  1926  		default:
  1927  			return nil
  1928  		}
  1929  	}
  1930  }
  1931  
  1932  // emitStats collects host resource usage stats periodically
  1933  func (c *Client) emitStats() {
  1934  	// Assign labels directly before emitting stats so the information expected
  1935  	// is ready
  1936  	c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}}
  1937  
  1938  	// Start collecting host stats right away and then keep collecting every
  1939  	// collection interval
  1940  	next := time.NewTimer(0)
  1941  	defer next.Stop()
  1942  	for {
  1943  		select {
  1944  		case <-next.C:
  1945  			err := c.hostStatsCollector.Collect()
  1946  			next.Reset(c.config.StatsCollectionInterval)
  1947  			if err != nil {
  1948  				c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err)
  1949  				continue
  1950  			}
  1951  
  1952  			// Publish Node metrics if operator has opted in
  1953  			if c.config.PublishNodeMetrics {
  1954  				c.emitHostStats()
  1955  			}
  1956  
  1957  			c.emitClientMetrics()
  1958  		case <-c.shutdownCh:
  1959  			return
  1960  		}
  1961  	}
  1962  }
  1963  
  1964  // setGaugeForMemoryStats proxies metrics for memory specific statistics
  1965  func (c *Client) setGaugeForMemoryStats(nodeID string, hStats *stats.HostStats) {
  1966  	if !c.config.DisableTaggedMetrics {
  1967  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "total"}, float32(hStats.Memory.Total), c.baseLabels)
  1968  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "available"}, float32(hStats.Memory.Available), c.baseLabels)
  1969  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "used"}, float32(hStats.Memory.Used), c.baseLabels)
  1970  		metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "free"}, float32(hStats.Memory.Free), c.baseLabels)
  1971  	}
  1972  
  1973  	if c.config.BackwardsCompatibleMetrics {
  1974  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  1975  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  1976  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  1977  		metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  1978  	}
  1979  }
  1980  
  1981  // setGaugeForCPUStats proxies metrics for CPU specific statistics
  1982  func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats) {
  1983  	for _, cpu := range hStats.CPU {
  1984  		if !c.config.DisableTaggedMetrics {
  1985  			labels := append(c.baseLabels, metrics.Label{
  1986  				Name:  "cpu",
  1987  				Value: cpu.CPU,
  1988  			})
  1989  
  1990  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels)
  1991  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels)
  1992  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels)
  1993  			metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels)
  1994  		}
  1995  
  1996  		if c.config.BackwardsCompatibleMetrics {
  1997  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  1998  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  1999  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2000  			metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2001  		}
  2002  	}
  2003  }
  2004  
  2005  // setGaugeForDiskStats proxies metrics for disk specific statistics
  2006  func (c *Client) setGaugeForDiskStats(nodeID string, hStats *stats.HostStats) {
  2007  	for _, disk := range hStats.DiskStats {
  2008  		if !c.config.DisableTaggedMetrics {
  2009  			labels := append(c.baseLabels, metrics.Label{
  2010  				Name:  "disk",
  2011  				Value: disk.Device,
  2012  			})
  2013  
  2014  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "size"}, float32(disk.Size), labels)
  2015  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used"}, float32(disk.Used), labels)
  2016  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "available"}, float32(disk.Available), labels)
  2017  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used_percent"}, float32(disk.UsedPercent), labels)
  2018  			metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "inodes_percent"}, float32(disk.InodesUsedPercent), labels)
  2019  		}
  2020  
  2021  		if c.config.BackwardsCompatibleMetrics {
  2022  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2023  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2024  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2025  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2026  			metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2027  		}
  2028  	}
  2029  }
  2030  
  2031  // setGaugeForAllocationStats proxies metrics for allocation specific statistics
  2032  func (c *Client) setGaugeForAllocationStats(nodeID string) {
  2033  	c.configLock.RLock()
  2034  	node := c.configCopy.Node
  2035  	c.configLock.RUnlock()
  2036  	total := node.Resources
  2037  	res := node.Reserved
  2038  	allocated := c.getAllocatedResources(node)
  2039  
  2040  	// Emit allocated
  2041  	if !c.config.DisableTaggedMetrics {
  2042  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "memory"}, float32(allocated.MemoryMB), c.baseLabels)
  2043  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "disk"}, float32(allocated.DiskMB), c.baseLabels)
  2044  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "cpu"}, float32(allocated.CPU), c.baseLabels)
  2045  		metrics.SetGaugeWithLabels([]string{"client", "allocated", "iops"}, float32(allocated.IOPS), c.baseLabels)
  2046  	}
  2047  
  2048  	if c.config.BackwardsCompatibleMetrics {
  2049  		metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.MemoryMB))
  2050  		metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.DiskMB))
  2051  		metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.CPU))
  2052  		metrics.SetGauge([]string{"client", "allocated", "iops", nodeID}, float32(allocated.IOPS))
  2053  	}
  2054  
  2055  	for _, n := range allocated.Networks {
  2056  		if !c.config.DisableTaggedMetrics {
  2057  			labels := append(c.baseLabels, metrics.Label{
  2058  				Name:  "device",
  2059  				Value: n.Device,
  2060  			})
  2061  			metrics.SetGaugeWithLabels([]string{"client", "allocated", "network"}, float32(n.MBits), labels)
  2062  		}
  2063  
  2064  		if c.config.BackwardsCompatibleMetrics {
  2065  			metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits))
  2066  		}
  2067  	}
  2068  
  2069  	// Emit unallocated
  2070  	unallocatedMem := total.MemoryMB - res.MemoryMB - allocated.MemoryMB
  2071  	unallocatedDisk := total.DiskMB - res.DiskMB - allocated.DiskMB
  2072  	unallocatedCpu := total.CPU - res.CPU - allocated.CPU
  2073  	unallocatedIops := total.IOPS - res.IOPS - allocated.IOPS
  2074  
  2075  	if !c.config.DisableTaggedMetrics {
  2076  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), c.baseLabels)
  2077  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), c.baseLabels)
  2078  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), c.baseLabels)
  2079  		metrics.SetGaugeWithLabels([]string{"client", "unallocated", "iops"}, float32(unallocatedIops), c.baseLabels)
  2080  	}
  2081  
  2082  	if c.config.BackwardsCompatibleMetrics {
  2083  		metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem))
  2084  		metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk))
  2085  		metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu))
  2086  		metrics.SetGauge([]string{"client", "unallocated", "iops", nodeID}, float32(unallocatedIops))
  2087  	}
  2088  
  2089  	for _, n := range allocated.Networks {
  2090  		totalIdx := total.NetIndex(n)
  2091  		if totalIdx != -1 {
  2092  			continue
  2093  		}
  2094  
  2095  		totalMbits := total.Networks[totalIdx].MBits
  2096  		unallocatedMbits := totalMbits - n.MBits
  2097  
  2098  		if !c.config.DisableTaggedMetrics {
  2099  			labels := append(c.baseLabels, metrics.Label{
  2100  				Name:  "device",
  2101  				Value: n.Device,
  2102  			})
  2103  			metrics.SetGaugeWithLabels([]string{"client", "unallocated", "network"}, float32(unallocatedMbits), labels)
  2104  		}
  2105  
  2106  		if c.config.BackwardsCompatibleMetrics {
  2107  			metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits))
  2108  		}
  2109  	}
  2110  }
  2111  
  2112  // No lables are required so we emit with only a key/value syntax
  2113  func (c *Client) setGaugeForUptime(hStats *stats.HostStats) {
  2114  	if !c.config.DisableTaggedMetrics {
  2115  		metrics.SetGaugeWithLabels([]string{"uptime"}, float32(hStats.Uptime), c.baseLabels)
  2116  	}
  2117  	if c.config.BackwardsCompatibleMetrics {
  2118  		metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime))
  2119  	}
  2120  }
  2121  
  2122  // emitHostStats pushes host resource usage stats to remote metrics collection sinks
  2123  func (c *Client) emitHostStats() {
  2124  	nodeID := c.NodeID()
  2125  	hStats := c.hostStatsCollector.Stats()
  2126  
  2127  	c.setGaugeForMemoryStats(nodeID, hStats)
  2128  	c.setGaugeForUptime(hStats)
  2129  	c.setGaugeForCPUStats(nodeID, hStats)
  2130  	c.setGaugeForDiskStats(nodeID, hStats)
  2131  }
  2132  
  2133  // emitClientMetrics emits lower volume client metrics
  2134  func (c *Client) emitClientMetrics() {
  2135  	nodeID := c.NodeID()
  2136  
  2137  	c.setGaugeForAllocationStats(nodeID)
  2138  
  2139  	// Emit allocation metrics
  2140  	blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
  2141  	for _, ar := range c.getAllocRunners() {
  2142  		switch ar.Alloc().ClientStatus {
  2143  		case structs.AllocClientStatusPending:
  2144  			switch {
  2145  			case ar.IsWaiting():
  2146  				blocked++
  2147  			case ar.IsMigrating():
  2148  				migrating++
  2149  			default:
  2150  				pending++
  2151  			}
  2152  		case structs.AllocClientStatusRunning:
  2153  			running++
  2154  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  2155  			terminal++
  2156  		}
  2157  	}
  2158  
  2159  	if !c.config.DisableTaggedMetrics {
  2160  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "migrating"}, float32(migrating), c.baseLabels)
  2161  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "blocked"}, float32(blocked), c.baseLabels)
  2162  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "pending"}, float32(pending), c.baseLabels)
  2163  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "running"}, float32(running), c.baseLabels)
  2164  		metrics.SetGaugeWithLabels([]string{"client", "allocations", "terminal"}, float32(terminal), c.baseLabels)
  2165  	}
  2166  
  2167  	if c.config.BackwardsCompatibleMetrics {
  2168  		metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating))
  2169  		metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked))
  2170  		metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending))
  2171  		metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running))
  2172  		metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal))
  2173  	}
  2174  }
  2175  
  2176  func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resources {
  2177  	// Unfortunately the allocs only have IP so we need to match them to the
  2178  	// device
  2179  	cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks))
  2180  	for _, n := range selfNode.Resources.Networks {
  2181  		_, ipnet, err := net.ParseCIDR(n.CIDR)
  2182  		if err != nil {
  2183  			continue
  2184  		}
  2185  		cidrToDevice[ipnet] = n.Device
  2186  	}
  2187  
  2188  	// Sum the allocated resources
  2189  	allocs := c.allAllocs()
  2190  	var allocated structs.Resources
  2191  	allocatedDeviceMbits := make(map[string]int)
  2192  	for _, alloc := range allocs {
  2193  		if !alloc.TerminalStatus() {
  2194  			allocated.Add(alloc.Resources)
  2195  			for _, allocatedNetwork := range alloc.Resources.Networks {
  2196  				for cidr, dev := range cidrToDevice {
  2197  					ip := net.ParseIP(allocatedNetwork.IP)
  2198  					if cidr.Contains(ip) {
  2199  						allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  2200  						break
  2201  					}
  2202  				}
  2203  			}
  2204  		}
  2205  	}
  2206  
  2207  	// Clear the networks
  2208  	allocated.Networks = nil
  2209  	for dev, speed := range allocatedDeviceMbits {
  2210  		net := &structs.NetworkResource{
  2211  			Device: dev,
  2212  			MBits:  speed,
  2213  		}
  2214  		allocated.Networks = append(allocated.Networks, net)
  2215  	}
  2216  
  2217  	return &allocated
  2218  }
  2219  
  2220  // allAllocs returns all the allocations managed by the client
  2221  func (c *Client) allAllocs() map[string]*structs.Allocation {
  2222  	ars := c.getAllocRunners()
  2223  	allocs := make(map[string]*structs.Allocation, len(ars))
  2224  	for _, ar := range c.getAllocRunners() {
  2225  		a := ar.Alloc()
  2226  		allocs[a.ID] = a
  2227  	}
  2228  	return allocs
  2229  }
  2230  
  2231  // resolveServer given a sever's address as a string, return it's resolved
  2232  // net.Addr or an error.
  2233  func resolveServer(s string) (net.Addr, error) {
  2234  	const defaultClientPort = "4647" // default client RPC port
  2235  	host, port, err := net.SplitHostPort(s)
  2236  	if err != nil {
  2237  		if strings.Contains(err.Error(), "missing port") {
  2238  			host = s
  2239  			port = defaultClientPort
  2240  		} else {
  2241  			return nil, err
  2242  		}
  2243  	}
  2244  	return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
  2245  }