github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/client/client.go

     1  package client
     2  
     3  import (
     4  	"archive/tar"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"io/ioutil"
     9  	"log"
    10  	"net"
    11  	"os"
    12  	"path/filepath"
    13  	"strconv"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/armon/go-metrics"
    19  	consulapi "github.com/hashicorp/consul/api"
    20  	"github.com/hashicorp/consul/lib"
    21  	"github.com/hashicorp/go-multierror"
    22  	nomadapi "github.com/hashicorp/nomad/api"
    23  	"github.com/hashicorp/nomad/client/allocdir"
    24  	"github.com/hashicorp/nomad/client/config"
    25  	"github.com/hashicorp/nomad/client/driver"
    26  	"github.com/hashicorp/nomad/client/fingerprint"
    27  	"github.com/hashicorp/nomad/client/stats"
    28  	"github.com/hashicorp/nomad/client/vaultclient"
    29  	"github.com/hashicorp/nomad/command/agent/consul"
    30  	"github.com/hashicorp/nomad/helper/tlsutil"
    31  	"github.com/hashicorp/nomad/nomad"
    32  	"github.com/hashicorp/nomad/nomad/structs"
    33  	vaultapi "github.com/hashicorp/vault/api"
    34  	"github.com/mitchellh/hashstructure"
    35  )
    36  
    37  const (
    38  	// clientRPCCache controls how long we keep an idle connection
    39  	// open to a server
    40  	clientRPCCache = 5 * time.Minute
    41  
    42  	// clientMaxStreams controls how many idle streams we keep
    43  	// open to a server
    44  	clientMaxStreams = 2
    45  
    46  	// datacenterQueryLimit is the maximum number of adjacent datacenters
    47  	// searched when looking for the Nomad server service.
    48  	datacenterQueryLimit = 9
    49  
    50  	// consulReaperIntv is the interval at which the Consul reaper will
    51  	// run.
    52  	consulReaperIntv = 5 * time.Second
    53  
    54  	// registerRetryIntv is the minimum interval at which we retry
    55  	// registration. We pick a value between this and 2x this.
    56  	registerRetryIntv = 15 * time.Second
    57  
    58  	// getAllocRetryIntv is the minimum interval at which we retry
    59  	// fetching allocations. We pick a value between this and 2x this.
    60  	getAllocRetryIntv = 30 * time.Second
    61  
    62  	// devModeRetryIntv is the retry interval used for development
    63  	devModeRetryIntv = time.Second
    64  
    65  	// stateSnapshotIntv is how often the client snapshots state
    66  	stateSnapshotIntv = 60 * time.Second
    67  
    68  	// initialHeartbeatStagger is used to stagger the interval between
    69  	// starting and the initial heartbeat. After the initial heartbeat,
    70  	// we switch to using the TTL specified by the servers.
    71  	initialHeartbeatStagger = 10 * time.Second
    72  
    73  	// nodeUpdateRetryIntv is how often the client checks for updates to the
    74  	// node attributes or meta map.
    75  	nodeUpdateRetryIntv = 5 * time.Second
    76  
    77  	// allocSyncIntv is the batching period of allocation updates before they
    78  	// are synced with the server.
    79  	allocSyncIntv = 200 * time.Millisecond
    80  
    81  	// allocSyncRetryIntv is the interval on which we retry updating
    82  	// the status of the allocation
    83  	allocSyncRetryIntv = 5 * time.Second
    84  )
    85  
    86  // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
    87  // Client
    88  type ClientStatsReporter interface {
    89  	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
    90  	// If it does not exist an error is reported.
    91  	GetAllocStats(allocID string) (AllocStatsReporter, error)
    92  
    93  	// LatestHostStats returns the latest resource usage stats for the host
    94  	LatestHostStats() *stats.HostStats
    95  }
    96  
    97  // Client is used to implement the client interaction with Nomad. Clients
    98  // are expected to register as a schedulable node to the servers, and to
    99  // run allocations as determined by the servers.
   100  type Client struct {
   101  	config *config.Config
   102  	start  time.Time
   103  
   104  	// configCopy is a copy that should be passed to alloc-runners.
   105  	configCopy *config.Config
   106  	configLock sync.RWMutex
   107  
   108  	logger *log.Logger
   109  
   110  	connPool *nomad.ConnPool
   111  
   112  	// servers is the (optionally prioritized) list of nomad servers
   113  	servers *serverlist
   114  
   115  	// heartbeat related times for tracking how often to heartbeat
   116  	lastHeartbeat time.Time
   117  	heartbeatTTL  time.Duration
   118  	heartbeatLock sync.Mutex
   119  
   120  	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
   121  	triggerDiscoveryCh chan struct{}
   122  
   123  	// serversDiscoveredCh is ticked whenever Consul discovery completes
   124  	// successfully
   125  	serversDiscoveredCh chan struct{}
   126  
   127  	// allocs is the current set of allocations
   128  	allocs    map[string]*AllocRunner
   129  	allocLock sync.RWMutex
   130  
   131  	// blockedAllocations are allocations which are blocked because their
   132  	// chained allocations haven't finished running
   133  	blockedAllocations map[string]*structs.Allocation
   134  	blockedAllocsLock  sync.RWMutex
   135  
   136  	// allocUpdates stores allocations that need to be synced to the server.
   137  	allocUpdates chan *structs.Allocation
   138  
   139  	// consulSyncer advertises this Nomad Agent with Consul
   140  	consulSyncer *consul.Syncer
   141  
   142  	// hostStatsCollector collects host resource usage stats
   143  	hostStatsCollector *stats.HostStatsCollector
   144  	resourceUsage      *stats.HostStats
   145  	resourceUsageLock  sync.RWMutex
   146  
   147  	shutdown     bool
   148  	shutdownCh   chan struct{}
   149  	shutdownLock sync.Mutex
   150  
   151  	// vaultClient is used to interact with Vault for token and secret renewals
   152  	vaultClient vaultclient.VaultClient
   153  
   154  	// migratingAllocs is the set of allocs whose data migration is in flight
   155  	migratingAllocs     map[string]chan struct{}
   156  	migratingAllocsLock sync.Mutex
   157  }
   158  
   159  var (
   160  	// noServersErr is returned by the RPC method when the client has no
   161  	// configured servers. This is used to trigger Consul discovery if
   162  	// enabled.
   163  	noServersErr = errors.New("no servers")
   164  )
   165  
   166  // NewClient is used to create a new client from the given configuration
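        // A minimal usage sketch (hypothetical caller; assumes a prepared
        // *config.Config, a running consul.Syncer, and a *log.Logger):
        //
        //	c, err := NewClient(cfg, consulSyncer, logger)
        //	if err != nil {
        //		log.Fatalf("client setup failed: %v", err)
        //	}
        //	defer c.Shutdown()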
   167  func NewClient(cfg *config.Config, consulSyncer *consul.Syncer, logger *log.Logger) (*Client, error) {
   168  	// Create the tls wrapper
   169  	var tlsWrap tlsutil.RegionWrapper
   170  	if cfg.TLSConfig.EnableRPC {
   171  		tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper()
   172  		if err != nil {
   173  			return nil, err
   174  		}
   175  		tlsWrap = tw
   176  	}
   177  
   178  	// Create the client
   179  	c := &Client{
   180  		config:              cfg,
   181  		consulSyncer:        consulSyncer,
   182  		start:               time.Now(),
   183  		connPool:            nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap),
   184  		logger:              logger,
   185  		hostStatsCollector:  stats.NewHostStatsCollector(),
   186  		allocs:              make(map[string]*AllocRunner),
   187  		blockedAllocations:  make(map[string]*structs.Allocation),
   188  		allocUpdates:        make(chan *structs.Allocation, 64),
   189  		shutdownCh:          make(chan struct{}),
   190  		migratingAllocs:     make(map[string]chan struct{}),
   191  		servers:             newServerList(),
   192  		triggerDiscoveryCh:  make(chan struct{}),
   193  		serversDiscoveredCh: make(chan struct{}),
   194  	}
   195  
   196  	// Initialize the client
   197  	if err := c.init(); err != nil {
   198  		return nil, fmt.Errorf("failed to initialize client: %v", err)
   199  	}
   200  
   201  	// Setup the node
   202  	if err := c.setupNode(); err != nil {
   203  		return nil, fmt.Errorf("node setup failed: %v", err)
   204  	}
   205  
   206  	// Fingerprint the node
   207  	if err := c.fingerprint(); err != nil {
   208  		return nil, fmt.Errorf("fingerprinting failed: %v", err)
   209  	}
   210  
   211  	// Scan for drivers
   212  	if err := c.setupDrivers(); err != nil {
   213  		return nil, fmt.Errorf("driver setup failed: %v", err)
   214  	}
   215  
   216  	// Setup the reserved resources
   217  	c.reservePorts()
   218  
   219  	// Store the config copy before restoring state but after it has been
   220  	// initialized.
   221  	c.configLock.Lock()
   222  	c.configCopy = c.config.Copy()
   223  	c.configLock.Unlock()
   224  
   225  	// Set the preconfigured list of static servers
   226  	c.configLock.RLock()
   227  	if len(c.configCopy.Servers) > 0 {
   228  		if err := c.SetServers(c.configCopy.Servers); err != nil {
   229  			logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
   230  		}
   231  	}
   232  	c.configLock.RUnlock()
   233  
   234  	// Setup Consul discovery if enabled
   235  	if c.configCopy.ConsulConfig.ClientAutoJoin {
   236  		go c.consulDiscovery()
   237  		if len(c.servers.all()) == 0 {
   238  			// No configured servers; trigger discovery manually
   239  			c.triggerDiscoveryCh <- struct{}{}
   240  		}
   241  	}
   242  
   243  	// Start Consul reaper
   244  	go c.consulReaper()
   245  
   246  	// Setup the vault client for token and secret renewals
   247  	if err := c.setupVaultClient(); err != nil {
   248  		return nil, fmt.Errorf("failed to setup vault client: %v", err)
   249  	}
   250  
   251  	// Restore the state
   252  	if err := c.restoreState(); err != nil {
   253  		return nil, fmt.Errorf("failed to restore state: %v", err)
   254  	}
   255  
   256  	// Register and then start heartbeating to the servers.
   257  	go c.registerAndHeartbeat()
   258  
   259  	// Begin periodic snapshotting of state.
   260  	go c.periodicSnapshot()
   261  
   262  	// Begin syncing allocations to the server
   263  	go c.allocSync()
   264  
   265  	// Start the client!
   266  	go c.run()
   267  
   268  	// Start collecting stats
   269  	go c.collectHostStats()
   270  
   271  	c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID)
   272  	return c, nil
   273  }
   274  
   275  // init is used to initialize the client and perform any setup
   276  // needed before we begin starting its various components.
   277  func (c *Client) init() error {
   278  	// Ensure the state dir exists if we have one
   279  	if c.config.StateDir != "" {
   280  		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
   281  			return fmt.Errorf("failed creating state dir: %s", err)
   282  		}
   283  
   284  	} else {
   285  		// Otherwise make a temp directory to use.
   286  		p, err := ioutil.TempDir("", "NomadClient")
   287  		if err != nil {
   288  			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
   289  		}
   290  
   291  		p, err = filepath.EvalSymlinks(p)
   292  		if err != nil {
   293  			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
   294  		}
   295  
   296  		c.config.StateDir = p
   297  	}
   298  	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)
   299  
   300  	// Ensure the alloc dir exists if we have one
   301  	if c.config.AllocDir != "" {
   302  		if err := os.MkdirAll(c.config.AllocDir, 0755); err != nil {
   303  			return fmt.Errorf("failed creating alloc dir: %s", err)
   304  		}
   305  	} else {
   306  		// Otherwise make a temp directory to use.
   307  		p, err := ioutil.TempDir("", "NomadClient")
   308  		if err != nil {
   309  			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
   310  		}
   311  
   312  		p, err = filepath.EvalSymlinks(p)
   313  		if err != nil {
   314  			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
   315  		}
   316  
   317  		c.config.AllocDir = p
   318  	}
   319  
   320  	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
   321  	return nil
   322  }
   323  
   324  // Leave is used to prepare the client to leave the cluster
   325  func (c *Client) Leave() error {
   326  	// TODO
   327  	return nil
   328  }
   329  
   330  // Datacenter returns the datacenter for the given client
   331  func (c *Client) Datacenter() string {
   332  	c.configLock.RLock()
   333  	dc := c.configCopy.Node.Datacenter
   334  	c.configLock.RUnlock()
   335  	return dc
   336  }
   337  
   338  // Region returns the region for the given client
   339  func (c *Client) Region() string {
   340  	return c.config.Region
   341  }
   342  
   343  // RPCMajorVersion returns the structs.ApiMajorVersion supported by the
   344  // client.
   345  func (c *Client) RPCMajorVersion() int {
   346  	return structs.ApiMajorVersion
   347  }
   348  
   349  // RPCMinorVersion returns the structs.ApiMinorVersion supported by the
   350  // client.
   351  func (c *Client) RPCMinorVersion() int {
   352  	return structs.ApiMinorVersion
   353  }
   354  
   355  // Shutdown is used to tear down the client
   356  func (c *Client) Shutdown() error {
   357  	c.logger.Printf("[INFO] client: shutting down")
   358  	c.shutdownLock.Lock()
   359  	defer c.shutdownLock.Unlock()
   360  
   361  	if c.shutdown {
   362  		return nil
   363  	}
   364  
   365  	// Stop renewing tokens and secrets
   366  	if c.vaultClient != nil {
   367  		c.vaultClient.Stop()
   368  	}
   369  
   370  	// Destroy all the running allocations.
   371  	if c.config.DevMode {
   372  		c.allocLock.Lock()
   373  		for _, ar := range c.allocs {
   374  			ar.Destroy()
   375  			<-ar.WaitCh()
   376  		}
   377  		c.allocLock.Unlock()
   378  	}
   379  
   380  	c.shutdown = true
   381  	close(c.shutdownCh)
   382  	c.connPool.Shutdown()
   383  	return c.saveState()
   384  }
   385  
   386  // RPC is used to forward an RPC call to a nomad server, or fail if no servers.
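        // For example, registerNode below calls c.RPC("Node.Register", &req, &resp),
        // trying each known server via the connection pool until one succeeds.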
   387  func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
   388  	// Invoke the RPCHandler if it exists
   389  	if c.config.RPCHandler != nil {
   390  		return c.config.RPCHandler.RPC(method, args, reply)
   391  	}
   392  
   393  	servers := c.servers.all()
   394  	if len(servers) == 0 {
   395  		return noServersErr
   396  	}
   397  
   398  	var mErr multierror.Error
   399  	for _, s := range servers {
   400  		// Make the RPC request
   401  		if err := c.connPool.RPC(c.Region(), s.addr, c.RPCMajorVersion(), method, args, reply); err != nil {
   402  			errmsg := fmt.Errorf("RPC failed to server %s: %v", s.addr, err)
   403  			mErr.Errors = append(mErr.Errors, errmsg)
   404  			c.logger.Printf("[DEBUG] client: %v", errmsg)
   405  			c.servers.failed(s)
   406  			continue
   407  		}
   408  		c.servers.good(s)
   409  		return nil
   410  	}
   411  
   412  	return mErr.ErrorOrNil()
   413  }
   414  
   415  // Stats is used to return statistics for debugging and insight
   416  // for various sub-systems
   417  func (c *Client) Stats() map[string]map[string]string {
   418  	c.allocLock.RLock()
   419  	numAllocs := len(c.allocs)
   420  	c.allocLock.RUnlock()
   421  
   422  	c.heartbeatLock.Lock()
   423  	defer c.heartbeatLock.Unlock()
   424  	stats := map[string]map[string]string{
   425  		"client": map[string]string{
   426  			"node_id":         c.Node().ID,
   427  			"known_servers":   c.servers.all().String(),
   428  			"num_allocations": strconv.Itoa(numAllocs),
   429  			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
   430  			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
   431  		},
   432  		"runtime": nomad.RuntimeStats(),
   433  	}
   434  	return stats
   435  }
   436  
   437  // Node returns the locally registered node
   438  func (c *Client) Node() *structs.Node {
   439  	c.configLock.RLock()
   440  	defer c.configLock.RUnlock()
   441  	return c.config.Node
   442  }
   443  
   444  // StatsReporter exposes the various APIs related to resource usage of a Nomad
   445  // client
   446  func (c *Client) StatsReporter() ClientStatsReporter {
   447  	return c
   448  }
   449  
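        // GetAllocStats returns the AllocStatsReporter for the allocation with the
        // given ID. An error is returned if the allocation is not running on this
        // client.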
   450  func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) {
   451  	c.allocLock.RLock()
   452  	defer c.allocLock.RUnlock()
   453  	ar, ok := c.allocs[allocID]
   454  	if !ok {
   455  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   456  	}
   457  	return ar.StatsReporter(), nil
   458  }
   459  
   460  // LatestHostStats returns the latest resource usage stats for the host
   461  func (c *Client) LatestHostStats() *stats.HostStats {
   462  	c.resourceUsageLock.RLock()
   463  	defer c.resourceUsageLock.RUnlock()
   464  	return c.resourceUsage
   465  }
   466  
   467  // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
   468  func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
   469  	c.allocLock.RLock()
   470  	defer c.allocLock.RUnlock()
   471  
   472  	ar, ok := c.allocs[allocID]
   473  	if !ok {
   474  		return nil, fmt.Errorf("alloc not found")
   475  	}
   476  	return ar.GetAllocDir(), nil
   477  }
   478  
   479  // GetServers returns the list of nomad servers this client is aware of.
   480  func (c *Client) GetServers() []string {
   481  	endpoints := c.servers.all()
   482  	res := make([]string, len(endpoints))
   483  	for i := range endpoints {
   484  		res[i] = endpoints[i].addr.String()
   485  	}
   486  	return res
   487  }
   488  
   489  // SetServers sets a new list of nomad servers to connect to. As long as one
   490  // server is resolvable no error is returned.
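        // For example (addresses are illustrative), c.SetServers([]string{"10.0.0.1:4647"})
        // replaces the known server set with whichever entries resolve.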
   491  func (c *Client) SetServers(servers []string) error {
   492  	endpoints := make([]*endpoint, 0, len(servers))
   493  	var merr multierror.Error
   494  	for _, s := range servers {
   495  		addr, err := resolveServer(s)
   496  		if err != nil {
   497  			c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", s, err)
   498  			merr.Errors = append(merr.Errors, err)
   499  			continue
   500  		}
   501  
   502  		// Valid endpoint, append it without a priority as this API
   503  		// doesn't support different priorities for different servers
   504  		endpoints = append(endpoints, &endpoint{name: s, addr: addr})
   505  	}
   506  
   507  	// Only return errors if no servers are valid
   508  	if len(endpoints) == 0 {
   509  		if len(merr.Errors) > 0 {
   510  			return merr.ErrorOrNil()
   511  		}
   512  		return noServersErr
   513  	}
   514  
   515  	c.servers.set(endpoints)
   516  	return nil
   517  }
   518  
   519  // restoreState is used to restore our state from the data dir
   520  func (c *Client) restoreState() error {
   521  	if c.config.DevMode {
   522  		return nil
   523  	}
   524  
   525  	// Scan the directory
   526  	list, err := ioutil.ReadDir(filepath.Join(c.config.StateDir, "alloc"))
   527  	if err != nil && os.IsNotExist(err) {
   528  		return nil
   529  	} else if err != nil {
   530  		return fmt.Errorf("failed to list alloc state: %v", err)
   531  	}
   532  
   533  	// Load each alloc back
   534  	var mErr multierror.Error
   535  	for _, entry := range list {
   536  		id := entry.Name()
   537  		alloc := &structs.Allocation{ID: id}
   538  		c.configLock.RLock()
   539  		ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient)
   540  		c.configLock.RUnlock()
   541  		c.allocLock.Lock()
   542  		c.allocs[id] = ar
   543  		c.allocLock.Unlock()
   544  		if err := ar.RestoreState(); err != nil {
   545  			c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err)
   546  			mErr.Errors = append(mErr.Errors, err)
   547  		} else {
   548  			go ar.Run()
   549  		}
   550  	}
   551  	return mErr.ErrorOrNil()
   552  }
   553  
   554  // saveState is used to snapshot our state into the data dir
   555  func (c *Client) saveState() error {
   556  	if c.config.DevMode {
   557  		return nil
   558  	}
   559  
   560  	var mErr multierror.Error
   561  	for id, ar := range c.getAllocRunners() {
   562  		if err := ar.SaveState(); err != nil {
   563  			c.logger.Printf("[ERR] client: failed to save state for alloc %s: %v",
   564  				id, err)
   565  			mErr.Errors = append(mErr.Errors, err)
   566  		}
   567  	}
   568  	return mErr.ErrorOrNil()
   569  }
   570  
   571  // getAllocRunners returns a snapshot of the current set of alloc runners.
   572  func (c *Client) getAllocRunners() map[string]*AllocRunner {
   573  	c.allocLock.RLock()
   574  	defer c.allocLock.RUnlock()
   575  	runners := make(map[string]*AllocRunner, len(c.allocs))
   576  	for id, ar := range c.allocs {
   577  		runners[id] = ar
   578  	}
   579  	return runners
   580  }
   581  
   582  // nodeID restores the node's persistent unique ID and SecretID or generates new
   583  // ones
   584  func (c *Client) nodeID() (id string, secret string, err error) {
   585  	// Do not persist in dev mode
   586  	if c.config.DevMode {
   587  		return structs.GenerateUUID(), structs.GenerateUUID(), nil
   588  	}
   589  
   590  	// Attempt to read existing ID
   591  	idPath := filepath.Join(c.config.StateDir, "client-id")
   592  	idBuf, err := ioutil.ReadFile(idPath)
   593  	if err != nil && !os.IsNotExist(err) {
   594  		return "", "", err
   595  	}
   596  
   597  	// Attempt to read existing secret ID
   598  	secretPath := filepath.Join(c.config.StateDir, "secret-id")
   599  	secretBuf, err := ioutil.ReadFile(secretPath)
   600  	if err != nil && !os.IsNotExist(err) {
   601  		return "", "", err
   602  	}
   603  
   604  	// Use existing ID if any
   605  	if len(idBuf) != 0 {
   606  		id = string(idBuf)
   607  	} else {
   608  		// Generate new ID
   609  		id = structs.GenerateUUID()
   610  
   611  		// Persist the ID
   612  		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
   613  			return "", "", err
   614  		}
   615  	}
   616  
   617  	if len(secretBuf) != 0 {
   618  		secret = string(secretBuf)
   619  	} else {
   620  		// Generate a new secret ID
   621  		secret = structs.GenerateUUID()
   622  
   623  		// Persist the secret ID
   624  		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
   625  			return "", "", err
   626  		}
   627  	}
   628  
   629  	return id, secret, nil
   630  }
   631  
   632  // setupNode is used to setup the initial node
   633  func (c *Client) setupNode() error {
   634  	node := c.config.Node
   635  	if node == nil {
   636  		node = &structs.Node{}
   637  		c.config.Node = node
   638  	}
   639  	// Generate an ID for the node
   640  	id, secretID, err := c.nodeID()
   641  	if err != nil {
   642  		return fmt.Errorf("node ID setup failed: %v", err)
   643  	}
   644  
   645  	node.ID = id
   646  	node.SecretID = secretID
   647  	if node.Attributes == nil {
   648  		node.Attributes = make(map[string]string)
   649  	}
   650  	if node.Links == nil {
   651  		node.Links = make(map[string]string)
   652  	}
   653  	if node.Meta == nil {
   654  		node.Meta = make(map[string]string)
   655  	}
   656  	if node.Resources == nil {
   657  		node.Resources = &structs.Resources{}
   658  	}
   659  	if node.Reserved == nil {
   660  		node.Reserved = &structs.Resources{}
   661  	}
   662  	if node.Datacenter == "" {
   663  		node.Datacenter = "dc1"
   664  	}
   665  	if node.Name == "" {
   666  		node.Name, _ = os.Hostname()
   667  	}
   668  	if node.Name == "" {
   669  		node.Name = node.ID
   670  	}
   671  	node.Status = structs.NodeStatusInit
   672  	return nil
   673  }
   674  
   675  // reservePorts is used to reserve ports on the fingerprinted network devices.
   676  func (c *Client) reservePorts() {
   677  	c.configLock.RLock()
   678  	defer c.configLock.RUnlock()
   679  	global := c.config.GloballyReservedPorts
   680  	if len(global) == 0 {
   681  		return
   682  	}
   683  
   684  	node := c.config.Node
   685  	networks := node.Resources.Networks
   686  	reservedIndex := make(map[string]*structs.NetworkResource, len(networks))
   687  	for _, resNet := range node.Reserved.Networks {
   688  		reservedIndex[resNet.IP] = resNet
   689  	}
   690  
   691  	// Go through each network device and reserve ports on it.
   692  	for _, net := range networks {
   693  		res, ok := reservedIndex[net.IP]
   694  		if !ok {
   695  			res = net.Copy()
   696  			res.MBits = 0
   697  			reservedIndex[net.IP] = res
   698  		}
   699  
   700  		for _, portVal := range global {
   701  			p := structs.Port{Value: portVal}
   702  			res.ReservedPorts = append(res.ReservedPorts, p)
   703  		}
   704  	}
   705  
   706  	// Clear the reserved networks.
   707  	if node.Reserved == nil {
   708  		node.Reserved = new(structs.Resources)
   709  	} else {
   710  		node.Reserved.Networks = nil
   711  	}
   712  
   713  	// Restore the reserved networks
   714  	for _, net := range reservedIndex {
   715  		node.Reserved.Networks = append(node.Reserved.Networks, net)
   716  	}
   717  }
   718  
   719  // fingerprint is used to fingerprint the client and setup the node
   720  func (c *Client) fingerprint() error {
   721  	whitelist := c.config.ReadStringListToMap("fingerprint.whitelist")
   722  	whitelistEnabled := len(whitelist) > 0
   723  	blacklist := c.config.ReadStringListToMap("fingerprint.blacklist")
   724  
   725  	c.logger.Printf("[DEBUG] client: built-in fingerprints: %v", fingerprint.BuiltinFingerprints())
   726  
   727  	var applied []string
   728  	var skipped []string
   729  	for _, name := range fingerprint.BuiltinFingerprints() {
   730  		// Skip modules that are not in the whitelist if it is enabled.
   731  		if _, ok := whitelist[name]; whitelistEnabled && !ok {
   732  			skipped = append(skipped, name)
   733  			continue
   734  		}
   735  		// Skip modules that are in the blacklist
   736  		if _, ok := blacklist[name]; ok {
   737  			skipped = append(skipped, name)
   738  			continue
   739  		}
   740  		f, err := fingerprint.NewFingerprint(name, c.logger)
   741  		if err != nil {
   742  			return err
   743  		}
   744  
   745  		c.configLock.Lock()
   746  		applies, err := f.Fingerprint(c.config, c.config.Node)
   747  		c.configLock.Unlock()
   748  		if err != nil {
   749  			return err
   750  		}
   751  		if applies {
   752  			applied = append(applied, name)
   753  		}
   754  		p, period := f.Periodic()
   755  		if p {
   756  			// TODO: If more periodic fingerprinters are added, then
   757  			// fingerprintPeriodic should be used to handle all the periodic
   758  			// fingerprinters by using a priority queue.
   759  			go c.fingerprintPeriodic(name, f, period)
   760  		}
   761  	}
   762  	c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied)
   763  	if len(skipped) != 0 {
   764  		c.logger.Printf("[DEBUG] client: fingerprint modules skipped due to white/blacklist: %v", skipped)
   765  	}
   766  	return nil
   767  }
   768  
   769  // fingerprintPeriodic runs a fingerprinter at the specified duration.
   770  func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) {
   771  	c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d)
   772  	for {
   773  		select {
   774  		case <-time.After(d):
   775  			c.configLock.Lock()
   776  			if _, err := f.Fingerprint(c.config, c.config.Node); err != nil {
   777  				c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err)
   778  			}
   779  			c.configLock.Unlock()
   780  		case <-c.shutdownCh:
   781  			return
   782  		}
   783  	}
   784  }
   785  
   786  // setupDrivers is used to find the available drivers
   787  func (c *Client) setupDrivers() error {
   788  	// Build the white/blacklists of drivers.
   789  	whitelist := c.config.ReadStringListToMap("driver.whitelist")
   790  	whitelistEnabled := len(whitelist) > 0
   791  	blacklist := c.config.ReadStringListToMap("driver.blacklist")
   792  
   793  	var avail []string
   794  	var skipped []string
   795  	driverCtx := driver.NewDriverContext("", c.config, c.config.Node, c.logger, nil)
   796  	for name := range driver.BuiltinDrivers {
   797  		// Skip fingerprinting drivers that are not in the whitelist if it is
   798  		// enabled.
   799  		if _, ok := whitelist[name]; whitelistEnabled && !ok {
   800  			skipped = append(skipped, name)
   801  			continue
   802  		}
   803  		// Skip fingerprinting drivers that are in the blacklist
   804  		if _, ok := blacklist[name]; ok {
   805  			skipped = append(skipped, name)
   806  			continue
   807  		}
   808  
   809  		d, err := driver.NewDriver(name, driverCtx)
   810  		if err != nil {
   811  			return err
   812  		}
   813  		c.configLock.Lock()
   814  		applies, err := d.Fingerprint(c.config, c.config.Node)
   815  		c.configLock.Unlock()
   816  		if err != nil {
   817  			return err
   818  		}
   819  		if applies {
   820  			avail = append(avail, name)
   821  		}
   822  
   823  		p, period := d.Periodic()
   824  		if p {
   825  			go c.fingerprintPeriodic(name, d, period)
   826  		}
   827  
   828  	}
   829  
   830  	c.logger.Printf("[DEBUG] client: available drivers %v", avail)
   831  
   832  	if len(skipped) != 0 {
   833  		c.logger.Printf("[DEBUG] client: drivers skipped due to white/blacklist: %v", skipped)
   834  	}
   835  
   836  	return nil
   837  }
   838  
   839  // retryIntv calculates a retry interval value given the base
   840  func (c *Client) retryIntv(base time.Duration) time.Duration {
   841  	if c.config.DevMode {
   842  		return devModeRetryIntv
   843  	}
   844  	return base + lib.RandomStagger(base)
   845  }
   846  
   847  // registerAndHeartbeat is a long lived goroutine used to register the client
   848  // and then start heartbeating to the servers.
   849  func (c *Client) registerAndHeartbeat() {
   850  	// Register the node
   851  	c.retryRegisterNode()
   852  
   853  	// Start watching for node changes
   854  	go c.watchNodeUpdates()
   855  
   856  	// Setup the heartbeat timer, for the initial registration
   857  	// we want to do this quickly. We want to do it extra quickly
   858  	// in development mode.
   859  	var heartbeat <-chan time.Time
   860  	if c.config.DevMode {
   861  		heartbeat = time.After(0)
   862  	} else {
   863  		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
   864  	}
   865  
   866  	for {
   867  		select {
   868  		case <-c.serversDiscoveredCh:
   869  		case <-heartbeat:
   870  		case <-c.shutdownCh:
   871  			return
   872  		}
   873  
   874  		if err := c.updateNodeStatus(); err != nil {
   875  			// The servers have changed such that this node has not been
   876  			// registered before
   877  			if strings.Contains(err.Error(), "node not found") {
   878  				// Re-register the node
   879  				c.logger.Printf("[INFO] client: re-registering node")
   880  				c.retryRegisterNode()
   881  				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
   882  			} else {
   883  				intv := c.retryIntv(registerRetryIntv)
   884  				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
   885  				heartbeat = time.After(intv)
   886  
   887  				// if heartbeating fails, trigger Consul discovery
   888  				c.triggerDiscovery()
   889  			}
   890  		} else {
   891  			c.heartbeatLock.Lock()
   892  			heartbeat = time.After(c.heartbeatTTL)
   893  			c.heartbeatLock.Unlock()
   894  		}
   895  	}
   896  }
   897  
   898  // periodicSnapshot is a long lived goroutine used to periodically snapshot the
   899  // state of the client
   900  func (c *Client) periodicSnapshot() {
   901  	// Create a snapshot timer
   902  	snapshot := time.After(stateSnapshotIntv)
   903  
   904  	for {
   905  		select {
   906  		case <-snapshot:
   907  			snapshot = time.After(stateSnapshotIntv)
   908  			if err := c.saveState(); err != nil {
   909  				c.logger.Printf("[ERR] client: failed to save state: %v", err)
   910  			}
   911  
   912  		case <-c.shutdownCh:
   913  			return
   914  		}
   915  	}
   916  }
   917  
   918  // run is a long lived goroutine used to run the client
   919  func (c *Client) run() {
   920  	// Watch for changes in allocations
   921  	allocUpdates := make(chan *allocUpdates, 8)
   922  	go c.watchAllocations(allocUpdates)
   923  
   924  	for {
   925  		select {
   926  		case update := <-allocUpdates:
   927  			c.runAllocs(update)
   928  
   929  		case <-c.shutdownCh:
   930  			return
   931  		}
   932  	}
   933  }
   934  
   935  // hasNodeChanged calculates hashes of the node attributes and meta maps.
   936  // The new hash values are compared against the old (passed-in) hash values to
   937  // determine if the node properties have changed. It returns the new hash values
   938  // in case they are different from the old hash values.
   939  func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) {
   940  	c.configLock.RLock()
   941  	defer c.configLock.RUnlock()
   942  	newAttrHash, err := hashstructure.Hash(c.config.Node.Attributes, nil)
   943  	if err != nil {
   944  		c.logger.Printf("[DEBUG] client: unable to calculate node attributes hash: %v", err)
   945  	}
   946  	// Calculate node meta map hash
   947  	newMetaHash, err := hashstructure.Hash(c.config.Node.Meta, nil)
   948  	if err != nil {
   949  		c.logger.Printf("[DEBUG] client: unable to calculate node meta hash: %v", err)
   950  	}
   951  	if newAttrHash != oldAttrHash || newMetaHash != oldMetaHash {
   952  		return true, newAttrHash, newMetaHash
   953  	}
   954  	return false, oldAttrHash, oldMetaHash
   955  }
   956  
   957  // retryRegisterNode is used to register the node or update the registration and
   958  // retry in case of failure.
   959  func (c *Client) retryRegisterNode() {
   960  	for {
   961  		err := c.registerNode()
   962  		if err == nil {
   963  			// Registered!
   964  			return
   965  		}
   966  
   967  		if err == noServersErr {
   968  			c.logger.Print("[DEBUG] client: registration waiting on servers")
   969  			c.triggerDiscovery()
   970  		} else {
   971  			c.logger.Printf("[ERR] client: registration failure: %v", err)
   972  		}
   973  		select {
   974  		case <-c.serversDiscoveredCh:
   975  		case <-time.After(c.retryIntv(registerRetryIntv)):
   976  		case <-c.shutdownCh:
   977  			return
   978  		}
   979  	}
   980  }
   981  
   982  // registerNode is used to register the node or update the registration
   983  func (c *Client) registerNode() error {
   984  	node := c.Node()
   985  	req := structs.NodeRegisterRequest{
   986  		Node:         node,
   987  		WriteRequest: structs.WriteRequest{Region: c.Region()},
   988  	}
   989  	var resp structs.NodeUpdateResponse
   990  	if err := c.RPC("Node.Register", &req, &resp); err != nil {
   991  		return err
   992  	}
   993  
   994  	// Update the node status to ready after we register.
   995  	c.configLock.Lock()
   996  	node.Status = structs.NodeStatusReady
   997  	c.configLock.Unlock()
   998  
   999  	c.logger.Printf("[INFO] client: node registration complete")
  1000  	if len(resp.EvalIDs) != 0 {
  1001  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs))
  1002  	}
  1003  
  1004  	c.heartbeatLock.Lock()
  1005  	defer c.heartbeatLock.Unlock()
  1006  	c.lastHeartbeat = time.Now()
  1007  	c.heartbeatTTL = resp.HeartbeatTTL
  1008  	return nil
  1009  }
  1010  
  1011  // updateNodeStatus is used to heartbeat and update the status of the node
  1012  func (c *Client) updateNodeStatus() error {
  1013  	c.heartbeatLock.Lock()
  1014  	defer c.heartbeatLock.Unlock()
  1015  
  1016  	node := c.Node()
  1017  	req := structs.NodeUpdateStatusRequest{
  1018  		NodeID:       node.ID,
  1019  		Status:       structs.NodeStatusReady,
  1020  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1021  	}
  1022  	var resp structs.NodeUpdateResponse
  1023  	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
  1024  		c.triggerDiscovery()
  1025  		return fmt.Errorf("failed to update status: %v", err)
  1026  	}
  1027  	if len(resp.EvalIDs) != 0 {
  1028  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs))
  1029  	}
  1030  	if resp.Index != 0 {
  1031  		c.logger.Printf("[DEBUG] client: state updated to %s", req.Status)
  1032  	}
  1033  
  1034  	// Update heartbeat time and ttl
  1035  	c.lastHeartbeat = time.Now()
  1036  	c.heartbeatTTL = resp.HeartbeatTTL
  1037  
  1038  	// Convert []*NodeServerInfo to endpoints
  1039  	localdc := c.Datacenter()
  1040  	servers := make(endpoints, 0, len(resp.Servers))
  1041  	for _, s := range resp.Servers {
  1042  		addr, err := resolveServer(s.RPCAdvertiseAddr)
  1043  		if err != nil {
  1044  			continue
  1045  		}
  1046  		e := endpoint{name: s.RPCAdvertiseAddr, addr: addr}
  1047  		if s.Datacenter != localdc {
  1048  			// server is non-local; de-prioritize
  1049  			e.priority = 1
  1050  		}
  1051  		servers = append(servers, &e)
  1052  	}
  1053  	if len(servers) == 0 {
  1054  		return fmt.Errorf("server returned no valid servers")
  1055  	}
  1056  	c.servers.set(servers)
  1057  
  1058  	// Begin polling Consul if there is no Nomad leader. We could be
  1059  	// heartbeating to a Nomad server that is in the minority side of a
  1060  	// partition of the Nomad server quorum; this agent may still be able
  1061  	// to reach the existing majority of Nomad servers, but only by
  1062  	// querying Consul to discover them.
  1063  	if resp.LeaderRPCAddr == "" {
  1064  		c.triggerDiscovery()
  1065  	}
  1066  
  1067  	return nil
  1068  }
  1069  
  1070  // updateAllocStatus is used to update the status of an allocation
  1071  func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
  1072  	// Only send the fields that are updatable by the client.
  1073  	stripped := new(structs.Allocation)
  1074  	stripped.ID = alloc.ID
  1075  	stripped.NodeID = c.Node().ID
  1076  	stripped.TaskStates = alloc.TaskStates
  1077  	stripped.ClientStatus = alloc.ClientStatus
  1078  	stripped.ClientDescription = alloc.ClientDescription
  1079  	select {
  1080  	case c.allocUpdates <- stripped:
  1081  	case <-c.shutdownCh:
  1082  	}
  1083  }
  1084  
  1085  // allocSync is a long lived function that batches allocation updates to the
  1086  // server.
  1087  func (c *Client) allocSync() {
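        	// staggered is true while the sync ticker has been switched to the
        	// longer retry interval after a failed update; it is reset to the
        	// normal allocSyncIntv once an update succeeds.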
  1088  	staggered := false
  1089  	syncTicker := time.NewTicker(allocSyncIntv)
  1090  	updates := make(map[string]*structs.Allocation)
  1091  	for {
  1092  		select {
  1093  		case <-c.shutdownCh:
  1094  			syncTicker.Stop()
  1095  			return
  1096  		case alloc := <-c.allocUpdates:
  1097  			// Batch the allocation updates until the timer triggers.
  1098  			updates[alloc.ID] = alloc
  1099  
  1100  			// If this alloc was blocking another alloc and transitioned to a
  1101  			// terminal state then start the blocked allocation
  1102  			c.blockedAllocsLock.Lock()
  1103  			if blockedAlloc, ok := c.blockedAllocations[alloc.ID]; ok && alloc.Terminated() {
  1104  				var prevAllocDir *allocdir.AllocDir
  1105  				if ar, ok := c.getAllocRunners()[alloc.ID]; ok {
  1106  					prevAllocDir = ar.GetAllocDir()
  1107  				}
  1108  				if err := c.addAlloc(blockedAlloc, prevAllocDir); err != nil {
  1109  					c.logger.Printf("[ERR] client: failed to add alloc which was previously blocked %q: %v",
  1110  						blockedAlloc.ID, err)
  1111  				}
  1112  				delete(c.blockedAllocations, blockedAlloc.PreviousAllocation)
  1113  			}
  1114  			c.blockedAllocsLock.Unlock()
  1115  		case <-syncTicker.C:
  1116  			// Fast path if there are no updates
  1117  			if len(updates) == 0 {
  1118  				continue
  1119  			}
  1120  
  1121  			sync := make([]*structs.Allocation, 0, len(updates))
  1122  			for _, alloc := range updates {
  1123  				sync = append(sync, alloc)
  1124  			}
  1125  
  1126  			// Send to server.
  1127  			args := structs.AllocUpdateRequest{
  1128  				Alloc:        sync,
  1129  				WriteRequest: structs.WriteRequest{Region: c.Region()},
  1130  			}
  1131  
  1132  			var resp structs.GenericResponse
  1133  			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
  1134  				c.logger.Printf("[ERR] client: failed to update allocations: %v", err)
  1135  				syncTicker.Stop()
  1136  				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
  1137  				staggered = true
  1138  			} else {
  1139  				updates = make(map[string]*structs.Allocation)
  1140  				if staggered {
  1141  					syncTicker.Stop()
  1142  					syncTicker = time.NewTicker(allocSyncIntv)
  1143  					staggered = false
  1144  				}
  1145  			}
  1146  		}
  1147  	}
  1148  }
  1149  
  1150  // allocUpdates holds the results of receiving updated allocations from the
  1151  // servers.
  1152  type allocUpdates struct {
  1153  	// pulled is the set of allocations that were downloaded from the servers.
  1154  	pulled map[string]*structs.Allocation
  1155  
  1156  	// filtered is the set of allocations that were not pulled because their
  1157  	// AllocModifyIndex didn't change.
  1158  	filtered map[string]struct{}
  1159  }
  1160  
  1161  // watchAllocations is used to scan for updates to allocations
  1162  func (c *Client) watchAllocations(updates chan *allocUpdates) {
  1163  	// The request and response for getting the map of allocations that should
  1164  	// be running on the Node to their AllocModifyIndex which is incremented
  1165  	// when the allocation is updated by the servers.
  1166  	n := c.Node()
  1167  	req := structs.NodeSpecificRequest{
  1168  		NodeID:   n.ID,
  1169  		SecretID: n.SecretID,
  1170  		QueryOptions: structs.QueryOptions{
  1171  			Region:     c.Region(),
  1172  			AllowStale: true,
  1173  		},
  1174  	}
  1175  	var resp structs.NodeClientAllocsResponse
  1176  
  1177  	// The request and response for pulling down the set of allocations that are
  1178  	// new, or updated server side.
  1179  	allocsReq := structs.AllocsGetRequest{
  1180  		QueryOptions: structs.QueryOptions{
  1181  			Region:     c.Region(),
  1182  			AllowStale: true,
  1183  		},
  1184  	}
  1185  	var allocsResp structs.AllocsGetResponse
  1186  
  1187  	for {
  1188  		// Get the allocation modify index map, blocking for updates. We will
  1189  		// use this to determine exactly what allocations need to be downloaded
  1190  		// in full.
  1191  		resp = structs.NodeClientAllocsResponse{}
  1192  		err := c.RPC("Node.GetClientAllocs", &req, &resp)
  1193  		if err != nil {
  1194  			// Shutdown often causes EOF errors, so check for shutdown first
  1195  			select {
  1196  			case <-c.shutdownCh:
  1197  				return
  1198  			default:
  1199  			}
  1200  
  1201  			// COMPAT: Remove in 0.6. This is to allow the case in which the
  1202  			// servers are not fully upgraded before the clients register. This
  1203  			// can cause the SecretID to be lost
  1204  			if strings.Contains(err.Error(), "node secret ID does not match") {
  1205  				c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err)
  1206  				c.retryRegisterNode()
  1207  			} else if err != noServersErr {
  1208  				c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err)
  1209  			}
  1210  			retry := c.retryIntv(getAllocRetryIntv)
  1211  			select {
  1212  			case <-c.serversDiscoveredCh:
  1213  				continue
  1214  			case <-time.After(retry):
  1215  				continue
  1216  			case <-c.shutdownCh:
  1217  				return
  1218  			}
  1219  		}
  1220  
  1221  		// Check for shutdown
  1222  		select {
  1223  		case <-c.shutdownCh:
  1224  			return
  1225  		default:
  1226  		}
  1227  
  1228  		// Filter all allocations whose AllocModifyIndex was not incremented.
  1229  		// These are the allocations that have either not been updated, or whose
  1230  		// updates are a result of the client sending an update for the alloc.
  1231  		// This lets us reduce the network traffic to the server as we don't
  1232  		// need to pull all the allocations.
  1233  		var pull []string
  1234  		filtered := make(map[string]struct{})
  1235  		runners := c.getAllocRunners()
  1236  		for allocID, modifyIndex := range resp.Allocs {
  1237  			// Pull the allocation if we don't have an alloc runner for the
  1238  			// allocation or if the alloc runner requires an updated allocation.
  1239  			runner, ok := runners[allocID]
  1240  			if !ok || runner.shouldUpdate(modifyIndex) {
  1241  				pull = append(pull, allocID)
  1242  			} else {
  1243  				filtered[allocID] = struct{}{}
  1244  			}
  1245  		}
  1246  
  1247  		c.logger.Printf("[DEBUG] client: updated allocations at index %d (pulled %d) (filtered %d)",
  1248  			resp.Index, len(pull), len(filtered))
  1249  
  1250  		// Pull the allocations that passed filtering.
  1251  		allocsResp.Allocs = nil
  1252  		if len(pull) != 0 {
  1253  			// Pull the allocations that need to be updated.
  1254  			allocsReq.AllocIDs = pull
  1255  			allocsResp = structs.AllocsGetResponse{}
  1256  			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
  1257  				c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err)
  1258  				retry := c.retryIntv(getAllocRetryIntv)
  1259  				select {
  1260  				case <-c.serversDiscoveredCh:
  1261  					continue
  1262  				case <-time.After(retry):
  1263  					continue
  1264  				case <-c.shutdownCh:
  1265  					return
  1266  				}
  1267  			}
  1268  
  1269  			// Check for shutdown
  1270  			select {
  1271  			case <-c.shutdownCh:
  1272  				return
  1273  			default:
  1274  			}
  1275  		}
  1276  
  1277  		// Update the query index.
  1278  		if resp.Index > req.MinQueryIndex {
  1279  			req.MinQueryIndex = resp.Index
  1280  		}
  1281  
  1282  		// Push the updates.
  1283  		pulled := make(map[string]*structs.Allocation, len(allocsResp.Allocs))
  1284  		for _, alloc := range allocsResp.Allocs {
  1285  			pulled[alloc.ID] = alloc
  1286  		}
  1287  		update := &allocUpdates{
  1288  			filtered: filtered,
  1289  			pulled:   pulled,
  1290  		}
  1291  		select {
  1292  		case updates <- update:
  1293  		case <-c.shutdownCh:
  1294  			return
  1295  		}
  1296  	}
  1297  }
  1298  
  1299  // watchNodeUpdates periodically checks for changes to the node attributes or meta map
  1300  func (c *Client) watchNodeUpdates() {
  1301  	c.logger.Printf("[DEBUG] client: periodically checking for node changes at duration %v", nodeUpdateRetryIntv)
  1302  
  1303  	// Initialize the hashes
  1304  	_, attrHash, metaHash := c.hasNodeChanged(0, 0)
  1305  	var changed bool
  1306  	for {
  1307  		select {
  1308  		case <-time.After(c.retryIntv(nodeUpdateRetryIntv)):
  1309  			changed, attrHash, metaHash = c.hasNodeChanged(attrHash, metaHash)
  1310  			if changed {
  1311  				c.logger.Printf("[DEBUG] client: state changed, updating node.")
  1312  
  1313  				// Update the config copy.
  1314  				c.configLock.Lock()
  1315  				node := c.config.Node.Copy()
  1316  				c.configCopy.Node = node
  1317  				c.configLock.Unlock()
  1318  
  1319  				c.retryRegisterNode()
  1320  			}
  1321  		case <-c.shutdownCh:
  1322  			return
  1323  		}
  1324  	}
  1325  }
  1326  
  1327  // runAllocs is invoked when we get an updated set of allocations
  1328  func (c *Client) runAllocs(update *allocUpdates) {
  1329  	// Get the existing allocs
  1330  	c.allocLock.RLock()
  1331  	exist := make([]*structs.Allocation, 0, len(c.allocs))
  1332  	for _, ar := range c.allocs {
  1333  		exist = append(exist, ar.alloc)
  1334  	}
  1335  	c.allocLock.RUnlock()
  1336  
  1337  	// Diff the existing and updated allocations
  1338  	diff := diffAllocs(exist, update)
  1339  	c.logger.Printf("[DEBUG] client: %#v", diff)
  1340  
  1341  	// Remove the old allocations
  1342  	for _, remove := range diff.removed {
  1343  		if err := c.removeAlloc(remove); err != nil {
  1344  			c.logger.Printf("[ERR] client: failed to remove alloc '%s': %v",
  1345  				remove.ID, err)
  1346  		}
  1347  	}
  1348  
  1349  	// Update the existing allocations
  1350  	for _, update := range diff.updated {
  1351  		if err := c.updateAlloc(update.exist, update.updated); err != nil {
  1352  			c.logger.Printf("[ERR] client: failed to update alloc '%s': %v",
  1353  				update.exist.ID, err)
  1354  		}
  1355  
  1356  		// See if the updated alloc is getting migrated
  1357  		c.migratingAllocsLock.Lock()
  1358  		ch, ok := c.migratingAllocs[update.updated.ID]
  1359  		c.migratingAllocsLock.Unlock()
  1360  		if ok {
  1361  			// Stop the migration if the updated allocation no longer
  1362  			// needs any migration
  1363  			if !update.updated.ShouldMigrate() {
  1364  				close(ch)
  1365  			}
  1366  		}
  1367  	}
  1368  
  1369  	// Start the new allocations
  1370  	for _, add := range diff.added {
  1371  		// If the allocation is chained and the previous allocation hasn't
  1372  		// terminated yet, then add the alloc to the blocked queue.
  1373  		ar, ok := c.getAllocRunners()[add.PreviousAllocation]
  1374  		if ok && !ar.Alloc().Terminated() {
  1375  			c.logger.Printf("[DEBUG] client: added alloc %q to blocked queue", add.ID)
  1376  			c.blockedAllocsLock.Lock()
  1377  			c.blockedAllocations[add.PreviousAllocation] = add
  1378  			c.blockedAllocsLock.Unlock()
  1379  			continue
  1380  		}
  1381  
  1382  		// This means the allocation has a previous allocation on another node,
  1383  		// so we will block until the previous allocation completes
  1384  		if add.PreviousAllocation != "" && !ok {
  1385  			c.migratingAllocsLock.Lock()
  1386  			c.migratingAllocs[add.ID] = make(chan struct{})
  1387  			c.migratingAllocsLock.Unlock()
  1388  			go c.blockForRemoteAlloc(add)
  1389  			continue
  1390  		}
  1391  
  1392  		// Set the previous allocdir if the allocation had a terminal
  1393  		// previous allocation
  1394  		var prevAllocDir *allocdir.AllocDir
  1395  		tg := add.Job.LookupTaskGroup(add.TaskGroup)
  1396  		if tg != nil && tg.EphemeralDisk.Sticky && ar != nil {
  1397  			prevAllocDir = ar.GetAllocDir()
  1398  		}
  1399  
  1400  		if err := c.addAlloc(add, prevAllocDir); err != nil {
  1401  			c.logger.Printf("[ERR] client: failed to add alloc '%s': %v",
  1402  				add.ID, err)
  1403  		}
  1404  	}
  1405  
  1406  	// Persist our state
  1407  	if err := c.saveState(); err != nil {
  1408  		c.logger.Printf("[ERR] client: failed to save state: %v", err)
  1409  	}
  1410  }
  1411  
  1412  // blockForRemoteAlloc blocks until the previous allocation of an allocation has
  1413  // been terminated and migrates the snapshot data
  1414  func (c *Client) blockForRemoteAlloc(alloc *structs.Allocation) {
  1415  	// Removing the allocation from the set of allocs which are currently
  1416  	// undergoing migration
  1417  	defer func() {
  1418  		c.migratingAllocsLock.Lock()
  1419  		delete(c.migratingAllocs, alloc.ID)
  1420  		c.migratingAllocsLock.Unlock()
  1421  	}()
  1422  
  1423  	// prevAllocDir is the allocation directory of the previous allocation
  1424  	var prevAllocDir *allocdir.AllocDir
  1425  
  1426  	// If the allocation is not sticky then we won't wait for the previous
  1427  	// allocation to be terminal
  1428  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1429  	if tg == nil {
  1430  		c.logger.Printf("[ERR] client: task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
  1431  		goto ADDALLOC
  1432  	}
  1433  
  1434  	// Wait for the remote previous alloc to be terminal if the alloc is sticky
  1435  	if tg.EphemeralDisk.Sticky {
  1436  		c.logger.Printf("[DEBUG] client: blocking alloc %q for previous allocation %q", alloc.ID, alloc.PreviousAllocation)
  1437  		// Block until the previous allocation reaches a terminal state
  1438  		prevAlloc, err := c.waitForAllocTerminal(alloc.PreviousAllocation)
  1439  		if err != nil {
  1440  			c.logger.Printf("[ERR] client: error waiting for allocation %q: %v",
  1441  				alloc.PreviousAllocation, err)
  1442  		}
  1443  
  1444  		// Migrate the data from the remote node
  1445  		prevAllocDir, err = c.migrateRemoteAllocDir(prevAlloc, alloc.ID)
  1446  		if err != nil {
  1447  			c.logger.Printf("[ERR] client: error migrating data from remote alloc %q: %v",
  1448  				alloc.PreviousAllocation, err)
  1449  		}
  1450  	}
  1451  
  1452  ADDALLOC:
  1453  	// Add the allocation
  1454  	if err := c.addAlloc(alloc, prevAllocDir); err != nil {
  1455  		c.logger.Printf("[ERR] client: error adding alloc: %v", err)
  1456  	}
  1457  }
  1458  
  1459  // waitForAllocTerminal waits for an allocation with the given alloc id to
  1460  // transition to terminal state and blocks the caller until then.
  1461  func (c *Client) waitForAllocTerminal(allocID string) (*structs.Allocation, error) {
  1462  	req := structs.AllocSpecificRequest{
  1463  		AllocID: allocID,
  1464  		QueryOptions: structs.QueryOptions{
  1465  			Region:     c.Region(),
  1466  			AllowStale: true,
  1467  		},
  1468  	}
  1469  
  1470  	for {
  1471  		resp := structs.SingleAllocResponse{}
  1472  		err := c.RPC("Alloc.GetAlloc", &req, &resp)
  1473  		if err != nil {
  1474  			c.logger.Printf("[ERR] client: failed to query allocation %q: %v", allocID, err)
  1475  			retry := c.retryIntv(getAllocRetryIntv)
  1476  			select {
  1477  			case <-time.After(retry):
  1478  				continue
  1479  			case <-c.shutdownCh:
  1480  				return nil, fmt.Errorf("aborting because client is shutting down")
  1481  			}
  1482  		}
  1483  		if resp.Alloc == nil {
  1484  			return nil, nil
  1485  		}
  1486  		if resp.Alloc.Terminated() {
  1487  			return resp.Alloc, nil
  1488  		}
  1489  
  1490  		// Update the query index.
  1491  		if resp.Index > req.MinQueryIndex {
  1492  			req.MinQueryIndex = resp.Index
  1493  		}
  1494  
  1495  	}
  1496  }
  1497  
  1498  // migrateRemoteAllocDir migrates the allocation directory from a remote node to
  1499  // the current node
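        // alloc is the previous allocation (typically terminal) fetched from the
        // servers, and allocID is the ID of the new local allocation, used to look up
        // its cancellation channel in migratingAllocs.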
  1500  func (c *Client) migrateRemoteAllocDir(alloc *structs.Allocation, allocID string) (*allocdir.AllocDir, error) {
  1501  	if alloc == nil {
  1502  		return nil, nil
  1503  	}
  1504  
  1505  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1506  	if tg == nil {
  1507  		return nil, fmt.Errorf("task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
  1508  	}
  1509  
  1510  	// Skip migration of data if the ephemeral disk is not sticky or
  1511  	// migration is turned off.
  1512  	if !tg.EphemeralDisk.Sticky || !tg.EphemeralDisk.Migrate {
  1513  		return nil, nil
  1514  	}
  1515  
  1516  	node, err := c.getNode(alloc.NodeID)
  1517  
  1518  	// If we can't look up the node then we can't migrate the data
  1519  	if err != nil {
  1520  		return nil, fmt.Errorf("error retrieving node %v: %v", alloc.NodeID, err)
  1521  	}
  1522  
  1523  	// Check if node is nil
  1524  	if node == nil {
  1525  		return nil, fmt.Errorf("node %q doesn't exist", alloc.NodeID)
  1526  	}
  1527  
  1528  	// skip migration if the remote node is down
  1529  	if node.Status == structs.NodeStatusDown {
  1530  		c.logger.Printf("[INFO] client: not migrating data from alloc %q since node %q is down", alloc.ID, alloc.NodeID)
  1531  		return nil, nil
  1532  	}
  1533  
  1534  	// Create the previous alloc dir
  1535  	pathToAllocDir := filepath.Join(c.config.AllocDir, alloc.ID)
  1536  	if err := os.MkdirAll(pathToAllocDir, 0777); err != nil {
  1537  		c.logger.Printf("[ERR] client: error creating previous allocation dir: %v", err)
  1538  	}
  1539  
  1540  	// Get the snapshot
  1541  	scheme := "http"
  1542  	if node.TLSEnabled {
  1543  		scheme = "https"
  1544  	}
  1545  	// Create an API client
  1546  	apiConfig := nomadapi.DefaultConfig()
  1547  	apiConfig.Address = fmt.Sprintf("%s://%s", scheme, node.HTTPAddr)
  1548  	apiConfig.TLSConfig = &nomadapi.TLSConfig{
  1549  		CACert:     c.config.TLSConfig.CAFile,
  1550  		ClientCert: c.config.TLSConfig.CertFile,
  1551  		ClientKey:  c.config.TLSConfig.KeyFile,
  1552  	}
  1553  	apiClient, err := nomadapi.NewClient(apiConfig)
  1554  	if err != nil {
  1555  		return nil, err
  1556  	}
  1557  
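        	// Stream a tar snapshot of the previous allocation's directory from
        	// the remote node's HTTP API.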
  1558  	url := fmt.Sprintf("/v1/client/allocation/%v/snapshot", alloc.ID)
  1559  	resp, err := apiClient.Raw().Response(url, nil)
  1560  	if err != nil {
  1561  		os.RemoveAll(pathToAllocDir)
  1562  		c.logger.Printf("[ERR] client: error getting snapshot: %v", err)
  1563  		return nil, fmt.Errorf("error getting snapshot for alloc %v: %v", alloc.ID, err)
  1564  	}
  1565  
  1566  	tr := tar.NewReader(resp)
  1567  	defer resp.Close()
  1568  
  1569  	buf := make([]byte, 1024)
  1570  
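        	// Look up the channel used to signal that this migration should be
        	// abandoned.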
  1571  	stopMigrating, ok := c.migratingAllocs[allocID]
  1572  	if !ok {
  1573  		os.RemoveAll(pathToAllocDir)
  1574  		return nil, fmt.Errorf("couldn't find a migration validity notifier for alloc: %v", alloc.ID)
  1575  	}
  1576  	for {
  1577  		// See if the alloc still needs migration
  1578  		select {
  1579  		case <-stopMigrating:
  1580  			os.RemoveAll(pathToAllocDir)
  1581  			c.logger.Printf("[INFO] client: stopping migration of allocdir for alloc: %v", alloc.ID)
  1582  			return nil, nil
  1583  		case <-c.shutdownCh:
  1584  			os.RemoveAll(pathToAllocDir)
  1585  			c.logger.Printf("[INFO] client: stopping migration of alloc %q since client is shutting down", alloc.ID)
  1586  			return nil, nil
  1587  		default:
  1588  		}
  1589  
  1590  		// Get the next header
  1591  		hdr, err := tr.Next()
  1592  
  1593  		// If the snapshot has ended then we create the previous
  1594  		// allocdir
  1595  		if err == io.EOF {
  1596  			prevAllocDir := allocdir.NewAllocDir(pathToAllocDir)
  1597  			return prevAllocDir, nil
  1598  		}
  1599  		// If there is an error then we avoid creating the alloc dir
  1600  		if err != nil {
  1601  			os.RemoveAll(pathToAllocDir)
  1602  			return nil, fmt.Errorf("error creating alloc dir for alloc %q: %v", alloc.ID, err)
  1603  		}
  1604  
  1605  		// If the header is for a directory we create the directory
  1606  		if hdr.Typeflag == tar.TypeDir {
  1607  			os.MkdirAll(filepath.Join(pathToAllocDir, hdr.Name), 0777)
  1608  			continue
  1609  		}
  1610  		// If the header is a file, we write to a file
  1611  		if hdr.Typeflag == tar.TypeReg {
  1612  			f, err := os.Create(filepath.Join(pathToAllocDir, hdr.Name))
  1613  			if err != nil {
  1614  				c.logger.Printf("[ERR] client: error creating file: %v", err)
  1615  				continue
  1616  			}
  1617  
  1618  			// We copy the file in buf-sized chunks so that we can
  1619  			// periodically check whether the client is shutting down
  1620  			for {
  1621  				if c.shutdown {
  1622  					f.Close()
  1623  					os.RemoveAll(pathToAllocDir)
  1624  					c.logger.Printf("[INFO] client: stopping migration of alloc %q because client is shutting down", alloc.ID)
  1625  					return nil, nil
  1626  				}
  1627  
        				n, err := tr.Read(buf)
        				if err != nil && err != io.EOF {
        					f.Close()
        					os.RemoveAll(pathToAllocDir)
        					return nil, fmt.Errorf("error reading snapshot: %v", err)
        				}
        				// Write any bytes that were read before handling io.EOF,
        				// since a reader may return data together with io.EOF.
        				if _, werr := f.Write(buf[:n]); werr != nil {
        					f.Close()
        					os.RemoveAll(pathToAllocDir)
        					return nil, fmt.Errorf("error writing to file %q: %v", f.Name(), werr)
        				}
        				if err == io.EOF {
        					f.Close()
        					break
        				}
  1641  			}
  1642  
  1643  		}
  1644  	}
  1645  }
  1646  
  1647  // getNode queries the servers for the node with the given node ID
  1648  func (c *Client) getNode(nodeID string) (*structs.Node, error) {
  1649  	req := structs.NodeSpecificRequest{
  1650  		NodeID: nodeID,
  1651  		QueryOptions: structs.QueryOptions{
  1652  			Region:     c.Region(),
  1653  			AllowStale: true,
  1654  		},
  1655  	}
  1656  
  1657  	resp := structs.SingleNodeResponse{}
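        	// Retry the lookup until the RPC succeeds or the client shuts down.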
  1658  	for {
  1659  		err := c.RPC("Node.GetNode", &req, &resp)
  1660  		if err != nil {
  1661  			c.logger.Printf("[ERR] client: failed to query node info %q: %v", nodeID, err)
  1662  			retry := c.retryIntv(getAllocRetryIntv)
  1663  			select {
  1664  			case <-time.After(retry):
  1665  				continue
  1666  			case <-c.shutdownCh:
  1667  				return nil, fmt.Errorf("aborting because client is shutting down")
  1668  			}
  1669  		}
  1670  		break
  1671  	}
  1672  
  1673  	return resp.Node, nil
  1674  }
  1675  
  1676  // removeAlloc is invoked when we should remove an allocation
  1677  func (c *Client) removeAlloc(alloc *structs.Allocation) error {
  1678  	c.allocLock.Lock()
  1679  	ar, ok := c.allocs[alloc.ID]
  1680  	if !ok {
  1681  		c.allocLock.Unlock()
  1682  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID)
  1683  		return nil
  1684  	}
  1685  	delete(c.allocs, alloc.ID)
  1686  	c.allocLock.Unlock()
  1687  
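        	// Destroy the alloc runner now that the allocation is no longer tracked.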
  1688  	ar.Destroy()
  1689  	return nil
  1690  }
  1691  
  1692  // updateAlloc is invoked when we should update an allocation
  1693  func (c *Client) updateAlloc(exist, update *structs.Allocation) error {
  1694  	c.allocLock.RLock()
  1695  	ar, ok := c.allocs[exist.ID]
  1696  	c.allocLock.RUnlock()
  1697  	if !ok {
  1698  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID)
  1699  		return nil
  1700  	}
  1701  
  1702  	ar.Update(update)
  1703  	return nil
  1704  }
  1705  
  1706  // addAlloc is invoked when we should add an allocation
  1707  func (c *Client) addAlloc(alloc *structs.Allocation, prevAllocDir *allocdir.AllocDir) error {
  1708  	c.configLock.RLock()
  1709  	ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient)
  1710  	ar.SetPreviousAllocDir(prevAllocDir)
  1711  	c.configLock.RUnlock()
  1712  	go ar.Run()
  1713  
  1714  	// Store the alloc runner.
  1715  	c.allocLock.Lock()
  1716  	c.allocs[alloc.ID] = ar
  1717  	c.allocLock.Unlock()
  1718  	return nil
  1719  }
  1720  
  1721  // setupVaultClient creates an object to periodically renew tokens and secrets
  1722  // with vault.
  1723  func (c *Client) setupVaultClient() error {
  1724  	var err error
  1725  	c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
  1726  	if err != nil {
  1727  		return err
  1728  	}
  1729  
  1730  	if c.vaultClient == nil {
  1731  		c.logger.Printf("[ERR] client: failed to create vault client")
  1732  		return fmt.Errorf("failed to create vault client")
  1733  	}
  1734  
  1735  	// Start renewing tokens and secrets
  1736  	c.vaultClient.Start()
  1737  
  1738  	return nil
  1739  }
  1740  
  1741  // deriveToken takes in an allocation and a set of tasks and derives vault
  1742  // tokens for each of the tasks, unwraps all of them using the supplied vault
  1743  // client and returns a map of unwrapped tokens, indexed by the task name.
  1744  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  1745  	if alloc == nil {
  1746  		return nil, fmt.Errorf("nil allocation")
  1747  	}
  1748  
  1749  	if len(taskNames) == 0 {
  1750  		return nil, fmt.Errorf("missing task names")
  1751  	}
  1752  
  1753  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1754  	if group == nil {
  1755  		return nil, fmt.Errorf("group name in allocation is not present in job")
  1756  	}
  1757  
  1758  	verifiedTasks := []string{}
  1759  	found := false
  1760  	// Check if the given task names actually exist in the allocation
  1761  	for _, taskName := range taskNames {
  1762  		found = false
  1763  		for _, task := range group.Tasks {
  1764  			if task.Name == taskName {
  1765  				found = true
  1766  			}
  1767  		}
  1768  		if !found {
  1769  			c.logger.Printf("[ERR] task %q not found in the allocation", taskName)
  1770  			return nil, fmt.Errorf("task %q not found in the allocaition", taskName)
  1771  		}
  1772  		verifiedTasks = append(verifiedTasks, taskName)
  1773  	}
  1774  
  1775  	// The Node.DeriveVaultToken RPC on the Nomad server accepts a set of
  1776  	// tasks and creates tokens for all of them in a single call.
  1777  	req := &structs.DeriveVaultTokenRequest{
  1778  		NodeID:   c.Node().ID,
  1779  		SecretID: c.Node().SecretID,
  1780  		AllocID:  alloc.ID,
  1781  		Tasks:    verifiedTasks,
  1782  		QueryOptions: structs.QueryOptions{
  1783  			Region:     c.Region(),
  1784  			AllowStale: true,
  1785  		},
  1786  	}
  1787  
  1788  	// Derive the tokens
  1789  	var resp structs.DeriveVaultTokenResponse
  1790  	if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil {
  1791  		c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err)
  1792  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  1793  	}
  1794  	if resp.Error != nil {
  1795  		c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error)
  1796  		return nil, resp.Error
  1797  	}
  1798  	if resp.Tasks == nil {
  1799  		c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response")
  1800  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  1801  	}
  1802  
  1803  	unwrappedTokens := make(map[string]string)
  1804  
  1805  	// Retrieve the wrapped tokens from the response and unwrap it
  1806  	for _, taskName := range verifiedTasks {
  1807  		// Get the wrapped token
  1808  		wrappedToken, ok := resp.Tasks[taskName]
  1809  		if !ok {
  1810  			c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName)
  1811  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  1812  		}
  1813  
  1814  		// Unwrap the vault token
  1815  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  1816  		if err != nil {
  1817  			return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err)
  1818  		}
  1819  		if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" {
  1820  			return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName)
  1821  		}
  1822  
  1823  		// Append the unwrapped token to the return value
  1824  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  1825  	}
  1826  
  1827  	return unwrappedTokens, nil
  1828  }
  1829  
  1830  // triggerDiscovery causes a Consul discovery to begin (if one isn't already in progress)
  1831  func (c *Client) triggerDiscovery() {
  1832  	select {
  1833  	case c.triggerDiscoveryCh <- struct{}{}:
  1834  		// Discovery goroutine was released to execute
  1835  	default:
  1836  		// Discovery goroutine was already running
  1837  	}
  1838  }
  1839  
  1840  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  1841  // It's intended to be started in a goroutine. See triggerDiscovery() for
  1842  // causing consul discovery from other code locations.
  1843  func (c *Client) consulDiscovery() {
  1844  	for {
  1845  		select {
  1846  		case <-c.triggerDiscoveryCh:
  1847  			if err := c.consulDiscoveryImpl(); err != nil {
  1848  				c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err)
  1849  			}
  1850  		case <-c.shutdownCh:
  1851  			return
  1852  		}
  1853  	}
  1854  }
  1855  
  1856  func (c *Client) consulDiscoveryImpl() error {
  1857  	// Acquire heartbeat lock to prevent heartbeat from running
  1858  	// concurrently with discovery. Concurrent execution is safe, however
  1859  	// discovery is usually triggered when heartbeating has failed so
  1860  	// there's no point in allowing it.
  1861  	c.heartbeatLock.Lock()
  1862  	defer c.heartbeatLock.Unlock()
  1863  
  1864  	consulCatalog := c.consulSyncer.ConsulClient().Catalog()
  1865  	dcs, err := consulCatalog.Datacenters()
  1866  	if err != nil {
  1867  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  1868  	}
  1869  	if len(dcs) > 2 {
  1870  		// Query the local DC first, then shuffle the
  1871  		// remaining DCs.  Future heartbeats will cause Nomad
  1872  		// Clients to fixate on their local datacenter so
  1873  		// it's okay to talk with remote DCs.  If no Nomad
  1874  		// servers are available within datacenterQueryLimit
  1875  		// datacenters, the next heartbeat will pick a new
  1876  		// set of servers.
  1877  		shuffleStrings(dcs[1:])
  1878  		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
  1879  	}
  1880  
  1881  	// Query for servers in this client's region only
  1882  	region := c.Region()
  1883  	rpcargs := structs.GenericRequest{
  1884  		QueryOptions: structs.QueryOptions{
  1885  			Region: region,
  1886  		},
  1887  	}
  1888  
  1889  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  1890  	var mErr multierror.Error
  1891  	var servers endpoints
  1892  	c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs)
  1893  DISCOLOOP:
  1894  	for _, dc := range dcs {
  1895  		consulOpts := &consulapi.QueryOptions{
  1896  			AllowStale: true,
  1897  			Datacenter: dc,
  1898  			Near:       "_agent",
  1899  			WaitTime:   consul.DefaultQueryWaitDuration,
  1900  		}
  1901  		consulServices, _, err := consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  1902  		if err != nil {
  1903  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  1904  			continue
  1905  		}
  1906  
  1907  		for _, s := range consulServices {
  1908  			port := strconv.Itoa(s.ServicePort)
  1909  			addrstr := s.ServiceAddress
  1910  			if addrstr == "" {
  1911  				addrstr = s.Address
  1912  			}
  1913  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  1914  			if err != nil {
  1915  				mErr.Errors = append(mErr.Errors, err)
  1916  				continue
  1917  			}
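        			// Ask the candidate server for the server peers of our region; a
        			// successful response confirms it is a reachable Nomad server.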
  1918  			var peers []string
  1919  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  1920  				mErr.Errors = append(mErr.Errors, err)
  1921  				continue
  1922  			}
  1923  
  1924  			// Successfully received the Server peers list of the correct
  1925  			// region
  1926  			for _, p := range peers {
  1927  				addr, err := net.ResolveTCPAddr("tcp", p)
  1928  				if err != nil {
        					mErr.Errors = append(mErr.Errors, err)
        					continue
        				}
        				servers = append(servers, &endpoint{name: p, addr: addr})
  1932  			}
  1933  			if len(servers) > 0 {
  1934  				break DISCOLOOP
  1935  			}
  1936  		}
  1937  	}
  1938  	if len(servers) == 0 {
  1939  		if len(mErr.Errors) > 0 {
  1940  			return mErr.ErrorOrNil()
  1941  		}
  1942  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  1943  	}
  1944  
  1945  	c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", servers)
  1946  	c.servers.set(servers)
  1947  
  1948  	// Notify waiting rpc calls. If a goroutine just failed an RPC call and
  1949  	// isn't receiving on this chan yet they'll still retry eventually.
  1950  	// This is a shortcircuit for the longer retry intervals.
  1951  	for {
  1952  		select {
  1953  		case c.serversDiscoveredCh <- struct{}{}:
  1954  		default:
  1955  			return nil
  1956  		}
  1957  	}
  1958  }
  1959  
  1960  // consulReaper periodically reaps unmatched domains from Consul. Intended to
  1961  // be called in its own goroutine. See consulReaperIntv for interval.
  1962  func (c *Client) consulReaper() {
  1963  	ticker := time.NewTicker(consulReaperIntv)
  1964  	defer ticker.Stop()
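        	// lastok tracks whether the previous reap succeeded so that repeated
        	// failures are only logged once.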
  1965  	lastok := true
  1966  	for {
  1967  		select {
  1968  		case <-ticker.C:
  1969  			if err := c.consulReaperImpl(); err != nil {
  1970  				if lastok {
  1971  					c.logger.Printf("[ERR] client.consul: error reaping services in consul: %v", err)
  1972  					lastok = false
  1973  				}
  1974  			} else {
  1975  				lastok = true
  1976  			}
  1977  		case <-c.shutdownCh:
  1978  			return
  1979  		}
  1980  	}
  1981  }
  1982  
  1983  // consulReaperImpl reaps unmatched domains from Consul.
  1984  func (c *Client) consulReaperImpl() error {
  1985  	const estInitialExecutorDomains = 8
  1986  
  1987  	// Create the domains to keep and add the server and client
  1988  	domains := make([]consul.ServiceDomain, 2, estInitialExecutorDomains)
  1989  	domains[0] = consul.ServerDomain
  1990  	domains[1] = consul.ClientDomain
  1991  
  1992  	for allocID, ar := range c.getAllocRunners() {
  1993  		ar.taskStatusLock.RLock()
  1994  		taskStates := copyTaskStates(ar.taskStates)
  1995  		ar.taskStatusLock.RUnlock()
  1996  		for taskName, taskState := range taskStates {
  1997  			// Only keep running tasks
  1998  			if taskState.State == structs.TaskStateRunning {
  1999  				d := consul.NewExecutorDomain(allocID, taskName)
  2000  				domains = append(domains, d)
  2001  			}
  2002  		}
  2003  	}
  2004  
  2005  	return c.consulSyncer.ReapUnmatched(domains)
  2006  }
  2007  
  2008  // collectHostStats collects host resource usage stats periodically
  2009  func (c *Client) collectHostStats() {
  2010  	// Start collecting host stats right away and then keep collecting every
  2011  	// collection interval
  2012  	next := time.NewTimer(0)
  2013  	defer next.Stop()
  2014  	for {
  2015  		select {
  2016  		case <-next.C:
  2017  			ru, err := c.hostStatsCollector.Collect()
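        			// Re-arm the timer before checking the error so collection keeps
        			// its cadence even if this attempt failed.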
  2018  			next.Reset(c.config.StatsCollectionInterval)
  2019  			if err != nil {
  2020  				c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err)
  2021  				continue
  2022  			}
  2023  
  2024  			c.resourceUsageLock.Lock()
  2025  			c.resourceUsage = ru
  2026  			c.resourceUsageLock.Unlock()
  2027  
  2028  			// Publish Node metrics if operator has opted in
  2029  			if c.config.PublishNodeMetrics {
  2030  				c.emitStats(ru)
  2031  			}
  2032  		case <-c.shutdownCh:
  2033  			return
  2034  		}
  2035  	}
  2036  }
  2037  
  2038  // emitStats pushes host resource usage stats to remote metrics collection sinks
  2039  func (c *Client) emitStats(hStats *stats.HostStats) {
  2040  	nodeID := c.Node().ID
  2041  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  2042  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  2043  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  2044  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  2045  
  2046  	metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime))
  2047  
  2048  	for _, cpu := range hStats.CPU {
  2049  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  2050  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  2051  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2052  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2053  	}
  2054  
  2055  	for _, disk := range hStats.DiskStats {
  2056  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2057  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2058  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2059  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2060  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2061  	}
  2062  }
  2063  
  2064  // resolveServer takes a server's address as a string and returns its resolved
  2065  // net.Addr or an error.
  2066  func resolveServer(s string) (net.Addr, error) {
  2067  	const defaultClientPort = "4647" // default client RPC port
  2068  	host, port, err := net.SplitHostPort(s)
  2069  	if err != nil {
  2070  		if strings.Contains(err.Error(), "missing port") {
  2071  			host = s
  2072  			port = defaultClientPort
  2073  		} else {
  2074  			return nil, err
  2075  		}
  2076  	}
  2077  	return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
  2078  }