github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/client.go

     1  package client
     2  
     3  import (
     4  	"archive/tar"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"io/ioutil"
     9  	"log"
    10  	"net"
    11  	"os"
    12  	"path/filepath"
    13  	"strconv"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/armon/go-metrics"
    19  	"github.com/boltdb/bolt"
    20  	consulapi "github.com/hashicorp/consul/api"
    21  	"github.com/hashicorp/consul/lib"
    22  	"github.com/hashicorp/go-multierror"
    23  	nomadapi "github.com/hashicorp/nomad/api"
    24  	"github.com/hashicorp/nomad/client/allocdir"
    25  	"github.com/hashicorp/nomad/client/config"
    26  	"github.com/hashicorp/nomad/client/driver"
    27  	"github.com/hashicorp/nomad/client/fingerprint"
    28  	"github.com/hashicorp/nomad/client/stats"
    29  	"github.com/hashicorp/nomad/client/vaultclient"
    30  	"github.com/hashicorp/nomad/command/agent/consul"
    31  	"github.com/hashicorp/nomad/helper"
    32  	"github.com/hashicorp/nomad/helper/tlsutil"
    33  	"github.com/hashicorp/nomad/nomad"
    34  	"github.com/hashicorp/nomad/nomad/structs"
    35  	vaultapi "github.com/hashicorp/vault/api"
    36  	"github.com/mitchellh/hashstructure"
    37  	"github.com/shirou/gopsutil/host"
    38  )
    39  
    40  const (
    41  	// clientRPCCache controls how long we keep an idle connection
    42  	// open to a server
    43  	clientRPCCache = 5 * time.Minute
    44  
     45  	// clientMaxStreams controls how many idle streams we keep
    46  	// open to a server
    47  	clientMaxStreams = 2
    48  
    49  	// datacenterQueryLimit searches through up to this many adjacent
    50  	// datacenters looking for the Nomad server service.
    51  	datacenterQueryLimit = 9
    52  
     53  	// registerRetryIntv is the minimum interval at which we retry
    54  	// registration. We pick a value between this and 2x this.
    55  	registerRetryIntv = 15 * time.Second
    56  
     57  	// getAllocRetryIntv is the minimum interval at which we retry
     58  	// fetching allocations. We pick a value between this and 2x this.
    59  	getAllocRetryIntv = 30 * time.Second
    60  
    61  	// devModeRetryIntv is the retry interval used for development
    62  	devModeRetryIntv = time.Second
    63  
    64  	// stateSnapshotIntv is how often the client snapshots state
    65  	stateSnapshotIntv = 60 * time.Second
    66  
    67  	// initialHeartbeatStagger is used to stagger the interval between
     68  	// starting and the initial heartbeat. After the initial heartbeat,
    69  	// we switch to using the TTL specified by the servers.
    70  	initialHeartbeatStagger = 10 * time.Second
    71  
    72  	// nodeUpdateRetryIntv is how often the client checks for updates to the
    73  	// node attributes or meta map.
    74  	nodeUpdateRetryIntv = 5 * time.Second
    75  
    76  	// allocSyncIntv is the batching period of allocation updates before they
    77  	// are synced with the server.
    78  	allocSyncIntv = 200 * time.Millisecond
    79  
    80  	// allocSyncRetryIntv is the interval on which we retry updating
    81  	// the status of the allocation
    82  	allocSyncRetryIntv = 5 * time.Second
    83  )
    84  
    85  // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
    86  // Client
    87  type ClientStatsReporter interface {
    88  	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
    89  	// If it does not exist an error is reported.
    90  	GetAllocStats(allocID string) (AllocStatsReporter, error)
    91  
    92  	// LatestHostStats returns the latest resource usage stats for the host
    93  	LatestHostStats() *stats.HostStats
    94  }
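// Illustrative sketch, not part of the original file: a consumer of
// ClientStatsReporter might poll host- and per-allocation stats like this
// (the reporter and allocID variables below are assumed):
//
//	host := reporter.LatestHostStats()
//	log.Printf("host uptime: %ds", host.Uptime)
//
//	allocStats, err := reporter.GetAllocStats(allocID)
//	if err != nil {
//		log.Printf("no stats for alloc %q: %v", allocID, err)
//		return
//	}
//	_ = allocStats // query per-task resource usage from the reporter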
    95  
    96  // Client is used to implement the client interaction with Nomad. Clients
    97  // are expected to register as a schedulable node to the servers, and to
    98  // run allocations as determined by the servers.
    99  type Client struct {
   100  	config *config.Config
   101  	start  time.Time
   102  
   103  	// stateDB is used to efficiently store client state.
   104  	stateDB *bolt.DB
   105  
   106  	// configCopy is a copy that should be passed to alloc-runners.
   107  	configCopy *config.Config
   108  	configLock sync.RWMutex
   109  
   110  	logger *log.Logger
   111  
   112  	connPool *nomad.ConnPool
   113  
   114  	// servers is the (optionally prioritized) list of nomad servers
   115  	servers *serverlist
   116  
   117  	// heartbeat related times for tracking how often to heartbeat
   118  	lastHeartbeat time.Time
   119  	heartbeatTTL  time.Duration
   120  	heartbeatLock sync.Mutex
   121  
   122  	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
   123  	triggerDiscoveryCh chan struct{}
   124  
    125  	// serversDiscoveredCh is ticked whenever Consul discovery completes
    126  	// successfully
   127  	serversDiscoveredCh chan struct{}
   128  
   129  	// allocs is the current set of allocations
   130  	allocs    map[string]*AllocRunner
   131  	allocLock sync.RWMutex
   132  
   133  	// blockedAllocations are allocations which are blocked because their
   134  	// chained allocations haven't finished running
   135  	blockedAllocations map[string]*structs.Allocation
   136  	blockedAllocsLock  sync.RWMutex
   137  
   138  	// migratingAllocs is the set of allocs whose data migration is in flight
   139  	migratingAllocs     map[string]*migrateAllocCtrl
   140  	migratingAllocsLock sync.RWMutex
   141  
   142  	// allocUpdates stores allocations that need to be synced to the server.
   143  	allocUpdates chan *structs.Allocation
   144  
   145  	// consulService is Nomad's custom Consul client for managing services
   146  	// and checks.
   147  	consulService ConsulServiceAPI
   148  
   149  	// consulCatalog is the subset of Consul's Catalog API Nomad uses.
   150  	consulCatalog consul.CatalogAPI
   151  
   152  	// HostStatsCollector collects host resource usage stats
   153  	hostStatsCollector *stats.HostStatsCollector
   154  
   155  	shutdown     bool
   156  	shutdownCh   chan struct{}
   157  	shutdownLock sync.Mutex
   158  
   159  	// vaultClient is used to interact with Vault for token and secret renewals
   160  	vaultClient vaultclient.VaultClient
   161  
   162  	// garbageCollector is used to garbage collect terminal allocations present
   163  	// in the node automatically
   164  	garbageCollector *AllocGarbageCollector
   165  }
   166  
    167  // migrateAllocCtrl tracks an in-flight migration and signals when it should stop
   168  type migrateAllocCtrl struct {
   169  	alloc  *structs.Allocation
   170  	ch     chan struct{}
   171  	closed bool
   172  	chLock sync.Mutex
   173  }
   174  
   175  func newMigrateAllocCtrl(alloc *structs.Allocation) *migrateAllocCtrl {
   176  	return &migrateAllocCtrl{
   177  		ch:    make(chan struct{}),
   178  		alloc: alloc,
   179  	}
   180  }
   181  
   182  func (m *migrateAllocCtrl) closeCh() {
   183  	m.chLock.Lock()
   184  	defer m.chLock.Unlock()
   185  
   186  	if m.closed {
   187  		return
   188  	}
   189  
    190  	// Not yet closed; mark it and close the channel
   191  	m.closed = true
   192  	close(m.ch)
   193  }
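// Illustrative sketch, not part of the original file: the migration
// goroutine selects on ctrl.ch to learn it should stop, while closeCh is
// safe to call any number of times from other goroutines:
//
//	ctrl := newMigrateAllocCtrl(alloc) // alloc is assumed
//	go func() {
//		select {
//		case <-ctrl.ch:
//			return // migration aborted
//		case <-time.After(10 * time.Second):
//			// continue migrating data
//		}
//	}()
//	ctrl.closeCh()
//	ctrl.closeCh() // no-op; the closed flag prevents a double-close panic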
   194  
   195  var (
   196  	// noServersErr is returned by the RPC method when the client has no
   197  	// configured servers. This is used to trigger Consul discovery if
   198  	// enabled.
   199  	noServersErr = errors.New("no servers")
   200  )
   201  
   202  // NewClient is used to create a new client from the given configuration
   203  func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService ConsulServiceAPI, logger *log.Logger) (*Client, error) {
   204  	// Create the tls wrapper
   205  	var tlsWrap tlsutil.RegionWrapper
   206  	if cfg.TLSConfig.EnableRPC {
   207  		tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper()
   208  		if err != nil {
   209  			return nil, err
   210  		}
   211  		tlsWrap = tw
   212  	}
   213  
   214  	// Create the client
   215  	c := &Client{
   216  		config:              cfg,
   217  		consulCatalog:       consulCatalog,
   218  		consulService:       consulService,
   219  		start:               time.Now(),
   220  		connPool:            nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap),
   221  		logger:              logger,
   222  		allocs:              make(map[string]*AllocRunner),
   223  		blockedAllocations:  make(map[string]*structs.Allocation),
   224  		allocUpdates:        make(chan *structs.Allocation, 64),
   225  		shutdownCh:          make(chan struct{}),
   226  		migratingAllocs:     make(map[string]*migrateAllocCtrl),
   227  		servers:             newServerList(),
   228  		triggerDiscoveryCh:  make(chan struct{}),
   229  		serversDiscoveredCh: make(chan struct{}),
   230  	}
   231  
   232  	// Initialize the client
   233  	if err := c.init(); err != nil {
   234  		return nil, fmt.Errorf("failed to initialize client: %v", err)
   235  	}
   236  
   237  	// Add the stats collector
   238  	statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir)
   239  	c.hostStatsCollector = statsCollector
   240  
   241  	// Add the garbage collector
   242  	gcConfig := &GCConfig{
   243  		MaxAllocs:           cfg.GCMaxAllocs,
   244  		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
   245  		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
   246  		Interval:            cfg.GCInterval,
   247  		ParallelDestroys:    cfg.GCParallelDestroys,
   248  		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
   249  	}
   250  	c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig)
   251  	go c.garbageCollector.Run()
   252  
   253  	// Setup the node
   254  	if err := c.setupNode(); err != nil {
   255  		return nil, fmt.Errorf("node setup failed: %v", err)
   256  	}
   257  
   258  	// Fingerprint the node
   259  	if err := c.fingerprint(); err != nil {
   260  		return nil, fmt.Errorf("fingerprinting failed: %v", err)
   261  	}
   262  
   263  	// Scan for drivers
   264  	if err := c.setupDrivers(); err != nil {
   265  		return nil, fmt.Errorf("driver setup failed: %v", err)
   266  	}
   267  
   268  	// Setup the reserved resources
   269  	c.reservePorts()
   270  
   271  	// Store the config copy before restoring state but after it has been
   272  	// initialized.
   273  	c.configLock.Lock()
   274  	c.configCopy = c.config.Copy()
   275  	c.configLock.Unlock()
   276  
   277  	// Set the preconfigured list of static servers
   278  	c.configLock.RLock()
   279  	if len(c.configCopy.Servers) > 0 {
   280  		if err := c.SetServers(c.configCopy.Servers); err != nil {
   281  			logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
   282  		}
   283  	}
   284  	c.configLock.RUnlock()
   285  
   286  	// Setup Consul discovery if enabled
   287  	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
   288  		go c.consulDiscovery()
   289  		if len(c.servers.all()) == 0 {
   290  			// No configured servers; trigger discovery manually
   291  			c.triggerDiscoveryCh <- struct{}{}
   292  		}
   293  	}
   294  
   295  	// Setup the vault client for token and secret renewals
   296  	if err := c.setupVaultClient(); err != nil {
   297  		return nil, fmt.Errorf("failed to setup vault client: %v", err)
   298  	}
   299  
   300  	// Restore the state
   301  	if err := c.restoreState(); err != nil {
   302  		logger.Printf("[ERR] client: failed to restore state: %v", err)
   303  		logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. "+
   304  			"The safest way to proceed is to manually stop running task processes "+
   305  			"and remove Nomad's state (%q) and alloc (%q) directories before "+
   306  			"restarting. Lost allocations will be rescheduled.",
   307  			c.config.StateDir, c.config.AllocDir)
   308  		logger.Printf("[ERR] client: Corrupt state is often caused by a bug. Please " +
   309  			"report as much information as possible to " +
   310  			"https://github.com/hashicorp/nomad/issues")
   311  		return nil, fmt.Errorf("failed to restore state")
   312  	}
   313  
   314  	// Register and then start heartbeating to the servers.
   315  	go c.registerAndHeartbeat()
   316  
   317  	// Begin periodic snapshotting of state.
   318  	go c.periodicSnapshot()
   319  
   320  	// Begin syncing allocations to the server
   321  	go c.allocSync()
   322  
   323  	// Start the client!
   324  	go c.run()
   325  
   326  	// Start collecting stats
   327  	go c.emitStats()
   328  
   329  	c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID)
   330  	return c, nil
   331  }
   332  
   333  // init is used to initialize the client and perform any setup
   334  // needed before we begin starting its various components.
   335  func (c *Client) init() error {
   336  	// Ensure the state dir exists if we have one
   337  	if c.config.StateDir != "" {
   338  		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
   339  			return fmt.Errorf("failed creating state dir: %s", err)
   340  		}
   341  
   342  	} else {
    343  		// Otherwise make a temp directory to use.
   344  		p, err := ioutil.TempDir("", "NomadClient")
   345  		if err != nil {
   346  			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
   347  		}
   348  
   349  		p, err = filepath.EvalSymlinks(p)
   350  		if err != nil {
   351  			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
   352  		}
   353  
   354  		c.config.StateDir = p
   355  	}
   356  	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)
   357  
   358  	// Create or open the state database
   359  	db, err := bolt.Open(filepath.Join(c.config.StateDir, "state.db"), 0600, nil)
   360  	if err != nil {
   361  		return fmt.Errorf("failed to create state database: %v", err)
   362  	}
   363  	c.stateDB = db
   364  
   365  	// Ensure the alloc dir exists if we have one
   366  	if c.config.AllocDir != "" {
   367  		if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil {
   368  			return fmt.Errorf("failed creating alloc dir: %s", err)
   369  		}
   370  	} else {
    371  		// Otherwise make a temp directory to use.
   372  		p, err := ioutil.TempDir("", "NomadClient")
   373  		if err != nil {
   374  			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
   375  		}
   376  
   377  		p, err = filepath.EvalSymlinks(p)
   378  		if err != nil {
   379  			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
   380  		}
   381  
   382  		// Change the permissions to have the execute bit
   383  		if err := os.Chmod(p, 0711); err != nil {
   384  			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
   385  		}
   386  
   387  		c.config.AllocDir = p
   388  	}
   389  
   390  	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
   391  	return nil
   392  }
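// For reference, after init the state directory holds the bolt database
// opened above plus the identity files written later by nodeID:
//
//	<StateDir>/state.db    // bolt database with client/alloc state
//	<StateDir>/client-id   // persisted node ID
//	<StateDir>/secret-id   // persisted node secret ID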
   393  
   394  // Leave is used to prepare the client to leave the cluster
   395  func (c *Client) Leave() error {
   396  	// TODO
   397  	return nil
   398  }
   399  
   400  // Datacenter returns the datacenter for the given client
   401  func (c *Client) Datacenter() string {
   402  	c.configLock.RLock()
   403  	dc := c.configCopy.Node.Datacenter
   404  	c.configLock.RUnlock()
   405  	return dc
   406  }
   407  
   408  // Region returns the region for the given client
   409  func (c *Client) Region() string {
   410  	return c.config.Region
   411  }
   412  
   413  // RPCMajorVersion returns the structs.ApiMajorVersion supported by the
   414  // client.
   415  func (c *Client) RPCMajorVersion() int {
   416  	return structs.ApiMajorVersion
   417  }
   418  
   419  // RPCMinorVersion returns the structs.ApiMinorVersion supported by the
   420  // client.
   421  func (c *Client) RPCMinorVersion() int {
   422  	return structs.ApiMinorVersion
   423  }
   424  
   425  // Shutdown is used to tear down the client
   426  func (c *Client) Shutdown() error {
   427  	c.logger.Printf("[INFO] client: shutting down")
   428  	c.shutdownLock.Lock()
   429  	defer c.shutdownLock.Unlock()
   430  
   431  	if c.shutdown {
   432  		return nil
   433  	}
   434  
   435  	// Defer closing the database
   436  	defer func() {
   437  		if err := c.stateDB.Close(); err != nil {
   438  			c.logger.Printf("[ERR] client: failed to close state database on shutdown: %v", err)
   439  		}
   440  	}()
   441  
   442  	// Stop renewing tokens and secrets
   443  	if c.vaultClient != nil {
   444  		c.vaultClient.Stop()
   445  	}
   446  
   447  	// Stop Garbage collector
   448  	c.garbageCollector.Stop()
   449  
   450  	// Destroy all the running allocations.
   451  	if c.config.DevMode {
   452  		for _, ar := range c.getAllocRunners() {
   453  			ar.Destroy()
   454  			<-ar.WaitCh()
   455  		}
   456  	}
   457  
   458  	c.shutdown = true
   459  	close(c.shutdownCh)
   460  	c.connPool.Shutdown()
   461  	return c.saveState()
   462  }
   463  
   464  // RPC is used to forward an RPC call to a nomad server, or fail if no servers.
   465  func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
   466  	// Invoke the RPCHandler if it exists
   467  	if c.config.RPCHandler != nil {
   468  		return c.config.RPCHandler.RPC(method, args, reply)
   469  	}
   470  
   471  	servers := c.servers.all()
   472  	if len(servers) == 0 {
   473  		return noServersErr
   474  	}
   475  
   476  	var mErr multierror.Error
   477  	for _, s := range servers {
   478  		// Make the RPC request
   479  		if err := c.connPool.RPC(c.Region(), s.addr, c.RPCMajorVersion(), method, args, reply); err != nil {
   480  			errmsg := fmt.Errorf("RPC failed to server %s: %v", s.addr, err)
   481  			mErr.Errors = append(mErr.Errors, errmsg)
   482  			c.logger.Printf("[DEBUG] client: %v", errmsg)
   483  			c.servers.failed(s)
   484  			continue
   485  		}
   486  		c.servers.good(s)
   487  		return nil
   488  	}
   489  
   490  	return mErr.ErrorOrNil()
   491  }
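// Illustrative sketch, not part of the original file: every client RPC runs
// through the failover loop above. Assuming the servers expose a
// Status.Ping endpoint, connectivity could be probed with:
//
//	var args, reply struct{}
//	if err := c.RPC("Status.Ping", &args, &reply); err != nil {
//		c.logger.Printf("[WARN] client: no reachable servers: %v", err)
//	}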
   492  
   493  // Stats is used to return statistics for debugging and insight
   494  // for various sub-systems
   495  func (c *Client) Stats() map[string]map[string]string {
   496  	c.heartbeatLock.Lock()
   497  	defer c.heartbeatLock.Unlock()
   498  	stats := map[string]map[string]string{
   499  		"client": map[string]string{
   500  			"node_id":         c.Node().ID,
   501  			"known_servers":   c.servers.all().String(),
   502  			"num_allocations": strconv.Itoa(c.NumAllocs()),
   503  			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
   504  			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
   505  		},
   506  		"runtime": nomad.RuntimeStats(),
   507  	}
   508  	return stats
   509  }
   510  
   511  // CollectAllocation garbage collects a single allocation
   512  func (c *Client) CollectAllocation(allocID string) error {
   513  	return c.garbageCollector.Collect(allocID)
   514  }
   515  
   516  // CollectAllAllocs garbage collects all allocations on a node in the terminal
   517  // state
   518  func (c *Client) CollectAllAllocs() error {
   519  	return c.garbageCollector.CollectAll()
   520  }
   521  
   522  // Node returns the locally registered node
   523  func (c *Client) Node() *structs.Node {
   524  	c.configLock.RLock()
   525  	defer c.configLock.RUnlock()
   526  	return c.config.Node
   527  }
   528  
    529  // StatsReporter exposes the various APIs related to resource usage of a Nomad
   530  // client
   531  func (c *Client) StatsReporter() ClientStatsReporter {
   532  	return c
   533  }
   534  
   535  func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) {
   536  	c.allocLock.RLock()
   537  	defer c.allocLock.RUnlock()
   538  	ar, ok := c.allocs[allocID]
   539  	if !ok {
   540  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   541  	}
   542  	return ar.StatsReporter(), nil
   543  }
   544  
    545  // LatestHostStats returns the latest resource usage stats for the host
   546  func (c *Client) LatestHostStats() *stats.HostStats {
   547  	return c.hostStatsCollector.Stats()
   548  }
   549  
   550  // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
   551  func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
   552  	c.allocLock.RLock()
   553  	defer c.allocLock.RUnlock()
   554  
   555  	ar, ok := c.allocs[allocID]
   556  	if !ok {
   557  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   558  	}
   559  	return ar.GetAllocDir(), nil
   560  }
   561  
   562  // GetClientAlloc returns the allocation from the client
   563  func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
   564  	all := c.allAllocs()
   565  	alloc, ok := all[allocID]
   566  	if !ok {
   567  		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
   568  	}
   569  	return alloc, nil
   570  }
   571  
   572  // GetServers returns the list of nomad servers this client is aware of.
   573  func (c *Client) GetServers() []string {
   574  	endpoints := c.servers.all()
   575  	res := make([]string, len(endpoints))
   576  	for i := range endpoints {
   577  		res[i] = endpoints[i].addr.String()
   578  	}
   579  	return res
   580  }
   581  
   582  // SetServers sets a new list of nomad servers to connect to. As long as one
   583  // server is resolvable no error is returned.
   584  func (c *Client) SetServers(servers []string) error {
   585  	endpoints := make([]*endpoint, 0, len(servers))
   586  	var merr multierror.Error
   587  	for _, s := range servers {
   588  		addr, err := resolveServer(s)
   589  		if err != nil {
   590  			c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", s, err)
   591  			merr.Errors = append(merr.Errors, err)
   592  			continue
   593  		}
   594  
   595  		// Valid endpoint, append it without a priority as this API
   596  		// doesn't support different priorities for different servers
   597  		endpoints = append(endpoints, &endpoint{name: s, addr: addr})
   598  	}
   599  
   600  	// Only return errors if no servers are valid
   601  	if len(endpoints) == 0 {
   602  		if len(merr.Errors) > 0 {
   603  			return merr.ErrorOrNil()
   604  		}
   605  		return noServersErr
   606  	}
   607  
   608  	c.servers.set(endpoints)
   609  	return nil
   610  }
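// Illustrative sketch, not part of the original file: addresses are parsed
// by resolveServer (defined elsewhere in this package), and the call only
// fails when none of them resolve:
//
//	err := c.SetServers([]string{"10.0.0.1:4647", "bad host!", "10.0.0.2:4647"})
//	if err != nil {
//		// every entry failed to resolve; the server list is unchanged
//	}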
   611  
   612  // restoreState is used to restore our state from the data dir
   613  func (c *Client) restoreState() error {
   614  	if c.config.DevMode {
   615  		return nil
   616  	}
   617  
   618  	// COMPAT: Remove in 0.7.0
    619  	// 0.6.0 transitioned from individual state files to a single bolt-db.
   620  	// The upgrade path is to:
   621  	// Check if old state exists
   622  	//   If so, restore from that and delete old state
   623  	// Restore using state database
   624  
   625  	// Allocs holds the IDs of the allocations being restored
   626  	var allocs []string
   627  
   628  	// Upgrading tracks whether this is a pre 0.6.0 upgrade path
   629  	var upgrading bool
   630  
   631  	// Scan the directory
   632  	allocDir := filepath.Join(c.config.StateDir, "alloc")
   633  	list, err := ioutil.ReadDir(allocDir)
   634  	if err != nil && !os.IsNotExist(err) {
   635  		return fmt.Errorf("failed to list alloc state: %v", err)
   636  	} else if err == nil && len(list) != 0 {
   637  		upgrading = true
   638  		for _, entry := range list {
   639  			allocs = append(allocs, entry.Name())
   640  		}
   641  	} else {
   642  		// Normal path
   643  		err := c.stateDB.View(func(tx *bolt.Tx) error {
   644  			allocs, err = getAllAllocationIDs(tx)
   645  			if err != nil {
   646  				return fmt.Errorf("failed to list allocations: %v", err)
   647  			}
   648  			return nil
   649  		})
   650  		if err != nil {
   651  			return err
   652  		}
   653  	}
   654  
   655  	// Load each alloc back
   656  	var mErr multierror.Error
   657  	for _, id := range allocs {
   658  		alloc := &structs.Allocation{ID: id}
   659  
   660  		c.configLock.RLock()
   661  		ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService)
   662  		c.configLock.RUnlock()
   663  
   664  		c.allocLock.Lock()
   665  		c.allocs[id] = ar
   666  		c.allocLock.Unlock()
   667  
   668  		if err := ar.RestoreState(); err != nil {
   669  			c.logger.Printf("[ERR] client: failed to restore state for alloc %q: %v", id, err)
   670  			mErr.Errors = append(mErr.Errors, err)
   671  		} else {
   672  			go ar.Run()
   673  
   674  			if upgrading {
   675  				if err := ar.SaveState(); err != nil {
   676  					c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", id, err)
   677  				}
   678  			}
   679  		}
   680  	}
   681  
   682  	// Delete all the entries
   683  	if upgrading {
   684  		if err := os.RemoveAll(allocDir); err != nil {
   685  			mErr.Errors = append(mErr.Errors, err)
   686  		}
   687  	}
   688  
   689  	return mErr.ErrorOrNil()
   690  }
   691  
   692  // saveState is used to snapshot our state into the data dir.
   693  func (c *Client) saveState() error {
   694  	if c.config.DevMode {
   695  		return nil
   696  	}
   697  
   698  	var wg sync.WaitGroup
   699  	var l sync.Mutex
   700  	var mErr multierror.Error
   701  	runners := c.getAllocRunners()
   702  	wg.Add(len(runners))
   703  
   704  	for id, ar := range runners {
   705  		go func(id string, ar *AllocRunner) {
   706  			err := ar.SaveState()
   707  			if err != nil {
   708  				c.logger.Printf("[ERR] client: failed to save state for alloc %q: %v", id, err)
   709  				l.Lock()
   710  				multierror.Append(&mErr, err)
   711  				l.Unlock()
   712  			}
   713  			wg.Done()
   714  		}(id, ar)
   715  	}
   716  
   717  	wg.Wait()
   718  	return mErr.ErrorOrNil()
   719  }
   720  
   721  // getAllocRunners returns a snapshot of the current set of alloc runners.
   722  func (c *Client) getAllocRunners() map[string]*AllocRunner {
   723  	c.allocLock.RLock()
   724  	defer c.allocLock.RUnlock()
   725  	runners := make(map[string]*AllocRunner, len(c.allocs))
   726  	for id, ar := range c.allocs {
   727  		runners[id] = ar
   728  	}
   729  	return runners
   730  }
   731  
   732  // NumAllocs returns the number of allocs this client has. Used to
   733  // fulfill the AllocCounter interface for the GC.
   734  func (c *Client) NumAllocs() int {
   735  	c.allocLock.RLock()
   736  	n := len(c.allocs)
   737  	c.allocLock.RUnlock()
   738  
   739  	c.blockedAllocsLock.RLock()
   740  	n += len(c.blockedAllocations)
   741  	c.blockedAllocsLock.RUnlock()
   742  
   743  	c.migratingAllocsLock.RLock()
   744  	n += len(c.migratingAllocs)
   745  	c.migratingAllocsLock.RUnlock()
   746  
   747  	return n
   748  }
   749  
   750  // nodeID restores, or generates if necessary, a unique node ID and SecretID.
   751  // The node ID is, if available, a persistent unique ID.  The secret ID is a
   752  // high-entropy random UUID.
   753  func (c *Client) nodeID() (id, secret string, err error) {
   754  	var hostID string
   755  	hostInfo, err := host.Info()
   756  	if !c.config.NoHostUUID && err == nil {
   757  		if hashed, ok := helper.HashUUID(hostInfo.HostID); ok {
   758  			hostID = hashed
   759  		}
   760  	}
   761  
   762  	if hostID == "" {
   763  		// Generate a random hostID if no constant ID is available on
   764  		// this platform.
   765  		hostID = structs.GenerateUUID()
   766  	}
   767  
   768  	// Do not persist in dev mode
   769  	if c.config.DevMode {
   770  		return hostID, structs.GenerateUUID(), nil
   771  	}
   772  
   773  	// Attempt to read existing ID
   774  	idPath := filepath.Join(c.config.StateDir, "client-id")
   775  	idBuf, err := ioutil.ReadFile(idPath)
   776  	if err != nil && !os.IsNotExist(err) {
   777  		return "", "", err
   778  	}
   779  
   780  	// Attempt to read existing secret ID
   781  	secretPath := filepath.Join(c.config.StateDir, "secret-id")
   782  	secretBuf, err := ioutil.ReadFile(secretPath)
   783  	if err != nil && !os.IsNotExist(err) {
   784  		return "", "", err
   785  	}
   786  
   787  	// Use existing ID if any
   788  	if len(idBuf) != 0 {
   789  		id = strings.ToLower(string(idBuf))
   790  	} else {
   791  		id = hostID
   792  
   793  		// Persist the ID
   794  		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
   795  			return "", "", err
   796  		}
   797  	}
   798  
   799  	if len(secretBuf) != 0 {
   800  		secret = string(secretBuf)
   801  	} else {
   802  		// Generate new ID
   803  		secret = structs.GenerateUUID()
   804  
   805  		// Persist the ID
   806  		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
   807  			return "", "", err
   808  		}
   809  	}
   810  
   811  	return id, secret, nil
   812  }
   813  
   814  // setupNode is used to setup the initial node
   815  func (c *Client) setupNode() error {
   816  	node := c.config.Node
   817  	if node == nil {
   818  		node = &structs.Node{}
   819  		c.config.Node = node
   820  	}
   821  	// Generate an ID and secret for the node
   822  	id, secretID, err := c.nodeID()
   823  	if err != nil {
   824  		return fmt.Errorf("node ID setup failed: %v", err)
   825  	}
   826  
   827  	node.ID = id
   828  	node.SecretID = secretID
   829  	if node.Attributes == nil {
   830  		node.Attributes = make(map[string]string)
   831  	}
   832  	if node.Links == nil {
   833  		node.Links = make(map[string]string)
   834  	}
   835  	if node.Meta == nil {
   836  		node.Meta = make(map[string]string)
   837  	}
   838  	if node.Resources == nil {
   839  		node.Resources = &structs.Resources{}
   840  	}
   841  	if node.Reserved == nil {
   842  		node.Reserved = &structs.Resources{}
   843  	}
   844  	if node.Datacenter == "" {
   845  		node.Datacenter = "dc1"
   846  	}
   847  	if node.Name == "" {
   848  		node.Name, _ = os.Hostname()
   849  	}
   850  	if node.Name == "" {
   851  		node.Name = node.ID
   852  	}
   853  	node.Status = structs.NodeStatusInit
   854  	return nil
   855  }
   856  
   857  // reservePorts is used to reserve ports on the fingerprinted network devices.
   858  func (c *Client) reservePorts() {
   859  	c.configLock.RLock()
   860  	defer c.configLock.RUnlock()
   861  	global := c.config.GloballyReservedPorts
   862  	if len(global) == 0 {
   863  		return
   864  	}
   865  
   866  	node := c.config.Node
   867  	networks := node.Resources.Networks
   868  	reservedIndex := make(map[string]*structs.NetworkResource, len(networks))
   869  	for _, resNet := range node.Reserved.Networks {
   870  		reservedIndex[resNet.IP] = resNet
   871  	}
   872  
   873  	// Go through each network device and reserve ports on it.
   874  	for _, net := range networks {
   875  		res, ok := reservedIndex[net.IP]
   876  		if !ok {
   877  			res = net.Copy()
   878  			res.MBits = 0
   879  			reservedIndex[net.IP] = res
   880  		}
   881  
   882  		for _, portVal := range global {
   883  			p := structs.Port{Value: portVal}
   884  			res.ReservedPorts = append(res.ReservedPorts, p)
   885  		}
   886  	}
   887  
   888  	// Clear the reserved networks.
   889  	if node.Reserved == nil {
   890  		node.Reserved = new(structs.Resources)
   891  	} else {
   892  		node.Reserved.Networks = nil
   893  	}
   894  
   895  	// Restore the reserved networks
   896  	for _, net := range reservedIndex {
   897  		node.Reserved.Networks = append(node.Reserved.Networks, net)
   898  	}
   899  }
   900  
   901  // fingerprint is used to fingerprint the client and setup the node
   902  func (c *Client) fingerprint() error {
   903  	whitelist := c.config.ReadStringListToMap("fingerprint.whitelist")
   904  	whitelistEnabled := len(whitelist) > 0
   905  	blacklist := c.config.ReadStringListToMap("fingerprint.blacklist")
   906  
   907  	c.logger.Printf("[DEBUG] client: built-in fingerprints: %v", fingerprint.BuiltinFingerprints())
   908  
   909  	var applied []string
   910  	var skipped []string
   911  	for _, name := range fingerprint.BuiltinFingerprints() {
   912  		// Skip modules that are not in the whitelist if it is enabled.
   913  		if _, ok := whitelist[name]; whitelistEnabled && !ok {
   914  			skipped = append(skipped, name)
   915  			continue
   916  		}
   917  		// Skip modules that are in the blacklist
   918  		if _, ok := blacklist[name]; ok {
   919  			skipped = append(skipped, name)
   920  			continue
   921  		}
   922  		f, err := fingerprint.NewFingerprint(name, c.logger)
   923  		if err != nil {
   924  			return err
   925  		}
   926  
   927  		c.configLock.Lock()
   928  		applies, err := f.Fingerprint(c.config, c.config.Node)
   929  		c.configLock.Unlock()
   930  		if err != nil {
   931  			return err
   932  		}
   933  		if applies {
   934  			applied = append(applied, name)
   935  		}
   936  		p, period := f.Periodic()
   937  		if p {
   938  			// TODO: If more periodic fingerprinters are added, then
   939  			// fingerprintPeriodic should be used to handle all the periodic
   940  			// fingerprinters by using a priority queue.
   941  			go c.fingerprintPeriodic(name, f, period)
   942  		}
   943  	}
   944  	c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied)
   945  	if len(skipped) != 0 {
   946  		c.logger.Printf("[DEBUG] client: fingerprint modules skipped due to white/blacklist: %v", skipped)
   947  	}
   948  	return nil
   949  }
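// Illustrative sketch, not part of the original file: the white/blacklists
// read above come from the agent's client options map. Assuming a standard
// agent configuration file, the HCL might look like:
//
//	client {
//		options = {
//			"fingerprint.whitelist" = "cpu,memory,network"
//			"fingerprint.blacklist" = "env_aws"
//		}
//	}
//
// ReadStringListToMap turns the comma-separated value into a set.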
   950  
   951  // fingerprintPeriodic runs a fingerprinter at the specified duration.
   952  func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) {
   953  	c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d)
   954  	for {
   955  		select {
   956  		case <-time.After(d):
   957  			c.configLock.Lock()
   958  			if _, err := f.Fingerprint(c.config, c.config.Node); err != nil {
   959  				c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err)
   960  			}
   961  			c.configLock.Unlock()
   962  		case <-c.shutdownCh:
   963  			return
   964  		}
   965  	}
   966  }
   967  
   968  // setupDrivers is used to find the available drivers
   969  func (c *Client) setupDrivers() error {
   970  	// Build the white/blacklists of drivers.
   971  	whitelist := c.config.ReadStringListToMap("driver.whitelist")
   972  	whitelistEnabled := len(whitelist) > 0
   973  	blacklist := c.config.ReadStringListToMap("driver.blacklist")
   974  
   975  	var avail []string
   976  	var skipped []string
   977  	driverCtx := driver.NewDriverContext("", "", c.config, c.config.Node, c.logger, nil)
   978  	for name := range driver.BuiltinDrivers {
   979  		// Skip fingerprinting drivers that are not in the whitelist if it is
   980  		// enabled.
   981  		if _, ok := whitelist[name]; whitelistEnabled && !ok {
   982  			skipped = append(skipped, name)
   983  			continue
   984  		}
   985  		// Skip fingerprinting drivers that are in the blacklist
   986  		if _, ok := blacklist[name]; ok {
   987  			skipped = append(skipped, name)
   988  			continue
   989  		}
   990  
   991  		d, err := driver.NewDriver(name, driverCtx)
   992  		if err != nil {
   993  			return err
   994  		}
   995  		c.configLock.Lock()
   996  		applies, err := d.Fingerprint(c.config, c.config.Node)
   997  		c.configLock.Unlock()
   998  		if err != nil {
   999  			return err
  1000  		}
  1001  		if applies {
  1002  			avail = append(avail, name)
  1003  		}
  1004  
  1005  		p, period := d.Periodic()
  1006  		if p {
  1007  			go c.fingerprintPeriodic(name, d, period)
  1008  		}
  1009  
  1010  	}
  1011  
  1012  	c.logger.Printf("[DEBUG] client: available drivers %v", avail)
  1013  
  1014  	if len(skipped) != 0 {
  1015  		c.logger.Printf("[DEBUG] client: drivers skipped due to white/blacklist: %v", skipped)
  1016  	}
  1017  
  1018  	return nil
  1019  }
  1020  
  1021  // retryIntv calculates a retry interval value given the base
  1022  func (c *Client) retryIntv(base time.Duration) time.Duration {
  1023  	if c.config.DevMode {
  1024  		return devModeRetryIntv
  1025  	}
  1026  	return base + lib.RandomStagger(base)
  1027  }
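// Illustrative sketch, not part of the original file: lib.RandomStagger
// returns a random duration in [0, base), so retryIntv yields a value
// between base and 2x base, matching the constants' comments above. A rough
// stand-in without the consul dependency:
//
//	import "math/rand"
//
//	func staggered(base time.Duration) time.Duration {
//		return base + time.Duration(rand.Int63n(int64(base)))
//	}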
  1028  
  1029  // registerAndHeartbeat is a long lived goroutine used to register the client
   1030  // and then start heartbeating to the server.
  1031  func (c *Client) registerAndHeartbeat() {
  1032  	// Register the node
  1033  	c.retryRegisterNode()
  1034  
   1035  	// Start watching for node changes
  1036  	go c.watchNodeUpdates()
  1037  
  1038  	// Setup the heartbeat timer, for the initial registration
  1039  	// we want to do this quickly. We want to do it extra quickly
  1040  	// in development mode.
  1041  	var heartbeat <-chan time.Time
  1042  	if c.config.DevMode {
  1043  		heartbeat = time.After(0)
  1044  	} else {
  1045  		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1046  	}
  1047  
  1048  	for {
  1049  		select {
  1050  		case <-c.serversDiscoveredCh:
  1051  		case <-heartbeat:
  1052  		case <-c.shutdownCh:
  1053  			return
  1054  		}
  1055  
  1056  		if err := c.updateNodeStatus(); err != nil {
  1057  			// The servers have changed such that this node has not been
  1058  			// registered before
  1059  			if strings.Contains(err.Error(), "node not found") {
  1060  				// Re-register the node
  1061  				c.logger.Printf("[INFO] client: re-registering node")
  1062  				c.retryRegisterNode()
  1063  				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
  1064  			} else {
  1065  				intv := c.retryIntv(registerRetryIntv)
  1066  				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
  1067  				heartbeat = time.After(intv)
  1068  
  1069  				// if heartbeating fails, trigger Consul discovery
  1070  				c.triggerDiscovery()
  1071  			}
  1072  		} else {
  1073  			c.heartbeatLock.Lock()
  1074  			heartbeat = time.After(c.heartbeatTTL)
  1075  			c.heartbeatLock.Unlock()
  1076  		}
  1077  	}
  1078  }
  1079  
  1080  // periodicSnapshot is a long lived goroutine used to periodically snapshot the
  1081  // state of the client
  1082  func (c *Client) periodicSnapshot() {
  1083  	// Create a snapshot timer
  1084  	snapshot := time.After(stateSnapshotIntv)
  1085  
  1086  	for {
  1087  		select {
  1088  		case <-snapshot:
  1089  			snapshot = time.After(stateSnapshotIntv)
  1090  			if err := c.saveState(); err != nil {
  1091  				c.logger.Printf("[ERR] client: failed to save state: %v", err)
  1092  			}
  1093  
  1094  		case <-c.shutdownCh:
  1095  			return
  1096  		}
  1097  	}
  1098  }
  1099  
  1100  // run is a long lived goroutine used to run the client
  1101  func (c *Client) run() {
  1102  	// Watch for changes in allocations
  1103  	allocUpdates := make(chan *allocUpdates, 8)
  1104  	go c.watchAllocations(allocUpdates)
  1105  
  1106  	for {
  1107  		select {
  1108  		case update := <-allocUpdates:
  1109  			c.runAllocs(update)
  1110  
  1111  		case <-c.shutdownCh:
  1112  			return
  1113  		}
  1114  	}
  1115  }
  1116  
   1117  // hasNodeChanged calculates hashes of the node's attribute and meta maps.
  1118  // The new hash values are compared against the old (passed-in) hash values to
  1119  // determine if the node properties have changed. It returns the new hash values
  1120  // in case they are different from the old hash values.
  1121  func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) {
  1122  	c.configLock.RLock()
  1123  	defer c.configLock.RUnlock()
  1124  	newAttrHash, err := hashstructure.Hash(c.config.Node.Attributes, nil)
  1125  	if err != nil {
  1126  		c.logger.Printf("[DEBUG] client: unable to calculate node attributes hash: %v", err)
  1127  	}
  1128  	// Calculate node meta map hash
  1129  	newMetaHash, err := hashstructure.Hash(c.config.Node.Meta, nil)
  1130  	if err != nil {
  1131  		c.logger.Printf("[DEBUG] client: unable to calculate node meta hash: %v", err)
  1132  	}
  1133  	if newAttrHash != oldAttrHash || newMetaHash != oldMetaHash {
  1134  		return true, newAttrHash, newMetaHash
  1135  	}
  1136  	return false, oldAttrHash, oldMetaHash
  1137  }
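// Illustrative sketch, not part of the original file: hashstructure.Hash
// maps an arbitrary value to a stable uint64, so detecting node changes
// reduces to comparing hashes between polls:
//
//	before, _ := hashstructure.Hash(map[string]string{"cpu.arch": "amd64"}, nil)
//	after, _ := hashstructure.Hash(map[string]string{"cpu.arch": "arm64"}, nil)
//	changed := before != after // true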
  1138  
  1139  // retryRegisterNode is used to register the node or update the registration and
  1140  // retry in case of failure.
  1141  func (c *Client) retryRegisterNode() {
  1142  	for {
  1143  		err := c.registerNode()
  1144  		if err == nil {
  1145  			// Registered!
  1146  			return
  1147  		}
  1148  
  1149  		if err == noServersErr {
  1150  			c.logger.Print("[DEBUG] client: registration waiting on servers")
  1151  			c.triggerDiscovery()
  1152  		} else {
  1153  			c.logger.Printf("[ERR] client: registration failure: %v", err)
  1154  		}
  1155  		select {
  1156  		case <-c.serversDiscoveredCh:
  1157  		case <-time.After(c.retryIntv(registerRetryIntv)):
  1158  		case <-c.shutdownCh:
  1159  			return
  1160  		}
  1161  	}
  1162  }
  1163  
  1164  // registerNode is used to register the node or update the registration
  1165  func (c *Client) registerNode() error {
  1166  	node := c.Node()
  1167  	req := structs.NodeRegisterRequest{
  1168  		Node:         node,
  1169  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1170  	}
  1171  	var resp structs.NodeUpdateResponse
  1172  	if err := c.RPC("Node.Register", &req, &resp); err != nil {
  1173  		return err
  1174  	}
  1175  
  1176  	// Update the node status to ready after we register.
  1177  	c.configLock.Lock()
  1178  	node.Status = structs.NodeStatusReady
  1179  	c.configLock.Unlock()
  1180  
  1181  	c.logger.Printf("[INFO] client: node registration complete")
  1182  	if len(resp.EvalIDs) != 0 {
  1183  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs))
  1184  	}
  1185  
  1186  	c.heartbeatLock.Lock()
  1187  	defer c.heartbeatLock.Unlock()
  1188  	c.lastHeartbeat = time.Now()
  1189  	c.heartbeatTTL = resp.HeartbeatTTL
  1190  	return nil
  1191  }
  1192  
  1193  // updateNodeStatus is used to heartbeat and update the status of the node
  1194  func (c *Client) updateNodeStatus() error {
  1195  	c.heartbeatLock.Lock()
  1196  	defer c.heartbeatLock.Unlock()
  1197  
  1198  	node := c.Node()
  1199  	req := structs.NodeUpdateStatusRequest{
  1200  		NodeID:       node.ID,
  1201  		Status:       structs.NodeStatusReady,
  1202  		WriteRequest: structs.WriteRequest{Region: c.Region()},
  1203  	}
  1204  	var resp structs.NodeUpdateResponse
  1205  	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
  1206  		c.triggerDiscovery()
  1207  		return fmt.Errorf("failed to update status: %v", err)
  1208  	}
  1209  	if len(resp.EvalIDs) != 0 {
  1210  		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs))
  1211  	}
  1212  	if resp.Index != 0 {
  1213  		c.logger.Printf("[DEBUG] client: state updated to %s", req.Status)
  1214  	}
  1215  
  1216  	// Update heartbeat time and ttl
  1217  	c.lastHeartbeat = time.Now()
  1218  	c.heartbeatTTL = resp.HeartbeatTTL
  1219  
  1220  	// Convert []*NodeServerInfo to []*endpoints
  1221  	localdc := c.Datacenter()
  1222  	servers := make(endpoints, 0, len(resp.Servers))
  1223  	for _, s := range resp.Servers {
  1224  		addr, err := resolveServer(s.RPCAdvertiseAddr)
  1225  		if err != nil {
  1226  			continue
  1227  		}
  1228  		e := endpoint{name: s.RPCAdvertiseAddr, addr: addr}
  1229  		if s.Datacenter != localdc {
  1230  			// server is non-local; de-prioritize
  1231  			e.priority = 1
  1232  		}
  1233  		servers = append(servers, &e)
  1234  	}
  1235  	if len(servers) == 0 {
  1236  		return fmt.Errorf("server returned no valid servers")
  1237  	}
  1238  	c.servers.set(servers)
  1239  
   1240  	// Begin polling Consul if there is no Nomad leader.  We could be
   1241  	// heartbeating to a Nomad server that is in the minority of a
   1242  	// partitioned quorum while this agent still has connectivity to the
   1243  	// existing majority of Nomad servers, which it can only rediscover
   1244  	// by querying Consul.
  1245  	if resp.LeaderRPCAddr == "" {
  1246  		c.triggerDiscovery()
  1247  	}
  1248  
  1249  	return nil
  1250  }
  1251  
  1252  // updateAllocStatus is used to update the status of an allocation
  1253  func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
  1254  	// If this alloc was blocking another alloc and transitioned to a
  1255  	// terminal state then start the blocked allocation
  1256  	if alloc.Terminated() {
  1257  		c.blockedAllocsLock.Lock()
  1258  		blockedAlloc, ok := c.blockedAllocations[alloc.ID]
  1259  		if ok {
  1260  			var prevAllocDir *allocdir.AllocDir
  1261  			if ar, ok := c.getAllocRunners()[alloc.ID]; ok {
  1262  				tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1263  				if tg != nil && tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky {
  1264  					prevAllocDir = ar.GetAllocDir()
  1265  				}
  1266  			}
  1267  
  1268  			delete(c.blockedAllocations, blockedAlloc.PreviousAllocation)
  1269  			c.blockedAllocsLock.Unlock()
  1270  
  1271  			c.logger.Printf("[DEBUG] client: unblocking alloc %q because alloc %q terminated", blockedAlloc.ID, alloc.ID)
  1272  
  1273  			// Need to call addAlloc without holding the lock
  1274  			if err := c.addAlloc(blockedAlloc, prevAllocDir); err != nil {
  1275  				c.logger.Printf("[ERR] client: failed to add alloc which was previously blocked %q: %v",
  1276  					blockedAlloc.ID, err)
  1277  			}
  1278  		} else {
  1279  			c.blockedAllocsLock.Unlock()
  1280  		}
  1281  
  1282  		// Mark the allocation for GC if it is in terminal state
  1283  		if ar, ok := c.getAllocRunners()[alloc.ID]; ok {
  1284  			if err := c.garbageCollector.MarkForCollection(ar); err != nil {
  1285  				c.logger.Printf("[DEBUG] client: couldn't add alloc %q for GC: %v", alloc.ID, err)
  1286  			}
  1287  		}
  1288  	}
  1289  
  1290  	// Strip all the information that can be reconstructed at the server.  Only
  1291  	// send the fields that are updatable by the client.
  1292  	stripped := new(structs.Allocation)
  1293  	stripped.ID = alloc.ID
  1294  	stripped.NodeID = c.Node().ID
  1295  	stripped.TaskStates = alloc.TaskStates
  1296  	stripped.ClientStatus = alloc.ClientStatus
  1297  	stripped.ClientDescription = alloc.ClientDescription
  1298  	stripped.DeploymentStatus = alloc.DeploymentStatus
  1299  
  1300  	select {
  1301  	case c.allocUpdates <- stripped:
  1302  	case <-c.shutdownCh:
  1303  	}
  1304  }
  1305  
  1306  // allocSync is a long lived function that batches allocation updates to the
  1307  // server.
  1308  func (c *Client) allocSync() {
  1309  	staggered := false
  1310  	syncTicker := time.NewTicker(allocSyncIntv)
  1311  	updates := make(map[string]*structs.Allocation)
  1312  	for {
  1313  		select {
  1314  		case <-c.shutdownCh:
  1315  			syncTicker.Stop()
  1316  			return
  1317  		case alloc := <-c.allocUpdates:
  1318  			// Batch the allocation updates until the timer triggers.
  1319  			updates[alloc.ID] = alloc
  1320  		case <-syncTicker.C:
  1321  			// Fast path if there are no updates
  1322  			if len(updates) == 0 {
  1323  				continue
  1324  			}
  1325  
  1326  			sync := make([]*structs.Allocation, 0, len(updates))
  1327  			for _, alloc := range updates {
  1328  				sync = append(sync, alloc)
  1329  			}
  1330  
  1331  			// Send to server.
  1332  			args := structs.AllocUpdateRequest{
  1333  				Alloc:        sync,
  1334  				WriteRequest: structs.WriteRequest{Region: c.Region()},
  1335  			}
  1336  
  1337  			var resp structs.GenericResponse
  1338  			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
  1339  				c.logger.Printf("[ERR] client: failed to update allocations: %v", err)
  1340  				syncTicker.Stop()
  1341  				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
  1342  				staggered = true
  1343  			} else {
  1344  				updates = make(map[string]*structs.Allocation)
  1345  				if staggered {
  1346  					syncTicker.Stop()
  1347  					syncTicker = time.NewTicker(allocSyncIntv)
  1348  					staggered = false
  1349  				}
  1350  			}
  1351  		}
  1352  	}
  1353  }
  1354  
  1355  // allocUpdates holds the results of receiving updated allocations from the
  1356  // servers.
  1357  type allocUpdates struct {
  1358  	// pulled is the set of allocations that were downloaded from the servers.
  1359  	pulled map[string]*structs.Allocation
  1360  
  1361  	// filtered is the set of allocations that were not pulled because their
  1362  	// AllocModifyIndex didn't change.
  1363  	filtered map[string]struct{}
  1364  }
  1365  
  1366  // watchAllocations is used to scan for updates to allocations
  1367  func (c *Client) watchAllocations(updates chan *allocUpdates) {
  1368  	// The request and response for getting the map of allocations that should
  1369  	// be running on the Node to their AllocModifyIndex which is incremented
  1370  	// when the allocation is updated by the servers.
  1371  	n := c.Node()
  1372  	req := structs.NodeSpecificRequest{
  1373  		NodeID:   n.ID,
  1374  		SecretID: n.SecretID,
  1375  		QueryOptions: structs.QueryOptions{
  1376  			Region:     c.Region(),
  1377  			AllowStale: true,
  1378  		},
  1379  	}
  1380  	var resp structs.NodeClientAllocsResponse
  1381  
  1382  	// The request and response for pulling down the set of allocations that are
  1383  	// new, or updated server side.
  1384  	allocsReq := structs.AllocsGetRequest{
  1385  		QueryOptions: structs.QueryOptions{
  1386  			Region:     c.Region(),
  1387  			AllowStale: true,
  1388  		},
  1389  	}
  1390  	var allocsResp structs.AllocsGetResponse
  1391  
  1392  OUTER:
  1393  	for {
  1394  		// Get the allocation modify index map, blocking for updates. We will
  1395  		// use this to determine exactly what allocations need to be downloaded
  1396  		// in full.
  1397  		resp = structs.NodeClientAllocsResponse{}
  1398  		err := c.RPC("Node.GetClientAllocs", &req, &resp)
  1399  		if err != nil {
  1400  			// Shutdown often causes EOF errors, so check for shutdown first
  1401  			select {
  1402  			case <-c.shutdownCh:
  1403  				return
  1404  			default:
  1405  			}
  1406  
  1407  			// COMPAT: Remove in 0.6. This is to allow the case in which the
  1408  			// servers are not fully upgraded before the clients register. This
  1409  			// can cause the SecretID to be lost
  1410  			if strings.Contains(err.Error(), "node secret ID does not match") {
  1411  				c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err)
  1412  				c.retryRegisterNode()
  1413  			} else if err != noServersErr {
  1414  				c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err)
  1415  			}
  1416  			retry := c.retryIntv(getAllocRetryIntv)
  1417  			select {
  1418  			case <-c.serversDiscoveredCh:
  1419  				continue
  1420  			case <-time.After(retry):
  1421  				continue
  1422  			case <-c.shutdownCh:
  1423  				return
  1424  			}
  1425  		}
  1426  
  1427  		// Check for shutdown
  1428  		select {
  1429  		case <-c.shutdownCh:
  1430  			return
  1431  		default:
  1432  		}
  1433  
  1434  		// Filter all allocations whose AllocModifyIndex was not incremented.
  1435  		// These are the allocations who have either not been updated, or whose
  1436  		// updates are a result of the client sending an update for the alloc.
  1437  		// This lets us reduce the network traffic to the server as we don't
  1438  		// need to pull all the allocations.
  1439  		var pull []string
  1440  		filtered := make(map[string]struct{})
  1441  		runners := c.getAllocRunners()
  1442  		var pullIndex uint64
  1443  		for allocID, modifyIndex := range resp.Allocs {
  1444  			// Pull the allocation if we don't have an alloc runner for the
  1445  			// allocation or if the alloc runner requires an updated allocation.
  1446  			runner, ok := runners[allocID]
  1447  
  1448  			if !ok || runner.shouldUpdate(modifyIndex) {
  1449  				// Only pull allocs that are required. Filtered
  1450  				// allocs might be at a higher index, so ignore
   1451  				// them.
  1452  				if modifyIndex > pullIndex {
  1453  					pullIndex = modifyIndex
  1454  				}
  1455  				pull = append(pull, allocID)
  1456  			} else {
  1457  				filtered[allocID] = struct{}{}
  1458  			}
  1459  		}
  1460  
  1461  		// Pull the allocations that passed filtering.
  1462  		allocsResp.Allocs = nil
  1463  		var pulledAllocs map[string]*structs.Allocation
  1464  		if len(pull) != 0 {
  1465  			// Pull the allocations that need to be updated.
  1466  			allocsReq.AllocIDs = pull
  1467  			allocsReq.MinQueryIndex = pullIndex - 1
  1468  			allocsResp = structs.AllocsGetResponse{}
  1469  			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
  1470  				c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err)
  1471  				retry := c.retryIntv(getAllocRetryIntv)
  1472  				select {
  1473  				case <-c.serversDiscoveredCh:
  1474  					continue
  1475  				case <-time.After(retry):
  1476  					continue
  1477  				case <-c.shutdownCh:
  1478  					return
  1479  				}
  1480  			}
  1481  
  1482  			// Ensure that we received all the allocations we wanted
  1483  			pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs))
  1484  			for _, alloc := range allocsResp.Allocs {
  1485  				pulledAllocs[alloc.ID] = alloc
  1486  			}
  1487  
  1488  			for _, desiredID := range pull {
  1489  				if _, ok := pulledAllocs[desiredID]; !ok {
  1490  					// We didn't get everything we wanted. Do not update the
  1491  					// MinQueryIndex, sleep and then retry.
  1492  					wait := c.retryIntv(2 * time.Second)
  1493  					select {
  1494  					case <-time.After(wait):
  1495  						// Wait for the server we contact to receive the
  1496  						// allocations
  1497  						continue OUTER
  1498  					case <-c.shutdownCh:
  1499  						return
  1500  					}
  1501  				}
  1502  			}
  1503  
  1504  			// Check for shutdown
  1505  			select {
  1506  			case <-c.shutdownCh:
  1507  				return
  1508  			default:
  1509  			}
  1510  		}
  1511  
  1512  		c.logger.Printf("[DEBUG] client: updated allocations at index %d (total %d) (pulled %d) (filtered %d)",
  1513  			resp.Index, len(resp.Allocs), len(allocsResp.Allocs), len(filtered))
  1514  
  1515  		// Update the query index.
  1516  		if resp.Index > req.MinQueryIndex {
  1517  			req.MinQueryIndex = resp.Index
  1518  		}
  1519  
  1520  		// Push the updates.
  1521  		update := &allocUpdates{
  1522  			filtered: filtered,
  1523  			pulled:   pulledAllocs,
  1524  		}
  1525  		select {
  1526  		case updates <- update:
  1527  		case <-c.shutdownCh:
  1528  			return
  1529  		}
  1530  	}
  1531  }
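// Illustrative sketch, not part of the original file: a worked example of
// the modify-index filtering above. Suppose the server reports
//
//	resp.Allocs = map[string]uint64{"a1": 10, "a2": 12}
//
// and the runner for "a1" has already seen index 10 while "a2" last saw 11.
// Then "a1" lands in filtered, "a2" is appended to pull, and pullIndex
// becomes 12, so the follow-up Alloc.GetAllocs blocks at MinQueryIndex 11.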
  1532  
  1533  // watchNodeUpdates periodically checks for changes to the node attributes or meta map
  1534  func (c *Client) watchNodeUpdates() {
  1535  	c.logger.Printf("[DEBUG] client: periodically checking for node changes at duration %v", nodeUpdateRetryIntv)
  1536  
  1537  	// Initialize the hashes
  1538  	_, attrHash, metaHash := c.hasNodeChanged(0, 0)
  1539  	var changed bool
  1540  	for {
  1541  		select {
  1542  		case <-time.After(c.retryIntv(nodeUpdateRetryIntv)):
  1543  			changed, attrHash, metaHash = c.hasNodeChanged(attrHash, metaHash)
  1544  			if changed {
  1545  				c.logger.Printf("[DEBUG] client: state changed, updating node.")
  1546  
  1547  				// Update the config copy.
  1548  				c.configLock.Lock()
  1549  				node := c.config.Node.Copy()
  1550  				c.configCopy.Node = node
  1551  				c.configLock.Unlock()
  1552  
  1553  				c.retryRegisterNode()
  1554  			}
  1555  		case <-c.shutdownCh:
  1556  			return
  1557  		}
  1558  	}
  1559  }
  1560  
  1561  // runAllocs is invoked when we get an updated set of allocations
  1562  func (c *Client) runAllocs(update *allocUpdates) {
  1563  	// Get the existing allocs
  1564  	c.allocLock.RLock()
  1565  	exist := make([]*structs.Allocation, 0, len(c.allocs))
  1566  	for _, ar := range c.allocs {
  1567  		exist = append(exist, ar.alloc)
  1568  	}
  1569  	c.allocLock.RUnlock()
  1570  
  1571  	// Diff the existing and updated allocations
  1572  	diff := diffAllocs(exist, update)
  1573  	c.logger.Printf("[DEBUG] client: %#v", diff)
  1574  
  1575  	// Remove the old allocations
  1576  	for _, remove := range diff.removed {
  1577  		if err := c.removeAlloc(remove); err != nil {
  1578  			c.logger.Printf("[ERR] client: failed to remove alloc '%s': %v", remove.ID, err)
  1579  		}
  1580  	}
  1581  
  1582  	// Update the existing allocations
  1583  	for _, update := range diff.updated {
  1584  		if err := c.updateAlloc(update.exist, update.updated); err != nil {
  1585  			c.logger.Printf("[ERR] client: failed to update alloc %q: %v",
  1586  				update.exist.ID, err)
  1587  		}
  1588  
  1589  		// See if the updated alloc is getting migrated
  1590  		c.migratingAllocsLock.RLock()
  1591  		ch, ok := c.migratingAllocs[update.updated.ID]
  1592  		c.migratingAllocsLock.RUnlock()
  1593  		if ok {
			// Stop the migration if the allocation no longer needs
			// to migrate
  1596  			if !update.updated.ShouldMigrate() {
  1597  				ch.closeCh()
  1598  			}
  1599  		}
  1600  	}
  1601  
  1602  	// Start the new allocations
  1603  	for _, add := range diff.added {
  1604  		// If the allocation is chained and the previous allocation hasn't
  1605  		// terminated yet, then add the alloc to the blocked queue.
  1606  		c.blockedAllocsLock.Lock()
  1607  		ar, ok := c.getAllocRunners()[add.PreviousAllocation]
  1608  		if ok && !ar.Alloc().Terminated() {
  1609  			// Check if the alloc is already present in the blocked allocations
  1610  			// map
  1611  			if _, ok := c.blockedAllocations[add.PreviousAllocation]; !ok {
  1612  				c.logger.Printf("[DEBUG] client: added alloc %q to blocked queue for previous alloc %q",
  1613  					add.ID, add.PreviousAllocation)
  1614  				c.blockedAllocations[add.PreviousAllocation] = add
  1615  			}
  1616  			c.blockedAllocsLock.Unlock()
  1617  			continue
  1618  		}
  1619  		c.blockedAllocsLock.Unlock()
  1620  
		// The previous allocation ran on another node, so block until it
		// completes there before starting this one
  1623  		if add.PreviousAllocation != "" && !ok {
  1624  			// Ensure that we are not blocking for the remote allocation if we
  1625  			// have already blocked
  1626  			c.migratingAllocsLock.Lock()
  1627  			if _, ok := c.migratingAllocs[add.ID]; !ok {
  1628  				// Check that we don't have an alloc runner already. This
  1629  				// prevents a race between a finishing blockForRemoteAlloc and
  1630  				// another invocation of runAllocs
  1631  				if _, ok := c.getAllocRunners()[add.PreviousAllocation]; !ok {
  1632  					c.migratingAllocs[add.ID] = newMigrateAllocCtrl(add)
  1633  					go c.blockForRemoteAlloc(add)
  1634  				}
  1635  			}
  1636  			c.migratingAllocsLock.Unlock()
  1637  			continue
  1638  		}
  1639  
		// Set the previous allocdir if the previous allocation ran locally
		// and the task group uses a sticky ephemeral disk
  1642  		var prevAllocDir *allocdir.AllocDir
  1643  		tg := add.Job.LookupTaskGroup(add.TaskGroup)
  1644  		if tg != nil && tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky && ar != nil {
  1645  			prevAllocDir = ar.GetAllocDir()
  1646  		}
  1647  
  1648  		if err := c.addAlloc(add, prevAllocDir); err != nil {
  1649  			c.logger.Printf("[ERR] client: failed to add alloc '%s': %v",
  1650  				add.ID, err)
  1651  		}
  1652  	}
  1653  }
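
// diffAllocs (defined elsewhere in this package) partitions allocations into
// the added, removed, updated, and ignored sets consumed above. A rough
// sketch of the shape of such a diff, not the actual implementation:
//
//	for _, existing := range exist {
//		if updated, ok := update.pulled[existing.ID]; ok {
//			// known alloc with new content -> updated
//			_ = updated
//		} else if _, ok := update.filtered[existing.ID]; !ok {
//			// no longer known to the server -> removed
//		}
//	}
//	// allocs in update.pulled with no existing counterpart -> added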
  1654  
// blockForRemoteAlloc blocks until an allocation's previous allocation has
// terminated, then migrates its snapshot data
  1657  func (c *Client) blockForRemoteAlloc(alloc *structs.Allocation) {
	// Remove the allocation from the set of allocs that are currently
	// undergoing migration
  1660  	defer func() {
  1661  		c.migratingAllocsLock.Lock()
  1662  		delete(c.migratingAllocs, alloc.ID)
  1663  		c.migratingAllocsLock.Unlock()
  1664  	}()
  1665  
  1666  	// prevAllocDir is the allocation directory of the previous allocation
  1667  	var prevAllocDir *allocdir.AllocDir
  1668  
  1669  	// If the allocation is not sticky then we won't wait for the previous
  1670  	// allocation to be terminal
  1671  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1672  	if tg == nil {
		c.logger.Printf("[ERR] client: task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
  1674  		goto ADDALLOC
  1675  	}
  1676  
  1677  	// Wait for the remote previous alloc to be terminal if the alloc is sticky
  1678  	if tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky && tg.EphemeralDisk.Migrate {
  1679  		c.logger.Printf("[DEBUG] client: blocking alloc %q for previous allocation %q", alloc.ID, alloc.PreviousAllocation)
  1680  		// Block until the previous allocation migrates to terminal state
		c.migratingAllocsLock.RLock()
		stopCh := c.migratingAllocs[alloc.ID]
		c.migratingAllocsLock.RUnlock()
  1682  		prevAlloc, err := c.waitForAllocTerminal(alloc.PreviousAllocation, stopCh)
  1683  		if err != nil {
  1684  			c.logger.Printf("[ERR] client: error waiting for allocation %q: %v",
  1685  				alloc.PreviousAllocation, err)
  1686  		}
  1687  
  1688  		// Migrate the data from the remote node
  1689  		prevAllocDir, err = c.migrateRemoteAllocDir(prevAlloc, alloc.ID)
  1690  		if err != nil {
  1691  			c.logger.Printf("[ERR] client: error migrating data from remote alloc %q: %v",
  1692  				alloc.PreviousAllocation, err)
  1693  		}
  1694  	}
  1695  
  1696  ADDALLOC:
  1697  	// Add the allocation
  1698  	if err := c.addAlloc(alloc, prevAllocDir); err != nil {
  1699  		c.logger.Printf("[ERR] client: error adding alloc: %v", err)
  1700  	}
  1701  }
  1702  
  1703  // waitForAllocTerminal waits for an allocation with the given alloc id to
  1704  // transition to terminal state and blocks the caller until then.
  1705  func (c *Client) waitForAllocTerminal(allocID string, stopCh *migrateAllocCtrl) (*structs.Allocation, error) {
  1706  	req := structs.AllocSpecificRequest{
  1707  		AllocID: allocID,
  1708  		QueryOptions: structs.QueryOptions{
  1709  			Region:     c.Region(),
  1710  			AllowStale: true,
  1711  		},
  1712  	}
  1713  
  1714  	for {
  1715  		resp := structs.SingleAllocResponse{}
  1716  		err := c.RPC("Alloc.GetAlloc", &req, &resp)
  1717  		if err != nil {
  1718  			c.logger.Printf("[ERR] client: failed to query allocation %q: %v", allocID, err)
  1719  			retry := c.retryIntv(getAllocRetryIntv)
  1720  			select {
  1721  			case <-time.After(retry):
  1722  				continue
  1723  			case <-stopCh.ch:
  1724  				return nil, fmt.Errorf("giving up waiting on alloc %q since migration is not needed", allocID)
  1725  			case <-c.shutdownCh:
  1726  				return nil, fmt.Errorf("aborting because client is shutting down")
  1727  			}
  1728  		}
  1729  		if resp.Alloc == nil {
  1730  			return nil, nil
  1731  		}
  1732  		if resp.Alloc.Terminated() {
  1733  			return resp.Alloc, nil
  1734  		}
  1735  
  1736  		// Update the query index.
  1737  		if resp.Index > req.MinQueryIndex {
  1738  			req.MinQueryIndex = resp.Index
  1739  		}
	}
  1742  }
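
// Callers pair waitForAllocTerminal with a migrateAllocCtrl so the wait can
// be cancelled if migration becomes unnecessary. Hypothetical usage sketch:
//
//	ctrl := newMigrateAllocCtrl(alloc)
//	prev, err := c.waitForAllocTerminal(alloc.PreviousAllocation, ctrl)
//	// prev == nil means the previous alloc is already gone server-side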
  1743  
  1744  // migrateRemoteAllocDir migrates the allocation directory from a remote node to
  1745  // the current node
  1746  func (c *Client) migrateRemoteAllocDir(alloc *structs.Allocation, allocID string) (*allocdir.AllocDir, error) {
  1747  	if alloc == nil {
  1748  		return nil, nil
  1749  	}
  1750  
  1751  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1752  	if tg == nil {
		return nil, fmt.Errorf("task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
  1754  	}
  1755  
  1756  	// Skip migration of data if the ephemeral disk is not sticky or
  1757  	// migration is turned off.
  1758  	if tg.EphemeralDisk == nil || !tg.EphemeralDisk.Sticky || !tg.EphemeralDisk.Migrate {
  1759  		return nil, nil
  1760  	}
  1761  
	node, err := c.getNode(alloc.NodeID)
	if err != nil {
		return nil, fmt.Errorf("error retrieving node %v: %v", alloc.NodeID, err)
	}
  1768  
  1769  	// Check if node is nil
  1770  	if node == nil {
  1771  		return nil, fmt.Errorf("node %q doesn't exist", alloc.NodeID)
  1772  	}
  1773  
  1774  	// skip migration if the remote node is down
  1775  	if node.Status == structs.NodeStatusDown {
  1776  		c.logger.Printf("[INFO] client: not migrating data from alloc %q since node %q is down", alloc.ID, alloc.NodeID)
  1777  		return nil, nil
  1778  	}
  1779  
  1780  	// Create the previous alloc dir
  1781  	pathToAllocDir := filepath.Join(c.config.AllocDir, alloc.ID)
  1782  	if err := os.MkdirAll(pathToAllocDir, 0777); err != nil {
  1783  		c.logger.Printf("[ERR] client: error creating previous allocation dir: %v", err)
  1784  	}
  1785  
  1786  	// Get the snapshot
  1787  	scheme := "http"
  1788  	if node.TLSEnabled {
  1789  		scheme = "https"
  1790  	}
  1791  	// Create an API client
  1792  	apiConfig := nomadapi.DefaultConfig()
  1793  	apiConfig.Address = fmt.Sprintf("%s://%s", scheme, node.HTTPAddr)
  1794  	apiConfig.TLSConfig = &nomadapi.TLSConfig{
  1795  		CACert:     c.config.TLSConfig.CAFile,
  1796  		ClientCert: c.config.TLSConfig.CertFile,
  1797  		ClientKey:  c.config.TLSConfig.KeyFile,
  1798  	}
  1799  	apiClient, err := nomadapi.NewClient(apiConfig)
  1800  	if err != nil {
  1801  		return nil, err
  1802  	}
  1803  
  1804  	url := fmt.Sprintf("/v1/client/allocation/%v/snapshot", alloc.ID)
  1805  	resp, err := apiClient.Raw().Response(url, nil)
  1806  	if err != nil {
  1807  		os.RemoveAll(pathToAllocDir)
  1808  		c.logger.Printf("[ERR] client: error getting snapshot for alloc %q: %v", alloc.ID, err)
  1809  		return nil, fmt.Errorf("error getting snapshot for alloc %q: %v", alloc.ID, err)
  1810  	}
  1811  
  1812  	if err := c.unarchiveAllocDir(resp, allocID, pathToAllocDir); err != nil {
  1813  		return nil, err
  1814  	}
  1815  
  1816  	// If there were no errors then we create the allocdir
  1817  	prevAllocDir := allocdir.NewAllocDir(c.logger, pathToAllocDir)
  1818  	return prevAllocDir, nil
  1819  }
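
// The snapshot endpoint streams a tar archive of the remote allocation
// directory. The code above goes through the nomad api client so the
// client's TLS settings are honored; a roughly equivalent plain HTTP
// request would look like this (illustrative only, no TLS):
//
//	url := fmt.Sprintf("http://%s/v1/client/allocation/%s/snapshot", node.HTTPAddr, alloc.ID)
//	resp, err := http.Get(url)
//	if err != nil {
//		return nil, err
//	}
//	defer resp.Body.Close()
//	// resp.Body is the tar stream consumed by unarchiveAllocDir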
  1820  
// unarchiveAllocDir reads the streamed tar archive of an allocation directory
// and writes its contents to disk.
  1823  func (c *Client) unarchiveAllocDir(resp io.ReadCloser, allocID string, pathToAllocDir string) error {
  1824  	tr := tar.NewReader(resp)
  1825  	defer resp.Close()
  1826  
  1827  	buf := make([]byte, 1024)
  1828  
	c.migratingAllocsLock.RLock()
	stopMigrating, ok := c.migratingAllocs[allocID]
	c.migratingAllocsLock.RUnlock()
  1830  	if !ok {
  1831  		os.RemoveAll(pathToAllocDir)
		return fmt.Errorf("allocation %q is not marked for remote migration", allocID)
  1833  	}
  1834  	for {
  1835  		// See if the alloc still needs migration
  1836  		select {
  1837  		case <-stopMigrating.ch:
  1838  			os.RemoveAll(pathToAllocDir)
  1839  			c.logger.Printf("[INFO] client: stopping migration of allocdir for alloc: %v", allocID)
  1840  			return nil
  1841  		case <-c.shutdownCh:
  1842  			os.RemoveAll(pathToAllocDir)
  1843  			c.logger.Printf("[INFO] client: stopping migration of alloc %q since client is shutting down", allocID)
  1844  			return nil
  1845  		default:
  1846  		}
  1847  
  1848  		// Get the next header
  1849  		hdr, err := tr.Next()
  1850  
  1851  		// Snapshot has ended
  1852  		if err == io.EOF {
  1853  			return nil
  1854  		}
  1855  		// If there is an error then we avoid creating the alloc dir
  1856  		if err != nil {
  1857  			os.RemoveAll(pathToAllocDir)
  1858  			return fmt.Errorf("error creating alloc dir for alloc %q: %v", allocID, err)
  1859  		}
  1860  
  1861  		// If the header is for a directory we create the directory
		// If the header is for a directory we create the directory
		if hdr.Typeflag == tar.TypeDir {
			if err := os.MkdirAll(filepath.Join(pathToAllocDir, hdr.Name), os.FileMode(hdr.Mode)); err != nil {
				c.logger.Printf("[ERR] client: error creating directory: %v", err)
			}
			continue
		}
  1866  		// If the header is for a symlink we create the symlink
  1867  		if hdr.Typeflag == tar.TypeSymlink {
  1868  			if err = os.Symlink(hdr.Linkname, filepath.Join(pathToAllocDir, hdr.Name)); err != nil {
  1869  				c.logger.Printf("[ERR] client: error creating symlink: %v", err)
  1870  			}
  1871  			continue
  1872  		}
  1873  		// If the header is a file, we write to a file
  1874  		if hdr.Typeflag == tar.TypeReg {
  1875  			f, err := os.Create(filepath.Join(pathToAllocDir, hdr.Name))
  1876  			if err != nil {
  1877  				c.logger.Printf("[ERR] client: error creating file: %v", err)
  1878  				continue
  1879  			}
  1880  
  1881  			// Setting the permissions of the file as the origin.
  1882  			if err := f.Chmod(os.FileMode(hdr.Mode)); err != nil {
  1883  				f.Close()
  1884  				c.logger.Printf("[ERR] client: error chmod-ing file %s: %v", f.Name(), err)
				return fmt.Errorf("error chmoding file %q: %v", f.Name(), err)
  1886  			}
  1887  			if err := f.Chown(hdr.Uid, hdr.Gid); err != nil {
  1888  				f.Close()
  1889  				c.logger.Printf("[ERR] client: error chown-ing file %s: %v", f.Name(), err)
				return fmt.Errorf("error chowning file %q: %v", f.Name(), err)
  1891  			}
  1892  
			// We write in 1 KiB chunks so that between chunks we can
			// check whether the client is shutting down
  1895  			for {
  1896  				if c.shutdown {
  1897  					f.Close()
  1898  					os.RemoveAll(pathToAllocDir)
  1899  					c.logger.Printf("[INFO] client: stopping migration of alloc %q because client is shutting down", allocID)
  1900  					return nil
  1901  				}
  1902  
  1903  				n, err := tr.Read(buf)
  1904  				if err != nil {
  1905  					f.Close()
  1906  					if err != io.EOF {
  1907  						return fmt.Errorf("error reading snapshot: %v", err)
  1908  					}
  1909  					break
  1910  				}
  1911  				if _, err := f.Write(buf[:n]); err != nil {
  1912  					f.Close()
  1913  					os.RemoveAll(pathToAllocDir)
  1914  					return fmt.Errorf("error writing to file %q: %v", f.Name(), err)
  1915  				}
  1916  			}
  1917  
  1918  		}
  1919  	}
  1920  }
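
// Stripped of the shutdown and cancellation checks, the extraction loop
// above reduces to the standard archive/tar pattern (illustrative sketch):
//
//	tr := tar.NewReader(r)
//	for {
//		hdr, err := tr.Next()
//		if err == io.EOF {
//			break // end of archive
//		}
//		if err != nil {
//			return err
//		}
//		switch hdr.Typeflag {
//		case tar.TypeDir:
//			os.MkdirAll(filepath.Join(dst, hdr.Name), os.FileMode(hdr.Mode))
//		case tar.TypeReg:
//			f, err := os.Create(filepath.Join(dst, hdr.Name))
//			if err != nil {
//				return err
//			}
//			io.Copy(f, tr)
//			f.Close()
//		}
//	}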
  1921  
  1922  // getNode gets the node from the server with the given Node ID
  1923  func (c *Client) getNode(nodeID string) (*structs.Node, error) {
  1924  	req := structs.NodeSpecificRequest{
  1925  		NodeID: nodeID,
  1926  		QueryOptions: structs.QueryOptions{
  1927  			Region:     c.Region(),
  1928  			AllowStale: true,
  1929  		},
  1930  	}
  1931  
  1932  	resp := structs.SingleNodeResponse{}
  1933  	for {
  1934  		err := c.RPC("Node.GetNode", &req, &resp)
  1935  		if err != nil {
  1936  			c.logger.Printf("[ERR] client: failed to query node info %q: %v", nodeID, err)
  1937  			retry := c.retryIntv(getAllocRetryIntv)
  1938  			select {
  1939  			case <-time.After(retry):
  1940  				continue
  1941  			case <-c.shutdownCh:
  1942  				return nil, fmt.Errorf("aborting because client is shutting down")
  1943  			}
  1944  		}
  1945  		break
  1946  	}
  1947  
  1948  	return resp.Node, nil
  1949  }
  1950  
  1951  // removeAlloc is invoked when we should remove an allocation
  1952  func (c *Client) removeAlloc(alloc *structs.Allocation) error {
  1953  	c.allocLock.Lock()
  1954  	ar, ok := c.allocs[alloc.ID]
  1955  	if !ok {
  1956  		c.allocLock.Unlock()
  1957  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID)
  1958  		return nil
  1959  	}
  1960  	delete(c.allocs, alloc.ID)
  1961  	c.allocLock.Unlock()
  1962  
  1963  	// Ensure the GC has a reference and then collect. Collecting through the GC
  1964  	// applies rate limiting
  1965  	c.garbageCollector.MarkForCollection(ar)
  1966  	go c.garbageCollector.Collect(alloc.ID)
  1967  
  1968  	return nil
  1969  }
  1970  
  1971  // updateAlloc is invoked when we should update an allocation
  1972  func (c *Client) updateAlloc(exist, update *structs.Allocation) error {
  1973  	c.allocLock.RLock()
  1974  	ar, ok := c.allocs[exist.ID]
  1975  	c.allocLock.RUnlock()
  1976  	if !ok {
  1977  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID)
  1978  		return nil
  1979  	}
  1980  
  1981  	ar.Update(update)
  1982  	return nil
  1983  }
  1984  
  1985  // addAlloc is invoked when we should add an allocation
  1986  func (c *Client) addAlloc(alloc *structs.Allocation, prevAllocDir *allocdir.AllocDir) error {
  1987  	// Check if we already have an alloc runner
  1988  	c.allocLock.Lock()
  1989  	if _, ok := c.allocs[alloc.ID]; ok {
		c.logger.Printf("[DEBUG] client: dropping duplicate add allocation request: %q", alloc.ID)
  1991  		c.allocLock.Unlock()
  1992  		return nil
  1993  	}
  1994  
  1995  	c.configLock.RLock()
  1996  	ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService)
  1997  	ar.SetPreviousAllocDir(prevAllocDir)
  1998  	c.configLock.RUnlock()
  1999  
  2000  	// Store the alloc runner.
  2001  	c.allocs[alloc.ID] = ar
  2002  
  2003  	if err := ar.SaveState(); err != nil {
  2004  		c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", alloc.ID, err)
  2005  	}
  2006  
  2007  	// Must release allocLock as GC acquires it to count allocs
  2008  	c.allocLock.Unlock()
  2009  
  2010  	// Make room for the allocation before running it
  2011  	if err := c.garbageCollector.MakeRoomFor([]*structs.Allocation{alloc}); err != nil {
  2012  		c.logger.Printf("[ERR] client: error making room for allocation: %v", err)
  2013  	}
  2014  
  2015  	go ar.Run()
  2016  	return nil
  2017  }
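
// Note the lock ordering above: allocLock must be released before calling
// into the garbage collector, because the collector itself takes allocLock
// to count allocations. Holding it across the call would self-deadlock.
// Minimal illustration of the hazard (illustrative only):
//
//	mu.Lock()
//	gc()        // gc() also calls mu.Lock() -> deadlock
//	mu.Unlock() // never reached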
  2018  
  2019  // setupVaultClient creates an object to periodically renew tokens and secrets
  2020  // with vault.
  2021  func (c *Client) setupVaultClient() error {
	vc, err := vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
	if err != nil {
		return err
	}
	c.vaultClient = vc
  2027  
  2028  	if c.vaultClient == nil {
  2029  		c.logger.Printf("[ERR] client: failed to create vault client")
  2030  		return fmt.Errorf("failed to create vault client")
  2031  	}
  2032  
  2033  	// Start renewing tokens and secrets
  2034  	c.vaultClient.Start()
  2035  
  2036  	return nil
  2037  }
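
// The vault client is handed c.deriveToken as its token-deriving callback;
// construction and startup mirror the calls above (usage sketch):
//
//	vc, err := vaultclient.NewVaultClient(cfg.VaultConfig, logger, deriveFn)
//	if err != nil {
//		return err
//	}
//	vc.Start() // begins the token and secret renewal loops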
  2038  
  2039  // deriveToken takes in an allocation and a set of tasks and derives vault
  2040  // tokens for each of the tasks, unwraps all of them using the supplied vault
  2041  // client and returns a map of unwrapped tokens, indexed by the task name.
  2042  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  2043  	if alloc == nil {
  2044  		return nil, fmt.Errorf("nil allocation")
  2045  	}
  2046  
	if len(taskNames) == 0 {
  2048  		return nil, fmt.Errorf("missing task names")
  2049  	}
  2050  
  2051  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  2052  	if group == nil {
  2053  		return nil, fmt.Errorf("group name in allocation is not present in job")
  2054  	}
  2055  
	verifiedTasks := []string{}
	// Check that the given task names actually exist in the allocation
	for _, taskName := range taskNames {
		found := false
		for _, task := range group.Tasks {
			if task.Name == taskName {
				found = true
				break
			}
		}
		if !found {
			c.logger.Printf("[ERR] task %q not found in the allocation", taskName)
			return nil, fmt.Errorf("task %q not found in the allocation", taskName)
		}
		verifiedTasks = append(verifiedTasks, taskName)
	}
  2072  
	// The Node.DeriveVaultToken endpoint on the Nomad servers accepts a set
	// of tasks and creates tokens for all of them in a single call.
  2075  	req := &structs.DeriveVaultTokenRequest{
  2076  		NodeID:   c.Node().ID,
  2077  		SecretID: c.Node().SecretID,
  2078  		AllocID:  alloc.ID,
  2079  		Tasks:    verifiedTasks,
  2080  		QueryOptions: structs.QueryOptions{
  2081  			Region:     c.Region(),
  2082  			AllowStale: false,
  2083  		},
  2084  	}
  2085  
  2086  	// Derive the tokens
  2087  	var resp structs.DeriveVaultTokenResponse
	if err := c.RPC("Node.DeriveVaultToken", req, &resp); err != nil {
  2089  		c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err)
  2090  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  2091  	}
  2092  	if resp.Error != nil {
  2093  		c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error)
  2094  		return nil, resp.Error
  2095  	}
  2096  	if resp.Tasks == nil {
  2097  		c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response")
  2098  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  2099  	}
  2100  
  2101  	unwrappedTokens := make(map[string]string)
  2102  
	// Retrieve the wrapped tokens from the response and unwrap them
  2104  	for _, taskName := range verifiedTasks {
  2105  		// Get the wrapped token
  2106  		wrappedToken, ok := resp.Tasks[taskName]
  2107  		if !ok {
  2108  			c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName)
  2109  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  2110  		}
  2111  
  2112  		// Unwrap the vault token
  2113  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  2114  		if err != nil {
  2115  			return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err)
  2116  		}
  2117  		if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" {
  2118  			return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName)
  2119  		}
  2120  
  2121  		// Append the unwrapped token to the return value
  2122  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  2123  	}
  2124  
  2125  	return unwrappedTokens, nil
  2126  }
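
// The tokens returned by the server are Vault response-wrapping tokens, not
// the final secrets; Logical().Unwrap exchanges each one for the real token.
// Minimal standalone sketch using the vault api package (wrappingToken is
// assumed to come from a DeriveVaultToken response):
//
//	client, err := vaultapi.NewClient(vaultapi.DefaultConfig())
//	if err != nil {
//		return err
//	}
//	secret, err := client.Logical().Unwrap(wrappingToken)
//	if err != nil || secret == nil || secret.Auth == nil {
//		return fmt.Errorf("unwrap failed: %v", err)
//	}
//	token := secret.Auth.ClientToken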
  2127  
// triggerDiscovery causes a Consul discovery to begin (if one hasn't already)
  2129  func (c *Client) triggerDiscovery() {
  2130  	select {
  2131  	case c.triggerDiscoveryCh <- struct{}{}:
  2132  		// Discovery goroutine was released to execute
  2133  	default:
  2134  		// Discovery goroutine was already running
  2135  	}
  2136  }
  2137  
  2138  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  2139  // It's intended to be started in a goroutine. See triggerDiscovery() for
  2140  // causing consul discovery from other code locations.
  2141  func (c *Client) consulDiscovery() {
  2142  	for {
  2143  		select {
  2144  		case <-c.triggerDiscoveryCh:
  2145  			if err := c.consulDiscoveryImpl(); err != nil {
  2146  				c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err)
  2147  			}
  2148  		case <-c.shutdownCh:
  2149  			return
  2150  		}
  2151  	}
  2152  }
  2153  
  2154  func (c *Client) consulDiscoveryImpl() error {
	// Acquire the heartbeat lock to prevent the heartbeat from running
	// concurrently with discovery. Concurrent execution is safe; however,
	// discovery is usually triggered when heartbeating has failed, so
	// there's no point in allowing it.
  2159  	c.heartbeatLock.Lock()
  2160  	defer c.heartbeatLock.Unlock()
  2161  
  2162  	dcs, err := c.consulCatalog.Datacenters()
  2163  	if err != nil {
  2164  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  2165  	}
	if len(dcs) > 2 {
		// Query the local DC first, then shuffle the remaining DCs.
		// Future heartbeats will anchor clients to their local
		// datacenter, so it's okay to talk with remote DCs during
		// bootstrap. If no Nomad servers are available within
		// datacenterQueryLimit datacenters, the next heartbeat will
		// pick a new set of servers, so that's okay too.
		shuffleStrings(dcs[1:])
		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
	}
  2177  
  2178  	// Query for servers in this client's region only
  2179  	region := c.Region()
  2180  	rpcargs := structs.GenericRequest{
  2181  		QueryOptions: structs.QueryOptions{
  2182  			Region: region,
  2183  		},
  2184  	}
  2185  
  2186  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  2187  	var mErr multierror.Error
  2188  	var servers endpoints
  2189  	c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs)
  2190  DISCOLOOP:
  2191  	for _, dc := range dcs {
  2192  		consulOpts := &consulapi.QueryOptions{
  2193  			AllowStale: true,
  2194  			Datacenter: dc,
  2195  			Near:       "_agent",
  2196  			WaitTime:   consul.DefaultQueryWaitDuration,
  2197  		}
  2198  		consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  2199  		if err != nil {
  2200  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  2201  			continue
  2202  		}
  2203  
  2204  		for _, s := range consulServices {
  2205  			port := strconv.Itoa(s.ServicePort)
  2206  			addrstr := s.ServiceAddress
  2207  			if addrstr == "" {
  2208  				addrstr = s.Address
  2209  			}
  2210  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  2211  			if err != nil {
  2212  				mErr.Errors = append(mErr.Errors, err)
  2213  				continue
  2214  			}
  2215  			var peers []string
  2216  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  2217  				mErr.Errors = append(mErr.Errors, err)
  2218  				continue
  2219  			}
  2220  
  2221  			// Successfully received the Server peers list of the correct
  2222  			// region
			for _, p := range peers {
				addr, err := net.ResolveTCPAddr("tcp", p)
				if err != nil {
					mErr.Errors = append(mErr.Errors, err)
					continue
				}
				servers = append(servers, &endpoint{name: p, addr: addr})
			}
  2230  			if len(servers) > 0 {
  2231  				break DISCOLOOP
  2232  			}
  2233  		}
  2234  	}
  2235  	if len(servers) == 0 {
  2236  		if len(mErr.Errors) > 0 {
  2237  			return mErr.ErrorOrNil()
  2238  		}
  2239  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  2240  	}
  2241  
  2242  	c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", servers)
  2243  	c.servers.set(servers)
  2244  
	// Notify waiting RPC calls. If a goroutine that just failed an RPC call
	// isn't receiving on this channel yet, it will still retry eventually;
	// this is a short-circuit for the longer retry intervals.
  2248  	for {
  2249  		select {
  2250  		case c.serversDiscoveredCh <- struct{}{}:
  2251  		default:
  2252  			return nil
  2253  		}
  2254  	}
  2255  }
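
// Discovery relies on the Consul catalog API: servers register a service
// (by default "nomad") with an "rpc" tag, and clients look it up per
// datacenter. A minimal sketch with the consul api package (service and
// tag names are the assumed defaults):
//
//	consulClient, err := consulapi.NewClient(consulapi.DefaultConfig())
//	if err != nil {
//		return err
//	}
//	svcs, _, err := consulClient.Catalog().Service("nomad", "rpc",
//		&consulapi.QueryOptions{AllowStale: true, Near: "_agent"})
//	if err != nil {
//		return err
//	}
//	for _, s := range svcs {
//		// s.ServiceAddress (or s.Address) and s.ServicePort locate a server
//	}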
  2256  
  2257  // emitStats collects host resource usage stats periodically
  2258  func (c *Client) emitStats() {
  2259  	// Start collecting host stats right away and then keep collecting every
  2260  	// collection interval
  2261  	next := time.NewTimer(0)
  2262  	defer next.Stop()
  2263  	for {
  2264  		select {
  2265  		case <-next.C:
  2266  			err := c.hostStatsCollector.Collect()
  2267  			next.Reset(c.config.StatsCollectionInterval)
  2268  			if err != nil {
  2269  				c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err)
  2270  				continue
  2271  			}
  2272  
  2273  			// Publish Node metrics if operator has opted in
  2274  			if c.config.PublishNodeMetrics {
  2275  				c.emitHostStats(c.hostStatsCollector.Stats())
  2276  			}
  2277  
  2278  			c.emitClientMetrics()
  2279  		case <-c.shutdownCh:
  2280  			return
  2281  		}
  2282  	}
  2283  }
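
// Using a timer initialized to zero fires the first collection immediately;
// resetting it after each pass gives a fixed delay between collections
// rather than a fixed period. The pattern in miniature (illustrative):
//
//	t := time.NewTimer(0)
//	defer t.Stop()
//	for {
//		select {
//		case <-t.C:
//			collect()
//			t.Reset(interval)
//		case <-stopCh:
//			return
//		}
//	}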
  2284  
  2285  // emitHostStats pushes host resource usage stats to remote metrics collection sinks
  2286  func (c *Client) emitHostStats(hStats *stats.HostStats) {
  2287  	nodeID := c.Node().ID
  2288  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  2289  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  2290  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  2291  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  2292  
  2293  	metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime))
  2294  
  2295  	for _, cpu := range hStats.CPU {
  2296  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  2297  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  2298  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2299  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2300  	}
  2301  
  2302  	for _, disk := range hStats.DiskStats {
  2303  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2304  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2305  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2306  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2307  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2308  	}
  2309  
  2310  	// Get all the resources for the node
  2311  	c.configLock.RLock()
  2312  	node := c.configCopy.Node
  2313  	c.configLock.RUnlock()
  2314  	total := node.Resources
  2315  	res := node.Reserved
  2316  	allocated := c.getAllocatedResources(node)
  2317  
  2318  	// Emit allocated
  2319  	metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.MemoryMB))
  2320  	metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.DiskMB))
  2321  	metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.CPU))
  2322  	metrics.SetGauge([]string{"client", "allocated", "iops", nodeID}, float32(allocated.IOPS))
  2323  
  2324  	for _, n := range allocated.Networks {
  2325  		metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits))
  2326  	}
  2327  
  2328  	// Emit unallocated
  2329  	unallocatedMem := total.MemoryMB - res.MemoryMB - allocated.MemoryMB
  2330  	unallocatedDisk := total.DiskMB - res.DiskMB - allocated.DiskMB
  2331  	unallocatedCpu := total.CPU - res.CPU - allocated.CPU
  2332  	unallocatedIops := total.IOPS - res.IOPS - allocated.IOPS
  2333  	metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem))
  2334  	metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk))
  2335  	metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu))
  2336  	metrics.SetGauge([]string{"client", "unallocated", "iops", nodeID}, float32(unallocatedIops))
  2337  
	for _, n := range allocated.Networks {
		// Skip devices whose total capacity is unknown
		totalIdx := total.NetIndex(n)
		if totalIdx == -1 {
			continue
		}

		totalMbits := total.Networks[totalIdx].MBits
		unallocatedMbits := totalMbits - n.MBits
		metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits))
	}
  2350  }
  2351  
  2352  // emitClientMetrics emits lower volume client metrics
  2353  func (c *Client) emitClientMetrics() {
  2354  	nodeID := c.Node().ID
  2355  
  2356  	// Emit allocation metrics
  2357  	c.blockedAllocsLock.RLock()
  2358  	blocked := len(c.blockedAllocations)
  2359  	c.blockedAllocsLock.RUnlock()
  2360  
  2361  	c.migratingAllocsLock.RLock()
  2362  	migrating := len(c.migratingAllocs)
  2363  	c.migratingAllocsLock.RUnlock()
  2364  
  2365  	pending, running, terminal := 0, 0, 0
  2366  	for _, ar := range c.getAllocRunners() {
  2367  		switch ar.Alloc().ClientStatus {
  2368  		case structs.AllocClientStatusPending:
  2369  			pending++
  2370  		case structs.AllocClientStatusRunning:
  2371  			running++
  2372  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  2373  			terminal++
  2374  		}
  2375  	}
  2376  
  2377  	metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating))
  2378  	metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked))
  2379  	metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending))
  2380  	metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running))
  2381  	metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal))
  2382  }
  2383  
  2384  func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resources {
	// Unfortunately the allocs only carry an IP address, so we need to map
	// each one back to its network device
  2387  	cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks))
  2388  	for _, n := range selfNode.Resources.Networks {
  2389  		_, ipnet, err := net.ParseCIDR(n.CIDR)
  2390  		if err != nil {
  2391  			continue
  2392  		}
  2393  		cidrToDevice[ipnet] = n.Device
  2394  	}
  2395  
  2396  	// Sum the allocated resources
  2397  	allocs := c.allAllocs()
  2398  	var allocated structs.Resources
  2399  	allocatedDeviceMbits := make(map[string]int)
  2400  	for _, alloc := range allocs {
  2401  		if !alloc.TerminalStatus() {
  2402  			allocated.Add(alloc.Resources)
  2403  			for _, allocatedNetwork := range alloc.Resources.Networks {
  2404  				for cidr, dev := range cidrToDevice {
  2405  					ip := net.ParseIP(allocatedNetwork.IP)
  2406  					if cidr.Contains(ip) {
  2407  						allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  2408  						break
  2409  					}
  2410  				}
  2411  			}
  2412  		}
  2413  	}
  2414  
  2415  	// Clear the networks
  2416  	allocated.Networks = nil
  2417  	for dev, speed := range allocatedDeviceMbits {
  2418  		net := &structs.NetworkResource{
  2419  			Device: dev,
  2420  			MBits:  speed,
  2421  		}
  2422  		allocated.Networks = append(allocated.Networks, net)
  2423  	}
  2424  
  2425  	return &allocated
  2426  }
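
// The CIDR-to-device matching above uses the standard library: parse each
// device's CIDR once, then test each allocation IP for membership.
// Self-contained sketch (addresses are made up):
//
//	_, ipnet, err := net.ParseCIDR("10.0.0.0/24")
//	if err == nil && ipnet.Contains(net.ParseIP("10.0.0.17")) {
//		// this alloc's bandwidth counts against this device
//	}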
  2427  
  2428  // allAllocs returns all the allocations managed by the client
  2429  func (c *Client) allAllocs() map[string]*structs.Allocation {
  2430  	allocs := make(map[string]*structs.Allocation, 16)
  2431  	for _, ar := range c.getAllocRunners() {
  2432  		a := ar.Alloc()
  2433  		allocs[a.ID] = a
  2434  	}
  2435  	c.blockedAllocsLock.RLock()
  2436  	for _, alloc := range c.blockedAllocations {
  2437  		allocs[alloc.ID] = alloc
  2438  	}
  2439  	c.blockedAllocsLock.RUnlock()
  2440  
  2441  	c.migratingAllocsLock.RLock()
  2442  	for _, ctrl := range c.migratingAllocs {
  2443  		allocs[ctrl.alloc.ID] = ctrl.alloc
  2444  	}
  2445  	c.migratingAllocsLock.RUnlock()
  2446  	return allocs
  2447  }
  2448  
// resolveServer takes a server's address as a string and returns its resolved
// net.Addr or an error.
  2451  func resolveServer(s string) (net.Addr, error) {
  2452  	const defaultClientPort = "4647" // default client RPC port
  2453  	host, port, err := net.SplitHostPort(s)
  2454  	if err != nil {
  2455  		if strings.Contains(err.Error(), "missing port") {
  2456  			host = s
  2457  			port = defaultClientPort
  2458  		} else {
  2459  			return nil, err
  2460  		}
  2461  	}
  2462  	return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
  2463  }
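
// Usage sketch for resolveServer (addresses are made up): a bare host gets
// the default RPC port appended, while an explicit port is preserved.
//
//	addr, err := resolveServer("10.0.0.5")     // -> 10.0.0.5:4647
//	addr, err = resolveServer("10.0.0.5:4747") // -> 10.0.0.5:4747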