github.com/ncodes/nomad@v0.5.7-0.20170403112158-97adf4a74fb3/client/client.go

package client

import (
	"archive/tar"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/go-multierror"
	vaultapi "github.com/hashicorp/vault/api"
	"github.com/mitchellh/hashstructure"
	nomadapi "github.com/ncodes/nomad/api"
	"github.com/ncodes/nomad/client/allocdir"
	"github.com/ncodes/nomad/client/config"
	"github.com/ncodes/nomad/client/driver"
	"github.com/ncodes/nomad/client/fingerprint"
	"github.com/ncodes/nomad/client/stats"
	"github.com/ncodes/nomad/client/vaultclient"
	"github.com/ncodes/nomad/command/agent/consul"
	"github.com/ncodes/nomad/helper"
	"github.com/ncodes/nomad/helper/tlsutil"
	"github.com/ncodes/nomad/nomad"
	"github.com/ncodes/nomad/nomad/structs"
	"github.com/shirou/gopsutil/host"
)

const (
	// clientRPCCache controls how long we keep an idle connection
	// open to a server
	clientRPCCache = 5 * time.Minute

	// clientMaxStreams controls how many idle streams we keep
	// open to a server
	clientMaxStreams = 2

	// datacenterQueryLimit searches through up to this many adjacent
	// datacenters looking for the Nomad server service.
	datacenterQueryLimit = 9

	// consulReaperIntv is the interval at which the Consul reaper will
	// run.
	consulReaperIntv = 5 * time.Second

	// registerRetryIntv is the minimum interval on which we retry
	// registration. We pick a value between this and 2x this.
	registerRetryIntv = 15 * time.Second

	// getAllocRetryIntv is the minimum interval on which we retry
	// fetching allocations. We pick a value between this and 2x this.
	getAllocRetryIntv = 30 * time.Second

	// devModeRetryIntv is the retry interval used for development
	devModeRetryIntv = time.Second

	// stateSnapshotIntv is how often the client snapshots state
	stateSnapshotIntv = 60 * time.Second

	// initialHeartbeatStagger is used to stagger the interval between
	// starting and the initial heartbeat. After the initial heartbeat,
	// we switch to using the TTL specified by the servers.
	initialHeartbeatStagger = 10 * time.Second

	// nodeUpdateRetryIntv is how often the client checks for updates to the
	// node attributes or meta map.
	nodeUpdateRetryIntv = 5 * time.Second

	// allocSyncIntv is the batching period of allocation updates before they
	// are synced with the server.
	allocSyncIntv = 200 * time.Millisecond

	// allocSyncRetryIntv is the interval on which we retry updating
	// the status of the allocation
	allocSyncRetryIntv = 5 * time.Second
)

// ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
// Client
type ClientStatsReporter interface {
	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
	// If it does not exist an error is reported.
	GetAllocStats(allocID string) (AllocStatsReporter, error)

	// LatestHostStats returns the latest resource usage stats for the host
	LatestHostStats() *stats.HostStats
}
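
// exampleStatsReporter is an illustrative sketch (not part of the original
// API surface) of how a caller might consume ClientStatsReporter; the alloc
// ID below is hypothetical.
func exampleStatsReporter(c *Client) {
	reporter := c.StatsReporter()

	// Host-level stats are sampled continuously; nil means no sample yet.
	if hs := reporter.LatestHostStats(); hs != nil {
		_ = hs // most recent host resource usage sample
	}

	// Unknown alloc IDs yield an error rather than a nil reporter.
	if _, err := reporter.GetAllocStats("hypothetical-alloc-id"); err != nil {
		_ = err
	}
}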

// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
type Client struct {
	config *config.Config
	start  time.Time

	// configCopy is a copy that should be passed to alloc-runners.
	configCopy *config.Config
	configLock sync.RWMutex

	logger *log.Logger

	connPool *nomad.ConnPool

	// servers is the (optionally prioritized) list of nomad servers
	servers *serverlist

	// heartbeat related times for tracking how often to heartbeat
	lastHeartbeat time.Time
	heartbeatTTL  time.Duration
	heartbeatLock sync.Mutex

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// serversDiscoveredCh is ticked whenever Consul discovery completes
	// successfully
	serversDiscoveredCh chan struct{}

	// allocs is the current set of allocations
	allocs    map[string]*AllocRunner
	allocLock sync.RWMutex

	// blockedAllocations are allocations which are blocked because their
	// chained allocations haven't finished running
	blockedAllocations map[string]*structs.Allocation
	blockedAllocsLock  sync.RWMutex

	// migratingAllocs is the set of allocs whose data migration is in flight
	migratingAllocs     map[string]*migrateAllocCtrl
	migratingAllocsLock sync.Mutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulSyncer advertises this Nomad Agent with Consul
	consulSyncer *consul.Syncer

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector

	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// garbageCollector is used to garbage collect terminal allocations present
	// on the node automatically
	garbageCollector *AllocGarbageCollector
}

// migrateAllocCtrl signals, by closing its channel, that an in-flight
// migration should stop.
type migrateAllocCtrl struct {
	alloc  *structs.Allocation
	ch     chan struct{}
	closed bool
	chLock sync.Mutex
}

func newMigrateAllocCtrl(alloc *structs.Allocation) *migrateAllocCtrl {
	return &migrateAllocCtrl{
		ch:    make(chan struct{}),
		alloc: alloc,
	}
}

// closeCh closes the channel exactly once; closing an already-closed channel
// would panic, so the closed flag makes this safe to call repeatedly.
func (m *migrateAllocCtrl) closeCh() {
	m.chLock.Lock()
	defer m.chLock.Unlock()

	if m.closed {
		return
	}

	m.closed = true
	close(m.ch)
}

var (
	// noServersErr is returned by the RPC method when the client has no
	// configured servers. This is used to trigger Consul discovery if
	// enabled.
	noServersErr = errors.New("no servers")
)

// NewClient is used to create a new client from the given configuration
func NewClient(cfg *config.Config, consulSyncer *consul.Syncer, logger *log.Logger) (*Client, error) {
	// Create the tls wrapper
	var tlsWrap tlsutil.RegionWrapper
	if cfg.TLSConfig.EnableRPC {
		tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper()
		if err != nil {
			return nil, err
		}
		tlsWrap = tw
	}

	// Create the client
	c := &Client{
		config:              cfg,
		consulSyncer:        consulSyncer,
		start:               time.Now(),
		connPool:            nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap),
		logger:              logger,
		allocs:              make(map[string]*AllocRunner),
		blockedAllocations:  make(map[string]*structs.Allocation),
		allocUpdates:        make(chan *structs.Allocation, 64),
		shutdownCh:          make(chan struct{}),
		migratingAllocs:     make(map[string]*migrateAllocCtrl),
		servers:             newServerList(),
		triggerDiscoveryCh:  make(chan struct{}),
		serversDiscoveredCh: make(chan struct{}),
	}

	// Initialize the client
	if err := c.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize client: %v", err)
	}

	// Add the stats collector
	statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir)
	c.hostStatsCollector = statsCollector

	// Add the garbage collector
	gcConfig := &GCConfig{
		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
		Interval:            cfg.GCInterval,
		ParallelDestroys:    cfg.GCParallelDestroys,
		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
	}
	c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, gcConfig)

	// Setup the node
	if err := c.setupNode(); err != nil {
		return nil, fmt.Errorf("node setup failed: %v", err)
	}

	// Fingerprint the node
	if err := c.fingerprint(); err != nil {
		return nil, fmt.Errorf("fingerprinting failed: %v", err)
	}

	// Scan for drivers
	if err := c.setupDrivers(); err != nil {
		return nil, fmt.Errorf("driver setup failed: %v", err)
	}

	// Setup the reserved resources
	c.reservePorts()

	// Store the config copy before restoring state but after it has been
	// initialized.
	c.configLock.Lock()
	c.configCopy = c.config.Copy()
	c.configLock.Unlock()

	// Set the preconfigured list of static servers
	c.configLock.RLock()
	if len(c.configCopy.Servers) > 0 {
		if err := c.SetServers(c.configCopy.Servers); err != nil {
			logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
		}
	}
	c.configLock.RUnlock()

	// Setup Consul discovery if enabled
	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
		go c.consulDiscovery()
		if len(c.servers.all()) == 0 {
			// No configured servers; trigger discovery manually
			c.triggerDiscoveryCh <- struct{}{}
		}
	}

	// Start Consul reaper
	go c.consulReaper()

	// Setup the vault client for token and secret renewals
	if err := c.setupVaultClient(); err != nil {
		return nil, fmt.Errorf("failed to setup vault client: %v", err)
	}

	// Restore the state
	if err := c.restoreState(); err != nil {
		return nil, fmt.Errorf("failed to restore state: %v", err)
	}

	// Register and then start heartbeating to the servers.
	go c.registerAndHeartbeat()

	// Begin periodic snapshotting of state.
	go c.periodicSnapshot()

	// Begin syncing allocations to the server
	go c.allocSync()

	// Start the client!
	go c.run()

	// Start collecting stats
	go c.emitStats()

	c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID)
	return c, nil
}
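
// exampleNewClient is a hedged usage sketch for NewClient; the server address
// is hypothetical and config.DefaultConfig is assumed to provide sane client
// defaults.
func exampleNewClient(consulSyncer *consul.Syncer, logger *log.Logger) (*Client, error) {
	cfg := config.DefaultConfig()
	cfg.Servers = []string{"127.0.0.1:4647"} // hypothetical server address

	c, err := NewClient(cfg, consulSyncer, logger)
	if err != nil {
		return nil, err
	}

	// Callers own the client's lifecycle and should eventually call Shutdown.
	return c, nil
}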

// init is used to initialize the client and perform any setup
// needed before we begin starting its various components.
func (c *Client) init() error {
	// Ensure the state dir exists if we have one
	if c.config.StateDir != "" {
		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
			return fmt.Errorf("failed creating state dir: %s", err)
		}
	} else {
		// Otherwise make a temp directory to use.
		p, err := ioutil.TempDir("", "NomadClient")
		if err != nil {
			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
		}

		p, err = filepath.EvalSymlinks(p)
		if err != nil {
			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
		}

		c.config.StateDir = p
	}
	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)

	// Ensure the alloc dir exists if we have one
	if c.config.AllocDir != "" {
		if err := os.MkdirAll(c.config.AllocDir, 0755); err != nil {
			return fmt.Errorf("failed creating alloc dir: %s", err)
		}
	} else {
		// Otherwise make a temp directory to use.
		p, err := ioutil.TempDir("", "NomadClient")
		if err != nil {
			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
		}

		p, err = filepath.EvalSymlinks(p)
		if err != nil {
			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
		}

		// Change the permissions to have the execute bit
		if err := os.Chmod(p, 0755); err != nil {
			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
		}

		c.config.AllocDir = p
	}

	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
	return nil
}

// Leave is used to prepare the client to leave the cluster
func (c *Client) Leave() error {
	// TODO
	return nil
}

// Datacenter returns the datacenter for the given client
func (c *Client) Datacenter() string {
	c.configLock.RLock()
	dc := c.configCopy.Node.Datacenter
	c.configLock.RUnlock()
	return dc
}

// Region returns the region for the given client
func (c *Client) Region() string {
	return c.config.Region
}

// RPCMajorVersion returns the structs.ApiMajorVersion supported by the
// client.
func (c *Client) RPCMajorVersion() int {
	return structs.ApiMajorVersion
}

// RPCMinorVersion returns the structs.ApiMinorVersion supported by the
// client.
func (c *Client) RPCMinorVersion() int {
	return structs.ApiMinorVersion
}

// Shutdown is used to tear down the client
func (c *Client) Shutdown() error {
	c.logger.Printf("[INFO] client: shutting down")
	c.shutdownLock.Lock()
	defer c.shutdownLock.Unlock()

	if c.shutdown {
		return nil
	}

	// Stop renewing tokens and secrets
	if c.vaultClient != nil {
		c.vaultClient.Stop()
	}

	// Stop the garbage collector
	c.garbageCollector.Stop()

	// Destroy all the running allocations.
	if c.config.DevMode {
		for _, ar := range c.getAllocRunners() {
			ar.Destroy()
			<-ar.WaitCh()
		}
	}

	c.shutdown = true
	close(c.shutdownCh)
	c.connPool.Shutdown()
	return c.saveState()
}

// RPC is used to forward an RPC call to a nomad server, or fail if no servers.
func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
	// Invoke the RPCHandler if it exists
	if c.config.RPCHandler != nil {
		return c.config.RPCHandler.RPC(method, args, reply)
	}

	servers := c.servers.all()
	if len(servers) == 0 {
		return noServersErr
	}

	var mErr multierror.Error
	for _, s := range servers {
		// Make the RPC request
		if err := c.connPool.RPC(c.Region(), s.addr, c.RPCMajorVersion(), method, args, reply); err != nil {
			errmsg := fmt.Errorf("RPC failed to server %s: %v", s.addr, err)
			mErr.Errors = append(mErr.Errors, errmsg)
			c.logger.Printf("[DEBUG] client: %v", errmsg)
			c.servers.failed(s)
			continue
		}
		c.servers.good(s)
		return nil
	}

	return mErr.ErrorOrNil()
}
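
// exampleRPC is an illustrative sketch of calling a server endpoint through
// Client.RPC; "Node.GetNode" and the request/response types mirror endpoints
// used elsewhere in this file and are assumptions, not prescriptions.
func exampleRPC(c *Client) error {
	req := structs.NodeSpecificRequest{
		NodeID:       c.Node().ID,
		QueryOptions: structs.QueryOptions{Region: c.Region()},
	}
	var resp structs.SingleNodeResponse
	// Every known server is tried before an error is returned.
	return c.RPC("Node.GetNode", &req, &resp)
}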

// Stats is used to return statistics for debugging and insight
// for various sub-systems
func (c *Client) Stats() map[string]map[string]string {
	c.allocLock.RLock()
	numAllocs := len(c.allocs)
	c.allocLock.RUnlock()

	c.heartbeatLock.Lock()
	defer c.heartbeatLock.Unlock()
	stats := map[string]map[string]string{
		"client": map[string]string{
			"node_id":         c.Node().ID,
			"known_servers":   c.servers.all().String(),
			"num_allocations": strconv.Itoa(numAllocs),
			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
		},
		"runtime": nomad.RuntimeStats(),
	}
	return stats
}
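
// A sketch of the shape Stats returns (keys from the map above; the values
// shown are illustrative):
//
//	s := c.Stats()
//	s["client"]["num_allocations"] // e.g. "3"
//	s["runtime"]                   // Go runtime counters from nomad.RuntimeStats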

// CollectAllocation garbage collects a single allocation
func (c *Client) CollectAllocation(allocID string) error {
	return c.garbageCollector.Collect(allocID)
}

// CollectAllAllocs garbage collects all allocations on a node in the terminal
// state
func (c *Client) CollectAllAllocs() error {
	return c.garbageCollector.CollectAll()
}

// Node returns the locally registered node
func (c *Client) Node() *structs.Node {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	return c.config.Node
}

// StatsReporter exposes the various APIs related to the resource usage of a
// Nomad client
func (c *Client) StatsReporter() ClientStatsReporter {
	return c
}

// GetAllocStats returns the AllocStatsReporter for the given allocation.
func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()
	ar, ok := c.allocs[allocID]
	if !ok {
		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
	}
	return ar.StatsReporter(), nil
}

// LatestHostStats returns the latest resource usage stats for the host
func (c *Client) LatestHostStats() *stats.HostStats {
	return c.hostStatsCollector.Stats()
}

// GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()

	ar, ok := c.allocs[allocID]
	if !ok {
		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
	}
	return ar.GetAllocDir(), nil
}

// GetClientAlloc returns the allocation from the client
func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
	all := c.allAllocs()
	alloc, ok := all[allocID]
	if !ok {
		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
	}
	return alloc, nil
}

// GetServers returns the list of nomad servers this client is aware of.
func (c *Client) GetServers() []string {
	endpoints := c.servers.all()
	res := make([]string, len(endpoints))
	for i := range endpoints {
		res[i] = endpoints[i].addr.String()
	}
	return res
}

// SetServers sets a new list of nomad servers to connect to. As long as one
// server is resolvable no error is returned.
func (c *Client) SetServers(servers []string) error {
	endpoints := make([]*endpoint, 0, len(servers))
	var merr multierror.Error
	for _, s := range servers {
		addr, err := resolveServer(s)
		if err != nil {
			c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", s, err)
			merr.Errors = append(merr.Errors, err)
			continue
		}

		// Valid endpoint, append it without a priority as this API
		// doesn't support different priorities for different servers
		endpoints = append(endpoints, &endpoint{name: s, addr: addr})
	}

	// Only return errors if no servers are valid
	if len(endpoints) == 0 {
		if len(merr.Errors) > 0 {
			return merr.ErrorOrNil()
		}
		return noServersErr
	}

	c.servers.set(endpoints)
	return nil
}
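
// exampleSetServers sketches how the server-list APIs compose: SetServers
// accepts host:port strings and succeeds as long as at least one resolves,
// while GetServers reflects whatever the client currently knows. The
// addresses are hypothetical.
func exampleSetServers(c *Client) error {
	if err := c.SetServers([]string{"10.0.0.1:4647", "10.0.0.2:4647"}); err != nil {
		return err // every address failed to resolve
	}
	for _, s := range c.GetServers() {
		_ = s // e.g. "10.0.0.1:4647"
	}
	return nil
}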

// restoreState is used to restore our state from the data dir
func (c *Client) restoreState() error {
	if c.config.DevMode {
		return nil
	}

	// Scan the directory
	list, err := ioutil.ReadDir(filepath.Join(c.config.StateDir, "alloc"))
	if err != nil && os.IsNotExist(err) {
		return nil
	} else if err != nil {
		return fmt.Errorf("failed to list alloc state: %v", err)
	}

	// Load each alloc back
	var mErr multierror.Error
	for _, entry := range list {
		id := entry.Name()
		alloc := &structs.Allocation{ID: id}
		c.configLock.RLock()
		ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient)
		c.configLock.RUnlock()
		c.allocLock.Lock()
		c.allocs[id] = ar
		c.allocLock.Unlock()
		if err := ar.RestoreState(); err != nil {
			c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err)
			mErr.Errors = append(mErr.Errors, err)
		} else {
			go ar.Run()
		}
	}
	return mErr.ErrorOrNil()
}

// saveState is used to snapshot our state into the data dir
func (c *Client) saveState() error {
	if c.config.DevMode {
		return nil
	}

	var mErr multierror.Error
	for id, ar := range c.getAllocRunners() {
		if err := ar.SaveState(); err != nil {
			c.logger.Printf("[ERR] client: failed to save state for alloc %s: %v",
				id, err)
			mErr.Errors = append(mErr.Errors, err)
		}
	}
	return mErr.ErrorOrNil()
}

// getAllocRunners returns a snapshot of the current set of alloc runners.
func (c *Client) getAllocRunners() map[string]*AllocRunner {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()
	runners := make(map[string]*AllocRunner, len(c.allocs))
	for id, ar := range c.allocs {
		runners[id] = ar
	}
	return runners
}

// nodeID restores, or generates if necessary, a unique node ID and SecretID.
// The node ID is, if available, a persistent unique ID. The secret ID is a
// high-entropy random UUID.
func (c *Client) nodeID() (id, secret string, err error) {
	var hostID string
	hostInfo, err := host.Info()
	if !c.config.NoHostUUID && err == nil && helper.IsUUID(hostInfo.HostID) {
		hostID = hostInfo.HostID
	} else {
		// Generate a random hostID if no constant ID is available on
		// this platform.
		hostID = structs.GenerateUUID()
	}

	// Do not persist in dev mode
	if c.config.DevMode {
		return hostID, structs.GenerateUUID(), nil
	}

	// Attempt to read existing ID
	idPath := filepath.Join(c.config.StateDir, "client-id")
	idBuf, err := ioutil.ReadFile(idPath)
	if err != nil && !os.IsNotExist(err) {
		return "", "", err
	}

	// Attempt to read existing secret ID
	secretPath := filepath.Join(c.config.StateDir, "secret-id")
	secretBuf, err := ioutil.ReadFile(secretPath)
	if err != nil && !os.IsNotExist(err) {
		return "", "", err
	}

	// Use existing ID if any
	if len(idBuf) != 0 {
		id = strings.ToLower(string(idBuf))
	} else {
		id = hostID

		// Persist the ID
		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
			return "", "", err
		}
	}

	if len(secretBuf) != 0 {
		secret = string(secretBuf)
	} else {
		// Generate a new secret ID
		secret = structs.GenerateUUID()

		// Persist the secret ID
		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
			return "", "", err
		}
	}

	return id, secret, nil
}
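
// exampleReadNodeID is an illustrative sketch of the persistence scheme used
// by nodeID above: the node ID and secret ID are stored as plain files named
// "client-id" and "secret-id" under the client's state directory.
func exampleReadNodeID(stateDir string) (string, error) {
	buf, err := ioutil.ReadFile(filepath.Join(stateDir, "client-id"))
	if err != nil {
		return "", err
	}
	return strings.ToLower(string(buf)), nil
}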

// setupNode is used to setup the initial node
func (c *Client) setupNode() error {
	node := c.config.Node
	if node == nil {
		node = &structs.Node{}
		c.config.Node = node
	}
	// Generate an ID and secret for the node
	id, secretID, err := c.nodeID()
	if err != nil {
		return fmt.Errorf("node ID setup failed: %v", err)
	}

	node.ID = id
	node.SecretID = secretID
	if node.Attributes == nil {
		node.Attributes = make(map[string]string)
	}
	if node.Links == nil {
		node.Links = make(map[string]string)
	}
	if node.Meta == nil {
		node.Meta = make(map[string]string)
	}
	if node.Resources == nil {
		node.Resources = &structs.Resources{}
	}
	if node.Reserved == nil {
		node.Reserved = &structs.Resources{}
	}
	if node.Datacenter == "" {
		node.Datacenter = "dc1"
	}
	if node.Name == "" {
		node.Name, _ = os.Hostname()
	}
	if node.Name == "" {
		node.Name = node.ID
	}
	node.Status = structs.NodeStatusInit
	return nil
}

// reservePorts is used to reserve ports on the fingerprinted network devices.
func (c *Client) reservePorts() {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	global := c.config.GloballyReservedPorts
	if len(global) == 0 {
		return
	}

	node := c.config.Node
	networks := node.Resources.Networks
	reservedIndex := make(map[string]*structs.NetworkResource, len(networks))
	for _, resNet := range node.Reserved.Networks {
		reservedIndex[resNet.IP] = resNet
	}

	// Go through each network device and reserve ports on it.
	for _, net := range networks {
		res, ok := reservedIndex[net.IP]
		if !ok {
			res = net.Copy()
			res.MBits = 0
			reservedIndex[net.IP] = res
		}

		for _, portVal := range global {
			p := structs.Port{Value: portVal}
			res.ReservedPorts = append(res.ReservedPorts, p)
		}
	}

	// Clear the reserved networks.
	if node.Reserved == nil {
		node.Reserved = new(structs.Resources)
	} else {
		node.Reserved.Networks = nil
	}

	// Restore the reserved networks
	for _, net := range reservedIndex {
		node.Reserved.Networks = append(node.Reserved.Networks, net)
	}
}

// fingerprint is used to fingerprint the client and setup the node
func (c *Client) fingerprint() error {
	whitelist := c.config.ReadStringListToMap("fingerprint.whitelist")
	whitelistEnabled := len(whitelist) > 0
	blacklist := c.config.ReadStringListToMap("fingerprint.blacklist")

	c.logger.Printf("[DEBUG] client: built-in fingerprints: %v", fingerprint.BuiltinFingerprints())

	var applied []string
	var skipped []string
	for _, name := range fingerprint.BuiltinFingerprints() {
		// Skip modules that are not in the whitelist if it is enabled.
		if _, ok := whitelist[name]; whitelistEnabled && !ok {
			skipped = append(skipped, name)
			continue
		}
		// Skip modules that are in the blacklist
		if _, ok := blacklist[name]; ok {
			skipped = append(skipped, name)
			continue
		}
		f, err := fingerprint.NewFingerprint(name, c.logger)
		if err != nil {
			return err
		}

		c.configLock.Lock()
		applies, err := f.Fingerprint(c.config, c.config.Node)
		c.configLock.Unlock()
		if err != nil {
			return err
		}
		if applies {
			applied = append(applied, name)
		}
		p, period := f.Periodic()
		if p {
			// TODO: If more periodic fingerprinters are added, then
			// fingerprintPeriodic should be used to handle all the periodic
			// fingerprinters by using a priority queue.
			go c.fingerprintPeriodic(name, f, period)
		}
	}
	c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied)
	if len(skipped) != 0 {
		c.logger.Printf("[DEBUG] client: fingerprint modules skipped due to white/blacklist: %v", skipped)
	}
	return nil
}
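
// exampleFingerprintWhitelist sketches how the "fingerprint.whitelist" option
// read above is supplied: as a comma-separated list in the client config's
// Options map. The module names here are assumptions for illustration.
func exampleFingerprintWhitelist(cfg *config.Config) {
	if cfg.Options == nil {
		cfg.Options = make(map[string]string)
	}
	cfg.Options["fingerprint.whitelist"] = "cpu,memory,network"
}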

// fingerprintPeriodic runs a fingerprinter at the specified duration.
func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) {
	c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d)
	for {
		select {
		case <-time.After(d):
			c.configLock.Lock()
			if _, err := f.Fingerprint(c.config, c.config.Node); err != nil {
				c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err)
			}
			c.configLock.Unlock()
		case <-c.shutdownCh:
			return
		}
	}
}

// setupDrivers is used to find the available drivers
func (c *Client) setupDrivers() error {
	// Build the white/blacklists of drivers.
	whitelist := c.config.ReadStringListToMap("driver.whitelist")
	whitelistEnabled := len(whitelist) > 0
	blacklist := c.config.ReadStringListToMap("driver.blacklist")

	var avail []string
	var skipped []string
	driverCtx := driver.NewDriverContext("", "", c.config, c.config.Node, c.logger, nil, nil)
	for name := range driver.BuiltinDrivers {
		// Skip fingerprinting drivers that are not in the whitelist if it is
		// enabled.
		if _, ok := whitelist[name]; whitelistEnabled && !ok {
			skipped = append(skipped, name)
			continue
		}
		// Skip fingerprinting drivers that are in the blacklist
		if _, ok := blacklist[name]; ok {
			skipped = append(skipped, name)
			continue
		}

		d, err := driver.NewDriver(name, driverCtx)
		if err != nil {
			return err
		}
		c.configLock.Lock()
		applies, err := d.Fingerprint(c.config, c.config.Node)
		c.configLock.Unlock()
		if err != nil {
			return err
		}
		if applies {
			avail = append(avail, name)
		}

		p, period := d.Periodic()
		if p {
			go c.fingerprintPeriodic(name, d, period)
		}
	}

	c.logger.Printf("[DEBUG] client: available drivers %v", avail)

	if len(skipped) != 0 {
		c.logger.Printf("[DEBUG] client: drivers skipped due to white/blacklist: %v", skipped)
	}

	return nil
}

// retryIntv calculates a retry interval value given the base
func (c *Client) retryIntv(base time.Duration) time.Duration {
	if c.config.DevMode {
		return devModeRetryIntv
	}
	return base + lib.RandomStagger(base)
}
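
// The stagger above yields an interval in [base, 2*base): for example, with
// registerRetryIntv = 15s a retry fires after 15-30s. A minimal illustration:
//
//	intv := c.retryIntv(registerRetryIntv) // 15s <= intv < 30s outside dev mode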

// registerAndHeartbeat is a long lived goroutine used to register the client
// and then start heartbeating to the server.
func (c *Client) registerAndHeartbeat() {
	// Register the node
	c.retryRegisterNode()

	// Start watching for node changes
	go c.watchNodeUpdates()

	// Setup the heartbeat timer, for the initial registration
	// we want to do this quickly. We want to do it extra quickly
	// in development mode.
	var heartbeat <-chan time.Time
	if c.config.DevMode {
		heartbeat = time.After(0)
	} else {
		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
	}

	for {
		select {
		case <-c.serversDiscoveredCh:
		case <-heartbeat:
		case <-c.shutdownCh:
			return
		}

		if err := c.updateNodeStatus(); err != nil {
			// The servers have changed such that this node has not been
			// registered before
			if strings.Contains(err.Error(), "node not found") {
				// Re-register the node
				c.logger.Printf("[INFO] client: re-registering node")
				c.retryRegisterNode()
				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
			} else {
				intv := c.retryIntv(registerRetryIntv)
				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
				heartbeat = time.After(intv)

				// If heartbeating fails, trigger Consul discovery
				c.triggerDiscovery()
			}
		} else {
			c.heartbeatLock.Lock()
			heartbeat = time.After(c.heartbeatTTL)
			c.heartbeatLock.Unlock()
		}
	}
}

// periodicSnapshot is a long lived goroutine used to periodically snapshot the
// state of the client
func (c *Client) periodicSnapshot() {
	// Create a snapshot timer
	snapshot := time.After(stateSnapshotIntv)

	for {
		select {
		case <-snapshot:
			snapshot = time.After(stateSnapshotIntv)
			if err := c.saveState(); err != nil {
				c.logger.Printf("[ERR] client: failed to save state: %v", err)
			}

		case <-c.shutdownCh:
			return
		}
	}
}

// run is a long lived goroutine used to run the client
func (c *Client) run() {
	// Watch for changes in allocations
	allocUpdates := make(chan *allocUpdates, 8)
	go c.watchAllocations(allocUpdates)

	for {
		select {
		case update := <-allocUpdates:
			c.runAllocs(update)

		case <-c.shutdownCh:
			return
		}
	}
}

// hasNodeChanged calculates hashes of the node's attribute and meta maps and
// compares them against the old (passed-in) hash values to determine if the
// node properties have changed. It returns the new hash values in case they
// are different from the old hash values.
func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	newAttrHash, err := hashstructure.Hash(c.config.Node.Attributes, nil)
	if err != nil {
		c.logger.Printf("[DEBUG] client: unable to calculate node attributes hash: %v", err)
	}
	// Calculate node meta map hash
	newMetaHash, err := hashstructure.Hash(c.config.Node.Meta, nil)
	if err != nil {
		c.logger.Printf("[DEBUG] client: unable to calculate node meta hash: %v", err)
	}
	if newAttrHash != oldAttrHash || newMetaHash != oldMetaHash {
		return true, newAttrHash, newMetaHash
	}
	return false, oldAttrHash, oldMetaHash
}
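
// exampleNodeHash is a minimal sketch of the hashstructure usage above: equal
// maps produce equal hashes, so comparing successive hashes cheaply detects
// changes to the node's attributes.
func exampleNodeHash(attrs map[string]string) (uint64, error) {
	return hashstructure.Hash(attrs, nil)
}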

// retryRegisterNode is used to register the node or update the registration and
// retry in case of failure.
func (c *Client) retryRegisterNode() {
	for {
		err := c.registerNode()
		if err == nil {
			// Registered!
			return
		}

		if err == noServersErr {
			c.logger.Print("[DEBUG] client: registration waiting on servers")
			c.triggerDiscovery()
		} else {
			c.logger.Printf("[ERR] client: registration failure: %v", err)
		}
		select {
		case <-c.serversDiscoveredCh:
		case <-time.After(c.retryIntv(registerRetryIntv)):
		case <-c.shutdownCh:
			return
		}
	}
}

// registerNode is used to register the node or update the registration
func (c *Client) registerNode() error {
	node := c.Node()
	req := structs.NodeRegisterRequest{
		Node:         node,
		WriteRequest: structs.WriteRequest{Region: c.Region()},
	}
	var resp structs.NodeUpdateResponse
	if err := c.RPC("Node.Register", &req, &resp); err != nil {
		return err
	}

	// Update the node status to ready after we register.
	c.configLock.Lock()
	node.Status = structs.NodeStatusReady
	c.configLock.Unlock()

	c.logger.Printf("[INFO] client: node registration complete")
	if len(resp.EvalIDs) != 0 {
		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs))
	}

	c.heartbeatLock.Lock()
	defer c.heartbeatLock.Unlock()
	c.lastHeartbeat = time.Now()
	c.heartbeatTTL = resp.HeartbeatTTL
	return nil
}

// updateNodeStatus is used to heartbeat and update the status of the node
func (c *Client) updateNodeStatus() error {
	c.heartbeatLock.Lock()
	defer c.heartbeatLock.Unlock()

	node := c.Node()
	req := structs.NodeUpdateStatusRequest{
		NodeID:       node.ID,
		Status:       structs.NodeStatusReady,
		WriteRequest: structs.WriteRequest{Region: c.Region()},
	}
	var resp structs.NodeUpdateResponse
	if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil {
		c.triggerDiscovery()
		return fmt.Errorf("failed to update status: %v", err)
	}
	if len(resp.EvalIDs) != 0 {
		c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs))
	}
	if resp.Index != 0 {
		c.logger.Printf("[DEBUG] client: state updated to %s", req.Status)
	}

	// Update the last heartbeat time and TTL
	c.lastHeartbeat = time.Now()
	c.heartbeatTTL = resp.HeartbeatTTL

	// Convert the []*NodeServerInfo into endpoints
	localdc := c.Datacenter()
	servers := make(endpoints, 0, len(resp.Servers))
	for _, s := range resp.Servers {
		addr, err := resolveServer(s.RPCAdvertiseAddr)
		if err != nil {
			continue
		}
		e := endpoint{name: s.RPCAdvertiseAddr, addr: addr}
		if s.Datacenter != localdc {
			// server is non-local; de-prioritize
			e.priority = 1
		}
		servers = append(servers, &e)
	}
	if len(servers) == 0 {
		return fmt.Errorf("server returned no valid servers")
	}
	c.servers.set(servers)

	// Begin polling Consul if there is no Nomad leader. We could be
	// heartbeating to a Nomad server on the minority side of a partition of
	// the Nomad server quorum; this agent may still have connectivity to the
	// existing majority of Nomad servers, but only if it queries Consul.
	if resp.LeaderRPCAddr == "" {
		c.triggerDiscovery()
	}

	return nil
}

// updateAllocStatus is used to update the status of an allocation
func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
	// If this alloc was blocking another alloc and transitioned to a
	// terminal state then start the blocked allocation
	c.blockedAllocsLock.Lock()
	if blockedAlloc, ok := c.blockedAllocations[alloc.ID]; ok && alloc.Terminated() {
		var prevAllocDir *allocdir.AllocDir
		if ar, ok := c.getAllocRunners()[alloc.ID]; ok {
			tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
			if tg != nil && tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky {
				prevAllocDir = ar.GetAllocDir()
			}
		}
		if err := c.addAlloc(blockedAlloc, prevAllocDir); err != nil {
			c.logger.Printf("[ERR] client: failed to add alloc which was previously blocked %q: %v",
				blockedAlloc.ID, err)
		}
		delete(c.blockedAllocations, blockedAlloc.PreviousAllocation)
	}
	c.blockedAllocsLock.Unlock()

	// Mark the allocation for GC if it is in terminal state
	if alloc.Terminated() {
		if ar, ok := c.getAllocRunners()[alloc.ID]; ok {
			if err := c.garbageCollector.MarkForCollection(ar); err != nil {
				c.logger.Printf("[DEBUG] client: couldn't add alloc %v for GC: %v", alloc.ID, err)
			}
		}
	}

	// Strip all the information that can be reconstructed at the server. Only
	// send the fields that are updatable by the client.
	stripped := new(structs.Allocation)
	stripped.ID = alloc.ID
	stripped.NodeID = c.Node().ID
	stripped.TaskStates = alloc.TaskStates
	stripped.ClientStatus = alloc.ClientStatus
	stripped.ClientDescription = alloc.ClientDescription

	select {
	case c.allocUpdates <- stripped:
	case <-c.shutdownCh:
	}
}

// allocSync is a long lived function that batches allocation updates to the
// server.
func (c *Client) allocSync() {
	staggered := false
	syncTicker := time.NewTicker(allocSyncIntv)
	updates := make(map[string]*structs.Allocation)
	for {
		select {
		case <-c.shutdownCh:
			syncTicker.Stop()
			return
		case alloc := <-c.allocUpdates:
			// Batch the allocation updates until the timer triggers.
			updates[alloc.ID] = alloc
		case <-syncTicker.C:
			// Fast path if there are no updates
			if len(updates) == 0 {
				continue
			}

			sync := make([]*structs.Allocation, 0, len(updates))
			for _, alloc := range updates {
				sync = append(sync, alloc)
			}

			// Send to server.
			args := structs.AllocUpdateRequest{
				Alloc:        sync,
				WriteRequest: structs.WriteRequest{Region: c.Region()},
			}

			var resp structs.GenericResponse
			if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil {
				c.logger.Printf("[ERR] client: failed to update allocations: %v", err)
				syncTicker.Stop()
				syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv))
				staggered = true
			} else {
				updates = make(map[string]*structs.Allocation)
				if staggered {
					syncTicker.Stop()
					syncTicker = time.NewTicker(allocSyncIntv)
					staggered = false
				}
			}
		}
	}
}

// allocUpdates holds the results of receiving updated allocations from the
// servers.
type allocUpdates struct {
	// pulled is the set of allocations that were downloaded from the servers.
	pulled map[string]*structs.Allocation

	// filtered is the set of allocations that were not pulled because their
	// AllocModifyIndex didn't change.
	filtered map[string]struct{}
}

// watchAllocations is used to scan for updates to allocations
func (c *Client) watchAllocations(updates chan *allocUpdates) {
	// The request and response for getting the map of allocations that should
	// be running on the Node to their AllocModifyIndex which is incremented
	// when the allocation is updated by the servers.
	n := c.Node()
	req := structs.NodeSpecificRequest{
		NodeID:   n.ID,
		SecretID: n.SecretID,
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}
	var resp structs.NodeClientAllocsResponse

	// The request and response for pulling down the set of allocations that are
	// new, or updated server side.
	allocsReq := structs.AllocsGetRequest{
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}
	var allocsResp structs.AllocsGetResponse

OUTER:
	for {
		// Get the allocation modify index map, blocking for updates. We will
		// use this to determine exactly what allocations need to be downloaded
		// in full.
		resp = structs.NodeClientAllocsResponse{}
		err := c.RPC("Node.GetClientAllocs", &req, &resp)
		if err != nil {
			// Shutdown often causes EOF errors, so check for shutdown first
			select {
			case <-c.shutdownCh:
				return
			default:
			}

			// COMPAT: Remove in 0.6. This is to allow the case in which the
			// servers are not fully upgraded before the clients register. This
			// can cause the SecretID to be lost
			if strings.Contains(err.Error(), "node secret ID does not match") {
				c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err)
				c.retryRegisterNode()
			} else if err != noServersErr {
				c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err)
			}
			retry := c.retryIntv(getAllocRetryIntv)
			select {
			case <-c.serversDiscoveredCh:
				continue
			case <-time.After(retry):
				continue
			case <-c.shutdownCh:
				return
			}
		}

		// Check for shutdown
		select {
		case <-c.shutdownCh:
			return
		default:
		}

		// Filter all allocations whose AllocModifyIndex was not incremented.
		// These are the allocations who have either not been updated, or whose
		// updates are a result of the client sending an update for the alloc.
		// This lets us reduce the network traffic to the server as we don't
		// need to pull all the allocations.
		var pull []string
		filtered := make(map[string]struct{})
		runners := c.getAllocRunners()
		var pullIndex uint64
		for allocID, modifyIndex := range resp.Allocs {
			// Pull the allocation if we don't have an alloc runner for the
			// allocation or if the alloc runner requires an updated allocation.
			runner, ok := runners[allocID]

			if !ok || runner.shouldUpdate(modifyIndex) {
				// Only pull allocs that are required. Filtered
				// allocs might be at a higher index, so ignore
				// them.
				if modifyIndex > pullIndex {
					pullIndex = modifyIndex
				}
				pull = append(pull, allocID)
			} else {
				filtered[allocID] = struct{}{}
			}
		}

		// Pull the allocations that passed filtering.
		allocsResp.Allocs = nil
		var pulledAllocs map[string]*structs.Allocation
		if len(pull) != 0 {
			// Pull the allocations that need to be updated.
			allocsReq.AllocIDs = pull
			allocsReq.MinQueryIndex = pullIndex - 1
			allocsResp = structs.AllocsGetResponse{}
			if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil {
				c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err)
				retry := c.retryIntv(getAllocRetryIntv)
				select {
				case <-c.serversDiscoveredCh:
					continue
				case <-time.After(retry):
					continue
				case <-c.shutdownCh:
					return
				}
			}

			// Ensure that we received all the allocations we wanted
			pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs))
			for _, alloc := range allocsResp.Allocs {
				pulledAllocs[alloc.ID] = alloc
			}

			for _, desiredID := range pull {
				if _, ok := pulledAllocs[desiredID]; !ok {
					// We didn't get everything we wanted. Do not update the
					// MinQueryIndex, sleep and then retry.
					wait := c.retryIntv(2 * time.Second)
					select {
					case <-time.After(wait):
						// Wait for the server we contact to receive the
						// allocations
						continue OUTER
					case <-c.shutdownCh:
						return
					}
				}
			}

			// Check for shutdown
			select {
			case <-c.shutdownCh:
				return
			default:
			}
		}

		c.logger.Printf("[DEBUG] client: updated allocations at index %d (total %d) (pulled %d) (filtered %d)",
			resp.Index, len(resp.Allocs), len(allocsResp.Allocs), len(filtered))

		// Update the query index.
		if resp.Index > req.MinQueryIndex {
			req.MinQueryIndex = resp.Index
		}

		// Push the updates.
		update := &allocUpdates{
			filtered: filtered,
			pulled:   pulledAllocs,
		}
		select {
		case updates <- update:
		case <-c.shutdownCh:
			return
		}
	}
}

// watchNodeUpdates periodically checks for changes to the node attributes or meta map
func (c *Client) watchNodeUpdates() {
	c.logger.Printf("[DEBUG] client: periodically checking for node changes at duration %v", nodeUpdateRetryIntv)

	// Initialize the hashes
	_, attrHash, metaHash := c.hasNodeChanged(0, 0)
	var changed bool
	for {
		select {
		case <-time.After(c.retryIntv(nodeUpdateRetryIntv)):
			changed, attrHash, metaHash = c.hasNodeChanged(attrHash, metaHash)
			if changed {
				c.logger.Printf("[DEBUG] client: state changed, updating node.")

				// Update the config copy.
				c.configLock.Lock()
				node := c.config.Node.Copy()
				c.configCopy.Node = node
				c.configLock.Unlock()

				c.retryRegisterNode()
			}
		case <-c.shutdownCh:
			return
		}
	}
}

// runAllocs is invoked when we get an updated set of allocations
func (c *Client) runAllocs(update *allocUpdates) {
	// Get the existing allocs
	c.allocLock.RLock()
	exist := make([]*structs.Allocation, 0, len(c.allocs))
	for _, ar := range c.allocs {
		exist = append(exist, ar.alloc)
	}
	c.allocLock.RUnlock()

	// Diff the existing and updated allocations
	diff := diffAllocs(exist, update)
	c.logger.Printf("[DEBUG] client: %#v", diff)

	// Remove the old allocations
	for _, remove := range diff.removed {
		if err := c.removeAlloc(remove); err != nil {
			c.logger.Printf("[ERR] client: failed to remove alloc '%s': %v",
				remove.ID, err)
		}
	}

	// Update the existing allocations
	for _, update := range diff.updated {
		if err := c.updateAlloc(update.exist, update.updated); err != nil {
			c.logger.Printf("[ERR] client: failed to update alloc '%s': %v",
				update.exist.ID, err)
		}

		// See if the updated alloc is getting migrated
		c.migratingAllocsLock.Lock()
		ch, ok := c.migratingAllocs[update.updated.ID]
		c.migratingAllocsLock.Unlock()
		if ok {
			// Stop the migration if the allocation no longer needs to
			// migrate
			if !update.updated.ShouldMigrate() {
				ch.closeCh()
			}
		}
	}

	// Start the new allocations
	for _, add := range diff.added {
		// If the allocation is chained and the previous allocation hasn't
		// terminated yet, then add the alloc to the blocked queue.
		c.blockedAllocsLock.Lock()
		ar, ok := c.getAllocRunners()[add.PreviousAllocation]
		if ok && !ar.Alloc().Terminated() {
			// Check if the alloc is already present in the blocked allocations
			// map
			if _, ok := c.blockedAllocations[add.PreviousAllocation]; !ok {
				c.logger.Printf("[DEBUG] client: added alloc %q to blocked queue for previous allocation %q", add.ID,
					add.PreviousAllocation)
				c.blockedAllocations[add.PreviousAllocation] = add
			}
			c.blockedAllocsLock.Unlock()
			continue
		}
		c.blockedAllocsLock.Unlock()

		// This means the allocation has a previous allocation on another node
		// so we will block for the previous allocation to complete
		if add.PreviousAllocation != "" && !ok {
			// Ensure that we are not blocking for the remote allocation if we
			// have already blocked
			c.migratingAllocsLock.Lock()
			if _, ok := c.migratingAllocs[add.ID]; !ok {
				// Check that we don't have an alloc runner already. This
				// prevents a race between a finishing blockForRemoteAlloc and
				// another invocation of runAllocs
				if _, ok := c.getAllocRunners()[add.PreviousAllocation]; !ok {
					c.migratingAllocs[add.ID] = newMigrateAllocCtrl(add)
					go c.blockForRemoteAlloc(add)
				}
			}
			c.migratingAllocsLock.Unlock()
			continue
		}

		// Set the previous allocdir if the allocation had a terminal
		// previous allocation
		var prevAllocDir *allocdir.AllocDir
		tg := add.Job.LookupTaskGroup(add.TaskGroup)
		if tg != nil && tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky && ar != nil {
			prevAllocDir = ar.GetAllocDir()
		}

		if err := c.addAlloc(add, prevAllocDir); err != nil {
			c.logger.Printf("[ERR] client: failed to add alloc '%s': %v",
				add.ID, err)
		}
	}

	// Persist our state
	if err := c.saveState(); err != nil {
		c.logger.Printf("[ERR] client: failed to save state: %v", err)
	}
}

  1550  // blockForRemoteAlloc blocks until the previous allocation of an allocation has
  1551  // been terminated and migrates the snapshot data
  1552  func (c *Client) blockForRemoteAlloc(alloc *structs.Allocation) {
  1553  	// On return, remove the allocation from the set of allocs which are
  1554  	// currently undergoing migration
  1555  	defer func() {
  1556  		c.migratingAllocsLock.Lock()
  1557  		delete(c.migratingAllocs, alloc.ID)
  1558  		c.migratingAllocsLock.Unlock()
  1559  	}()
  1560  
  1561  	// prevAllocDir is the allocation directory of the previous allocation
  1562  	var prevAllocDir *allocdir.AllocDir
  1563  
  1564  	// If the allocation is not sticky then we won't wait for the previous
  1565  	// allocation to be terminal
  1566  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1567  	if tg == nil {
  1568  		c.logger.Printf("[ERR] client: task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
  1569  		goto ADDALLOC
  1570  	}
  1571  
  1572  	// Wait for the remote previous alloc to be terminal if the alloc is sticky
  1573  	if tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky && tg.EphemeralDisk.Migrate {
  1574  		c.logger.Printf("[DEBUG] client: blocking alloc %q for previous allocation %q", alloc.ID, alloc.PreviousAllocation)
  1575  		// Block until the previous allocation reaches a terminal state.
  1576  		// Read the control struct under the lock so we don't race with
        		// runAllocs, which may close it concurrently.
        		c.migratingAllocsLock.Lock()
        		stopCh := c.migratingAllocs[alloc.ID]
        		c.migratingAllocsLock.Unlock()
  1577  		prevAlloc, err := c.waitForAllocTerminal(alloc.PreviousAllocation, stopCh)
  1578  		if err != nil {
  1579  			c.logger.Printf("[ERR] client: error waiting for allocation %q: %v",
  1580  				alloc.PreviousAllocation, err)
  1581  		}
  1582  
  1583  		// Migrate the data from the remote node
  1584  		prevAllocDir, err = c.migrateRemoteAllocDir(prevAlloc, alloc.ID)
  1585  		if err != nil {
  1586  			c.logger.Printf("[ERR] client: error migrating data from remote alloc %q: %v",
  1587  				alloc.PreviousAllocation, err)
  1588  		}
  1589  	}
  1590  
  1591  ADDALLOC:
  1592  	// Add the allocation
  1593  	if err := c.addAlloc(alloc, prevAllocDir); err != nil {
  1594  		c.logger.Printf("[ERR] client: error adding alloc: %v", err)
  1595  	}
  1596  }
  1597  
  1598  // waitForAllocTerminal blocks the caller until the allocation with the given
  1599  // ID transitions to a terminal state, then returns it.
  1600  func (c *Client) waitForAllocTerminal(allocID string, stopCh *migrateAllocCtrl) (*structs.Allocation, error) {
  1601  	req := structs.AllocSpecificRequest{
  1602  		AllocID: allocID,
  1603  		QueryOptions: structs.QueryOptions{
  1604  			Region:     c.Region(),
  1605  			AllowStale: true,
  1606  		},
  1607  	}
  1608  
  1609  	for {
  1610  		resp := structs.SingleAllocResponse{}
  1611  		err := c.RPC("Alloc.GetAlloc", &req, &resp)
  1612  		if err != nil {
  1613  			c.logger.Printf("[ERR] client: failed to query allocation %q: %v", allocID, err)
  1614  			retry := c.retryIntv(getAllocRetryIntv)
  1615  			select {
  1616  			case <-time.After(retry):
  1617  				continue
  1618  			case <-stopCh.ch:
  1619  				return nil, fmt.Errorf("giving up waiting on alloc %v since migration is not needed", allocID)
  1620  			case <-c.shutdownCh:
  1621  				return nil, fmt.Errorf("aborting because client is shutting down")
  1622  			}
  1623  		}
  1624  		if resp.Alloc == nil {
  1625  			return nil, nil
  1626  		}
  1627  		if resp.Alloc.Terminated() {
  1628  			return resp.Alloc, nil
  1629  		}
  1630  
  1631  		// Update the query index.
  1632  		if resp.Index > req.MinQueryIndex {
  1633  			req.MinQueryIndex = resp.Index
  1634  		}
  1635  
  1636  	}
  1637  }
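
        // Illustrative call pattern for waitForAllocTerminal, mirroring its use
        // in blockForRemoteAlloc above (a sketch; assumes alloc is a
        // *structs.Allocation whose previous allocation lives on another node):
        //
        //	ctrl := newMigrateAllocCtrl(alloc)
        //	prev, err := c.waitForAllocTerminal(alloc.PreviousAllocation, ctrl)
        //	if err != nil {
        //		// aborted: client shutdown, or the migration became unnecessary
        //	}
        //	_ = prev // terminal copy of the previous alloc, or nil if unknown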
  1638  
  1639  // migrateRemoteAllocDir migrates the allocation directory from a remote node to
  1640  // the current node
  1641  func (c *Client) migrateRemoteAllocDir(alloc *structs.Allocation, allocID string) (*allocdir.AllocDir, error) {
  1642  	if alloc == nil {
  1643  		return nil, nil
  1644  	}
  1645  
  1646  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1647  	if tg == nil {
  1648  		return nil, fmt.Errorf("task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
  1649  	}
  1650  
  1651  	// Skip migration of data if the ephemeral disk is not sticky or
  1652  	// migration is turned off.
  1653  	if tg.EphemeralDisk == nil || !tg.EphemeralDisk.Sticky || !tg.EphemeralDisk.Migrate {
  1654  		return nil, nil
  1655  	}
  1656  
  1657  	node, err := c.getNode(alloc.NodeID)
  1658  	if err != nil {
  1659  		return nil, fmt.Errorf("error retrieving node %v: %v", alloc.NodeID, err)
  1660  	}
  1663  
  1664  	// Check if node is nil
  1665  	if node == nil {
  1666  		return nil, fmt.Errorf("node %q doesn't exist", alloc.NodeID)
  1667  	}
  1668  
  1669  	// skip migration if the remote node is down
  1670  	if node.Status == structs.NodeStatusDown {
  1671  		c.logger.Printf("[INFO] client: not migrating data from alloc %q since node %q is down", alloc.ID, alloc.NodeID)
  1672  		return nil, nil
  1673  	}
  1674  
  1675  	// Create the previous alloc dir
  1676  	pathToAllocDir := filepath.Join(c.config.AllocDir, alloc.ID)
  1677  	if err := os.MkdirAll(pathToAllocDir, 0777); err != nil {
  1678  		return nil, fmt.Errorf("error creating previous allocation dir: %v", err)
  1679  	}
  1680  
  1681  	// Get the snapshot
  1682  	scheme := "http"
  1683  	if node.TLSEnabled {
  1684  		scheme = "https"
  1685  	}
  1686  	// Create an API client
  1687  	apiConfig := nomadapi.DefaultConfig()
  1688  	apiConfig.Address = fmt.Sprintf("%s://%s", scheme, node.HTTPAddr)
  1689  	apiConfig.TLSConfig = &nomadapi.TLSConfig{
  1690  		CACert:     c.config.TLSConfig.CAFile,
  1691  		ClientCert: c.config.TLSConfig.CertFile,
  1692  		ClientKey:  c.config.TLSConfig.KeyFile,
  1693  	}
  1694  	apiClient, err := nomadapi.NewClient(apiConfig)
  1695  	if err != nil {
  1696  		return nil, err
  1697  	}
  1698  
  1699  	url := fmt.Sprintf("/v1/client/allocation/%v/snapshot", alloc.ID)
  1700  	resp, err := apiClient.Raw().Response(url, nil)
  1701  	if err != nil {
  1702  		os.RemoveAll(pathToAllocDir)
  1703  		c.logger.Printf("[ERR] client: error getting snapshot: %v", err)
  1704  		return nil, fmt.Errorf("error getting snapshot for alloc %v: %v", alloc.ID, err)
  1705  	}
  1706  
  1707  	if err := c.unarchiveAllocDir(resp, allocID, pathToAllocDir); err != nil {
  1708  		return nil, err
  1709  	}
  1710  
  1711  	// If there were no errors then we create the allocdir
  1712  	prevAllocDir := allocdir.NewAllocDir(c.logger, pathToAllocDir)
  1713  	return prevAllocDir, nil
  1714  }
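
        // The snapshot consumed above is served by the remote node's HTTP API;
        // the equivalent request by hand would look like this (illustrative;
        // assumes the default HTTP port and no TLS):
        //
        //	curl http://<remote-node>:4646/v1/client/allocation/<alloc-id>/snapshot > snap.tar
        //
        // The response body is the tar stream that unarchiveAllocDir below
        // unpacks into the new allocation directory.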
  1715  
  1716  // unarchiveAllocDir reads a tar stream of a snapshotted allocation directory
  1717  // and writes its contents to disk.
  1718  func (c *Client) unarchiveAllocDir(resp io.ReadCloser, allocID string, pathToAllocDir string) error {
  1719  	tr := tar.NewReader(resp)
  1720  	defer resp.Close()
  1721  
  1722  	buf := make([]byte, 1024)
  1723  
  1724  	c.migratingAllocsLock.Lock()
        	stopMigrating, ok := c.migratingAllocs[allocID]
        	c.migratingAllocsLock.Unlock()
  1725  	if !ok {
  1726  		os.RemoveAll(pathToAllocDir)
  1727  		return fmt.Errorf("allocation %q is not marked for remote migration", allocID)
  1728  	}
  1729  	for {
  1730  		// See if the alloc still needs migration
  1731  		select {
  1732  		case <-stopMigrating.ch:
  1733  			os.RemoveAll(pathToAllocDir)
  1734  			c.logger.Printf("[INFO] client: stopping migration of allocdir for alloc: %v", allocID)
  1735  			return nil
  1736  		case <-c.shutdownCh:
  1737  			os.RemoveAll(pathToAllocDir)
  1738  			c.logger.Printf("[INFO] client: stopping migration of alloc %q since client is shutting down", allocID)
  1739  			return nil
  1740  		default:
  1741  		}
  1742  
  1743  		// Get the next header
  1744  		hdr, err := tr.Next()
  1745  
  1746  		// Snapshot has ended
  1747  		if err == io.EOF {
  1748  			return nil
  1749  		}
  1750  		// On any other error, remove the partially written alloc dir
  1751  		if err != nil {
  1752  			os.RemoveAll(pathToAllocDir)
  1753  			return fmt.Errorf("error creating alloc dir for alloc %q: %v", allocID, err)
  1754  		}
  1755  
  1756  		// If the header is for a directory we create the directory
  1757  		if hdr.Typeflag == tar.TypeDir {
  1758  			if err := os.MkdirAll(filepath.Join(pathToAllocDir, hdr.Name), os.FileMode(hdr.Mode)); err != nil {
        				c.logger.Printf("[ERR] client: error creating directory: %v", err)
        			}
  1759  			continue
  1760  		}
  1761  		// If the header is a file, we write to a file
  1762  		if hdr.Typeflag == tar.TypeReg {
  1763  			f, err := os.Create(filepath.Join(pathToAllocDir, hdr.Name))
  1764  			if err != nil {
  1765  				c.logger.Printf("[ERR] client: error creating file: %v", err)
  1766  				continue
  1767  			}
  1768  
  1769  			// Setting the permissions of the file as the origin.
  1770  			if err := f.Chmod(os.FileMode(hdr.Mode)); err != nil {
  1771  				f.Close()
  1772  				c.logger.Printf("[ERR] client: error chmod-ing file %s: %v", f.Name(), err)
  1773  				return fmt.Errorf("error chmoding file: %v", err)
  1774  			}
  1775  			if err := f.Chown(hdr.Uid, hdr.Gid); err != nil {
  1776  				f.Close()
  1777  				c.logger.Printf("[ERR] client: error chown-ing file %s: %v", f.Name(), err)
  1778  				return fmt.Errorf("error chowning file: %v", err)
  1779  			}
  1780  
  1781  			// We copy in 1 KiB chunks so that between chunks we can
  1782  			// check whether the client is still alive
  1783  			for {
  1784  				if c.shutdown {
  1785  					f.Close()
  1786  					os.RemoveAll(pathToAllocDir)
  1787  					c.logger.Printf("[INFO] client: stopping migration of alloc %q because client is shutting down", allocID)
  1788  					return nil
  1789  				}
  1790  
  1791  				n, err := tr.Read(buf)
  1792  				if err != nil {
  1793  					f.Close()
  1794  					if err != io.EOF {
  1795  						return fmt.Errorf("error reading snapshot: %v", err)
  1796  					}
  1797  					break
  1798  				}
  1799  				if _, err := f.Write(buf[:n]); err != nil {
  1800  					f.Close()
  1801  					os.RemoveAll(pathToAllocDir)
  1802  					return fmt.Errorf("error writing to file %q: %v", f.Name(), err)
  1803  				}
  1804  			}
  1805  
  1806  		}
  1807  	}
  1808  }
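
        // For reference, a minimal sketch of the producer side this reader
        // expects: directories as tar.TypeDir entries and files as tar.TypeReg
        // entries carrying Mode, Uid and Gid. Illustrative only; the real
        // stream comes from the snapshot HTTP endpoint:
        //
        //	func writeSnapshot(w io.Writer, root string) error {
        //		tw := tar.NewWriter(w)
        //		defer tw.Close()
        //		return filepath.Walk(root, func(path string, fi os.FileInfo, err error) error {
        //			if err != nil {
        //				return err
        //			}
        //			hdr, err := tar.FileInfoHeader(fi, "")
        //			if err != nil {
        //				return err
        //			}
        //			hdr.Name, _ = filepath.Rel(root, path)
        //			if err := tw.WriteHeader(hdr); err != nil {
        //				return err
        //			}
        //			if !fi.Mode().IsRegular() {
        //				return nil
        //			}
        //			f, err := os.Open(path)
        //			if err != nil {
        //				return err
        //			}
        //			defer f.Close()
        //			_, err = io.Copy(tw, f)
        //			return err
        //		})
        //	}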
  1809  
  1810  // getNode gets the node from the server with the given Node ID
  1811  func (c *Client) getNode(nodeID string) (*structs.Node, error) {
  1812  	req := structs.NodeSpecificRequest{
  1813  		NodeID: nodeID,
  1814  		QueryOptions: structs.QueryOptions{
  1815  			Region:     c.Region(),
  1816  			AllowStale: true,
  1817  		},
  1818  	}
  1819  
  1820  	resp := structs.SingleNodeResponse{}
  1821  	for {
  1822  		err := c.RPC("Node.GetNode", &req, &resp)
  1823  		if err != nil {
  1824  			c.logger.Printf("[ERR] client: failed to query node info %q: %v", nodeID, err)
  1825  			retry := c.retryIntv(getAllocRetryIntv)
  1826  			select {
  1827  			case <-time.After(retry):
  1828  				continue
  1829  			case <-c.shutdownCh:
  1830  				return nil, fmt.Errorf("aborting because client is shutting down")
  1831  			}
  1832  		}
  1833  		break
  1834  	}
  1835  
  1836  	return resp.Node, nil
  1837  }
  1838  
  1839  // removeAlloc is invoked when we should remove an allocation
  1840  func (c *Client) removeAlloc(alloc *structs.Allocation) error {
  1841  	c.allocLock.Lock()
  1842  	ar, ok := c.allocs[alloc.ID]
  1843  	if !ok {
  1844  		c.allocLock.Unlock()
  1845  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID)
  1846  		return nil
  1847  	}
  1848  	delete(c.allocs, alloc.ID)
  1849  	c.allocLock.Unlock()
  1850  
  1851  	// Ensure the GC has a reference and then collect. Collecting through the GC
  1852  	// applies rate limiting
  1853  	c.garbageCollector.MarkForCollection(ar)
  1854  	go c.garbageCollector.Collect(alloc.ID)
  1855  
  1856  	return nil
  1857  }
  1858  
  1859  // updateAlloc is invoked when we should update an allocation
  1860  func (c *Client) updateAlloc(exist, update *structs.Allocation) error {
  1861  	c.allocLock.RLock()
  1862  	ar, ok := c.allocs[exist.ID]
  1863  	c.allocLock.RUnlock()
  1864  	if !ok {
  1865  		c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID)
  1866  		return nil
  1867  	}
  1868  
  1869  	ar.Update(update)
  1870  	return nil
  1871  }
  1872  
  1873  // addAlloc is invoked when we should add an allocation
  1874  func (c *Client) addAlloc(alloc *structs.Allocation, prevAllocDir *allocdir.AllocDir) error {
  1875  	// Check if we already have an alloc runner
  1876  	c.allocLock.Lock()
  1877  	if _, ok := c.allocs[alloc.ID]; ok {
  1878  		c.logger.Printf("[DEBUG] client: dropping duplicate add allocation request: %q", alloc.ID)
  1879  		c.allocLock.Unlock()
  1880  		return nil
  1881  	}
  1882  	c.allocLock.Unlock()
  1883  
  1884  	// Make room for the allocation
  1885  	if err := c.garbageCollector.MakeRoomFor([]*structs.Allocation{alloc}); err != nil {
  1886  		c.logger.Printf("[ERR] client: error making room for allocation: %v", err)
  1887  	}
  1888  
  1889  	c.allocLock.Lock()
  1890  	defer c.allocLock.Unlock()
  1891  
  1892  	c.configLock.RLock()
  1893  	ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient)
  1894  	ar.SetPreviousAllocDir(prevAllocDir)
  1895  	c.configLock.RUnlock()
  1896  	go ar.Run()
  1897  
  1898  	// Store the alloc runner.
  1899  	c.allocs[alloc.ID] = ar
  1900  	return nil
  1901  }
  1902  
  1903  // setupVaultClient creates an object to periodically renew tokens and secrets
  1904  // with vault.
  1905  func (c *Client) setupVaultClient() error {
  1906  	var err error
  1907  	c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken)
  1908  	if err != nil {
  1909  		return err
  1910  	}
  1911  
  1912  	if c.vaultClient == nil {
  1913  		c.logger.Printf("[ERR] client: failed to create vault client")
  1914  		return fmt.Errorf("failed to create vault client")
  1915  	}
  1916  
  1917  	// Start renewing tokens and secrets
  1918  	c.vaultClient.Start()
  1919  
  1920  	return nil
  1921  }
  1922  
  1923  // deriveToken takes in an allocation and a set of tasks and derives vault
  1924  // tokens for each of the tasks, unwraps all of them using the supplied vault
  1925  // client and returns a map of unwrapped tokens, indexed by the task name.
  1926  func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) {
  1927  	if alloc == nil {
  1928  		return nil, fmt.Errorf("nil allocation")
  1929  	}
  1930  
  1931  	if len(taskNames) == 0 {
  1932  		return nil, fmt.Errorf("missing task names")
  1933  	}
  1934  
  1935  	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  1936  	if group == nil {
  1937  		return nil, fmt.Errorf("group name in allocation is not present in job")
  1938  	}
  1939  
  1940  	verifiedTasks := []string{}
  1941  	found := false
  1942  	// Check if the given task names actually exist in the allocation
  1943  	for _, taskName := range taskNames {
  1944  		found = false
  1945  		for _, task := range group.Tasks {
  1946  			if task.Name == taskName {
  1947  				found = true
        				break
  1948  			}
  1949  		}
  1950  		if !found {
  1951  			c.logger.Printf("[ERR] task %q not found in the allocation", taskName)
  1952  			return nil, fmt.Errorf("task %q not found in the allocation", taskName)
  1953  		}
  1954  		verifiedTasks = append(verifiedTasks, taskName)
  1955  	}
  1956  
  1957  	// The Node.DeriveVaultToken RPC on the Nomad server takes a set of
  1958  	// tasks and creates tokens for all of them.
  1959  	req := &structs.DeriveVaultTokenRequest{
  1960  		NodeID:   c.Node().ID,
  1961  		SecretID: c.Node().SecretID,
  1962  		AllocID:  alloc.ID,
  1963  		Tasks:    verifiedTasks,
  1964  		QueryOptions: structs.QueryOptions{
  1965  			Region:     c.Region(),
  1966  			AllowStale: false,
  1967  		},
  1968  	}
  1969  
  1970  	// Derive the tokens
  1971  	var resp structs.DeriveVaultTokenResponse
  1972  	if err := c.RPC("Node.DeriveVaultToken", req, &resp); err != nil {
  1973  		c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err)
  1974  		return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err)
  1975  	}
  1976  	if resp.Error != nil {
  1977  		c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error)
  1978  		return nil, resp.Error
  1979  	}
  1980  	if resp.Tasks == nil {
  1981  		c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response")
  1982  		return nil, fmt.Errorf("failed to derive vault tokens: invalid response")
  1983  	}
  1984  
  1985  	unwrappedTokens := make(map[string]string)
  1986  
  1987  	// Retrieve each wrapped token from the response and unwrap it
  1988  	for _, taskName := range verifiedTasks {
  1989  		// Get the wrapped token
  1990  		wrappedToken, ok := resp.Tasks[taskName]
  1991  		if !ok {
  1992  			c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName)
  1993  			return nil, fmt.Errorf("wrapped token missing for task %q", taskName)
  1994  		}
  1995  
  1996  		// Unwrap the vault token
  1997  		unwrapResp, err := vclient.Logical().Unwrap(wrappedToken)
  1998  		if err != nil {
  1999  			return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err)
  2000  		}
  2001  		if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" {
  2002  			return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName)
  2003  		}
  2004  
  2005  		// Append the unwrapped token to the return value
  2006  		unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken
  2007  	}
  2008  
  2009  	return unwrappedTokens, nil
  2010  }
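
        // The Unwrap call above is Vault's response-wrapping flow: the Nomad
        // server returns single-use wrapping tokens, and each one is exchanged
        // here for the real task token. Roughly the CLI equivalent
        // (illustrative):
        //
        //	vault unwrap <wrapped-token>   # response carries auth.client_token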
  2011  
  2012  // triggerDiscovery causes a Consul discovery to begin (if one hasn't already)
  2013  func (c *Client) triggerDiscovery() {
  2014  	select {
  2015  	case c.triggerDiscoveryCh <- struct{}{}:
  2016  		// Discovery goroutine was released to execute
  2017  	default:
  2018  		// Discovery goroutine was already running
  2019  	}
  2020  }
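
        // The select above is the standard non-blocking send idiom: deliver the
        // signal if a receiver is ready, otherwise drop it. In generic form
        // (sketch):
        //
        //	select {
        //	case ch <- struct{}{}:
        //		// signal accepted
        //	default:
        //		// a discovery run is already pending; nothing to do
        //	}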
  2021  
  2022  // consulDiscovery waits for the signal to attempt server discovery via Consul.
  2023  // It's intended to be started in a goroutine. See triggerDiscovery() for
  2024  // causing consul discovery from other code locations.
  2025  func (c *Client) consulDiscovery() {
  2026  	for {
  2027  		select {
  2028  		case <-c.triggerDiscoveryCh:
  2029  			if err := c.consulDiscoveryImpl(); err != nil {
  2030  				c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err)
  2031  			}
  2032  		case <-c.shutdownCh:
  2033  			return
  2034  		}
  2035  	}
  2036  }
  2037  
  2038  func (c *Client) consulDiscoveryImpl() error {
  2039  	// Acquire heartbeat lock to prevent heartbeat from running
  2040  	// concurrently with discovery. Concurrent execution is safe; however,
  2041  	// discovery is usually triggered when heartbeating has failed, so
  2042  	// there's no point in allowing it.
  2043  	c.heartbeatLock.Lock()
  2044  	defer c.heartbeatLock.Unlock()
  2045  
  2046  	consulCatalog := c.consulSyncer.ConsulClient().Catalog()
  2047  	dcs, err := consulCatalog.Datacenters()
  2048  	if err != nil {
  2049  		return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
  2050  	}
  2051  	if len(dcs) > 2 {
  2052  		// Query the local DC first, then shuffle the
  2053  		// remaining DCs.  Future heartbeats will cause Nomad
  2054  		// Clients to fixate on their local datacenter so
  2055  		// it's okay to talk with remote DCs.  If no
  2056  		// Nomad servers are available within
  2057  		// datacenterQueryLimit, the next heartbeat will pick
  2058  		// a new set of servers so it's okay.
  2059  		shuffleStrings(dcs[1:])
  2060  		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
  2061  	}
  2062  
  2063  	// Query for servers in this client's region only
  2064  	region := c.Region()
  2065  	rpcargs := structs.GenericRequest{
  2066  		QueryOptions: structs.QueryOptions{
  2067  			Region: region,
  2068  		},
  2069  	}
  2070  
  2071  	serviceName := c.configCopy.ConsulConfig.ServerServiceName
  2072  	var mErr multierror.Error
  2073  	var servers endpoints
  2074  	c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs)
  2075  DISCOLOOP:
  2076  	for _, dc := range dcs {
  2077  		consulOpts := &consulapi.QueryOptions{
  2078  			AllowStale: true,
  2079  			Datacenter: dc,
  2080  			Near:       "_agent",
  2081  			WaitTime:   consul.DefaultQueryWaitDuration,
  2082  		}
  2083  		consulServices, _, err := consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts)
  2084  		if err != nil {
  2085  			mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err))
  2086  			continue
  2087  		}
  2088  
  2089  		for _, s := range consulServices {
  2090  			port := strconv.Itoa(s.ServicePort)
  2091  			addrstr := s.ServiceAddress
  2092  			if addrstr == "" {
  2093  				addrstr = s.Address
  2094  			}
  2095  			addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port))
  2096  			if err != nil {
  2097  				mErr.Errors = append(mErr.Errors, err)
  2098  				continue
  2099  			}
  2100  			var peers []string
  2101  			if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil {
  2102  				mErr.Errors = append(mErr.Errors, err)
  2103  				continue
  2104  			}
  2105  
  2106  			// Successfully received the Server peers list of the correct
  2107  			// region
  2108  			for _, p := range peers {
  2109  				addr, err := net.ResolveTCPAddr("tcp", p)
  2110  				if err != nil {
  2111  					mErr.Errors = append(mErr.Errors, err)
        					continue
  2112  				}
  2113  				servers = append(servers, &endpoint{name: p, addr: addr})
  2114  			}
  2115  			if len(servers) > 0 {
  2116  				break DISCOLOOP
  2117  			}
  2118  		}
  2119  	}
  2120  	if len(servers) == 0 {
  2121  		if len(mErr.Errors) > 0 {
  2122  			return mErr.ErrorOrNil()
  2123  		}
  2124  		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs)
  2125  	}
  2126  
  2127  	c.logger.Printf("[INFO] client.consul: discovered the following servers: %s", servers)
  2128  	c.servers.set(servers)
  2129  
  2130  	// Notify waiting rpc calls. If a goroutine just failed an RPC call and
  2131  	// isn't receiving on this chan yet they'll still retry eventually.
  2132  	// This is a shortcircuit for the longer retry intervals.
  2133  	for {
  2134  		select {
  2135  		case c.serversDiscoveredCh <- struct{}{}:
  2136  		default:
  2137  			return nil
  2138  		}
  2139  	}
  2140  }
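
        // The closing loop above is a "release all current waiters" idiom: it
        // keeps sending on serversDiscoveredCh until no goroutine is ready to
        // receive, then returns. RPC retry loops that select on this channel
        // wake up immediately instead of sleeping out their full retry
        // interval.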
  2141  
  2142  // consulReaper periodically reaps unmatched domains from Consul. Intended to
  2143  // be called in its own goroutine. See consulReaperIntv for interval.
  2144  func (c *Client) consulReaper() {
  2145  	ticker := time.NewTicker(consulReaperIntv)
  2146  	defer ticker.Stop()
  2147  	lastok := true
  2148  	for {
  2149  		select {
  2150  		case <-ticker.C:
  2151  			if err := c.consulReaperImpl(); err != nil {
  2152  				if lastok {
  2153  					c.logger.Printf("[ERR] client.consul: error reaping services in consul: %v", err)
  2154  					lastok = false
  2155  				}
  2156  			} else {
  2157  				lastok = true
  2158  			}
  2159  		case <-c.shutdownCh:
  2160  			return
  2161  		}
  2162  	}
  2163  }
  2164  
  2165  // consulReaperImpl reaps unmatched domains from Consul.
  2166  func (c *Client) consulReaperImpl() error {
  2167  	const estInitialExecutorDomains = 8
  2168  
  2169  	// Create the domains to keep and add the server and client
  2170  	domains := make([]consul.ServiceDomain, 2, estInitialExecutorDomains)
  2171  	domains[0] = consul.ServerDomain
  2172  	domains[1] = consul.ClientDomain
  2173  
  2174  	for allocID, ar := range c.getAllocRunners() {
  2175  		ar.taskStatusLock.RLock()
  2176  		taskStates := copyTaskStates(ar.taskStates)
  2177  		ar.taskStatusLock.RUnlock()
  2178  		for taskName, taskState := range taskStates {
  2179  			// Only keep running tasks
  2180  			if taskState.State == structs.TaskStateRunning {
  2181  				d := consul.NewExecutorDomain(allocID, taskName)
  2182  				domains = append(domains, d)
  2183  			}
  2184  		}
  2185  	}
  2186  
  2187  	return c.consulSyncer.ReapUnmatched(domains)
  2188  }
  2189  
  2190  // emitStats collects host resource usage stats periodically
  2191  func (c *Client) emitStats() {
  2192  	// Start collecting host stats right away and then keep collecting every
  2193  	// collection interval
  2194  	next := time.NewTimer(0)
  2195  	defer next.Stop()
  2196  	for {
  2197  		select {
  2198  		case <-next.C:
  2199  			err := c.hostStatsCollector.Collect()
  2200  			next.Reset(c.config.StatsCollectionInterval)
  2201  			if err != nil {
  2202  				c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err)
  2203  				continue
  2204  			}
  2205  
  2206  			// Publish Node metrics if operator has opted in
  2207  			if c.config.PublishNodeMetrics {
  2208  				c.emitHostStats(c.hostStatsCollector.Stats())
  2209  			}
  2210  
  2211  			c.emitClientMetrics()
  2212  		case <-c.shutdownCh:
  2213  			return
  2214  		}
  2215  	}
  2216  }
  2217  
  2218  // emitHostStats pushes host resource usage stats to remote metrics collection sinks
  2219  func (c *Client) emitHostStats(hStats *stats.HostStats) {
  2220  	nodeID := c.Node().ID
  2221  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
  2222  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
  2223  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
  2224  	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free))
  2225  
  2226  	metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime))
  2227  
  2228  	for _, cpu := range hStats.CPU {
  2229  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total))
  2230  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User))
  2231  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle))
  2232  		metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System))
  2233  	}
  2234  
  2235  	for _, disk := range hStats.DiskStats {
  2236  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size))
  2237  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used))
  2238  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available))
  2239  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent))
  2240  		metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent))
  2241  	}
  2242  
  2243  	// Get all the resources for the node
  2244  	c.configLock.RLock()
  2245  	node := c.configCopy.Node
  2246  	c.configLock.RUnlock()
  2247  	total := node.Resources
  2248  	res := node.Reserved
  2249  	allocated := c.getAllocatedResources(node)
  2250  
  2251  	// Emit allocated
  2252  	metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.MemoryMB))
  2253  	metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.DiskMB))
  2254  	metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.CPU))
  2255  	metrics.SetGauge([]string{"client", "allocated", "iops", nodeID}, float32(allocated.IOPS))
  2256  
  2257  	for _, n := range allocated.Networks {
  2258  		metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits))
  2259  	}
  2260  
  2261  	// Emit unallocated
  2262  	unallocatedMem := total.MemoryMB - res.MemoryMB - allocated.MemoryMB
  2263  	unallocatedDisk := total.DiskMB - res.DiskMB - allocated.DiskMB
  2264  	unallocatedCpu := total.CPU - res.CPU - allocated.CPU
  2265  	unallocatedIops := total.IOPS - res.IOPS - allocated.IOPS
  2266  	metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem))
  2267  	metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk))
  2268  	metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu))
  2269  	metrics.SetGauge([]string{"client", "unallocated", "iops", nodeID}, float32(unallocatedIops))
  2270  
  2271  	for _, n := range allocated.Networks {
  2272  		// Skip devices for which we don't know the total; otherwise
  2273  		// the unallocated value below would go negative
  2274  		totalIdx := total.NetIndex(n)
  2275  		if totalIdx == -1 {
  2276  			continue
  2277  		}
  2278  
  2279  		unallocatedMbits := total.Networks[totalIdx].MBits - n.MBits
  2281  		metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits))
  2282  	}
  2283  }
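
        // With the default telemetry configuration these gauges surface under
        // the agent's metrics prefix, for example (illustrative; assumes the
        // default "nomad" prefix and go-metrics' dotted key flattening):
        //
        //	nomad.client.host.memory.<node-id>.total
        //	nomad.client.host.cpu.<node-id>.cpu0.idle
        //	nomad.client.unallocated.memory.<node-id>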
  2284  
  2285  // emitClientMetrics emits lower volume client metrics
  2286  func (c *Client) emitClientMetrics() {
  2287  	nodeID := c.Node().ID
  2288  
  2289  	// Emit allocation metrics
  2290  	c.migratingAllocsLock.Lock()
  2291  	migrating := len(c.migratingAllocs)
  2292  	c.migratingAllocsLock.Unlock()
  2293  
  2294  	c.blockedAllocsLock.Lock()
  2295  	blocked := len(c.blockedAllocations)
  2296  	c.blockedAllocsLock.Unlock()
  2297  
  2298  	pending, running, terminal := 0, 0, 0
  2299  	for _, ar := range c.getAllocRunners() {
  2300  		switch ar.Alloc().ClientStatus {
  2301  		case structs.AllocClientStatusPending:
  2302  			pending++
  2303  		case structs.AllocClientStatusRunning:
  2304  			running++
  2305  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  2306  			terminal++
  2307  		}
  2308  	}
  2309  
  2310  	metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating))
  2311  	metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked))
  2312  	metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending))
  2313  	metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running))
  2314  	metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal))
  2315  }
  2316  
  2317  func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resources {
  2318  	// Unfortunately the allocs only carry an IP, so we need to match each
  2319  	// one back to its network device
  2320  	cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks))
  2321  	for _, n := range selfNode.Resources.Networks {
  2322  		_, ipnet, err := net.ParseCIDR(n.CIDR)
  2323  		if err != nil {
  2324  			continue
  2325  		}
  2326  		cidrToDevice[ipnet] = n.Device
  2327  	}
  2328  
  2329  	// Sum the allocated resources
  2330  	allocs := c.allAllocs()
  2331  	var allocated structs.Resources
  2332  	allocatedDeviceMbits := make(map[string]int)
  2333  	for _, alloc := range allocs {
  2334  		if !alloc.TerminalStatus() {
  2335  			allocated.Add(alloc.Resources)
  2336  			for _, allocatedNetwork := range alloc.Resources.Networks {
  2337  				for cidr, dev := range cidrToDevice {
  2338  					ip := net.ParseIP(allocatedNetwork.IP)
  2339  					if cidr.Contains(ip) {
  2340  						allocatedDeviceMbits[dev] += allocatedNetwork.MBits
  2341  						break
  2342  					}
  2343  				}
  2344  			}
  2345  		}
  2346  	}
  2347  
  2348  	// Clear the networks
  2349  	allocated.Networks = nil
  2350  	for dev, speed := range allocatedDeviceMbits {
  2351  		net := &structs.NetworkResource{
  2352  			Device: dev,
  2353  			MBits:  speed,
  2354  		}
  2355  		allocated.Networks = append(allocated.Networks, net)
  2356  	}
  2357  
  2358  	return &allocated
  2359  }
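
        // A worked example of the CIDR matching above (hypothetical values):
        // if the node fingerprinted network "10.0.0.0/24" on device "eth0",
        // then a non-terminal alloc whose network resource claims IP
        // "10.0.0.17" and 50 MBits is matched to "eth0", and
        // allocatedDeviceMbits["eth0"] grows by 50.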
  2360  
  2361  // allAllocs returns all the allocations managed by the client
  2362  func (c *Client) allAllocs() map[string]*structs.Allocation {
  2363  	allocs := make(map[string]*structs.Allocation, 16)
  2364  	for _, ar := range c.getAllocRunners() {
  2365  		a := ar.Alloc()
  2366  		allocs[a.ID] = a
  2367  	}
  2368  	c.blockedAllocsLock.Lock()
  2369  	for _, alloc := range c.blockedAllocations {
  2370  		allocs[alloc.ID] = alloc
  2371  	}
  2372  	c.blockedAllocsLock.Unlock()
  2373  
  2374  	c.migratingAllocsLock.Lock()
  2375  	for _, ctrl := range c.migratingAllocs {
  2376  		allocs[ctrl.alloc.ID] = ctrl.alloc
  2377  	}
  2378  	c.migratingAllocsLock.Unlock()
  2379  	return allocs
  2380  }
  2381  
  2382  // resolveServer takes a server's address as a string and returns its
  2383  // resolved net.Addr, or an error.
  2384  func resolveServer(s string) (net.Addr, error) {
  2385  	const defaultClientPort = "4647" // default client RPC port
  2386  	host, port, err := net.SplitHostPort(s)
  2387  	if err != nil {
  2388  		if strings.Contains(err.Error(), "missing port") {
  2389  			host = s
  2390  			port = defaultClientPort
  2391  		} else {
  2392  			return nil, err
  2393  		}
  2394  	}
  2395  	return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
  2396  }
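
        // Illustrative behavior (sketch):
        //
        //	addr, _ := resolveServer("10.0.0.5")     // -> 10.0.0.5:4647 (default port)
        //	addr, _ = resolveServer("10.0.0.5:4648") // explicit port preserved
        //	_ = addr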