github.com/jrxfive/nomad@v0.6.1-0.20170802162750-1fef470e89bf/nomad/server.go

     1  package nomad
     2  
     3  import (
     4  	"crypto/tls"
     5  	"errors"
     6  	"fmt"
     7  	"io/ioutil"
     8  	"log"
     9  	"net"
    10  	"net/rpc"
    11  	"os"
    12  	"path/filepath"
    13  	"reflect"
    14  	"sort"
    15  	"strconv"
    16  	"sync"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	consulapi "github.com/hashicorp/consul/api"
    21  	"github.com/hashicorp/consul/lib"
    22  	"github.com/hashicorp/go-multierror"
    23  	"github.com/hashicorp/nomad/command/agent/consul"
    24  	"github.com/hashicorp/nomad/helper/tlsutil"
    25  	"github.com/hashicorp/nomad/nomad/deploymentwatcher"
    26  	"github.com/hashicorp/nomad/nomad/state"
    27  	"github.com/hashicorp/nomad/nomad/structs"
    28  	"github.com/hashicorp/raft"
    29  	"github.com/hashicorp/raft-boltdb"
    30  	"github.com/hashicorp/serf/serf"
    31  )
    32  
    33  const (
    34  	// datacenterQueryLimit sets the max number of DCs that a Nomad
    35  	// Server will query to find bootstrap_expect servers.
    36  	datacenterQueryLimit = 25
    37  
    38  	// maxStaleLeadership is the maximum time we will permit this Nomad
    39  	// Server to go without seeing a valid Raft leader.
    40  	maxStaleLeadership = 15 * time.Second
    41  
    42  	// peersPollInterval is used as the polling interval between attempts
    43  	// to query Consul for Nomad Servers.
    44  	peersPollInterval = 45 * time.Second
    45  
    46  	// peersPollJitterFactor is used to provide a slight amount of variance
    47  	// to the retry interval when querying Consul Servers.
    48  	peersPollJitterFactor = 2
    49  
    50  	raftState         = "raft/"
    51  	serfSnapshot      = "serf/snapshot"
    52  	snapshotsRetained = 2
    53  
    54  	// serverRPCCache controls how long we keep an idle connection open to a server
    55  	serverRPCCache = 2 * time.Minute
    56  
    57  	// serverMaxStreams controls how many idle streams we keep open to a server
    58  	serverMaxStreams = 64
    59  
    60  	// raftLogCacheSize is the maximum number of logs to cache in-memory.
    61  	// This is used to reduce disk I/O for the recently committed entries.
    62  	raftLogCacheSize = 512
    63  
    64  	// raftRemoveGracePeriod is how long we wait for a RemovePeer to
    65  	// replicate so that we can gracefully leave the cluster.
    66  	raftRemoveGracePeriod = 5 * time.Second
    67  
    68  	// defaultConsulDiscoveryInterval is how often to poll Consul for new
    69  	// servers if there is no leader.
    70  	defaultConsulDiscoveryInterval time.Duration = 3 * time.Second
    71  
    72  	// defaultConsulDiscoveryIntervalRetry is how often to poll Consul for
    73  	// new servers if there is no leader and the last Consul query failed.
    74  	defaultConsulDiscoveryIntervalRetry time.Duration = 9 * time.Second
    75  )
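        // Illustrative note: every Consul polling backoff below is computed as
        //
        //	peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
        //
        // which, with the constants above, is 45s plus up to 22.5s of random
        // jitter, i.e. one poll roughly every 45-67.5 seconds.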
    76  
    77  // Server is the Nomad server which manages the job queues,
    78  // schedulers, and notification bus for agents.
    79  type Server struct {
    80  	config *Config
    81  	logger *log.Logger
    82  
    83  	// Connection pool to other Nomad servers
    84  	connPool *ConnPool
    85  
    86  	// Endpoints holds our RPC endpoints
    87  	endpoints endpoints
    88  
    89  	// The raft instance is used among Nomad nodes within the
    90  	// region to protect operations that require strong consistency
    91  	leaderCh      <-chan bool
    92  	raft          *raft.Raft
    93  	raftLayer     *RaftLayer
    94  	raftStore     *raftboltdb.BoltStore
    95  	raftInmem     *raft.InmemStore
    96  	raftTransport *raft.NetworkTransport
    97  
    98  	// fsm is the state machine used with Raft
    99  	fsm *nomadFSM
   100  
   101  	// rpcListener is used to listen for incoming connections
   102  	rpcListener  net.Listener
   103  	rpcServer    *rpc.Server
   104  	rpcAdvertise net.Addr
   105  
   106  	// rpcTLS is the TLS config for incoming TLS requests
   107  	rpcTLS *tls.Config
   108  
   109  	// peers is used to track the known Nomad servers. This is
   110  	// used for region forwarding and clustering.
   111  	peers      map[string][]*serverParts
   112  	localPeers map[raft.ServerAddress]*serverParts
   113  	peerLock   sync.RWMutex
   114  
   115  	// serf is the Serf cluster containing only Nomad
   116  	// servers. This is used for multi-region federation
   117  	// and automatic clustering within regions.
   118  	serf *serf.Serf
   119  
   120  	// reconcileCh is used to pass events from the serf handler
   121  	// into the leader manager. Mostly used to handle when servers
   122  	// join/leave from the region.
   123  	reconcileCh chan serf.Member
   124  
   125  	// eventCh is used to receive events from the serf cluster
   126  	eventCh chan serf.Event
   127  
   128  	// BlockedEvals is used to manage evaluations that are blocked on node
   129  	// capacity changes.
   130  	blockedEvals *BlockedEvals
   131  
   132  	// deploymentWatcher is used to watch deployments and their allocations and
    133  	// make the required calls to continue to transition the deployment.
   134  	deploymentWatcher *deploymentwatcher.Watcher
   135  
   136  	// evalBroker is used to manage the in-progress evaluations
   137  	// that are waiting to be brokered to a sub-scheduler
   138  	evalBroker *EvalBroker
   139  
   140  	// periodicDispatcher is used to track and create evaluations for periodic jobs.
   141  	periodicDispatcher *PeriodicDispatch
   142  
   143  	// planQueue is used to manage the submitted allocation
   144  	// plans that are waiting to be assessed by the leader
   145  	planQueue *PlanQueue
   146  
   147  	// heartbeatTimers track the expiration time of each heartbeat that has
   148  	// a TTL. On expiration, the node status is updated to be 'down'.
   149  	heartbeatTimers     map[string]*time.Timer
   150  	heartbeatTimersLock sync.Mutex
   151  
   152  	// consulCatalog is used for discovering other Nomad Servers via Consul
   153  	consulCatalog consul.CatalogAPI
   154  
   155  	// vault is the client for communicating with Vault.
   156  	vault VaultClient
   157  
    158  	// Workers used for processing
   159  	workers []*Worker
   160  
   161  	left         bool
   162  	shutdown     bool
   163  	shutdownCh   chan struct{}
   164  	shutdownLock sync.Mutex
   165  }
   166  
   167  // Holds the RPC endpoints
   168  type endpoints struct {
   169  	Status     *Status
   170  	Node       *Node
   171  	Job        *Job
   172  	Eval       *Eval
   173  	Plan       *Plan
   174  	Alloc      *Alloc
   175  	Deployment *Deployment
   176  	Region     *Region
   177  	Periodic   *Periodic
   178  	System     *System
   179  	Operator   *Operator
   180  }
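        // Illustrative note: each endpoint above is registered with the net/rpc
        // server in setupRPC, so its exported methods are addressed by
        // "<Endpoint>.<Method>" service names, both over the wire and locally
        // through (*Server).RPC.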
   181  
   182  // NewServer is used to construct a new Nomad server from the
   183  // configuration, potentially returning an error
   184  func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logger) (*Server, error) {
   185  	// Check the protocol version
   186  	if err := config.CheckVersion(); err != nil {
   187  		return nil, err
   188  	}
   189  
   190  	// Create an eval broker
   191  	evalBroker, err := NewEvalBroker(
   192  		config.EvalNackTimeout,
   193  		config.EvalNackInitialReenqueueDelay,
   194  		config.EvalNackSubsequentReenqueueDelay,
   195  		config.EvalDeliveryLimit)
   196  	if err != nil {
   197  		return nil, err
   198  	}
   199  
   200  	// Create a new blocked eval tracker.
   201  	blockedEvals := NewBlockedEvals(evalBroker)
   202  
   203  	// Create a plan queue
   204  	planQueue, err := NewPlanQueue()
   205  	if err != nil {
   206  		return nil, err
   207  	}
   208  
   209  	// Configure TLS
   210  	var tlsWrap tlsutil.RegionWrapper
   211  	var incomingTLS *tls.Config
   212  	if config.TLSConfig.EnableRPC {
   213  		tlsConf := config.tlsConfig()
   214  		tw, err := tlsConf.OutgoingTLSWrapper()
   215  		if err != nil {
   216  			return nil, err
   217  		}
   218  		tlsWrap = tw
   219  
   220  		itls, err := tlsConf.IncomingTLSConfig()
   221  		if err != nil {
   222  			return nil, err
   223  		}
   224  		incomingTLS = itls
   225  	}
   226  
   227  	// Create the server
   228  	s := &Server{
   229  		config:        config,
   230  		consulCatalog: consulCatalog,
   231  		connPool:      NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap),
   232  		logger:        logger,
   233  		rpcServer:     rpc.NewServer(),
   234  		peers:         make(map[string][]*serverParts),
   235  		localPeers:    make(map[raft.ServerAddress]*serverParts),
   236  		reconcileCh:   make(chan serf.Member, 32),
   237  		eventCh:       make(chan serf.Event, 256),
   238  		evalBroker:    evalBroker,
   239  		blockedEvals:  blockedEvals,
   240  		planQueue:     planQueue,
   241  		rpcTLS:        incomingTLS,
   242  		shutdownCh:    make(chan struct{}),
   243  	}
   244  
   245  	// Create the periodic dispatcher for launching periodic jobs.
   246  	s.periodicDispatcher = NewPeriodicDispatch(s.logger, s)
   247  
   248  	// Setup Vault
   249  	if err := s.setupVaultClient(); err != nil {
   250  		s.Shutdown()
   251  		s.logger.Printf("[ERR] nomad: failed to setup Vault client: %v", err)
   252  		return nil, fmt.Errorf("Failed to setup Vault client: %v", err)
   253  	}
   254  
   255  	// Initialize the RPC layer
   256  	if err := s.setupRPC(tlsWrap); err != nil {
   257  		s.Shutdown()
   258  		s.logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err)
   259  		return nil, fmt.Errorf("Failed to start RPC layer: %v", err)
   260  	}
   261  
   262  	// Initialize the Raft server
   263  	if err := s.setupRaft(); err != nil {
   264  		s.Shutdown()
   265  		s.logger.Printf("[ERR] nomad: failed to start Raft: %s", err)
   266  		return nil, fmt.Errorf("Failed to start Raft: %v", err)
   267  	}
   268  
   269  	// Initialize the wan Serf
   270  	s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot)
   271  	if err != nil {
   272  		s.Shutdown()
   273  		s.logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err)
   274  		return nil, fmt.Errorf("Failed to start serf: %v", err)
   275  	}
   276  
   277  	// Initialize the scheduling workers
   278  	if err := s.setupWorkers(); err != nil {
   279  		s.Shutdown()
   280  		s.logger.Printf("[ERR] nomad: failed to start workers: %s", err)
   281  		return nil, fmt.Errorf("Failed to start workers: %v", err)
   282  	}
   283  
   284  	// Setup the Consul syncer
   285  	if err := s.setupConsulSyncer(); err != nil {
   286  		return nil, fmt.Errorf("failed to create server Consul syncer: %v", err)
   287  	}
   288  
   289  	// Setup the deployment watcher.
   290  	if err := s.setupDeploymentWatcher(); err != nil {
   291  		return nil, fmt.Errorf("failed to create deployment watcher: %v", err)
   292  	}
   293  
   294  	// Monitor leadership changes
   295  	go s.monitorLeadership()
   296  
   297  	// Start ingesting events for Serf
   298  	go s.serfEventHandler()
   299  
   300  	// Start the RPC listeners
   301  	go s.listen()
   302  
   303  	// Emit metrics for the eval broker
   304  	go evalBroker.EmitStats(time.Second, s.shutdownCh)
   305  
   306  	// Emit metrics for the plan queue
   307  	go planQueue.EmitStats(time.Second, s.shutdownCh)
   308  
   309  	// Emit metrics for the blocked eval tracker.
   310  	go blockedEvals.EmitStats(time.Second, s.shutdownCh)
   311  
   312  	// Emit metrics for the Vault client.
   313  	go s.vault.EmitStats(time.Second, s.shutdownCh)
   314  
   315  	// Emit metrics
   316  	go s.heartbeatStats()
   317  
   318  	// Done
   319  	return s, nil
   320  }
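        // Illustrative sketch (assumes a fully populated *Config named conf and
        // a consul.CatalogAPI implementation named catalog, both normally
        // supplied by the agent): constructing and tearing down a server might
        // look like:
        //
        //	logger := log.New(os.Stderr, "", log.LstdFlags)
        //	srv, err := NewServer(conf, catalog, logger)
        //	if err != nil {
        //		logger.Fatalf("[ERR] nomad: %v", err)
        //	}
        //	defer srv.Shutdown()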
   321  
   322  // Shutdown is used to shutdown the server
   323  func (s *Server) Shutdown() error {
   324  	s.logger.Printf("[INFO] nomad: shutting down server")
   325  	s.shutdownLock.Lock()
   326  	defer s.shutdownLock.Unlock()
   327  
   328  	if s.shutdown {
   329  		return nil
   330  	}
   331  
   332  	s.shutdown = true
   333  	close(s.shutdownCh)
   334  
   335  	if s.serf != nil {
   336  		s.serf.Shutdown()
   337  	}
   338  
   339  	if s.raft != nil {
   340  		s.raftTransport.Close()
   341  		s.raftLayer.Close()
   342  		future := s.raft.Shutdown()
   343  		if err := future.Error(); err != nil {
   344  			s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err)
   345  		}
   346  		if s.raftStore != nil {
   347  			s.raftStore.Close()
   348  		}
   349  	}
   350  
   351  	// Shutdown the RPC listener
   352  	if s.rpcListener != nil {
   353  		s.rpcListener.Close()
   354  	}
   355  
   356  	// Close the connection pool
   357  	s.connPool.Shutdown()
   358  
   359  	// Close the fsm
   360  	if s.fsm != nil {
   361  		s.fsm.Close()
   362  	}
   363  
   364  	// Stop Vault token renewal
   365  	if s.vault != nil {
   366  		s.vault.Stop()
   367  	}
   368  
   369  	return nil
   370  }
   371  
   372  // IsShutdown checks if the server is shutdown
   373  func (s *Server) IsShutdown() bool {
   374  	select {
   375  	case <-s.shutdownCh:
   376  		return true
   377  	default:
   378  		return false
   379  	}
   380  }
   381  
   382  // Leave is used to prepare for a graceful shutdown of the server
   383  func (s *Server) Leave() error {
   384  	s.logger.Printf("[INFO] nomad: server starting leave")
   385  	s.left = true
   386  
   387  	// Check the number of known peers
   388  	numPeers, err := s.numPeers()
   389  	if err != nil {
   390  		s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err)
   391  		return err
   392  	}
   393  
   394  	// TODO (alexdadgar) - This will need to be updated once we support node
   395  	// IDs.
   396  	addr := s.raftTransport.LocalAddr()
   397  
   398  	// If we are the current leader, and we have any other peers (cluster has multiple
   399  	// servers), we should do a RemovePeer to safely reduce the quorum size. If we are
   400  	// not the leader, then we should issue our leave intention and wait to be removed
   401  	// for some sane period of time.
   402  	isLeader := s.IsLeader()
   403  	if isLeader && numPeers > 1 {
   404  		future := s.raft.RemovePeer(addr)
   405  		if err := future.Error(); err != nil {
   406  			s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
   407  		}
   408  	}
   409  
   410  	// Leave the gossip pool
   411  	if s.serf != nil {
   412  		if err := s.serf.Leave(); err != nil {
   413  			s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err)
   414  		}
   415  	}
   416  
   417  	// If we were not leader, wait to be safely removed from the cluster.
   418  	// We must wait to allow the raft replication to take place, otherwise
   419  	// an immediate shutdown could cause a loss of quorum.
   420  	if !isLeader {
   421  		left := false
   422  		limit := time.Now().Add(raftRemoveGracePeriod)
   423  		for !left && time.Now().Before(limit) {
   424  			// Sleep a while before we check.
   425  			time.Sleep(50 * time.Millisecond)
   426  
   427  			// Get the latest configuration.
   428  			future := s.raft.GetConfiguration()
   429  			if err := future.Error(); err != nil {
   430  				s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
   431  				break
   432  			}
   433  
   434  			// See if we are no longer included.
   435  			left = true
   436  			for _, server := range future.Configuration().Servers {
   437  				if server.Address == addr {
   438  					left = false
   439  					break
   440  				}
   441  			}
   442  		}
   443  
   444  		// TODO (alexdadgar) With the old Raft library we used to force the
   445  		// peers set to empty when a graceful leave occurred. This would
   446  		// keep voting spam down if the server was restarted, but it was
   447  		// dangerous because the peers was inconsistent with the logs and
   448  		// snapshots, so it wasn't really safe in all cases for the server
   449  		// to become leader. This is now safe, but the log spam is noisy.
   450  		// The next new version of the library will have a "you are not a
   451  		// peer stop it" behavior that should address this. We will have
   452  		// to evaluate during the RC period if this interim situation is
   453  		// not too confusing for operators.
   454  
   455  		// TODO (alexdadgar) When we take a later new version of the Raft
   456  		// library it won't try to complete replication, so this peer
   457  		// may not realize that it has been removed. Need to revisit this
   458  		// and the warning here.
   459  		if !left {
   460  			s.logger.Printf("[WARN] nomad: failed to leave raft configuration gracefully, timeout")
   461  		}
   462  	}
   463  	return nil
   464  }
   465  
   466  // Reload handles a config reload. Not all config fields can handle a reload.
   467  func (s *Server) Reload(config *Config) error {
   468  	if config == nil {
   469  		return fmt.Errorf("Reload given a nil config")
   470  	}
   471  
   472  	var mErr multierror.Error
   473  
   474  	// Handle the Vault reload. Vault should never be nil but just guard.
   475  	if s.vault != nil {
   476  		if err := s.vault.SetConfig(config.VaultConfig); err != nil {
   477  			multierror.Append(&mErr, err)
   478  		}
   479  	}
   480  
   481  	return mErr.ErrorOrNil()
   482  }
   483  
   484  // setupBootstrapHandler() creates the closure necessary to support a Consul
   485  // fallback handler.
   486  func (s *Server) setupBootstrapHandler() error {
   487  	// peersTimeout is used to indicate to the Consul Syncer that the
   488  	// current Nomad Server has a stale peer set.  peersTimeout will time
   489  	// out if the Consul Syncer bootstrapFn has not observed a Raft
   490  	// leader in maxStaleLeadership.  If peersTimeout has been triggered,
   491  	// the Consul Syncer will begin querying Consul for other Nomad
   492  	// Servers.
   493  	//
   494  	// NOTE: time.Timer is used vs time.Time in order to handle clock
   495  	// drift because time.Timer is implemented as a monotonic clock.
   496  	var peersTimeout *time.Timer = time.NewTimer(0)
   497  
   498  	// consulQueryCount is the number of times the bootstrapFn has been
   499  	// called, regardless of success.
   500  	var consulQueryCount uint64
   501  
   502  	// leadershipTimedOut is a helper method that returns true if the
   503  	// peersTimeout timer has expired.
   504  	leadershipTimedOut := func() bool {
   505  		select {
   506  		case <-peersTimeout.C:
   507  			return true
   508  		default:
   509  			return false
   510  		}
   511  	}
   512  
   513  	// The bootstrapFn callback handler is used to periodically poll
   514  	// Consul to look up the Nomad Servers in Consul.  In the event the
   515  	// server has been brought up without a `retry-join` configuration
   516  	// and this Server is partitioned from the rest of the cluster,
   517  	// periodically poll Consul to reattach this Server to other servers
   518  	// in the same region and automatically reform a quorum (assuming the
   519  	// correct number of servers required for quorum are present).
   520  	bootstrapFn := func() error {
   521  		// If there is a raft leader, do nothing
   522  		if s.raft.Leader() != "" {
   523  			peersTimeout.Reset(maxStaleLeadership)
   524  			return nil
   525  		}
   526  
    527  		// (ab)use serf.go's behavior of setting BootstrapExpect to
    528  		// zero once this server has bootstrapped.
   529  		bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
   530  		if bootstrapExpect == 0 {
   531  			// This Nomad Server has been bootstrapped.  Rely on
   532  			// the peersTimeout firing as a guard to prevent
   533  			// aggressive querying of Consul.
   534  			if !leadershipTimedOut() {
   535  				return nil
   536  			}
   537  		} else {
   538  			if consulQueryCount > 0 && !leadershipTimedOut() {
   539  				return nil
   540  			}
   541  
   542  			// This Nomad Server has not been bootstrapped, reach
   543  			// out to Consul if our peer list is less than
   544  			// `bootstrap_expect`.
   545  			raftPeers, err := s.numPeers()
   546  			if err != nil {
   547  				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
   548  				return nil
   549  			}
   550  
   551  			// The necessary number of Nomad Servers required for
   552  			// quorum has been reached, we do not need to poll
   553  			// Consul.  Let the normal timeout-based strategy
   554  			// take over.
   555  			if raftPeers >= int(bootstrapExpect) {
   556  				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
   557  				return nil
   558  			}
   559  		}
   560  		consulQueryCount++
   561  
   562  		s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list")
   563  
   564  		dcs, err := s.consulCatalog.Datacenters()
   565  		if err != nil {
   566  			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
   567  			return fmt.Errorf("server.nomad: unable to query Consul datacenters: %v", err)
   568  		}
   569  		if len(dcs) > 2 {
   570  			// Query the local DC first, then shuffle the
   571  			// remaining DCs.  If additional calls to bootstrapFn
   572  			// are necessary, this Nomad Server will eventually
    573  			// walk all datacenters until it finds enough hosts to
   574  			// form a quorum.
   575  			shuffleStrings(dcs[1:])
   576  			dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
   577  		}
   578  
   579  		nomadServerServiceName := s.config.ConsulConfig.ServerServiceName
   580  		var mErr multierror.Error
   581  		const defaultMaxNumNomadServers = 8
   582  		nomadServerServices := make([]string, 0, defaultMaxNumNomadServers)
   583  		localNode := s.serf.Memberlist().LocalNode()
   584  		for _, dc := range dcs {
   585  			consulOpts := &consulapi.QueryOptions{
   586  				AllowStale: true,
   587  				Datacenter: dc,
   588  				Near:       "_agent",
   589  				WaitTime:   consul.DefaultQueryWaitDuration,
   590  			}
   591  			consulServices, _, err := s.consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts)
   592  			if err != nil {
   593  				err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err)
   594  				s.logger.Printf("[WARN] server.nomad: %v", err)
   595  				mErr.Errors = append(mErr.Errors, err)
   596  				continue
   597  			}
   598  
   599  			for _, cs := range consulServices {
   600  				port := strconv.FormatInt(int64(cs.ServicePort), 10)
   601  				addr := cs.ServiceAddress
   602  				if addr == "" {
   603  					addr = cs.Address
   604  				}
   605  				if localNode.Addr.String() == addr && int(localNode.Port) == cs.ServicePort {
   606  					continue
   607  				}
   608  				serverAddr := net.JoinHostPort(addr, port)
   609  				nomadServerServices = append(nomadServerServices, serverAddr)
   610  			}
   611  		}
   612  
   613  		if len(nomadServerServices) == 0 {
   614  			if len(mErr.Errors) > 0 {
   615  				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
   616  				return mErr.ErrorOrNil()
   617  			}
   618  
   619  			// Log the error and return nil so future handlers
   620  			// can attempt to register the `nomad` service.
   621  			pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
   622  			s.logger.Printf("[TRACE] server.nomad: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval)
   623  			peersTimeout.Reset(pollInterval)
   624  			return nil
   625  		}
   626  
   627  		numServersContacted, err := s.Join(nomadServerServices)
   628  		if err != nil {
   629  			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
   630  			return fmt.Errorf("contacted %d Nomad Servers: %v", numServersContacted, err)
   631  		}
   632  
   633  		peersTimeout.Reset(maxStaleLeadership)
   634  		s.logger.Printf("[INFO] server.nomad: successfully contacted %d Nomad Servers", numServersContacted)
   635  
   636  		return nil
   637  	}
   638  
   639  	// Hacky replacement for old ConsulSyncer Periodic Handler.
   640  	go func() {
   641  		lastOk := true
   642  		sync := time.NewTimer(0)
   643  		for {
   644  			select {
   645  			case <-sync.C:
   646  				d := defaultConsulDiscoveryInterval
   647  				if err := bootstrapFn(); err != nil {
   648  					// Only log if it worked last time
   649  					if lastOk {
   650  						lastOk = false
   651  						s.logger.Printf("[ERR] consul: error looking up Nomad servers: %v", err)
   652  					}
   653  					d = defaultConsulDiscoveryIntervalRetry
   654  				}
   655  				sync.Reset(d)
   656  			case <-s.shutdownCh:
   657  				return
   658  			}
   659  		}
   660  	}()
   661  	return nil
   662  }
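        // Summary of the fallback loop above: bootstrapFn runs every
        // defaultConsulDiscoveryInterval (3s), or every
        // defaultConsulDiscoveryIntervalRetry (9s) after a failure, but it only
        // queries Consul while there is no Raft leader and the peersTimeout
        // timer (armed with maxStaleLeadership, 15s, or with the jittered poll
        // interval) has fired; before bootstrap it additionally requires fewer
        // than bootstrap_expect known Raft peers.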
   663  
    664  // setupConsulSyncer sets up the server's Consul integration; currently it
    665  // only installs the bootstrap fallback handler when server_auto_join is set.
   666  func (s *Server) setupConsulSyncer() error {
   667  	if s.config.ConsulConfig.ServerAutoJoin != nil && *s.config.ConsulConfig.ServerAutoJoin {
   668  		if err := s.setupBootstrapHandler(); err != nil {
   669  			return err
   670  		}
   671  	}
   672  
   673  	return nil
   674  }
   675  
   676  // setupDeploymentWatcher creates a deployment watcher that consumes the RPC
    677  // endpoints for state information and makes transitions via Raft through a
   678  // shim that provides the appropriate methods.
   679  func (s *Server) setupDeploymentWatcher() error {
   680  
   681  	// Create the shims
   682  	stateShim := &deploymentWatcherStateShim{
   683  		region:         s.Region(),
   684  		evaluations:    s.endpoints.Job.Evaluations,
   685  		allocations:    s.endpoints.Deployment.Allocations,
   686  		list:           s.endpoints.Deployment.List,
   687  		getDeployment:  s.endpoints.Deployment.GetDeployment,
   688  		getJobVersions: s.endpoints.Job.GetJobVersions,
   689  		getJob:         s.endpoints.Job.GetJob,
   690  	}
   691  	raftShim := &deploymentWatcherRaftShim{
   692  		apply: s.raftApply,
   693  	}
   694  
   695  	// Create the deployment watcher
   696  	s.deploymentWatcher = deploymentwatcher.NewDeploymentsWatcher(
   697  		s.logger, stateShim, raftShim,
   698  		deploymentwatcher.LimitStateQueriesPerSecond,
   699  		deploymentwatcher.CrossDeploymentEvalBatchDuration)
   700  
   701  	return nil
   702  }
   703  
   704  // setupVaultClient is used to set up the Vault API client.
   705  func (s *Server) setupVaultClient() error {
   706  	v, err := NewVaultClient(s.config.VaultConfig, s.logger, s.purgeVaultAccessors)
   707  	if err != nil {
   708  		return err
   709  	}
   710  	s.vault = v
   711  	return nil
   712  }
   713  
   714  // setupRPC is used to setup the RPC listener
   715  func (s *Server) setupRPC(tlsWrap tlsutil.RegionWrapper) error {
   716  	// Create endpoints
   717  	s.endpoints.Alloc = &Alloc{s}
   718  	s.endpoints.Eval = &Eval{s}
   719  	s.endpoints.Job = &Job{s}
   720  	s.endpoints.Node = &Node{srv: s}
   721  	s.endpoints.Deployment = &Deployment{srv: s}
   722  	s.endpoints.Operator = &Operator{s}
   723  	s.endpoints.Periodic = &Periodic{s}
   724  	s.endpoints.Plan = &Plan{s}
   725  	s.endpoints.Region = &Region{s}
   726  	s.endpoints.Status = &Status{s}
   727  	s.endpoints.System = &System{s}
   728  
   729  	// Register the handlers
   730  	s.rpcServer.Register(s.endpoints.Alloc)
   731  	s.rpcServer.Register(s.endpoints.Eval)
   732  	s.rpcServer.Register(s.endpoints.Job)
   733  	s.rpcServer.Register(s.endpoints.Node)
   734  	s.rpcServer.Register(s.endpoints.Deployment)
   735  	s.rpcServer.Register(s.endpoints.Operator)
   736  	s.rpcServer.Register(s.endpoints.Periodic)
   737  	s.rpcServer.Register(s.endpoints.Plan)
   738  	s.rpcServer.Register(s.endpoints.Region)
   739  	s.rpcServer.Register(s.endpoints.Status)
   740  	s.rpcServer.Register(s.endpoints.System)
   741  
   742  	list, err := net.ListenTCP("tcp", s.config.RPCAddr)
   743  	if err != nil {
   744  		return err
   745  	}
   746  	s.rpcListener = list
   747  
   748  	if s.config.RPCAdvertise != nil {
   749  		s.rpcAdvertise = s.config.RPCAdvertise
   750  	} else {
   751  		s.rpcAdvertise = s.rpcListener.Addr()
   752  	}
   753  
   754  	// Verify that we have a usable advertise address
   755  	addr, ok := s.rpcAdvertise.(*net.TCPAddr)
   756  	if !ok {
   757  		list.Close()
   758  		return fmt.Errorf("RPC advertise address is not a TCP Address: %v", addr)
   759  	}
   760  	if addr.IP.IsUnspecified() {
   761  		list.Close()
   762  		return fmt.Errorf("RPC advertise address is not advertisable: %v", addr)
   763  	}
   764  
   765  	wrapper := tlsutil.RegionSpecificWrapper(s.config.Region, tlsWrap)
   766  	s.raftLayer = NewRaftLayer(s.rpcAdvertise, wrapper)
   767  	return nil
   768  }
   769  
   770  // setupRaft is used to setup and initialize Raft
   771  func (s *Server) setupRaft() error {
   772  	// If we have an unclean exit then attempt to close the Raft store.
   773  	defer func() {
   774  		if s.raft == nil && s.raftStore != nil {
   775  			if err := s.raftStore.Close(); err != nil {
   776  				s.logger.Printf("[ERR] nomad: failed to close Raft store: %v", err)
   777  			}
   778  		}
   779  	}()
   780  
   781  	// Create the FSM
   782  	var err error
   783  	s.fsm, err = NewFSM(s.evalBroker, s.periodicDispatcher, s.blockedEvals, s.config.LogOutput)
   784  	if err != nil {
   785  		return err
   786  	}
   787  
   788  	// Create a transport layer
   789  	trans := raft.NewNetworkTransport(s.raftLayer, 3, s.config.RaftTimeout,
   790  		s.config.LogOutput)
   791  	s.raftTransport = trans
   792  
   793  	// Make sure we set the LogOutput.
   794  	s.config.RaftConfig.LogOutput = s.config.LogOutput
   795  
   796  	// Our version of Raft protocol requires the LocalID to match the network
   797  	// address of the transport.
   798  	s.config.RaftConfig.LocalID = raft.ServerID(trans.LocalAddr())
   799  
   800  	// Build an all in-memory setup for dev mode, otherwise prepare a full
   801  	// disk-based setup.
   802  	var log raft.LogStore
   803  	var stable raft.StableStore
   804  	var snap raft.SnapshotStore
   805  	if s.config.DevMode {
   806  		store := raft.NewInmemStore()
   807  		s.raftInmem = store
   808  		stable = store
   809  		log = store
   810  		snap = raft.NewDiscardSnapshotStore()
   811  
   812  	} else {
   813  		// Create the base raft path
   814  		path := filepath.Join(s.config.DataDir, raftState)
   815  		if err := ensurePath(path, true); err != nil {
   816  			return err
   817  		}
   818  
   819  		// Create the BoltDB backend
   820  		store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
   821  		if err != nil {
   822  			return err
   823  		}
   824  		s.raftStore = store
   825  		stable = store
   826  
   827  		// Wrap the store in a LogCache to improve performance
   828  		cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
   829  		if err != nil {
   830  			store.Close()
   831  			return err
   832  		}
   833  		log = cacheStore
   834  
   835  		// Create the snapshot store
   836  		snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput)
   837  		if err != nil {
   838  			if s.raftStore != nil {
   839  				s.raftStore.Close()
   840  			}
   841  			return err
   842  		}
   843  		snap = snapshots
   844  
   845  		// For an existing cluster being upgraded to the new version of
   846  		// Raft, we almost never want to run recovery based on the old
   847  		// peers.json file. We create a peers.info file with a helpful
   848  		// note about where peers.json went, and use that as a sentinel
   849  		// to avoid ingesting the old one that first time (if we have to
   850  		// create the peers.info file because it's not there, we also
   851  		// blow away any existing peers.json file).
   852  		peersFile := filepath.Join(path, "peers.json")
   853  		peersInfoFile := filepath.Join(path, "peers.info")
   854  		if _, err := os.Stat(peersInfoFile); os.IsNotExist(err) {
   855  			if err := ioutil.WriteFile(peersInfoFile, []byte(peersInfoContent), 0755); err != nil {
   856  				return fmt.Errorf("failed to write peers.info file: %v", err)
   857  			}
   858  
   859  			// Blow away the peers.json file if present, since the
   860  			// peers.info sentinel wasn't there.
   861  			if _, err := os.Stat(peersFile); err == nil {
   862  				if err := os.Remove(peersFile); err != nil {
   863  					return fmt.Errorf("failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
   864  				}
   865  				s.logger.Printf("[INFO] nomad: deleted peers.json file (see peers.info for details)")
   866  			}
   867  		} else if _, err := os.Stat(peersFile); err == nil {
   868  			s.logger.Printf("[INFO] nomad: found peers.json file, recovering Raft configuration...")
   869  			configuration, err := raft.ReadPeersJSON(peersFile)
   870  			if err != nil {
   871  				return fmt.Errorf("recovery failed to parse peers.json: %v", err)
   872  			}
   873  			tmpFsm, err := NewFSM(s.evalBroker, s.periodicDispatcher, s.blockedEvals, s.config.LogOutput)
   874  			if err != nil {
   875  				return fmt.Errorf("recovery failed to make temp FSM: %v", err)
   876  			}
   877  			if err := raft.RecoverCluster(s.config.RaftConfig, tmpFsm,
   878  				log, stable, snap, trans, configuration); err != nil {
   879  				return fmt.Errorf("recovery failed: %v", err)
   880  			}
   881  			if err := os.Remove(peersFile); err != nil {
   882  				return fmt.Errorf("recovery failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
   883  			}
   884  			s.logger.Printf("[INFO] nomad: deleted peers.json file after successful recovery")
   885  		}
   886  	}
   887  
   888  	// If we are in bootstrap or dev mode and the state is clean then we can
   889  	// bootstrap now.
   890  	if s.config.Bootstrap || s.config.DevMode {
   891  		hasState, err := raft.HasExistingState(log, stable, snap)
   892  		if err != nil {
   893  			return err
   894  		}
   895  		if !hasState {
   896  			// TODO (alexdadgar) - This will need to be updated when
   897  			// we add support for node IDs.
   898  			configuration := raft.Configuration{
   899  				Servers: []raft.Server{
   900  					raft.Server{
   901  						ID:      raft.ServerID(trans.LocalAddr()),
   902  						Address: trans.LocalAddr(),
   903  					},
   904  				},
   905  			}
   906  			if err := raft.BootstrapCluster(s.config.RaftConfig,
   907  				log, stable, snap, trans, configuration); err != nil {
   908  				return err
   909  			}
   910  		}
   911  	}
   912  
   913  	// Setup the leader channel
   914  	leaderCh := make(chan bool, 1)
   915  	s.config.RaftConfig.NotifyCh = leaderCh
   916  	s.leaderCh = leaderCh
   917  
   918  	// Setup the Raft store
   919  	s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable, snap, trans)
   920  	if err != nil {
   921  		return err
   922  	}
   923  	return nil
   924  }
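        // Note on the stores chosen above: in DevMode the log, stable store,
        // and snapshots are all in-memory (raft.InmemStore plus a
        // DiscardSnapshotStore), so Raft state does not survive a restart;
        // otherwise they live on disk under <data_dir>/raft/ as raft.db plus up
        // to snapshotsRetained (2) file snapshots.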
   925  
   926  // setupSerf is used to setup and initialize a Serf
   927  func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) {
   928  	conf.Init()
   929  	conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region)
   930  	conf.Tags["role"] = "nomad"
   931  	conf.Tags["region"] = s.config.Region
   932  	conf.Tags["dc"] = s.config.Datacenter
   933  	conf.Tags["vsn"] = fmt.Sprintf("%d", structs.ApiMajorVersion)
   934  	conf.Tags["mvn"] = fmt.Sprintf("%d", structs.ApiMinorVersion)
   935  	conf.Tags["build"] = s.config.Build
   936  	conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port)
   937  	if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) {
   938  		conf.Tags["bootstrap"] = "1"
   939  	}
   940  	bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
   941  	if bootstrapExpect != 0 {
   942  		conf.Tags["expect"] = fmt.Sprintf("%d", bootstrapExpect)
   943  	}
   944  	conf.MemberlistConfig.LogOutput = s.config.LogOutput
   945  	conf.LogOutput = s.config.LogOutput
   946  	conf.EventCh = ch
   947  	if !s.config.DevMode {
   948  		conf.SnapshotPath = filepath.Join(s.config.DataDir, path)
   949  		if err := ensurePath(conf.SnapshotPath, false); err != nil {
   950  			return nil, err
   951  		}
   952  	}
   953  	conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion]
   954  	conf.RejoinAfterLeave = true
   955  	conf.Merge = &serfMergeDelegate{}
   956  
   957  	// Until Nomad supports this fully, we disable automatic resolution.
   958  	// When enabled, the Serf gossip may just turn off if we are the minority
   959  	// node which is rather unexpected.
   960  	conf.EnableNameConflictResolution = false
   961  	return serf.Create(conf)
   962  }
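        // Illustrative note: the tags set above (role, region, dc, vsn, mvn,
        // build, port, and optionally bootstrap/expect) are what peer servers
        // parse out of Serf member metadata to recognize this node as a Nomad
        // server and learn its RPC port.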
   963  
   964  // setupWorkers is used to start the scheduling workers
   965  func (s *Server) setupWorkers() error {
   966  	// Check if all the schedulers are disabled
   967  	if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 {
   968  		s.logger.Printf("[WARN] nomad: no enabled schedulers")
   969  		return nil
   970  	}
   971  
   972  	// Start the workers
   973  	for i := 0; i < s.config.NumSchedulers; i++ {
   974  		if w, err := NewWorker(s); err != nil {
   975  			return err
   976  		} else {
   977  			s.workers = append(s.workers, w)
   978  		}
   979  	}
   980  	s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v",
   981  		s.config.NumSchedulers, s.config.EnabledSchedulers)
   982  	return nil
   983  }
   984  
   985  // numPeers is used to check on the number of known peers, including the local
   986  // node.
   987  func (s *Server) numPeers() (int, error) {
   988  	future := s.raft.GetConfiguration()
   989  	if err := future.Error(); err != nil {
   990  		return 0, err
   991  	}
   992  	configuration := future.Configuration()
   993  	return len(configuration.Servers), nil
   994  }
   995  
   996  // IsLeader checks if this server is the cluster leader
   997  func (s *Server) IsLeader() bool {
   998  	return s.raft.State() == raft.Leader
   999  }
  1000  
  1001  // Join is used to have Nomad join the gossip ring
  1002  // The target address should be another node listening on the
  1003  // Serf address
  1004  func (s *Server) Join(addrs []string) (int, error) {
  1005  	return s.serf.Join(addrs, true)
  1006  }
  1007  
  1008  // LocalMember is used to return the local node
  1009  func (c *Server) LocalMember() serf.Member {
  1010  	return c.serf.LocalMember()
  1011  }
  1012  
  1013  // Members is used to return the members of the serf cluster
  1014  func (s *Server) Members() []serf.Member {
  1015  	return s.serf.Members()
  1016  }
  1017  
  1018  // RemoveFailedNode is used to remove a failed node from the cluster
  1019  func (s *Server) RemoveFailedNode(node string) error {
  1020  	return s.serf.RemoveFailedNode(node)
  1021  }
  1022  
  1023  // KeyManager returns the Serf keyring manager
  1024  func (s *Server) KeyManager() *serf.KeyManager {
  1025  	return s.serf.KeyManager()
  1026  }
  1027  
  1028  // Encrypted determines if gossip is encrypted
  1029  func (s *Server) Encrypted() bool {
  1030  	return s.serf.EncryptionEnabled()
  1031  }
  1032  
  1033  // State returns the underlying state store. This should *not*
  1034  // be used to modify state directly.
  1035  func (s *Server) State() *state.StateStore {
  1036  	return s.fsm.State()
  1037  }
  1038  
  1039  // Regions returns the known regions in the cluster.
  1040  func (s *Server) Regions() []string {
  1041  	s.peerLock.RLock()
  1042  	defer s.peerLock.RUnlock()
  1043  
  1044  	regions := make([]string, 0, len(s.peers))
  1045  	for region := range s.peers {
  1046  		regions = append(regions, region)
  1047  	}
  1048  	sort.Strings(regions)
  1049  	return regions
  1050  }
  1051  
  1052  // inmemCodec is used to do an RPC call without going over a network
  1053  type inmemCodec struct {
  1054  	method string
  1055  	args   interface{}
  1056  	reply  interface{}
  1057  	err    error
  1058  }
  1059  
  1060  func (i *inmemCodec) ReadRequestHeader(req *rpc.Request) error {
  1061  	req.ServiceMethod = i.method
  1062  	return nil
  1063  }
  1064  
  1065  func (i *inmemCodec) ReadRequestBody(args interface{}) error {
  1066  	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.args)))
  1067  	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(args)))
  1068  	dst.Set(sourceValue)
  1069  	return nil
  1070  }
  1071  
  1072  func (i *inmemCodec) WriteResponse(resp *rpc.Response, reply interface{}) error {
  1073  	if resp.Error != "" {
  1074  		i.err = errors.New(resp.Error)
  1075  		return nil
  1076  	}
  1077  	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(reply)))
  1078  	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.reply)))
  1079  	dst.Set(sourceValue)
  1080  	return nil
  1081  }
  1082  
  1083  func (i *inmemCodec) Close() error {
  1084  	return nil
  1085  }
  1086  
  1087  // RPC is used to make a local RPC call
  1088  func (s *Server) RPC(method string, args interface{}, reply interface{}) error {
  1089  	codec := &inmemCodec{
  1090  		method: method,
  1091  		args:   args,
  1092  		reply:  reply,
  1093  	}
  1094  	if err := s.rpcServer.ServeRequest(codec); err != nil {
  1095  		return err
  1096  	}
  1097  	return codec.err
  1098  }
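        // Illustrative sketch: a local call flows through the in-memory codec
        // exactly like a remote one, addressed by the registered
        // "<Endpoint>.<Method>" name; for example, listing jobs in this
        // server's region might look like:
        //
        //	var out structs.JobListResponse
        //	args := &structs.JobListRequest{
        //		QueryOptions: structs.QueryOptions{Region: s.Region()},
        //	}
        //	if err := s.RPC("Job.List", args, &out); err != nil {
        //		// handle the error
        //	}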
  1099  
  1100  // Stats is used to return statistics for debugging and insight
  1101  // into various sub-systems
  1102  func (s *Server) Stats() map[string]map[string]string {
  1103  	toString := func(v uint64) string {
  1104  		return strconv.FormatUint(v, 10)
  1105  	}
  1106  	stats := map[string]map[string]string{
  1107  		"nomad": map[string]string{
  1108  			"server":        "true",
  1109  			"leader":        fmt.Sprintf("%v", s.IsLeader()),
  1110  			"leader_addr":   string(s.raft.Leader()),
  1111  			"bootstrap":     fmt.Sprintf("%v", s.config.Bootstrap),
  1112  			"known_regions": toString(uint64(len(s.peers))),
  1113  		},
  1114  		"raft":    s.raft.Stats(),
  1115  		"serf":    s.serf.Stats(),
  1116  		"runtime": RuntimeStats(),
  1117  	}
  1118  
  1119  	return stats
  1120  }
  1121  
  1122  // Region returns the region of the server
  1123  func (s *Server) Region() string {
  1124  	return s.config.Region
  1125  }
  1126  
  1127  // Datacenter returns the data center of the server
  1128  func (s *Server) Datacenter() string {
  1129  	return s.config.Datacenter
  1130  }
  1131  
  1132  // GetConfig returns the config of the server for testing purposes only
  1133  func (s *Server) GetConfig() *Config {
  1134  	return s.config
  1135  }
  1136  
  1137  // peersInfoContent is used to help operators understand what happened to the
  1138  // peers.json file. This is written to a file called peers.info in the same
  1139  // location.
  1140  const peersInfoContent = `
  1141  As of Nomad 0.5.5, the peers.json file is only used for recovery
  1142  after an outage. It should be formatted as a JSON array containing the address
  1143  and port of each Nomad server in the cluster, like this:
  1144  
  1145  ["10.1.0.1:4647","10.1.0.2:4647","10.1.0.3:4647"]
  1146  
  1147  Under normal operation, the peers.json file will not be present.
  1148  
  1149  When Nomad starts for the first time, it will create this peers.info file and
  1150  delete any existing peers.json file so that recovery doesn't occur on the first
  1151  startup.
  1152  
  1153  Once this peers.info file is present, any peers.json file will be ingested at
  1154  startup, and will set the Raft peer configuration manually to recover from an
  1155  outage. It's crucial that all servers in the cluster are shut down before
  1156  creating the peers.json file, and that all servers receive the same
  1157  configuration. Once the peers.json file is successfully ingested and applied, it
  1158  will be deleted.
  1159  
  1160  Please see https://www.nomadproject.io/guides/outage.html for more information.
  1161  `