github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/nomad/server.go (about) 1 package nomad 2 3 import ( 4 "context" 5 "crypto/tls" 6 "fmt" 7 "io/ioutil" 8 "log" 9 "net" 10 "net/rpc" 11 "os" 12 "path/filepath" 13 "sort" 14 "strconv" 15 "sync" 16 "sync/atomic" 17 "time" 18 19 "github.com/hashicorp/consul/agent/consul/autopilot" 20 consulapi "github.com/hashicorp/consul/api" 21 "github.com/hashicorp/consul/lib" 22 multierror "github.com/hashicorp/go-multierror" 23 lru "github.com/hashicorp/golang-lru" 24 "github.com/hashicorp/nomad/command/agent/consul" 25 "github.com/hashicorp/nomad/helper/codec" 26 "github.com/hashicorp/nomad/helper/pool" 27 "github.com/hashicorp/nomad/helper/stats" 28 "github.com/hashicorp/nomad/helper/tlsutil" 29 "github.com/hashicorp/nomad/nomad/deploymentwatcher" 30 "github.com/hashicorp/nomad/nomad/state" 31 "github.com/hashicorp/nomad/nomad/structs" 32 "github.com/hashicorp/nomad/nomad/structs/config" 33 "github.com/hashicorp/raft" 34 raftboltdb "github.com/hashicorp/raft-boltdb" 35 "github.com/hashicorp/serf/serf" 36 ) 37 38 const ( 39 // datacenterQueryLimit sets the max number of DCs that a Nomad 40 // Server will query to find bootstrap_expect servers. 41 datacenterQueryLimit = 25 42 43 // maxStaleLeadership is the maximum time we will permit this Nomad 44 // Server to go without seeing a valid Raft leader. 45 maxStaleLeadership = 15 * time.Second 46 47 // peersPollInterval is used as the polling interval between attempts 48 // to query Consul for Nomad Servers. 
49 peersPollInterval = 45 * time.Second 50 51 // peersPollJitterFactor is used to provide a slight amount of variance to 52 // the retry interval when querying Consul Servers 53 peersPollJitterFactor = 2 54 55 raftState = "raft/" 56 serfSnapshot = "serf/snapshot" 57 snapshotsRetained = 2 58 59 // serverRPCCache controls how long we keep an idle connection open to a server 60 serverRPCCache = 2 * time.Minute 61 62 // serverMaxStreams controls how many idle streams we keep open to a server 63 serverMaxStreams = 64 64 65 // raftLogCacheSize is the maximum number of logs to cache in-memory. 66 // This is used to reduce disk I/O for the recently committed entries. 67 raftLogCacheSize = 512 68 69 // raftRemoveGracePeriod is how long we wait to allow a RemovePeer 70 // to replicate to gracefully leave the cluster. 71 raftRemoveGracePeriod = 5 * time.Second 72 73 // defaultConsulDiscoveryInterval is how often to poll Consul for new 74 // servers if there is no leader. 75 defaultConsulDiscoveryInterval time.Duration = 3 * time.Second 76 77 // defaultConsulDiscoveryIntervalRetry is how often to poll Consul for 78 // new servers if there is no leader and the last Consul query failed. 79 defaultConsulDiscoveryIntervalRetry time.Duration = 9 * time.Second 80 81 // aclCacheSize is the number of ACL objects to keep cached. ACLs have a parsing and 82 // construction cost, so we keep the hot objects cached to reduce the ACL token resolution time. 83 aclCacheSize = 512 84 ) 85 86 // Server is a Nomad server which manages the job queues, 87 // schedulers, and notification bus for agents.
88 type Server struct { 89 config *Config 90 91 logger *log.Logger 92 93 // Connection pool to other Nomad servers 94 connPool *pool.ConnPool 95 96 // The raft instance is used among Nomad nodes within the 97 // region to protect operations that require strong consistency 98 leaderCh <-chan bool 99 raft *raft.Raft 100 raftLayer *RaftLayer 101 raftStore *raftboltdb.BoltStore 102 raftInmem *raft.InmemStore 103 raftTransport *raft.NetworkTransport 104 105 // autopilot is the Autopilot instance for this server. 106 autopilot *autopilot.Autopilot 107 108 // fsm is the state machine used with Raft 109 fsm *nomadFSM 110 111 // rpcListener is used to listen for incoming connections 112 rpcListener net.Listener 113 listenerCh chan struct{} 114 115 // tlsWrap is used to wrap outbound connections using TLS. It should be 116 // accessed using the lock. 117 tlsWrap tlsutil.RegionWrapper 118 tlsWrapLock sync.RWMutex 119 120 // rpcServer is the static RPC server that is used by the local agent. 121 rpcServer *rpc.Server 122 123 // rpcAdvertise is the advertised address for the RPC listener. 124 rpcAdvertise net.Addr 125 126 // rpcTLS is the TLS config for incoming TLS requests 127 rpcTLS *tls.Config 128 rpcCancel context.CancelFunc 129 130 // staticEndpoints is the set of static endpoints that can be reused across 131 // all RPC connections 132 staticEndpoints endpoints 133 134 // streamingRpcs is the registry holding our streaming RPC handlers. 135 streamingRpcs *structs.StreamingRpcRegistery 136 137 // nodeConns is the set of multiplexed node connections we have keyed by 138 // NodeID 139 nodeConns map[string]*nodeConnState 140 nodeConnsLock sync.RWMutex 141 142 // peers is used to track the known Nomad servers. This is 143 // used for region forwarding and clustering. 144 peers map[string][]*serverParts 145 localPeers map[raft.ServerAddress]*serverParts 146 peerLock sync.RWMutex 147 148 // serf is the Serf cluster containing only Nomad 149 // servers. 
This is used for multi-region federation 150 // and automatic clustering within regions. 151 serf *serf.Serf 152 153 // reconcileCh is used to pass events from the serf handler 154 // into the leader manager. Mostly used to handle when servers 155 // join/leave from the region. 156 reconcileCh chan serf.Member 157 158 // eventCh is used to receive events from the serf cluster 159 eventCh chan serf.Event 160 161 // BlockedEvals is used to manage evaluations that are blocked on node 162 // capacity changes. 163 blockedEvals *BlockedEvals 164 165 // deploymentWatcher is used to watch deployments and their allocations and 166 // make the required calls to continue to transition the deployment. 167 deploymentWatcher *deploymentwatcher.Watcher 168 169 // evalBroker is used to manage the in-progress evaluations 170 // that are waiting to be brokered to a sub-scheduler 171 evalBroker *EvalBroker 172 173 // periodicDispatcher is used to track and create evaluations for periodic jobs. 174 periodicDispatcher *PeriodicDispatch 175 176 // planQueue is used to manage the submitted allocation 177 // plans that are waiting to be assessed by the leader 178 planQueue *PlanQueue 179 180 // heartbeatTimers track the expiration time of each heartbeat that has 181 // a TTL. On expiration, the node status is updated to be 'down'. 182 heartbeatTimers map[string]*time.Timer 183 heartbeatTimersLock sync.Mutex 184 185 // consulCatalog is used for discovering other Nomad Servers via Consul 186 consulCatalog consul.CatalogAPI 187 188 // vault is the client for communicating with Vault. 189 vault VaultClient 190 191 // Worker used for processing 192 workers []*Worker 193 194 // aclCache is used to maintain the parsed ACL objects 195 aclCache *lru.TwoQueueCache 196 197 // leaderAcl is the management ACL token that is valid when resolved by the 198 // current leader. 
199 leaderAcl string 200 leaderAclLock sync.Mutex 201 202 // statsFetcher is used by autopilot to check the status of the other 203 // Nomad router. 204 statsFetcher *StatsFetcher 205 206 // EnterpriseState is used to fill in state for Pro/Ent builds 207 EnterpriseState 208 209 left bool 210 shutdown bool 211 shutdownCh chan struct{} 212 shutdownLock sync.Mutex 213 } 214 215 // Holds the RPC endpoints 216 type endpoints struct { 217 Status *Status 218 Node *Node 219 Job *Job 220 Eval *Eval 221 Plan *Plan 222 Alloc *Alloc 223 Deployment *Deployment 224 Region *Region 225 Search *Search 226 Periodic *Periodic 227 System *System 228 Operator *Operator 229 ACL *ACL 230 Enterprise *EnterpriseEndpoints 231 232 // Client endpoints 233 ClientStats *ClientStats 234 FileSystem *FileSystem 235 ClientAllocations *ClientAllocations 236 } 237 238 // NewServer is used to construct a new Nomad server from the 239 // configuration, potentially returning an error 240 func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logger) (*Server, error) { 241 // Check the protocol version 242 if err := config.CheckVersion(); err != nil { 243 return nil, err 244 } 245 246 // Create an eval broker 247 evalBroker, err := NewEvalBroker( 248 config.EvalNackTimeout, 249 config.EvalNackInitialReenqueueDelay, 250 config.EvalNackSubsequentReenqueueDelay, 251 config.EvalDeliveryLimit) 252 if err != nil { 253 return nil, err 254 } 255 256 // Create a new blocked eval tracker. 
257 blockedEvals := NewBlockedEvals(evalBroker) 258 259 // Create a plan queue 260 planQueue, err := NewPlanQueue() 261 if err != nil { 262 return nil, err 263 } 264 265 // Configure TLS 266 tlsConf := config.tlsConfig() 267 incomingTLS, tlsWrap, err := getTLSConf(config.TLSConfig.EnableRPC, tlsConf) 268 if err != nil { 269 return nil, err 270 } 271 272 // Create the ACL object cache 273 aclCache, err := lru.New2Q(aclCacheSize) 274 if err != nil { 275 return nil, err 276 } 277 278 // Create the server 279 s := &Server{ 280 config: config, 281 consulCatalog: consulCatalog, 282 connPool: pool.NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap), 283 logger: logger, 284 tlsWrap: tlsWrap, 285 rpcServer: rpc.NewServer(), 286 streamingRpcs: structs.NewStreamingRpcRegistery(), 287 nodeConns: make(map[string]*nodeConnState), 288 peers: make(map[string][]*serverParts), 289 localPeers: make(map[raft.ServerAddress]*serverParts), 290 reconcileCh: make(chan serf.Member, 32), 291 eventCh: make(chan serf.Event, 256), 292 evalBroker: evalBroker, 293 blockedEvals: blockedEvals, 294 planQueue: planQueue, 295 rpcTLS: incomingTLS, 296 aclCache: aclCache, 297 shutdownCh: make(chan struct{}), 298 } 299 300 // Create the periodic dispatcher for launching periodic jobs. 301 s.periodicDispatcher = NewPeriodicDispatch(s.logger, s) 302 303 // Initialize the stats fetcher that autopilot will use. 
304 s.statsFetcher = NewStatsFetcher(logger, s.connPool, s.config.Region) 305 306 // Setup Vault 307 if err := s.setupVaultClient(); err != nil { 308 s.Shutdown() 309 s.logger.Printf("[ERR] nomad: failed to setup Vault client: %v", err) 310 return nil, fmt.Errorf("Failed to setup Vault client: %v", err) 311 } 312 313 // Initialize the RPC layer 314 if err := s.setupRPC(tlsWrap); err != nil { 315 s.Shutdown() 316 s.logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err) 317 return nil, fmt.Errorf("Failed to start RPC layer: %v", err) 318 } 319 320 // Initialize the Raft server 321 if err := s.setupRaft(); err != nil { 322 s.Shutdown() 323 s.logger.Printf("[ERR] nomad: failed to start Raft: %s", err) 324 return nil, fmt.Errorf("Failed to start Raft: %v", err) 325 } 326 327 // Initialize the wan Serf 328 s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot) 329 if err != nil { 330 s.Shutdown() 331 s.logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err) 332 return nil, fmt.Errorf("Failed to start serf: %v", err) 333 } 334 335 // Initialize the scheduling workers 336 if err := s.setupWorkers(); err != nil { 337 s.Shutdown() 338 s.logger.Printf("[ERR] nomad: failed to start workers: %s", err) 339 return nil, fmt.Errorf("Failed to start workers: %v", err) 340 } 341 342 // Setup the Consul syncer 343 if err := s.setupConsulSyncer(); err != nil { 344 return nil, fmt.Errorf("failed to create server Consul syncer: %v", err) 345 } 346 347 // Setup the deployment watcher. 
348 if err := s.setupDeploymentWatcher(); err != nil { 349 return nil, fmt.Errorf("failed to create deployment watcher: %v", err) 350 } 351 352 // Setup the enterprise state 353 if err := s.setupEnterprise(config); err != nil { 354 return nil, err 355 } 356 357 // Monitor leadership changes 358 go s.monitorLeadership() 359 360 // Start ingesting events for Serf 361 go s.serfEventHandler() 362 363 // start the RPC listener for the server 364 s.startRPCListener() 365 366 // Emit metrics for the eval broker 367 go evalBroker.EmitStats(time.Second, s.shutdownCh) 368 369 // Emit metrics for the plan queue 370 go planQueue.EmitStats(time.Second, s.shutdownCh) 371 372 // Emit metrics for the blocked eval tracker. 373 go blockedEvals.EmitStats(time.Second, s.shutdownCh) 374 375 // Emit metrics for the Vault client. 376 go s.vault.EmitStats(time.Second, s.shutdownCh) 377 378 // Emit metrics 379 go s.heartbeatStats() 380 381 // Start enterprise background workers 382 s.startEnterpriseBackground() 383 384 // Done 385 return s, nil 386 } 387 388 // startRPCListener starts the server's the RPC listener 389 func (s *Server) startRPCListener() { 390 ctx, cancel := context.WithCancel(context.Background()) 391 s.rpcCancel = cancel 392 go func() { 393 defer close(s.listenerCh) 394 s.listen(ctx) 395 }() 396 } 397 398 // createRPCListener creates the server's RPC listener 399 func (s *Server) createRPCListener() (*net.TCPListener, error) { 400 s.listenerCh = make(chan struct{}) 401 listener, err := net.ListenTCP("tcp", s.config.RPCAddr) 402 if err != nil { 403 s.logger.Printf("[ERR] nomad: error when initializing TLS listener %s", err) 404 return listener, err 405 } 406 407 s.rpcListener = listener 408 return listener, nil 409 } 410 411 // getTLSConf gets the server's TLS configuration based on the config supplied 412 // by the operator 413 func getTLSConf(enableRPC bool, tlsConf *tlsutil.Config) (*tls.Config, tlsutil.RegionWrapper, error) { 414 var tlsWrap tlsutil.RegionWrapper 415 
var incomingTLS *tls.Config 416 if enableRPC { 417 tw, err := tlsConf.OutgoingTLSWrapper() 418 if err != nil { 419 return nil, nil, err 420 } 421 tlsWrap = tw 422 423 itls, err := tlsConf.IncomingTLSConfig() 424 if err != nil { 425 return nil, nil, err 426 } 427 incomingTLS = itls 428 } 429 return incomingTLS, tlsWrap, nil 430 } 431 432 // reloadTLSConnections updates a server's TLS configuration and reloads RPC 433 // connections. 434 func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { 435 s.logger.Printf("[INFO] nomad: reloading server connections due to configuration changes") 436 437 tlsConf := tlsutil.NewTLSConfiguration(newTLSConfig) 438 incomingTLS, tlsWrap, err := getTLSConf(newTLSConfig.EnableRPC, tlsConf) 439 if err != nil { 440 s.logger.Printf("[ERR] nomad: unable to reset TLS context %s", err) 441 return err 442 } 443 444 // Store the new tls wrapper. 445 s.tlsWrapLock.Lock() 446 s.tlsWrap = tlsWrap 447 s.tlsWrapLock.Unlock() 448 449 if s.rpcCancel == nil { 450 err = fmt.Errorf("No existing RPC server to reset.") 451 s.logger.Printf("[ERR] nomad: %s", err) 452 return err 453 } 454 455 s.rpcCancel() 456 457 // Keeping configuration in sync is important for other places that require 458 // access to config information, such as rpc.go, where we decide on what kind 459 // of network connections to accept depending on the server configuration 460 s.config.TLSConfig = newTLSConfig 461 462 s.rpcTLS = incomingTLS 463 s.connPool.ReloadTLS(tlsWrap) 464 465 // reinitialize our rpc listener 466 s.rpcListener.Close() 467 <-s.listenerCh 468 s.startRPCListener() 469 470 listener, err := s.createRPCListener() 471 if err != nil { 472 listener.Close() 473 return err 474 } 475 476 // Close and reload existing Raft connections 477 wrapper := tlsutil.RegionSpecificWrapper(s.config.Region, tlsWrap) 478 s.raftLayer.ReloadTLS(wrapper) 479 s.raftTransport.CloseStreams() 480 481 s.logger.Printf("[DEBUG] nomad: finished reloading server connections") 482 
return nil 483 } 484 485 // Shutdown is used to shutdown the server 486 func (s *Server) Shutdown() error { 487 s.logger.Printf("[INFO] nomad: shutting down server") 488 s.shutdownLock.Lock() 489 defer s.shutdownLock.Unlock() 490 491 if s.shutdown { 492 return nil 493 } 494 495 s.shutdown = true 496 close(s.shutdownCh) 497 498 if s.serf != nil { 499 s.serf.Shutdown() 500 } 501 502 if s.raft != nil { 503 s.raftTransport.Close() 504 s.raftLayer.Close() 505 future := s.raft.Shutdown() 506 if err := future.Error(); err != nil { 507 s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err) 508 } 509 if s.raftStore != nil { 510 s.raftStore.Close() 511 } 512 } 513 514 // Shutdown the RPC listener 515 if s.rpcListener != nil { 516 s.rpcListener.Close() 517 } 518 519 // Close the connection pool 520 s.connPool.Shutdown() 521 522 // Close the fsm 523 if s.fsm != nil { 524 s.fsm.Close() 525 } 526 527 // Stop Vault token renewal 528 if s.vault != nil { 529 s.vault.Stop() 530 } 531 532 return nil 533 } 534 535 // IsShutdown checks if the server is shutdown 536 func (s *Server) IsShutdown() bool { 537 select { 538 case <-s.shutdownCh: 539 return true 540 default: 541 return false 542 } 543 } 544 545 // Leave is used to prepare for a graceful shutdown of the server 546 func (s *Server) Leave() error { 547 s.logger.Printf("[INFO] nomad: server starting leave") 548 s.left = true 549 550 // Check the number of known peers 551 numPeers, err := s.numPeers() 552 if err != nil { 553 s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err) 554 return err 555 } 556 557 addr := s.raftTransport.LocalAddr() 558 559 // If we are the current leader, and we have any other peers (cluster has multiple 560 // servers), we should do a RemovePeer to safely reduce the quorum size. If we are 561 // not the leader, then we should issue our leave intention and wait to be removed 562 // for some sane period of time. 
563 isLeader := s.IsLeader() 564 if isLeader && numPeers > 1 { 565 minRaftProtocol, err := s.autopilot.MinRaftProtocol() 566 if err != nil { 567 return err 568 } 569 570 if minRaftProtocol >= 2 && s.config.RaftConfig.ProtocolVersion >= 3 { 571 future := s.raft.RemoveServer(raft.ServerID(s.config.NodeID), 0, 0) 572 if err := future.Error(); err != nil { 573 s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err) 574 } 575 } else { 576 future := s.raft.RemovePeer(addr) 577 if err := future.Error(); err != nil { 578 s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err) 579 } 580 } 581 } 582 583 // Leave the gossip pool 584 if s.serf != nil { 585 if err := s.serf.Leave(); err != nil { 586 s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err) 587 } 588 } 589 590 // If we were not leader, wait to be safely removed from the cluster. 591 // We must wait to allow the raft replication to take place, otherwise 592 // an immediate shutdown could cause a loss of quorum. 593 if !isLeader { 594 left := false 595 limit := time.Now().Add(raftRemoveGracePeriod) 596 for !left && time.Now().Before(limit) { 597 // Sleep a while before we check. 598 time.Sleep(50 * time.Millisecond) 599 600 // Get the latest configuration. 601 future := s.raft.GetConfiguration() 602 if err := future.Error(); err != nil { 603 s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) 604 break 605 } 606 607 // See if we are no longer included. 608 left = true 609 for _, server := range future.Configuration().Servers { 610 if server.Address == addr { 611 left = false 612 break 613 } 614 } 615 } 616 617 // TODO (alexdadgar) With the old Raft library we used to force the 618 // peers set to empty when a graceful leave occurred. 
This would 619 // keep voting spam down if the server was restarted, but it was 620 // dangerous because the peers was inconsistent with the logs and 621 // snapshots, so it wasn't really safe in all cases for the server 622 // to become leader. This is now safe, but the log spam is noisy. 623 // The next new version of the library will have a "you are not a 624 // peer stop it" behavior that should address this. We will have 625 // to evaluate during the RC period if this interim situation is 626 // not too confusing for operators. 627 628 // TODO (alexdadgar) When we take a later new version of the Raft 629 // library it won't try to complete replication, so this peer 630 // may not realize that it has been removed. Need to revisit this 631 // and the warning here. 632 if !left { 633 s.logger.Printf("[WARN] nomad: failed to leave raft configuration gracefully, timeout") 634 } 635 } 636 return nil 637 } 638 639 // Reload handles a config reload specific to server-only configuration. Not 640 // all config fields can handle a reload. 641 func (s *Server) Reload(newConfig *Config) error { 642 if newConfig == nil { 643 return fmt.Errorf("Reload given a nil config") 644 } 645 646 var mErr multierror.Error 647 648 // Handle the Vault reload. Vault should never be nil but just guard. 649 if s.vault != nil { 650 if err := s.vault.SetConfig(newConfig.VaultConfig); err != nil { 651 multierror.Append(&mErr, err) 652 } 653 } 654 655 if !newConfig.TLSConfig.Equals(s.config.TLSConfig) { 656 if err := s.reloadTLSConnections(newConfig.TLSConfig); err != nil { 657 s.logger.Printf("[ERR] nomad: error reloading server TLS configuration: %s", err) 658 multierror.Append(&mErr, err) 659 } 660 } 661 662 return mErr.ErrorOrNil() 663 } 664 665 // setupBootstrapHandler() creates the closure necessary to support a Consul 666 // fallback handler. 
667 func (s *Server) setupBootstrapHandler() error { 668 // peersTimeout is used to indicate to the Consul Syncer that the 669 // current Nomad Server has a stale peer set. peersTimeout will time 670 // out if the Consul Syncer bootstrapFn has not observed a Raft 671 // leader in maxStaleLeadership. If peersTimeout has been triggered, 672 // the Consul Syncer will begin querying Consul for other Nomad 673 // Servers. 674 // 675 // NOTE: time.Timer is used vs time.Time in order to handle clock 676 // drift because time.Timer is implemented as a monotonic clock. 677 var peersTimeout *time.Timer = time.NewTimer(0) 678 679 // consulQueryCount is the number of times the bootstrapFn has been 680 // called, regardless of success. 681 var consulQueryCount uint64 682 683 // leadershipTimedOut is a helper method that returns true if the 684 // peersTimeout timer has expired. 685 leadershipTimedOut := func() bool { 686 select { 687 case <-peersTimeout.C: 688 return true 689 default: 690 return false 691 } 692 } 693 694 // The bootstrapFn callback handler is used to periodically poll 695 // Consul to look up the Nomad Servers in Consul. In the event the 696 // server has been brought up without a `retry-join` configuration 697 // and this Server is partitioned from the rest of the cluster, 698 // periodically poll Consul to reattach this Server to other servers 699 // in the same region and automatically reform a quorum (assuming the 700 // correct number of servers required for quorum are present). 701 bootstrapFn := func() error { 702 // If there is a raft leader, do nothing 703 if s.raft.Leader() != "" { 704 peersTimeout.Reset(maxStaleLeadership) 705 return nil 706 } 707 708 // (ab)use serf.go's behavior of setting BootstrapExpect to 709 // zero if we have bootstrapped. If we have bootstrapped 710 bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect) 711 if bootstrapExpect == 0 { 712 // This Nomad Server has been bootstrapped. 
Rely on 713 // the peersTimeout firing as a guard to prevent 714 // aggressive querying of Consul. 715 if !leadershipTimedOut() { 716 return nil 717 } 718 } else { 719 if consulQueryCount > 0 && !leadershipTimedOut() { 720 return nil 721 } 722 723 // This Nomad Server has not been bootstrapped, reach 724 // out to Consul if our peer list is less than 725 // `bootstrap_expect`. 726 raftPeers, err := s.numPeers() 727 if err != nil { 728 peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)) 729 return nil 730 } 731 732 // The necessary number of Nomad Servers required for 733 // quorum has been reached, we do not need to poll 734 // Consul. Let the normal timeout-based strategy 735 // take over. 736 if raftPeers >= int(bootstrapExpect) { 737 peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)) 738 return nil 739 } 740 } 741 consulQueryCount++ 742 743 s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list") 744 745 dcs, err := s.consulCatalog.Datacenters() 746 if err != nil { 747 peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)) 748 return fmt.Errorf("server.nomad: unable to query Consul datacenters: %v", err) 749 } 750 if len(dcs) > 2 { 751 // Query the local DC first, then shuffle the 752 // remaining DCs. If additional calls to bootstrapFn 753 // are necessary, this Nomad Server will eventually 754 // walk all datacenter until it finds enough hosts to 755 // form a quorum. 
756 shuffleStrings(dcs[1:]) 757 dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)] 758 } 759 760 nomadServerServiceName := s.config.ConsulConfig.ServerServiceName 761 var mErr multierror.Error 762 const defaultMaxNumNomadServers = 8 763 nomadServerServices := make([]string, 0, defaultMaxNumNomadServers) 764 localNode := s.serf.Memberlist().LocalNode() 765 for _, dc := range dcs { 766 consulOpts := &consulapi.QueryOptions{ 767 AllowStale: true, 768 Datacenter: dc, 769 Near: "_agent", 770 WaitTime: consul.DefaultQueryWaitDuration, 771 } 772 consulServices, _, err := s.consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts) 773 if err != nil { 774 err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err) 775 s.logger.Printf("[WARN] server.nomad: %v", err) 776 mErr.Errors = append(mErr.Errors, err) 777 continue 778 } 779 780 for _, cs := range consulServices { 781 port := strconv.FormatInt(int64(cs.ServicePort), 10) 782 addr := cs.ServiceAddress 783 if addr == "" { 784 addr = cs.Address 785 } 786 if localNode.Addr.String() == addr && int(localNode.Port) == cs.ServicePort { 787 continue 788 } 789 serverAddr := net.JoinHostPort(addr, port) 790 nomadServerServices = append(nomadServerServices, serverAddr) 791 } 792 } 793 794 if len(nomadServerServices) == 0 { 795 if len(mErr.Errors) > 0 { 796 peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)) 797 return mErr.ErrorOrNil() 798 } 799 800 // Log the error and return nil so future handlers 801 // can attempt to register the `nomad` service. 
802 pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor) 803 s.logger.Printf("[TRACE] server.nomad: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval) 804 peersTimeout.Reset(pollInterval) 805 return nil 806 } 807 808 numServersContacted, err := s.Join(nomadServerServices) 809 if err != nil { 810 peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)) 811 return fmt.Errorf("contacted %d Nomad Servers: %v", numServersContacted, err) 812 } 813 814 peersTimeout.Reset(maxStaleLeadership) 815 s.logger.Printf("[INFO] server.nomad: successfully contacted %d Nomad Servers", numServersContacted) 816 817 return nil 818 } 819 820 // Hacky replacement for old ConsulSyncer Periodic Handler. 821 go func() { 822 lastOk := true 823 sync := time.NewTimer(0) 824 for { 825 select { 826 case <-sync.C: 827 d := defaultConsulDiscoveryInterval 828 if err := bootstrapFn(); err != nil { 829 // Only log if it worked last time 830 if lastOk { 831 lastOk = false 832 s.logger.Printf("[ERR] consul: error looking up Nomad servers: %v", err) 833 } 834 d = defaultConsulDiscoveryIntervalRetry 835 } 836 sync.Reset(d) 837 case <-s.shutdownCh: 838 return 839 } 840 } 841 }() 842 return nil 843 } 844 845 // setupConsulSyncer creates Server-mode consul.Syncer which periodically 846 // executes callbacks on a fixed interval. 847 func (s *Server) setupConsulSyncer() error { 848 if s.config.ConsulConfig.ServerAutoJoin != nil && *s.config.ConsulConfig.ServerAutoJoin { 849 if err := s.setupBootstrapHandler(); err != nil { 850 return err 851 } 852 } 853 854 return nil 855 } 856 857 // setupDeploymentWatcher creates a deployment watcher that consumes the RPC 858 // endpoints for state information and makes transistions via Raft through a 859 // shim that provides the appropriate methods. 
860 func (s *Server) setupDeploymentWatcher() error { 861 862 // Create the raft shim type to restrict the set of raft methods that can be 863 // made 864 raftShim := &deploymentWatcherRaftShim{ 865 apply: s.raftApply, 866 } 867 868 // Create the deployment watcher 869 s.deploymentWatcher = deploymentwatcher.NewDeploymentsWatcher( 870 s.logger, raftShim, 871 deploymentwatcher.LimitStateQueriesPerSecond, 872 deploymentwatcher.CrossDeploymentEvalBatchDuration) 873 874 return nil 875 } 876 877 // setupVaultClient is used to set up the Vault API client. 878 func (s *Server) setupVaultClient() error { 879 v, err := NewVaultClient(s.config.VaultConfig, s.logger, s.purgeVaultAccessors) 880 if err != nil { 881 return err 882 } 883 s.vault = v 884 return nil 885 } 886 887 // setupRPC is used to setup the RPC listener 888 func (s *Server) setupRPC(tlsWrap tlsutil.RegionWrapper) error { 889 // Populate the static RPC server 890 s.setupRpcServer(s.rpcServer, nil) 891 892 listener, err := s.createRPCListener() 893 if err != nil { 894 listener.Close() 895 return err 896 } 897 898 if s.config.RPCAdvertise != nil { 899 s.rpcAdvertise = s.config.RPCAdvertise 900 } else { 901 s.rpcAdvertise = s.rpcListener.Addr() 902 } 903 904 // Verify that we have a usable advertise address 905 addr, ok := s.rpcAdvertise.(*net.TCPAddr) 906 if !ok { 907 listener.Close() 908 return fmt.Errorf("RPC advertise address is not a TCP Address: %v", addr) 909 } 910 if addr.IP.IsUnspecified() { 911 listener.Close() 912 return fmt.Errorf("RPC advertise address is not advertisable: %v", addr) 913 } 914 915 wrapper := tlsutil.RegionSpecificWrapper(s.config.Region, tlsWrap) 916 s.raftLayer = NewRaftLayer(s.rpcAdvertise, wrapper) 917 return nil 918 } 919 920 // setupRpcServer is used to populate an RPC server with endpoints 921 func (s *Server) setupRpcServer(server *rpc.Server, ctx *RPCContext) { 922 // Add the static endpoints to the RPC server. 
923 if s.staticEndpoints.Status == nil { 924 // Initialize the list just once 925 s.staticEndpoints.ACL = &ACL{s} 926 s.staticEndpoints.Alloc = &Alloc{s} 927 s.staticEndpoints.Eval = &Eval{s} 928 s.staticEndpoints.Job = &Job{s} 929 s.staticEndpoints.Node = &Node{srv: s} // Add but don't register 930 s.staticEndpoints.Deployment = &Deployment{srv: s} 931 s.staticEndpoints.Operator = &Operator{s} 932 s.staticEndpoints.Periodic = &Periodic{s} 933 s.staticEndpoints.Plan = &Plan{s} 934 s.staticEndpoints.Region = &Region{s} 935 s.staticEndpoints.Status = &Status{s} 936 s.staticEndpoints.System = &System{s} 937 s.staticEndpoints.Search = &Search{s} 938 s.staticEndpoints.Enterprise = NewEnterpriseEndpoints(s) 939 940 // Client endpoints 941 s.staticEndpoints.ClientStats = &ClientStats{s} 942 s.staticEndpoints.ClientAllocations = &ClientAllocations{s} 943 944 // Streaming endpoints 945 s.staticEndpoints.FileSystem = &FileSystem{s} 946 s.staticEndpoints.FileSystem.register() 947 } 948 949 // Register the static handlers 950 server.Register(s.staticEndpoints.ACL) 951 server.Register(s.staticEndpoints.Alloc) 952 server.Register(s.staticEndpoints.Eval) 953 server.Register(s.staticEndpoints.Job) 954 server.Register(s.staticEndpoints.Deployment) 955 server.Register(s.staticEndpoints.Operator) 956 server.Register(s.staticEndpoints.Periodic) 957 server.Register(s.staticEndpoints.Plan) 958 server.Register(s.staticEndpoints.Region) 959 server.Register(s.staticEndpoints.Status) 960 server.Register(s.staticEndpoints.System) 961 server.Register(s.staticEndpoints.Search) 962 s.staticEndpoints.Enterprise.Register(server) 963 server.Register(s.staticEndpoints.ClientStats) 964 server.Register(s.staticEndpoints.ClientAllocations) 965 server.Register(s.staticEndpoints.FileSystem) 966 967 // Create new dynamic endpoints and add them to the RPC server. 
968 node := &Node{srv: s, ctx: ctx} 969 970 // Register the dynamic endpoints 971 server.Register(node) 972 } 973 974 // setupRaft is used to setup and initialize Raft 975 func (s *Server) setupRaft() error { 976 // If we have an unclean exit then attempt to close the Raft store. 977 defer func() { 978 if s.raft == nil && s.raftStore != nil { 979 if err := s.raftStore.Close(); err != nil { 980 s.logger.Printf("[ERR] nomad: failed to close Raft store: %v", err) 981 } 982 } 983 }() 984 985 // Create the FSM 986 fsmConfig := &FSMConfig{ 987 EvalBroker: s.evalBroker, 988 Periodic: s.periodicDispatcher, 989 Blocked: s.blockedEvals, 990 LogOutput: s.config.LogOutput, 991 Region: s.Region(), 992 } 993 var err error 994 s.fsm, err = NewFSM(fsmConfig) 995 if err != nil { 996 return err 997 } 998 999 // Create a transport layer 1000 trans := raft.NewNetworkTransport(s.raftLayer, 3, s.config.RaftTimeout, 1001 s.config.LogOutput) 1002 s.raftTransport = trans 1003 1004 // Make sure we set the LogOutput. 1005 s.config.RaftConfig.LogOutput = s.config.LogOutput 1006 1007 // Our version of Raft protocol requires the LocalID to match the network 1008 // address of the transport. 1009 s.config.RaftConfig.LocalID = raft.ServerID(trans.LocalAddr()) 1010 if s.config.RaftConfig.ProtocolVersion >= 3 { 1011 s.config.RaftConfig.LocalID = raft.ServerID(s.config.NodeID) 1012 } 1013 1014 // Build an all in-memory setup for dev mode, otherwise prepare a full 1015 // disk-based setup. 
1016 var log raft.LogStore 1017 var stable raft.StableStore 1018 var snap raft.SnapshotStore 1019 if s.config.DevMode { 1020 store := raft.NewInmemStore() 1021 s.raftInmem = store 1022 stable = store 1023 log = store 1024 snap = raft.NewDiscardSnapshotStore() 1025 1026 } else { 1027 // Create the base raft path 1028 path := filepath.Join(s.config.DataDir, raftState) 1029 if err := ensurePath(path, true); err != nil { 1030 return err 1031 } 1032 1033 // Create the BoltDB backend 1034 store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db")) 1035 if err != nil { 1036 return err 1037 } 1038 s.raftStore = store 1039 stable = store 1040 1041 // Wrap the store in a LogCache to improve performance 1042 cacheStore, err := raft.NewLogCache(raftLogCacheSize, store) 1043 if err != nil { 1044 store.Close() 1045 return err 1046 } 1047 log = cacheStore 1048 1049 // Create the snapshot store 1050 snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput) 1051 if err != nil { 1052 if s.raftStore != nil { 1053 s.raftStore.Close() 1054 } 1055 return err 1056 } 1057 snap = snapshots 1058 1059 // For an existing cluster being upgraded to the new version of 1060 // Raft, we almost never want to run recovery based on the old 1061 // peers.json file. We create a peers.info file with a helpful 1062 // note about where peers.json went, and use that as a sentinel 1063 // to avoid ingesting the old one that first time (if we have to 1064 // create the peers.info file because it's not there, we also 1065 // blow away any existing peers.json file). 
1066 peersFile := filepath.Join(path, "peers.json") 1067 peersInfoFile := filepath.Join(path, "peers.info") 1068 if _, err := os.Stat(peersInfoFile); os.IsNotExist(err) { 1069 if err := ioutil.WriteFile(peersInfoFile, []byte(peersInfoContent), 0755); err != nil { 1070 return fmt.Errorf("failed to write peers.info file: %v", err) 1071 } 1072 1073 // Blow away the peers.json file if present, since the 1074 // peers.info sentinel wasn't there. 1075 if _, err := os.Stat(peersFile); err == nil { 1076 if err := os.Remove(peersFile); err != nil { 1077 return fmt.Errorf("failed to delete peers.json, please delete manually (see peers.info for details): %v", err) 1078 } 1079 s.logger.Printf("[INFO] nomad: deleted peers.json file (see peers.info for details)") 1080 } 1081 } else if _, err := os.Stat(peersFile); err == nil { 1082 s.logger.Printf("[INFO] nomad: found peers.json file, recovering Raft configuration...") 1083 configuration, err := raft.ReadPeersJSON(peersFile) 1084 if err != nil { 1085 return fmt.Errorf("recovery failed to parse peers.json: %v", err) 1086 } 1087 tmpFsm, err := NewFSM(fsmConfig) 1088 if err != nil { 1089 return fmt.Errorf("recovery failed to make temp FSM: %v", err) 1090 } 1091 if err := raft.RecoverCluster(s.config.RaftConfig, tmpFsm, 1092 log, stable, snap, trans, configuration); err != nil { 1093 return fmt.Errorf("recovery failed: %v", err) 1094 } 1095 if err := os.Remove(peersFile); err != nil { 1096 return fmt.Errorf("recovery failed to delete peers.json, please delete manually (see peers.info for details): %v", err) 1097 } 1098 s.logger.Printf("[INFO] nomad: deleted peers.json file after successful recovery") 1099 } 1100 } 1101 1102 // If we are in bootstrap or dev mode and the state is clean then we can 1103 // bootstrap now. 
1104 if s.config.Bootstrap || s.config.DevMode { 1105 hasState, err := raft.HasExistingState(log, stable, snap) 1106 if err != nil { 1107 return err 1108 } 1109 if !hasState { 1110 configuration := raft.Configuration{ 1111 Servers: []raft.Server{ 1112 { 1113 ID: s.config.RaftConfig.LocalID, 1114 Address: trans.LocalAddr(), 1115 }, 1116 }, 1117 } 1118 if err := raft.BootstrapCluster(s.config.RaftConfig, 1119 log, stable, snap, trans, configuration); err != nil { 1120 return err 1121 } 1122 } 1123 } 1124 1125 // Setup the leader channel 1126 leaderCh := make(chan bool, 1) 1127 s.config.RaftConfig.NotifyCh = leaderCh 1128 s.leaderCh = leaderCh 1129 1130 // Setup the Raft store 1131 s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable, snap, trans) 1132 if err != nil { 1133 return err 1134 } 1135 return nil 1136 } 1137 1138 // setupSerf is used to setup and initialize a Serf 1139 func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) { 1140 conf.Init() 1141 conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) 1142 conf.Tags["role"] = "nomad" 1143 conf.Tags["region"] = s.config.Region 1144 conf.Tags["dc"] = s.config.Datacenter 1145 conf.Tags["vsn"] = fmt.Sprintf("%d", structs.ApiMajorVersion) 1146 conf.Tags["mvn"] = fmt.Sprintf("%d", structs.ApiMinorVersion) 1147 conf.Tags["build"] = s.config.Build 1148 conf.Tags["raft_vsn"] = fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion) 1149 conf.Tags["id"] = s.config.NodeID 1150 conf.Tags["rpc_addr"] = s.rpcAdvertise.(*net.TCPAddr).IP.String() 1151 conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port) 1152 if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) { 1153 conf.Tags["bootstrap"] = "1" 1154 } 1155 bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect) 1156 if bootstrapExpect != 0 { 1157 conf.Tags["expect"] = fmt.Sprintf("%d", bootstrapExpect) 1158 } 1159 if s.config.NonVoter { 1160 
conf.Tags["nonvoter"] = "1" 1161 } 1162 if s.config.RedundancyZone != "" { 1163 conf.Tags[AutopilotRZTag] = s.config.RedundancyZone 1164 } 1165 if s.config.UpgradeVersion != "" { 1166 conf.Tags[AutopilotVersionTag] = s.config.UpgradeVersion 1167 } 1168 conf.MemberlistConfig.LogOutput = s.config.LogOutput 1169 conf.LogOutput = s.config.LogOutput 1170 conf.EventCh = ch 1171 if !s.config.DevMode { 1172 conf.SnapshotPath = filepath.Join(s.config.DataDir, path) 1173 if err := ensurePath(conf.SnapshotPath, false); err != nil { 1174 return nil, err 1175 } 1176 } 1177 conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion] 1178 conf.RejoinAfterLeave = true 1179 conf.Merge = &serfMergeDelegate{} 1180 1181 // Until Nomad supports this fully, we disable automatic resolution. 1182 // When enabled, the Serf gossip may just turn off if we are the minority 1183 // node which is rather unexpected. 1184 conf.EnableNameConflictResolution = false 1185 return serf.Create(conf) 1186 } 1187 1188 // setupWorkers is used to start the scheduling workers 1189 func (s *Server) setupWorkers() error { 1190 // Check if all the schedulers are disabled 1191 if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 { 1192 s.logger.Printf("[WARN] nomad: no enabled schedulers") 1193 return nil 1194 } 1195 1196 // Start the workers 1197 for i := 0; i < s.config.NumSchedulers; i++ { 1198 if w, err := NewWorker(s); err != nil { 1199 return err 1200 } else { 1201 s.workers = append(s.workers, w) 1202 } 1203 } 1204 s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v", 1205 s.config.NumSchedulers, s.config.EnabledSchedulers) 1206 return nil 1207 } 1208 1209 // numPeers is used to check on the number of known peers, including the local 1210 // node. 
1211 func (s *Server) numPeers() (int, error) { 1212 future := s.raft.GetConfiguration() 1213 if err := future.Error(); err != nil { 1214 return 0, err 1215 } 1216 configuration := future.Configuration() 1217 return len(configuration.Servers), nil 1218 } 1219 1220 // IsLeader checks if this server is the cluster leader 1221 func (s *Server) IsLeader() bool { 1222 return s.raft.State() == raft.Leader 1223 } 1224 1225 // Join is used to have Nomad join the gossip ring 1226 // The target address should be another node listening on the 1227 // Serf address 1228 func (s *Server) Join(addrs []string) (int, error) { 1229 return s.serf.Join(addrs, true) 1230 } 1231 1232 // LocalMember is used to return the local node 1233 func (c *Server) LocalMember() serf.Member { 1234 return c.serf.LocalMember() 1235 } 1236 1237 // Members is used to return the members of the serf cluster 1238 func (s *Server) Members() []serf.Member { 1239 return s.serf.Members() 1240 } 1241 1242 // RemoveFailedNode is used to remove a failed node from the cluster 1243 func (s *Server) RemoveFailedNode(node string) error { 1244 return s.serf.RemoveFailedNode(node) 1245 } 1246 1247 // KeyManager returns the Serf keyring manager 1248 func (s *Server) KeyManager() *serf.KeyManager { 1249 return s.serf.KeyManager() 1250 } 1251 1252 // Encrypted determines if gossip is encrypted 1253 func (s *Server) Encrypted() bool { 1254 return s.serf.EncryptionEnabled() 1255 } 1256 1257 // State returns the underlying state store. This should *not* 1258 // be used to modify state directly. 1259 func (s *Server) State() *state.StateStore { 1260 return s.fsm.State() 1261 } 1262 1263 // setLeaderAcl stores the given ACL token as the current leader's ACL token. 
1264 func (s *Server) setLeaderAcl(token string) { 1265 s.leaderAclLock.Lock() 1266 s.leaderAcl = token 1267 s.leaderAclLock.Unlock() 1268 } 1269 1270 // getLeaderAcl retrieves the leader's ACL token 1271 func (s *Server) getLeaderAcl() string { 1272 s.leaderAclLock.Lock() 1273 defer s.leaderAclLock.Unlock() 1274 return s.leaderAcl 1275 } 1276 1277 // Regions returns the known regions in the cluster. 1278 func (s *Server) Regions() []string { 1279 s.peerLock.RLock() 1280 defer s.peerLock.RUnlock() 1281 1282 regions := make([]string, 0, len(s.peers)) 1283 for region := range s.peers { 1284 regions = append(regions, region) 1285 } 1286 sort.Strings(regions) 1287 return regions 1288 } 1289 1290 // RPC is used to make a local RPC call 1291 func (s *Server) RPC(method string, args interface{}, reply interface{}) error { 1292 codec := &codec.InmemCodec{ 1293 Method: method, 1294 Args: args, 1295 Reply: reply, 1296 } 1297 if err := s.rpcServer.ServeRequest(codec); err != nil { 1298 return err 1299 } 1300 return codec.Err 1301 } 1302 1303 // StreamingRpcHandler is used to make a streaming RPC call. 
1304 func (s *Server) StreamingRpcHandler(method string) (structs.StreamingRpcHandler, error) { 1305 return s.streamingRpcs.GetHandler(method) 1306 } 1307 1308 // Stats is used to return statistics for debugging and insight 1309 // for various sub-systems 1310 func (s *Server) Stats() map[string]map[string]string { 1311 toString := func(v uint64) string { 1312 return strconv.FormatUint(v, 10) 1313 } 1314 stats := map[string]map[string]string{ 1315 "nomad": { 1316 "server": "true", 1317 "leader": fmt.Sprintf("%v", s.IsLeader()), 1318 "leader_addr": string(s.raft.Leader()), 1319 "bootstrap": fmt.Sprintf("%v", s.config.Bootstrap), 1320 "known_regions": toString(uint64(len(s.peers))), 1321 }, 1322 "raft": s.raft.Stats(), 1323 "serf": s.serf.Stats(), 1324 "runtime": stats.RuntimeStats(), 1325 } 1326 1327 return stats 1328 } 1329 1330 // Region returns the region of the server 1331 func (s *Server) Region() string { 1332 return s.config.Region 1333 } 1334 1335 // Datacenter returns the data center of the server 1336 func (s *Server) Datacenter() string { 1337 return s.config.Datacenter 1338 } 1339 1340 // GetConfig returns the config of the server for testing purposes only 1341 func (s *Server) GetConfig() *Config { 1342 return s.config 1343 } 1344 1345 // ReplicationToken returns the token used for replication. We use a method to support 1346 // dynamic reloading of this value later. 1347 func (s *Server) ReplicationToken() string { 1348 return s.config.ReplicationToken 1349 } 1350 1351 // peersInfoContent is used to help operators understand what happened to the 1352 // peers.json file. This is written to a file called peers.info in the same 1353 // location. 1354 const peersInfoContent = ` 1355 As of Nomad 0.5.5, the peers.json file is only used for recovery 1356 after an outage. 
It should be formatted as a JSON array containing the address 1357 and port (RPC) of each Nomad server in the cluster, like this: 1358 1359 ["10.1.0.1:4647","10.1.0.2:4647","10.1.0.3:4647"] 1360 1361 Under normal operation, the peers.json file will not be present. 1362 1363 When Nomad starts for the first time, it will create this peers.info file and 1364 delete any existing peers.json file so that recovery doesn't occur on the first 1365 startup. 1366 1367 Once this peers.info file is present, any peers.json file will be ingested at 1368 startup, and will set the Raft peer configuration manually to recover from an 1369 outage. It's crucial that all servers in the cluster are shut down before 1370 creating the peers.json file, and that all servers receive the same 1371 configuration. Once the peers.json file is successfully ingested and applied, it 1372 will be deleted. 1373 1374 Please see https://www.nomadproject.io/guides/outage.html for more information. 1375 `