// github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/server.go

package nomad

import (
	"crypto/tls"
	"errors"
	"fmt"
	"io/ioutil"
	"log"
	"net"
	"net/rpc"
	"os"
	"path/filepath"
	"reflect"
	"sort"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/tlsutil"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-boltdb"
	"github.com/hashicorp/serf/serf"
)

const (
	// datacenterQueryLimit sets the max number of DCs that a Nomad
	// Server will query to find bootstrap_expect servers.
	datacenterQueryLimit = 25

	// maxStaleLeadership is the maximum time we will permit this Nomad
	// Server to go without seeing a valid Raft leader.
	maxStaleLeadership = 15 * time.Second

	// peersPollInterval is used as the polling interval between attempts
	// to query Consul for Nomad Servers.
	peersPollInterval = 45 * time.Second

	// peersPollJitterFactor is used to provide a slight amount of variance to
	// the retry interval when querying Consul Servers.
	peersPollJitterFactor = 2

	raftState         = "raft/"
	serfSnapshot      = "serf/snapshot"
	snapshotsRetained = 2

	// serverRPCCache controls how long we keep an idle connection open to a server
	serverRPCCache = 2 * time.Minute

	// serverMaxStreams controls how many idle streams we keep open to a server
	serverMaxStreams = 64

	// raftLogCacheSize is the maximum number of logs to cache in-memory.
	// This is used to reduce disk I/O for the recently committed entries.
	raftLogCacheSize = 512

	// raftRemoveGracePeriod is how long we wait to allow a RemovePeer
	// to replicate to gracefully leave the cluster.
	raftRemoveGracePeriod = 5 * time.Second
)

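// The Consul fallback handler below retries on a jittered schedule: the base
// peersPollInterval plus a random stagger of up to half that interval
// (peersPollInterval / peersPollJitterFactor). The expression is repeated
// inline throughout bootstrapFn; this illustrative helper is not part of the
// original file and only spells the computation out in one place.
func examplePeersPollWait() time.Duration {
	// lib.RandomStagger returns a random duration in [0, interval).
	return peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
}
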
// Server is the Nomad server which manages the job queues,
// schedulers, and notification bus for agents.
type Server struct {
	config *Config
	logger *log.Logger

	// Connection pool to other Nomad servers
	connPool *ConnPool

	// Endpoints holds our RPC endpoints
	endpoints endpoints

	// The raft instance is used among Nomad nodes within the
	// region to protect operations that require strong consistency
	leaderCh      <-chan bool
	raft          *raft.Raft
	raftLayer     *RaftLayer
	raftStore     *raftboltdb.BoltStore
	raftInmem     *raft.InmemStore
	raftTransport *raft.NetworkTransport

	// fsm is the state machine used with Raft
	fsm *nomadFSM

	// rpcListener is used to listen for incoming connections
	rpcListener  net.Listener
	rpcServer    *rpc.Server
	rpcAdvertise net.Addr

	// rpcTLS is the TLS config for incoming TLS requests
	rpcTLS *tls.Config

	// peers is used to track the known Nomad servers. This is
	// used for region forwarding and clustering.
	peers      map[string][]*serverParts
	localPeers map[raft.ServerAddress]*serverParts
	peerLock   sync.RWMutex

	// serf is the Serf cluster containing only Nomad
	// servers. This is used for multi-region federation
	// and automatic clustering within regions.
	serf *serf.Serf

	// reconcileCh is used to pass events from the serf handler
	// into the leader manager. Mostly used to handle when servers
	// join/leave from the region.
	reconcileCh chan serf.Member

	// eventCh is used to receive events from the serf cluster
	eventCh chan serf.Event

	// evalBroker is used to manage the in-progress evaluations
	// that are waiting to be brokered to a sub-scheduler
	evalBroker *EvalBroker

	// blockedEvals is used to manage evaluations that are blocked on node
	// capacity changes.
	blockedEvals *BlockedEvals

	// planQueue is used to manage the submitted allocation
	// plans that are waiting to be assessed by the leader
	planQueue *PlanQueue

	// periodicDispatcher is used to track and create evaluations for periodic jobs.
	periodicDispatcher *PeriodicDispatch

	// heartbeatTimers track the expiration time of each heartbeat that has
	// a TTL. On expiration, the node status is updated to be 'down'.
	heartbeatTimers     map[string]*time.Timer
	heartbeatTimersLock sync.Mutex

	// consulSyncer advertises this Nomad Agent with Consul
	consulSyncer *consul.Syncer

	// vault is the client for communicating with Vault.
	vault VaultClient

	// Workers used for processing
	workers []*Worker

	left         bool
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex
}

// Holds the RPC endpoints
type endpoints struct {
	Status   *Status
	Node     *Node
	Job      *Job
	Eval     *Eval
	Plan     *Plan
	Alloc    *Alloc
	Region   *Region
	Periodic *Periodic
	System   *System
	Operator *Operator
}

// NewServer is used to construct a new Nomad server from the
// configuration, potentially returning an error
func NewServer(config *Config, consulSyncer *consul.Syncer, logger *log.Logger) (*Server, error) {
	// Check the protocol version
	if err := config.CheckVersion(); err != nil {
		return nil, err
	}

	// Create an eval broker
	evalBroker, err := NewEvalBroker(config.EvalNackTimeout, config.EvalDeliveryLimit)
	if err != nil {
		return nil, err
	}

	// Create a new blocked eval tracker.
	blockedEvals := NewBlockedEvals(evalBroker)

	// Create a plan queue
	planQueue, err := NewPlanQueue()
	if err != nil {
		return nil, err
	}

	// Configure TLS
	var tlsWrap tlsutil.RegionWrapper
	var incomingTLS *tls.Config
	if config.TLSConfig.EnableRPC {
		tlsConf := config.tlsConfig()
		tw, err := tlsConf.OutgoingTLSWrapper()
		if err != nil {
			return nil, err
		}
		tlsWrap = tw

		itls, err := tlsConf.IncomingTLSConfig()
		if err != nil {
			return nil, err
		}
		incomingTLS = itls
	}

	// Create the server
	s := &Server{
		config:       config,
		consulSyncer: consulSyncer,
		connPool:     NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap),
		logger:       logger,
		rpcServer:    rpc.NewServer(),
		peers:        make(map[string][]*serverParts),
		localPeers:   make(map[raft.ServerAddress]*serverParts),
		reconcileCh:  make(chan serf.Member, 32),
		eventCh:      make(chan serf.Event, 256),
		evalBroker:   evalBroker,
		blockedEvals: blockedEvals,
		planQueue:    planQueue,
		rpcTLS:       incomingTLS,
		shutdownCh:   make(chan struct{}),
	}

	// Create the periodic dispatcher for launching periodic jobs.
	s.periodicDispatcher = NewPeriodicDispatch(s.logger, s)

	// Setup Vault
	if err := s.setupVaultClient(); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to setup Vault client: %v", err)
		return nil, fmt.Errorf("Failed to setup Vault client: %v", err)
	}

	// Initialize the RPC layer
	if err := s.setupRPC(tlsWrap); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err)
		return nil, fmt.Errorf("Failed to start RPC layer: %v", err)
	}

	// Initialize the Raft server
	if err := s.setupRaft(); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start Raft: %s", err)
		return nil, fmt.Errorf("Failed to start Raft: %v", err)
	}

	// Initialize the wan Serf
	s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot)
	if err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err)
		return nil, fmt.Errorf("Failed to start serf: %v", err)
	}

	// Initialize the scheduling workers
	if err := s.setupWorkers(); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start workers: %s", err)
		return nil, fmt.Errorf("Failed to start workers: %v", err)
	}

	// Setup the Consul syncer
	if err := s.setupConsulSyncer(); err != nil {
		return nil, fmt.Errorf("failed to create server Consul syncer: %v", err)
	}

	// Monitor leadership changes
	go s.monitorLeadership()

	// Start ingesting events for Serf
	go s.serfEventHandler()

	// Start the RPC listeners
	go s.listen()

	// Emit metrics for the eval broker
	go evalBroker.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the plan queue
	go planQueue.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the blocked eval tracker.
	go blockedEvals.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the Vault client.
	go s.vault.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics
	go s.heartbeatStats()

	// Done
	return s, nil
}

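// A sketch of how an embedding agent might construct a server. This is
// illustrative only and not part of the original file; it assumes the
// DefaultConfig helper from config.go and a pre-built consul.Syncer.
func exampleNewServer(syncer *consul.Syncer) (*Server, error) {
	config := DefaultConfig()
	config.DevMode = true // in-memory Raft stores, no data directory required
	logger := log.New(os.Stderr, "", log.LstdFlags)
	return NewServer(config, syncer, logger)
}
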
// Shutdown is used to shutdown the server
func (s *Server) Shutdown() error {
	s.logger.Printf("[INFO] nomad: shutting down server")
	s.shutdownLock.Lock()
	defer s.shutdownLock.Unlock()

	if s.shutdown {
		return nil
	}

	s.shutdown = true
	close(s.shutdownCh)

	if s.serf != nil {
		s.serf.Shutdown()
	}

	if s.raft != nil {
		s.raftTransport.Close()
		s.raftLayer.Close()
		future := s.raft.Shutdown()
		if err := future.Error(); err != nil {
			s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err)
		}
		if s.raftStore != nil {
			s.raftStore.Close()
		}
	}

	// Shutdown the RPC listener
	if s.rpcListener != nil {
		s.rpcListener.Close()
	}

	// Close the connection pool
	s.connPool.Shutdown()

	// Close the fsm
	if s.fsm != nil {
		s.fsm.Close()
	}

	// Stop Vault token renewal
	if s.vault != nil {
		s.vault.Stop()
	}

	return nil
}

// IsShutdown checks if the server is shutdown
func (s *Server) IsShutdown() bool {
	select {
	case <-s.shutdownCh:
		return true
	default:
		return false
	}
}

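// Leave (below) and Shutdown are intended to be used together for a graceful
// stop: announce the leave intention first, then tear the server down. This
// is an illustrative sketch of a caller, not part of the original file; the
// real sequencing lives in the embedding agent.
func exampleGracefulStop(s *Server) error {
	if err := s.Leave(); err != nil {
		// Leaving is best-effort; log and continue with the shutdown.
		s.logger.Printf("[WARN] nomad: error during leave: %v", err)
	}
	return s.Shutdown()
}
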
// Leave is used to prepare for a graceful shutdown of the server
func (s *Server) Leave() error {
	s.logger.Printf("[INFO] nomad: server starting leave")
	s.left = true

	// Check the number of known peers
	numPeers, err := s.numPeers()
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err)
		return err
	}

	// TODO (alexdadgar) - This will need to be updated once we support node
	// IDs.
	addr := s.raftTransport.LocalAddr()

	// If we are the current leader, and we have any other peers (cluster has multiple
	// servers), we should do a RemovePeer to safely reduce the quorum size. If we are
	// not the leader, then we should issue our leave intention and wait to be removed
	// for some sane period of time.
	isLeader := s.IsLeader()
	if isLeader && numPeers > 1 {
		future := s.raft.RemovePeer(addr)
		if err := future.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
		}
	}

	// Leave the gossip pool
	if s.serf != nil {
		if err := s.serf.Leave(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err)
		}
	}

	// If we were not leader, wait to be safely removed from the cluster.
	// We must wait to allow the raft replication to take place, otherwise
	// an immediate shutdown could cause a loss of quorum.
	if !isLeader {
		left := false
		limit := time.Now().Add(raftRemoveGracePeriod)
		for !left && time.Now().Before(limit) {
			// Sleep a while before we check.
			time.Sleep(50 * time.Millisecond)

			// Get the latest configuration.
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
				break
			}

			// See if we are no longer included.
			left = true
			for _, server := range future.Configuration().Servers {
				if server.Address == addr {
					left = false
					break
				}
			}
		}

		// TODO (alexdadgar) With the old Raft library we used to force the
		// peers set to empty when a graceful leave occurred. This would
		// keep voting spam down if the server was restarted, but it was
		// dangerous because the peer set was inconsistent with the logs and
		// snapshots, so it wasn't really safe in all cases for the server
		// to become leader. This is now safe, but the log spam is noisy.
		// The next new version of the library will have a "you are not a
		// peer, stop it" behavior that should address this. We will have
		// to evaluate during the RC period whether this interim situation is
		// too confusing for operators.

		// TODO (alexdadgar) When we take a later new version of the Raft
		// library it won't try to complete replication, so this peer
		// may not realize that it has been removed. Need to revisit this
		// and the warning here.
		if !left {
			s.logger.Printf("[WARN] nomad: failed to leave raft configuration gracefully, timeout")
		}
	}
	return nil
}

// Reload handles a config reload. Not all config fields can handle a reload.
func (s *Server) Reload(config *Config) error {
	if config == nil {
		return fmt.Errorf("Reload given a nil config")
	}

	var mErr multierror.Error

	// Handle the Vault reload. Vault should never be nil but just guard.
	if s.vault != nil {
		if err := s.vault.SetConfig(config.VaultConfig); err != nil {
			multierror.Append(&mErr, err)
		}
	}

	return mErr.ErrorOrNil()
}

// setupBootstrapHandler() creates the closure necessary to support a Consul
// fallback handler.
func (s *Server) setupBootstrapHandler() error {
	// peersTimeout is used to indicate to the Consul Syncer that the
	// current Nomad Server has a stale peer set. peersTimeout will time
	// out if the Consul Syncer bootstrapFn has not observed a Raft
	// leader in maxStaleLeadership. If peersTimeout has been triggered,
	// the Consul Syncer will begin querying Consul for other Nomad
	// Servers.
	//
	// NOTE: time.Timer is used vs time.Time in order to handle clock
	// drift because time.Timer is implemented as a monotonic clock.
	var peersTimeout *time.Timer = time.NewTimer(0)

	// consulQueryCount is the number of times the bootstrapFn has been
	// called, regardless of success.
	var consulQueryCount uint64

	// leadershipTimedOut is a helper method that returns true if the
	// peersTimeout timer has expired.
	leadershipTimedOut := func() bool {
		select {
		case <-peersTimeout.C:
			return true
		default:
			return false
		}
	}

	// The bootstrapFn callback handler is used to periodically poll
	// Consul to look up the Nomad Servers in Consul. In the event the
	// server has been brought up without a `retry-join` configuration
	// and this Server is partitioned from the rest of the cluster,
	// periodically poll Consul to reattach this Server to other servers
	// in the same region and automatically reform a quorum (assuming the
	// correct number of servers required for quorum are present).
	bootstrapFn := func() error {
		// If there is a raft leader, do nothing
		if s.raft.Leader() != "" {
			peersTimeout.Reset(maxStaleLeadership)
			return nil
		}

		// (ab)use serf.go's behavior of setting BootstrapExpect to
		// zero once we have bootstrapped.
		bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
		if bootstrapExpect == 0 {
			// This Nomad Server has been bootstrapped. Rely on
			// the peersTimeout firing as a guard to prevent
			// aggressive querying of Consul.
			if !leadershipTimedOut() {
				return nil
			}
		} else {
			if consulQueryCount > 0 && !leadershipTimedOut() {
				return nil
			}

			// This Nomad Server has not been bootstrapped, reach
			// out to Consul if our peer list is less than
			// `bootstrap_expect`.
			raftPeers, err := s.numPeers()
			if err != nil {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}

			// The necessary number of Nomad Servers required for
			// quorum has been reached, we do not need to poll
			// Consul. Let the normal timeout-based strategy
			// take over.
			if raftPeers >= int(bootstrapExpect) {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}
		}
		consulQueryCount++

		s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list")

		consulCatalog := s.consulSyncer.ConsulClient().Catalog()
		dcs, err := consulCatalog.Datacenters()
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("server.nomad: unable to query Consul datacenters: %v", err)
		}
		if len(dcs) > 2 {
			// Query the local DC first, then shuffle the
			// remaining DCs. If additional calls to bootstrapFn
			// are necessary, this Nomad Server will eventually
			// walk all datacenters until it finds enough hosts to
			// form a quorum.
			shuffleStrings(dcs[1:])
			dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
		}

		nomadServerServiceName := s.config.ConsulConfig.ServerServiceName
		var mErr multierror.Error
		const defaultMaxNumNomadServers = 8
		nomadServerServices := make([]string, 0, defaultMaxNumNomadServers)
		localNode := s.serf.Memberlist().LocalNode()
		for _, dc := range dcs {
			consulOpts := &consulapi.QueryOptions{
				AllowStale: true,
				Datacenter: dc,
				Near:       "_agent",
				WaitTime:   consul.DefaultQueryWaitDuration,
			}
			consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts)
			if err != nil {
				err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err)
				s.logger.Printf("[WARN] server.nomad: %v", err)
				mErr.Errors = append(mErr.Errors, err)
				continue
			}

			for _, cs := range consulServices {
				port := strconv.FormatInt(int64(cs.ServicePort), 10)
				addr := cs.ServiceAddress
				if addr == "" {
					addr = cs.Address
				}
				if localNode.Addr.String() == addr && int(localNode.Port) == cs.ServicePort {
					continue
				}
				serverAddr := net.JoinHostPort(addr, port)
				nomadServerServices = append(nomadServerServices, serverAddr)
			}
		}

		if len(nomadServerServices) == 0 {
			if len(mErr.Errors) > 0 {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return mErr.ErrorOrNil()
			}

			// Log the error and return nil so future handlers
			// can attempt to register the `nomad` service.
			pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
			s.logger.Printf("[TRACE] server.nomad: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval)
			peersTimeout.Reset(pollInterval)
			return nil
		}

		numServersContacted, err := s.Join(nomadServerServices)
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("contacted %d Nomad Servers: %v", numServersContacted, err)
		}

		peersTimeout.Reset(maxStaleLeadership)
		s.logger.Printf("[INFO] server.nomad: successfully contacted %d Nomad Servers", numServersContacted)

		return nil
	}

	s.consulSyncer.AddPeriodicHandler("Nomad Server Fallback Server Handler", bootstrapFn)
	return nil
}

// setupConsulSyncer creates Server-mode consul.Syncer which periodically
// executes callbacks on a fixed interval.
func (s *Server) setupConsulSyncer() error {
	if s.config.ConsulConfig.ServerAutoJoin != nil && *s.config.ConsulConfig.ServerAutoJoin {
		if err := s.setupBootstrapHandler(); err != nil {
			return err
		}
	}

	return nil
}

// setupVaultClient is used to set up the Vault API client.
func (s *Server) setupVaultClient() error {
	v, err := NewVaultClient(s.config.VaultConfig, s.logger, s.purgeVaultAccessors)
	if err != nil {
		return err
	}
	s.vault = v
	return nil
}

// setupRPC is used to setup the RPC listener
func (s *Server) setupRPC(tlsWrap tlsutil.RegionWrapper) error {
	// Create endpoints
	s.endpoints.Alloc = &Alloc{s}
	s.endpoints.Eval = &Eval{s}
	s.endpoints.Job = &Job{s}
	s.endpoints.Node = &Node{srv: s}
	s.endpoints.Operator = &Operator{s}
	s.endpoints.Periodic = &Periodic{s}
	s.endpoints.Plan = &Plan{s}
	s.endpoints.Region = &Region{s}
	s.endpoints.Status = &Status{s}
	s.endpoints.System = &System{s}

	// Register the handlers
	s.rpcServer.Register(s.endpoints.Alloc)
	s.rpcServer.Register(s.endpoints.Eval)
	s.rpcServer.Register(s.endpoints.Job)
	s.rpcServer.Register(s.endpoints.Node)
	s.rpcServer.Register(s.endpoints.Operator)
	s.rpcServer.Register(s.endpoints.Periodic)
	s.rpcServer.Register(s.endpoints.Plan)
	s.rpcServer.Register(s.endpoints.Region)
	s.rpcServer.Register(s.endpoints.Status)
	s.rpcServer.Register(s.endpoints.System)

	list, err := net.ListenTCP("tcp", s.config.RPCAddr)
	if err != nil {
		return err
	}
	s.rpcListener = list

	if s.config.RPCAdvertise != nil {
		s.rpcAdvertise = s.config.RPCAdvertise
	} else {
		s.rpcAdvertise = s.rpcListener.Addr()
	}

	// Verify that we have a usable advertise address
	addr, ok := s.rpcAdvertise.(*net.TCPAddr)
	if !ok {
		list.Close()
		return fmt.Errorf("RPC advertise address is not a TCP Address: %v", addr)
	}
	if addr.IP.IsUnspecified() {
		list.Close()
		return fmt.Errorf("RPC advertise address is not advertisable: %v", addr)
	}

	wrapper := tlsutil.RegionSpecificWrapper(s.config.Region, tlsWrap)
	s.raftLayer = NewRaftLayer(s.rpcAdvertise, wrapper)
	return nil
}

// setupRaft is used to setup and initialize Raft
func (s *Server) setupRaft() error {
	// If we have an unclean exit then attempt to close the Raft store.
	defer func() {
		if s.raft == nil && s.raftStore != nil {
			if err := s.raftStore.Close(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to close Raft store: %v", err)
			}
		}
	}()

	// Create the FSM
	var err error
	s.fsm, err = NewFSM(s.evalBroker, s.periodicDispatcher, s.blockedEvals, s.config.LogOutput)
	if err != nil {
		return err
	}

	// Create a transport layer
	trans := raft.NewNetworkTransport(s.raftLayer, 3, s.config.RaftTimeout,
		s.config.LogOutput)
	s.raftTransport = trans

	// Make sure we set the LogOutput.
	s.config.RaftConfig.LogOutput = s.config.LogOutput

	// Our version of Raft protocol requires the LocalID to match the network
	// address of the transport.
	s.config.RaftConfig.LocalID = raft.ServerID(trans.LocalAddr())

	// Build an all in-memory setup for dev mode, otherwise prepare a full
	// disk-based setup.
	var log raft.LogStore
	var stable raft.StableStore
	var snap raft.SnapshotStore
	if s.config.DevMode {
		store := raft.NewInmemStore()
		s.raftInmem = store
		stable = store
		log = store
		snap = raft.NewDiscardSnapshotStore()

	} else {
		// Create the base raft path
		path := filepath.Join(s.config.DataDir, raftState)
		if err := ensurePath(path, true); err != nil {
			return err
		}

		// Create the BoltDB backend
		store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
		if err != nil {
			return err
		}
		s.raftStore = store
		stable = store

		// Wrap the store in a LogCache to improve performance
		cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
		if err != nil {
			store.Close()
			return err
		}
		log = cacheStore

		// Create the snapshot store
		snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput)
		if err != nil {
			if s.raftStore != nil {
				s.raftStore.Close()
			}
			return err
		}
		snap = snapshots

		// For an existing cluster being upgraded to the new version of
		// Raft, we almost never want to run recovery based on the old
		// peers.json file. We create a peers.info file with a helpful
		// note about where peers.json went, and use that as a sentinel
		// to avoid ingesting the old one that first time (if we have to
		// create the peers.info file because it's not there, we also
		// blow away any existing peers.json file).
		peersFile := filepath.Join(path, "peers.json")
		peersInfoFile := filepath.Join(path, "peers.info")
		if _, err := os.Stat(peersInfoFile); os.IsNotExist(err) {
			if err := ioutil.WriteFile(peersInfoFile, []byte(peersInfoContent), 0755); err != nil {
				return fmt.Errorf("failed to write peers.info file: %v", err)
			}

			// Blow away the peers.json file if present, since the
			// peers.info sentinel wasn't there.
			if _, err := os.Stat(peersFile); err == nil {
				if err := os.Remove(peersFile); err != nil {
					return fmt.Errorf("failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
				}
				s.logger.Printf("[INFO] nomad: deleted peers.json file (see peers.info for details)")
			}
		} else if _, err := os.Stat(peersFile); err == nil {
			s.logger.Printf("[INFO] nomad: found peers.json file, recovering Raft configuration...")
			configuration, err := raft.ReadPeersJSON(peersFile)
			if err != nil {
				return fmt.Errorf("recovery failed to parse peers.json: %v", err)
			}
			tmpFsm, err := NewFSM(s.evalBroker, s.periodicDispatcher, s.blockedEvals, s.config.LogOutput)
			if err != nil {
				return fmt.Errorf("recovery failed to make temp FSM: %v", err)
			}
			if err := raft.RecoverCluster(s.config.RaftConfig, tmpFsm,
				log, stable, snap, trans, configuration); err != nil {
				return fmt.Errorf("recovery failed: %v", err)
			}
			if err := os.Remove(peersFile); err != nil {
				return fmt.Errorf("recovery failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
			}
			s.logger.Printf("[INFO] nomad: deleted peers.json file after successful recovery")
		}
	}

	// If we are in bootstrap or dev mode and the state is clean then we can
	// bootstrap now.
	if s.config.Bootstrap || s.config.DevMode {
		hasState, err := raft.HasExistingState(log, stable, snap)
		if err != nil {
			return err
		}
		if !hasState {
			// TODO (alexdadgar) - This will need to be updated when
			// we add support for node IDs.
			configuration := raft.Configuration{
				Servers: []raft.Server{
					raft.Server{
						ID:      raft.ServerID(trans.LocalAddr()),
						Address: trans.LocalAddr(),
					},
				},
			}
			if err := raft.BootstrapCluster(s.config.RaftConfig,
				log, stable, snap, trans, configuration); err != nil {
				return err
			}
		}
	}

	// Setup the leader channel
	leaderCh := make(chan bool, 1)
	s.config.RaftConfig.NotifyCh = leaderCh
	s.leaderCh = leaderCh

	// Setup the Raft store
	s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable, snap, trans)
	if err != nil {
		return err
	}
	return nil
}

// setupSerf is used to setup and initialize a Serf
func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) {
	conf.Init()
	conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region)
	conf.Tags["role"] = "nomad"
	conf.Tags["region"] = s.config.Region
	conf.Tags["dc"] = s.config.Datacenter
	conf.Tags["vsn"] = fmt.Sprintf("%d", structs.ApiMajorVersion)
	conf.Tags["mvn"] = fmt.Sprintf("%d", structs.ApiMinorVersion)
	conf.Tags["build"] = s.config.Build
	conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port)
	if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) {
		conf.Tags["bootstrap"] = "1"
	}
	bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
	if bootstrapExpect != 0 {
		conf.Tags["expect"] = fmt.Sprintf("%d", bootstrapExpect)
	}
	conf.MemberlistConfig.LogOutput = s.config.LogOutput
	conf.LogOutput = s.config.LogOutput
	conf.EventCh = ch
	if !s.config.DevMode {
		conf.SnapshotPath = filepath.Join(s.config.DataDir, path)
		if err := ensurePath(conf.SnapshotPath, false); err != nil {
			return nil, err
		}
	}
	conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion]
	conf.RejoinAfterLeave = true
	conf.Merge = &serfMergeDelegate{}

	// Until Nomad supports this fully, we disable automatic resolution.
	// When enabled, the Serf gossip may just turn off if we are the minority
	// node which is rather unexpected.
	conf.EnableNameConflictResolution = false
	return serf.Create(conf)
}

// setupWorkers is used to start the scheduling workers
func (s *Server) setupWorkers() error {
	// Check if all the schedulers are disabled
	if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 {
		s.logger.Printf("[WARN] nomad: no enabled schedulers")
		return nil
	}

	// Start the workers
	for i := 0; i < s.config.NumSchedulers; i++ {
		if w, err := NewWorker(s); err != nil {
			return err
		} else {
			s.workers = append(s.workers, w)
		}
	}
	s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v",
		s.config.NumSchedulers, s.config.EnabledSchedulers)
	return nil
}

// numPeers is used to check on the number of known peers, including the local
// node.
func (s *Server) numPeers() (int, error) {
	future := s.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		return 0, err
	}
	configuration := future.Configuration()
	return len(configuration.Servers), nil
}

// IsLeader checks if this server is the cluster leader
func (s *Server) IsLeader() bool {
	return s.raft.State() == raft.Leader
}

// Join is used to have Nomad join the gossip ring.
// The target address should be another node listening on the
// Serf address.
func (s *Server) Join(addrs []string) (int, error) {
	return s.serf.Join(addrs, true)
}

// LocalMember is used to return the local node
func (s *Server) LocalMember() serf.Member {
	return s.serf.LocalMember()
}

// Members is used to return the members of the serf cluster
func (s *Server) Members() []serf.Member {
	return s.serf.Members()
}

// RemoveFailedNode is used to remove a failed node from the cluster
func (s *Server) RemoveFailedNode(node string) error {
	return s.serf.RemoveFailedNode(node)
}

// KeyManager returns the Serf keyring manager
func (s *Server) KeyManager() *serf.KeyManager {
	return s.serf.KeyManager()
}

// Encrypted determines if gossip is encrypted
func (s *Server) Encrypted() bool {
	return s.serf.EncryptionEnabled()
}

// State returns the underlying state store. This should *not*
// be used to modify state directly.
func (s *Server) State() *state.StateStore {
	return s.fsm.State()
}

// Regions returns the known regions in the cluster.
func (s *Server) Regions() []string {
	s.peerLock.RLock()
	defer s.peerLock.RUnlock()

	regions := make([]string, 0, len(s.peers))
	for region := range s.peers {
		regions = append(regions, region)
	}
	sort.Strings(regions)
	return regions
}

// inmemCodec is used to do an RPC call without going over a network
type inmemCodec struct {
	method string
	args   interface{}
	reply  interface{}
	err    error
}

func (i *inmemCodec) ReadRequestHeader(req *rpc.Request) error {
	req.ServiceMethod = i.method
	return nil
}

func (i *inmemCodec) ReadRequestBody(args interface{}) error {
	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.args)))
	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(args)))
	dst.Set(sourceValue)
	return nil
}

func (i *inmemCodec) WriteResponse(resp *rpc.Response, reply interface{}) error {
	if resp.Error != "" {
		i.err = errors.New(resp.Error)
		return nil
	}
	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(reply)))
	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.reply)))
	dst.Set(sourceValue)
	return nil
}

func (i *inmemCodec) Close() error {
	return nil
}

// RPC is used to make a local RPC call
func (s *Server) RPC(method string, args interface{}, reply interface{}) error {
	codec := &inmemCodec{
		method: method,
		args:   args,
		reply:  reply,
	}
	if err := s.rpcServer.ServeRequest(codec); err != nil {
		return err
	}
	return codec.err
}

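// The in-memory codec above lets server code call its own RPC endpoints
// without a network round trip. A minimal usage sketch, not part of the
// original file; it assumes the Status.Leader endpoint from
// status_endpoint.go takes a *structs.GenericRequest and replies with a
// *string, so adjust to the actual endpoint signatures.
func exampleLocalRPC(s *Server) (string, error) {
	var leader string
	args := &structs.GenericRequest{}
	if err := s.RPC("Status.Leader", args, &leader); err != nil {
		return "", err
	}
	return leader, nil
}
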
// Stats is used to return statistics for debugging and insight
// for various sub-systems
func (s *Server) Stats() map[string]map[string]string {
	toString := func(v uint64) string {
		return strconv.FormatUint(v, 10)
	}
	stats := map[string]map[string]string{
		"nomad": map[string]string{
			"server":        "true",
			"leader":        fmt.Sprintf("%v", s.IsLeader()),
			"leader_addr":   string(s.raft.Leader()),
			"bootstrap":     fmt.Sprintf("%v", s.config.Bootstrap),
			"known_regions": toString(uint64(len(s.peers))),
		},
		"raft":    s.raft.Stats(),
		"serf":    s.serf.Stats(),
		"runtime": RuntimeStats(),
	}

	return stats
}

// Region returns the region of the server
func (s *Server) Region() string {
	return s.config.Region
}

// Datacenter returns the data center of the server
func (s *Server) Datacenter() string {
	return s.config.Datacenter
}

// GetConfig returns the config of the server for testing purposes only
func (s *Server) GetConfig() *Config {
	return s.config
}

// peersInfoContent is used to help operators understand what happened to the
// peers.json file. This is written to a file called peers.info in the same
// location.
const peersInfoContent = `
As of Nomad 0.5.5, the peers.json file is only used for recovery
after an outage. It should be formatted as a JSON array containing the address
and port of each Nomad server in the cluster, like this:

["10.1.0.1:4647","10.1.0.2:4647","10.1.0.3:4647"]

Under normal operation, the peers.json file will not be present.

When Nomad starts for the first time, it will create this peers.info file and
delete any existing peers.json file so that recovery doesn't occur on the first
startup.

Once this peers.info file is present, any peers.json file will be ingested at
startup, and will set the Raft peer configuration manually to recover from an
outage. It's crucial that all servers in the cluster are shut down before
creating the peers.json file, and that all servers receive the same
configuration. Once the peers.json file is successfully ingested and applied, it
will be deleted.

Please see https://www.nomadproject.io/guides/outage.html for more information.
`