github.com/jrxfive/nomad@v0.6.1-0.20170802162750-1fef470e89bf/nomad/server.go

package nomad

import (
	"crypto/tls"
	"errors"
	"fmt"
	"io/ioutil"
	"log"
	"net"
	"net/rpc"
	"os"
	"path/filepath"
	"reflect"
	"sort"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/tlsutil"
	"github.com/hashicorp/nomad/nomad/deploymentwatcher"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-boltdb"
	"github.com/hashicorp/serf/serf"
)

const (
	// datacenterQueryLimit sets the max number of DCs that a Nomad
	// Server will query to find bootstrap_expect servers.
	datacenterQueryLimit = 25

	// maxStaleLeadership is the maximum time we will permit this Nomad
	// Server to go without seeing a valid Raft leader.
	maxStaleLeadership = 15 * time.Second

	// peersPollInterval is used as the polling interval between attempts
	// to query Consul for Nomad Servers.
	peersPollInterval = 45 * time.Second

	// peersPollJitterFactor is used to provide a slight amount of variance to
	// the retry interval when querying Consul Servers
	peersPollJitterFactor = 2

	raftState         = "raft/"
	serfSnapshot      = "serf/snapshot"
	snapshotsRetained = 2

	// serverRPCCache controls how long we keep an idle connection open to a server
	serverRPCCache = 2 * time.Minute

	// serverMaxStreams controls how many idle streams we keep open to a server
	serverMaxStreams = 64

	// raftLogCacheSize is the maximum number of logs to cache in-memory.
	// This is used to reduce disk I/O for the recently committed entries.
	raftLogCacheSize = 512

	// raftRemoveGracePeriod is how long we wait to allow a RemovePeer
	// to replicate to gracefully leave the cluster.
	raftRemoveGracePeriod = 5 * time.Second

	// defaultConsulDiscoveryInterval is how often to poll Consul for new
	// servers if there is no leader.
	defaultConsulDiscoveryInterval time.Duration = 3 * time.Second

	// defaultConsulDiscoveryIntervalRetry is how often to poll Consul for
	// new servers if there is no leader and the last Consul query failed.
	defaultConsulDiscoveryIntervalRetry time.Duration = 9 * time.Second
)

// Server is the Nomad server that manages the job queues,
// schedulers, and notification bus for agents.
type Server struct {
	config *Config
	logger *log.Logger

	// Connection pool to other Nomad servers
	connPool *ConnPool

	// Endpoints holds our RPC endpoints
	endpoints endpoints

	// The raft instance is used among Nomad nodes within the
	// region to protect operations that require strong consistency
	leaderCh      <-chan bool
	raft          *raft.Raft
	raftLayer     *RaftLayer
	raftStore     *raftboltdb.BoltStore
	raftInmem     *raft.InmemStore
	raftTransport *raft.NetworkTransport

	// fsm is the state machine used with Raft
	fsm *nomadFSM

	// rpcListener is used to listen for incoming connections
	rpcListener  net.Listener
	rpcServer    *rpc.Server
	rpcAdvertise net.Addr

	// rpcTLS is the TLS config for incoming TLS requests
	rpcTLS *tls.Config

	// peers is used to track the known Nomad servers. This is
	// used for region forwarding and clustering.
	peers      map[string][]*serverParts
	localPeers map[raft.ServerAddress]*serverParts
	peerLock   sync.RWMutex

	// serf is the Serf cluster containing only Nomad
	// servers. This is used for multi-region federation
	// and automatic clustering within regions.
	serf *serf.Serf

	// reconcileCh is used to pass events from the serf handler
	// into the leader manager. Mostly used to handle when servers
	// join/leave from the region.
	reconcileCh chan serf.Member

	// eventCh is used to receive events from the serf cluster
	eventCh chan serf.Event

	// blockedEvals is used to manage evaluations that are blocked on node
	// capacity changes.
	blockedEvals *BlockedEvals

	// deploymentWatcher is used to watch deployments and their allocations and
	// make the required calls to continue to transition the deployment.
	deploymentWatcher *deploymentwatcher.Watcher

	// evalBroker is used to manage the in-progress evaluations
	// that are waiting to be brokered to a sub-scheduler
	evalBroker *EvalBroker

	// periodicDispatcher is used to track and create evaluations for periodic jobs.
	periodicDispatcher *PeriodicDispatch

	// planQueue is used to manage the submitted allocation
	// plans that are waiting to be assessed by the leader
	planQueue *PlanQueue

	// heartbeatTimers track the expiration time of each heartbeat that has
	// a TTL. On expiration, the node status is updated to be 'down'.
	heartbeatTimers     map[string]*time.Timer
	heartbeatTimersLock sync.Mutex

	// consulCatalog is used for discovering other Nomad Servers via Consul
	consulCatalog consul.CatalogAPI

	// vault is the client for communicating with Vault.
	vault VaultClient

	// Workers used for processing evaluations
	workers []*Worker

	left         bool
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex
}

// endpoints holds the RPC endpoints
type endpoints struct {
	Status     *Status
	Node       *Node
	Job        *Job
	Eval       *Eval
	Plan       *Plan
	Alloc      *Alloc
	Deployment *Deployment
	Region     *Region
	Periodic   *Periodic
	System     *System
	Operator   *Operator
}

// NewServer is used to construct a new Nomad server from the
// configuration, potentially returning an error
func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logger) (*Server, error) {
	// Check the protocol version
	if err := config.CheckVersion(); err != nil {
		return nil, err
	}

	// Create an eval broker
	evalBroker, err := NewEvalBroker(
		config.EvalNackTimeout,
		config.EvalNackInitialReenqueueDelay,
		config.EvalNackSubsequentReenqueueDelay,
		config.EvalDeliveryLimit)
	if err != nil {
		return nil, err
	}

	// Create a new blocked eval tracker.
	blockedEvals := NewBlockedEvals(evalBroker)

	// Create a plan queue
	planQueue, err := NewPlanQueue()
	if err != nil {
		return nil, err
	}

	// Configure TLS
	var tlsWrap tlsutil.RegionWrapper
	var incomingTLS *tls.Config
	if config.TLSConfig.EnableRPC {
		tlsConf := config.tlsConfig()
		tw, err := tlsConf.OutgoingTLSWrapper()
		if err != nil {
			return nil, err
		}
		tlsWrap = tw

		itls, err := tlsConf.IncomingTLSConfig()
		if err != nil {
			return nil, err
		}
		incomingTLS = itls
	}

	// Create the server
	s := &Server{
		config:        config,
		consulCatalog: consulCatalog,
		connPool:      NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap),
		logger:        logger,
		rpcServer:     rpc.NewServer(),
		peers:         make(map[string][]*serverParts),
		localPeers:    make(map[raft.ServerAddress]*serverParts),
		reconcileCh:   make(chan serf.Member, 32),
		eventCh:       make(chan serf.Event, 256),
		evalBroker:    evalBroker,
		blockedEvals:  blockedEvals,
		planQueue:     planQueue,
		rpcTLS:        incomingTLS,
		shutdownCh:    make(chan struct{}),
	}

	// Create the periodic dispatcher for launching periodic jobs.
	s.periodicDispatcher = NewPeriodicDispatch(s.logger, s)

	// Setup Vault
	if err := s.setupVaultClient(); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to setup Vault client: %v", err)
		return nil, fmt.Errorf("Failed to setup Vault client: %v", err)
	}

	// Initialize the RPC layer
	if err := s.setupRPC(tlsWrap); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err)
		return nil, fmt.Errorf("Failed to start RPC layer: %v", err)
	}

	// Initialize the Raft server
	if err := s.setupRaft(); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start Raft: %s", err)
		return nil, fmt.Errorf("Failed to start Raft: %v", err)
	}

	// Initialize the wan Serf
	s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot)
	if err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err)
		return nil, fmt.Errorf("Failed to start serf: %v", err)
	}

	// Initialize the scheduling workers
	if err := s.setupWorkers(); err != nil {
		s.Shutdown()
		s.logger.Printf("[ERR] nomad: failed to start workers: %s", err)
		return nil, fmt.Errorf("Failed to start workers: %v", err)
	}

	// Setup the Consul syncer
	if err := s.setupConsulSyncer(); err != nil {
		return nil, fmt.Errorf("failed to create server Consul syncer: %v", err)
	}

	// Setup the deployment watcher.
	if err := s.setupDeploymentWatcher(); err != nil {
		return nil, fmt.Errorf("failed to create deployment watcher: %v", err)
	}

	// Monitor leadership changes
	go s.monitorLeadership()

	// Start ingesting events for Serf
	go s.serfEventHandler()

	// Start the RPC listeners
	go s.listen()

	// Emit metrics for the eval broker
	go evalBroker.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the plan queue
	go planQueue.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the blocked eval tracker.
	go blockedEvals.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the Vault client.
	go s.vault.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics
	go s.heartbeatStats()

	// Done
	return s, nil
}
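
// Example (illustrative sketch, not part of the original file): how an agent
// might construct and tear down a Server. The consulClient value and the use
// of DefaultConfig() here are assumptions for illustration; the real agent
// builds its Config and Consul catalog from its parsed configuration.
//
//	logger := log.New(os.Stderr, "", log.LstdFlags)
//	catalog := consulClient.Catalog() // assumed *consulapi.Client
//	srv, err := NewServer(DefaultConfig(), catalog, logger)
//	if err != nil {
//		log.Fatalf("failed to start Nomad server: %v", err)
//	}
//	defer srv.Shutdown()
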
// Shutdown is used to shutdown the server
func (s *Server) Shutdown() error {
	s.logger.Printf("[INFO] nomad: shutting down server")
	s.shutdownLock.Lock()
	defer s.shutdownLock.Unlock()

	if s.shutdown {
		return nil
	}

	s.shutdown = true
	close(s.shutdownCh)

	if s.serf != nil {
		s.serf.Shutdown()
	}

	if s.raft != nil {
		s.raftTransport.Close()
		s.raftLayer.Close()
		future := s.raft.Shutdown()
		if err := future.Error(); err != nil {
			s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err)
		}
		if s.raftStore != nil {
			s.raftStore.Close()
		}
	}

	// Shutdown the RPC listener
	if s.rpcListener != nil {
		s.rpcListener.Close()
	}

	// Close the connection pool
	s.connPool.Shutdown()

	// Close the fsm
	if s.fsm != nil {
		s.fsm.Close()
	}

	// Stop Vault token renewal
	if s.vault != nil {
		s.vault.Stop()
	}

	return nil
}

// IsShutdown checks if the server is shutdown
func (s *Server) IsShutdown() bool {
	select {
	case <-s.shutdownCh:
		return true
	default:
		return false
	}
}

// Leave is used to prepare for a graceful shutdown of the server
func (s *Server) Leave() error {
	s.logger.Printf("[INFO] nomad: server starting leave")
	s.left = true

	// Check the number of known peers
	numPeers, err := s.numPeers()
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err)
		return err
	}

	// TODO (alexdadgar) - This will need to be updated once we support node
	// IDs.
	addr := s.raftTransport.LocalAddr()

	// If we are the current leader, and we have any other peers (cluster has multiple
	// servers), we should do a RemovePeer to safely reduce the quorum size. If we are
	// not the leader, then we should issue our leave intention and wait to be removed
	// for some sane period of time.
	isLeader := s.IsLeader()
	if isLeader && numPeers > 1 {
		future := s.raft.RemovePeer(addr)
		if err := future.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
		}
	}

	// Leave the gossip pool
	if s.serf != nil {
		if err := s.serf.Leave(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err)
		}
	}

	// If we were not leader, wait to be safely removed from the cluster.
	// We must wait to allow the raft replication to take place, otherwise
	// an immediate shutdown could cause a loss of quorum.
	if !isLeader {
		left := false
		limit := time.Now().Add(raftRemoveGracePeriod)
		for !left && time.Now().Before(limit) {
			// Sleep a while before we check.
			time.Sleep(50 * time.Millisecond)

			// Get the latest configuration.
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
				break
			}

			// See if we are no longer included.
			left = true
			for _, server := range future.Configuration().Servers {
				if server.Address == addr {
					left = false
					break
				}
			}
		}

		// TODO (alexdadgar) With the old Raft library we used to force the
		// peers set to empty when a graceful leave occurred. This would
		// keep voting spam down if the server was restarted, but it was
		// dangerous because the peer set was inconsistent with the logs and
		// snapshots, so it wasn't really safe in all cases for the server
		// to become leader. This is now safe, but the log spam is noisy.
		// The next new version of the library will have a "you are not a
		// peer stop it" behavior that should address this. We will have
		// to evaluate during the RC period if this interim situation is
		// not too confusing for operators.

		// TODO (alexdadgar) When we take a later new version of the Raft
		// library it won't try to complete replication, so this peer
		// may not realize that it has been removed. Need to revisit this
		// and the warning here.
		if !left {
			s.logger.Printf("[WARN] nomad: failed to leave raft configuration gracefully, timeout")
		}
	}
	return nil
}

// Reload handles a config reload. Not all config fields can handle a reload.
func (s *Server) Reload(config *Config) error {
	if config == nil {
		return fmt.Errorf("Reload given a nil config")
	}

	var mErr multierror.Error

	// Handle the Vault reload. Vault should never be nil but just guard.
	if s.vault != nil {
		if err := s.vault.SetConfig(config.VaultConfig); err != nil {
			multierror.Append(&mErr, err)
		}
	}

	return mErr.ErrorOrNil()
}

// setupBootstrapHandler() creates the closure necessary to support a Consul
// fallback handler.
func (s *Server) setupBootstrapHandler() error {
	// peersTimeout is used to indicate to the Consul Syncer that the
	// current Nomad Server has a stale peer set. peersTimeout will time
	// out if the Consul Syncer bootstrapFn has not observed a Raft
	// leader in maxStaleLeadership. If peersTimeout has been triggered,
	// the Consul Syncer will begin querying Consul for other Nomad
	// Servers.
	//
	// NOTE: time.Timer is used vs time.Time in order to handle clock
	// drift because time.Timer is implemented as a monotonic clock.
	var peersTimeout *time.Timer = time.NewTimer(0)

	// consulQueryCount is the number of times the bootstrapFn has been
	// called, regardless of success.
	var consulQueryCount uint64

	// leadershipTimedOut is a helper method that returns true if the
	// peersTimeout timer has expired.
	leadershipTimedOut := func() bool {
		select {
		case <-peersTimeout.C:
			return true
		default:
			return false
		}
	}

	// The bootstrapFn callback handler is used to periodically poll
	// Consul to look up the Nomad Servers in Consul. In the event the
	// server has been brought up without a `retry-join` configuration
	// and this Server is partitioned from the rest of the cluster,
	// periodically poll Consul to reattach this Server to other servers
	// in the same region and automatically reform a quorum (assuming the
	// correct number of servers required for quorum are present).
	bootstrapFn := func() error {
		// If there is a raft leader, do nothing
		if s.raft.Leader() != "" {
			peersTimeout.Reset(maxStaleLeadership)
			return nil
		}

		// (ab)use serf.go's behavior of setting BootstrapExpect to
		// zero once we have bootstrapped.
		bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
		if bootstrapExpect == 0 {
			// This Nomad Server has been bootstrapped. Rely on
			// the peersTimeout firing as a guard to prevent
			// aggressive querying of Consul.
			if !leadershipTimedOut() {
				return nil
			}
		} else {
			if consulQueryCount > 0 && !leadershipTimedOut() {
				return nil
			}

			// This Nomad Server has not been bootstrapped, reach
			// out to Consul if our peer list is less than
			// `bootstrap_expect`.
			raftPeers, err := s.numPeers()
			if err != nil {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}

			// The necessary number of Nomad Servers required for
			// quorum has been reached, we do not need to poll
			// Consul. Let the normal timeout-based strategy
			// take over.
			if raftPeers >= int(bootstrapExpect) {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}
		}
		consulQueryCount++

		s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list")

		dcs, err := s.consulCatalog.Datacenters()
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("server.nomad: unable to query Consul datacenters: %v", err)
		}
		if len(dcs) > 2 {
			// Query the local DC first, then shuffle the
			// remaining DCs. If additional calls to bootstrapFn
			// are necessary, this Nomad Server will eventually
			// walk all datacenters until it finds enough hosts to
			// form a quorum.
			shuffleStrings(dcs[1:])
			dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
		}

		nomadServerServiceName := s.config.ConsulConfig.ServerServiceName
		var mErr multierror.Error
		const defaultMaxNumNomadServers = 8
		nomadServerServices := make([]string, 0, defaultMaxNumNomadServers)
		localNode := s.serf.Memberlist().LocalNode()
		for _, dc := range dcs {
			consulOpts := &consulapi.QueryOptions{
				AllowStale: true,
				Datacenter: dc,
				Near:       "_agent",
				WaitTime:   consul.DefaultQueryWaitDuration,
			}
			consulServices, _, err := s.consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts)
			if err != nil {
				err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err)
				s.logger.Printf("[WARN] server.nomad: %v", err)
				mErr.Errors = append(mErr.Errors, err)
				continue
			}

			for _, cs := range consulServices {
				port := strconv.FormatInt(int64(cs.ServicePort), 10)
				addr := cs.ServiceAddress
				if addr == "" {
					addr = cs.Address
				}
				if localNode.Addr.String() == addr && int(localNode.Port) == cs.ServicePort {
					continue
				}
				serverAddr := net.JoinHostPort(addr, port)
				nomadServerServices = append(nomadServerServices, serverAddr)
			}
		}

		if len(nomadServerServices) == 0 {
			if len(mErr.Errors) > 0 {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return mErr.ErrorOrNil()
			}

			// Log the error and return nil so future handlers
			// can attempt to register the `nomad` service.
			pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
			s.logger.Printf("[TRACE] server.nomad: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval)
			peersTimeout.Reset(pollInterval)
			return nil
		}

		numServersContacted, err := s.Join(nomadServerServices)
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("contacted %d Nomad Servers: %v", numServersContacted, err)
		}

		peersTimeout.Reset(maxStaleLeadership)
		s.logger.Printf("[INFO] server.nomad: successfully contacted %d Nomad Servers", numServersContacted)

		return nil
	}

	// Hacky replacement for old ConsulSyncer Periodic Handler.
	go func() {
		lastOk := true
		sync := time.NewTimer(0)
		for {
			select {
			case <-sync.C:
				d := defaultConsulDiscoveryInterval
				if err := bootstrapFn(); err != nil {
					// Only log if it worked last time
					if lastOk {
						lastOk = false
						s.logger.Printf("[ERR] consul: error looking up Nomad servers: %v", err)
					}
					d = defaultConsulDiscoveryIntervalRetry
				}
				sync.Reset(d)
			case <-s.shutdownCh:
				return
			}
		}
	}()
	return nil
}

// setupConsulSyncer sets up the server-mode Consul bootstrap fallback handler,
// which periodically executes callbacks on a fixed interval.
func (s *Server) setupConsulSyncer() error {
	if s.config.ConsulConfig.ServerAutoJoin != nil && *s.config.ConsulConfig.ServerAutoJoin {
		if err := s.setupBootstrapHandler(); err != nil {
			return err
		}
	}

	return nil
}
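
// Example (illustrative sketch, not part of the original file): the agent-side
// consul stanza that populates ConsulConfig above. The key names are
// assumptions based on the agent configuration of this era; the agent
// documentation is authoritative.
//
//	consul {
//	  # Service name the servers advertise under; feeds ServerServiceName.
//	  server_service_name = "nomad"
//	  # Enables the Consul bootstrap fallback handler installed above.
//	  server_auto_join = true
//	}
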
// setupDeploymentWatcher creates a deployment watcher that consumes the RPC
// endpoints for state information and makes transitions via Raft through a
// shim that provides the appropriate methods.
func (s *Server) setupDeploymentWatcher() error {

	// Create the shims
	stateShim := &deploymentWatcherStateShim{
		region:         s.Region(),
		evaluations:    s.endpoints.Job.Evaluations,
		allocations:    s.endpoints.Deployment.Allocations,
		list:           s.endpoints.Deployment.List,
		getDeployment:  s.endpoints.Deployment.GetDeployment,
		getJobVersions: s.endpoints.Job.GetJobVersions,
		getJob:         s.endpoints.Job.GetJob,
	}
	raftShim := &deploymentWatcherRaftShim{
		apply: s.raftApply,
	}

	// Create the deployment watcher
	s.deploymentWatcher = deploymentwatcher.NewDeploymentsWatcher(
		s.logger, stateShim, raftShim,
		deploymentwatcher.LimitStateQueriesPerSecond,
		deploymentwatcher.CrossDeploymentEvalBatchDuration)

	return nil
}

// setupVaultClient is used to set up the Vault API client.
func (s *Server) setupVaultClient() error {
	v, err := NewVaultClient(s.config.VaultConfig, s.logger, s.purgeVaultAccessors)
	if err != nil {
		return err
	}
	s.vault = v
	return nil
}

// setupRPC is used to setup the RPC listener
func (s *Server) setupRPC(tlsWrap tlsutil.RegionWrapper) error {
	// Create endpoints
	s.endpoints.Alloc = &Alloc{s}
	s.endpoints.Eval = &Eval{s}
	s.endpoints.Job = &Job{s}
	s.endpoints.Node = &Node{srv: s}
	s.endpoints.Deployment = &Deployment{srv: s}
	s.endpoints.Operator = &Operator{s}
	s.endpoints.Periodic = &Periodic{s}
	s.endpoints.Plan = &Plan{s}
	s.endpoints.Region = &Region{s}
	s.endpoints.Status = &Status{s}
	s.endpoints.System = &System{s}

	// Register the handlers
	s.rpcServer.Register(s.endpoints.Alloc)
	s.rpcServer.Register(s.endpoints.Eval)
	s.rpcServer.Register(s.endpoints.Job)
	s.rpcServer.Register(s.endpoints.Node)
	s.rpcServer.Register(s.endpoints.Deployment)
	s.rpcServer.Register(s.endpoints.Operator)
	s.rpcServer.Register(s.endpoints.Periodic)
	s.rpcServer.Register(s.endpoints.Plan)
	s.rpcServer.Register(s.endpoints.Region)
	s.rpcServer.Register(s.endpoints.Status)
	s.rpcServer.Register(s.endpoints.System)

	list, err := net.ListenTCP("tcp", s.config.RPCAddr)
	if err != nil {
		return err
	}
	s.rpcListener = list

	if s.config.RPCAdvertise != nil {
		s.rpcAdvertise = s.config.RPCAdvertise
	} else {
		s.rpcAdvertise = s.rpcListener.Addr()
	}

	// Verify that we have a usable advertise address
	addr, ok := s.rpcAdvertise.(*net.TCPAddr)
	if !ok {
		list.Close()
		return fmt.Errorf("RPC advertise address is not a TCP Address: %v", addr)
	}
	if addr.IP.IsUnspecified() {
		list.Close()
		return fmt.Errorf("RPC advertise address is not advertisable: %v", addr)
	}

	wrapper := tlsutil.RegionSpecificWrapper(s.config.Region, tlsWrap)
	s.raftLayer = NewRaftLayer(s.rpcAdvertise, wrapper)
	return nil
}

// setupRaft is used to setup and initialize Raft
func (s *Server) setupRaft() error {
	// If we have an unclean exit then attempt to close the Raft store.
	defer func() {
		if s.raft == nil && s.raftStore != nil {
			if err := s.raftStore.Close(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to close Raft store: %v", err)
			}
		}
	}()

	// Create the FSM
	var err error
	s.fsm, err = NewFSM(s.evalBroker, s.periodicDispatcher, s.blockedEvals, s.config.LogOutput)
	if err != nil {
		return err
	}

	// Create a transport layer
	trans := raft.NewNetworkTransport(s.raftLayer, 3, s.config.RaftTimeout,
		s.config.LogOutput)
	s.raftTransport = trans

	// Make sure we set the LogOutput.
	s.config.RaftConfig.LogOutput = s.config.LogOutput

	// Our version of Raft protocol requires the LocalID to match the network
	// address of the transport.
	s.config.RaftConfig.LocalID = raft.ServerID(trans.LocalAddr())

	// Build an all in-memory setup for dev mode, otherwise prepare a full
	// disk-based setup.
	var log raft.LogStore
	var stable raft.StableStore
	var snap raft.SnapshotStore
	if s.config.DevMode {
		store := raft.NewInmemStore()
		s.raftInmem = store
		stable = store
		log = store
		snap = raft.NewDiscardSnapshotStore()

	} else {
		// Create the base raft path
		path := filepath.Join(s.config.DataDir, raftState)
		if err := ensurePath(path, true); err != nil {
			return err
		}

		// Create the BoltDB backend
		store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
		if err != nil {
			return err
		}
		s.raftStore = store
		stable = store

		// Wrap the store in a LogCache to improve performance
		cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
		if err != nil {
			store.Close()
			return err
		}
		log = cacheStore

		// Create the snapshot store
		snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput)
		if err != nil {
			if s.raftStore != nil {
				s.raftStore.Close()
			}
			return err
		}
		snap = snapshots

		// For an existing cluster being upgraded to the new version of
		// Raft, we almost never want to run recovery based on the old
		// peers.json file. We create a peers.info file with a helpful
		// note about where peers.json went, and use that as a sentinel
		// to avoid ingesting the old one that first time (if we have to
		// create the peers.info file because it's not there, we also
		// blow away any existing peers.json file).
		peersFile := filepath.Join(path, "peers.json")
		peersInfoFile := filepath.Join(path, "peers.info")
		if _, err := os.Stat(peersInfoFile); os.IsNotExist(err) {
			if err := ioutil.WriteFile(peersInfoFile, []byte(peersInfoContent), 0755); err != nil {
				return fmt.Errorf("failed to write peers.info file: %v", err)
			}

			// Blow away the peers.json file if present, since the
			// peers.info sentinel wasn't there.
			if _, err := os.Stat(peersFile); err == nil {
				if err := os.Remove(peersFile); err != nil {
					return fmt.Errorf("failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
				}
				s.logger.Printf("[INFO] nomad: deleted peers.json file (see peers.info for details)")
			}
		} else if _, err := os.Stat(peersFile); err == nil {
			s.logger.Printf("[INFO] nomad: found peers.json file, recovering Raft configuration...")
			configuration, err := raft.ReadPeersJSON(peersFile)
			if err != nil {
				return fmt.Errorf("recovery failed to parse peers.json: %v", err)
			}
			tmpFsm, err := NewFSM(s.evalBroker, s.periodicDispatcher, s.blockedEvals, s.config.LogOutput)
			if err != nil {
				return fmt.Errorf("recovery failed to make temp FSM: %v", err)
			}
			if err := raft.RecoverCluster(s.config.RaftConfig, tmpFsm,
				log, stable, snap, trans, configuration); err != nil {
				return fmt.Errorf("recovery failed: %v", err)
			}
			if err := os.Remove(peersFile); err != nil {
				return fmt.Errorf("recovery failed to delete peers.json, please delete manually (see peers.info for details): %v", err)
			}
			s.logger.Printf("[INFO] nomad: deleted peers.json file after successful recovery")
		}
	}

	// If we are in bootstrap or dev mode and the state is clean then we can
	// bootstrap now.
	if s.config.Bootstrap || s.config.DevMode {
		hasState, err := raft.HasExistingState(log, stable, snap)
		if err != nil {
			return err
		}
		if !hasState {
			// TODO (alexdadgar) - This will need to be updated when
			// we add support for node IDs.
			configuration := raft.Configuration{
				Servers: []raft.Server{
					raft.Server{
						ID:      raft.ServerID(trans.LocalAddr()),
						Address: trans.LocalAddr(),
					},
				},
			}
			if err := raft.BootstrapCluster(s.config.RaftConfig,
				log, stable, snap, trans, configuration); err != nil {
				return err
			}
		}
	}

	// Setup the leader channel
	leaderCh := make(chan bool, 1)
	s.config.RaftConfig.NotifyCh = leaderCh
	s.leaderCh = leaderCh

	// Setup the Raft store
	s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable, snap, trans)
	if err != nil {
		return err
	}
	return nil
}

// setupSerf is used to setup and initialize a Serf
func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) {
	conf.Init()
	conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region)
	conf.Tags["role"] = "nomad"
	conf.Tags["region"] = s.config.Region
	conf.Tags["dc"] = s.config.Datacenter
	conf.Tags["vsn"] = fmt.Sprintf("%d", structs.ApiMajorVersion)
	conf.Tags["mvn"] = fmt.Sprintf("%d", structs.ApiMinorVersion)
	conf.Tags["build"] = s.config.Build
	conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port)
	if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) {
		conf.Tags["bootstrap"] = "1"
	}
	bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
	if bootstrapExpect != 0 {
		conf.Tags["expect"] = fmt.Sprintf("%d", bootstrapExpect)
	}
	conf.MemberlistConfig.LogOutput = s.config.LogOutput
	conf.LogOutput = s.config.LogOutput
	conf.EventCh = ch
	if !s.config.DevMode {
		conf.SnapshotPath = filepath.Join(s.config.DataDir, path)
		if err := ensurePath(conf.SnapshotPath, false); err != nil {
			return nil, err
		}
	}
	conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion]
	conf.RejoinAfterLeave = true
	conf.Merge = &serfMergeDelegate{}

	// Until Nomad supports this fully, we disable automatic resolution.
	// When enabled, the Serf gossip may just turn off if we are the minority
	// node which is rather unexpected.
	conf.EnableNameConflictResolution = false
	return serf.Create(conf)
}

// setupWorkers is used to start the scheduling workers
func (s *Server) setupWorkers() error {
	// Check if all the schedulers are disabled
	if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 {
		s.logger.Printf("[WARN] nomad: no enabled schedulers")
		return nil
	}

	// Start the workers
	for i := 0; i < s.config.NumSchedulers; i++ {
		if w, err := NewWorker(s); err != nil {
			return err
		} else {
			s.workers = append(s.workers, w)
		}
	}
	s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v",
		s.config.NumSchedulers, s.config.EnabledSchedulers)
	return nil
}
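
// Example (illustrative sketch, not part of the original file): the Config
// fields consulted by setupWorkers above. The concrete values are assumptions
// for illustration, not defaults taken from this codebase.
//
//	cfg := DefaultConfig()
//	cfg.NumSchedulers = runtime.NumCPU() // assumes the runtime package is imported
//	cfg.EnabledSchedulers = []string{
//		structs.JobTypeService, structs.JobTypeBatch, structs.JobTypeSystem,
//	}
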
// numPeers is used to check on the number of known peers, including the local
// node.
func (s *Server) numPeers() (int, error) {
	future := s.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		return 0, err
	}
	configuration := future.Configuration()
	return len(configuration.Servers), nil
}

// IsLeader checks if this server is the cluster leader
func (s *Server) IsLeader() bool {
	return s.raft.State() == raft.Leader
}

// Join is used to have Nomad join the gossip ring
// The target address should be another node listening on the
// Serf address
func (s *Server) Join(addrs []string) (int, error) {
	return s.serf.Join(addrs, true)
}

// LocalMember is used to return the local node
func (c *Server) LocalMember() serf.Member {
	return c.serf.LocalMember()
}

// Members is used to return the members of the serf cluster
func (s *Server) Members() []serf.Member {
	return s.serf.Members()
}

// RemoveFailedNode is used to remove a failed node from the cluster
func (s *Server) RemoveFailedNode(node string) error {
	return s.serf.RemoveFailedNode(node)
}

// KeyManager returns the Serf keyring manager
func (s *Server) KeyManager() *serf.KeyManager {
	return s.serf.KeyManager()
}

// Encrypted determines if gossip is encrypted
func (s *Server) Encrypted() bool {
	return s.serf.EncryptionEnabled()
}

// State returns the underlying state store. This should *not*
// be used to modify state directly.
func (s *Server) State() *state.StateStore {
	return s.fsm.State()
}

// Regions returns the known regions in the cluster.
func (s *Server) Regions() []string {
	s.peerLock.RLock()
	defer s.peerLock.RUnlock()

	regions := make([]string, 0, len(s.peers))
	for region, _ := range s.peers {
		regions = append(regions, region)
	}
	sort.Strings(regions)
	return regions
}

// inmemCodec is used to do an RPC call without going over a network
type inmemCodec struct {
	method string
	args   interface{}
	reply  interface{}
	err    error
}

func (i *inmemCodec) ReadRequestHeader(req *rpc.Request) error {
	req.ServiceMethod = i.method
	return nil
}

func (i *inmemCodec) ReadRequestBody(args interface{}) error {
	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.args)))
	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(args)))
	dst.Set(sourceValue)
	return nil
}

func (i *inmemCodec) WriteResponse(resp *rpc.Response, reply interface{}) error {
	if resp.Error != "" {
		i.err = errors.New(resp.Error)
		return nil
	}
	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(reply)))
	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.reply)))
	dst.Set(sourceValue)
	return nil
}

func (i *inmemCodec) Close() error {
	return nil
}

// RPC is used to make a local RPC call
func (s *Server) RPC(method string, args interface{}, reply interface{}) error {
	codec := &inmemCodec{
		method: method,
		args:   args,
		reply:  reply,
	}
	if err := s.rpcServer.ServeRequest(codec); err != nil {
		return err
	}
	return codec.err
}
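
// Example (illustrative sketch, not part of the original file): a local,
// in-process call routed through the inmemCodec above. The endpoint name and
// argument/reply types are assumptions for illustration; the endpoints
// registered in setupRPC define the real method set.
//
//	var leader string
//	args := &structs.GenericRequest{
//		QueryOptions: structs.QueryOptions{Region: s.Region()},
//	}
//	if err := s.RPC("Status.Leader", args, &leader); err != nil {
//		s.logger.Printf("[ERR] nomad: Status.Leader RPC failed: %v", err)
//	}
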
// Stats is used to return statistics for debugging and insight
// for various sub-systems
func (s *Server) Stats() map[string]map[string]string {
	toString := func(v uint64) string {
		return strconv.FormatUint(v, 10)
	}
	stats := map[string]map[string]string{
		"nomad": map[string]string{
			"server":        "true",
			"leader":        fmt.Sprintf("%v", s.IsLeader()),
			"leader_addr":   string(s.raft.Leader()),
			"bootstrap":     fmt.Sprintf("%v", s.config.Bootstrap),
			"known_regions": toString(uint64(len(s.peers))),
		},
		"raft":    s.raft.Stats(),
		"serf":    s.serf.Stats(),
		"runtime": RuntimeStats(),
	}

	return stats
}

// Region returns the region of the server
func (s *Server) Region() string {
	return s.config.Region
}

// Datacenter returns the data center of the server
func (s *Server) Datacenter() string {
	return s.config.Datacenter
}

// GetConfig returns the config of the server for testing purposes only
func (s *Server) GetConfig() *Config {
	return s.config
}

// peersInfoContent is used to help operators understand what happened to the
// peers.json file. This is written to a file called peers.info in the same
// location.
const peersInfoContent = `
As of Nomad 0.5.5, the peers.json file is only used for recovery
after an outage. It should be formatted as a JSON array containing the address
and port of each Nomad server in the cluster, like this:

["10.1.0.1:4647","10.1.0.2:4647","10.1.0.3:4647"]

Under normal operation, the peers.json file will not be present.

When Nomad starts for the first time, it will create this peers.info file and
delete any existing peers.json file so that recovery doesn't occur on the first
startup.

Once this peers.info file is present, any peers.json file will be ingested at
startup, and will set the Raft peer configuration manually to recover from an
outage. It's crucial that all servers in the cluster are shut down before
creating the peers.json file, and that all servers receive the same
configuration. Once the peers.json file is successfully ingested and applied, it
will be deleted.

Please see https://www.nomadproject.io/guides/outage.html for more information.
`
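
// Example (illustrative sketch, not part of the original file): the outage
// recovery workflow implied by setupRaft and peersInfoContent above. The data
// directory path is an assumption; the raft/ subdirectory comes from the
// raftState constant.
//
//	# 1. Stop every Nomad server in the region.
//	# 2. On each server, write the same peers.json into the raft directory:
//	#      echo '["10.1.0.1:4647","10.1.0.2:4647","10.1.0.3:4647"]' \
//	#        > /var/nomad/data/raft/peers.json
//	# 3. Restart the servers; the file is ingested once and then deleted.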