// Source: github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/server.go

package nomad

import (
	"crypto/tls"
	"errors"
	"fmt"
	"log"
	"net"
	"net/rpc"
	"os"
	"path/filepath"
	"reflect"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/hashicorp/consul/tlsutil"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/raft-boltdb"
	"github.com/hashicorp/serf/serf"
)

const (
	raftState         = "raft/"
	serfSnapshot      = "serf/snapshot"
	snapshotsRetained = 2

	// serverRPCCache controls how long we keep an idle connection open to a server
	serverRPCCache = 2 * time.Minute

	// serverMaxStreams controls how many idle streams we keep open to a server
	serverMaxStreams = 64

	// raftLogCacheSize is the maximum number of logs to cache in-memory.
	// This is used to reduce disk I/O for the recently committed entries.
	raftLogCacheSize = 512

	// raftRemoveGracePeriod is how long we wait to allow a RemovePeer
	// to replicate to gracefully leave the cluster.
	raftRemoveGracePeriod = 5 * time.Second

	// apiMajorVersion is returned as part of the Status.Version request.
	// It should be incremented anytime the APIs are changed in a way
	// that would break clients, to allow for sane client versioning.
	apiMajorVersion = 1

	// apiMinorVersion is returned as part of the Status.Version request.
	// It should be incremented anytime the APIs are changed to allow
	// for sane client versioning. Minor changes should be compatible
	// within the major version.
	apiMinorVersion = 1
)
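// Hedged illustration (not part of the original file): a client receiving
// these constants from a Status.Version response might gate compatibility
// on the major version alone; the response field name used here is an
// assumption for illustration only.
//
//	if resp.APIMajorVersion != apiMajorVersion {
//		return fmt.Errorf("incompatible API major version: %d", resp.APIMajorVersion)
//	}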
// Server is a Nomad server which manages the job queues,
// schedulers, and notification bus for agents.
type Server struct {
	config *Config
	logger *log.Logger

	// Connection pool to other Nomad servers
	connPool *ConnPool

	// Endpoints holds our RPC endpoints
	endpoints endpoints

	// The raft instance is used among Nomad nodes within the
	// region to protect operations that require strong consistency
	leaderCh      <-chan bool
	raft          *raft.Raft
	raftLayer     *RaftLayer
	raftPeers     raft.PeerStore
	raftStore     *raftboltdb.BoltStore
	raftInmem     *raft.InmemStore
	raftTransport *raft.NetworkTransport

	// fsm is the state machine used with Raft
	fsm *nomadFSM

	// rpcListener is used to listen for incoming connections
	rpcListener  net.Listener
	rpcServer    *rpc.Server
	rpcAdvertise net.Addr

	// rpcTLS is the TLS config for incoming TLS requests
	rpcTLS *tls.Config

	// peers is used to track the known Nomad servers. This is
	// used for region forwarding and clustering.
	peers      map[string][]*serverParts
	localPeers map[string]*serverParts
	peerLock   sync.RWMutex

	// serf is the Serf cluster containing only Nomad
	// servers. This is used for multi-region federation
	// and automatic clustering within regions.
	serf *serf.Serf

	// reconcileCh is used to pass events from the serf handler
	// into the leader manager. Mostly used to handle when servers
	// join/leave from the region.
	reconcileCh chan serf.Member

	// eventCh is used to receive events from the serf cluster
	eventCh chan serf.Event

	// evalBroker is used to manage the in-progress evaluations
	// that are waiting to be brokered to a sub-scheduler
	evalBroker *EvalBroker

	// planQueue is used to manage the submitted allocation
	// plans that are waiting to be assessed by the leader
	planQueue *PlanQueue

	// periodicDispatcher is used to track and create evaluations for periodic jobs.
	periodicDispatcher *PeriodicDispatch

	// heartbeatTimers track the expiration time of each heartbeat that has
	// a TTL. On expiration, the node status is updated to be 'down'.
	heartbeatTimers     map[string]*time.Timer
	heartbeatTimersLock sync.Mutex

	// workers are used for processing scheduling work
	workers []*Worker

	left         bool
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex
}

// endpoints holds the RPC endpoints
type endpoints struct {
	Status   *Status
	Node     *Node
	Job      *Job
	Eval     *Eval
	Plan     *Plan
	Alloc    *Alloc
	Region   *Region
	Periodic *Periodic
}
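// exampleServerLifecycle is a hedged, illustrative sketch (not part of the
// original file) of the construction/teardown flow implemented by NewServer
// and Shutdown below. It assumes a DefaultConfig helper exists elsewhere in
// the package; DevMode keeps raft in memory so no DataDir is required.
func exampleServerLifecycle() error {
	config := DefaultConfig() // assumption: defined in this package's config.go
	config.DevMode = true     // in-memory raft store, discard snapshots

	s, err := NewServer(config)
	if err != nil {
		return err
	}
	// Shutdown tears down serf, raft, the RPC listener, and the conn pool.
	defer s.Shutdown()

	return nil
}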
// NewServer is used to construct a new Nomad server from the
// configuration, potentially returning an error
func NewServer(config *Config) (*Server, error) {
	// Check the protocol version
	if err := config.CheckVersion(); err != nil {
		return nil, err
	}

	// Ensure we have a log output
	if config.LogOutput == nil {
		config.LogOutput = os.Stderr
	}

	// Create a logger
	logger := log.New(config.LogOutput, "", log.LstdFlags)

	// Create an eval broker
	evalBroker, err := NewEvalBroker(config.EvalNackTimeout, config.EvalDeliveryLimit)
	if err != nil {
		return nil, err
	}

	// Create a plan queue
	planQueue, err := NewPlanQueue()
	if err != nil {
		return nil, err
	}

	// Create the server
	s := &Server{
		config:      config,
		connPool:    NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, nil),
		logger:      logger,
		rpcServer:   rpc.NewServer(),
		peers:       make(map[string][]*serverParts),
		localPeers:  make(map[string]*serverParts),
		reconcileCh: make(chan serf.Member, 32),
		eventCh:     make(chan serf.Event, 256),
		evalBroker:  evalBroker,
		planQueue:   planQueue,
		shutdownCh:  make(chan struct{}),
	}

	// Create the periodic dispatcher for launching periodic jobs.
	s.periodicDispatcher = NewPeriodicDispatch(s.logger, s)

	// Initialize the RPC layer
	// TODO: TLS...
	if err := s.setupRPC(nil); err != nil {
		s.Shutdown()
		logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err)
		return nil, fmt.Errorf("Failed to start RPC layer: %v", err)
	}

	// Initialize the Raft server
	if err := s.setupRaft(); err != nil {
		s.Shutdown()
		logger.Printf("[ERR] nomad: failed to start Raft: %s", err)
		return nil, fmt.Errorf("Failed to start Raft: %v", err)
	}

	// Initialize the WAN Serf
	s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot)
	if err != nil {
		s.Shutdown()
		logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err)
		return nil, fmt.Errorf("Failed to start serf: %v", err)
	}

	// Initialize the scheduling workers
	if err := s.setupWorkers(); err != nil {
		s.Shutdown()
		logger.Printf("[ERR] nomad: failed to start workers: %s", err)
		return nil, fmt.Errorf("Failed to start workers: %v", err)
	}

	// Monitor leadership changes
	go s.monitorLeadership()

	// Start ingesting events for Serf
	go s.serfEventHandler()

	// Start the RPC listeners
	go s.listen()

	// Emit metrics for the eval broker
	go evalBroker.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the plan queue
	go planQueue.EmitStats(time.Second, s.shutdownCh)

	// Emit metrics for the heartbeats
	go s.heartbeatStats()

	// Done
	return s, nil
}

// Shutdown is used to shutdown the server
func (s *Server) Shutdown() error {
	s.logger.Printf("[INFO] nomad: shutting down server")
	s.shutdownLock.Lock()
	defer s.shutdownLock.Unlock()

	if s.shutdown {
		return nil
	}

	s.shutdown = true
	close(s.shutdownCh)

	if s.serf != nil {
		s.serf.Shutdown()
	}

	if s.raft != nil {
		s.raftTransport.Close()
		s.raftLayer.Close()
		future := s.raft.Shutdown()
		if err := future.Error(); err != nil {
			s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err)
		}
		if s.raftStore != nil {
			s.raftStore.Close()
		}
	}

	// Shutdown the RPC listener
	if s.rpcListener != nil {
		s.rpcListener.Close()
	}

	// Close the connection pool
	s.connPool.Shutdown()

	// Close the fsm
	if s.fsm != nil {
		s.fsm.Close()
	}
	return nil
}

// IsShutdown checks if the server is shutdown
func (s *Server) IsShutdown() bool {
	select {
	case <-s.shutdownCh:
		return true
	default:
		return false
	}
}
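// exampleGracefulStop is a hedged, illustrative sketch (not part of the
// original file) of the intended stop sequence: Leave first, so the raft
// peer set and gossip pool shrink safely, then Shutdown to release
// resources.
func exampleGracefulStop(s *Server) error {
	if err := s.Leave(); err != nil {
		return err
	}
	return s.Shutdown()
}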
// Leave is used to prepare for a graceful shutdown of the server
func (s *Server) Leave() error {
	s.logger.Printf("[INFO] nomad: server starting leave")
	s.left = true

	// Check the number of known peers
	numPeers, err := s.numOtherPeers()
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err)
		return err
	}

	// If we are the current leader, and we have any other peers (cluster has multiple
	// servers), we should do a RemovePeer to safely reduce the quorum size. If we are
	// not the leader, then we should issue our leave intention and wait to be removed
	// for some sane period of time.
	isLeader := s.IsLeader()
	if isLeader && numPeers > 0 {
		future := s.raft.RemovePeer(s.raftTransport.LocalAddr())
		if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
			s.logger.Printf("[ERR] nomad: failed to remove ourselves as a raft peer: %v", err)
		}
	}

	// Leave the gossip pool
	if s.serf != nil {
		if err := s.serf.Leave(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err)
		}
	}

	// If we were not the leader, wait to be safely removed from the cluster.
	// We must wait to allow the raft replication to take place; otherwise
	// an immediate shutdown could cause a loss of quorum.
	if !isLeader {
		limit := time.Now().Add(raftRemoveGracePeriod)
		for numPeers > 0 && time.Now().Before(limit) {
			// Update the number of peers
			numPeers, err = s.numOtherPeers()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err)
				break
			}

			// Avoid the sleep if we are done
			if numPeers == 0 {
				break
			}

			// Sleep a while and check again
			time.Sleep(50 * time.Millisecond)
		}
		if numPeers != 0 {
			s.logger.Printf("[WARN] nomad: failed to leave raft peer set gracefully, timeout")
		}
	}
	return nil
}
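// For context on the endpoint registration below: net/rpc only publishes
// exported methods of the form
//
//	func (t *T) MethodName(args T1, reply *T2) error
//
// so s.rpcServer.Register(s.endpoints.Status) exposes methods addressable
// as "Status.<MethodName>" through the RPC and inmemCodec paths. Any
// concrete method name used in the sketches in this file is a hypothetical
// placeholder, not a confirmed endpoint.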
// setupRPC is used to set up the RPC listener
func (s *Server) setupRPC(tlsWrap tlsutil.DCWrapper) error {
	// Create endpoints
	s.endpoints.Status = &Status{s}
	s.endpoints.Node = &Node{s}
	s.endpoints.Job = &Job{s}
	s.endpoints.Eval = &Eval{s}
	s.endpoints.Plan = &Plan{s}
	s.endpoints.Alloc = &Alloc{s}
	s.endpoints.Region = &Region{s}
	s.endpoints.Periodic = &Periodic{s}

	// Register the handlers
	s.rpcServer.Register(s.endpoints.Status)
	s.rpcServer.Register(s.endpoints.Node)
	s.rpcServer.Register(s.endpoints.Job)
	s.rpcServer.Register(s.endpoints.Eval)
	s.rpcServer.Register(s.endpoints.Plan)
	s.rpcServer.Register(s.endpoints.Alloc)
	s.rpcServer.Register(s.endpoints.Region)
	s.rpcServer.Register(s.endpoints.Periodic)

	list, err := net.ListenTCP("tcp", s.config.RPCAddr)
	if err != nil {
		return err
	}
	s.rpcListener = list

	if s.config.RPCAdvertise != nil {
		s.rpcAdvertise = s.config.RPCAdvertise
	} else {
		s.rpcAdvertise = s.rpcListener.Addr()
	}

	// Verify that we have a usable advertise address
	addr, ok := s.rpcAdvertise.(*net.TCPAddr)
	if !ok {
		list.Close()
		return fmt.Errorf("RPC advertise address is not a TCP address: %v", s.rpcAdvertise)
	}
	if addr.IP.IsUnspecified() {
		list.Close()
		return fmt.Errorf("RPC advertise address is not advertisable: %v", addr)
	}

	// Provide a DC specific wrapper. Raft replication is only
	// ever done in the same datacenter, so we can provide it as a constant.
	// wrapper := tlsutil.SpecificDC(s.config.Datacenter, tlsWrap)
	// TODO: TLS...
	s.raftLayer = NewRaftLayer(s.rpcAdvertise, nil)
	return nil
}

// setupRaft is used to set up and initialize Raft
func (s *Server) setupRaft() error {
	// If we are in bootstrap mode, enable a single node cluster
	if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) {
		s.config.RaftConfig.EnableSingleNode = true
	}

	// Create the FSM
	var err error
	s.fsm, err = NewFSM(s.evalBroker, s.periodicDispatcher, s.config.LogOutput)
	if err != nil {
		return err
	}

	// Create a transport layer
	trans := raft.NewNetworkTransport(s.raftLayer, 3, s.config.RaftTimeout,
		s.config.LogOutput)
	s.raftTransport = trans

	// Create the backend raft store for logs and stable storage.
	// (logStore is named to avoid shadowing the imported log package.)
	var logStore raft.LogStore
	var stable raft.StableStore
	var snap raft.SnapshotStore
	var peers raft.PeerStore
	if s.config.DevMode {
		store := raft.NewInmemStore()
		s.raftInmem = store
		stable = store
		logStore = store
		snap = raft.NewDiscardSnapshotStore()
		peers = &raft.StaticPeers{}
		s.raftPeers = peers
	} else {
		// Create the base raft path
		path := filepath.Join(s.config.DataDir, raftState)
		if err := ensurePath(path, true); err != nil {
			return err
		}

		// Create the BoltDB backend
		store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
		if err != nil {
			return err
		}
		s.raftStore = store
		stable = store

		// Wrap the store in a LogCache to improve performance
		cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
		if err != nil {
			store.Close()
			return err
		}
		logStore = cacheStore

		// Create the snapshot store
		snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput)
		if err != nil {
			if s.raftStore != nil {
				s.raftStore.Close()
			}
			return err
		}
		snap = snapshots

		// Setup the peer store
		s.raftPeers = raft.NewJSONPeers(path, trans)
		peers = s.raftPeers
	}

	// Ensure the local host is always included if we are in bootstrap mode
	if s.config.RaftConfig.EnableSingleNode {
		p, err := peers.Peers()
		if err != nil {
			if s.raftStore != nil {
				s.raftStore.Close()
			}
			return err
		}
		if !raft.PeerContained(p, trans.LocalAddr()) {
			peers.SetPeers(raft.AddUniquePeer(p, trans.LocalAddr()))
		}
	}

	// Make sure we set the LogOutput
	s.config.RaftConfig.LogOutput = s.config.LogOutput

	// Setup the leader channel
	leaderCh := make(chan bool, 1)
	s.config.RaftConfig.NotifyCh = leaderCh
	s.leaderCh = leaderCh

	// Setup the Raft store
	s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, logStore, stable,
		snap, peers, trans)
	if err != nil {
		if s.raftStore != nil {
			s.raftStore.Close()
		}
		trans.Close()
		return err
	}
	return nil
}
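// For context on the peer store above: in non-dev mode, raft.NewJSONPeers
// persists the peer set as a JSON array of addresses in a peers.json file
// under the raft path, along the lines of (addresses are illustrative):
//
//	["10.0.0.1:4647", "10.0.0.2:4647"]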
// setupSerf is used to set up and initialize a Serf cluster
func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) {
	conf.Init()
	conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region)
	conf.Tags["role"] = "nomad"
	conf.Tags["region"] = s.config.Region
	conf.Tags["dc"] = s.config.Datacenter
	conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion)
	conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin)
	conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax)
	conf.Tags["build"] = s.config.Build
	conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port)
	if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) {
		conf.Tags["bootstrap"] = "1"
	}
	if s.config.BootstrapExpect != 0 {
		conf.Tags["expect"] = fmt.Sprintf("%d", s.config.BootstrapExpect)
	}
	conf.MemberlistConfig.LogOutput = s.config.LogOutput
	conf.LogOutput = s.config.LogOutput
	conf.EventCh = ch
	if !s.config.DevMode {
		conf.SnapshotPath = filepath.Join(s.config.DataDir, path)
		if err := ensurePath(conf.SnapshotPath, false); err != nil {
			return nil, err
		}
	}
	conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion]
	conf.RejoinAfterLeave = true
	conf.Merge = &serfMergeDelegate{}

	// Until Nomad supports this fully, we disable automatic resolution.
	// When enabled, the Serf gossip may just turn off if we are the minority
	// node, which is rather unexpected.
	conf.EnableNameConflictResolution = false
	return serf.Create(conf)
}

// setupWorkers is used to start the scheduling workers
func (s *Server) setupWorkers() error {
	// Check if all the schedulers are disabled
	if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 {
		s.logger.Printf("[WARN] nomad: no enabled schedulers")
		return nil
	}

	// Start the workers
	for i := 0; i < s.config.NumSchedulers; i++ {
		w, err := NewWorker(s)
		if err != nil {
			return err
		}
		s.workers = append(s.workers, w)
	}
	s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v",
		s.config.NumSchedulers, s.config.EnabledSchedulers)
	return nil
}

// numOtherPeers is used to check on the number of known peers,
// excluding the local node
func (s *Server) numOtherPeers() (int, error) {
	peers, err := s.raftPeers.Peers()
	if err != nil {
		return 0, err
	}
	otherPeers := raft.ExcludePeer(peers, s.raftTransport.LocalAddr())
	return len(otherPeers), nil
}

// IsLeader checks if this server is the cluster leader
func (s *Server) IsLeader() bool {
	return s.raft.State() == raft.Leader
}

// Join is used to have Nomad join the gossip ring.
// The target address should be another node listening on the
// Serf address.
func (s *Server) Join(addrs []string) (int, error) {
	return s.serf.Join(addrs, true)
}

// LocalMember is used to return the local node
func (s *Server) LocalMember() serf.Member {
	return s.serf.LocalMember()
}

// Members is used to return the members of the serf cluster
func (s *Server) Members() []serf.Member {
	return s.serf.Members()
}

// RemoveFailedNode is used to remove a failed node from the cluster
func (s *Server) RemoveFailedNode(node string) error {
	return s.serf.RemoveFailedNode(node)
}

// KeyManager returns the Serf keyring manager
func (s *Server) KeyManager() *serf.KeyManager {
	return s.serf.KeyManager()
}

// Encrypted determines if gossip is encrypted
func (s *Server) Encrypted() bool {
	return s.serf.EncryptionEnabled()
}

// State returns the underlying state store. This should *not*
// be used to modify state directly.
func (s *Server) State() *state.StateStore {
	return s.fsm.State()
}
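// exampleJoin is a hedged, illustrative sketch (not part of the original
// file) of bootstrapping gossip membership via Join above; the Serf address
// is a made-up placeholder.
func exampleJoin(s *Server) error {
	n, err := s.Join([]string{"10.0.0.10:4648"})
	if err != nil {
		return err
	}
	s.logger.Printf("[INFO] nomad: joined %d existing peer(s)", n)
	return nil
}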
// Regions returns the known regions in the cluster.
func (s *Server) Regions() []string {
	s.peerLock.RLock()
	defer s.peerLock.RUnlock()

	regions := make([]string, 0, len(s.peers))
	for region := range s.peers {
		regions = append(regions, region)
	}
	sort.Strings(regions)
	return regions
}

// inmemCodec is used to do an RPC call without going over a network
type inmemCodec struct {
	method string
	args   interface{}
	reply  interface{}
	err    error
}

func (i *inmemCodec) ReadRequestHeader(req *rpc.Request) error {
	req.ServiceMethod = i.method
	return nil
}

func (i *inmemCodec) ReadRequestBody(args interface{}) error {
	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.args)))
	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(args)))
	dst.Set(sourceValue)
	return nil
}

func (i *inmemCodec) WriteResponse(resp *rpc.Response, reply interface{}) error {
	if resp.Error != "" {
		i.err = errors.New(resp.Error)
		return nil
	}
	sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(reply)))
	dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.reply)))
	dst.Set(sourceValue)
	return nil
}

func (i *inmemCodec) Close() error {
	return nil
}

// RPC is used to make a local RPC call
func (s *Server) RPC(method string, args interface{}, reply interface{}) error {
	codec := &inmemCodec{
		method: method,
		args:   args,
		reply:  reply,
	}
	if err := s.rpcServer.ServeRequest(codec); err != nil {
		return err
	}
	return codec.err
}

// Stats is used to return statistics for debugging and insight
// into various sub-systems
func (s *Server) Stats() map[string]map[string]string {
	toString := func(v uint64) string {
		return strconv.FormatUint(v, 10)
	}
	stats := map[string]map[string]string{
		"nomad": {
			"server":        "true",
			"leader":        fmt.Sprintf("%v", s.IsLeader()),
			"bootstrap":     fmt.Sprintf("%v", s.config.Bootstrap),
			"known_regions": toString(uint64(len(s.peers))),
		},
		"raft":    s.raft.Stats(),
		"serf":    s.serf.Stats(),
		"runtime": RuntimeStats(),
	}
	return stats
}
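// exampleLocalStats is a hedged, illustrative sketch (not part of the
// original file) exercising the in-memory RPC path and Stats above. The
// "Status.Ping" method name and its empty args/reply shapes are assumptions
// about the registered Status endpoint.
func exampleLocalStats(s *Server) error {
	// inmemCodec copies args and reply reflectively, so no network or
	// encoding round-trip is involved.
	var reply struct{}
	if err := s.RPC("Status.Ping", struct{}{}, &reply); err != nil {
		return err
	}

	// Dump the per-subsystem debugging stats.
	for subsystem, kv := range s.Stats() {
		for key, value := range kv {
			fmt.Printf("%s.%s = %s\n", subsystem, key, value)
		}
	}
	return nil
}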