github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/server.go (about) 1 package nomad 2 3 import ( 4 "crypto/tls" 5 "errors" 6 "fmt" 7 "log" 8 "net" 9 "net/rpc" 10 "os" 11 "path/filepath" 12 "reflect" 13 "strconv" 14 "sync" 15 "time" 16 17 "github.com/hashicorp/consul/tlsutil" 18 "github.com/hashicorp/nomad/nomad/state" 19 "github.com/hashicorp/raft" 20 "github.com/hashicorp/raft-boltdb" 21 "github.com/hashicorp/serf/serf" 22 ) 23 24 const ( 25 raftState = "raft/" 26 serfSnapshot = "serf/snapshot" 27 snapshotsRetained = 2 28 29 // serverRPCCache controls how long we keep an idle connection open to a server 30 serverRPCCache = 2 * time.Minute 31 32 // serverMaxStreams controsl how many idle streams we keep open to a server 33 serverMaxStreams = 64 34 35 // raftLogCacheSize is the maximum number of logs to cache in-memory. 36 // This is used to reduce disk I/O for the recently commited entries. 37 raftLogCacheSize = 512 38 39 // raftRemoveGracePeriod is how long we wait to allow a RemovePeer 40 // to replicate to gracefully leave the cluster. 41 raftRemoveGracePeriod = 5 * time.Second 42 43 // apiMajorVersion is returned as part of the Status.Version request. 44 // It should be incremented anytime the APIs are changed in a way that 45 // would break clients for sane client versioning. 46 apiMajorVersion = 1 47 48 // apiMinorVersion is returned as part of the Status.Version request. 49 // It should be incremented anytime the APIs are changed to allow 50 // for sane client versioning. Minor changes should be compatible 51 // within the major version. 52 apiMinorVersion = 1 53 ) 54 55 // Server is Nomad server which manages the job queues, 56 // schedulers, and notification bus for agents. 57 type Server struct { 58 config *Config 59 logger *log.Logger 60 61 // Connection pool to other Nomad servers 62 connPool *ConnPool 63 64 // Endpoints holds our RPC endpoints 65 endpoints endpoints 66 67 // The raft instance is used among Nomad nodes within the 68 // region to protect operations that require strong consistency 69 leaderCh <-chan bool 70 raft *raft.Raft 71 raftLayer *RaftLayer 72 raftPeers raft.PeerStore 73 raftStore *raftboltdb.BoltStore 74 raftInmem *raft.InmemStore 75 raftTransport *raft.NetworkTransport 76 77 // fsm is the state machine used with Raft 78 fsm *nomadFSM 79 80 // rpcListener is used to listen for incoming connections 81 rpcListener net.Listener 82 rpcServer *rpc.Server 83 rpcAdvertise net.Addr 84 85 // rpcTLS is the TLS config for incoming TLS requests 86 rpcTLS *tls.Config 87 88 // peers is used to track the known Nomad servers. This is 89 // used for region forwarding and clustering. 90 peers map[string][]*serverParts 91 localPeers map[string]*serverParts 92 peerLock sync.RWMutex 93 94 // serf is the Serf cluster containing only Nomad 95 // servers. This is used for multi-region federation 96 // and automatic clustering within regions. 97 serf *serf.Serf 98 99 // reconcileCh is used to pass events from the serf handler 100 // into the leader manager. Mostly used to handle when servers 101 // join/leave from the region. 102 reconcileCh chan serf.Member 103 104 // eventCh is used to receive events from the serf cluster 105 eventCh chan serf.Event 106 107 // evalBroker is used to manage the in-progress evaluations 108 // that are waiting to be brokered to a sub-scheduler 109 evalBroker *EvalBroker 110 111 // planQueue is used to manage the submitted allocation 112 // plans that are waiting to be assessed by the leader 113 planQueue *PlanQueue 114 115 // heartbeatTimers track the expiration time of each heartbeat that has 116 // a TTL. On expiration, the node status is updated to be 'down'. 117 heartbeatTimers map[string]*time.Timer 118 heartbeatTimersLock sync.Mutex 119 120 // Worker used for processing 121 workers []*Worker 122 123 left bool 124 shutdown bool 125 shutdownCh chan struct{} 126 shutdownLock sync.Mutex 127 } 128 129 // Holds the RPC endpoints 130 type endpoints struct { 131 Status *Status 132 Node *Node 133 Job *Job 134 Eval *Eval 135 Plan *Plan 136 Alloc *Alloc 137 } 138 139 // NewServer is used to construct a new Nomad server from the 140 // configuration, potentially returning an error 141 func NewServer(config *Config) (*Server, error) { 142 // Check the protocol version 143 if err := config.CheckVersion(); err != nil { 144 return nil, err 145 } 146 147 // Ensure we have a log output 148 if config.LogOutput == nil { 149 config.LogOutput = os.Stderr 150 } 151 152 // Create a logger 153 logger := log.New(config.LogOutput, "", log.LstdFlags) 154 155 // Create an eval broker 156 evalBroker, err := NewEvalBroker(config.EvalNackTimeout, config.EvalDeliveryLimit) 157 if err != nil { 158 return nil, err 159 } 160 161 // Create a plan queue 162 planQueue, err := NewPlanQueue() 163 if err != nil { 164 return nil, err 165 } 166 167 // Create the server 168 s := &Server{ 169 config: config, 170 connPool: NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, nil), 171 logger: logger, 172 rpcServer: rpc.NewServer(), 173 peers: make(map[string][]*serverParts), 174 localPeers: make(map[string]*serverParts), 175 reconcileCh: make(chan serf.Member, 32), 176 eventCh: make(chan serf.Event, 256), 177 evalBroker: evalBroker, 178 planQueue: planQueue, 179 shutdownCh: make(chan struct{}), 180 } 181 182 // Initialize the RPC layer 183 // TODO: TLS... 184 if err := s.setupRPC(nil); err != nil { 185 s.Shutdown() 186 logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err) 187 return nil, fmt.Errorf("Failed to start RPC layer: %v", err) 188 } 189 190 // Initialize the Raft server 191 if err := s.setupRaft(); err != nil { 192 s.Shutdown() 193 logger.Printf("[ERR] nomad: failed to start Raft: %s", err) 194 return nil, fmt.Errorf("Failed to start Raft: %v", err) 195 } 196 197 // Initialize the wan Serf 198 s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot) 199 if err != nil { 200 s.Shutdown() 201 logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err) 202 return nil, fmt.Errorf("Failed to start serf: %v", err) 203 } 204 205 // Intialize the scheduling workers 206 if err := s.setupWorkers(); err != nil { 207 s.Shutdown() 208 logger.Printf("[ERR] nomad: failed to start workers: %s", err) 209 return nil, fmt.Errorf("Failed to start workers: %v", err) 210 } 211 212 // Monitor leadership changes 213 go s.monitorLeadership() 214 215 // Start ingesting events for Serf 216 go s.serfEventHandler() 217 218 // Start the RPC listeners 219 go s.listen() 220 221 // Emit metrics for the eval broker 222 go evalBroker.EmitStats(time.Second, s.shutdownCh) 223 224 // Emit metrics for the plan queue 225 go planQueue.EmitStats(time.Second, s.shutdownCh) 226 227 // Emit metrics 228 go s.heartbeatStats() 229 230 // Done 231 return s, nil 232 } 233 234 // Shutdown is used to shutdown the server 235 func (s *Server) Shutdown() error { 236 s.logger.Printf("[INFO] nomad: shutting down server") 237 s.shutdownLock.Lock() 238 defer s.shutdownLock.Unlock() 239 240 if s.shutdown { 241 return nil 242 } 243 244 s.shutdown = true 245 close(s.shutdownCh) 246 247 if s.serf != nil { 248 s.serf.Shutdown() 249 } 250 251 if s.raft != nil { 252 s.raftTransport.Close() 253 s.raftLayer.Close() 254 future := s.raft.Shutdown() 255 if err := future.Error(); err != nil { 256 s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err) 257 } 258 if s.raftStore != nil { 259 s.raftStore.Close() 260 } 261 } 262 263 // Shutdown the RPC listener 264 if s.rpcListener != nil { 265 s.rpcListener.Close() 266 } 267 268 // Close the connection pool 269 s.connPool.Shutdown() 270 271 // Close the fsm 272 if s.fsm != nil { 273 s.fsm.Close() 274 } 275 return nil 276 } 277 278 // IsShutdown checks if the server is shutdown 279 func (s *Server) IsShutdown() bool { 280 select { 281 case <-s.shutdownCh: 282 return true 283 default: 284 return false 285 } 286 } 287 288 // Leave is used to prepare for a graceful shutdown of the server 289 func (s *Server) Leave() error { 290 s.logger.Printf("[INFO] nomad: server starting leave") 291 s.left = true 292 293 // Check the number of known peers 294 numPeers, err := s.numOtherPeers() 295 if err != nil { 296 s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err) 297 return err 298 } 299 300 // If we are the current leader, and we have any other peers (cluster has multiple 301 // servers), we should do a RemovePeer to safely reduce the quorum size. If we are 302 // not the leader, then we should issue our leave intention and wait to be removed 303 // for some sane period of time. 304 isLeader := s.IsLeader() 305 if isLeader && numPeers > 0 { 306 future := s.raft.RemovePeer(s.raftTransport.LocalAddr()) 307 if err := future.Error(); err != nil && err != raft.ErrUnknownPeer { 308 s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err) 309 } 310 } 311 312 // Leave the gossip pool 313 if s.serf != nil { 314 if err := s.serf.Leave(); err != nil { 315 s.logger.Printf("[ERR] nomad: failed to leave Serf cluster: %v", err) 316 } 317 } 318 319 // If we were not leader, wait to be safely removed from the cluster. 320 // We must wait to allow the raft replication to take place, otherwise 321 // an immediate shutdown could cause a loss of quorum. 322 if !isLeader { 323 limit := time.Now().Add(raftRemoveGracePeriod) 324 for numPeers > 0 && time.Now().Before(limit) { 325 // Update the number of peers 326 numPeers, err = s.numOtherPeers() 327 if err != nil { 328 s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err) 329 break 330 } 331 332 // Avoid the sleep if we are done 333 if numPeers == 0 { 334 break 335 } 336 337 // Sleep a while and check again 338 time.Sleep(50 * time.Millisecond) 339 } 340 if numPeers != 0 { 341 s.logger.Printf("[WARN] nomad: failed to leave raft peer set gracefully, timeout") 342 } 343 } 344 return nil 345 } 346 347 // setupRPC is used to setup the RPC listener 348 func (s *Server) setupRPC(tlsWrap tlsutil.DCWrapper) error { 349 // Create endpoints 350 s.endpoints.Status = &Status{s} 351 s.endpoints.Node = &Node{s} 352 s.endpoints.Job = &Job{s} 353 s.endpoints.Eval = &Eval{s} 354 s.endpoints.Plan = &Plan{s} 355 s.endpoints.Alloc = &Alloc{s} 356 357 // Register the handlers 358 s.rpcServer.Register(s.endpoints.Status) 359 s.rpcServer.Register(s.endpoints.Node) 360 s.rpcServer.Register(s.endpoints.Job) 361 s.rpcServer.Register(s.endpoints.Eval) 362 s.rpcServer.Register(s.endpoints.Plan) 363 s.rpcServer.Register(s.endpoints.Alloc) 364 365 list, err := net.ListenTCP("tcp", s.config.RPCAddr) 366 if err != nil { 367 return err 368 } 369 s.rpcListener = list 370 371 if s.config.RPCAdvertise != nil { 372 s.rpcAdvertise = s.config.RPCAdvertise 373 } else { 374 s.rpcAdvertise = s.rpcListener.Addr() 375 } 376 377 // Verify that we have a usable advertise address 378 addr, ok := s.rpcAdvertise.(*net.TCPAddr) 379 if !ok { 380 list.Close() 381 return fmt.Errorf("RPC advertise address is not a TCP Address: %v", addr) 382 } 383 if addr.IP.IsUnspecified() { 384 list.Close() 385 return fmt.Errorf("RPC advertise address is not advertisable: %v", addr) 386 } 387 388 // Provide a DC specific wrapper. Raft replication is only 389 // ever done in the same datacenter, so we can provide it as a constant. 390 // wrapper := tlsutil.SpecificDC(s.config.Datacenter, tlsWrap) 391 // TODO: TLS... 392 s.raftLayer = NewRaftLayer(s.rpcAdvertise, nil) 393 return nil 394 } 395 396 // setupRaft is used to setup and initialize Raft 397 func (s *Server) setupRaft() error { 398 // If we are in bootstrap mode, enable a single node cluster 399 if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) { 400 s.config.RaftConfig.EnableSingleNode = true 401 } 402 403 // Create the FSM 404 var err error 405 s.fsm, err = NewFSM(s.evalBroker, s.config.LogOutput) 406 if err != nil { 407 return err 408 } 409 410 // Create a transport layer 411 trans := raft.NewNetworkTransport(s.raftLayer, 3, s.config.RaftTimeout, 412 s.config.LogOutput) 413 s.raftTransport = trans 414 415 // Create the backend raft store for logs and stable storage 416 var log raft.LogStore 417 var stable raft.StableStore 418 var snap raft.SnapshotStore 419 var peers raft.PeerStore 420 if s.config.DevMode { 421 store := raft.NewInmemStore() 422 s.raftInmem = store 423 stable = store 424 log = store 425 snap = raft.NewDiscardSnapshotStore() 426 peers = &raft.StaticPeers{} 427 s.raftPeers = peers 428 429 } else { 430 // Create the base raft path 431 path := filepath.Join(s.config.DataDir, raftState) 432 if err := ensurePath(path, true); err != nil { 433 return err 434 } 435 436 // Create the BoltDB backend 437 store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db")) 438 if err != nil { 439 return err 440 } 441 s.raftStore = store 442 stable = store 443 444 // Wrap the store in a LogCache to improve performance 445 cacheStore, err := raft.NewLogCache(raftLogCacheSize, store) 446 if err != nil { 447 store.Close() 448 return err 449 } 450 log = cacheStore 451 452 // Create the snapshot store 453 snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput) 454 if err != nil { 455 if s.raftStore != nil { 456 s.raftStore.Close() 457 } 458 return err 459 } 460 snap = snapshots 461 462 // Setup the peer store 463 s.raftPeers = raft.NewJSONPeers(path, trans) 464 peers = s.raftPeers 465 } 466 467 // Ensure local host is always included if we are in bootstrap mode 468 if s.config.RaftConfig.EnableSingleNode { 469 p, err := peers.Peers() 470 if err != nil { 471 if s.raftStore != nil { 472 s.raftStore.Close() 473 } 474 return err 475 } 476 if !raft.PeerContained(p, trans.LocalAddr()) { 477 peers.SetPeers(raft.AddUniquePeer(p, trans.LocalAddr())) 478 } 479 } 480 481 // Make sure we set the LogOutput 482 s.config.RaftConfig.LogOutput = s.config.LogOutput 483 484 // Setup the leader channel 485 leaderCh := make(chan bool, 1) 486 s.config.RaftConfig.NotifyCh = leaderCh 487 s.leaderCh = leaderCh 488 489 // Setup the Raft store 490 s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable, 491 snap, peers, trans) 492 if err != nil { 493 if s.raftStore != nil { 494 s.raftStore.Close() 495 } 496 trans.Close() 497 return err 498 } 499 return nil 500 } 501 502 // setupSerf is used to setup and initialize a Serf 503 func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) { 504 conf.Init() 505 conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) 506 conf.Tags["role"] = "nomad" 507 conf.Tags["region"] = s.config.Region 508 conf.Tags["dc"] = s.config.Datacenter 509 conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion) 510 conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin) 511 conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax) 512 conf.Tags["build"] = s.config.Build 513 conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port) 514 if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) { 515 conf.Tags["bootstrap"] = "1" 516 } 517 if s.config.BootstrapExpect != 0 { 518 conf.Tags["expect"] = fmt.Sprintf("%d", s.config.BootstrapExpect) 519 } 520 conf.MemberlistConfig.LogOutput = s.config.LogOutput 521 conf.LogOutput = s.config.LogOutput 522 conf.EventCh = ch 523 if !s.config.DevMode { 524 conf.SnapshotPath = filepath.Join(s.config.DataDir, path) 525 if err := ensurePath(conf.SnapshotPath, false); err != nil { 526 return nil, err 527 } 528 } 529 conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion] 530 conf.RejoinAfterLeave = true 531 conf.Merge = &serfMergeDelegate{} 532 533 // Until Nomad supports this fully, we disable automatic resolution. 534 // When enabled, the Serf gossip may just turn off if we are the minority 535 // node which is rather unexpected. 536 conf.EnableNameConflictResolution = false 537 return serf.Create(conf) 538 } 539 540 // setupWorkers is used to start the scheduling workers 541 func (s *Server) setupWorkers() error { 542 // Check if all the schedulers are disabled 543 if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 { 544 s.logger.Printf("[WARN] nomad: no enabled schedulers") 545 return nil 546 } 547 548 // Start the workers 549 for i := 0; i < s.config.NumSchedulers; i++ { 550 if w, err := NewWorker(s); err != nil { 551 return err 552 } else { 553 s.workers = append(s.workers, w) 554 } 555 } 556 s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v", 557 s.config.NumSchedulers, s.config.EnabledSchedulers) 558 return nil 559 } 560 561 // numOtherPeers is used to check on the number of known peers 562 // excluding the local ndoe 563 func (s *Server) numOtherPeers() (int, error) { 564 peers, err := s.raftPeers.Peers() 565 if err != nil { 566 return 0, err 567 } 568 otherPeers := raft.ExcludePeer(peers, s.raftTransport.LocalAddr()) 569 return len(otherPeers), nil 570 } 571 572 // IsLeader checks if this server is the cluster leader 573 func (s *Server) IsLeader() bool { 574 return s.raft.State() == raft.Leader 575 } 576 577 // Join is used to have Nomad join the gossip ring 578 // The target address should be another node listening on the 579 // Serf address 580 func (s *Server) Join(addrs []string) (int, error) { 581 return s.serf.Join(addrs, true) 582 } 583 584 // LocalMember is used to return the local node 585 func (c *Server) LocalMember() serf.Member { 586 return c.serf.LocalMember() 587 } 588 589 // Members is used to return the members of the serf cluster 590 func (s *Server) Members() []serf.Member { 591 return s.serf.Members() 592 } 593 594 // RemoveFailedNode is used to remove a failed node from the cluster 595 func (s *Server) RemoveFailedNode(node string) error { 596 return s.serf.RemoveFailedNode(node) 597 } 598 599 // KeyManager returns the Serf keyring manager 600 func (s *Server) KeyManager() *serf.KeyManager { 601 return s.serf.KeyManager() 602 } 603 604 // Encrypted determines if gossip is encrypted 605 func (s *Server) Encrypted() bool { 606 return s.serf.EncryptionEnabled() 607 } 608 609 // State returns the underlying state store. This should *not* 610 // be used to modify state directly. 611 func (s *Server) State() *state.StateStore { 612 return s.fsm.State() 613 } 614 615 // inmemCodec is used to do an RPC call without going over a network 616 type inmemCodec struct { 617 method string 618 args interface{} 619 reply interface{} 620 err error 621 } 622 623 func (i *inmemCodec) ReadRequestHeader(req *rpc.Request) error { 624 req.ServiceMethod = i.method 625 return nil 626 } 627 628 func (i *inmemCodec) ReadRequestBody(args interface{}) error { 629 sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.args))) 630 dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(args))) 631 dst.Set(sourceValue) 632 return nil 633 } 634 635 func (i *inmemCodec) WriteResponse(resp *rpc.Response, reply interface{}) error { 636 if resp.Error != "" { 637 i.err = errors.New(resp.Error) 638 return nil 639 } 640 sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(reply))) 641 dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.reply))) 642 dst.Set(sourceValue) 643 return nil 644 } 645 646 func (i *inmemCodec) Close() error { 647 return nil 648 } 649 650 // RPC is used to make a local RPC call 651 func (s *Server) RPC(method string, args interface{}, reply interface{}) error { 652 codec := &inmemCodec{ 653 method: method, 654 args: args, 655 reply: reply, 656 } 657 if err := s.rpcServer.ServeRequest(codec); err != nil { 658 return err 659 } 660 return codec.err 661 } 662 663 // Stats is used to return statistics for debugging and insight 664 // for various sub-systems 665 func (s *Server) Stats() map[string]map[string]string { 666 toString := func(v uint64) string { 667 return strconv.FormatUint(v, 10) 668 } 669 stats := map[string]map[string]string{ 670 "nomad": map[string]string{ 671 "server": "true", 672 "leader": fmt.Sprintf("%v", s.IsLeader()), 673 "bootstrap": fmt.Sprintf("%v", s.config.Bootstrap), 674 "known_regions": toString(uint64(len(s.peers))), 675 }, 676 "raft": s.raft.Stats(), 677 "serf": s.serf.Stats(), 678 "runtime": RuntimeStats(), 679 } 680 return stats 681 }