github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/leader.go

package nomad

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities.
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Disable workers to free half the cores for use in the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier, which uses 1/2 the cores.
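		// For example, with 8 workers this pauses workers 0-5 and leaves
		// 2 scheduling; with 2 workers it pauses worker 0 only.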
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)

	// Enable the deployment watcher, since we are now the leader
	if err := s.deploymentWatcher.SetEnabled(true); err != nil {
		return err
	}

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)
	s.periodicDispatcher.Start()

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}

	// COMPAT 0.4 - 0.4.1
	// Reconcile the summaries of the registered jobs. We reconcile summaries
	// only when the server is 0.4.1: summaries are not present in 0.4, so
	// after upgrading to 0.4.1 they may be missing or incorrect.
	if err := s.reconcileJobSummaries(); err != nil {
		return fmt.Errorf("unable to reconcile job summaries: %v", err)
	}
	return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored anytime
// a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if a periodic job should have been
// created during the leadership transition and force runs them. The periodic
// dispatcher is maintained only by the leader, so it must be restored anytime a
// leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)
		s.periodicDispatcher.Add(job)

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.ID)
		if err != nil || launch == nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
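		// For example, a job on a 15-minute spec whose last launch was 40
		// minutes before this failover has a nextLaunch in the past and is
		// force run below, while one launched 5 minutes ago is left to the
		// dispatcher.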
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true if
	// the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {
		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			updateEval := eval.Copy()
			updateEval.Status = structs.EvalStatusFailed
			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

			// Create a follow-up evaluation that will be used to retry the
			// scheduling for the job after the cluster is hopefully more stable
			// due to the fairly large backoff.
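			// The wait is the configured baseline delay plus random jitter from
			// the configured delay range, so a burst of failed evals does not
			// all retry at the same instant.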
			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{updateEval, followupEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations and
// cancel them.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Disable the deployment watcher as it is only useful as a leader.
	if err := s.deploymentWatcher.SetEnabled(false); err != nil {
		return err
	}

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our workers if we paused previously; this must unpause the same
	// 3/4 of the workers that establishLeadership paused.
	if numWorkers := len(s.workers); numWorkers > 1 {
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourselves
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system.
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
	index, err := s.fsm.state.LatestIndex()
	if err != nil {
		return fmt.Errorf("unable to read latest index: %v", err)
	}
	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

	args := &structs.GenericResponse{}
	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
	if _, _, err = s.raftApply(msg, args); err != nil {
		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
	}

	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Do not join ourselves
	if m.Name == s.config.NodeName {
		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
		return nil
	}

	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			return nil
		}
	}

	// Attempt to add as a peer
	addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
	if err := addFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	}
	s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			goto REMOVE
		}
	}
	return nil

REMOVE:
	// Attempt to remove as a peer.
	future := s.raft.RemovePeer(raft.ServerAddress(addr))
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	}
	return nil
}