github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/leader.go

package nomad

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
	"golang.org/x/time/rate"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute

	// replicationRateLimit is used to rate limit how often data is replicated
	// between the authoritative region and the local region.
	replicationRateLimit rate.Limit = 10.0

	// barrierWriteTimeout is used to give Raft a chance to process a
	// possible loss of leadership event if we are unable to get a barrier
	// while leader.
	barrierWriteTimeout = 2 * time.Minute
)

var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var weAreLeaderCh chan struct{}
	var leaderLoop sync.WaitGroup
	for {
		select {
		case isLeader := <-s.leaderCh:
			switch {
			case isLeader:
				if weAreLeaderCh != nil {
					s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
					continue
				}

				weAreLeaderCh = make(chan struct{})
				leaderLoop.Add(1)
				go func(ch chan struct{}) {
					defer leaderLoop.Done()
					s.leaderLoop(ch)
				}(weAreLeaderCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")

			default:
				if weAreLeaderCh == nil {
					s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
					continue
				}

				s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
				close(weAreLeaderCh)
				leaderLoop.Wait()
				weAreLeaderCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}

		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities.
func (s *Server) leaderLoop(stopCh chan struct{}) {
	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(barrierWriteTimeout)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)

			// Immediately revoke leadership since we didn't successfully
			// establish leadership.
			if err := s.revokeLeadership(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
			}

			goto WAIT
		}

		establishedLeader = true
		defer func() {
			if err := s.revokeLeadership(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
			}
		}()
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

	// Poll the stop channel to give it priority so we don't waste time
	// trying to perform the other operations if we have been asked to shut
	// down.
	select {
	case <-stopCh:
		return
	default:
	}

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())

	// Generate a leader ACL token. This will allow the leader to issue work
	// that requires a valid ACL token.
	s.setLeaderAcl(uuid.Generate())

	// Disable workers to free half the cores for use in the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Initialize and start the autopilot routine
	s.getOrCreateAutopilotConfig()
	s.autopilot.Start()

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)
	s.blockedEvals.SetTimetable(s.fsm.TimeTable())

	// Enable the deployment watcher, since we are now the leader
	s.deploymentWatcher.SetEnabled(true, s.State())

	// Enable the NodeDrainer
	s.nodeDrainer.SetEnabled(true, s.State())

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Periodically publish job summary metrics
	go s.publishJobSummaryMetrics(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}

	// COMPAT 0.4 - 0.4.1
	// Reconcile the summaries of the registered jobs. We only reconcile
	// summaries if the server is 0.4.1: summaries are not present in 0.4,
	// so after upgrading to 0.4.1 they might be incorrect.
	if err := s.reconcileJobSummaries(); err != nil {
		return fmt.Errorf("unable to reconcile job summaries: %v", err)
	}

	// Start replication of ACLs and Policies if they are enabled,
	// and we are not the authoritative region.
	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
		go s.replicateACLPolicies(stopCh)
		go s.replicateACLTokens(stopCh)
	}

	// Setup any enterprise systems required.
	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
		return err
	}

	return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored any
// time a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if any periodic jobs should have been
// created during the leadership transition and force runs them. The periodic
// dispatcher is maintained only by the leader, so it must be restored anytime a
// leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// We skip adding parameterized jobs because they themselves aren't
		// tracked, only the dispatched children are.
		if job.IsParameterized() {
			continue
		}

		if err := s.periodicDispatcher.Add(job); err != nil {
			s.logger.Printf("[ERR] nomad.periodic: %v", err)
			continue
		}

		// We do not need to force run the job since it isn't active.
		if !job.IsPeriodicActive() {
			continue
		}

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}
		if launch == nil {
			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
				job.ID, job.Namespace)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
		if err != nil {
			s.logger.Printf("[ERR] nomad.periodic: failed to determine next periodic launch for job %s: %v", job.NamespacedID(), err)
			continue
		}

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic dispatch of the core GC jobs while
// we are leader.
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()
	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
	defer deploymentGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true if
	// the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {
		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-deploymentGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   "-",
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		LeaderACL:   s.getLeaderAcl(),
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			updateEval := eval.Copy()
			updateEval.Status = structs.EvalStatusFailed
			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

			// Create a follow-up evaluation that will be used to retry the
			// scheduling for the job after the cluster is hopefully more stable
			// due to the fairly large backoff.
			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{updateEval, followupEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations,
// which should be cancelled.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.JobSummaries(ws)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
				continue
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				summary := raw.(*structs.JobSummary)
				for name, tgSummary := range summary.Summary {
					if !s.config.DisableTaggedMetrics {
						labels := []metrics.Label{
							{
								Name:  "job",
								Value: summary.JobID,
							},
							{
								Name:  "task_group",
								Value: name,
							},
						}

						if strings.Contains(summary.JobID, "/dispatch-") {
							jobInfo := strings.Split(summary.JobID, "/dispatch-")
							labels = append(labels, metrics.Label{
								Name:  "parent_id",
								Value: jobInfo[0],
							}, metrics.Label{
								Name:  "dispatch_id",
								Value: jobInfo[1],
							})
						}

						if strings.Contains(summary.JobID, "/periodic-") {
							jobInfo := strings.Split(summary.JobID, "/periodic-")
							labels = append(labels, metrics.Label{
								Name:  "parent_id",
								Value: jobInfo[0],
							}, metrics.Label{
								Name:  "periodic_id",
								Value: jobInfo[1],
							})
						}

						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
							float32(tgSummary.Queued), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
							float32(tgSummary.Complete), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
							float32(tgSummary.Failed), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
							float32(tgSummary.Running), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
							float32(tgSummary.Starting), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
							float32(tgSummary.Lost), labels)
					}
					if s.config.BackwardsCompatibleMetrics {
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
					}
				}
			}
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())

	// Clear the leader token since we are no longer the leader.
	s.setLeaderAcl("")

	// Disable autopilot
	s.autopilot.Stop()

	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Disable the deployment watcher as it is only useful as a leader.
	s.deploymentWatcher.SetEnabled(false, nil)

	// Disable the node drainer
	s.nodeDrainer.SetEnabled(false, nil)

	// Disable any enterprise systems required.
	if err := s.revokeEnterpriseLeadership(); err != nil {
		return err
	}

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our workers if we paused them previously. This mirrors the 3/4
	// pause in establishLeadership so that no workers are left paused after
	// stepping down.
	if len(s.workers) > 1 {
		for i := 0; i < (3 * len(s.workers) / 4); i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to reconcile a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
	index, err := s.fsm.state.LatestIndex()
	if err != nil {
		return fmt.Errorf("unable to read latest index: %v", err)
	}
	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

	args := &structs.GenericResponse{}
	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
	if _, _, err = s.raftApply(msg, args); err != nil {
		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
	}

	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	members := s.serf.Members()
	if parts.Bootstrap {
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// Processing ourselves could result in trying to remove ourselves to
	// fix up our address, which would make us step down. This is only
	// safe to attempt if there are multiple servers available.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}

	if m.Name == s.config.NodeName {
		if l := len(configFuture.Configuration().Servers); l < 3 {
			s.logger.Printf("[DEBUG] nomad: Skipping self join check for %q since the cluster is too small", m.Name)
			return nil
		}
	}

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries. If the address is the same but the ID changed, remove the
	// old server before adding the new one.
	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		// No-op if the raft version is too low
		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
			return nil
		}

		// If the address or ID matches an existing server, see if we need to remove the old one first
		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
			// Exit with no-op if this is being called on an existing server and both the ID and address match
			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
				return nil
			}
			future := s.raft.RemoveServer(server.ID, 0, 0)
			if server.Address == raft.ServerAddress(addr) {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
				}
				s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address)
			} else {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
				}
				s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID)
			}
		}
	}

	// Attempt to add as a peer
	switch {
	case minRaftProtocol >= 3:
		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	default:
		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	}

	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}

	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}

	// Pick which remove API to use based on how the server was added.
	for _, server := range configFuture.Configuration().Servers {
		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
			s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID)
			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
					server.ID, err)
				return err
			}
			break
		} else if server.Address == raft.ServerAddress(addr) {
			// If not, use the old remove API
			s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address)
			future := s.raft.RemovePeer(raft.ServerAddress(addr))
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
					addr, err)
				return err
			}
			break
		}
	}

	return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
	req := structs.ACLPolicyListRequest{
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of policies
			var resp structs.ACLPolicyListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListPolicies", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

			// Delete policies that should not exist
			if len(delete) > 0 {
				args := &structs.ACLPolicyDeleteRequest{
					Names: delete,
				}
				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated policies
			var fetched []*structs.ACLPolicy
			if len(update) > 0 {
				req := structs.ACLPolicySetRequest{
					Names: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLPolicySetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetPolicies", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, policy := range reply.Policies {
					fetched = append(fetched, policy)
				}
			}

			// Update local policies
			if len(fetched) > 0 {
				args := &structs.ACLPolicyUpsertRequest{
					Policies: fetched,
				}
				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index; the next fetch blocks until
			// there is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
	// Construct a set of the local and remote policies
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local policies
	iter, err := state.ACLPolicies(nil)
	if err != nil {
		panic("failed to iterate local policies")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		policy := raw.(*structs.ACLPolicy)
		local[policy.Name] = policy.Hash
	}

	// Iterate over the remote policies
	for _, rp := range remoteList {
		remote[rp.Name] = struct{}{}

		// Check if the policy is missing locally
		if localHash, ok := local[rp.Name]; !ok {
			update = append(update, rp.Name)

			// Check if the policy is newer remotely and there is a hash mismatch.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.Name)
		}
	}

	// Check if policy should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
	req := structs.ACLTokenListRequest{
		GlobalOnly: true,
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of tokens
			var resp structs.ACLTokenListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListTokens", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

			// Delete tokens that should not exist
			if len(delete) > 0 {
				args := &structs.ACLTokenDeleteRequest{
					AccessorIDs: delete,
				}
				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated tokens.
			var fetched []*structs.ACLToken
			if len(update) > 0 {
				req := structs.ACLTokenSetRequest{
					AccessorIDS: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLTokenSetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetTokens", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, token := range reply.Tokens {
					fetched = append(fetched, token)
				}
			}

			// Update local tokens
			if len(fetched) > 0 {
				args := &structs.ACLTokenUpsertRequest{
					Tokens: fetched,
				}
				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index; the next fetch blocks until
			// there is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
	// Construct a set of the local and remote tokens
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local global tokens
	iter, err := state.ACLTokensByGlobal(nil, true)
	if err != nil {
		panic("failed to iterate local tokens")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		token := raw.(*structs.ACLToken)
		local[token.AccessorID] = token.Hash
	}

	// Iterate over the remote tokens
	for _, rp := range remoteList {
		remote[rp.AccessorID] = struct{}{}

		// Check if the token is missing locally
		if localHash, ok := local[rp.AccessorID]; !ok {
			update = append(update, rp.AccessorID)

			// Check if the token is newer remotely and there is a hash mismatch.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.AccessorID)
		}
	}

	// Check if local token should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
	state := s.fsm.State()
	_, config, err := state.AutopilotConfig()
	if err != nil {
		s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
		return nil
	}
	if config != nil {
		return config
	}

	if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
		s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
		return nil
	}

	config = s.config.AutopilotConfig
	req := structs.AutopilotSetConfigRequest{Config: *config}
	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
		s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
		return nil
	}

	return config
}