github.com/manicqin/nomad@v0.9.5/nomad/leader.go

package nomad

import (
	"bytes"
	"context"
	"fmt"
	"math/rand"
	"net"
	"strings"
	"sync"
	"time"

	"golang.org/x/time/rate"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	version "github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute

	// replicationRateLimit is used to rate limit how often data is replicated
	// between the authoritative region and the local region.
	replicationRateLimit rate.Limit = 10.0

	// barrierWriteTimeout is used to give Raft a chance to process a
	// possible loss of leadership event if we are unable to get a barrier
	// while leader.
	barrierWriteTimeout = 2 * time.Minute
)

var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))

var minSchedulerConfigVersion = version.Must(version.NewVersion("0.9.0"))

// Default configuration for the scheduler, with preemption enabled only for
// system jobs.
var defaultSchedulerConfig = &structs.SchedulerConfiguration{
	PreemptionConfig: structs.PreemptionConfig{
		SystemSchedulerEnabled:  true,
		BatchSchedulerEnabled:   false,
		ServiceSchedulerEnabled: false,
	},
}
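// The default above permits preemption only for system jobs. As a hedged
// illustration (editorial, not part of the upstream file), an operator who
// also wants preemption for service jobs would construct the configuration
// with the same fields declared above before it is written through Raft:
//
//	cfg := &structs.SchedulerConfiguration{
//		PreemptionConfig: structs.PreemptionConfig{
//			SystemSchedulerEnabled:  true,
//			ServiceSchedulerEnabled: true, // differs from the default
//		},
//	}
//
// The field set shown is exactly the one used in defaultSchedulerConfig;
// other versions of Nomad may expose additional fields.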
// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var weAreLeaderCh chan struct{}
	var leaderLoop sync.WaitGroup
	for {
		select {
		case isLeader := <-s.leaderCh:
			switch {
			case isLeader:
				if weAreLeaderCh != nil {
					s.logger.Error("attempted to start the leader loop while running")
					continue
				}

				weAreLeaderCh = make(chan struct{})
				leaderLoop.Add(1)
				go func(ch chan struct{}) {
					defer leaderLoop.Done()
					s.leaderLoop(ch)
				}(weAreLeaderCh)
				s.logger.Info("cluster leadership acquired")

			default:
				if weAreLeaderCh == nil {
					s.logger.Error("attempted to stop the leader loop while not running")
					continue
				}

				s.logger.Debug("shutting down leader loop")
				close(weAreLeaderCh)
				leaderLoop.Wait()
				weAreLeaderCh = nil
				s.logger.Info("cluster leadership lost")
			}

		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(barrierWriteTimeout)
	if err := barrier.Error(); err != nil {
		s.logger.Error("failed to wait for barrier", "error", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Error("failed to establish leadership", "error", err)

			// Immediately revoke leadership since we didn't successfully
			// establish leadership.
			if err := s.revokeLeadership(); err != nil {
				s.logger.Error("failed to revoke leadership", "error", err)
			}

			goto WAIT
		}

		establishedLeader = true
		defer func() {
			if err := s.revokeLeadership(); err != nil {
				s.logger.Error("failed to revoke leadership", "error", err)
			}
		}()
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Error("failed to reconcile", "error", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

	// Poll the stop channel to give it priority so we don't waste time
	// trying to perform the other operations if we have been asked to shut
	// down.
	select {
	case <-stopCh:
		return
	default:
	}

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}
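// As a hedged summary of the control flow above (the labels are from the
// code, the narration is editorial): leaderLoop is a small goto-based state
// machine. RECONCILE issues a Raft barrier, establishes leadership at most
// once per term, and reconciles Serf membership; any failure jumps to WAIT,
// which blocks until the reconcile interval fires (back to RECONCILE), a
// member event arrives, or loss of leadership/shutdown ends the loop. Note
// that a failed establishLeadership is immediately followed by
// revokeLeadership before entering WAIT, so the server never sits in WAIT
// with half-established leader state.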
// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())

	// Generate a leader ACL token. This will allow the leader to issue work
	// that requires a valid ACL token.
	s.setLeaderAcl(uuid.Generate())

	// Pause most of the scheduling workers to free cores for the plan queue
	// and evaluation broker.
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier, which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Initialize and start the autopilot routine
	s.getOrCreateAutopilotConfig()
	s.autopilot.Start()

	// Initialize scheduler configuration
	s.getOrCreateSchedulerConfig()

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)
	s.blockedEvals.SetTimetable(s.fsm.TimeTable())

	// Enable the deployment watcher, since we are now the leader
	s.deploymentWatcher.SetEnabled(true, s.State())

	// Enable the NodeDrainer
	s.nodeDrainer.SetEnabled(true, s.State())

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the Vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Periodically publish job summary metrics
	go s.publishJobSummaryMetrics(stopCh)

	// Periodically publish job status metrics
	go s.publishJobStatusMetrics(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of
	// failover. The TTL contract is that the session will not be expired
	// before the TTL, so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Error("heartbeat timer setup failed", "error", err)
		return err
	}

	// Start replication of ACLs and Policies if they are enabled,
	// and we are not the authoritative region.
	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
		go s.replicateACLPolicies(stopCh)
		go s.replicateACLTokens(stopCh)
	}

	// Setup any enterprise systems required.
	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
		return err
	}

	s.setConsistentReadReady()

	return nil
}
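// Every background task started above receives the per-term stopCh and is
// expected to exit promptly when it closes. A minimal sketch of that shape,
// assuming only a hypothetical doWork helper (the real tasks block on a
// ticker or a dequeue with a timeout instead of spinning):
//
//	func (s *Server) exampleLeaderTask(stopCh chan struct{}) {
//		for {
//			select {
//			case <-stopCh:
//				return // leadership lost or server shutting down
//			default:
//				doWork() // hypothetical unit of leader-only work
//			}
//		}
//	}
//
// These goroutines are signalled via stopCh but never joined; correctness
// relies on each one observing the channel frequently.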
// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored
// anytime a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}
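// The revocation decision above reduces to a small truth table (editorial
// summary of the code, not additional behavior): an accessor is kept only
// when both its allocation and its node exist and are non-terminal.
//
//	alloc missing or alloc.Terminated()     -> revoke
//	node missing or node.TerminalStatus()   -> revoke
//	otherwise                               -> keep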
// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if a periodic job should have
// launched during the leadership transition and, if so, force runs it. The
// periodic dispatcher is maintained only by the leader, so it must be
// restored anytime a leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	logger := s.logger.Named("periodic")
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// We skip adding parameterized jobs because they themselves aren't
		// tracked, only the dispatched children are.
		if job.IsParameterized() {
			continue
		}

		if err := s.periodicDispatcher.Add(job); err != nil {
			logger.Error("failed to add job to periodic dispatcher", "error", err)
			continue
		}

		// We do not need to force run the job since it isn't active.
		if !job.IsPeriodicActive() {
			continue
		}

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}
		if launch == nil {
			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
				job.ID, job.Namespace)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
		if err != nil {
			logger.Error("failed to determine next periodic launch for job", "job", job.NamespacedID(), "error", err)
			continue
		}

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
			logger.Error("force run of periodic job failed", "job", job.NamespacedID(), "error", err)
			return fmt.Errorf("force run of periodic job %q failed: %v", job.NamespacedID(), err)
		}
		logger.Debug("periodic job force ran during leadership establishment", "job", job.NamespacedID())
	}

	return nil
}
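// A worked example of the catch-up rule above, with hypothetical times: for
// a job on the cron spec "0 * * * *" whose last recorded launch was 08:00,
// Periodic.Next returns 09:00. If leadership changed hands and it is now
// 09:30, nextLaunch (09:00) is before now, so the job is force run once and
// the regular dispatcher handles 10:00 onward. Had the failover finished by
// 08:45 instead, 09:00 would still be in the future and no force run would
// occur.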
// schedulePeriodic is used to dispatch the periodic core GC jobs while we
// are leader.
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()
	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
	defer deploymentGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true
	// if the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Error("failed to determine state store's index", "error", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {
		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-deploymentGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   "-",
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		LeaderACL:   s.getLeaderAcl(),
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			updateEval := eval.Copy()
			updateEval.Status = structs.EvalStatusFailed
			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())

			// Create a follow-up evaluation that will be used to retry the
			// scheduling for the job after the cluster is hopefully more stable
			// due to the fairly large backoff.
			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))

			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
			updateEval.NextEval = followupEval.ID
			updateEval.UpdateModifyTime()

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{updateEval, followupEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}
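// The follow-up delay above is uniformly jittered: it falls in the interval
// [EvalFailedFollowupBaselineDelay, EvalFailedFollowupBaselineDelay +
// EvalFailedFollowupDelayRange). With hypothetical settings of a 1m baseline
// and a 5m range, a failed eval is retried somewhere between 1m and 6m
// later, which spreads retries out instead of stampeding the scheduler the
// moment the cluster recovers.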
// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations
// that should be cancelled.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				newEval.UpdateModifyTime()
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Error("failed to update duplicate evals", "evals", log.Fmt("%#v", cancel), "error", err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Error("failed to get state", "error", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.JobSummaries(ws)
			if err != nil {
				s.logger.Error("failed to get job summaries", "error", err)
				continue
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				summary := raw.(*structs.JobSummary)
				if s.config.DisableDispatchedJobSummaryMetrics {
					job, err := state.JobByID(ws, summary.Namespace, summary.JobID)
					if err != nil {
						s.logger.Error("error getting job for summary", "error", err)
						continue
					}
					if job.Dispatched {
						continue
					}
				}
				s.iterateJobSummaryMetrics(summary)
			}
		}
	}
}
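// Editorial note on the time.NewTimer(0)/Reset pattern above: the first
// collection fires immediately after leadership is established, then every
// StatsCollectionInterval thereafter; a plain time.Ticker would delay the
// first sample by a full interval. The timer is reset at the top of the
// case, so the cadence is start-to-start, and because a single goroutine
// does the work, collections never overlap; a slow pass simply makes the
// next one fire as soon as it finishes.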
func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
	for name, tgSummary := range summary.Summary {
		if !s.config.DisableTaggedMetrics {
			labels := []metrics.Label{
				{
					Name:  "job_name",
					Value: summary.JobID,
				},
				{
					Name:  "task_group",
					Value: name,
				},
				{
					Name:  "namespace",
					Value: summary.Namespace,
				},
			}

			if strings.Contains(summary.JobID, "/dispatch-") {
				jobInfo := strings.Split(summary.JobID, "/dispatch-")
				labels = append(labels, metrics.Label{
					Name:  "parent_id",
					Value: jobInfo[0],
				}, metrics.Label{
					Name:  "dispatch_id",
					Value: jobInfo[1],
				})
			}

			if strings.Contains(summary.JobID, "/periodic-") {
				jobInfo := strings.Split(summary.JobID, "/periodic-")
				labels = append(labels, metrics.Label{
					Name:  "parent_id",
					Value: jobInfo[0],
				}, metrics.Label{
					Name:  "periodic_id",
					Value: jobInfo[1],
				})
			}

			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
				float32(tgSummary.Queued), labels)
			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
				float32(tgSummary.Complete), labels)
			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
				float32(tgSummary.Failed), labels)
			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
				float32(tgSummary.Running), labels)
			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
				float32(tgSummary.Starting), labels)
			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
				float32(tgSummary.Lost), labels)
		}
		if s.config.BackwardsCompatibleMetrics {
			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
		}
	}
}

// publishJobStatusMetrics publishes the job statuses as metrics
func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Error("failed to get state", "error", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.Jobs(ws)
			if err != nil {
				s.logger.Error("failed to get job statuses", "error", err)
				continue
			}

			s.iterateJobStatusMetrics(iter)
		}
	}
}

func (s *Server) iterateJobStatusMetrics(jobs memdb.ResultIterator) {
	var pending int64 // Count of jobs in the 'pending' state
	var running int64 // Count of jobs in the 'running' state
	var dead int64    // Count of jobs in the 'dead' state

	for {
		raw := jobs.Next()
		if raw == nil {
			break
		}

		job := raw.(*structs.Job)

		switch job.Status {
		case structs.JobStatusPending:
			pending++
		case structs.JobStatusRunning:
			running++
		case structs.JobStatusDead:
			dead++
		}
	}

	metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending))
	metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running))
	metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead))
}
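// A concrete (hypothetical) example of the label derivation above: a child
// of a parameterized job with JobID "batch/dispatch-1579459743-ab12cd34"
// emits nomad.job_summary.queued and friends with labels job_name (the full
// child ID), task_group, and namespace, plus parent_id="batch" and
// dispatch_id="1579459743-ab12cd34" obtained by splitting the ID on
// "/dispatch-". Children of periodic jobs are labeled the same way using
// the "/periodic-" separator.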
// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())

	s.resetConsistentReadReady()

	// Clear the leader token since we are no longer the leader.
	s.setLeaderAcl("")

	// Disable autopilot
	s.autopilot.Stop()

	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Disable the deployment watcher as it is only useful as a leader.
	s.deploymentWatcher.SetEnabled(false, nil)

	// Disable the node drainer
	s.nodeDrainer.SetEnabled(false, nil)

	// Disable any enterprise systems required.
	if err := s.revokeEnterpriseLeadership(); err != nil {
		return err
	}

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Error("clearing heartbeat timers failed", "error", err)
		return err
	}

	// Unpause the workers we paused in establishLeadership. The bound must
	// match the pause loop there (3/4 of the workers), otherwise some
	// workers would stay paused after stepping down.
	if len(s.workers) > 1 {
		for i := 0; i < (3 * len(s.workers) / 4); i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Error("failed to reconcile member", "member", member, "error", err)
		return err
	}
	return nil
}
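// Editorial summary of the mapping above: Serf gossip is the source of
// membership events, Raft is the source of truth for the voting
// configuration, and reconcileMember translates one into the other. Members
// outside our region, or that are not Nomad servers at all, are ignored.
// StatusAlive maps to addRaftPeer, and StatusLeft plus the reap status map
// to removeRaftPeer; any other Serf status (such as failed) is left alone,
// so a merely flapping node is not churned out of the Raft quorum.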
// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	members := s.serf.Members()
	if parts.Bootstrap {
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Error("skipping adding Raft peer because an existing peer is in bootstrap mode and only one server should be in bootstrap mode",
					"existing_peer", member.Name, "joining_peer", m.Name)
				return nil
			}
		}
	}

	// Processing ourselves could result in trying to remove ourselves to
	// fix up our address, which would make us step down. This is only
	// safe to attempt if there are multiple servers available.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Error("failed to get raft configuration", "error", err)
		return err
	}

	if m.Name == s.config.NodeName {
		if l := len(configFuture.Configuration().Servers); l < 3 {
			s.logger.Debug("skipping self join check for peer since the cluster is too small", "peer", m.Name)
			return nil
		}
	}

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries. If the address is the same but the ID changed, remove the
	// old server before adding the new one.
	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		// No-op if the raft version is too low
		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
			return nil
		}

		// If the address or ID matches an existing server, see if we need to remove the old one first
		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
			// Exit with no-op if this is being called on an existing server and both the ID and address match
			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
				return nil
			}
			future := s.raft.RemoveServer(server.ID, 0, 0)
			if server.Address == raft.ServerAddress(addr) {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
				}
				s.logger.Info("removed server with duplicate address", "address", server.Address)
			} else {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
				}
				s.logger.Info("removed server with duplicate ID", "id", server.ID)
			}
		}
	}

	// Attempt to add as a peer
	switch {
	case minRaftProtocol >= 3:
		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Error("failed to add raft peer", "error", err)
			return err
		}
	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Error("failed to add raft peer", "error", err)
			return err
		}
	default:
		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
		if err := addFuture.Error(); err != nil {
			s.logger.Error("failed to add raft peer", "error", err)
			return err
		}
	}

	return nil
}
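// The switch above gates how a joining server is added; a hedged worked
// example: when autopilot reports MinRaftProtocol >= 3, the peer is added
// with AddNonvoter, with the expectation that autopilot promotes it to a
// voter once its log catches up; with MinRaftProtocol == 2 and a raft-3
// peer, it is added directly as a voter via AddVoter; otherwise the legacy
// address-only AddPeer API is used. A peer whose address is already present
// under an old raft version is a no-op, and a stale entry sharing the
// address or the ID is removed before the add, per the loop above.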
// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Error("failed to get raft configuration", "error", err)
		return err
	}

	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}

	// Pick which remove API to use based on how the server was added.
	for _, server := range configFuture.Configuration().Servers {
		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
			s.logger.Info("removing server by ID", "id", server.ID)
			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
			if err := future.Error(); err != nil {
				s.logger.Error("failed to remove raft peer", "id", server.ID, "error", err)
				return err
			}
			break
		} else if server.Address == raft.ServerAddress(addr) {
			// If not, use the old remove API
			s.logger.Info("removing server by address", "address", server.Address)
			future := s.raft.RemovePeer(raft.ServerAddress(addr))
			if err := future.Error(); err != nil {
				s.logger.Error("failed to remove raft peer", "address", addr, "error", err)
				return err
			}
			break
		}
	}

	return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
	req := structs.ACLPolicyListRequest{
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of policies
			var resp structs.ACLPolicyListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListPolicies", &req, &resp)
			if err != nil {
				s.logger.Error("failed to fetch policies from authoritative region", "error", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

			// Delete policies that should not exist
			if len(delete) > 0 {
				args := &structs.ACLPolicyDeleteRequest{
					Names: delete,
				}
				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
				if err != nil {
					s.logger.Error("failed to delete policies", "error", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated policies
			var fetched []*structs.ACLPolicy
			if len(update) > 0 {
				req := structs.ACLPolicySetRequest{
					Names: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLPolicySetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetPolicies", &req, &reply); err != nil {
					s.logger.Error("failed to fetch policies from authoritative region", "error", err)
					goto ERR_WAIT
				}
				for _, policy := range reply.Policies {
					fetched = append(fetched, policy)
				}
			}

			// Update local policies
			if len(fetched) > 0 {
				args := &structs.ACLPolicyUpsertRequest{
					Policies: fetched,
				}
				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
				if err != nil {
					s.logger.Error("failed to update policies", "error", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index; the next list request blocks
			// until there is a change past this index.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}
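// Editorial note on the loop above: replication is a long poll. Each pass
// lists policies in the authoritative region with AllowStale and
// MinQueryIndex set, so the RPC blocks server-side until the remote index
// advances; the limiter merely caps how often polls can be issued (10/s,
// per replicationRateLimit) rather than pacing a tight loop. On any error
// the goroutine backs off for ReplicationBackoff and restarts from START,
// resuming from the index it had already reached.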
// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
	// Construct a set of the local and remote policies
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local policies
	iter, err := state.ACLPolicies(nil)
	if err != nil {
		panic("failed to iterate local policies")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		policy := raw.(*structs.ACLPolicy)
		local[policy.Name] = policy.Hash
	}

	// Iterate over the remote policies
	for _, rp := range remoteList {
		remote[rp.Name] = struct{}{}

		// Check if the policy is missing locally
		if localHash, ok := local[rp.Name]; !ok {
			update = append(update, rp.Name)

			// Check if the policy is newer remotely and there is a hash mismatch.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.Name)
		}
	}

	// Check if a local policy should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}
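// A small worked example of the diff semantics above (hypothetical names):
// local = {alpha, beta}, remote = {beta, gamma}. alpha is absent remotely,
// so it lands in delete. gamma is absent locally, so it lands in update.
// beta is re-fetched only if its remote ModifyIndex is newer than minIndex
// AND its hash differs from the local copy; an equal hash means the content
// already matches and no upsert is issued.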
// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
	req := structs.ACLTokenListRequest{
		GlobalOnly: true,
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of tokens
			var resp structs.ACLTokenListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListTokens", &req, &resp)
			if err != nil {
				s.logger.Error("failed to fetch tokens from authoritative region", "error", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

			// Delete tokens that should not exist
			if len(delete) > 0 {
				args := &structs.ACLTokenDeleteRequest{
					AccessorIDs: delete,
				}
				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
				if err != nil {
					s.logger.Error("failed to delete tokens", "error", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated tokens.
			var fetched []*structs.ACLToken
			if len(update) > 0 {
				req := structs.ACLTokenSetRequest{
					AccessorIDS: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLTokenSetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetTokens", &req, &reply); err != nil {
					s.logger.Error("failed to fetch tokens from authoritative region", "error", err)
					goto ERR_WAIT
				}
				for _, token := range reply.Tokens {
					fetched = append(fetched, token)
				}
			}

			// Update local tokens
			if len(fetched) > 0 {
				args := &structs.ACLTokenUpsertRequest{
					Tokens: fetched,
				}
				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
				if err != nil {
					s.logger.Error("failed to update tokens", "error", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index; the next list request blocks
			// until there is a change past this index.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}
// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
	// Construct a set of the local and remote tokens
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local global tokens
	iter, err := state.ACLTokensByGlobal(nil, true)
	if err != nil {
		panic("failed to iterate local tokens")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		token := raw.(*structs.ACLToken)
		local[token.AccessorID] = token.Hash
	}

	// Iterate over the remote tokens
	for _, rp := range remoteList {
		remote[rp.AccessorID] = struct{}{}

		// Check if the token is missing locally
		if localHash, ok := local[rp.AccessorID]; !ok {
			update = append(update, rp.AccessorID)

			// Check if the token is newer remotely and there is a hash mismatch.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.AccessorID)
		}
	}

	// Check if a local token should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
	state := s.fsm.State()
	_, config, err := state.AutopilotConfig()
	if err != nil {
		s.logger.Named("autopilot").Error("failed to get autopilot config", "error", err)
		return nil
	}
	if config != nil {
		return config
	}

	if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion, false) {
		s.logger.Named("autopilot").Warn("can't initialize until all servers are above minimum version", "min_version", minAutopilotVersion)
		return nil
	}

	config = s.config.AutopilotConfig
	req := structs.AutopilotSetConfigRequest{Config: *config}
	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
		s.logger.Named("autopilot").Error("failed to initialize config", "error", err)
		return nil
	}

	return config
}

// getOrCreateSchedulerConfig is used to get the scheduler config. We create a default
// config if it doesn't already exist for bootstrapping an empty cluster
func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration {
	state := s.fsm.State()
	_, config, err := state.SchedulerConfig()
	if err != nil {
		s.logger.Named("core").Error("failed to get scheduler config", "error", err)
		return nil
	}
	if config != nil {
		return config
	}
	if !ServersMeetMinimumVersion(s.Members(), minSchedulerConfigVersion, false) {
		s.logger.Named("core").Warn("can't initialize scheduler config until all servers are above minimum version", "min_version", minSchedulerConfigVersion)
		return nil
	}

	req := structs.SchedulerSetConfigRequest{Config: *defaultSchedulerConfig}
	if _, _, err = s.raftApply(structs.SchedulerConfigRequestType, req); err != nil {
		s.logger.Named("core").Error("failed to initialize config", "error", err)
		return nil
	}

	// config is still nil on this path (nothing previously existed), so
	// return the default that was just applied rather than nil.
	return defaultSchedulerConfig
}
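// Bootstrap note on the two getters above (editorial): both are safe to
// call on every leadership acquisition. They return the stored
// configuration when one exists, refuse to write anything until every
// server in the region meets the minimum version (0.8.0 for autopilot,
// 0.9.0 for scheduler config, per the checks above) so a mixed-version
// cluster does not apply a Raft message older servers cannot decode, and
// otherwise write the default exactly once via raftApply.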