github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/leader.go

package nomad

import (
    "bytes"
    "context"
    "errors"
    "fmt"
    "math/rand"
    "net"
    "sync"
    "time"

    "golang.org/x/time/rate"

    "github.com/armon/go-metrics"
    memdb "github.com/hashicorp/go-memdb"
    "github.com/hashicorp/go-version"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/state"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/hashicorp/raft"
    "github.com/hashicorp/serf/serf"
)

const (
    // failedEvalUnblockInterval is the interval at which failed evaluations are
    // unblocked to re-enter the scheduler. A failed evaluation occurs under
    // high contention when the scheduler's plan does not make progress.
    failedEvalUnblockInterval = 1 * time.Minute

    // replicationRateLimit is used to rate limit how often data is replicated
    // between the authoritative region and the local region
    replicationRateLimit rate.Limit = 10.0

    // barrierWriteTimeout is used to give Raft a chance to process a
    // possible loss of leadership event if we are unable to get a barrier
    // while leader.
    barrierWriteTimeout = 2 * time.Minute
)

var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes
func (s *Server) monitorLeadership() {
    var weAreLeaderCh chan struct{}
    var leaderLoop sync.WaitGroup
    for {
        select {
        case isLeader := <-s.leaderCh:
            switch {
            case isLeader:
                if weAreLeaderCh != nil {
                    s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
                    continue
                }

                weAreLeaderCh = make(chan struct{})
                leaderLoop.Add(1)
                go func(ch chan struct{}) {
                    defer leaderLoop.Done()
                    s.leaderLoop(ch)
                }(weAreLeaderCh)
                s.logger.Printf("[INFO] nomad: cluster leadership acquired")

            default:
                if weAreLeaderCh == nil {
                    s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
                    continue
                }

                s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
                close(weAreLeaderCh)
                leaderLoop.Wait()
                weAreLeaderCh = nil
                s.logger.Printf("[INFO] nomad: cluster leadership lost")
            }

        case <-s.shutdownCh:
            return
        }
    }
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
    var reconcileCh chan serf.Member
    establishedLeader := false

RECONCILE:
    // Setup a reconciliation timer
    reconcileCh = nil
    interval := time.After(s.config.ReconcileInterval)

    // Apply a raft barrier to ensure our FSM is caught up
    start := time.Now()
    barrier := s.raft.Barrier(barrierWriteTimeout)
    if err := barrier.Error(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
        goto WAIT
    }
    metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

    // Check if we need to handle initial leadership actions
    if !establishedLeader {
        if err := s.establishLeadership(stopCh); err != nil {
            s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)

            // Immediately revoke leadership since we didn't successfully
            // establish leadership.
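            // revokeLeadership cleans up any leader-only subsystems that were
            // partially enabled before the failure, so the next reconcile
            // interval retries leadership establishment from a clean state.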
            if err := s.revokeLeadership(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
            }

            goto WAIT
        }

        establishedLeader = true
        defer func() {
            if err := s.revokeLeadership(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
            }
        }()
    }

    // Reconcile any missing data
    if err := s.reconcile(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
        goto WAIT
    }

    // Initial reconcile worked, now we can process the channel
    // updates
    reconcileCh = s.reconcileCh

    // Poll the stop channel to give it priority so we don't waste time
    // trying to perform the other operations if we have been asked to shut
    // down.
    select {
    case <-stopCh:
        return
    default:
    }

WAIT:
    // Wait until leadership is lost
    for {
        select {
        case <-stopCh:
            return
        case <-s.shutdownCh:
            return
        case <-interval:
            goto RECONCILE
        case member := <-reconcileCh:
            s.reconcileMember(member)
        }
    }
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
    defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())

    // Generate a leader ACL token. This will allow the leader to issue work
    // that requires a valid ACL token.
    s.setLeaderAcl(uuid.Generate())

    // Disable some of the workers to free cores for use in the plan queue and
    // evaluation broker
    if numWorkers := len(s.workers); numWorkers > 1 {
        // Disabling 3/4 of the workers frees CPU for raft and the
        // plan applier which uses 1/2 the cores.
        for i := 0; i < (3 * numWorkers / 4); i++ {
            s.workers[i].SetPause(true)
        }
    }

    // Initialize and start the autopilot routine
    s.getOrCreateAutopilotConfig()
    s.autopilot.Start()

    // Enable the plan queue, since we are now the leader
    s.planQueue.SetEnabled(true)

    // Start the plan evaluator
    go s.planApply()

    // Enable the eval broker, since we are now the leader
    s.evalBroker.SetEnabled(true)

    // Enable the blocked eval tracker, since we are now the leader
    s.blockedEvals.SetEnabled(true)
    s.blockedEvals.SetTimetable(s.fsm.TimeTable())

    // Enable the deployment watcher, since we are now the leader
    s.deploymentWatcher.SetEnabled(true, s.State())

    // Enable the NodeDrainer
    s.nodeDrainer.SetEnabled(true, s.State())

    // Restore the eval broker state
    if err := s.restoreEvals(); err != nil {
        return err
    }

    // Activate the vault client
    s.vault.SetActive(true)
    if err := s.restoreRevokingAccessors(); err != nil {
        return err
    }

    // Enable the periodic dispatcher, since we are now the leader.
    s.periodicDispatcher.SetEnabled(true)

    // Restore the periodic dispatcher state
    if err := s.restorePeriodicDispatcher(); err != nil {
        return err
    }

    // Schedule periodic jobs
    go s.schedulePeriodic(stopCh)

    // Reap any failed evaluations
    go s.reapFailedEvaluations(stopCh)

    // Reap any duplicate blocked evaluations
    go s.reapDupBlockedEvaluations(stopCh)

    // Periodically unblock failed evaluations
    go s.periodicUnblockFailedEvals(stopCh)

    // Periodically publish job summary metrics
    go s.publishJobSummaryMetrics(stopCh)

    // Setup the heartbeat timers. This is done both when starting up and when
    // a leader failover happens. Since the timers are maintained by the leader
    // node, effectively this means all the timers are renewed at the time of failover.
    // The TTL contract is that the session will not be expired before the TTL,
    // so expiring it later is allowable.
    //
    // This MUST be done after the initial barrier to ensure the latest Nodes
    // are available to be initialized. Otherwise initialization may use stale
    // data.
    if err := s.initializeHeartbeatTimers(); err != nil {
        s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
        return err
    }

    // COMPAT 0.4 - 0.4.1
    // Reconcile the summaries of the registered jobs. We reconcile summaries
    // only if the server is 0.4.1, since summaries are not present in 0.4 and
    // might be incorrect after upgrading to 0.4.1.
    if err := s.reconcileJobSummaries(); err != nil {
        return fmt.Errorf("unable to reconcile job summaries: %v", err)
    }

    // Start replication of ACLs and Policies if they are enabled,
    // and we are not the authoritative region.
    if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
        go s.replicateACLPolicies(stopCh)
        go s.replicateACLTokens(stopCh)
    }

    // Setup any enterprise systems required.
    if err := s.establishEnterpriseLeadership(stopCh); err != nil {
        return err
    }

    return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored anytime
// a leadership transition takes place.
func (s *Server) restoreEvals() error {
    // Get an iterator over every evaluation
    ws := memdb.NewWatchSet()
    iter, err := s.fsm.State().Evals(ws)
    if err != nil {
        return fmt.Errorf("failed to get evaluations: %v", err)
    }

    for {
        raw := iter.Next()
        if raw == nil {
            break
        }
        eval := raw.(*structs.Evaluation)

        if eval.ShouldEnqueue() {
            s.evalBroker.Enqueue(eval)
        } else if eval.ShouldBlock() {
            s.blockedEvals.Block(eval)
        }
    }
    return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
    // An accessor should be revoked if its allocation or node is terminal
    ws := memdb.NewWatchSet()
    state := s.fsm.State()
    iter, err := state.VaultAccessors(ws)
    if err != nil {
        return fmt.Errorf("failed to get vault accessors: %v", err)
    }

    var revoke []*structs.VaultAccessor
    for {
        raw := iter.Next()
        if raw == nil {
            break
        }

        va := raw.(*structs.VaultAccessor)

        // Check the allocation
        alloc, err := state.AllocByID(ws, va.AllocID)
        if err != nil {
            return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
        }
        if alloc == nil || alloc.Terminated() {
            // No longer running and should be revoked
            revoke = append(revoke, va)
            continue
        }

        // Check the node
        node, err := state.NodeByID(ws, va.NodeID)
        if err != nil {
            return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
        }
        if node == nil || node.TerminalStatus() {
            // Node is terminal so any accessor from it should be revoked
            revoke = append(revoke, va)
            continue
        }
    }

    if len(revoke) != 0 {
        if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
            return fmt.Errorf("failed to revoke tokens: %v", err)
        }
    }

    return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if a periodic job should have been
// created during the leadership transition and, if so, force runs it. The periodic
// dispatcher is maintained only by the leader, so it must be restored anytime a
// leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
    ws := memdb.NewWatchSet()
    iter, err := s.fsm.State().JobsByPeriodic(ws, true)
    if err != nil {
        return fmt.Errorf("failed to get periodic jobs: %v", err)
    }

    now := time.Now()
    for i := iter.Next(); i != nil; i = iter.Next() {
        job := i.(*structs.Job)

        // We skip adding parameterized jobs because they themselves aren't
        // tracked, only the dispatched children are.
        if job.IsParameterized() {
            continue
        }

        if err := s.periodicDispatcher.Add(job); err != nil {
            s.logger.Printf("[ERR] nomad.periodic: %v", err)
            continue
        }

        // We do not need to force run the job since it isn't active.
        if !job.IsPeriodicActive() {
            continue
        }

        // If the periodic job has never been launched before, launch will hold
        // the time the periodic job was added. Otherwise it has the last launch
        // time of the periodic job.
        launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
        if err != nil {
            return fmt.Errorf("failed to get periodic launch time: %v", err)
        }
        if launch == nil {
            return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
                job.ID, job.Namespace)
        }

        // nextLaunch is the next launch that should occur.
        nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
        if err != nil {
            s.logger.Printf("[ERR] nomad.periodic: failed to determine next periodic launch for job %s: %v", job.NamespacedID(), err)
            continue
        }

        // We skip force launching the job if there should be no next launch
        // (the zero case) or if the next launch time is in the future. If it is
        // in the future, it will be handled by the periodic dispatcher.
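        // Otherwise the launch was missed while there was no leader, so force
        // run the job now.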
        if nextLaunch.IsZero() || !nextLaunch.Before(now) {
            continue
        }

        if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
            msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
            s.logger.Printf("[ERR] nomad.periodic: %s", msg)
            return errors.New(msg)
        }
        s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
            " run during leadership establishment", job.ID)
    }

    return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
    evalGC := time.NewTicker(s.config.EvalGCInterval)
    defer evalGC.Stop()
    nodeGC := time.NewTicker(s.config.NodeGCInterval)
    defer nodeGC.Stop()
    jobGC := time.NewTicker(s.config.JobGCInterval)
    defer jobGC.Stop()
    deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
    defer deploymentGC.Stop()

    // getLatest grabs the latest index from the state store. It returns true if
    // the index was retrieved successfully.
    getLatest := func() (uint64, bool) {
        snapshotIndex, err := s.fsm.State().LatestIndex()
        if err != nil {
            s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
            return 0, false
        }

        return snapshotIndex, true
    }

    for {

        select {
        case <-evalGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
            }
        case <-nodeGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
            }
        case <-jobGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
            }
        case <-deploymentGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
            }
        case <-stopCh:
            return
        }
    }
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
    return &structs.Evaluation{
        ID:          uuid.Generate(),
        Namespace:   "-",
        Priority:    structs.CoreJobPriority,
        Type:        structs.JobTypeCore,
        TriggeredBy: structs.EvalTriggerScheduled,
        JobID:       job,
        LeaderACL:   s.getLeaderAcl(),
        Status:      structs.EvalStatusPending,
        ModifyIndex: modifyIndex,
    }
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
    for {
        select {
        case <-stopCh:
            return
        default:
            // Scan for a failed evaluation
            eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
            if err != nil {
                return
            }
            if eval == nil {
                continue
            }

            // Update the status to failed
            updateEval := eval.Copy()
            updateEval.Status = structs.EvalStatusFailed
            updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
            s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

            // Create a follow-up evaluation that will be used to retry the
            // scheduling for the job after the cluster is hopefully more stable
            // due to the fairly large backoff.
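            // The wait below is the configured baseline delay plus a random
            // jitter drawn from the configured delay range.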
            followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
                time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
            followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

            // Update via Raft
            req := structs.EvalUpdateRequest{
                Evals: []*structs.Evaluation{updateEval, followupEval},
            }
            if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
                s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
                continue
            }

            // Ack completion
            s.evalBroker.Ack(eval.ID, token)
        }
    }
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations that
// should be cancelled.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
    for {
        select {
        case <-stopCh:
            return
        default:
            // Scan for duplicate blocked evals.
            dups := s.blockedEvals.GetDuplicates(time.Second)
            if dups == nil {
                continue
            }

            cancel := make([]*structs.Evaluation, len(dups))
            for i, dup := range dups {
                // Update the status to cancelled
                newEval := dup.Copy()
                newEval.Status = structs.EvalStatusCancelled
                newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
                cancel[i] = newEval
            }

            // Update via Raft
            req := structs.EvalUpdateRequest{
                Evals: cancel,
            }
            if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
                s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
                continue
            }
        }
    }
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
    ticker := time.NewTicker(failedEvalUnblockInterval)
    defer ticker.Stop()
    for {
        select {
        case <-stopCh:
            return
        case <-ticker.C:
            // Unblock the failed evaluations
            s.blockedEvals.UnblockFailed()
        }
    }
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
    timer := time.NewTimer(0)
    defer timer.Stop()

    for {
        select {
        case <-stopCh:
            return
        case <-timer.C:
            timer.Reset(s.config.StatsCollectionInterval)
            state, err := s.State().Snapshot()
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
                continue
            }
            ws := memdb.NewWatchSet()
            iter, err := state.JobSummaries(ws)
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
                continue
            }

            for {
                raw := iter.Next()
                if raw == nil {
                    break
                }
                summary := raw.(*structs.JobSummary)
                for name, tgSummary := range summary.Summary {
                    if !s.config.DisableTaggedMetrics {
                        labels := []metrics.Label{
                            {
                                Name:  "job",
                                Value: summary.JobID,
                            },
                            {
                                Name:  "task_group",
                                Value: name,
                            },
                        }
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
                            float32(tgSummary.Queued), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
                            float32(tgSummary.Complete), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
                            float32(tgSummary.Failed), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
                            float32(tgSummary.Running), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
                            float32(tgSummary.Starting), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
                            float32(tgSummary.Lost), labels)
                    }
                    if s.config.BackwardsCompatibleMetrics {
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
                    }
                }
            }
        }
    }
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
    defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())

    // Clear the leader token since we are no longer the leader.
    s.setLeaderAcl("")

    // Disable autopilot
    s.autopilot.Stop()

    // Disable the plan queue, since we are no longer leader
    s.planQueue.SetEnabled(false)

    // Disable the eval broker, since it is only useful as a leader
    s.evalBroker.SetEnabled(false)

    // Disable the blocked eval tracker, since it is only useful as a leader
    s.blockedEvals.SetEnabled(false)

    // Disable the periodic dispatcher, since it is only useful as a leader
    s.periodicDispatcher.SetEnabled(false)

    // Disable the Vault client as it is only useful as a leader.
    s.vault.SetActive(false)

    // Disable the deployment watcher as it is only useful as a leader.
    s.deploymentWatcher.SetEnabled(false, nil)

    // Disable the node drainer
    s.nodeDrainer.SetEnabled(false, nil)

    // Disable any enterprise systems required.
    if err := s.revokeEnterpriseLeadership(); err != nil {
        return err
    }

    // Clear the heartbeat timers on either shutdown or step down,
    // since we are no longer responsible for TTL expirations.
    if err := s.clearAllHeartbeatTimers(); err != nil {
        s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
        return err
    }

    // Unpause our workers if we paused them previously
    if len(s.workers) > 1 {
        for i := 0; i < len(s.workers)/2; i++ {
            s.workers[i].SetPause(false)
        }
    }
    return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
    defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
    members := s.serf.Members()
    for _, member := range members {
        if err := s.reconcileMember(member); err != nil {
            return err
        }
    }
    return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
    // Check if this is a member we should handle
    valid, parts := isNomadServer(member)
    if !valid || parts.Region != s.config.Region {
        return nil
    }
    defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

    // Do not reconcile ourself
    if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
        return nil
    }

    var err error
    switch member.Status {
    case serf.StatusAlive:
        err = s.addRaftPeer(member, parts)
    case serf.StatusLeft, StatusReap:
        err = s.removeRaftPeer(member, parts)
    }
    if err != nil {
        s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
            member, err)
        return err
    }
    return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
    index, err := s.fsm.state.LatestIndex()
    if err != nil {
        return fmt.Errorf("unable to read latest index: %v", err)
    }
    s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

    args := &structs.GenericResponse{}
    msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
    if _, _, err = s.raftApply(msg, args); err != nil {
        return fmt.Errorf("reconciliation of job summaries failed: %v", err)
    }

    return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
    // Do not join ourselves
    if m.Name == s.config.NodeName {
        s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
        return nil
    }

    // Check for possibility of multiple bootstrap nodes
    members := s.serf.Members()
    if parts.Bootstrap {
        for _, member := range members {
            valid, p := isNomadServer(member)
            if valid && member.Name != m.Name && p.Bootstrap {
                s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
                return nil
            }
        }
    }

    // See if it's already in the configuration. It's harmless to re-add it
    // but we want to avoid doing that if possible to prevent useless Raft
    // log entries.
    addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
    configFuture := s.raft.GetConfiguration()
    if err := configFuture.Error(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
        return err
    }
    for _, server := range configFuture.Configuration().Servers {
        if server.Address == raft.ServerAddress(addr) {
            return nil
        }
    }

    // See if it's already in the configuration. It's harmless to re-add it
    // but we want to avoid doing that if possible to prevent useless Raft
    // log entries. If the address is the same but the ID changed, remove the
    // old server before adding the new one.
    minRaftProtocol, err := s.autopilot.MinRaftProtocol()
    if err != nil {
        return err
    }
    for _, server := range configFuture.Configuration().Servers {
        // No-op if the raft version is too low
        if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
            return nil
        }

        // If the address or ID matches an existing server, see if we need to remove the old one first
        if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
            // Exit with no-op if this is being called on an existing server
            if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
                return nil
            }
            future := s.raft.RemoveServer(server.ID, 0, 0)
            if server.Address == raft.ServerAddress(addr) {
                if err := future.Error(); err != nil {
                    return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
                }
                s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address)
            } else {
                if err := future.Error(); err != nil {
                    return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
                }
                s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID)
            }
        }
    }

    // Attempt to add as a peer
    switch {
    case minRaftProtocol >= 3:
        addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
        if err := addFuture.Error(); err != nil {
            s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
            return err
        }
    case minRaftProtocol == 2 && parts.RaftVersion >= 3:
        addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
        if err := addFuture.Error(); err != nil {
            s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
            return err
        }
    default:
        addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
        if err := addFuture.Error(); err != nil {
            s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
            return err
        }
    }

    return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
    addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

    // See if it's already in the configuration. It's harmless to re-remove it
    // but we want to avoid doing that if possible to prevent useless Raft
    // log entries.
    configFuture := s.raft.GetConfiguration()
    if err := configFuture.Error(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
        return err
    }

    minRaftProtocol, err := s.autopilot.MinRaftProtocol()
    if err != nil {
        return err
    }

    // Pick which remove API to use based on how the server was added.
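    // When the cluster's minimum Raft protocol is v2 or higher the server can
    // be removed by its ID via RemoveServer; otherwise it is only known by its
    // address and must be removed via RemovePeer.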
    for _, server := range configFuture.Configuration().Servers {
        // If we understand the new add/remove APIs and the server was added by ID, use the new remove API
        if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
            s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID)
            future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
            if err := future.Error(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
                    server.ID, err)
                return err
            }
            break
        } else if server.Address == raft.ServerAddress(addr) {
            // If not, use the old remove API
            s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address)
            future := s.raft.RemovePeer(raft.ServerAddress(addr))
            if err := future.Error(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
                    addr, err)
                return err
            }
            break
        }
    }

    return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
    req := structs.ACLPolicyListRequest{
        QueryOptions: structs.QueryOptions{
            Region:     s.config.AuthoritativeRegion,
            AllowStale: true,
        },
    }
    limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
    s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)

START:
    for {
        select {
        case <-stopCh:
            return
        default:
            // Rate limit how often we attempt replication
            limiter.Wait(context.Background())

            // Fetch the list of policies
            var resp structs.ACLPolicyListResponse
            req.AuthToken = s.ReplicationToken()
            err := s.forwardRegion(s.config.AuthoritativeRegion,
                "ACL.ListPolicies", &req, &resp)
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
                goto ERR_WAIT
            }

            // Perform a two-way diff
            delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

            // Delete policies that should not exist
            if len(delete) > 0 {
                args := &structs.ACLPolicyDeleteRequest{
                    Names: delete,
                }
                _, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
                    goto ERR_WAIT
                }
            }

            // Fetch any outdated policies
            var fetched []*structs.ACLPolicy
            if len(update) > 0 {
                req := structs.ACLPolicySetRequest{
                    Names: update,
                    QueryOptions: structs.QueryOptions{
                        Region:        s.config.AuthoritativeRegion,
                        AuthToken:     s.ReplicationToken(),
                        AllowStale:    true,
                        MinQueryIndex: resp.Index - 1,
                    },
                }
                var reply structs.ACLPolicySetResponse
                if err := s.forwardRegion(s.config.AuthoritativeRegion,
                    "ACL.GetPolicies", &req, &reply); err != nil {
                    s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
                    goto ERR_WAIT
                }
                for _, policy := range reply.Policies {
                    fetched = append(fetched, policy)
                }
            }

            // Update local policies
            if len(fetched) > 0 {
                args := &structs.ACLPolicyUpsertRequest{
                    Policies: fetched,
                }
                _, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
                    goto ERR_WAIT
                }
            }

            // Update the minimum query index, blocks until there
            // is a change.
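            // With MinQueryIndex set, the next ACL.ListPolicies request becomes
            // a blocking query against the authoritative region.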
            req.MinQueryIndex = resp.Index
        }
    }

ERR_WAIT:
    select {
    case <-time.After(s.config.ReplicationBackoff):
        goto START
    case <-stopCh:
        return
    }
}

// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
    // Construct a set of the local and remote policies
    local := make(map[string][]byte)
    remote := make(map[string]struct{})

    // Add all the local policies
    iter, err := state.ACLPolicies(nil)
    if err != nil {
        panic("failed to iterate local policies")
    }
    for {
        raw := iter.Next()
        if raw == nil {
            break
        }
        policy := raw.(*structs.ACLPolicy)
        local[policy.Name] = policy.Hash
    }

    // Iterate over the remote policies
    for _, rp := range remoteList {
        remote[rp.Name] = struct{}{}

        // Check if the policy is missing locally
        if localHash, ok := local[rp.Name]; !ok {
            update = append(update, rp.Name)

            // Check if the policy is newer remotely and there is a hash mismatch.
        } else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
            update = append(update, rp.Name)
        }
    }

    // Check if policy should be deleted
    for lp := range local {
        if _, ok := remote[lp]; !ok {
            delete = append(delete, lp)
        }
    }
    return
}

// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
    req := structs.ACLTokenListRequest{
        GlobalOnly: true,
        QueryOptions: structs.QueryOptions{
            Region:     s.config.AuthoritativeRegion,
            AllowStale: true,
        },
    }
    limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
    s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)

START:
    for {
        select {
        case <-stopCh:
            return
        default:
            // Rate limit how often we attempt replication
            limiter.Wait(context.Background())

            // Fetch the list of tokens
            var resp structs.ACLTokenListResponse
            req.AuthToken = s.ReplicationToken()
            err := s.forwardRegion(s.config.AuthoritativeRegion,
                "ACL.ListTokens", &req, &resp)
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
                goto ERR_WAIT
            }

            // Perform a two-way diff
            delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

            // Delete tokens that should not exist
            if len(delete) > 0 {
                args := &structs.ACLTokenDeleteRequest{
                    AccessorIDs: delete,
                }
                _, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
                    goto ERR_WAIT
                }
            }

            // Fetch any outdated tokens.
            var fetched []*structs.ACLToken
            if len(update) > 0 {
                req := structs.ACLTokenSetRequest{
                    AccessorIDS: update,
                    QueryOptions: structs.QueryOptions{
                        Region:        s.config.AuthoritativeRegion,
                        AuthToken:     s.ReplicationToken(),
                        AllowStale:    true,
                        MinQueryIndex: resp.Index - 1,
                    },
                }
                var reply structs.ACLTokenSetResponse
                if err := s.forwardRegion(s.config.AuthoritativeRegion,
                    "ACL.GetTokens", &req, &reply); err != nil {
                    s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
                    goto ERR_WAIT
                }
                for _, token := range reply.Tokens {
                    fetched = append(fetched, token)
                }
            }

            // Update local tokens
            if len(fetched) > 0 {
                args := &structs.ACLTokenUpsertRequest{
                    Tokens: fetched,
                }
                _, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
                    goto ERR_WAIT
                }
            }

            // Update the minimum query index, blocks until there
            // is a change.
            req.MinQueryIndex = resp.Index
        }
    }

ERR_WAIT:
    select {
    case <-time.After(s.config.ReplicationBackoff):
        goto START
    case <-stopCh:
        return
    }
}

// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
    // Construct a set of the local and remote tokens
    local := make(map[string][]byte)
    remote := make(map[string]struct{})

    // Add all the local global tokens
    iter, err := state.ACLTokensByGlobal(nil, true)
    if err != nil {
        panic("failed to iterate local tokens")
    }
    for {
        raw := iter.Next()
        if raw == nil {
            break
        }
        token := raw.(*structs.ACLToken)
        local[token.AccessorID] = token.Hash
    }

    // Iterate over the remote tokens
    for _, rp := range remoteList {
        remote[rp.AccessorID] = struct{}{}

        // Check if the token is missing locally
        if localHash, ok := local[rp.AccessorID]; !ok {
            update = append(update, rp.AccessorID)

            // Check if the token is newer remotely and there is a hash mismatch.
        } else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
            update = append(update, rp.AccessorID)
        }
    }

    // Check if local token should be deleted
    for lp := range local {
        if _, ok := remote[lp]; !ok {
            delete = append(delete, lp)
        }
    }
    return
}

// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
    state := s.fsm.State()
    _, config, err := state.AutopilotConfig()
    if err != nil {
        s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
        return nil
    }
    if config != nil {
        return config
    }

    if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
        s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
        return nil
    }

    config = s.config.AutopilotConfig
    req := structs.AutopilotSetConfigRequest{Config: *config}
    if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
        s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
        return nil
    }

    return config
}