github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/nomad/leader.go

package nomad

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"sync"
	"time"

	"golang.org/x/time/rate"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute

	// replicationRateLimit is used to rate limit how often data is replicated
	// between the authoritative region and the local region
	replicationRateLimit rate.Limit = 10.0

	// barrierWriteTimeout is used to give Raft a chance to process a
	// possible loss of leadership event if we are unable to get a barrier
	// while leader.
	barrierWriteTimeout = 2 * time.Minute
)

var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var weAreLeaderCh chan struct{}
	var leaderLoop sync.WaitGroup
	for {
		select {
		case isLeader := <-s.leaderCh:
			switch {
			case isLeader:
				if weAreLeaderCh != nil {
					s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
					continue
				}

				weAreLeaderCh = make(chan struct{})
				leaderLoop.Add(1)
				go func(ch chan struct{}) {
					defer leaderLoop.Done()
					s.leaderLoop(ch)
				}(weAreLeaderCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")

			default:
				if weAreLeaderCh == nil {
					s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
					continue
				}

				s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
				close(weAreLeaderCh)
				leaderLoop.Wait()
				weAreLeaderCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}

		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(barrierWriteTimeout)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)
			goto WAIT
		}
		establishedLeader = true
		defer func() {
			if err := s.revokeLeadership(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v",
					err)
			}
		}()
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

	// Poll the stop channel to give it priority so we don't waste time
	// trying to perform the other operations if we have been asked to shut
	// down.
	select {
	case <-stopCh:
		return
	default:
	}

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Generate a leader ACL token. This will allow the leader to issue work
	// that requires a valid ACL token.
	s.setLeaderAcl(uuid.Generate())

	// Disable workers to free half the cores for use in the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Initialize and start the autopilot routine
	s.getOrCreateAutopilotConfig()
	s.autopilot.Start()

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)
	s.blockedEvals.SetTimetable(s.fsm.TimeTable())

	// Enable the deployment watcher, since we are now the leader
	if err := s.deploymentWatcher.SetEnabled(true, s.State()); err != nil {
		return err
	}

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Periodically publish job summary metrics
	go s.publishJobSummaryMetrics(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}

	// COMPAT 0.4 - 0.4.1
	// Reconcile the summaries of the registered jobs. We only need to
	// reconcile summaries on a 0.4.1 server: summaries are not present in
	// 0.4, so they might be incorrect after upgrading to 0.4.1.
	if err := s.reconcileJobSummaries(); err != nil {
		return fmt.Errorf("unable to reconcile job summaries: %v", err)
	}

	// Start replication of ACLs and Policies if they are enabled,
	// and we are not the authoritative region.
	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
		go s.replicateACLPolicies(stopCh)
		go s.replicateACLTokens(stopCh)
	}

	// Setup any enterprise systems required.
	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
		return err
	}

	return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored
// anytime a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
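// Accessors tied to terminal allocations or nodes are collected and then
// revoked in one batch through the Vault client.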
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if any periodic jobs should have
// been launched during the leadership transition and force runs them. The
// periodic dispatcher is maintained only by the leader, so it must be restored
// anytime a leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// We skip adding parameterized jobs because they themselves aren't
		// tracked, only the dispatched children are.
		if job.IsParameterized() {
			continue
		}

		if err := s.periodicDispatcher.Add(job); err != nil {
			return err
		}

		// We do not need to force run the job since it isn't active.
		if !job.IsPeriodicActive() {
			continue
		}

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}
		if launch == nil {
			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
				job.ID, job.Namespace)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()
	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
	defer deploymentGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true if
	// the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {
		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-deploymentGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   "-",
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		LeaderACL:   s.getLeaderAcl(),
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			updateEval := eval.Copy()
			updateEval.Status = structs.EvalStatusFailed
			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

			// Create a follow-up evaluation that will be used to retry the
			// scheduling for the job after the cluster is hopefully more stable
			// due to the fairly large backoff.
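			// The wait below is the configured baseline delay plus random jitter
			// drawn from the follow-up delay range, so retried evaluations do not
			// all re-enter the scheduler at the same time.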
			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{updateEval, followupEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap and cancel duplicate blocked
// evaluations.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.JobSummaries(ws)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
				continue
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				summary := raw.(*structs.JobSummary)
				for name, tgSummary := range summary.Summary {
					if !s.config.DisableTaggedMetrics {
						labels := []metrics.Label{
							{
								Name:  "job",
								Value: summary.JobID,
							},
							{
								Name:  "task_group",
								Value: name,
							},
						}
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
							float32(tgSummary.Queued), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
							float32(tgSummary.Complete), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
							float32(tgSummary.Failed), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
							float32(tgSummary.Running), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
							float32(tgSummary.Starting), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
							float32(tgSummary.Lost), labels)
					}
					if s.config.BackwardsCompatibleMetrics {
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
					}
				}
			}
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Clear the leader token since we are no longer the leader.
	s.setLeaderAcl("")

	// Disable autopilot
	s.autopilot.Stop()

	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Disable the deployment watcher as it is only useful as a leader.
	if err := s.deploymentWatcher.SetEnabled(false, nil); err != nil {
		return err
	}

	// Disable any enterprise systems required.
	if err := s.revokeEnterpriseLeadership(); err != nil {
		return err
	}

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our workers if we paused them previously
	if len(s.workers) > 1 {
		for i := 0; i < len(s.workers)/2; i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
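// Reconciliation adds or removes Raft peers so that the peer set matches the
// current Serf membership for this region.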
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourself
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
	index, err := s.fsm.state.LatestIndex()
	if err != nil {
		return fmt.Errorf("unable to read latest index: %v", err)
	}
	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

	args := &structs.GenericResponse{}
	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
	if _, _, err = s.raftApply(msg, args); err != nil {
		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
	}

	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Do not join ourselves
	if m.Name == s.config.NodeName {
		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
		return nil
	}

	// Check for possibility of multiple bootstrap nodes
	members := s.serf.Members()
	if parts.Bootstrap {
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			return nil
		}
	}

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries. If the address is the same but the ID changed, remove the
	// old server before adding the new one.
	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		// No-op if the raft version is too low
		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
			return nil
		}

		// If the address or ID matches an existing server, see if we need to remove the old one first
		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
			// Exit with no-op if this is being called on an existing server
			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
				return nil
			}
			future := s.raft.RemoveServer(server.ID, 0, 0)
			if server.Address == raft.ServerAddress(addr) {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
				}
				s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address)
			} else {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
				}
				s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID)
			}
		}
	}

	// Attempt to add as a peer
	switch {
	case minRaftProtocol >= 3:
		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	default:
		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	}

	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}

	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}

	// Pick which remove API to use based on how the server was added.
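	// Servers tracked by ID (Raft protocol v2+) are removed with RemoveServer;
	// older servers are matched by address and removed with RemovePeer.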
	for _, server := range configFuture.Configuration().Servers {
		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
			s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID)
			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
					server.ID, err)
				return err
			}
			break
		} else if server.Address == raft.ServerAddress(addr) {
			// If not, use the old remove API
			s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address)
			future := s.raft.RemovePeer(raft.ServerAddress(addr))
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
					addr, err)
				return err
			}
			break
		}
	}

	return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
	req := structs.ACLPolicyListRequest{
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of policies
			var resp structs.ACLPolicyListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListPolicies", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

			// Delete policies that should not exist
			if len(delete) > 0 {
				args := &structs.ACLPolicyDeleteRequest{
					Names: delete,
				}
				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated policies
			var fetched []*structs.ACLPolicy
			if len(update) > 0 {
				req := structs.ACLPolicySetRequest{
					Names: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLPolicySetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetPolicies", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, policy := range reply.Policies {
					fetched = append(fetched, policy)
				}
			}

			// Update local policies
			if len(fetched) > 0 {
				args := &structs.ACLPolicyUpsertRequest{
					Policies: fetched,
				}
				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index, blocks until there
			// is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
	// Construct a set of the local and remote policies
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local policies
	iter, err := state.ACLPolicies(nil)
	if err != nil {
		panic("failed to iterate local policies")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		policy := raw.(*structs.ACLPolicy)
		local[policy.Name] = policy.Hash
	}

	// Iterate over the remote policies
	for _, rp := range remoteList {
		remote[rp.Name] = struct{}{}

		// Check if the policy is missing locally
		if localHash, ok := local[rp.Name]; !ok {
			update = append(update, rp.Name)

			// Check if policy is newer remotely and there is a hash mis-match.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.Name)
		}
	}

	// Check if policy should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
	req := structs.ACLTokenListRequest{
		GlobalOnly: true,
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of tokens
			var resp structs.ACLTokenListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListTokens", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

			// Delete tokens that should not exist
			if len(delete) > 0 {
				args := &structs.ACLTokenDeleteRequest{
					AccessorIDs: delete,
				}
				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated tokens.
			var fetched []*structs.ACLToken
			if len(update) > 0 {
				req := structs.ACLTokenSetRequest{
					AccessorIDS: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLTokenSetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetTokens", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, token := range reply.Tokens {
					fetched = append(fetched, token)
				}
			}

			// Update local tokens
			if len(fetched) > 0 {
				args := &structs.ACLTokenUpsertRequest{
					Tokens: fetched,
				}
				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index, blocks until there
			// is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
	// Construct a set of the local and remote tokens
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local global tokens
	iter, err := state.ACLTokensByGlobal(nil, true)
	if err != nil {
		panic("failed to iterate local tokens")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		token := raw.(*structs.ACLToken)
		local[token.AccessorID] = token.Hash
	}

	// Iterate over the remote tokens
	for _, rp := range remoteList {
		remote[rp.AccessorID] = struct{}{}

		// Check if the token is missing locally
		if localHash, ok := local[rp.AccessorID]; !ok {
			update = append(update, rp.AccessorID)

			// Check if the token is newer remotely and there is a hash mis-match.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.AccessorID)
		}
	}

	// Check if local token should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
	state := s.fsm.State()
	_, config, err := state.AutopilotConfig()
	if err != nil {
		s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
		return nil
	}
	if config != nil {
		return config
	}

	if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
		s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
		return nil
	}

	config = s.config.AutopilotConfig
	req := structs.AutopilotSetConfigRequest{Config: *config}
	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
		s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
		return nil
	}

	return config
}