github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/leader.go

package nomad

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"sync"
	"time"

	"golang.org/x/time/rate"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute

	// replicationRateLimit is used to rate limit how often data is replicated
	// between the authoritative region and the local region.
	replicationRateLimit rate.Limit = 10.0

	// barrierWriteTimeout is used to give Raft a chance to process a
	// possible loss of leadership event if we are unable to get a barrier
	// while leader.
	barrierWriteTimeout = 2 * time.Minute
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var weAreLeaderCh chan struct{}
	var leaderLoop sync.WaitGroup
	for {
		select {
		case isLeader := <-s.leaderCh:
			switch {
			case isLeader:
				if weAreLeaderCh != nil {
					s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
					continue
				}

				weAreLeaderCh = make(chan struct{})
				leaderLoop.Add(1)
				go func(ch chan struct{}) {
					defer leaderLoop.Done()
					s.leaderLoop(ch)
				}(weAreLeaderCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")

			default:
				if weAreLeaderCh == nil {
					s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
					continue
				}

				s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
				close(weAreLeaderCh)
				leaderLoop.Wait()
				weAreLeaderCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}

		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities.
func (s *Server) leaderLoop(stopCh chan struct{}) {
	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(barrierWriteTimeout)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)
			goto WAIT
		}
		establishedLeader = true
		defer func() {
			if err := s.revokeLeadership(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
			}
		}()
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

	// Poll the stop channel to give it priority so we don't waste time
	// trying to perform the other operations if we have been asked to shut
	// down.
	select {
	case <-stopCh:
		return
	default:
	}

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Generate a leader ACL token. This will allow the leader to issue work
	// that requires a valid ACL token.
	s.setLeaderAcl(uuid.Generate())

	// Disable workers to free half the cores for use in the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)
	s.blockedEvals.SetTimetable(s.fsm.TimeTable())

	// Enable the deployment watcher, since we are now the leader
	if err := s.deploymentWatcher.SetEnabled(true, s.State()); err != nil {
		return err
	}

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Periodically publish job summary metrics
	go s.publishJobSummaryMetrics(stopCh)

	// Setup the heartbeat timers. This is done both when starting up or when
	// a leader fail over happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}

	// COMPAT 0.4 - 0.4.1
	// Reconcile the summaries of the registered jobs. Summaries are not
	// present in 0.4, so they might be incorrect after upgrading to 0.4.1;
	// we therefore reconcile them on 0.4.1 servers.
	if err := s.reconcileJobSummaries(); err != nil {
		return fmt.Errorf("unable to reconcile job summaries: %v", err)
	}

	// Start replication of ACLs and Policies if they are enabled,
	// and we are not the authoritative region.
	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
		go s.replicateACLPolicies(stopCh)
		go s.replicateACLTokens(stopCh)
	}

	// Setup any enterprise systems required.
	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
		return err
	}

	return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored anytime
// a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines whether any periodic jobs should
// have been launched during the leadership transition and force runs them.
// The periodic dispatcher is maintained only by the leader, so it must be
// restored anytime a leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// We skip adding parameterized jobs because they themselves aren't
		// tracked, only the dispatched children are.
		if job.IsParameterized() {
			continue
		}

		if err := s.periodicDispatcher.Add(job); err != nil {
			return err
		}

		// We do not need to force run the job since it isn't active.
		if !job.IsPeriodicActive() {
			continue
		}

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}
		if launch == nil {
			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
				job.ID, job.Namespace)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()
	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
	defer deploymentGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true if
	// the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {

		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-deploymentGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   "-",
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		LeaderACL:   s.getLeaderAcl(),
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			updateEval := eval.Copy()
			updateEval.Status = structs.EvalStatusFailed
			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

			// Create a follow-up evaluation that will be used to retry the
			// scheduling for the job after the cluster is hopefully more stable
			// due to the fairly large backoff.
			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{updateEval, followupEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations.
// The duplicates are cancelled via Raft.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.JobSummaries(ws)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
				continue
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				summary := raw.(*structs.JobSummary)
				for name, tgSummary := range summary.Summary {
					if !s.config.DisableTaggedMetrics {
						labels := []metrics.Label{
							{
								Name:  "job",
								Value: summary.JobID,
							},
							{
								Name:  "task_group",
								Value: name,
							},
						}
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
							float32(tgSummary.Queued), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
							float32(tgSummary.Complete), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
							float32(tgSummary.Failed), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
							float32(tgSummary.Running), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
							float32(tgSummary.Starting), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
							float32(tgSummary.Lost), labels)
					}
					if s.config.BackwardsCompatibleMetrics {
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
					}
				}
			}
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Clear the leader token since we are no longer the leader.
	s.setLeaderAcl("")

	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Disable the deployment watcher as it is only useful as a leader.
	if err := s.deploymentWatcher.SetEnabled(false, nil); err != nil {
		return err
	}

	// Disable any enterprise systems required.
	if err := s.revokeEnterpriseLeadership(); err != nil {
		return err
	}

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our workers if we paused them previously
	if len(s.workers) > 1 {
		for i := 0; i < len(s.workers)/2; i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
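//
// Concretely, reconciliation walks the current Serf member list: alive Nomad
// servers in our region are added as Raft peers and servers that have left or
// been reaped are removed, via reconcileMember below.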
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourself
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
	index, err := s.fsm.state.LatestIndex()
	if err != nil {
		return fmt.Errorf("unable to read latest index: %v", err)
	}
	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

	args := &structs.GenericResponse{}
	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
	if _, _, err = s.raftApply(msg, args); err != nil {
		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
	}

	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Do not join ourselves
	if m.Name == s.config.NodeName {
		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
		return nil
	}

	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			return nil
		}
	}

	// Attempt to add as a peer
	addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
	if err := addFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	}
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			goto REMOVE
		}
	}
	return nil

REMOVE:
	// Attempt to remove as a peer.
	future := s.raft.RemovePeer(raft.ServerAddress(addr))
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	}
	return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
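//
// The loop below lists the policies in the authoritative region (a blocking
// query on MinQueryIndex), diffs them against the local state store, deletes
// policies that no longer exist remotely, fetches and upserts any that are new
// or changed, and then blocks again on the returned index. Any error falls
// through to ERR_WAIT, which backs off by ReplicationBackoff before restarting.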
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
	req := structs.ACLPolicyListRequest{
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of policies
			var resp structs.ACLPolicyListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListPolicies", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

			// Delete policies that should not exist
			if len(delete) > 0 {
				args := &structs.ACLPolicyDeleteRequest{
					Names: delete,
				}
				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated policies
			var fetched []*structs.ACLPolicy
			if len(update) > 0 {
				req := structs.ACLPolicySetRequest{
					Names: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLPolicySetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetPolicies", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, policy := range reply.Policies {
					fetched = append(fetched, policy)
				}
			}

			// Update local policies
			if len(fetched) > 0 {
				args := &structs.ACLPolicyUpsertRequest{
					Policies: fetched,
				}
				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index, blocks until there
			// is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
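//
// Illustrative example (not from the upstream source): with local policies
// {"read": h1, "write": h2} and a remote list containing "read" (hash h1,
// unchanged) and "admin" (missing locally), the result is
// update = ["admin"] and delete = ["write"].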
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
	// Construct a set of the local and remote policies
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local policies
	iter, err := state.ACLPolicies(nil)
	if err != nil {
		panic("failed to iterate local policies")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		policy := raw.(*structs.ACLPolicy)
		local[policy.Name] = policy.Hash
	}

	// Iterate over the remote policies
	for _, rp := range remoteList {
		remote[rp.Name] = struct{}{}

		// Check if the policy is missing locally
		if localHash, ok := local[rp.Name]; !ok {
			update = append(update, rp.Name)

			// Check if policy is newer remotely and there is a hash mis-match.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.Name)
		}
	}

	// Check if policy should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
	req := structs.ACLTokenListRequest{
		GlobalOnly: true,
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of tokens
			var resp structs.ACLTokenListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListTokens", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

			// Delete tokens that should not exist
			if len(delete) > 0 {
				args := &structs.ACLTokenDeleteRequest{
					AccessorIDs: delete,
				}
				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated tokens.
			var fetched []*structs.ACLToken
			if len(update) > 0 {
				req := structs.ACLTokenSetRequest{
					AccessorIDS: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLTokenSetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetTokens", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, token := range reply.Tokens {
					fetched = append(fetched, token)
				}
			}

			// Update local tokens
			if len(fetched) > 0 {
				args := &structs.ACLTokenUpsertRequest{
					Tokens: fetched,
				}
				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index, blocks until there
			// is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
	// Construct a set of the local and remote tokens
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local global tokens
	iter, err := state.ACLTokensByGlobal(nil, true)
	if err != nil {
		panic("failed to iterate local tokens")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		token := raw.(*structs.ACLToken)
		local[token.AccessorID] = token.Hash
	}

	// Iterate over the remote tokens
	for _, rp := range remoteList {
		remote[rp.AccessorID] = struct{}{}

		// Check if the token is missing locally
		if localHash, ok := local[rp.AccessorID]; !ok {
			update = append(update, rp.AccessorID)

			// Check if the token is newer remotely and there is a hash mis-match.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.AccessorID)
		}
	}

	// Check if local token should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}
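
// The sketch below is illustrative only and is not part of the upstream file.
// It shows how a leader could enqueue a core GC evaluation by hand, mirroring
// what schedulePeriodic does on each GC ticker tick; the method name
// exampleEnqueueCoreGC is hypothetical and chosen here for illustration.
func (s *Server) exampleEnqueueCoreGC() {
	// Grab the latest index from the state store; schedulePeriodic does the
	// same via its getLatest helper and skips the enqueue on error.
	index, err := s.fsm.State().LatestIndex()
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
		return
	}

	// Build a core-job evaluation for eval GC and hand it to the broker,
	// which only processes work while this server is the leader.
	s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
}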