github.com/hernad/nomad@v1.6.112/nomad/leader.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package nomad 5 6 import ( 7 "bytes" 8 "context" 9 "fmt" 10 "math/rand" 11 "net" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/armon/go-metrics" 17 "github.com/hashicorp/go-hclog" 18 "github.com/hashicorp/go-memdb" 19 "github.com/hashicorp/go-version" 20 "github.com/hernad/nomad/helper" 21 "github.com/hernad/nomad/helper/uuid" 22 "github.com/hernad/nomad/nomad/state" 23 "github.com/hernad/nomad/nomad/structs" 24 "github.com/hashicorp/raft" 25 "github.com/hashicorp/serf/serf" 26 "golang.org/x/time/rate" 27 ) 28 29 const ( 30 // failedEvalUnblockInterval is the interval at which failed evaluations are 31 // unblocked to re-enter the scheduler. A failed evaluation occurs under 32 // high contention when the schedulers plan does not make progress. 33 failedEvalUnblockInterval = 1 * time.Minute 34 35 // replicationRateLimit is used to rate limit how often data is replicated 36 // between the authoritative region and the local region 37 replicationRateLimit rate.Limit = 10.0 38 39 // barrierWriteTimeout is used to give Raft a chance to process a 40 // possible loss of leadership event if we are unable to get a barrier 41 // while leader. 42 barrierWriteTimeout = 2 * time.Minute 43 ) 44 45 var minAutopilotVersion = version.Must(version.NewVersion("0.8.0")) 46 47 var minSchedulerConfigVersion = version.Must(version.NewVersion("0.9.0")) 48 49 var minClusterIDVersion = version.Must(version.NewVersion("0.10.4")) 50 51 var minOneTimeAuthenticationTokenVersion = version.Must(version.NewVersion("1.1.0")) 52 53 // minACLRoleVersion is the Nomad version at which the ACL role table was 54 // introduced. It forms the minimum version all federated servers must meet 55 // before the feature can be used. 56 var minACLRoleVersion = version.Must(version.NewVersion("1.4.0")) 57 58 // minACLAuthMethodVersion is the Nomad version at which the ACL auth methods 59 // table was introduced. It forms the minimum version all federated servers must 60 // meet before the feature can be used. 61 var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0")) 62 63 // minACLJWTAuthMethodVersion is the Nomad version at which the ACL JWT auth method type 64 // was introduced. It forms the minimum version all federated servers must 65 // meet before the feature can be used. 66 var minACLJWTAuthMethodVersion = version.Must(version.NewVersion("1.5.4")) 67 68 // minACLBindingRuleVersion is the Nomad version at which the ACL binding rules 69 // table was introduced. It forms the minimum version all federated servers 70 // must meet before the feature can be used. 71 var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0")) 72 73 // minNomadServiceRegistrationVersion is the Nomad version at which the service 74 // registrations table was introduced. It forms the minimum version all local 75 // servers must meet before the feature can be used. 76 var minNomadServiceRegistrationVersion = version.Must(version.NewVersion("1.3.0")) 77 78 // Any writes to node pools requires that all servers are on version 1.6.0 to 79 // prevent older versions of the server from crashing. 80 var minNodePoolsVersion = version.Must(version.NewVersion("1.6.0")) 81 82 // monitorLeadership is used to monitor if we acquire or lose our role 83 // as the leader in the Raft cluster. 
There is some work the leader is 84 // expected to do, so we must react to changes 85 func (s *Server) monitorLeadership() { 86 var weAreLeaderCh chan struct{} 87 var leaderLoop sync.WaitGroup 88 89 leaderCh := s.raft.LeaderCh() 90 91 leaderStep := func(isLeader bool) { 92 if isLeader { 93 if weAreLeaderCh != nil { 94 s.logger.Error("attempted to start the leader loop while running") 95 return 96 } 97 98 weAreLeaderCh = make(chan struct{}) 99 leaderLoop.Add(1) 100 go func(ch chan struct{}) { 101 defer leaderLoop.Done() 102 s.leaderLoop(ch) 103 }(weAreLeaderCh) 104 s.logger.Info("cluster leadership acquired") 105 return 106 } 107 108 if weAreLeaderCh == nil { 109 s.logger.Error("attempted to stop the leader loop while not running") 110 return 111 } 112 113 s.logger.Debug("shutting down leader loop") 114 close(weAreLeaderCh) 115 leaderLoop.Wait() 116 weAreLeaderCh = nil 117 s.logger.Info("cluster leadership lost") 118 } 119 120 wasLeader := false 121 for { 122 select { 123 case isLeader := <-leaderCh: 124 if wasLeader != isLeader { 125 wasLeader = isLeader 126 // normal case where we went through a transition 127 leaderStep(isLeader) 128 } else if wasLeader && isLeader { 129 // Server lost but then gained leadership immediately. 130 // During this time, this server may have received 131 // Raft transitions that haven't been applied to the FSM 132 // yet. 133 // Ensure that that FSM caught up and eval queues are refreshed 134 s.logger.Warn("cluster leadership lost and gained leadership immediately. Could indicate network issues, memory paging, or high CPU load.") 135 136 leaderStep(false) 137 leaderStep(true) 138 } else { 139 // Server gained but lost leadership immediately 140 // before it reacted; nothing to do, move on 141 s.logger.Warn("cluster leadership gained and lost leadership immediately. Could indicate network issues, memory paging, or high CPU load.") 142 } 143 case <-s.shutdownCh: 144 if weAreLeaderCh != nil { 145 leaderStep(false) 146 } 147 return 148 } 149 } 150 } 151 152 func (s *Server) leadershipTransfer() error { 153 retryCount := 3 154 for i := 0; i < retryCount; i++ { 155 err := s.raft.LeadershipTransfer().Error() 156 if err == nil { 157 s.logger.Info("successfully transferred leadership") 158 return nil 159 } 160 161 // Don't retry if the Raft version doesn't support leadership transfer 162 // since this will never succeed. 
163 if err == raft.ErrUnsupportedProtocol { 164 return fmt.Errorf("leadership transfer not supported with Raft version lower than 3") 165 } 166 167 s.logger.Error("failed to transfer leadership attempt, will retry", 168 "attempt", i, 169 "retry_limit", retryCount, 170 "error", err, 171 ) 172 } 173 return fmt.Errorf("failed to transfer leadership in %d attempts", retryCount) 174 } 175 176 // leaderLoop runs as long as we are the leader to run various 177 // maintenance activities 178 func (s *Server) leaderLoop(stopCh chan struct{}) { 179 var reconcileCh chan serf.Member 180 establishedLeader := false 181 182 RECONCILE: 183 // Setup a reconciliation timer 184 reconcileCh = nil 185 interval := time.After(s.config.ReconcileInterval) 186 187 // Apply a raft barrier to ensure our FSM is caught up 188 start := time.Now() 189 barrier := s.raft.Barrier(barrierWriteTimeout) 190 if err := barrier.Error(); err != nil { 191 s.logger.Error("failed to wait for barrier", "error", err) 192 goto WAIT 193 } 194 metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start) 195 196 // Check if we need to handle initial leadership actions 197 if !establishedLeader { 198 if err := s.establishLeadership(stopCh); err != nil { 199 s.logger.Error("failed to establish leadership", "error", err) 200 201 // Immediately revoke leadership since we didn't successfully 202 // establish leadership. 203 if err := s.revokeLeadership(); err != nil { 204 s.logger.Error("failed to revoke leadership", "error", err) 205 } 206 207 // Attempt to transfer leadership. If successful, leave the 208 // leaderLoop since this node is no longer the leader. Otherwise 209 // try to establish leadership again after 5 seconds. 210 if err := s.leadershipTransfer(); err != nil { 211 s.logger.Error("failed to transfer leadership", "error", err) 212 interval = time.After(5 * time.Second) 213 goto WAIT 214 } 215 return 216 } 217 218 establishedLeader = true 219 defer func() { 220 if err := s.revokeLeadership(); err != nil { 221 s.logger.Error("failed to revoke leadership", "error", err) 222 } 223 }() 224 } 225 226 // Reconcile any missing data 227 if err := s.reconcile(); err != nil { 228 s.logger.Error("failed to reconcile", "error", err) 229 goto WAIT 230 } 231 232 // Initial reconcile worked, now we can process the channel 233 // updates 234 reconcileCh = s.reconcileCh 235 236 // Poll the stop channel to give it priority so we don't waste time 237 // trying to perform the other operations if we have been asked to shut 238 // down. 239 select { 240 case <-stopCh: 241 return 242 default: 243 } 244 245 WAIT: 246 // Wait until leadership is lost or periodically reconcile as long as we 247 // are the leader, or when Serf events arrive. 248 for { 249 select { 250 case <-stopCh: 251 // Lost leadership. 252 return 253 case <-s.shutdownCh: 254 return 255 case <-interval: 256 goto RECONCILE 257 case member := <-reconcileCh: 258 s.reconcileMember(member) 259 case errCh := <-s.reassertLeaderCh: 260 // Recompute leader state, by asserting leadership and 261 // repopulating leader states. 262 263 // Check first if we are indeed the leaders first. We 264 // can get into this state when the initial 265 // establishLeadership has failed. 266 // Afterwards we will be waiting for the interval to 267 // trigger a reconciliation and can potentially end up 268 // here. There is no point to reassert because this 269 // agent was never leader in the first place. 
270 if !establishedLeader { 271 errCh <- fmt.Errorf("leadership has not been established") 272 continue 273 } 274 275 // refresh leadership state 276 s.revokeLeadership() 277 err := s.establishLeadership(stopCh) 278 errCh <- err 279 280 // In case establishLeadership fails, try to transfer leadership. 281 // At this point Raft thinks we are the leader, but Nomad did not 282 // complete the required steps to act as the leader. 283 if err != nil { 284 if err := s.leadershipTransfer(); err != nil { 285 // establishedLeader was true before, but it no longer is 286 // since we revoked leadership and leadershipTransfer also 287 // failed. 288 // Stay in the leaderLoop with establishedLeader set to 289 // false so we try to establish leadership again in the 290 // next loop. 291 establishedLeader = false 292 interval = time.After(5 * time.Second) 293 goto WAIT 294 } 295 296 // leadershipTransfer was successful and it is 297 // time to leave the leaderLoop. 298 return 299 } 300 } 301 } 302 } 303 304 // establishLeadership is invoked once we become leader and are able 305 // to invoke an initial barrier. The barrier is used to ensure any 306 // previously inflight transactions have been committed and that our 307 // state is up-to-date. 308 func (s *Server) establishLeadership(stopCh chan struct{}) error { 309 defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now()) 310 311 // Generate a leader ACL token. This will allow the leader to issue work 312 // that requires a valid ACL token. 313 s.setLeaderAcl(uuid.Generate()) 314 315 // Disable workers to free half the cores for use in the plan queue and 316 // evaluation broker 317 s.handlePausableWorkers(true) 318 319 // Initialize and start the autopilot routine 320 s.getOrCreateAutopilotConfig() 321 s.autopilot.Start(s.shutdownCtx) 322 323 // Initialize scheduler configuration. 324 schedulerConfig := s.getOrCreateSchedulerConfig() 325 326 // Initialize the ClusterID 327 _, _ = s.ClusterID() 328 // todo: use cluster ID for stuff, later! 329 330 // Enable the plan queue, since we are now the leader 331 s.planQueue.SetEnabled(true) 332 333 // Start the plan evaluator 334 go s.planApply() 335 336 // Start the eval broker and blocked eval broker if these are not paused by 337 // the operator. 338 restoreEvals := s.handleEvalBrokerStateChange(schedulerConfig) 339 340 // Enable the deployment watcher, since we are now the leader 341 s.deploymentWatcher.SetEnabled(true, s.State()) 342 343 // Enable the NodeDrainer 344 s.nodeDrainer.SetEnabled(true, s.State()) 345 346 // Enable the volume watcher, since we are now the leader 347 s.volumeWatcher.SetEnabled(true, s.State(), s.getLeaderAcl()) 348 349 // Restore the eval broker state and blocked eval state. If these are 350 // currently paused, we do not need to do this. 351 if restoreEvals { 352 if err := s.restoreEvals(); err != nil { 353 return err 354 } 355 } 356 357 // Activate the vault client 358 s.vault.SetActive(true) 359 360 // Enable the periodic dispatcher, since we are now the leader. 361 s.periodicDispatcher.SetEnabled(true) 362 363 // Activate RPC now that local FSM caught up with Raft (as evident by Barrier call success) 364 // and all leader related components (e.g. broker queue) are enabled. 365 // Auxiliary processes (e.g. 
background, bookkeeping, and cleanup tasks can start after) 366 s.setConsistentReadReady() 367 368 // Further clean ups and follow up that don't block RPC consistency 369 370 // Create the first root key if it doesn't already exist 371 go s.initializeKeyring(stopCh) 372 373 // Restore the periodic dispatcher state 374 if err := s.restorePeriodicDispatcher(); err != nil { 375 return err 376 } 377 378 // Schedule periodic jobs which include expired local ACL token garbage 379 // collection. 380 go s.schedulePeriodic(stopCh) 381 382 // Reap any failed evaluations 383 go s.reapFailedEvaluations(stopCh) 384 385 // Reap any duplicate blocked evaluations 386 go s.reapDupBlockedEvaluations(stopCh) 387 388 // Reap any cancelable evaluations 389 s.reapCancelableEvalsCh = s.reapCancelableEvaluations(stopCh) 390 391 // Periodically unblock failed allocations 392 go s.periodicUnblockFailedEvals(stopCh) 393 394 // Periodically publish job summary metrics 395 go s.publishJobSummaryMetrics(stopCh) 396 397 // Periodically publish job status metrics 398 go s.publishJobStatusMetrics(stopCh) 399 400 // Setup the heartbeat timers. This is done both when starting up or when 401 // a leader fail over happens. Since the timers are maintained by the leader 402 // node, effectively this means all the timers are renewed at the time of failover. 403 // The TTL contract is that the session will not be expired before the TTL, 404 // so expiring it later is allowable. 405 // 406 // This MUST be done after the initial barrier to ensure the latest Nodes 407 // are available to be initialized. Otherwise initialization may use stale 408 // data. 409 if err := s.initializeHeartbeatTimers(); err != nil { 410 s.logger.Error("heartbeat timer setup failed", "error", err) 411 return err 412 } 413 414 // If ACLs are enabled, the leader needs to start a number of long-lived 415 // routines. Exactly which routines, depends on whether this leader is 416 // running within the authoritative region or not. 417 if s.config.ACLEnabled { 418 419 // The authoritative region is responsible for garbage collecting 420 // expired global tokens. Otherwise, non-authoritative regions need to 421 // replicate policies, tokens, and namespaces. 422 switch s.config.AuthoritativeRegion { 423 case s.config.Region: 424 go s.schedulePeriodicAuthoritative(stopCh) 425 default: 426 go s.replicateACLPolicies(stopCh) 427 go s.replicateACLTokens(stopCh) 428 go s.replicateACLRoles(stopCh) 429 go s.replicateACLAuthMethods(stopCh) 430 go s.replicateACLBindingRules(stopCh) 431 go s.replicateNamespaces(stopCh) 432 go s.replicateNodePools(stopCh) 433 } 434 } 435 436 // Setup any enterprise systems required. 437 if err := s.establishEnterpriseLeadership(stopCh); err != nil { 438 return err 439 } 440 441 // Cleanup orphaned Vault token accessors 442 if err := s.revokeVaultAccessorsOnRestore(); err != nil { 443 return err 444 } 445 446 // Cleanup orphaned Service Identity token accessors 447 if err := s.revokeSITokenAccessorsOnRestore(); err != nil { 448 return err 449 } 450 451 return nil 452 } 453 454 // replicateNamespaces is used to replicate namespaces from the authoritative 455 // region to this region. 
456 func (s *Server) replicateNamespaces(stopCh chan struct{}) { 457 req := structs.NamespaceListRequest{ 458 QueryOptions: structs.QueryOptions{ 459 Region: s.config.AuthoritativeRegion, 460 AllowStale: true, 461 }, 462 } 463 limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) 464 s.logger.Debug("starting namespace replication from authoritative region", "region", req.Region) 465 466 START: 467 for { 468 select { 469 case <-stopCh: 470 return 471 default: 472 } 473 474 // Rate limit how often we attempt replication 475 limiter.Wait(context.Background()) 476 477 // Fetch the list of namespaces 478 var resp structs.NamespaceListResponse 479 req.AuthToken = s.ReplicationToken() 480 err := s.forwardRegion(s.config.AuthoritativeRegion, "Namespace.ListNamespaces", &req, &resp) 481 if err != nil { 482 s.logger.Error("failed to fetch namespaces from authoritative region", "error", err) 483 goto ERR_WAIT 484 } 485 486 // Perform a two-way diff 487 delete, update := diffNamespaces(s.State(), req.MinQueryIndex, resp.Namespaces) 488 489 // Delete namespaces that should not exist 490 if len(delete) > 0 { 491 args := &structs.NamespaceDeleteRequest{ 492 Namespaces: delete, 493 } 494 _, _, err := s.raftApply(structs.NamespaceDeleteRequestType, args) 495 if err != nil { 496 s.logger.Error("failed to delete namespaces", "error", err) 497 goto ERR_WAIT 498 } 499 } 500 501 // Fetch any outdated namespaces 502 var fetched []*structs.Namespace 503 if len(update) > 0 { 504 req := structs.NamespaceSetRequest{ 505 Namespaces: update, 506 QueryOptions: structs.QueryOptions{ 507 Region: s.config.AuthoritativeRegion, 508 AuthToken: s.ReplicationToken(), 509 AllowStale: true, 510 MinQueryIndex: resp.Index - 1, 511 }, 512 } 513 var reply structs.NamespaceSetResponse 514 if err := s.forwardRegion(s.config.AuthoritativeRegion, "Namespace.GetNamespaces", &req, &reply); err != nil { 515 s.logger.Error("failed to fetch namespaces from authoritative region", "error", err) 516 goto ERR_WAIT 517 } 518 for _, namespace := range reply.Namespaces { 519 fetched = append(fetched, namespace) 520 } 521 } 522 523 // Update local namespaces 524 if len(fetched) > 0 { 525 args := &structs.NamespaceUpsertRequest{ 526 Namespaces: fetched, 527 } 528 _, _, err := s.raftApply(structs.NamespaceUpsertRequestType, args) 529 if err != nil { 530 s.logger.Error("failed to update namespaces", "error", err) 531 goto ERR_WAIT 532 } 533 } 534 535 // Update the minimum query index, blocks until there is a change. 536 req.MinQueryIndex = resp.Index 537 } 538 539 ERR_WAIT: 540 select { 541 case <-time.After(s.config.ReplicationBackoff): 542 goto START 543 case <-stopCh: 544 return 545 } 546 } 547 548 func (s *Server) handlePausableWorkers(isLeader bool) { 549 for _, w := range s.pausableWorkers() { 550 if isLeader { 551 w.Pause() 552 } else { 553 w.Resume() 554 } 555 } 556 } 557 558 // diffNamespaces is used to perform a two-way diff between the local namespaces 559 // and the remote namespaces to determine which namespaces need to be deleted or 560 // updated. 
561 func diffNamespaces(state *state.StateStore, minIndex uint64, remoteList []*structs.Namespace) (delete []string, update []string) { 562 // Construct a set of the local and remote namespaces 563 local := make(map[string][]byte) 564 remote := make(map[string]struct{}) 565 566 // Add all the local namespaces 567 iter, err := state.Namespaces(nil) 568 if err != nil { 569 panic("failed to iterate local namespaces") 570 } 571 for { 572 raw := iter.Next() 573 if raw == nil { 574 break 575 } 576 namespace := raw.(*structs.Namespace) 577 local[namespace.Name] = namespace.Hash 578 } 579 580 // Iterate over the remote namespaces 581 for _, rns := range remoteList { 582 remote[rns.Name] = struct{}{} 583 584 // Check if the namespace is missing locally 585 if localHash, ok := local[rns.Name]; !ok { 586 update = append(update, rns.Name) 587 588 // Check if the namespace is newer remotely and there is a hash 589 // mis-match. 590 } else if rns.ModifyIndex > minIndex && !bytes.Equal(localHash, rns.Hash) { 591 update = append(update, rns.Name) 592 } 593 } 594 595 // Check if namespaces should be deleted 596 for lns := range local { 597 if _, ok := remote[lns]; !ok { 598 delete = append(delete, lns) 599 } 600 } 601 return 602 } 603 604 // replicateNodePools is used to replicate node pools from the authoritative 605 // region to this region. 606 func (s *Server) replicateNodePools(stopCh chan struct{}) { 607 req := structs.NodePoolListRequest{ 608 QueryOptions: structs.QueryOptions{ 609 Region: s.config.AuthoritativeRegion, 610 AllowStale: true, 611 }, 612 } 613 limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) 614 s.logger.Debug("starting node pool replication from authoritative region", "region", req.Region) 615 616 for { 617 select { 618 case <-stopCh: 619 return 620 default: 621 } 622 623 // Rate limit how often we attempt replication 624 limiter.Wait(context.Background()) 625 626 if !ServersMeetMinimumVersion( 627 s.serf.Members(), s.Region(), minNodePoolsVersion, true) { 628 s.logger.Trace( 629 "all servers must be upgraded to 1.6.0 before Node Pools can be replicated") 630 if s.replicationBackoffContinue(stopCh) { 631 continue 632 } else { 633 return 634 } 635 } 636 637 var resp structs.NodePoolListResponse 638 req.AuthToken = s.ReplicationToken() 639 err := s.forwardRegion(s.config.AuthoritativeRegion, "NodePool.List", &req, &resp) 640 if err != nil { 641 s.logger.Error("failed to fetch node pools from authoritative region", "error", err) 642 if s.replicationBackoffContinue(stopCh) { 643 continue 644 } else { 645 return 646 } 647 } 648 649 // Perform a two-way diff 650 delete, update := diffNodePools(s.State(), req.MinQueryIndex, resp.NodePools) 651 652 // A significant amount of time could pass between the last check 653 // on whether we should stop the replication process. Therefore, do 654 // a check here, before calling Raft. 
655 select { 656 case <-stopCh: 657 return 658 default: 659 } 660 661 // Delete node pools that should not exist 662 if len(delete) > 0 { 663 args := &structs.NodePoolDeleteRequest{ 664 Names: delete, 665 } 666 _, _, err := s.raftApply(structs.NodePoolDeleteRequestType, args) 667 if err != nil { 668 s.logger.Error("failed to delete node pools", "error", err) 669 if s.replicationBackoffContinue(stopCh) { 670 continue 671 } else { 672 return 673 } 674 } 675 } 676 677 // Update local node pools 678 if len(update) > 0 { 679 args := &structs.NodePoolUpsertRequest{ 680 NodePools: update, 681 } 682 _, _, err := s.raftApply(structs.NodePoolUpsertRequestType, args) 683 if err != nil { 684 s.logger.Error("failed to update node pools", "error", err) 685 if s.replicationBackoffContinue(stopCh) { 686 continue 687 } else { 688 return 689 } 690 } 691 } 692 693 // Update the minimum query index, blocks until there is a change. 694 req.MinQueryIndex = resp.Index 695 } 696 } 697 698 // diffNodePools is used to perform a two-way diff between the local node pools 699 // and the remote node pools to determine which node pools need to be deleted or 700 // updated. 701 func diffNodePools(store *state.StateStore, minIndex uint64, remoteList []*structs.NodePool) (delete []string, update []*structs.NodePool) { 702 // Construct a set of the local and remote node pools 703 local := make(map[string][]byte) 704 remote := make(map[string]struct{}) 705 706 // Add all the local node pools 707 iter, err := store.NodePools(nil, state.SortDefault) 708 if err != nil { 709 panic("failed to iterate local node pools") 710 } 711 for { 712 raw := iter.Next() 713 if raw == nil { 714 break 715 } 716 pool := raw.(*structs.NodePool) 717 local[pool.Name] = pool.Hash 718 } 719 720 for _, rnp := range remoteList { 721 remote[rnp.Name] = struct{}{} 722 723 if localHash, ok := local[rnp.Name]; !ok { 724 // Node pools that are missing locally should be added 725 update = append(update, rnp) 726 727 } else if rnp.ModifyIndex > minIndex && !bytes.Equal(localHash, rnp.Hash) { 728 // Node pools that have been added/updated more recently than the 729 // last index we saw, and have a hash mismatch with what we have 730 // locally, should be updated. 731 update = append(update, rnp) 732 } 733 } 734 735 // Node pools that don't exist on the remote should be deleted 736 for lnp := range local { 737 if _, ok := remote[lnp]; !ok { 738 delete = append(delete, lnp) 739 } 740 } 741 return 742 } 743 744 // restoreEvals is used to restore pending evaluations into the eval broker and 745 // blocked evaluations into the blocked eval tracker. The broker and blocked 746 // eval tracker is maintained only by the leader, so it must be restored anytime 747 // a leadership transition takes place. 748 func (s *Server) restoreEvals() error { 749 // Get an iterator over every evaluation 750 ws := memdb.NewWatchSet() 751 iter, err := s.fsm.State().Evals(ws, false) 752 if err != nil { 753 return fmt.Errorf("failed to get evaluations: %v", err) 754 } 755 756 for { 757 raw := iter.Next() 758 if raw == nil { 759 break 760 } 761 eval := raw.(*structs.Evaluation) 762 763 if eval.ShouldEnqueue() { 764 s.evalBroker.Enqueue(eval) 765 } else if eval.ShouldBlock() { 766 s.blockedEvals.Block(eval) 767 } 768 } 769 return nil 770 } 771 772 // revokeVaultAccessorsOnRestore is used to restore Vault accessors that should be 773 // revoked. 
774 func (s *Server) revokeVaultAccessorsOnRestore() error { 775 // An accessor should be revoked if its allocation or node is terminal 776 ws := memdb.NewWatchSet() 777 state := s.fsm.State() 778 iter, err := state.VaultAccessors(ws) 779 if err != nil { 780 return fmt.Errorf("failed to get vault accessors: %v", err) 781 } 782 783 var revoke []*structs.VaultAccessor 784 for { 785 raw := iter.Next() 786 if raw == nil { 787 break 788 } 789 790 va := raw.(*structs.VaultAccessor) 791 792 // Check the allocation 793 alloc, err := state.AllocByID(ws, va.AllocID) 794 if err != nil { 795 return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err) 796 } 797 if alloc == nil || alloc.Terminated() { 798 // No longer running and should be revoked 799 revoke = append(revoke, va) 800 continue 801 } 802 803 // Check the node 804 node, err := state.NodeByID(ws, va.NodeID) 805 if err != nil { 806 return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err) 807 } 808 if node == nil || node.TerminalStatus() { 809 // Node is terminal so any accessor from it should be revoked 810 revoke = append(revoke, va) 811 continue 812 } 813 } 814 815 if len(revoke) != 0 { 816 s.logger.Info("revoking vault accessors after becoming leader", "accessors", len(revoke)) 817 818 if err := s.vault.MarkForRevocation(revoke); err != nil { 819 return fmt.Errorf("failed to revoke tokens: %v", err) 820 } 821 } 822 823 return nil 824 } 825 826 // revokeSITokenAccessorsOnRestore is used to revoke Service Identity token 827 // accessors on behalf of allocs that are now gone / terminal. 828 func (s *Server) revokeSITokenAccessorsOnRestore() error { 829 ws := memdb.NewWatchSet() 830 fsmState := s.fsm.State() 831 iter, err := fsmState.SITokenAccessors(ws) 832 if err != nil { 833 return fmt.Errorf("failed to get SI token accessors: %w", err) 834 } 835 836 var toRevoke []*structs.SITokenAccessor 837 for raw := iter.Next(); raw != nil; raw = iter.Next() { 838 accessor := raw.(*structs.SITokenAccessor) 839 840 // Check the allocation 841 alloc, err := fsmState.AllocByID(ws, accessor.AllocID) 842 if err != nil { 843 return fmt.Errorf("failed to lookup alloc %q: %w", accessor.AllocID, err) 844 } 845 if alloc == nil || alloc.Terminated() { 846 // no longer running and associated accessors should be revoked 847 toRevoke = append(toRevoke, accessor) 848 continue 849 } 850 851 // Check the node 852 node, err := fsmState.NodeByID(ws, accessor.NodeID) 853 if err != nil { 854 return fmt.Errorf("failed to lookup node %q: %w", accessor.NodeID, err) 855 } 856 if node == nil || node.TerminalStatus() { 857 // node is terminal and associated accessors should be revoked 858 toRevoke = append(toRevoke, accessor) 859 continue 860 } 861 } 862 863 if len(toRevoke) > 0 { 864 s.logger.Info("revoking consul accessors after becoming leader", "accessors", len(toRevoke)) 865 s.consulACLs.MarkForRevocation(toRevoke) 866 } 867 868 return nil 869 } 870 871 // restorePeriodicDispatcher is used to restore all periodic jobs into the 872 // periodic dispatcher. It also determines if a periodic job should have been 873 // created during the leadership transition and force runs them. The periodic 874 // dispatcher is maintained only by the leader, so it must be restored anytime a 875 // leadership transition takes place. 
876 func (s *Server) restorePeriodicDispatcher() error { 877 logger := s.logger.Named("periodic") 878 ws := memdb.NewWatchSet() 879 iter, err := s.fsm.State().JobsByPeriodic(ws, true) 880 if err != nil { 881 return fmt.Errorf("failed to get periodic jobs: %v", err) 882 } 883 884 now := time.Now() 885 for i := iter.Next(); i != nil; i = iter.Next() { 886 job := i.(*structs.Job) 887 888 // We skip adding parameterized jobs because they themselves aren't 889 // tracked, only the dispatched children are. 890 if job.IsParameterized() { 891 continue 892 } 893 894 if err := s.periodicDispatcher.Add(job); err != nil { 895 logger.Error("failed to add job to periodic dispatcher", "error", err) 896 continue 897 } 898 899 // We do not need to force run the job since it isn't active. 900 if !job.IsPeriodicActive() { 901 continue 902 } 903 904 // If the periodic job has never been launched before, launch will hold 905 // the time the periodic job was added. Otherwise it has the last launch 906 // time of the periodic job. 907 launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID) 908 if err != nil { 909 return fmt.Errorf("failed to get periodic launch time: %v", err) 910 } 911 if launch == nil { 912 return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q", 913 job.ID, job.Namespace) 914 } 915 916 // nextLaunch is the next launch that should occur. 917 nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation())) 918 if err != nil { 919 logger.Error("failed to determine next periodic launch for job", "job", job.NamespacedID(), "error", err) 920 continue 921 } 922 923 // We skip force launching the job if there should be no next launch 924 // (the zero case) or if the next launch time is in the future. If it is 925 // in the future, it will be handled by the periodic dispatcher. 
926 if nextLaunch.IsZero() || !nextLaunch.Before(now) { 927 continue 928 } 929 930 // We skip if the job doesn't allow overlap and there are already 931 // instances running 932 allowed, err := s.cronJobOverlapAllowed(job) 933 if err != nil { 934 return fmt.Errorf("failed to get job status: %v", err) 935 } 936 if !allowed { 937 continue 938 } 939 940 if _, err := s.periodicDispatcher.ForceEval(job.Namespace, job.ID); err != nil { 941 logger.Error("force run of periodic job failed", "job", job.NamespacedID(), "error", err) 942 return fmt.Errorf("force run of periodic job %q failed: %v", job.NamespacedID(), err) 943 } 944 945 logger.Debug("periodic job force run during leadership establishment", "job", job.NamespacedID()) 946 } 947 948 return nil 949 } 950 951 // cronJobOverlapAllowed checks if the job allows for overlap and if there are already 952 // instances of the job running in order to determine if a new evaluation needs to 953 // be created upon periodic dispatcher restore 954 func (s *Server) cronJobOverlapAllowed(job *structs.Job) (bool, error) { 955 if job.Periodic.ProhibitOverlap { 956 running, err := s.periodicDispatcher.dispatcher.RunningChildren(job) 957 if err != nil { 958 return false, fmt.Errorf("failed to determine if periodic job has running children %q error %q", job.NamespacedID(), err) 959 } 960 961 if running { 962 return false, nil 963 } 964 } 965 966 return true, nil 967 } 968 969 // schedulePeriodic is used to do periodic job dispatch while we are leader 970 func (s *Server) schedulePeriodic(stopCh chan struct{}) { 971 evalGC := time.NewTicker(s.config.EvalGCInterval) 972 defer evalGC.Stop() 973 nodeGC := time.NewTicker(s.config.NodeGCInterval) 974 defer nodeGC.Stop() 975 jobGC := time.NewTicker(s.config.JobGCInterval) 976 defer jobGC.Stop() 977 deploymentGC := time.NewTicker(s.config.DeploymentGCInterval) 978 defer deploymentGC.Stop() 979 csiPluginGC := time.NewTicker(s.config.CSIPluginGCInterval) 980 defer csiPluginGC.Stop() 981 csiVolumeClaimGC := time.NewTicker(s.config.CSIVolumeClaimGCInterval) 982 defer csiVolumeClaimGC.Stop() 983 oneTimeTokenGC := time.NewTicker(s.config.OneTimeTokenGCInterval) 984 defer oneTimeTokenGC.Stop() 985 rootKeyGC := time.NewTicker(s.config.RootKeyGCInterval) 986 defer rootKeyGC.Stop() 987 variablesRekey := time.NewTicker(s.config.VariablesRekeyInterval) 988 defer variablesRekey.Stop() 989 990 // Set up the expired ACL local token garbage collection timer. 
991 localTokenExpiredGC, localTokenExpiredGCStop := helper.NewSafeTimer(s.config.ACLTokenExpirationGCInterval) 992 defer localTokenExpiredGCStop() 993 994 for { 995 996 select { 997 case <-evalGC.C: 998 if index, ok := s.getLatestIndex(); ok { 999 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index)) 1000 } 1001 case <-nodeGC.C: 1002 if index, ok := s.getLatestIndex(); ok { 1003 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index)) 1004 } 1005 case <-jobGC.C: 1006 if index, ok := s.getLatestIndex(); ok { 1007 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index)) 1008 } 1009 case <-deploymentGC.C: 1010 if index, ok := s.getLatestIndex(); ok { 1011 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index)) 1012 } 1013 case <-csiPluginGC.C: 1014 if index, ok := s.getLatestIndex(); ok { 1015 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobCSIPluginGC, index)) 1016 } 1017 case <-csiVolumeClaimGC.C: 1018 if index, ok := s.getLatestIndex(); ok { 1019 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobCSIVolumeClaimGC, index)) 1020 } 1021 case <-oneTimeTokenGC.C: 1022 if !ServersMeetMinimumVersion(s.Members(), s.Region(), minOneTimeAuthenticationTokenVersion, false) { 1023 continue 1024 } 1025 1026 if index, ok := s.getLatestIndex(); ok { 1027 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobOneTimeTokenGC, index)) 1028 } 1029 case <-localTokenExpiredGC.C: 1030 if index, ok := s.getLatestIndex(); ok { 1031 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobLocalTokenExpiredGC, index)) 1032 } 1033 localTokenExpiredGC.Reset(s.config.ACLTokenExpirationGCInterval) 1034 case <-rootKeyGC.C: 1035 if index, ok := s.getLatestIndex(); ok { 1036 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index)) 1037 } 1038 case <-variablesRekey.C: 1039 if index, ok := s.getLatestIndex(); ok { 1040 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobVariablesRekey, index)) 1041 } 1042 case <-stopCh: 1043 return 1044 } 1045 } 1046 } 1047 1048 // schedulePeriodicAuthoritative is a long-lived routine intended for use on 1049 // the leader within the authoritative region only. It periodically queues work 1050 // onto the _core scheduler for ACL based activities such as removing expired 1051 // global ACL tokens. 1052 func (s *Server) schedulePeriodicAuthoritative(stopCh chan struct{}) { 1053 1054 // Set up the expired ACL global token garbage collection timer. 1055 globalTokenExpiredGC, globalTokenExpiredGCStop := helper.NewSafeTimer(s.config.ACLTokenExpirationGCInterval) 1056 defer globalTokenExpiredGCStop() 1057 1058 for { 1059 select { 1060 case <-globalTokenExpiredGC.C: 1061 if index, ok := s.getLatestIndex(); ok { 1062 s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobGlobalTokenExpiredGC, index)) 1063 } 1064 globalTokenExpiredGC.Reset(s.config.ACLTokenExpirationGCInterval) 1065 case <-stopCh: 1066 return 1067 } 1068 } 1069 } 1070 1071 // getLatestIndex is a helper function which returns the latest index from the 1072 // state store. The boolean return indicates whether the call has been 1073 // successful or not. 
1074 func (s *Server) getLatestIndex() (uint64, bool) { 1075 snapshotIndex, err := s.fsm.State().LatestIndex() 1076 if err != nil { 1077 s.logger.Error("failed to determine state store's index", "error", err) 1078 return 0, false 1079 } 1080 return snapshotIndex, true 1081 } 1082 1083 // coreJobEval returns an evaluation for a core job 1084 func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation { 1085 return &structs.Evaluation{ 1086 ID: uuid.Generate(), 1087 Namespace: "-", 1088 Priority: structs.CoreJobPriority, 1089 Type: structs.JobTypeCore, 1090 TriggeredBy: structs.EvalTriggerScheduled, 1091 JobID: job, 1092 LeaderACL: s.getLeaderAcl(), 1093 Status: structs.EvalStatusPending, 1094 ModifyIndex: modifyIndex, 1095 } 1096 } 1097 1098 // reapFailedEvaluations is used to reap evaluations that 1099 // have reached their delivery limit and should be failed 1100 func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { 1101 for { 1102 select { 1103 case <-stopCh: 1104 return 1105 default: 1106 // Scan for a failed evaluation 1107 eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second) 1108 if err != nil { 1109 return 1110 } 1111 if eval == nil { 1112 continue 1113 } 1114 1115 // Update the status to failed 1116 updateEval := eval.Copy() 1117 updateEval.Status = structs.EvalStatusFailed 1118 updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit) 1119 s.logger.Warn("eval reached delivery limit, marking as failed", 1120 "eval", hclog.Fmt("%#v", updateEval)) 1121 1122 // Core job evals that fail or span leader elections will never 1123 // succeed because the follow-up doesn't have the leader ACL. We 1124 // rely on the leader to schedule new core jobs periodically 1125 // instead. 1126 if eval.Type != structs.JobTypeCore { 1127 1128 // Create a follow-up evaluation that will be used to retry the 1129 // scheduling for the job after the cluster is hopefully more stable 1130 // due to the fairly large backoff. 1131 followupEvalWait := s.config.EvalFailedFollowupBaselineDelay + 1132 time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange))) 1133 1134 followupEval := eval.CreateFailedFollowUpEval(followupEvalWait) 1135 updateEval.NextEval = followupEval.ID 1136 updateEval.UpdateModifyTime() 1137 1138 // Update via Raft 1139 req := structs.EvalUpdateRequest{ 1140 Evals: []*structs.Evaluation{updateEval, followupEval}, 1141 } 1142 if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { 1143 s.logger.Error("failed to update failed eval and create a follow-up", 1144 "eval", hclog.Fmt("%#v", updateEval), "error", err) 1145 continue 1146 } 1147 } 1148 // Ack completion 1149 s.evalBroker.Ack(eval.ID, token) 1150 } 1151 } 1152 } 1153 1154 // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations and 1155 // should be cancelled. 1156 func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) { 1157 for { 1158 select { 1159 case <-stopCh: 1160 return 1161 default: 1162 // Scan for duplicate blocked evals. 
1163 dups := s.blockedEvals.GetDuplicates(time.Second) 1164 if dups == nil { 1165 continue 1166 } 1167 1168 cancel := make([]*structs.Evaluation, len(dups)) 1169 for i, dup := range dups { 1170 // Update the status to cancelled 1171 newEval := dup.Copy() 1172 newEval.Status = structs.EvalStatusCancelled 1173 newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID) 1174 newEval.UpdateModifyTime() 1175 cancel[i] = newEval 1176 } 1177 1178 // Update via Raft 1179 req := structs.EvalUpdateRequest{ 1180 Evals: cancel, 1181 } 1182 if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { 1183 s.logger.Error("failed to update duplicate evals", "evals", hclog.Fmt("%#v", cancel), "error", err) 1184 continue 1185 } 1186 } 1187 } 1188 } 1189 1190 // reapCancelableEvaluations is used to reap evaluations that were marked 1191 // cancelable by the eval broker and should be canceled. These get swept up 1192 // whenever an eval Acks, but this ensures that we don't have a straggling batch 1193 // when the cluster doesn't have any more work to do. Returns a wake-up channel 1194 // that can be used to trigger a new reap without waiting for the timer 1195 func (s *Server) reapCancelableEvaluations(stopCh chan struct{}) chan struct{} { 1196 1197 wakeCh := make(chan struct{}, 1) 1198 go func() { 1199 1200 timer, cancel := helper.NewSafeTimer(s.config.EvalReapCancelableInterval) 1201 defer cancel() 1202 for { 1203 select { 1204 case <-stopCh: 1205 return 1206 case <-wakeCh: 1207 cancelCancelableEvals(s) 1208 case <-timer.C: 1209 cancelCancelableEvals(s) 1210 timer.Reset(s.config.EvalReapCancelableInterval) 1211 } 1212 } 1213 }() 1214 1215 return wakeCh 1216 } 1217 1218 const cancelableEvalsBatchSize = 728 // structs.MaxUUIDsPerWriteRequest / 10 1219 1220 // cancelCancelableEvals pulls a batch of cancelable evaluations from the eval 1221 // broker and updates their status to canceled. 1222 func cancelCancelableEvals(srv *Server) error { 1223 1224 const cancelDesc = "canceled after more recent eval was processed" 1225 1226 // We *can* send larger raft logs but rough benchmarks show that a smaller 1227 // page size strikes a balance between throughput and time we block the FSM 1228 // apply for other operations 1229 cancelable := srv.evalBroker.Cancelable(cancelableEvalsBatchSize) 1230 if len(cancelable) > 0 { 1231 for i, eval := range cancelable { 1232 eval = eval.Copy() 1233 eval.Status = structs.EvalStatusCancelled 1234 eval.StatusDescription = cancelDesc 1235 eval.UpdateModifyTime() 1236 cancelable[i] = eval 1237 } 1238 1239 update := &structs.EvalUpdateRequest{ 1240 Evals: cancelable, 1241 WriteRequest: structs.WriteRequest{Region: srv.Region()}, 1242 } 1243 _, _, err := srv.raftApply(structs.EvalUpdateRequestType, update) 1244 if err != nil { 1245 srv.logger.Warn("eval cancel failed", "error", err, "method", "ack") 1246 return err 1247 } 1248 } 1249 return nil 1250 } 1251 1252 // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations. 
1253 func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) { 1254 ticker := time.NewTicker(failedEvalUnblockInterval) 1255 defer ticker.Stop() 1256 for { 1257 select { 1258 case <-stopCh: 1259 return 1260 case <-ticker.C: 1261 // Unblock the failed allocations 1262 s.blockedEvals.UnblockFailed() 1263 } 1264 } 1265 } 1266 1267 // publishJobSummaryMetrics publishes the job summaries as metrics 1268 func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) { 1269 timer := time.NewTimer(0) 1270 defer timer.Stop() 1271 1272 for { 1273 select { 1274 case <-stopCh: 1275 return 1276 case <-timer.C: 1277 timer.Reset(s.config.StatsCollectionInterval) 1278 state, err := s.State().Snapshot() 1279 if err != nil { 1280 s.logger.Error("failed to get state", "error", err) 1281 continue 1282 } 1283 ws := memdb.NewWatchSet() 1284 iter, err := state.JobSummaries(ws) 1285 if err != nil { 1286 s.logger.Error("failed to get job summaries", "error", err) 1287 continue 1288 } 1289 1290 for { 1291 raw := iter.Next() 1292 if raw == nil { 1293 break 1294 } 1295 summary := raw.(*structs.JobSummary) 1296 if s.config.DisableDispatchedJobSummaryMetrics { 1297 job, err := state.JobByID(ws, summary.Namespace, summary.JobID) 1298 if err != nil { 1299 s.logger.Error("error getting job for summary", "error", err) 1300 continue 1301 } 1302 if job.Dispatched { 1303 continue 1304 } 1305 } 1306 s.iterateJobSummaryMetrics(summary) 1307 } 1308 } 1309 } 1310 } 1311 1312 func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) { 1313 for name, tgSummary := range summary.Summary { 1314 labels := []metrics.Label{ 1315 { 1316 Name: "job", 1317 Value: summary.JobID, 1318 }, 1319 { 1320 Name: "task_group", 1321 Value: name, 1322 }, 1323 { 1324 Name: "namespace", 1325 Value: summary.Namespace, 1326 }, 1327 } 1328 1329 if strings.Contains(summary.JobID, "/dispatch-") { 1330 jobInfo := strings.Split(summary.JobID, "/dispatch-") 1331 labels = append(labels, metrics.Label{ 1332 Name: "parent_id", 1333 Value: jobInfo[0], 1334 }, metrics.Label{ 1335 Name: "dispatch_id", 1336 Value: jobInfo[1], 1337 }) 1338 } 1339 1340 if strings.Contains(summary.JobID, "/periodic-") { 1341 jobInfo := strings.Split(summary.JobID, "/periodic-") 1342 labels = append(labels, metrics.Label{ 1343 Name: "parent_id", 1344 Value: jobInfo[0], 1345 }, metrics.Label{ 1346 Name: "periodic_id", 1347 Value: jobInfo[1], 1348 }) 1349 } 1350 1351 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"}, 1352 float32(tgSummary.Queued), labels) 1353 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"}, 1354 float32(tgSummary.Complete), labels) 1355 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"}, 1356 float32(tgSummary.Failed), labels) 1357 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"}, 1358 float32(tgSummary.Running), labels) 1359 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"}, 1360 float32(tgSummary.Starting), labels) 1361 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"}, 1362 float32(tgSummary.Lost), labels) 1363 metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "unknown"}, 1364 float32(tgSummary.Unknown), labels) 1365 } 1366 } 1367 1368 // publishJobStatusMetrics publishes the job statuses as metrics 1369 func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) { 1370 timer := time.NewTimer(0) 1371 defer timer.Stop() 1372 1373 for { 1374 select { 1375 case <-stopCh: 1376 return 1377 case <-timer.C: 1378 
timer.Reset(s.config.StatsCollectionInterval) 1379 state, err := s.State().Snapshot() 1380 if err != nil { 1381 s.logger.Error("failed to get state", "error", err) 1382 continue 1383 } 1384 ws := memdb.NewWatchSet() 1385 iter, err := state.Jobs(ws) 1386 if err != nil { 1387 s.logger.Error("failed to get job statuses", "error", err) 1388 continue 1389 } 1390 1391 s.iterateJobStatusMetrics(&iter) 1392 } 1393 } 1394 } 1395 1396 func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) { 1397 var pending int64 // Sum of all jobs in 'pending' state 1398 var running int64 // Sum of all jobs in 'running' state 1399 var dead int64 // Sum of all jobs in 'dead' state 1400 1401 for { 1402 raw := (*jobs).Next() 1403 if raw == nil { 1404 break 1405 } 1406 1407 job := raw.(*structs.Job) 1408 1409 switch job.Status { 1410 case structs.JobStatusPending: 1411 pending++ 1412 case structs.JobStatusRunning: 1413 running++ 1414 case structs.JobStatusDead: 1415 dead++ 1416 } 1417 } 1418 1419 metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending)) 1420 metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running)) 1421 metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead)) 1422 } 1423 1424 // revokeLeadership is invoked once we step down as leader. 1425 // This is used to cleanup any state that may be specific to a leader. 1426 func (s *Server) revokeLeadership() error { 1427 defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now()) 1428 1429 s.resetConsistentReadReady() 1430 1431 // Clear the leader token since we are no longer the leader. 1432 s.setLeaderAcl("") 1433 1434 // Disable autopilot 1435 s.autopilot.Stop() 1436 1437 // Disable the plan queue, since we are no longer leader 1438 s.planQueue.SetEnabled(false) 1439 1440 // Disable the eval broker and blocked evals. We do not need to check the 1441 // scheduler configuration paused eval broker value, as the brokers should 1442 // always be paused on the non-leader. 1443 s.brokerLock.Lock() 1444 s.evalBroker.SetEnabled(false) 1445 s.blockedEvals.SetEnabled(false) 1446 s.brokerLock.Unlock() 1447 1448 // Disable the periodic dispatcher, since it is only useful as a leader 1449 s.periodicDispatcher.SetEnabled(false) 1450 1451 // Disable the Vault client as it is only useful as a leader. 1452 s.vault.SetActive(false) 1453 1454 // Disable the deployment watcher as it is only useful as a leader. 1455 s.deploymentWatcher.SetEnabled(false, nil) 1456 1457 // Disable the node drainer 1458 s.nodeDrainer.SetEnabled(false, nil) 1459 1460 // Disable the volume watcher 1461 s.volumeWatcher.SetEnabled(false, nil, "") 1462 1463 // Disable any enterprise systems required. 1464 if err := s.revokeEnterpriseLeadership(); err != nil { 1465 return err 1466 } 1467 1468 // Clear the heartbeat timers on either shutdown or step down, 1469 // since we are no longer responsible for TTL expirations. 1470 if err := s.clearAllHeartbeatTimers(); err != nil { 1471 s.logger.Error("clearing heartbeat timers failed", "error", err) 1472 return err 1473 } 1474 1475 // Unpause our worker if we paused previously 1476 s.handlePausableWorkers(false) 1477 1478 return nil 1479 } 1480 1481 // pausableWorkers returns a slice of the workers 1482 // to pause on leader transitions. 
1483 // 1484 // Upon leadership establishment, pause workers to free half 1485 // the cores for use in the plan queue and evaluation broker 1486 func (s *Server) pausableWorkers() []*Worker { 1487 n := len(s.workers) 1488 if n <= 1 { 1489 return []*Worker{} 1490 } 1491 1492 // Disabling 3/4 of the workers frees CPU for raft and the 1493 // plan applier which uses 1/2 the cores. 1494 return s.workers[:3*n/4] 1495 } 1496 1497 // reconcile is used to reconcile the differences between Serf 1498 // membership and what is reflected in our strongly consistent store. 1499 func (s *Server) reconcile() error { 1500 defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now()) 1501 members := s.serf.Members() 1502 for _, member := range members { 1503 if err := s.reconcileMember(member); err != nil { 1504 return err 1505 } 1506 } 1507 return nil 1508 } 1509 1510 // reconcileMember is used to do an async reconcile of a single serf member 1511 func (s *Server) reconcileMember(member serf.Member) error { 1512 // Check if this is a member we should handle 1513 valid, parts := isNomadServer(member) 1514 if !valid || parts.Region != s.config.Region { 1515 return nil 1516 } 1517 defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now()) 1518 1519 var err error 1520 switch member.Status { 1521 case serf.StatusAlive: 1522 err = s.addRaftPeer(member, parts) 1523 case serf.StatusLeft, StatusReap: 1524 err = s.removeRaftPeer(member, parts) 1525 } 1526 if err != nil { 1527 s.logger.Error("failed to reconcile member", "member", member, "error", err) 1528 return err 1529 } 1530 return nil 1531 } 1532 1533 // addRaftPeer is used to add a new Raft peer when a Nomad server joins 1534 func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { 1535 // Check for possibility of multiple bootstrap nodes 1536 members := s.serf.Members() 1537 if parts.Bootstrap { 1538 for _, member := range members { 1539 valid, p := isNomadServer(member) 1540 if valid && member.Name != m.Name && p.Bootstrap { 1541 s.logger.Error("skipping adding Raft peer because an existing peer is in bootstrap mode and only one server should be in bootstrap mode", 1542 "existing_peer", member.Name, "joining_peer", m.Name) 1543 return nil 1544 } 1545 } 1546 } 1547 1548 // Processing ourselves could result in trying to remove ourselves to 1549 // fix up our address, which would make us step down. This is only 1550 // safe to attempt if there are multiple servers available. 1551 addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String() 1552 configFuture := s.raft.GetConfiguration() 1553 if err := configFuture.Error(); err != nil { 1554 s.logger.Error("failed to get raft configuration", "error", err) 1555 return err 1556 } 1557 1558 if m.Name == s.config.NodeName { 1559 if l := len(configFuture.Configuration().Servers); l < 3 { 1560 s.logger.Debug("skipping self join check for peer since the cluster is too small", "peer", m.Name) 1561 return nil 1562 } 1563 } 1564 1565 // See if it's already in the configuration. It's harmless to re-add it 1566 // but we want to avoid doing that if possible to prevent useless Raft 1567 // log entries. If the address is the same but the ID changed, remove the 1568 // old server before adding the new one. 
1569 minRaftProtocol, err := s.MinRaftProtocol() 1570 if err != nil { 1571 return err 1572 } 1573 for _, server := range configFuture.Configuration().Servers { 1574 // No-op if the raft version is too low 1575 if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) { 1576 return nil 1577 } 1578 1579 // If the address or ID matches an existing server, see if we need to remove the old one first 1580 if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) { 1581 // Exit with no-op if this is being called on an existing server and both the ID and address match 1582 if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) { 1583 return nil 1584 } 1585 future := s.raft.RemoveServer(server.ID, 0, 0) 1586 if server.Address == raft.ServerAddress(addr) { 1587 if err := future.Error(); err != nil { 1588 return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err) 1589 } 1590 s.logger.Info("removed server with duplicate address", "address", server.Address) 1591 } else { 1592 if err := future.Error(); err != nil { 1593 return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err) 1594 } 1595 s.logger.Info("removed server with duplicate ID", "id", server.ID) 1596 } 1597 } 1598 } 1599 1600 // Attempt to add as a peer 1601 switch { 1602 case minRaftProtocol >= 3: 1603 addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0) 1604 if err := addFuture.Error(); err != nil { 1605 s.logger.Error("failed to add raft peer", "error", err) 1606 return err 1607 } 1608 case minRaftProtocol == 2 && parts.RaftVersion >= 3: 1609 addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0) 1610 if err := addFuture.Error(); err != nil { 1611 s.logger.Error("failed to add raft peer", "error", err) 1612 return err 1613 } 1614 default: 1615 addFuture := s.raft.AddPeer(raft.ServerAddress(addr)) 1616 if err := addFuture.Error(); err != nil { 1617 s.logger.Error("failed to add raft peer", "error", err) 1618 return err 1619 } 1620 } 1621 1622 return nil 1623 } 1624 1625 // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves 1626 // or is reaped 1627 func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error { 1628 addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String() 1629 1630 // See if it's already in the configuration. It's harmless to re-remove it 1631 // but we want to avoid doing that if possible to prevent useless Raft 1632 // log entries. 1633 configFuture := s.raft.GetConfiguration() 1634 if err := configFuture.Error(); err != nil { 1635 s.logger.Error("failed to get raft configuration", "error", err) 1636 return err 1637 } 1638 1639 minRaftProtocol, err := s.MinRaftProtocol() 1640 if err != nil { 1641 return err 1642 } 1643 1644 // Pick which remove API to use based on how the server was added. 1645 for _, server := range configFuture.Configuration().Servers { 1646 // Check if this is the server to remove based on how it was registered. 1647 // Raft v2 servers are registered by address. 1648 // Raft v3 servers are registered by ID. 1649 if server.ID == raft.ServerID(parts.ID) || server.Address == raft.ServerAddress(addr) { 1650 // Use the new add/remove APIs if we understand them. 
1651 if minRaftProtocol >= 2 { 1652 s.logger.Info("removing server by ID", "id", server.ID) 1653 future := s.raft.RemoveServer(server.ID, 0, 0) 1654 if err := future.Error(); err != nil { 1655 s.logger.Error("failed to remove raft peer", "id", server.ID, "error", err) 1656 return err 1657 } 1658 } else { 1659 // If not, use the old remove API 1660 s.logger.Info("removing server by address", "address", server.Address) 1661 future := s.raft.RemovePeer(raft.ServerAddress(addr)) 1662 if err := future.Error(); err != nil { 1663 s.logger.Error("failed to remove raft peer", "address", addr, "error", err) 1664 return err 1665 } 1666 } 1667 break 1668 } 1669 } 1670 1671 return nil 1672 } 1673 1674 // replicateACLPolicies is used to replicate ACL policies from 1675 // the authoritative region to this region. 1676 func (s *Server) replicateACLPolicies(stopCh chan struct{}) { 1677 req := structs.ACLPolicyListRequest{ 1678 QueryOptions: structs.QueryOptions{ 1679 Region: s.config.AuthoritativeRegion, 1680 AllowStale: true, 1681 }, 1682 } 1683 limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) 1684 s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region) 1685 1686 START: 1687 for { 1688 select { 1689 case <-stopCh: 1690 return 1691 default: 1692 // Rate limit how often we attempt replication 1693 limiter.Wait(context.Background()) 1694 1695 // Fetch the list of policies 1696 var resp structs.ACLPolicyListResponse 1697 req.AuthToken = s.ReplicationToken() 1698 err := s.forwardRegion(s.config.AuthoritativeRegion, 1699 "ACL.ListPolicies", &req, &resp) 1700 if err != nil { 1701 s.logger.Error("failed to fetch policies from authoritative region", "error", err) 1702 goto ERR_WAIT 1703 } 1704 1705 // Perform a two-way diff 1706 delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies) 1707 1708 // Delete policies that should not exist 1709 if len(delete) > 0 { 1710 args := &structs.ACLPolicyDeleteRequest{ 1711 Names: delete, 1712 } 1713 _, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args) 1714 if err != nil { 1715 s.logger.Error("failed to delete policies", "error", err) 1716 goto ERR_WAIT 1717 } 1718 } 1719 1720 // Fetch any outdated policies 1721 var fetched []*structs.ACLPolicy 1722 if len(update) > 0 { 1723 req := structs.ACLPolicySetRequest{ 1724 Names: update, 1725 QueryOptions: structs.QueryOptions{ 1726 Region: s.config.AuthoritativeRegion, 1727 AuthToken: s.ReplicationToken(), 1728 AllowStale: true, 1729 MinQueryIndex: resp.Index - 1, 1730 }, 1731 } 1732 var reply structs.ACLPolicySetResponse 1733 if err := s.forwardRegion(s.config.AuthoritativeRegion, 1734 "ACL.GetPolicies", &req, &reply); err != nil { 1735 s.logger.Error("failed to fetch policies from authoritative region", "error", err) 1736 goto ERR_WAIT 1737 } 1738 for _, policy := range reply.Policies { 1739 fetched = append(fetched, policy) 1740 } 1741 } 1742 1743 // Update local policies 1744 if len(fetched) > 0 { 1745 args := &structs.ACLPolicyUpsertRequest{ 1746 Policies: fetched, 1747 } 1748 _, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args) 1749 if err != nil { 1750 s.logger.Error("failed to update policies", "error", err) 1751 goto ERR_WAIT 1752 } 1753 } 1754 1755 // Update the minimum query index, blocks until there 1756 // is a change. 
1757 req.MinQueryIndex = resp.Index 1758 } 1759 } 1760 1761 ERR_WAIT: 1762 select { 1763 case <-time.After(s.config.ReplicationBackoff): 1764 goto START 1765 case <-stopCh: 1766 return 1767 } 1768 } 1769 1770 // diffACLPolicies is used to perform a two-way diff between the local 1771 // policies and the remote policies to determine which policies need to 1772 // be deleted or updated. 1773 func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) { 1774 // Construct a set of the local and remote policies 1775 local := make(map[string][]byte) 1776 remote := make(map[string]struct{}) 1777 1778 // Add all the local policies 1779 iter, err := state.ACLPolicies(nil) 1780 if err != nil { 1781 panic("failed to iterate local policies") 1782 } 1783 for { 1784 raw := iter.Next() 1785 if raw == nil { 1786 break 1787 } 1788 policy := raw.(*structs.ACLPolicy) 1789 local[policy.Name] = policy.Hash 1790 } 1791 1792 // Iterate over the remote policies 1793 for _, rp := range remoteList { 1794 remote[rp.Name] = struct{}{} 1795 1796 // Check if the policy is missing locally 1797 if localHash, ok := local[rp.Name]; !ok { 1798 update = append(update, rp.Name) 1799 1800 // Check if policy is newer remotely and there is a hash mis-match. 1801 } else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) { 1802 update = append(update, rp.Name) 1803 } 1804 } 1805 1806 // Check if policy should be deleted 1807 for lp := range local { 1808 if _, ok := remote[lp]; !ok { 1809 delete = append(delete, lp) 1810 } 1811 } 1812 return 1813 } 1814 1815 // replicateACLTokens is used to replicate global ACL tokens from 1816 // the authoritative region to this region. 1817 func (s *Server) replicateACLTokens(stopCh chan struct{}) { 1818 req := structs.ACLTokenListRequest{ 1819 GlobalOnly: true, 1820 QueryOptions: structs.QueryOptions{ 1821 Region: s.config.AuthoritativeRegion, 1822 AllowStale: true, 1823 }, 1824 } 1825 limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) 1826 s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region) 1827 1828 START: 1829 for { 1830 select { 1831 case <-stopCh: 1832 return 1833 default: 1834 // Rate limit how often we attempt replication 1835 limiter.Wait(context.Background()) 1836 1837 // Fetch the list of tokens 1838 var resp structs.ACLTokenListResponse 1839 req.AuthToken = s.ReplicationToken() 1840 err := s.forwardRegion(s.config.AuthoritativeRegion, 1841 "ACL.ListTokens", &req, &resp) 1842 if err != nil { 1843 s.logger.Error("failed to fetch tokens from authoritative region", "error", err) 1844 goto ERR_WAIT 1845 } 1846 1847 // Perform a two-way diff 1848 delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens) 1849 1850 // Delete tokens that should not exist 1851 if len(delete) > 0 { 1852 args := &structs.ACLTokenDeleteRequest{ 1853 AccessorIDs: delete, 1854 } 1855 _, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args) 1856 if err != nil { 1857 s.logger.Error("failed to delete tokens", "error", err) 1858 goto ERR_WAIT 1859 } 1860 } 1861 1862 // Fetch any outdated policies. 
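			// Note: the fetch below sets MinQueryIndex to resp.Index-1 so that the
			// GetTokens call returns promptly with data at least as fresh as the
			// listing above, rather than blocking for a further change.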
1863 var fetched []*structs.ACLToken 1864 if len(update) > 0 { 1865 req := structs.ACLTokenSetRequest{ 1866 AccessorIDS: update, 1867 QueryOptions: structs.QueryOptions{ 1868 Region: s.config.AuthoritativeRegion, 1869 AuthToken: s.ReplicationToken(), 1870 AllowStale: true, 1871 MinQueryIndex: resp.Index - 1, 1872 }, 1873 } 1874 var reply structs.ACLTokenSetResponse 1875 if err := s.forwardRegion(s.config.AuthoritativeRegion, 1876 "ACL.GetTokens", &req, &reply); err != nil { 1877 s.logger.Error("failed to fetch tokens from authoritative region", "error", err) 1878 goto ERR_WAIT 1879 } 1880 for _, token := range reply.Tokens { 1881 fetched = append(fetched, token) 1882 } 1883 } 1884 1885 // Update local tokens 1886 if len(fetched) > 0 { 1887 args := &structs.ACLTokenUpsertRequest{ 1888 Tokens: fetched, 1889 } 1890 _, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args) 1891 if err != nil { 1892 s.logger.Error("failed to update tokens", "error", err) 1893 goto ERR_WAIT 1894 } 1895 } 1896 1897 // Update the minimum query index, blocks until there 1898 // is a change. 1899 req.MinQueryIndex = resp.Index 1900 } 1901 } 1902 1903 ERR_WAIT: 1904 select { 1905 case <-time.After(s.config.ReplicationBackoff): 1906 goto START 1907 case <-stopCh: 1908 return 1909 } 1910 } 1911 1912 // diffACLTokens is used to perform a two-way diff between the local 1913 // tokens and the remote tokens to determine which tokens need to 1914 // be deleted or updated. 1915 func diffACLTokens(store *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) { 1916 // Construct a set of the local and remote policies 1917 local := make(map[string][]byte) 1918 remote := make(map[string]struct{}) 1919 1920 // Add all the local global tokens 1921 iter, err := store.ACLTokensByGlobal(nil, true, state.SortDefault) 1922 if err != nil { 1923 panic("failed to iterate local tokens") 1924 } 1925 for { 1926 raw := iter.Next() 1927 if raw == nil { 1928 break 1929 } 1930 token := raw.(*structs.ACLToken) 1931 local[token.AccessorID] = token.Hash 1932 } 1933 1934 // Iterate over the remote tokens 1935 for _, rp := range remoteList { 1936 remote[rp.AccessorID] = struct{}{} 1937 1938 // Check if the token is missing locally 1939 if localHash, ok := local[rp.AccessorID]; !ok { 1940 update = append(update, rp.AccessorID) 1941 1942 // Check if policy is newer remotely and there is a hash mis-match. 1943 } else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) { 1944 update = append(update, rp.AccessorID) 1945 } 1946 } 1947 1948 // Check if local token should be deleted 1949 for lp := range local { 1950 if _, ok := remote[lp]; !ok { 1951 delete = append(delete, lp) 1952 } 1953 } 1954 return 1955 } 1956 1957 // replicateACLRoles is used to replicate ACL Roles from the authoritative 1958 // region to this region. The loop should only be run on the leader within the 1959 // federated region. 1960 func (s *Server) replicateACLRoles(stopCh chan struct{}) { 1961 1962 // Generate our request object. We only need to do this once and reuse it 1963 // for every RPC request. The MinQueryIndex is updated after every 1964 // successful replication loop, so the next query acts as a blocking query 1965 // and only returns upon a change in the authoritative region. 
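	//
	// Note that, unlike the ACL policy and token replication loops above, which
	// back off via a goto-based ERR_WAIT label, errors in this loop are handled
	// by replicationBackoffContinue, which also coordinates shutdown.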
1966 req := structs.ACLRolesListRequest{ 1967 QueryOptions: structs.QueryOptions{ 1968 AllowStale: true, 1969 Region: s.config.AuthoritativeRegion, 1970 }, 1971 } 1972 1973 // Create our replication rate limiter for ACL roles and log a lovely 1974 // message to indicate the process is starting. 1975 limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) 1976 s.logger.Debug("starting ACL Role replication from authoritative region", 1977 "authoritative_region", req.Region) 1978 1979 // Enter the main ACL Role replication loop that will only exit when the 1980 // stopCh is closed. 1981 // 1982 // Any error encountered will use the replicationBackoffContinue function 1983 // which handles replication backoff and shutdown coordination in the event 1984 // of an error inside the loop. 1985 for { 1986 select { 1987 case <-stopCh: 1988 return 1989 default: 1990 1991 // Rate limit how often we attempt replication. It is OK to ignore 1992 // the error as the context will never be cancelled and the limit 1993 // parameters are controlled internally. 1994 _ = limiter.Wait(context.Background()) 1995 1996 if !ServersMeetMinimumVersion( 1997 s.serf.Members(), s.Region(), minACLRoleVersion, true) { 1998 s.logger.Trace( 1999 "all servers must be upgraded to 1.4.0 or later before ACL Roles can be replicated") 2000 if s.replicationBackoffContinue(stopCh) { 2001 continue 2002 } else { 2003 return 2004 } 2005 } 2006 2007 // Set the replication token on each replication iteration so that 2008 // it is always current and can handle agent SIGHUP reloads. 2009 req.AuthToken = s.ReplicationToken() 2010 2011 var resp structs.ACLRolesListResponse 2012 2013 // Make the list RPC request to the authoritative region, so we 2014 // capture the latest ACL role listing. 2015 err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLListRolesRPCMethod, &req, &resp) 2016 if err != nil { 2017 s.logger.Error("failed to fetch ACL Roles from authoritative region", "error", err) 2018 if s.replicationBackoffContinue(stopCh) { 2019 continue 2020 } else { 2021 return 2022 } 2023 } 2024 2025 // Perform a two-way diff on the ACL roles. 2026 toDelete, toUpdate := diffACLRoles(s.State(), req.MinQueryIndex, resp.ACLRoles) 2027 2028 // A significant amount of time could pass between the last check 2029 // on whether we should stop the replication process. Therefore, do 2030 // a check here, before calling Raft. 2031 select { 2032 case <-stopCh: 2033 return 2034 default: 2035 } 2036 2037 // If we have ACL roles to delete, make this call directly to Raft. 2038 if len(toDelete) > 0 { 2039 args := structs.ACLRolesDeleteByIDRequest{ACLRoleIDs: toDelete} 2040 _, _, err := s.raftApply(structs.ACLRolesDeleteByIDRequestType, &args) 2041 2042 // If the error was because we lost leadership while calling 2043 // Raft, avoid logging as this can be confusing to operators. 2044 if err != nil { 2045 if err != raft.ErrLeadershipLost { 2046 s.logger.Error("failed to delete ACL roles", "error", err) 2047 } 2048 if s.replicationBackoffContinue(stopCh) { 2049 continue 2050 } else { 2051 return 2052 } 2053 } 2054 } 2055 2056 // Fetch any outdated policies. 
2057 			var fetched []*structs.ACLRole
2058 			if len(toUpdate) > 0 {
2059 				req := structs.ACLRolesByIDRequest{
2060 					ACLRoleIDs: toUpdate,
2061 					QueryOptions: structs.QueryOptions{
2062 						Region:        s.config.AuthoritativeRegion,
2063 						AuthToken:     s.ReplicationToken(),
2064 						AllowStale:    true,
2065 						MinQueryIndex: resp.Index - 1,
2066 					},
2067 				}
2068 				var reply structs.ACLRolesByIDResponse
2069 				if err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLGetRolesByIDRPCMethod, &req, &reply); err != nil {
2070 					s.logger.Error("failed to fetch ACL Roles from authoritative region", "error", err)
2071 					if s.replicationBackoffContinue(stopCh) {
2072 						continue
2073 					} else {
2074 						return
2075 					}
2076 				}
2077 				for _, aclRole := range reply.ACLRoles {
2078 					fetched = append(fetched, aclRole)
2079 				}
2080 			}
2081 
2082 			// Update local ACL roles.
2083 			if len(fetched) > 0 {
2084 
2085 				// ACL roles and ACL policies are replicated independently,
2086 				// therefore we cannot ensure the policies linked within the
2087 				// role are present. We must set allow missing to true.
2088 				args := structs.ACLRolesUpsertRequest{
2089 					ACLRoles:             fetched,
2090 					AllowMissingPolicies: true,
2091 				}
2092 
2093 				// Perform the upsert directly via Raft.
2094 				_, _, err := s.raftApply(structs.ACLRolesUpsertRequestType, &args)
2095 				if err != nil {
2096 					s.logger.Error("failed to update ACL roles", "error", err)
2097 					if s.replicationBackoffContinue(stopCh) {
2098 						continue
2099 					} else {
2100 						return
2101 					}
2102 				}
2103 			}
2104 
2105 			// Update the minimum query index; the next query blocks until there is a change.
2106 			req.MinQueryIndex = resp.Index
2107 		}
2108 	}
2109 }
2110 
2111 // diffACLRoles is used to perform a two-way diff between the local ACL Roles
2112 // and the remote Roles to determine which roles need to be deleted or
2113 // updated. The returned arrays contain ACL Role IDs.
2114 func diffACLRoles(
2115 	store *state.StateStore, minIndex uint64, remoteList []*structs.ACLRoleListStub) (
2116 	delete []string, update []string) {
2117 
2118 	// The local ACL role tracking is keyed by the role ID and the value is the
2119 	// hash of the role.
2120 	local := make(map[string][]byte)
2121 
2122 	// The remote ACL role tracking is keyed by the role ID; the value is an
2123 	// empty struct as we already have the full object.
2124 	remote := make(map[string]struct{})
2125 
2126 	// Read all the ACL roles currently held within our local state. This panic
2127 	// will only happen if a developer makes a mistake with naming the index
2128 	// to use.
2129 	iter, err := store.GetACLRoles(nil)
2130 	if err != nil {
2131 		panic(fmt.Sprintf("failed to iterate local ACL roles: %v", err))
2132 	}
2133 
2134 	// Iterate the local ACL roles and add them to our tracking of local roles.
2135 	for raw := iter.Next(); raw != nil; raw = iter.Next() {
2136 		aclRole := raw.(*structs.ACLRole)
2137 		local[aclRole.ID] = aclRole.Hash
2138 	}
2139 
2140 	// Iterate over the remote ACL roles.
2141 	for _, remoteACLRole := range remoteList {
2142 		remote[remoteACLRole.ID] = struct{}{}
2143 
2144 		// Identify whether the ACL role is within the local state. If it is
2145 		// not, add this to our update list.
2146 		if localHash, ok := local[remoteACLRole.ID]; !ok {
2147 			update = append(update, remoteACLRole.ID)
2148 
2149 			// Check if ACL role is newer remotely and there is a hash
2150 			// mismatch.
2151 } else if remoteACLRole.ModifyIndex > minIndex && !bytes.Equal(localHash, remoteACLRole.Hash) { 2152 update = append(update, remoteACLRole.ID) 2153 } 2154 } 2155 2156 // If we have ACL roles within state which are no longer present in the 2157 // authoritative region we should delete them. 2158 for localACLRole := range local { 2159 if _, ok := remote[localACLRole]; !ok { 2160 delete = append(delete, localACLRole) 2161 } 2162 } 2163 return 2164 } 2165 2166 // replicateACLAuthMethods is used to replicate ACL Authentication Methods from 2167 // the authoritative region to this region. The loop should only be run on the 2168 // leader within the federated region. 2169 func (s *Server) replicateACLAuthMethods(stopCh chan struct{}) { 2170 2171 // Generate our request object. We only need to do this once and reuse it 2172 // for every RPC request. The MinQueryIndex is updated after every 2173 // successful replication loop, so the next query acts as a blocking query 2174 // and only returns upon a change in the authoritative region. 2175 req := structs.ACLAuthMethodListRequest{ 2176 QueryOptions: structs.QueryOptions{ 2177 AllowStale: true, 2178 Region: s.config.AuthoritativeRegion, 2179 }, 2180 } 2181 2182 // Create our replication rate limiter for ACL auth-methods and log a 2183 // lovely message to indicate the process is starting. 2184 limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) 2185 s.logger.Debug("starting ACL Auth-Methods replication from authoritative region", 2186 "authoritative_region", req.Region) 2187 2188 // Enter the main ACL auth-methods replication loop that will only exit 2189 // when the stopCh is closed. 2190 // 2191 // Any error encountered will use the replicationBackoffContinue function 2192 // which handles replication backoff and shutdown coordination in the event 2193 // of an error inside the loop. 2194 for { 2195 select { 2196 case <-stopCh: 2197 return 2198 default: 2199 2200 // Rate limit how often we attempt replication. It is OK to ignore 2201 // the error as the context will never be cancelled and the limit 2202 // parameters are controlled internally. 2203 _ = limiter.Wait(context.Background()) 2204 2205 if !ServersMeetMinimumVersion( 2206 s.serf.Members(), s.Region(), minACLAuthMethodVersion, true) { 2207 s.logger.Trace( 2208 "all servers must be upgraded to 1.5.0 or later before ACL Auth Methods can be replicated") 2209 if s.replicationBackoffContinue(stopCh) { 2210 continue 2211 } else { 2212 return 2213 } 2214 } 2215 2216 // Set the replication token on each replication iteration so that 2217 // it is always current and can handle agent SIGHUP reloads. 2218 req.AuthToken = s.ReplicationToken() 2219 2220 var resp structs.ACLAuthMethodListResponse 2221 2222 // Make the list RPC request to the authoritative region, so we 2223 // capture the latest ACL auth-method listing. 2224 err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLListAuthMethodsRPCMethod, &req, &resp) 2225 if err != nil { 2226 s.logger.Error("failed to fetch ACL auth-methods from authoritative region", "error", err) 2227 if s.replicationBackoffContinue(stopCh) { 2228 continue 2229 } else { 2230 return 2231 } 2232 } 2233 2234 // Perform a two-way diff on the ACL auth-methods. 2235 toDelete, toUpdate := diffACLAuthMethods(s.State(), req.MinQueryIndex, resp.AuthMethods) 2236 2237 // A significant amount of time could pass between the last check 2238 // on whether we should stop the replication process. 
Therefore, do 2239 // a check here, before calling Raft. 2240 select { 2241 case <-stopCh: 2242 return 2243 default: 2244 } 2245 2246 // If we have ACL auth-methods to delete, make this call directly 2247 // to Raft. 2248 if len(toDelete) > 0 { 2249 args := structs.ACLAuthMethodDeleteRequest{Names: toDelete} 2250 _, _, err := s.raftApply(structs.ACLAuthMethodsDeleteRequestType, &args) 2251 2252 // If the error was because we lost leadership while calling 2253 // Raft, avoid logging as this can be confusing to operators. 2254 if err != nil { 2255 if err != raft.ErrLeadershipLost { 2256 s.logger.Error("failed to delete ACL auth-methods", "error", err) 2257 } 2258 if s.replicationBackoffContinue(stopCh) { 2259 continue 2260 } else { 2261 return 2262 } 2263 } 2264 } 2265 2266 // Fetch any outdated auth-methods. 2267 var fetched []*structs.ACLAuthMethod 2268 if len(toUpdate) > 0 { 2269 req := structs.ACLAuthMethodsGetRequest{ 2270 Names: toUpdate, 2271 QueryOptions: structs.QueryOptions{ 2272 Region: s.config.AuthoritativeRegion, 2273 AuthToken: s.ReplicationToken(), 2274 AllowStale: true, 2275 MinQueryIndex: resp.Index - 1, 2276 }, 2277 } 2278 var reply structs.ACLAuthMethodsGetResponse 2279 if err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLGetAuthMethodsRPCMethod, &req, &reply); err != nil { 2280 s.logger.Error("failed to fetch ACL auth-methods from authoritative region", "error", err) 2281 if s.replicationBackoffContinue(stopCh) { 2282 continue 2283 } else { 2284 return 2285 } 2286 } 2287 for _, aclAuthMethod := range reply.AuthMethods { 2288 fetched = append(fetched, aclAuthMethod) 2289 } 2290 } 2291 2292 // Update local auth-methods. 2293 if len(fetched) > 0 { 2294 args := structs.ACLAuthMethodUpsertRequest{ 2295 AuthMethods: fetched, 2296 } 2297 2298 // Perform the upsert directly via Raft. 2299 _, _, err := s.raftApply(structs.ACLAuthMethodsUpsertRequestType, &args) 2300 if err != nil { 2301 s.logger.Error("failed to update ACL auth-methods", "error", err) 2302 if s.replicationBackoffContinue(stopCh) { 2303 continue 2304 } else { 2305 return 2306 } 2307 } 2308 } 2309 2310 // Update the minimum query index, blocks until there is a change. 2311 req.MinQueryIndex = resp.Index 2312 } 2313 } 2314 } 2315 2316 // diffACLAuthMethods is used to perform a two-way diff between the local ACL 2317 // auth-methods and the remote auth-methods to determine which ones need to be 2318 // deleted or updated. The returned array's contain ACL auth-method names. 2319 func diffACLAuthMethods( 2320 store *state.StateStore, minIndex uint64, remoteList []*structs.ACLAuthMethodStub) ( 2321 delete []string, update []string) { 2322 2323 // The local ACL auth-method tracking is keyed by the name and the value is 2324 // the hash of the auth-method. 2325 local := make(map[string][]byte) 2326 2327 // The remote ACL auth-method tracking is keyed by the name; the value is 2328 // an empty struct as we already have the full object. 2329 remote := make(map[string]struct{}) 2330 2331 // Read all the ACL auth-methods currently held within our local state. 2332 // This panic will only happen as a developer making a mistake with naming 2333 // the index to use. 
2334 	iter, err := store.GetACLAuthMethods(nil)
2335 	if err != nil {
2336 		panic(fmt.Sprintf("failed to iterate local ACL auth-methods: %v", err))
2337 	}
2338 
2339 	// Iterate the local ACL auth-methods and add them to our tracking of
2340 	// local auth-methods.
2341 	for raw := iter.Next(); raw != nil; raw = iter.Next() {
2342 		aclAuthMethod := raw.(*structs.ACLAuthMethod)
2343 		local[aclAuthMethod.Name] = aclAuthMethod.Hash
2344 	}
2345 
2346 	// Iterate over the remote ACL auth-methods.
2347 	for _, remoteACLAuthMethod := range remoteList {
2348 		remote[remoteACLAuthMethod.Name] = struct{}{}
2349 
2350 		// Identify whether the ACL auth-method is within the local state. If
2351 		// it is not, add this to our update list.
2352 		if localHash, ok := local[remoteACLAuthMethod.Name]; !ok {
2353 			update = append(update, remoteACLAuthMethod.Name)
2354 
2355 			// Check if ACL auth-method is newer remotely and there is a hash
2356 			// mismatch.
2357 		} else if remoteACLAuthMethod.ModifyIndex > minIndex && !bytes.Equal(localHash, remoteACLAuthMethod.Hash) {
2358 			update = append(update, remoteACLAuthMethod.Name)
2359 		}
2360 	}
2361 
2362 	// If we have ACL auth-methods within state which are no longer present in
2363 	// the authoritative region we should delete them.
2364 	for localACLAuthMethod := range local {
2365 		if _, ok := remote[localACLAuthMethod]; !ok {
2366 			delete = append(delete, localACLAuthMethod)
2367 		}
2368 	}
2369 	return
2370 }
2371 
2372 // replicateACLBindingRules is used to replicate ACL binding rules from the
2373 // authoritative region to this region. The loop should only be run on the
2374 // leader within the federated region.
2375 func (s *Server) replicateACLBindingRules(stopCh chan struct{}) {
2376 
2377 	// Generate our request object. We only need to do this once and reuse it
2378 	// for every RPC request. The MinQueryIndex is updated after every
2379 	// successful replication loop, so the next query acts as a blocking query
2380 	// and only returns upon a change in the authoritative region.
2381 	req := structs.ACLBindingRulesListRequest{
2382 		QueryOptions: structs.QueryOptions{
2383 			AllowStale: true,
2384 			Region:     s.config.AuthoritativeRegion,
2385 		},
2386 	}
2387 
2388 	// Create our replication rate limiter for ACL binding rules and log a
2389 	// lovely message to indicate the process is starting.
2390 	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
2391 	s.logger.Debug("starting ACL Binding Rules replication from authoritative region",
2392 		"authoritative_region", req.Region)
2393 
2394 	// Enter the main ACL binding rules replication loop that will only exit
2395 	// when the stopCh is closed.
2396 	//
2397 	// Any error encountered will use the replicationBackoffContinue function
2398 	// which handles replication backoff and shutdown coordination in the event
2399 	// of an error inside the loop.
2400 	for {
2401 		select {
2402 		case <-stopCh:
2403 			return
2404 		default:
2405 
2406 			// Rate limit how often we attempt replication. It is OK to ignore
2407 			// the error as the context will never be cancelled and the limit
2408 			// parameters are controlled internally.
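			// With both the limit and burst derived from replicationRateLimit (10),
			// Wait allows at most roughly ten replication attempts per second here.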
2409 _ = limiter.Wait(context.Background()) 2410 2411 if !ServersMeetMinimumVersion( 2412 s.serf.Members(), s.Region(), minACLBindingRuleVersion, true) { 2413 s.logger.Trace( 2414 "all servers must be upgraded to 1.5.0 or later before ACL Binding Rules can be replicated") 2415 if s.replicationBackoffContinue(stopCh) { 2416 continue 2417 } else { 2418 return 2419 } 2420 } 2421 2422 // Set the replication token on each replication iteration so that 2423 // it is always current and can handle agent SIGHUP reloads. 2424 req.AuthToken = s.ReplicationToken() 2425 2426 var resp structs.ACLBindingRulesListResponse 2427 2428 // Make the list RPC request to the authoritative region, so we 2429 // capture the latest ACL binding rules listing. 2430 err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLListBindingRulesRPCMethod, &req, &resp) 2431 if err != nil { 2432 s.logger.Error("failed to fetch ACL binding rules from authoritative region", "error", err) 2433 if s.replicationBackoffContinue(stopCh) { 2434 continue 2435 } else { 2436 return 2437 } 2438 } 2439 2440 // Perform a two-way diff on the ACL binding rules. 2441 toDelete, toUpdate := diffACLBindingRules(s.State(), req.MinQueryIndex, resp.ACLBindingRules) 2442 2443 // A significant amount of time could pass between the last check 2444 // on whether we should stop the replication process. Therefore, do 2445 // a check here, before calling Raft. 2446 select { 2447 case <-stopCh: 2448 return 2449 default: 2450 } 2451 2452 // If we have ACL binding rules to delete, make this call directly 2453 // to Raft. 2454 if len(toDelete) > 0 { 2455 args := structs.ACLBindingRulesDeleteRequest{ACLBindingRuleIDs: toDelete} 2456 _, _, err := s.raftApply(structs.ACLBindingRulesDeleteRequestType, &args) 2457 2458 // If the error was because we lost leadership while calling 2459 // Raft, avoid logging as this can be confusing to operators. 2460 if err != nil { 2461 if err != raft.ErrLeadershipLost { 2462 s.logger.Error("failed to delete ACL binding rules", "error", err) 2463 } 2464 if s.replicationBackoffContinue(stopCh) { 2465 continue 2466 } else { 2467 return 2468 } 2469 } 2470 } 2471 2472 // Fetch any outdated binding rules. 2473 var fetched []*structs.ACLBindingRule 2474 if len(toUpdate) > 0 { 2475 req := structs.ACLBindingRulesRequest{ 2476 ACLBindingRuleIDs: toUpdate, 2477 QueryOptions: structs.QueryOptions{ 2478 Region: s.config.AuthoritativeRegion, 2479 AuthToken: s.ReplicationToken(), 2480 AllowStale: true, 2481 MinQueryIndex: resp.Index - 1, 2482 }, 2483 } 2484 var reply structs.ACLBindingRulesResponse 2485 if err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLGetBindingRulesRPCMethod, &req, &reply); err != nil { 2486 s.logger.Error("failed to fetch ACL binding rules from authoritative region", "error", err) 2487 if s.replicationBackoffContinue(stopCh) { 2488 continue 2489 } else { 2490 return 2491 } 2492 } 2493 for _, aclBindingRule := range reply.ACLBindingRules { 2494 fetched = append(fetched, aclBindingRule) 2495 } 2496 } 2497 2498 // Update local binding rules. 2499 if len(fetched) > 0 { 2500 args := structs.ACLBindingRulesUpsertRequest{ 2501 ACLBindingRules: fetched, 2502 AllowMissingAuthMethods: true, 2503 } 2504 2505 // Perform the upsert directly via Raft. 
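				// AllowMissingAuthMethods is set above for the same reason
				// AllowMissingPolicies is set during ACL role replication: binding
				// rules and auth methods replicate independently, so the referenced
				// auth method may not have been written locally yet.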
2506 				_, _, err := s.raftApply(structs.ACLBindingRulesUpsertRequestType, &args)
2507 				if err != nil {
2508 					s.logger.Error("failed to update ACL binding rules", "error", err)
2509 					if s.replicationBackoffContinue(stopCh) {
2510 						continue
2511 					} else {
2512 						return
2513 					}
2514 				}
2515 			}
2516 
2517 			// Update the minimum query index; the next query blocks until there is a change.
2518 			req.MinQueryIndex = resp.Index
2519 		}
2520 	}
2521 }
2522 
2523 // diffACLBindingRules is used to perform a two-way diff between the local ACL
2524 // binding rules and the remote binding rules to determine which ones need to be
2525 // deleted or updated. The returned arrays contain ACL binding rule IDs.
2526 func diffACLBindingRules(
2527 	store *state.StateStore, minIndex uint64, remoteList []*structs.ACLBindingRuleListStub) (
2528 	delete []string, update []string) {
2529 
2530 	// The local ACL binding rule tracking is keyed by the rule ID and the
2531 	// value is the hash of the binding rule.
2532 	local := make(map[string][]byte)
2533 
2534 	// The remote ACL binding rule tracking is keyed by the rule ID; the value
2535 	// is an empty struct as we already have the full object.
2536 	remote := make(map[string]struct{})
2537 
2538 	// Read all the ACL binding rules currently held within our local state.
2539 	// This panic will only happen if a developer makes a mistake with naming
2540 	// the index to use.
2541 	iter, err := store.GetACLBindingRules(nil)
2542 	if err != nil {
2543 		panic(fmt.Sprintf("failed to iterate local ACL binding rules: %v", err))
2544 	}
2545 
2546 	// Iterate the local ACL binding rules and add them to our tracking of
2547 	// local binding rules.
2548 	for raw := iter.Next(); raw != nil; raw = iter.Next() {
2549 		aclBindingRule := raw.(*structs.ACLBindingRule)
2550 		local[aclBindingRule.ID] = aclBindingRule.Hash
2551 	}
2552 
2553 	// Iterate over the remote ACL binding rules.
2554 	for _, remoteACLBindingRule := range remoteList {
2555 		remote[remoteACLBindingRule.ID] = struct{}{}
2556 
2557 		// Identify whether the ACL binding rule is within the local state. If
2558 		// it is not, add this to our update list.
2559 		if localHash, ok := local[remoteACLBindingRule.ID]; !ok {
2560 			update = append(update, remoteACLBindingRule.ID)
2561 
2562 			// Check if the ACL binding rule is newer remotely and there is a
2563 			// hash mismatch.
2564 		} else if remoteACLBindingRule.ModifyIndex > minIndex && !bytes.Equal(localHash, remoteACLBindingRule.Hash) {
2565 			update = append(update, remoteACLBindingRule.ID)
2566 		}
2567 	}
2568 
2569 	// If we have ACL binding rules within state which are no longer present in
2570 	// the authoritative region we should delete them.
2571 	for localACLBindingRules := range local {
2572 		if _, ok := remote[localACLBindingRules]; !ok {
2573 			delete = append(delete, localACLBindingRules)
2574 		}
2575 	}
2576 	return
2577 }
2578 
2579 // replicationBackoffContinue should be used when a replication loop encounters
2580 // an error and wants to wait until either the backoff time has been met, or
2581 // the stopCh has been closed. The boolean indicates whether the replication
2582 // process should continue.
2583 // 2584 // Typical use: 2585 // 2586 // if s.replicationBackoffContinue(stopCh) { 2587 // continue 2588 // } else { 2589 // return 2590 // } 2591 func (s *Server) replicationBackoffContinue(stopCh chan struct{}) bool { 2592 2593 timer, timerStopFn := helper.NewSafeTimer(s.config.ReplicationBackoff) 2594 defer timerStopFn() 2595 2596 select { 2597 case <-timer.C: 2598 return true 2599 case <-stopCh: 2600 return false 2601 } 2602 } 2603 2604 // getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary 2605 func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig { 2606 state := s.fsm.State() 2607 _, config, err := state.AutopilotConfig() 2608 if err != nil { 2609 s.logger.Named("autopilot").Error("failed to get autopilot config", "error", err) 2610 return nil 2611 } 2612 if config != nil { 2613 return config 2614 } 2615 2616 if !ServersMeetMinimumVersion(s.Members(), AllRegions, minAutopilotVersion, false) { 2617 s.logger.Named("autopilot").Warn("can't initialize until all servers are above minimum version", "min_version", minAutopilotVersion) 2618 return nil 2619 } 2620 2621 config = s.config.AutopilotConfig 2622 req := structs.AutopilotSetConfigRequest{Config: *config} 2623 if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil { 2624 s.logger.Named("autopilot").Error("failed to initialize config", "error", err) 2625 return nil 2626 } 2627 2628 return config 2629 } 2630 2631 // getOrCreateSchedulerConfig is used to get the scheduler config. We create a default 2632 // config if it doesn't already exist for bootstrapping an empty cluster 2633 func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration { 2634 state := s.fsm.State() 2635 _, config, err := state.SchedulerConfig() 2636 if err != nil { 2637 s.logger.Named("core").Error("failed to get scheduler config", "error", err) 2638 return nil 2639 } 2640 if config != nil { 2641 return config 2642 } 2643 if !ServersMeetMinimumVersion(s.Members(), s.Region(), minSchedulerConfigVersion, false) { 2644 s.logger.Named("core").Warn("can't initialize scheduler config until all servers are above minimum version", "min_version", minSchedulerConfigVersion) 2645 return nil 2646 } 2647 2648 req := structs.SchedulerSetConfigRequest{Config: s.config.DefaultSchedulerConfig} 2649 if _, _, err = s.raftApply(structs.SchedulerConfigRequestType, req); err != nil { 2650 s.logger.Named("core").Error("failed to initialize config", "error", err) 2651 return nil 2652 } 2653 2654 return config 2655 } 2656 2657 var minVersionKeyring = version.Must(version.NewVersion("1.4.0")) 2658 2659 // initializeKeyring creates the first root key if the leader doesn't 2660 // already have one. The metadata will be replicated via raft and then 2661 // the followers will get the key material from their own key 2662 // replication. 
2663 func (s *Server) initializeKeyring(stopCh <-chan struct{}) {
2664 
2665 	logger := s.logger.Named("keyring")
2666 
2667 	store := s.fsm.State()
2668 	keyMeta, err := store.GetActiveRootKeyMeta(nil)
2669 	if err != nil {
2670 		logger.Error("failed to get active key", "error", err)
2671 		return
2672 	}
2673 	if keyMeta != nil {
2674 		return
2675 	}
2676 
2677 	logger.Trace("verifying cluster is ready to initialize keyring")
2678 	for {
2679 		select {
2680 		case <-stopCh:
2681 			return
2682 		default:
2683 		}
2684 
2685 		if ServersMeetMinimumVersion(s.serf.Members(), s.Region(), minVersionKeyring, true) {
2686 			break
2687 		}
2688 	}
2689 	// we might have lost leadership during the version check
2690 	if !s.IsLeader() {
2691 		return
2692 	}
2693 
2694 	logger.Trace("initializing keyring")
2695 
2696 	rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM)
2697 	if err != nil {
2698 		logger.Error("could not initialize keyring", "error", err)
2699 		return
2700 	}
2701 	rootKey.Meta.SetActive()
2702 
2703 	err = s.encrypter.AddKey(rootKey)
2704 	if err != nil {
2705 		logger.Error("could not add initial key to keyring", "error", err)
2706 		return
2707 	}
2708 
2709 	if _, _, err = s.raftApply(structs.RootKeyMetaUpsertRequestType,
2710 		structs.KeyringUpdateRootKeyMetaRequest{
2711 			RootKeyMeta: rootKey.Meta,
2712 		}); err != nil {
2713 		logger.Error("could not initialize keyring", "error", err)
2714 		return
2715 	}
2716 
2717 	logger.Info("initialized keyring", "id", rootKey.Meta.KeyID)
2718 }
2719 
2720 func (s *Server) generateClusterID() (string, error) {
2721 	if !ServersMeetMinimumVersion(s.Members(), AllRegions, minClusterIDVersion, false) {
2722 		s.logger.Named("core").Warn("cannot initialize cluster ID until all servers are above minimum version", "min_version", minClusterIDVersion)
2723 		return "", fmt.Errorf("cluster ID cannot be created until all servers are above minimum version %s", minClusterIDVersion)
2724 	}
2725 
2726 	newMeta := structs.ClusterMetadata{ClusterID: uuid.Generate(), CreateTime: time.Now().UnixNano()}
2727 	if _, _, err := s.raftApply(structs.ClusterMetadataRequestType, newMeta); err != nil {
2728 		s.logger.Named("core").Error("failed to create cluster ID", "error", err)
2729 		return "", fmt.Errorf("failed to create cluster ID: %w", err)
2730 	}
2731 
2732 	s.logger.Named("core").Info("established cluster id", "cluster_id", newMeta.ClusterID, "create_time", newMeta.CreateTime)
2733 	return newMeta.ClusterID, nil
2734 }
2735 
2736 // handleEvalBrokerStateChange handles changing the evalBroker and blockedEvals
2737 // enabled status based on the passed scheduler configuration. The boolean
2738 // response indicates whether the caller needs to call restoreEvals() due to
2739 // the brokers being enabled. It is for use when the change must take the
2740 // scheduler configuration into account. This is not needed when calling
2741 // revokeLeadership, as the configuration doesn't matter, and we need to ensure
2742 // the brokers are stopped.
2743 //
2744 // The function checks that the server is the leader and uses a mutex to avoid
2745 // any potential timing problems. Consider the following sequence:
2746 //   - operator updates the configuration via the API
2747 //   - the RPC handler applies the change via Raft
2748 //   - leadership transitions with write barrier
2749 //   - the RPC handler calls this function to enact the change
2750 //
2751 // The mutex also protects against a situation where leadership is revoked
2752 // while this function is being called, ensuring the correct series of
2753 // actions occurs so that state stays consistent.
2754 func (s *Server) handleEvalBrokerStateChange(schedConfig *structs.SchedulerConfiguration) bool {
2755 
2756 	// Grab the lock first. Once we have this we can be sure to run everything
2757 	// needed before any leader transition can attempt to modify the state.
2758 	s.brokerLock.Lock()
2759 	defer s.brokerLock.Unlock()
2760 
2761 	// If we are no longer the leader, exit early.
2762 	if !s.IsLeader() {
2763 		return false
2764 	}
2765 
2766 	// enableBrokers tracks whether the evalBroker and blockedEvals processes
2767 	// should be enabled or not. It lets us answer this question whether we
2768 	// are using a scheduler configuration persisted in Raft or the default
2769 	// bootstrap config.
2770 	var enableBrokers, restoreEvals bool
2771 
2772 	// The scheduler config can only be persisted to Raft once quorum has been
2773 	// established. If this is a fresh cluster, we need to use the default
2774 	// scheduler config, otherwise we can use the persisted object.
2775 	switch schedConfig {
2776 	case nil:
2777 		enableBrokers = !s.config.DefaultSchedulerConfig.PauseEvalBroker
2778 	default:
2779 		enableBrokers = !schedConfig.PauseEvalBroker
2780 	}
2781 
2782 	// If the evalBroker status is changing, set the new state.
2783 	if enableBrokers != s.evalBroker.Enabled() {
2784 		s.logger.Info("eval broker status modified", "paused", !enableBrokers)
2785 		s.evalBroker.SetEnabled(enableBrokers)
2786 		restoreEvals = enableBrokers
2787 	}
2788 
2789 	// If the blockedEvals status is changing, set the new state.
2790 	if enableBrokers != s.blockedEvals.Enabled() {
2791 		s.logger.Info("blocked evals status modified", "paused", !enableBrokers)
2792 		s.blockedEvals.SetEnabled(enableBrokers)
2793 		restoreEvals = enableBrokers
2794 
2795 		if enableBrokers {
2796 			s.blockedEvals.SetTimetable(s.fsm.TimeTable())
2797 		}
2798 	}
2799 
2800 	return restoreEvals
2801 }
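// A sketch of the expected caller pattern for handleEvalBrokerStateChange; the
// real call sites live elsewhere in this package and may handle the restore
// error differently:
//
//	if s.handleEvalBrokerStateChange(schedConfig) {
//		// The brokers were just re-enabled, so the evals held in state need to
//		// be brought back into the broker.
//		if err := s.restoreEvals(); err != nil {
//			s.logger.Error("failed to restore evals", "error", err)
//		}
//	}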