github.com/hernad/nomad@v1.6.112/nomad/leader.go

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"fmt"
    10  	"math/rand"
    11  	"net"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/armon/go-metrics"
    17  	"github.com/hashicorp/go-hclog"
    18  	"github.com/hashicorp/go-memdb"
    19  	"github.com/hashicorp/go-version"
    20  	"github.com/hernad/nomad/helper"
    21  	"github.com/hernad/nomad/helper/uuid"
    22  	"github.com/hernad/nomad/nomad/state"
    23  	"github.com/hernad/nomad/nomad/structs"
    24  	"github.com/hashicorp/raft"
    25  	"github.com/hashicorp/serf/serf"
    26  	"golang.org/x/time/rate"
    27  )
    28  
    29  const (
    30  	// failedEvalUnblockInterval is the interval at which failed evaluations are
    31  	// unblocked to re-enter the scheduler. A failed evaluation occurs under
    32  	// high contention when the scheduler's plan does not make progress.
    33  	failedEvalUnblockInterval = 1 * time.Minute
    34  
    35  	// replicationRateLimit is used to rate limit how often data is replicated
    36  	// between the authoritative region and the local region
    37  	replicationRateLimit rate.Limit = 10.0
    38  
    39  	// barrierWriteTimeout is used to give Raft a chance to process a
    40  	// possible loss of leadership event if we are unable to get a barrier
    41  	// while leader.
    42  	barrierWriteTimeout = 2 * time.Minute
    43  )
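
// The replication rate limit above is consumed as a token bucket. A minimal
// illustrative sketch (not part of the original source; replicateOnce is a
// placeholder) of the pattern used by the replication loops later in this file:
func exampleReplicationLimiter(stopCh chan struct{}, replicateOnce func()) {
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	for {
		select {
		case <-stopCh:
			return
		default:
		}
		// Wait blocks until a token is available, capping attempts at
		// roughly replicationRateLimit rounds per second.
		limiter.Wait(context.Background())
		replicateOnce()
	}
}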
    44  
    45  var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))
    46  
    47  var minSchedulerConfigVersion = version.Must(version.NewVersion("0.9.0"))
    48  
    49  var minClusterIDVersion = version.Must(version.NewVersion("0.10.4"))
    50  
    51  var minOneTimeAuthenticationTokenVersion = version.Must(version.NewVersion("1.1.0"))
    52  
    53  // minACLRoleVersion is the Nomad version at which the ACL role table was
    54  // introduced. It forms the minimum version all federated servers must meet
    55  // before the feature can be used.
    56  var minACLRoleVersion = version.Must(version.NewVersion("1.4.0"))
    57  
    58  // minACLAuthMethodVersion is the Nomad version at which the ACL auth methods
    59  // table was introduced. It forms the minimum version all federated servers must
    60  // meet before the feature can be used.
    61  var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0"))
    62  
    63  // minACLJWTAuthMethodVersion is the Nomad version at which the ACL JWT auth method type
    64  // was introduced. It forms the minimum version all federated servers must
    65  // meet before the feature can be used.
    66  var minACLJWTAuthMethodVersion = version.Must(version.NewVersion("1.5.4"))
    67  
    68  // minACLBindingRuleVersion is the Nomad version at which the ACL binding rules
    69  // table was introduced. It forms the minimum version all federated servers
    70  // must meet before the feature can be used.
    71  var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0"))
    72  
    73  // minNomadServiceRegistrationVersion is the Nomad version at which the service
    74  // registrations table was introduced. It forms the minimum version all local
    75  // servers must meet before the feature can be used.
    76  var minNomadServiceRegistrationVersion = version.Must(version.NewVersion("1.3.0"))
    77  
    78  // Any write to node pools requires that all servers are on version 1.6.0 to
    79  // prevent older versions of the server from crashing.
    80  var minNodePoolsVersion = version.Must(version.NewVersion("1.6.0"))
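
// exampleVersionGate is an illustrative sketch (not part of the original
// source) of how the minimum-version variables above are applied: a server's
// advertised build version must be at or above the threshold before the
// corresponding feature is used (see the ServersMeetMinimumVersion calls
// later in this file, which apply this check across the serf membership).
func exampleVersionGate(build string) bool {
	v, err := version.NewVersion(build)
	if err != nil {
		return false
	}
	// e.g. "1.5.6" fails the node pools gate, while "1.6.1" passes it.
	return v.GreaterThanOrEqual(minNodePoolsVersion)
}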
    81  
    82  // monitorLeadership is used to monitor whether we acquire or lose our role
    83  // as the leader in the Raft cluster. There is some work the leader is
    84  // expected to do, so we must react to changes.
    85  func (s *Server) monitorLeadership() {
    86  	var weAreLeaderCh chan struct{}
    87  	var leaderLoop sync.WaitGroup
    88  
    89  	leaderCh := s.raft.LeaderCh()
    90  
    91  	leaderStep := func(isLeader bool) {
    92  		if isLeader {
    93  			if weAreLeaderCh != nil {
    94  				s.logger.Error("attempted to start the leader loop while running")
    95  				return
    96  			}
    97  
    98  			weAreLeaderCh = make(chan struct{})
    99  			leaderLoop.Add(1)
   100  			go func(ch chan struct{}) {
   101  				defer leaderLoop.Done()
   102  				s.leaderLoop(ch)
   103  			}(weAreLeaderCh)
   104  			s.logger.Info("cluster leadership acquired")
   105  			return
   106  		}
   107  
   108  		if weAreLeaderCh == nil {
   109  			s.logger.Error("attempted to stop the leader loop while not running")
   110  			return
   111  		}
   112  
   113  		s.logger.Debug("shutting down leader loop")
   114  		close(weAreLeaderCh)
   115  		leaderLoop.Wait()
   116  		weAreLeaderCh = nil
   117  		s.logger.Info("cluster leadership lost")
   118  	}
   119  
   120  	wasLeader := false
   121  	for {
   122  		select {
   123  		case isLeader := <-leaderCh:
   124  			if wasLeader != isLeader {
   125  				wasLeader = isLeader
   126  				// normal case where we went through a transition
   127  				leaderStep(isLeader)
   128  			} else if wasLeader && isLeader {
   129  				// Server lost but then gained leadership immediately.
   130  				// During this time, this server may have received
   131  				// Raft transitions that haven't been applied to the FSM
   132  				// yet.
   133  				// Ensure the FSM has caught up and the eval queues are refreshed.
   134  				s.logger.Warn("cluster leadership lost and regained immediately. Could indicate network issues, memory paging, or high CPU load.")
   135  
   136  				leaderStep(false)
   137  				leaderStep(true)
   138  			} else {
   139  				// Server gained but lost leadership immediately
   140  				// before it reacted; nothing to do, move on
   141  				s.logger.Warn("cluster leadership gained and lost immediately. Could indicate network issues, memory paging, or high CPU load.")
   142  			}
   143  		case <-s.shutdownCh:
   144  			if weAreLeaderCh != nil {
   145  				leaderStep(false)
   146  			}
   147  			return
   148  		}
   149  	}
   150  }
   151  
   152  func (s *Server) leadershipTransfer() error {
   153  	retryCount := 3
   154  	for i := 0; i < retryCount; i++ {
   155  		err := s.raft.LeadershipTransfer().Error()
   156  		if err == nil {
   157  			s.logger.Info("successfully transferred leadership")
   158  			return nil
   159  		}
   160  
   161  		// Don't retry if the Raft version doesn't support leadership transfer
   162  		// since this will never succeed.
   163  		if err == raft.ErrUnsupportedProtocol {
   164  			return fmt.Errorf("leadership transfer not supported with Raft version lower than 3")
   165  		}
   166  
   167  		s.logger.Error("failed to transfer leadership, will retry",
   168  			"attempt", i,
   169  			"retry_limit", retryCount,
   170  			"error", err,
   171  		)
   172  	}
   173  	return fmt.Errorf("failed to transfer leadership in %d attempts", retryCount)
   174  }
   175  
   176  // leaderLoop runs as long as we are the leader, performing various
   177  // maintenance activities.
   178  func (s *Server) leaderLoop(stopCh chan struct{}) {
   179  	var reconcileCh chan serf.Member
   180  	establishedLeader := false
   181  
   182  RECONCILE:
   183  	// Setup a reconciliation timer
   184  	reconcileCh = nil
   185  	interval := time.After(s.config.ReconcileInterval)
   186  
   187  	// Apply a raft barrier to ensure our FSM is caught up
   188  	start := time.Now()
   189  	barrier := s.raft.Barrier(barrierWriteTimeout)
   190  	if err := barrier.Error(); err != nil {
   191  		s.logger.Error("failed to wait for barrier", "error", err)
   192  		goto WAIT
   193  	}
   194  	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
   195  
   196  	// Check if we need to handle initial leadership actions
   197  	if !establishedLeader {
   198  		if err := s.establishLeadership(stopCh); err != nil {
   199  			s.logger.Error("failed to establish leadership", "error", err)
   200  
   201  			// Immediately revoke leadership since we didn't successfully
   202  			// establish leadership.
   203  			if err := s.revokeLeadership(); err != nil {
   204  				s.logger.Error("failed to revoke leadership", "error", err)
   205  			}
   206  
   207  			// Attempt to transfer leadership. If successful, leave the
   208  			// leaderLoop since this node is no longer the leader. Otherwise
   209  			// try to establish leadership again after 5 seconds.
   210  			if err := s.leadershipTransfer(); err != nil {
   211  				s.logger.Error("failed to transfer leadership", "error", err)
   212  				interval = time.After(5 * time.Second)
   213  				goto WAIT
   214  			}
   215  			return
   216  		}
   217  
   218  		establishedLeader = true
   219  		defer func() {
   220  			if err := s.revokeLeadership(); err != nil {
   221  				s.logger.Error("failed to revoke leadership", "error", err)
   222  			}
   223  		}()
   224  	}
   225  
   226  	// Reconcile any missing data
   227  	if err := s.reconcile(); err != nil {
   228  		s.logger.Error("failed to reconcile", "error", err)
   229  		goto WAIT
   230  	}
   231  
   232  	// Initial reconcile worked, now we can process the channel
   233  	// updates
   234  	reconcileCh = s.reconcileCh
   235  
   236  	// Poll the stop channel to give it priority so we don't waste time
   237  	// trying to perform the other operations if we have been asked to shut
   238  	// down.
   239  	select {
   240  	case <-stopCh:
   241  		return
   242  	default:
   243  	}
   244  
   245  WAIT:
   246  	// Wait until leadership is lost, periodically reconciling while we are
   247  	// the leader and handling Serf events as they arrive.
   248  	for {
   249  		select {
   250  		case <-stopCh:
   251  			// Lost leadership.
   252  			return
   253  		case <-s.shutdownCh:
   254  			return
   255  		case <-interval:
   256  			goto RECONCILE
   257  		case member := <-reconcileCh:
   258  			s.reconcileMember(member)
   259  		case errCh := <-s.reassertLeaderCh:
   260  			// Recompute leader state by reasserting leadership and
   261  			// repopulating the leader state.
   262  
   263  			// First check that we are indeed the leader. We can
   264  			// get into this state when the initial
   265  			// establishLeadership has failed.
   266  			// Afterwards we will be waiting for the interval to
   267  			// trigger a reconciliation and can potentially end up
   268  			// here. There is no point in reasserting because this
   269  			// agent was never the leader in the first place.
   270  			if !establishedLeader {
   271  				errCh <- fmt.Errorf("leadership has not been established")
   272  				continue
   273  			}
   274  
   275  			// refresh leadership state
   276  			s.revokeLeadership()
   277  			err := s.establishLeadership(stopCh)
   278  			errCh <- err
   279  
   280  			// In case establishLeadership fails, try to transfer leadership.
   281  			// At this point Raft thinks we are the leader, but Nomad did not
   282  			// complete the required steps to act as the leader.
   283  			if err != nil {
   284  				if err := s.leadershipTransfer(); err != nil {
   285  					// establishedLeader was true before, but it no longer is
   286  					// since we revoked leadership and leadershipTransfer also
   287  					// failed.
   288  					// Stay in the leaderLoop with establishedLeader set to
   289  					// false so we try to establish leadership again in the
   290  					// next loop.
   291  					establishedLeader = false
   292  					interval = time.After(5 * time.Second)
   293  					goto WAIT
   294  				}
   295  
   296  				// leadershipTransfer was successful and it is
   297  				// time to leave the leaderLoop.
   298  				return
   299  			}
   300  		}
   301  	}
   302  }
   303  
   304  // establishLeadership is invoked once we become leader and are able
   305  // to invoke an initial barrier. The barrier is used to ensure any
   306  // previously inflight transactions have been committed and that our
   307  // state is up-to-date.
   308  func (s *Server) establishLeadership(stopCh chan struct{}) error {
   309  	defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())
   310  
   311  	// Generate a leader ACL token. This will allow the leader to issue work
   312  	// that requires a valid ACL token.
   313  	s.setLeaderAcl(uuid.Generate())
   314  
   315  	// Disable workers to free half the cores for use in the plan queue and
   316  	// evaluation broker
   317  	s.handlePausableWorkers(true)
   318  
   319  	// Initialize and start the autopilot routine
   320  	s.getOrCreateAutopilotConfig()
   321  	s.autopilot.Start(s.shutdownCtx)
   322  
   323  	// Initialize scheduler configuration.
   324  	schedulerConfig := s.getOrCreateSchedulerConfig()
   325  
   326  	// Initialize the ClusterID
   327  	_, _ = s.ClusterID()
   328  	// todo: use cluster ID for stuff, later!
   329  
   330  	// Enable the plan queue, since we are now the leader
   331  	s.planQueue.SetEnabled(true)
   332  
   333  	// Start the plan evaluator
   334  	go s.planApply()
   335  
   336  	// Start the eval broker and blocked eval broker if these are not paused by
   337  	// the operator.
   338  	restoreEvals := s.handleEvalBrokerStateChange(schedulerConfig)
   339  
   340  	// Enable the deployment watcher, since we are now the leader
   341  	s.deploymentWatcher.SetEnabled(true, s.State())
   342  
   343  	// Enable the NodeDrainer
   344  	s.nodeDrainer.SetEnabled(true, s.State())
   345  
   346  	// Enable the volume watcher, since we are now the leader
   347  	s.volumeWatcher.SetEnabled(true, s.State(), s.getLeaderAcl())
   348  
   349  	// Restore the eval broker state and blocked eval state. If these are
   350  	// currently paused, we do not need to do this.
   351  	if restoreEvals {
   352  		if err := s.restoreEvals(); err != nil {
   353  			return err
   354  		}
   355  	}
   356  
   357  	// Activate the vault client
   358  	s.vault.SetActive(true)
   359  
   360  	// Enable the periodic dispatcher, since we are now the leader.
   361  	s.periodicDispatcher.SetEnabled(true)
   362  
   363  	// Activate RPC now that the local FSM has caught up with Raft (as evidenced by
   364  	// the successful Barrier call) and all leader-related components (e.g. the broker
   365  	// queue) are enabled. Auxiliary processes (background, bookkeeping, cleanup) can start after.
   366  	s.setConsistentReadReady()
   367  
   368  	// Further clean ups and follow up that don't block RPC consistency
   369  
   370  	// Create the first root key if it doesn't already exist
   371  	go s.initializeKeyring(stopCh)
   372  
   373  	// Restore the periodic dispatcher state
   374  	if err := s.restorePeriodicDispatcher(); err != nil {
   375  		return err
   376  	}
   377  
   378  	// Schedule periodic jobs which include expired local ACL token garbage
   379  	// collection.
   380  	go s.schedulePeriodic(stopCh)
   381  
   382  	// Reap any failed evaluations
   383  	go s.reapFailedEvaluations(stopCh)
   384  
   385  	// Reap any duplicate blocked evaluations
   386  	go s.reapDupBlockedEvaluations(stopCh)
   387  
   388  	// Reap any cancelable evaluations
   389  	s.reapCancelableEvalsCh = s.reapCancelableEvaluations(stopCh)
   390  
   391  	// Periodically unblock failed evaluations
   392  	go s.periodicUnblockFailedEvals(stopCh)
   393  
   394  	// Periodically publish job summary metrics
   395  	go s.publishJobSummaryMetrics(stopCh)
   396  
   397  	// Periodically publish job status metrics
   398  	go s.publishJobStatusMetrics(stopCh)
   399  
   400  	// Set up the heartbeat timers. This is done both when starting up and when
   401  	// a leader failover happens. Since the timers are maintained by the leader
   402  	// node, effectively this means all the timers are renewed at the time of failover.
   403  	// The TTL contract is that the session will not be expired before the TTL,
   404  	// so expiring it later is allowable.
   405  	//
   406  	// This MUST be done after the initial barrier to ensure the latest Nodes
   407  	// are available to be initialized. Otherwise initialization may use stale
   408  	// data.
   409  	if err := s.initializeHeartbeatTimers(); err != nil {
   410  		s.logger.Error("heartbeat timer setup failed", "error", err)
   411  		return err
   412  	}
   413  
   414  	// If ACLs are enabled, the leader needs to start a number of long-lived
   415  	// routines. Exactly which routines depends on whether this leader is
   416  	// running within the authoritative region or not.
   417  	if s.config.ACLEnabled {
   418  
   419  		// The authoritative region is responsible for garbage collecting
   420  		// expired global tokens. Otherwise, non-authoritative regions need to
   421  		// replicate policies, tokens, and namespaces.
   422  		switch s.config.AuthoritativeRegion {
   423  		case s.config.Region:
   424  			go s.schedulePeriodicAuthoritative(stopCh)
   425  		default:
   426  			go s.replicateACLPolicies(stopCh)
   427  			go s.replicateACLTokens(stopCh)
   428  			go s.replicateACLRoles(stopCh)
   429  			go s.replicateACLAuthMethods(stopCh)
   430  			go s.replicateACLBindingRules(stopCh)
   431  			go s.replicateNamespaces(stopCh)
   432  			go s.replicateNodePools(stopCh)
   433  		}
   434  	}
   435  
   436  	// Setup any enterprise systems required.
   437  	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
   438  		return err
   439  	}
   440  
   441  	// Cleanup orphaned Vault token accessors
   442  	if err := s.revokeVaultAccessorsOnRestore(); err != nil {
   443  		return err
   444  	}
   445  
   446  	// Cleanup orphaned Service Identity token accessors
   447  	if err := s.revokeSITokenAccessorsOnRestore(); err != nil {
   448  		return err
   449  	}
   450  
   451  	return nil
   452  }
   453  
   454  // replicateNamespaces is used to replicate namespaces from the authoritative
   455  // region to this region.
   456  func (s *Server) replicateNamespaces(stopCh chan struct{}) {
   457  	req := structs.NamespaceListRequest{
   458  		QueryOptions: structs.QueryOptions{
   459  			Region:     s.config.AuthoritativeRegion,
   460  			AllowStale: true,
   461  		},
   462  	}
   463  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
   464  	s.logger.Debug("starting namespace replication from authoritative region", "region", req.Region)
   465  
   466  START:
   467  	for {
   468  		select {
   469  		case <-stopCh:
   470  			return
   471  		default:
   472  		}
   473  
   474  		// Rate limit how often we attempt replication
   475  		limiter.Wait(context.Background())
   476  
   477  		// Fetch the list of namespaces
   478  		var resp structs.NamespaceListResponse
   479  		req.AuthToken = s.ReplicationToken()
   480  		err := s.forwardRegion(s.config.AuthoritativeRegion, "Namespace.ListNamespaces", &req, &resp)
   481  		if err != nil {
   482  			s.logger.Error("failed to fetch namespaces from authoritative region", "error", err)
   483  			goto ERR_WAIT
   484  		}
   485  
   486  		// Perform a two-way diff
   487  		delete, update := diffNamespaces(s.State(), req.MinQueryIndex, resp.Namespaces)
   488  
   489  		// Delete namespaces that should not exist
   490  		if len(delete) > 0 {
   491  			args := &structs.NamespaceDeleteRequest{
   492  				Namespaces: delete,
   493  			}
   494  			_, _, err := s.raftApply(structs.NamespaceDeleteRequestType, args)
   495  			if err != nil {
   496  				s.logger.Error("failed to delete namespaces", "error", err)
   497  				goto ERR_WAIT
   498  			}
   499  		}
   500  
   501  		// Fetch any outdated namespaces
   502  		var fetched []*structs.Namespace
   503  		if len(update) > 0 {
   504  			req := structs.NamespaceSetRequest{
   505  				Namespaces: update,
   506  				QueryOptions: structs.QueryOptions{
   507  					Region:        s.config.AuthoritativeRegion,
   508  					AuthToken:     s.ReplicationToken(),
   509  					AllowStale:    true,
   510  					MinQueryIndex: resp.Index - 1,
   511  				},
   512  			}
   513  			var reply structs.NamespaceSetResponse
   514  			if err := s.forwardRegion(s.config.AuthoritativeRegion, "Namespace.GetNamespaces", &req, &reply); err != nil {
   515  				s.logger.Error("failed to fetch namespaces from authoritative region", "error", err)
   516  				goto ERR_WAIT
   517  			}
   518  			for _, namespace := range reply.Namespaces {
   519  				fetched = append(fetched, namespace)
   520  			}
   521  		}
   522  
   523  		// Update local namespaces
   524  		if len(fetched) > 0 {
   525  			args := &structs.NamespaceUpsertRequest{
   526  				Namespaces: fetched,
   527  			}
   528  			_, _, err := s.raftApply(structs.NamespaceUpsertRequestType, args)
   529  			if err != nil {
   530  				s.logger.Error("failed to update namespaces", "error", err)
   531  				goto ERR_WAIT
   532  			}
   533  		}
   534  
   535  		// Update the minimum query index; the next fetch blocks until there is a change.
   536  		req.MinQueryIndex = resp.Index
   537  	}
   538  
   539  ERR_WAIT:
   540  	select {
   541  	case <-time.After(s.config.ReplicationBackoff):
   542  		goto START
   543  	case <-stopCh:
   544  		return
   545  	}
   546  }
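
// exampleReplicationIndexHandoff is an illustrative sketch (not part of the
// original source; fetch is a placeholder) of the long-poll index handoff used
// above: each response index becomes the next request's MinQueryIndex, so the
// forwarded list RPC blocks on the authoritative region until its data changes
// past that index.
func exampleReplicationIndexHandoff(stopCh chan struct{}, fetch func(minIndex uint64) (uint64, error)) {
	var minIndex uint64
	for {
		select {
		case <-stopCh:
			return
		default:
		}
		index, err := fetch(minIndex)
		if err != nil {
			// The real loop backs off (ReplicationBackoff) before retrying.
			continue
		}
		minIndex = index
	}
}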
   547  
   548  func (s *Server) handlePausableWorkers(isLeader bool) {
   549  	for _, w := range s.pausableWorkers() {
   550  		if isLeader {
   551  			w.Pause()
   552  		} else {
   553  			w.Resume()
   554  		}
   555  	}
   556  }
   557  
   558  // diffNamespaces is used to perform a two-way diff between the local namespaces
   559  // and the remote namespaces to determine which namespaces need to be deleted or
   560  // updated.
   561  func diffNamespaces(state *state.StateStore, minIndex uint64, remoteList []*structs.Namespace) (delete []string, update []string) {
   562  	// Construct a set of the local and remote namespaces
   563  	local := make(map[string][]byte)
   564  	remote := make(map[string]struct{})
   565  
   566  	// Add all the local namespaces
   567  	iter, err := state.Namespaces(nil)
   568  	if err != nil {
   569  		panic("failed to iterate local namespaces")
   570  	}
   571  	for {
   572  		raw := iter.Next()
   573  		if raw == nil {
   574  			break
   575  		}
   576  		namespace := raw.(*structs.Namespace)
   577  		local[namespace.Name] = namespace.Hash
   578  	}
   579  
   580  	// Iterate over the remote namespaces
   581  	for _, rns := range remoteList {
   582  		remote[rns.Name] = struct{}{}
   583  
   584  		// Check if the namespace is missing locally
   585  		if localHash, ok := local[rns.Name]; !ok {
   586  			update = append(update, rns.Name)
   587  
   588  			// Check if the namespace is newer remotely and there is a hash
   589  			// mismatch.
   590  		} else if rns.ModifyIndex > minIndex && !bytes.Equal(localHash, rns.Hash) {
   591  			update = append(update, rns.Name)
   592  		}
   593  	}
   594  
   595  	// Check if namespaces should be deleted
   596  	for lns := range local {
   597  		if _, ok := remote[lns]; !ok {
   598  			delete = append(delete, lns)
   599  		}
   600  	}
   601  	return
   602  }
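
// exampleTwoWayDiff is an illustrative sketch (not part of the original
// source) of the diff semantics above, using plain maps keyed by name with
// content hashes as values: remote entries missing locally, or whose hash
// differs, are marked for update, while local entries absent from the remote
// are marked for deletion. (The real diff additionally skips entries whose
// remote ModifyIndex is not greater than minIndex.)
func exampleTwoWayDiff(local, remote map[string][]byte) (del, update []string) {
	for name, remoteHash := range remote {
		if localHash, ok := local[name]; !ok || !bytes.Equal(localHash, remoteHash) {
			update = append(update, name)
		}
	}
	for name := range local {
		if _, ok := remote[name]; !ok {
			del = append(del, name)
		}
	}
	return del, update
}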
   603  
   604  // replicateNodePools is used to replicate node pools from the authoritative
   605  // region to this region.
   606  func (s *Server) replicateNodePools(stopCh chan struct{}) {
   607  	req := structs.NodePoolListRequest{
   608  		QueryOptions: structs.QueryOptions{
   609  			Region:     s.config.AuthoritativeRegion,
   610  			AllowStale: true,
   611  		},
   612  	}
   613  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
   614  	s.logger.Debug("starting node pool replication from authoritative region", "region", req.Region)
   615  
   616  	for {
   617  		select {
   618  		case <-stopCh:
   619  			return
   620  		default:
   621  		}
   622  
   623  		// Rate limit how often we attempt replication
   624  		limiter.Wait(context.Background())
   625  
   626  		if !ServersMeetMinimumVersion(
   627  			s.serf.Members(), s.Region(), minNodePoolsVersion, true) {
   628  			s.logger.Trace(
   629  				"all servers must be upgraded to 1.6.0 before Node Pools can be replicated")
   630  			if s.replicationBackoffContinue(stopCh) {
   631  				continue
   632  			} else {
   633  				return
   634  			}
   635  		}
   636  
   637  		var resp structs.NodePoolListResponse
   638  		req.AuthToken = s.ReplicationToken()
   639  		err := s.forwardRegion(s.config.AuthoritativeRegion, "NodePool.List", &req, &resp)
   640  		if err != nil {
   641  			s.logger.Error("failed to fetch node pools from authoritative region", "error", err)
   642  			if s.replicationBackoffContinue(stopCh) {
   643  				continue
   644  			} else {
   645  				return
   646  			}
   647  		}
   648  
   649  		// Perform a two-way diff
   650  		delete, update := diffNodePools(s.State(), req.MinQueryIndex, resp.NodePools)
   651  
   652  		// A significant amount of time could have passed since the last check
   653  		// on whether we should stop the replication process. Therefore, do
   654  		// another check here, before calling Raft.
   655  		select {
   656  		case <-stopCh:
   657  			return
   658  		default:
   659  		}
   660  
   661  		// Delete node pools that should not exist
   662  		if len(delete) > 0 {
   663  			args := &structs.NodePoolDeleteRequest{
   664  				Names: delete,
   665  			}
   666  			_, _, err := s.raftApply(structs.NodePoolDeleteRequestType, args)
   667  			if err != nil {
   668  				s.logger.Error("failed to delete node pools", "error", err)
   669  				if s.replicationBackoffContinue(stopCh) {
   670  					continue
   671  				} else {
   672  					return
   673  				}
   674  			}
   675  		}
   676  
   677  		// Update local node pools
   678  		if len(update) > 0 {
   679  			args := &structs.NodePoolUpsertRequest{
   680  				NodePools: update,
   681  			}
   682  			_, _, err := s.raftApply(structs.NodePoolUpsertRequestType, args)
   683  			if err != nil {
   684  				s.logger.Error("failed to update node pools", "error", err)
   685  				if s.replicationBackoffContinue(stopCh) {
   686  					continue
   687  				} else {
   688  					return
   689  				}
   690  			}
   691  		}
   692  
   693  		// Update the minimum query index; the next fetch blocks until there is a change.
   694  		req.MinQueryIndex = resp.Index
   695  	}
   696  }
   697  
   698  // diffNodePools is used to perform a two-way diff between the local node pools
   699  // and the remote node pools to determine which node pools need to be deleted or
   700  // updated.
   701  func diffNodePools(store *state.StateStore, minIndex uint64, remoteList []*structs.NodePool) (delete []string, update []*structs.NodePool) {
   702  	// Construct a set of the local and remote node pools
   703  	local := make(map[string][]byte)
   704  	remote := make(map[string]struct{})
   705  
   706  	// Add all the local node pools
   707  	iter, err := store.NodePools(nil, state.SortDefault)
   708  	if err != nil {
   709  		panic("failed to iterate local node pools")
   710  	}
   711  	for {
   712  		raw := iter.Next()
   713  		if raw == nil {
   714  			break
   715  		}
   716  		pool := raw.(*structs.NodePool)
   717  		local[pool.Name] = pool.Hash
   718  	}
   719  
   720  	for _, rnp := range remoteList {
   721  		remote[rnp.Name] = struct{}{}
   722  
   723  		if localHash, ok := local[rnp.Name]; !ok {
   724  			// Node pools that are missing locally should be added
   725  			update = append(update, rnp)
   726  
   727  		} else if rnp.ModifyIndex > minIndex && !bytes.Equal(localHash, rnp.Hash) {
   728  			// Node pools that have been added/updated more recently than the
   729  			// last index we saw, and have a hash mismatch with what we have
   730  			// locally, should be updated.
   731  			update = append(update, rnp)
   732  		}
   733  	}
   734  
   735  	// Node pools that don't exist on the remote should be deleted
   736  	for lnp := range local {
   737  		if _, ok := remote[lnp]; !ok {
   738  			delete = append(delete, lnp)
   739  		}
   740  	}
   741  	return
   742  }
   743  
   744  // restoreEvals is used to restore pending evaluations into the eval broker and
   745  // blocked evaluations into the blocked eval tracker. The broker and blocked
   746  // eval tracker are maintained only by the leader, so they must be restored anytime
   747  // a leadership transition takes place.
   748  func (s *Server) restoreEvals() error {
   749  	// Get an iterator over every evaluation
   750  	ws := memdb.NewWatchSet()
   751  	iter, err := s.fsm.State().Evals(ws, false)
   752  	if err != nil {
   753  		return fmt.Errorf("failed to get evaluations: %v", err)
   754  	}
   755  
   756  	for {
   757  		raw := iter.Next()
   758  		if raw == nil {
   759  			break
   760  		}
   761  		eval := raw.(*structs.Evaluation)
   762  
   763  		if eval.ShouldEnqueue() {
   764  			s.evalBroker.Enqueue(eval)
   765  		} else if eval.ShouldBlock() {
   766  			s.blockedEvals.Block(eval)
   767  		}
   768  	}
   769  	return nil
   770  }
   771  
   772  // revokeVaultAccessorsOnRestore is used to revoke Vault token accessors on
   773  // behalf of allocations and nodes that are now terminal.
   774  func (s *Server) revokeVaultAccessorsOnRestore() error {
   775  	// An accessor should be revoked if its allocation or node is terminal
   776  	ws := memdb.NewWatchSet()
   777  	state := s.fsm.State()
   778  	iter, err := state.VaultAccessors(ws)
   779  	if err != nil {
   780  		return fmt.Errorf("failed to get vault accessors: %v", err)
   781  	}
   782  
   783  	var revoke []*structs.VaultAccessor
   784  	for {
   785  		raw := iter.Next()
   786  		if raw == nil {
   787  			break
   788  		}
   789  
   790  		va := raw.(*structs.VaultAccessor)
   791  
   792  		// Check the allocation
   793  		alloc, err := state.AllocByID(ws, va.AllocID)
   794  		if err != nil {
   795  			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
   796  		}
   797  		if alloc == nil || alloc.Terminated() {
   798  			// No longer running and should be revoked
   799  			revoke = append(revoke, va)
   800  			continue
   801  		}
   802  
   803  		// Check the node
   804  		node, err := state.NodeByID(ws, va.NodeID)
   805  		if err != nil {
   806  			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
   807  		}
   808  		if node == nil || node.TerminalStatus() {
   809  			// Node is terminal so any accessor from it should be revoked
   810  			revoke = append(revoke, va)
   811  			continue
   812  		}
   813  	}
   814  
   815  	if len(revoke) != 0 {
   816  		s.logger.Info("revoking vault accessors after becoming leader", "accessors", len(revoke))
   817  
   818  		if err := s.vault.MarkForRevocation(revoke); err != nil {
   819  			return fmt.Errorf("failed to revoke tokens: %v", err)
   820  		}
   821  	}
   822  
   823  	return nil
   824  }
   825  
   826  // revokeSITokenAccessorsOnRestore is used to revoke Service Identity token
   827  // accessors on behalf of allocs that are now gone / terminal.
   828  func (s *Server) revokeSITokenAccessorsOnRestore() error {
   829  	ws := memdb.NewWatchSet()
   830  	fsmState := s.fsm.State()
   831  	iter, err := fsmState.SITokenAccessors(ws)
   832  	if err != nil {
   833  		return fmt.Errorf("failed to get SI token accessors: %w", err)
   834  	}
   835  
   836  	var toRevoke []*structs.SITokenAccessor
   837  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
   838  		accessor := raw.(*structs.SITokenAccessor)
   839  
   840  		// Check the allocation
   841  		alloc, err := fsmState.AllocByID(ws, accessor.AllocID)
   842  		if err != nil {
   843  			return fmt.Errorf("failed to lookup alloc %q: %w", accessor.AllocID, err)
   844  		}
   845  		if alloc == nil || alloc.Terminated() {
   846  			// no longer running and associated accessors should be revoked
   847  			toRevoke = append(toRevoke, accessor)
   848  			continue
   849  		}
   850  
   851  		// Check the node
   852  		node, err := fsmState.NodeByID(ws, accessor.NodeID)
   853  		if err != nil {
   854  			return fmt.Errorf("failed to lookup node %q: %w", accessor.NodeID, err)
   855  		}
   856  		if node == nil || node.TerminalStatus() {
   857  			// node is terminal and associated accessors should be revoked
   858  			toRevoke = append(toRevoke, accessor)
   859  			continue
   860  		}
   861  	}
   862  
   863  	if len(toRevoke) > 0 {
   864  		s.logger.Info("revoking consul accessors after becoming leader", "accessors", len(toRevoke))
   865  		s.consulACLs.MarkForRevocation(toRevoke)
   866  	}
   867  
   868  	return nil
   869  }
   870  
   871  // restorePeriodicDispatcher is used to restore all periodic jobs into the
   872  // periodic dispatcher. It also determines whether any periodic jobs should have
   873  // launched during the leadership transition and force-runs them. The periodic
   874  // dispatcher is maintained only by the leader, so it must be restored anytime a
   875  // leadership transition takes place.
   876  func (s *Server) restorePeriodicDispatcher() error {
   877  	logger := s.logger.Named("periodic")
   878  	ws := memdb.NewWatchSet()
   879  	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
   880  	if err != nil {
   881  		return fmt.Errorf("failed to get periodic jobs: %v", err)
   882  	}
   883  
   884  	now := time.Now()
   885  	for i := iter.Next(); i != nil; i = iter.Next() {
   886  		job := i.(*structs.Job)
   887  
   888  		// We skip adding parameterized jobs because they themselves aren't
   889  		// tracked, only the dispatched children are.
   890  		if job.IsParameterized() {
   891  			continue
   892  		}
   893  
   894  		if err := s.periodicDispatcher.Add(job); err != nil {
   895  			logger.Error("failed to add job to periodic dispatcher", "error", err)
   896  			continue
   897  		}
   898  
   899  		// We do not need to force run the job since it isn't active.
   900  		if !job.IsPeriodicActive() {
   901  			continue
   902  		}
   903  
   904  		// If the periodic job has never been launched before, launch will hold
   905  		// the time the periodic job was added. Otherwise it has the last launch
   906  		// time of the periodic job.
   907  		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
   908  		if err != nil {
   909  			return fmt.Errorf("failed to get periodic launch time: %v", err)
   910  		}
   911  		if launch == nil {
   912  			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
   913  				job.ID, job.Namespace)
   914  		}
   915  
   916  		// nextLaunch is the next launch that should occur.
   917  		nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
   918  		if err != nil {
   919  			logger.Error("failed to determine next periodic launch for job", "job", job.NamespacedID(), "error", err)
   920  			continue
   921  		}
   922  
   923  		// We skip force launching the job if there should be no next launch
   924  		// (the zero case) or if the next launch time is in the future. If it is
   925  		// in the future, it will be handled by the periodic dispatcher.
   926  		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
   927  			continue
   928  		}
   929  
   930  		// We skip if the job doesn't allow overlap and there are already
   931  		// instances running
   932  		allowed, err := s.cronJobOverlapAllowed(job)
   933  		if err != nil {
   934  			return fmt.Errorf("failed to get job status: %v", err)
   935  		}
   936  		if !allowed {
   937  			continue
   938  		}
   939  
   940  		if _, err := s.periodicDispatcher.ForceEval(job.Namespace, job.ID); err != nil {
   941  			logger.Error("force run of periodic job failed", "job", job.NamespacedID(), "error", err)
   942  			return fmt.Errorf("force run of periodic job %q failed: %v", job.NamespacedID(), err)
   943  		}
   944  
   945  		logger.Debug("periodic job force run during leadership establishment", "job", job.NamespacedID())
   946  	}
   947  
   948  	return nil
   949  }
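
// exampleShouldForceLaunch is an illustrative sketch (not part of the original
// source) of part of the force-run decision above (before the overlap check):
// a restored periodic job is only force-run when a next launch exists
// (non-zero) and that launch time is already in the past at restore time.
func exampleShouldForceLaunch(nextLaunch, now time.Time) bool {
	return !nextLaunch.IsZero() && nextLaunch.Before(now)
}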
   950  
   951  // cronJobOverlapAllowed checks whether the job allows overlap and whether there are
   952  // already instances of the job running, in order to determine if a new evaluation
   953  // needs to be created when the periodic dispatcher is restored.
   954  func (s *Server) cronJobOverlapAllowed(job *structs.Job) (bool, error) {
   955  	if job.Periodic.ProhibitOverlap {
   956  		running, err := s.periodicDispatcher.dispatcher.RunningChildren(job)
   957  		if err != nil {
   958  			return false, fmt.Errorf("failed to determine whether periodic job %q has running children: %q", job.NamespacedID(), err)
   959  		}
   960  
   961  		if running {
   962  			return false, nil
   963  		}
   964  	}
   965  
   966  	return true, nil
   967  }
   968  
   969  // schedulePeriodic is used to periodically enqueue core jobs (garbage collection and similar maintenance) while we are the leader
   970  func (s *Server) schedulePeriodic(stopCh chan struct{}) {
   971  	evalGC := time.NewTicker(s.config.EvalGCInterval)
   972  	defer evalGC.Stop()
   973  	nodeGC := time.NewTicker(s.config.NodeGCInterval)
   974  	defer nodeGC.Stop()
   975  	jobGC := time.NewTicker(s.config.JobGCInterval)
   976  	defer jobGC.Stop()
   977  	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
   978  	defer deploymentGC.Stop()
   979  	csiPluginGC := time.NewTicker(s.config.CSIPluginGCInterval)
   980  	defer csiPluginGC.Stop()
   981  	csiVolumeClaimGC := time.NewTicker(s.config.CSIVolumeClaimGCInterval)
   982  	defer csiVolumeClaimGC.Stop()
   983  	oneTimeTokenGC := time.NewTicker(s.config.OneTimeTokenGCInterval)
   984  	defer oneTimeTokenGC.Stop()
   985  	rootKeyGC := time.NewTicker(s.config.RootKeyGCInterval)
   986  	defer rootKeyGC.Stop()
   987  	variablesRekey := time.NewTicker(s.config.VariablesRekeyInterval)
   988  	defer variablesRekey.Stop()
   989  
   990  	// Set up the expired ACL local token garbage collection timer.
   991  	localTokenExpiredGC, localTokenExpiredGCStop := helper.NewSafeTimer(s.config.ACLTokenExpirationGCInterval)
   992  	defer localTokenExpiredGCStop()
   993  
   994  	for {
   995  
   996  		select {
   997  		case <-evalGC.C:
   998  			if index, ok := s.getLatestIndex(); ok {
   999  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
  1000  			}
  1001  		case <-nodeGC.C:
  1002  			if index, ok := s.getLatestIndex(); ok {
  1003  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
  1004  			}
  1005  		case <-jobGC.C:
  1006  			if index, ok := s.getLatestIndex(); ok {
  1007  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
  1008  			}
  1009  		case <-deploymentGC.C:
  1010  			if index, ok := s.getLatestIndex(); ok {
  1011  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
  1012  			}
  1013  		case <-csiPluginGC.C:
  1014  			if index, ok := s.getLatestIndex(); ok {
  1015  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobCSIPluginGC, index))
  1016  			}
  1017  		case <-csiVolumeClaimGC.C:
  1018  			if index, ok := s.getLatestIndex(); ok {
  1019  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobCSIVolumeClaimGC, index))
  1020  			}
  1021  		case <-oneTimeTokenGC.C:
  1022  			if !ServersMeetMinimumVersion(s.Members(), s.Region(), minOneTimeAuthenticationTokenVersion, false) {
  1023  				continue
  1024  			}
  1025  
  1026  			if index, ok := s.getLatestIndex(); ok {
  1027  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobOneTimeTokenGC, index))
  1028  			}
  1029  		case <-localTokenExpiredGC.C:
  1030  			if index, ok := s.getLatestIndex(); ok {
  1031  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobLocalTokenExpiredGC, index))
  1032  			}
  1033  			localTokenExpiredGC.Reset(s.config.ACLTokenExpirationGCInterval)
  1034  		case <-rootKeyGC.C:
  1035  			if index, ok := s.getLatestIndex(); ok {
  1036  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index))
  1037  			}
  1038  		case <-variablesRekey.C:
  1039  			if index, ok := s.getLatestIndex(); ok {
  1040  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobVariablesRekey, index))
  1041  			}
  1042  		case <-stopCh:
  1043  			return
  1044  		}
  1045  	}
  1046  }
  1047  
  1048  // schedulePeriodicAuthoritative is a long-lived routine intended for use on
  1049  // the leader within the authoritative region only. It periodically queues work
  1050  // onto the _core scheduler for ACL based activities such as removing expired
  1051  // global ACL tokens.
  1052  func (s *Server) schedulePeriodicAuthoritative(stopCh chan struct{}) {
  1053  
  1054  	// Set up the expired ACL global token garbage collection timer.
  1055  	globalTokenExpiredGC, globalTokenExpiredGCStop := helper.NewSafeTimer(s.config.ACLTokenExpirationGCInterval)
  1056  	defer globalTokenExpiredGCStop()
  1057  
  1058  	for {
  1059  		select {
  1060  		case <-globalTokenExpiredGC.C:
  1061  			if index, ok := s.getLatestIndex(); ok {
  1062  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobGlobalTokenExpiredGC, index))
  1063  			}
  1064  			globalTokenExpiredGC.Reset(s.config.ACLTokenExpirationGCInterval)
  1065  		case <-stopCh:
  1066  			return
  1067  		}
  1068  	}
  1069  }
  1070  
  1071  // getLatestIndex is a helper function which returns the latest index from the
  1072  // state store. The boolean return indicates whether the call has been
  1073  // successful or not.
  1074  func (s *Server) getLatestIndex() (uint64, bool) {
  1075  	snapshotIndex, err := s.fsm.State().LatestIndex()
  1076  	if err != nil {
  1077  		s.logger.Error("failed to determine state store's index", "error", err)
  1078  		return 0, false
  1079  	}
  1080  	return snapshotIndex, true
  1081  }
  1082  
  1083  // coreJobEval returns an evaluation for a core job
  1084  func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
  1085  	return &structs.Evaluation{
  1086  		ID:          uuid.Generate(),
  1087  		Namespace:   "-",
  1088  		Priority:    structs.CoreJobPriority,
  1089  		Type:        structs.JobTypeCore,
  1090  		TriggeredBy: structs.EvalTriggerScheduled,
  1091  		JobID:       job,
  1092  		LeaderACL:   s.getLeaderAcl(),
  1093  		Status:      structs.EvalStatusPending,
  1094  		ModifyIndex: modifyIndex,
  1095  	}
  1096  }
  1097  
  1098  // reapFailedEvaluations is used to reap evaluations that
  1099  // have reached their delivery limit and should be failed
  1100  func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
  1101  	for {
  1102  		select {
  1103  		case <-stopCh:
  1104  			return
  1105  		default:
  1106  			// Scan for a failed evaluation
  1107  			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
  1108  			if err != nil {
  1109  				return
  1110  			}
  1111  			if eval == nil {
  1112  				continue
  1113  			}
  1114  
  1115  			// Update the status to failed
  1116  			updateEval := eval.Copy()
  1117  			updateEval.Status = structs.EvalStatusFailed
  1118  			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
  1119  			s.logger.Warn("eval reached delivery limit, marking as failed",
  1120  				"eval", hclog.Fmt("%#v", updateEval))
  1121  
  1122  			// Core job evals that fail or span leader elections will never
  1123  			// succeed because the follow-up doesn't have the leader ACL. We
  1124  			// rely on the leader to schedule new core jobs periodically
  1125  			// instead.
  1126  			if eval.Type != structs.JobTypeCore {
  1127  
  1128  				// Create a follow-up evaluation that will be used to retry the
  1129  				// scheduling for the job after the cluster is hopefully more stable
  1130  				// due to the fairly large backoff.
  1131  				followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
  1132  					time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
  1133  
  1134  				followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
  1135  				updateEval.NextEval = followupEval.ID
  1136  				updateEval.UpdateModifyTime()
  1137  
  1138  				// Update via Raft
  1139  				req := structs.EvalUpdateRequest{
  1140  					Evals: []*structs.Evaluation{updateEval, followupEval},
  1141  				}
  1142  				if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
  1143  					s.logger.Error("failed to update failed eval and create a follow-up",
  1144  						"eval", hclog.Fmt("%#v", updateEval), "error", err)
  1145  					continue
  1146  				}
  1147  			}
  1148  			// Ack completion
  1149  			s.evalBroker.Ack(eval.ID, token)
  1150  		}
  1151  	}
  1152  }
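
// exampleFollowupDelay is an illustrative sketch (not part of the original
// source) of the jittered wait computed above for follow-up evaluations: a
// fixed baseline plus a random offset inside the configured range, so that
// evals failed under contention do not all re-enter the schedulers at once.
// delayRange must be positive, since rand.Int63n panics on n <= 0.
func exampleFollowupDelay(baseline, delayRange time.Duration) time.Duration {
	return baseline + time.Duration(rand.Int63n(int64(delayRange)))
}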
  1153  
  1154  // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations, which
  1155  // should be cancelled.
  1156  func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
  1157  	for {
  1158  		select {
  1159  		case <-stopCh:
  1160  			return
  1161  		default:
  1162  			// Scan for duplicate blocked evals.
  1163  			dups := s.blockedEvals.GetDuplicates(time.Second)
  1164  			if dups == nil {
  1165  				continue
  1166  			}
  1167  
  1168  			cancel := make([]*structs.Evaluation, len(dups))
  1169  			for i, dup := range dups {
  1170  				// Update the status to cancelled
  1171  				newEval := dup.Copy()
  1172  				newEval.Status = structs.EvalStatusCancelled
  1173  				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
  1174  				newEval.UpdateModifyTime()
  1175  				cancel[i] = newEval
  1176  			}
  1177  
  1178  			// Update via Raft
  1179  			req := structs.EvalUpdateRequest{
  1180  				Evals: cancel,
  1181  			}
  1182  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
  1183  				s.logger.Error("failed to update duplicate evals", "evals", hclog.Fmt("%#v", cancel), "error", err)
  1184  				continue
  1185  			}
  1186  		}
  1187  	}
  1188  }
  1189  
  1190  // reapCancelableEvaluations is used to reap evaluations that were marked
  1191  // cancelable by the eval broker and should be canceled. These get swept up
  1192  // whenever an eval Acks, but this ensures that we don't have a straggling batch
  1193  // when the cluster doesn't have any more work to do. Returns a wake-up channel
  1194  // that can be used to trigger a new reap without waiting for the timer
  1195  func (s *Server) reapCancelableEvaluations(stopCh chan struct{}) chan struct{} {
  1196  
  1197  	wakeCh := make(chan struct{}, 1)
  1198  	go func() {
  1199  
  1200  		timer, cancel := helper.NewSafeTimer(s.config.EvalReapCancelableInterval)
  1201  		defer cancel()
  1202  		for {
  1203  			select {
  1204  			case <-stopCh:
  1205  				return
  1206  			case <-wakeCh:
  1207  				cancelCancelableEvals(s)
  1208  			case <-timer.C:
  1209  				cancelCancelableEvals(s)
  1210  				timer.Reset(s.config.EvalReapCancelableInterval)
  1211  			}
  1212  		}
  1213  	}()
  1214  
  1215  	return wakeCh
  1216  }
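
// exampleWakeCancelableReaper is an illustrative sketch (not part of the
// original source) of how a caller can use the returned wake channel: a
// non-blocking send triggers an immediate reap; if a wake-up is already
// pending in the one-slot buffer, the send is simply dropped.
func exampleWakeCancelableReaper(wakeCh chan struct{}) {
	select {
	case wakeCh <- struct{}{}:
	default:
	}
}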
  1217  
  1218  const cancelableEvalsBatchSize = 728 // structs.MaxUUIDsPerWriteRequest / 10
  1219  
  1220  // cancelCancelableEvals pulls a batch of cancelable evaluations from the eval
  1221  // broker and updates their status to canceled.
  1222  func cancelCancelableEvals(srv *Server) error {
  1223  
  1224  	const cancelDesc = "canceled after more recent eval was processed"
  1225  
  1226  	// We *can* send larger raft logs but rough benchmarks show that a smaller
  1227  	// page size strikes a balance between throughput and time we block the FSM
  1228  	// apply for other operations
  1229  	cancelable := srv.evalBroker.Cancelable(cancelableEvalsBatchSize)
  1230  	if len(cancelable) > 0 {
  1231  		for i, eval := range cancelable {
  1232  			eval = eval.Copy()
  1233  			eval.Status = structs.EvalStatusCancelled
  1234  			eval.StatusDescription = cancelDesc
  1235  			eval.UpdateModifyTime()
  1236  			cancelable[i] = eval
  1237  		}
  1238  
  1239  		update := &structs.EvalUpdateRequest{
  1240  			Evals:        cancelable,
  1241  			WriteRequest: structs.WriteRequest{Region: srv.Region()},
  1242  		}
  1243  		_, _, err := srv.raftApply(structs.EvalUpdateRequestType, update)
  1244  		if err != nil {
  1245  			srv.logger.Warn("eval cancel failed", "error", err, "method", "ack")
  1246  			return err
  1247  		}
  1248  	}
  1249  	return nil
  1250  }
  1251  
  1252  // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
  1253  func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
  1254  	ticker := time.NewTicker(failedEvalUnblockInterval)
  1255  	defer ticker.Stop()
  1256  	for {
  1257  		select {
  1258  		case <-stopCh:
  1259  			return
  1260  		case <-ticker.C:
  1261  			// Unblock the failed evaluations
  1262  			s.blockedEvals.UnblockFailed()
  1263  		}
  1264  	}
  1265  }
  1266  
  1267  // publishJobSummaryMetrics publishes the job summaries as metrics
  1268  func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
  1269  	timer := time.NewTimer(0)
  1270  	defer timer.Stop()
  1271  
  1272  	for {
  1273  		select {
  1274  		case <-stopCh:
  1275  			return
  1276  		case <-timer.C:
  1277  			timer.Reset(s.config.StatsCollectionInterval)
  1278  			state, err := s.State().Snapshot()
  1279  			if err != nil {
  1280  				s.logger.Error("failed to get state", "error", err)
  1281  				continue
  1282  			}
  1283  			ws := memdb.NewWatchSet()
  1284  			iter, err := state.JobSummaries(ws)
  1285  			if err != nil {
  1286  				s.logger.Error("failed to get job summaries", "error", err)
  1287  				continue
  1288  			}
  1289  
  1290  			for {
  1291  				raw := iter.Next()
  1292  				if raw == nil {
  1293  					break
  1294  				}
  1295  				summary := raw.(*structs.JobSummary)
  1296  				if s.config.DisableDispatchedJobSummaryMetrics {
  1297  					job, err := state.JobByID(ws, summary.Namespace, summary.JobID)
  1298  					if err != nil {
  1299  						s.logger.Error("error getting job for summary", "error", err)
  1300  						continue
  1301  					}
  1302  					if job.Dispatched {
  1303  						continue
  1304  					}
  1305  				}
  1306  				s.iterateJobSummaryMetrics(summary)
  1307  			}
  1308  		}
  1309  	}
  1310  }
  1311  
  1312  func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
  1313  	for name, tgSummary := range summary.Summary {
  1314  		labels := []metrics.Label{
  1315  			{
  1316  				Name:  "job",
  1317  				Value: summary.JobID,
  1318  			},
  1319  			{
  1320  				Name:  "task_group",
  1321  				Value: name,
  1322  			},
  1323  			{
  1324  				Name:  "namespace",
  1325  				Value: summary.Namespace,
  1326  			},
  1327  		}
  1328  
  1329  		if strings.Contains(summary.JobID, "/dispatch-") {
  1330  			jobInfo := strings.Split(summary.JobID, "/dispatch-")
  1331  			labels = append(labels, metrics.Label{
  1332  				Name:  "parent_id",
  1333  				Value: jobInfo[0],
  1334  			}, metrics.Label{
  1335  				Name:  "dispatch_id",
  1336  				Value: jobInfo[1],
  1337  			})
  1338  		}
  1339  
  1340  		if strings.Contains(summary.JobID, "/periodic-") {
  1341  			jobInfo := strings.Split(summary.JobID, "/periodic-")
  1342  			labels = append(labels, metrics.Label{
  1343  				Name:  "parent_id",
  1344  				Value: jobInfo[0],
  1345  			}, metrics.Label{
  1346  				Name:  "periodic_id",
  1347  				Value: jobInfo[1],
  1348  			})
  1349  		}
  1350  
  1351  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
  1352  			float32(tgSummary.Queued), labels)
  1353  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
  1354  			float32(tgSummary.Complete), labels)
  1355  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
  1356  			float32(tgSummary.Failed), labels)
  1357  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
  1358  			float32(tgSummary.Running), labels)
  1359  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
  1360  			float32(tgSummary.Starting), labels)
  1361  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
  1362  			float32(tgSummary.Lost), labels)
  1363  		metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "unknown"},
  1364  			float32(tgSummary.Unknown), labels)
  1365  	}
  1366  }
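
// exampleDispatchLabels is an illustrative sketch (not part of the original
// source) of the JobID parsing above: dispatched children carry IDs of the
// form "<parent>/dispatch-<id>", as the code above assumes, so splitting on
// "/dispatch-" yields the parent_id and dispatch_id label values (periodic
// children are handled the same way with "/periodic-").
func exampleDispatchLabels(jobID string) (parentID, dispatchID string, ok bool) {
	if !strings.Contains(jobID, "/dispatch-") {
		return "", "", false
	}
	parts := strings.Split(jobID, "/dispatch-")
	return parts[0], parts[1], true
}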
  1367  
  1368  // publishJobStatusMetrics publishes the job statuses as metrics
  1369  func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
  1370  	timer := time.NewTimer(0)
  1371  	defer timer.Stop()
  1372  
  1373  	for {
  1374  		select {
  1375  		case <-stopCh:
  1376  			return
  1377  		case <-timer.C:
  1378  			timer.Reset(s.config.StatsCollectionInterval)
  1379  			state, err := s.State().Snapshot()
  1380  			if err != nil {
  1381  				s.logger.Error("failed to get state", "error", err)
  1382  				continue
  1383  			}
  1384  			ws := memdb.NewWatchSet()
  1385  			iter, err := state.Jobs(ws)
  1386  			if err != nil {
  1387  				s.logger.Error("failed to get job statuses", "error", err)
  1388  				continue
  1389  			}
  1390  
  1391  			s.iterateJobStatusMetrics(&iter)
  1392  		}
  1393  	}
  1394  }
  1395  
  1396  func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) {
  1397  	var pending int64 // Sum of all jobs in 'pending' state
  1398  	var running int64 // Sum of all jobs in 'running' state
  1399  	var dead int64    // Sum of all jobs in 'dead' state
  1400  
  1401  	for {
  1402  		raw := (*jobs).Next()
  1403  		if raw == nil {
  1404  			break
  1405  		}
  1406  
  1407  		job := raw.(*structs.Job)
  1408  
  1409  		switch job.Status {
  1410  		case structs.JobStatusPending:
  1411  			pending++
  1412  		case structs.JobStatusRunning:
  1413  			running++
  1414  		case structs.JobStatusDead:
  1415  			dead++
  1416  		}
  1417  	}
  1418  
  1419  	metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending))
  1420  	metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running))
  1421  	metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead))
  1422  }
  1423  
  1424  // revokeLeadership is invoked once we step down as leader.
  1425  // This is used to clean up any state that may be specific to a leader.
  1426  func (s *Server) revokeLeadership() error {
  1427  	defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())
  1428  
  1429  	s.resetConsistentReadReady()
  1430  
  1431  	// Clear the leader token since we are no longer the leader.
  1432  	s.setLeaderAcl("")
  1433  
  1434  	// Disable autopilot
  1435  	s.autopilot.Stop()
  1436  
  1437  	// Disable the plan queue, since we are no longer leader
  1438  	s.planQueue.SetEnabled(false)
  1439  
  1440  	// Disable the eval broker and blocked evals. We do not need to check the
  1441  	// scheduler configuration paused eval broker value, as the brokers should
  1442  	// always be paused on the non-leader.
  1443  	s.brokerLock.Lock()
  1444  	s.evalBroker.SetEnabled(false)
  1445  	s.blockedEvals.SetEnabled(false)
  1446  	s.brokerLock.Unlock()
  1447  
  1448  	// Disable the periodic dispatcher, since it is only useful as a leader
  1449  	s.periodicDispatcher.SetEnabled(false)
  1450  
  1451  	// Disable the Vault client as it is only useful as a leader.
  1452  	s.vault.SetActive(false)
  1453  
  1454  	// Disable the deployment watcher as it is only useful as a leader.
  1455  	s.deploymentWatcher.SetEnabled(false, nil)
  1456  
  1457  	// Disable the node drainer
  1458  	s.nodeDrainer.SetEnabled(false, nil)
  1459  
  1460  	// Disable the volume watcher
  1461  	s.volumeWatcher.SetEnabled(false, nil, "")
  1462  
  1463  	// Disable any enterprise systems required.
  1464  	if err := s.revokeEnterpriseLeadership(); err != nil {
  1465  		return err
  1466  	}
  1467  
  1468  	// Clear the heartbeat timers on either shutdown or step down,
  1469  	// since we are no longer responsible for TTL expirations.
  1470  	if err := s.clearAllHeartbeatTimers(); err != nil {
  1471  		s.logger.Error("clearing heartbeat timers failed", "error", err)
  1472  		return err
  1473  	}
  1474  
  1475  	// Unpause our workers if we paused them previously
  1476  	s.handlePausableWorkers(false)
  1477  
  1478  	return nil
  1479  }
  1480  
  1481  // pausableWorkers returns a slice of the workers
  1482  // to pause on leader transitions.
  1483  //
  1484  // Upon leadership establishment, most of the workers are paused to free CPU
  1485  // for raft, the plan queue, and the evaluation broker.
  1486  func (s *Server) pausableWorkers() []*Worker {
  1487  	n := len(s.workers)
  1488  	if n <= 1 {
  1489  		return []*Worker{}
  1490  	}
  1491  
  1492  	// Disabling 3/4 of the workers frees CPU for raft and the plan
  1493  	// applier, which uses 1/2 the cores.
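        	// For example, with n=8 workers this returns workers[0:6], leaving two
        	// workers unpaused for scheduling.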
  1494  	return s.workers[:3*n/4]
  1495  }
  1496  
  1497  // reconcile is used to reconcile the differences between Serf
  1498  // membership and what is reflected in our strongly consistent store.
  1499  func (s *Server) reconcile() error {
  1500  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
  1501  	members := s.serf.Members()
  1502  	for _, member := range members {
  1503  		if err := s.reconcileMember(member); err != nil {
  1504  			return err
  1505  		}
  1506  	}
  1507  	return nil
  1508  }
  1509  
  1510  // reconcileMember is used to do an async reconcile of a single serf member
  1511  func (s *Server) reconcileMember(member serf.Member) error {
  1512  	// Check if this is a member we should handle
  1513  	valid, parts := isNomadServer(member)
  1514  	if !valid || parts.Region != s.config.Region {
  1515  		return nil
  1516  	}
  1517  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
  1518  
  1519  	var err error
  1520  	switch member.Status {
  1521  	case serf.StatusAlive:
  1522  		err = s.addRaftPeer(member, parts)
  1523  	case serf.StatusLeft, StatusReap:
  1524  		err = s.removeRaftPeer(member, parts)
  1525  	}
  1526  	if err != nil {
  1527  		s.logger.Error("failed to reconcile member", "member", member, "error", err)
  1528  		return err
  1529  	}
  1530  	return nil
  1531  }
  1532  
  1533  // addRaftPeer is used to add a new Raft peer when a Nomad server joins
  1534  func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
  1535  	// Check for possibility of multiple bootstrap nodes
  1536  	members := s.serf.Members()
  1537  	if parts.Bootstrap {
  1538  		for _, member := range members {
  1539  			valid, p := isNomadServer(member)
  1540  			if valid && member.Name != m.Name && p.Bootstrap {
  1541  				s.logger.Error("skipping adding Raft peer because an existing peer is in bootstrap mode and only one server should be in bootstrap mode",
  1542  					"existing_peer", member.Name, "joining_peer", m.Name)
  1543  				return nil
  1544  			}
  1545  		}
  1546  	}
  1547  
  1548  	// Processing ourselves could result in trying to remove ourselves to
  1549  	// fix up our address, which would make us step down. This is only
  1550  	// safe to attempt if there are multiple servers available.
  1551  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
  1552  	configFuture := s.raft.GetConfiguration()
  1553  	if err := configFuture.Error(); err != nil {
  1554  		s.logger.Error("failed to get raft configuration", "error", err)
  1555  		return err
  1556  	}
  1557  
  1558  	if m.Name == s.config.NodeName {
  1559  		if l := len(configFuture.Configuration().Servers); l < 3 {
  1560  			s.logger.Debug("skipping self join check for peer since the cluster is too small", "peer", m.Name)
  1561  			return nil
  1562  		}
  1563  	}
  1564  
  1565  	// See if it's already in the configuration. It's harmless to re-add it
  1566  	// but we want to avoid doing that if possible to prevent useless Raft
  1567  	// log entries. If the address is the same but the ID changed, remove the
  1568  	// old server before adding the new one.
  1569  	minRaftProtocol, err := s.MinRaftProtocol()
  1570  	if err != nil {
  1571  		return err
  1572  	}
  1573  	for _, server := range configFuture.Configuration().Servers {
  1574  		// No-op if the raft version is too low
  1575  		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
  1576  			return nil
  1577  		}
  1578  
  1579  		// If the address or ID matches an existing server, see if we need to remove the old one first
  1580  		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
  1581  			// Exit with no-op if this is being called on an existing server and both the ID and address match
  1582  			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
  1583  				return nil
  1584  			}
  1585  			future := s.raft.RemoveServer(server.ID, 0, 0)
  1586  			if server.Address == raft.ServerAddress(addr) {
  1587  				if err := future.Error(); err != nil {
  1588  					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
  1589  				}
  1590  				s.logger.Info("removed server with duplicate address", "address", server.Address)
  1591  			} else {
  1592  				if err := future.Error(); err != nil {
  1593  					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
  1594  				}
  1595  				s.logger.Info("removed server with duplicate ID", "id", server.ID)
  1596  			}
  1597  		}
  1598  	}
  1599  
  1600  	// Attempt to add as a peer
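        	// Raft protocol v3 servers are added as non-voters (autopilot can later
        	// promote them to voters); older protocol combinations add the server
        	// directly as a voting peer.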
  1601  	switch {
  1602  	case minRaftProtocol >= 3:
  1603  		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
  1604  		if err := addFuture.Error(); err != nil {
  1605  			s.logger.Error("failed to add raft peer", "error", err)
  1606  			return err
  1607  		}
  1608  	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
  1609  		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
  1610  		if err := addFuture.Error(); err != nil {
  1611  			s.logger.Error("failed to add raft peer", "error", err)
  1612  			return err
  1613  		}
  1614  	default:
  1615  		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
  1616  		if err := addFuture.Error(); err != nil {
  1617  			s.logger.Error("failed to add raft peer", "error", err)
  1618  			return err
  1619  		}
  1620  	}
  1621  
  1622  	return nil
  1623  }
  1624  
  1625  // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
  1626  // or is reaped
  1627  func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
  1628  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
  1629  
  1630  	// See if it's already in the configuration. It's harmless to re-remove it
  1631  	// but we want to avoid doing that if possible to prevent useless Raft
  1632  	// log entries.
  1633  	configFuture := s.raft.GetConfiguration()
  1634  	if err := configFuture.Error(); err != nil {
  1635  		s.logger.Error("failed to get raft configuration", "error", err)
  1636  		return err
  1637  	}
  1638  
  1639  	minRaftProtocol, err := s.MinRaftProtocol()
  1640  	if err != nil {
  1641  		return err
  1642  	}
  1643  
  1644  	// Pick which remove API to use based on how the server was added.
  1645  	for _, server := range configFuture.Configuration().Servers {
  1646  		// Check if this is the server to remove based on how it was registered.
  1647  		// Raft v2 servers are registered by address.
  1648  		// Raft v3 servers are registered by ID.
  1649  		if server.ID == raft.ServerID(parts.ID) || server.Address == raft.ServerAddress(addr) {
  1650  			// Use the new add/remove APIs if we understand them.
  1651  			if minRaftProtocol >= 2 {
  1652  				s.logger.Info("removing server by ID", "id", server.ID)
  1653  				future := s.raft.RemoveServer(server.ID, 0, 0)
  1654  				if err := future.Error(); err != nil {
  1655  					s.logger.Error("failed to remove raft peer", "id", server.ID, "error", err)
  1656  					return err
  1657  				}
  1658  			} else {
  1659  				// If not, use the old remove API
  1660  				s.logger.Info("removing server by address", "address", server.Address)
  1661  				future := s.raft.RemovePeer(raft.ServerAddress(addr))
  1662  				if err := future.Error(); err != nil {
  1663  					s.logger.Error("failed to remove raft peer", "address", addr, "error", err)
  1664  					return err
  1665  				}
  1666  			}
  1667  			break
  1668  		}
  1669  	}
  1670  
  1671  	return nil
  1672  }
  1673  
  1674  // replicateACLPolicies is used to replicate ACL policies from
  1675  // the authoritative region to this region.
  1676  func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
  1677  	req := structs.ACLPolicyListRequest{
  1678  		QueryOptions: structs.QueryOptions{
  1679  			Region:     s.config.AuthoritativeRegion,
  1680  			AllowStale: true,
  1681  		},
  1682  	}
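        	// The limiter permits a burst of int(replicationRateLimit) requests and
        	// then sustains replicationRateLimit requests per second.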
  1683  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  1684  	s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region)
  1685  
  1686  START:
  1687  	for {
  1688  		select {
  1689  		case <-stopCh:
  1690  			return
  1691  		default:
  1692  			// Rate limit how often we attempt replication
  1693  			limiter.Wait(context.Background())
  1694  
  1695  			// Fetch the list of policies
  1696  			var resp structs.ACLPolicyListResponse
  1697  			req.AuthToken = s.ReplicationToken()
  1698  			err := s.forwardRegion(s.config.AuthoritativeRegion,
  1699  				"ACL.ListPolicies", &req, &resp)
  1700  			if err != nil {
  1701  				s.logger.Error("failed to fetch policies from authoritative region", "error", err)
  1702  				goto ERR_WAIT
  1703  			}
  1704  
  1705  			// Perform a two-way diff
  1706  			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)
  1707  
  1708  			// Delete policies that should not exist
  1709  			if len(delete) > 0 {
  1710  				args := &structs.ACLPolicyDeleteRequest{
  1711  					Names: delete,
  1712  				}
  1713  				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
  1714  				if err != nil {
  1715  					s.logger.Error("failed to delete policies", "error", err)
  1716  					goto ERR_WAIT
  1717  				}
  1718  			}
  1719  
  1720  			// Fetch any outdated policies
  1721  			var fetched []*structs.ACLPolicy
  1722  			if len(update) > 0 {
  1723  				req := structs.ACLPolicySetRequest{
  1724  					Names: update,
  1725  					QueryOptions: structs.QueryOptions{
  1726  						Region:        s.config.AuthoritativeRegion,
  1727  						AuthToken:     s.ReplicationToken(),
  1728  						AllowStale:    true,
  1729  						MinQueryIndex: resp.Index - 1,
  1730  					},
  1731  				}
  1732  				var reply structs.ACLPolicySetResponse
  1733  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
  1734  					"ACL.GetPolicies", &req, &reply); err != nil {
  1735  					s.logger.Error("failed to fetch policies from authoritative region", "error", err)
  1736  					goto ERR_WAIT
  1737  				}
  1738  				for _, policy := range reply.Policies {
  1739  					fetched = append(fetched, policy)
  1740  				}
  1741  			}
  1742  
  1743  			// Update local policies
  1744  			if len(fetched) > 0 {
  1745  				args := &structs.ACLPolicyUpsertRequest{
  1746  					Policies: fetched,
  1747  				}
  1748  				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
  1749  				if err != nil {
  1750  					s.logger.Error("failed to update policies", "error", err)
  1751  					goto ERR_WAIT
  1752  				}
  1753  			}
  1754  
  1755  			// Update the minimum query index; the next request blocks
  1756  			// until there is a change.
  1757  			req.MinQueryIndex = resp.Index
  1758  		}
  1759  	}
  1760  
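        // ERR_WAIT backs off for ReplicationBackoff before restarting the
        // replication loop from START, unless the server is shutting down.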
  1761  ERR_WAIT:
  1762  	select {
  1763  	case <-time.After(s.config.ReplicationBackoff):
  1764  		goto START
  1765  	case <-stopCh:
  1766  		return
  1767  	}
  1768  }
  1769  
  1770  // diffACLPolicies is used to perform a two-way diff between the local
  1771  // policies and the remote policies to determine which policies need to
  1772  // be deleted or updated.
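        // For example, a remote policy missing locally is returned in update, a
        // remote policy whose ModifyIndex exceeds minIndex with a differing hash is
        // also returned in update, and a local policy absent from the remote
        // listing is returned in delete.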
  1773  func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
  1774  	// Construct a set of the local and remote policies
  1775  	local := make(map[string][]byte)
  1776  	remote := make(map[string]struct{})
  1777  
  1778  	// Add all the local policies
  1779  	iter, err := state.ACLPolicies(nil)
  1780  	if err != nil {
  1781  		panic("failed to iterate local policies")
  1782  	}
  1783  	for {
  1784  		raw := iter.Next()
  1785  		if raw == nil {
  1786  			break
  1787  		}
  1788  		policy := raw.(*structs.ACLPolicy)
  1789  		local[policy.Name] = policy.Hash
  1790  	}
  1791  
  1792  	// Iterate over the remote policies
  1793  	for _, rp := range remoteList {
  1794  		remote[rp.Name] = struct{}{}
  1795  
  1796  		// Check if the policy is missing locally
  1797  		if localHash, ok := local[rp.Name]; !ok {
  1798  			update = append(update, rp.Name)
  1799  
  1800  			// Check if the policy is newer remotely and there is a hash mismatch.
  1801  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1802  			update = append(update, rp.Name)
  1803  		}
  1804  	}
  1805  
  1806  	// Check if policy should be deleted
  1807  	for lp := range local {
  1808  		if _, ok := remote[lp]; !ok {
  1809  			delete = append(delete, lp)
  1810  		}
  1811  	}
  1812  	return
  1813  }
  1814  
  1815  // replicateACLTokens is used to replicate global ACL tokens from
  1816  // the authoritative region to this region.
  1817  func (s *Server) replicateACLTokens(stopCh chan struct{}) {
  1818  	req := structs.ACLTokenListRequest{
  1819  		GlobalOnly: true,
  1820  		QueryOptions: structs.QueryOptions{
  1821  			Region:     s.config.AuthoritativeRegion,
  1822  			AllowStale: true,
  1823  		},
  1824  	}
  1825  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  1826  	s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region)
  1827  
  1828  START:
  1829  	for {
  1830  		select {
  1831  		case <-stopCh:
  1832  			return
  1833  		default:
  1834  			// Rate limit how often we attempt replication
  1835  			limiter.Wait(context.Background())
  1836  
  1837  			// Fetch the list of tokens
  1838  			var resp structs.ACLTokenListResponse
  1839  			req.AuthToken = s.ReplicationToken()
  1840  			err := s.forwardRegion(s.config.AuthoritativeRegion,
  1841  				"ACL.ListTokens", &req, &resp)
  1842  			if err != nil {
  1843  				s.logger.Error("failed to fetch tokens from authoritative region", "error", err)
  1844  				goto ERR_WAIT
  1845  			}
  1846  
  1847  			// Perform a two-way diff
  1848  			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)
  1849  
  1850  			// Delete tokens that should not exist
  1851  			if len(delete) > 0 {
  1852  				args := &structs.ACLTokenDeleteRequest{
  1853  					AccessorIDs: delete,
  1854  				}
  1855  				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
  1856  				if err != nil {
  1857  					s.logger.Error("failed to delete tokens", "error", err)
  1858  					goto ERR_WAIT
  1859  				}
  1860  			}
  1861  
  1862  			// Fetch any outdated tokens.
  1863  			var fetched []*structs.ACLToken
  1864  			if len(update) > 0 {
  1865  				req := structs.ACLTokenSetRequest{
  1866  					AccessorIDS: update,
  1867  					QueryOptions: structs.QueryOptions{
  1868  						Region:        s.config.AuthoritativeRegion,
  1869  						AuthToken:     s.ReplicationToken(),
  1870  						AllowStale:    true,
  1871  						MinQueryIndex: resp.Index - 1,
  1872  					},
  1873  				}
  1874  				var reply structs.ACLTokenSetResponse
  1875  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
  1876  					"ACL.GetTokens", &req, &reply); err != nil {
  1877  					s.logger.Error("failed to fetch tokens from authoritative region", "error", err)
  1878  					goto ERR_WAIT
  1879  				}
  1880  				for _, token := range reply.Tokens {
  1881  					fetched = append(fetched, token)
  1882  				}
  1883  			}
  1884  
  1885  			// Update local tokens
  1886  			if len(fetched) > 0 {
  1887  				args := &structs.ACLTokenUpsertRequest{
  1888  					Tokens: fetched,
  1889  				}
  1890  				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
  1891  				if err != nil {
  1892  					s.logger.Error("failed to update tokens", "error", err)
  1893  					goto ERR_WAIT
  1894  				}
  1895  			}
  1896  
  1897  			// Update the minimum query index; the next request blocks
  1898  			// until there is a change.
  1899  			req.MinQueryIndex = resp.Index
  1900  		}
  1901  	}
  1902  
  1903  ERR_WAIT:
  1904  	select {
  1905  	case <-time.After(s.config.ReplicationBackoff):
  1906  		goto START
  1907  	case <-stopCh:
  1908  		return
  1909  	}
  1910  }
  1911  
  1912  // diffACLTokens is used to perform a two-way diff between the local
  1913  // tokens and the remote tokens to determine which tokens need to
  1914  // be deleted or updated.
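        // The comparison mirrors diffACLPolicies, with tokens tracked by their
        // AccessorID and compared by hash.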
  1915  func diffACLTokens(store *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
  1916  	// Construct a set of the local and remote tokens
  1917  	local := make(map[string][]byte)
  1918  	remote := make(map[string]struct{})
  1919  
  1920  	// Add all the local global tokens
  1921  	iter, err := store.ACLTokensByGlobal(nil, true, state.SortDefault)
  1922  	if err != nil {
  1923  		panic("failed to iterate local tokens")
  1924  	}
  1925  	for {
  1926  		raw := iter.Next()
  1927  		if raw == nil {
  1928  			break
  1929  		}
  1930  		token := raw.(*structs.ACLToken)
  1931  		local[token.AccessorID] = token.Hash
  1932  	}
  1933  
  1934  	// Iterate over the remote tokens
  1935  	for _, rp := range remoteList {
  1936  		remote[rp.AccessorID] = struct{}{}
  1937  
  1938  		// Check if the token is missing locally
  1939  		if localHash, ok := local[rp.AccessorID]; !ok {
  1940  			update = append(update, rp.AccessorID)
  1941  
  1942  			// Check if the token is newer remotely and there is a hash mismatch.
  1943  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1944  			update = append(update, rp.AccessorID)
  1945  		}
  1946  	}
  1947  
  1948  	// Check if local token should be deleted
  1949  	for lp := range local {
  1950  		if _, ok := remote[lp]; !ok {
  1951  			delete = append(delete, lp)
  1952  		}
  1953  	}
  1954  	return
  1955  }
  1956  
  1957  // replicateACLRoles is used to replicate ACL Roles from the authoritative
  1958  // region to this region. The loop should only be run on the leader within the
  1959  // federated region.
  1960  func (s *Server) replicateACLRoles(stopCh chan struct{}) {
  1961  
  1962  	// Generate our request object. We only need to do this once and reuse it
  1963  	// for every RPC request. The MinQueryIndex is updated after every
  1964  	// successful replication loop, so the next query acts as a blocking query
  1965  	// and only returns upon a change in the authoritative region.
  1966  	req := structs.ACLRolesListRequest{
  1967  		QueryOptions: structs.QueryOptions{
  1968  			AllowStale: true,
  1969  			Region:     s.config.AuthoritativeRegion,
  1970  		},
  1971  	}
  1972  
  1973  	// Create our replication rate limiter for ACL roles and log a lovely
  1974  	// message to indicate the process is starting.
  1975  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  1976  	s.logger.Debug("starting ACL Role replication from authoritative region",
  1977  		"authoritative_region", req.Region)
  1978  
  1979  	// Enter the main ACL Role replication loop that will only exit when the
  1980  	// stopCh is closed.
  1981  	//
  1982  	// Any error encountered will use the replicationBackoffContinue function
  1983  	// which handles replication backoff and shutdown coordination in the event
  1984  	// of an error inside the loop.
  1985  	for {
  1986  		select {
  1987  		case <-stopCh:
  1988  			return
  1989  		default:
  1990  
  1991  			// Rate limit how often we attempt replication. It is OK to ignore
  1992  			// the error as the context will never be cancelled and the limit
  1993  			// parameters are controlled internally.
  1994  			_ = limiter.Wait(context.Background())
  1995  
  1996  			if !ServersMeetMinimumVersion(
  1997  				s.serf.Members(), s.Region(), minACLRoleVersion, true) {
  1998  				s.logger.Trace(
  1999  					"all servers must be upgraded to 1.4.0 or later before ACL Roles can be replicated")
  2000  				if s.replicationBackoffContinue(stopCh) {
  2001  					continue
  2002  				} else {
  2003  					return
  2004  				}
  2005  			}
  2006  
  2007  			// Set the replication token on each replication iteration so that
  2008  			// it is always current and can handle agent SIGHUP reloads.
  2009  			req.AuthToken = s.ReplicationToken()
  2010  
  2011  			var resp structs.ACLRolesListResponse
  2012  
  2013  			// Make the list RPC request to the authoritative region, so we
  2014  			// capture the latest ACL role listing.
  2015  			err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLListRolesRPCMethod, &req, &resp)
  2016  			if err != nil {
  2017  				s.logger.Error("failed to fetch ACL Roles from authoritative region", "error", err)
  2018  				if s.replicationBackoffContinue(stopCh) {
  2019  					continue
  2020  				} else {
  2021  					return
  2022  				}
  2023  			}
  2024  
  2025  			// Perform a two-way diff on the ACL roles.
  2026  			toDelete, toUpdate := diffACLRoles(s.State(), req.MinQueryIndex, resp.ACLRoles)
  2027  
  2028  			// A significant amount of time could have passed since the last
  2029  			// check on whether we should stop the replication process, so
  2030  			// check again here before calling Raft.
  2031  			select {
  2032  			case <-stopCh:
  2033  				return
  2034  			default:
  2035  			}
  2036  
  2037  			// If we have ACL roles to delete, make this call directly to Raft.
  2038  			if len(toDelete) > 0 {
  2039  				args := structs.ACLRolesDeleteByIDRequest{ACLRoleIDs: toDelete}
  2040  				_, _, err := s.raftApply(structs.ACLRolesDeleteByIDRequestType, &args)
  2041  
  2042  				// If the error was because we lost leadership while calling
  2043  				// Raft, avoid logging as this can be confusing to operators.
  2044  				if err != nil {
  2045  					if err != raft.ErrLeadershipLost {
  2046  						s.logger.Error("failed to delete ACL roles", "error", err)
  2047  					}
  2048  					if s.replicationBackoffContinue(stopCh) {
  2049  						continue
  2050  					} else {
  2051  						return
  2052  					}
  2053  				}
  2054  			}
  2055  
  2056  			// Fetch any outdated ACL roles.
  2057  			var fetched []*structs.ACLRole
  2058  			if len(toUpdate) > 0 {
  2059  				req := structs.ACLRolesByIDRequest{
  2060  					ACLRoleIDs: toUpdate,
  2061  					QueryOptions: structs.QueryOptions{
  2062  						Region:        s.config.AuthoritativeRegion,
  2063  						AuthToken:     s.ReplicationToken(),
  2064  						AllowStale:    true,
  2065  						MinQueryIndex: resp.Index - 1,
  2066  					},
  2067  				}
  2068  				var reply structs.ACLRolesByIDResponse
  2069  				if err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLGetRolesByIDRPCMethod, &req, &reply); err != nil {
  2070  					s.logger.Error("failed to fetch ACL Roles from authoritative region", "error", err)
  2071  					if s.replicationBackoffContinue(stopCh) {
  2072  						continue
  2073  					} else {
  2074  						return
  2075  					}
  2076  				}
  2077  				for _, aclRole := range reply.ACLRoles {
  2078  					fetched = append(fetched, aclRole)
  2079  				}
  2080  			}
  2081  
  2082  			// Update local ACL roles.
  2083  			if len(fetched) > 0 {
  2084  
  2085  				// The replication of ACL roles and policies is independent,
  2086  				// therefore we cannot ensure the policies linked within the
  2087  				// role are present. We must set AllowMissingPolicies to true.
  2088  				args := structs.ACLRolesUpsertRequest{
  2089  					ACLRoles:             fetched,
  2090  					AllowMissingPolicies: true,
  2091  				}
  2092  
  2093  				// Perform the upsert directly via Raft.
  2094  				_, _, err := s.raftApply(structs.ACLRolesUpsertRequestType, &args)
  2095  				if err != nil {
  2096  					s.logger.Error("failed to update ACL roles", "error", err)
  2097  					if s.replicationBackoffContinue(stopCh) {
  2098  						continue
  2099  					} else {
  2100  						return
  2101  					}
  2102  				}
  2103  			}
  2104  
  2105  			// Update the minimum query index; the next request blocks until there is a change.
  2106  			req.MinQueryIndex = resp.Index
  2107  		}
  2108  	}
  2109  }
  2110  
  2111  // diffACLRoles is used to perform a two-way diff between the local ACL Roles
  2112  // and the remote roles to determine which roles need to be deleted or
  2113  // updated. The returned slices contain ACL Role IDs.
  2114  func diffACLRoles(
  2115  	store *state.StateStore, minIndex uint64, remoteList []*structs.ACLRoleListStub) (
  2116  	delete []string, update []string) {
  2117  
  2118  	// The local ACL role tracking is keyed by the role ID and the value is the
  2119  	// hash of the role.
  2120  	local := make(map[string][]byte)
  2121  
  2122  	// The remote ACL role tracking is keyed by the role ID; the value is an
  2123  	// empty struct as we already have the full object.
  2124  	remote := make(map[string]struct{})
  2125  
  2126  	// Read all the ACL roles currently held within our local state. The panic
  2127  	// below will only trigger if a developer has made a mistake when naming
  2128  	// the index to use.
  2129  	iter, err := store.GetACLRoles(nil)
  2130  	if err != nil {
  2131  		panic(fmt.Sprintf("failed to iterate local ACL roles: %v", err))
  2132  	}
  2133  
  2134  	// Iterate the local ACL roles and add them to our tracking of local roles.
  2135  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  2136  		aclRole := raw.(*structs.ACLRole)
  2137  		local[aclRole.ID] = aclRole.Hash
  2138  	}
  2139  
  2140  	// Iterate over the remote ACL roles.
  2141  	for _, remoteACLRole := range remoteList {
  2142  		remote[remoteACLRole.ID] = struct{}{}
  2143  
  2144  		// Identify whether the ACL role is within the local state. If it is
  2145  		// not, add this to our update list.
  2146  		if localHash, ok := local[remoteACLRole.ID]; !ok {
  2147  			update = append(update, remoteACLRole.ID)
  2148  
  2149  			// Check if ACL role is newer remotely and there is a hash
  2150  			// mismatch.
  2151  		} else if remoteACLRole.ModifyIndex > minIndex && !bytes.Equal(localHash, remoteACLRole.Hash) {
  2152  			update = append(update, remoteACLRole.ID)
  2153  		}
  2154  	}
  2155  
  2156  	// If we have ACL roles within state which are no longer present in the
  2157  	// authoritative region we should delete them.
  2158  	for localACLRole := range local {
  2159  		if _, ok := remote[localACLRole]; !ok {
  2160  			delete = append(delete, localACLRole)
  2161  		}
  2162  	}
  2163  	return
  2164  }
  2165  
  2166  // replicateACLAuthMethods is used to replicate ACL Authentication Methods from
  2167  // the authoritative region to this region. The loop should only be run on the
  2168  // leader within the federated region.
  2169  func (s *Server) replicateACLAuthMethods(stopCh chan struct{}) {
  2170  
  2171  	// Generate our request object. We only need to do this once and reuse it
  2172  	// for every RPC request. The MinQueryIndex is updated after every
  2173  	// successful replication loop, so the next query acts as a blocking query
  2174  	// and only returns upon a change in the authoritative region.
  2175  	req := structs.ACLAuthMethodListRequest{
  2176  		QueryOptions: structs.QueryOptions{
  2177  			AllowStale: true,
  2178  			Region:     s.config.AuthoritativeRegion,
  2179  		},
  2180  	}
  2181  
  2182  	// Create our replication rate limiter for ACL auth-methods and log a
  2183  	// lovely message to indicate the process is starting.
  2184  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  2185  	s.logger.Debug("starting ACL Auth-Methods replication from authoritative region",
  2186  		"authoritative_region", req.Region)
  2187  
  2188  	// Enter the main ACL auth-methods replication loop that will only exit
  2189  	// when the stopCh is closed.
  2190  	//
  2191  	// Any error encountered will use the replicationBackoffContinue function
  2192  	// which handles replication backoff and shutdown coordination in the event
  2193  	// of an error inside the loop.
  2194  	for {
  2195  		select {
  2196  		case <-stopCh:
  2197  			return
  2198  		default:
  2199  
  2200  			// Rate limit how often we attempt replication. It is OK to ignore
  2201  			// the error as the context will never be cancelled and the limit
  2202  			// parameters are controlled internally.
  2203  			_ = limiter.Wait(context.Background())
  2204  
  2205  			if !ServersMeetMinimumVersion(
  2206  				s.serf.Members(), s.Region(), minACLAuthMethodVersion, true) {
  2207  				s.logger.Trace(
  2208  					"all servers must be upgraded to 1.5.0 or later before ACL Auth Methods can be replicated")
  2209  				if s.replicationBackoffContinue(stopCh) {
  2210  					continue
  2211  				} else {
  2212  					return
  2213  				}
  2214  			}
  2215  
  2216  			// Set the replication token on each replication iteration so that
  2217  			// it is always current and can handle agent SIGHUP reloads.
  2218  			req.AuthToken = s.ReplicationToken()
  2219  
  2220  			var resp structs.ACLAuthMethodListResponse
  2221  
  2222  			// Make the list RPC request to the authoritative region, so we
  2223  			// capture the latest ACL auth-method listing.
  2224  			err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLListAuthMethodsRPCMethod, &req, &resp)
  2225  			if err != nil {
  2226  				s.logger.Error("failed to fetch ACL auth-methods from authoritative region", "error", err)
  2227  				if s.replicationBackoffContinue(stopCh) {
  2228  					continue
  2229  				} else {
  2230  					return
  2231  				}
  2232  			}
  2233  
  2234  			// Perform a two-way diff on the ACL auth-methods.
  2235  			toDelete, toUpdate := diffACLAuthMethods(s.State(), req.MinQueryIndex, resp.AuthMethods)
  2236  
  2237  			// A significant amount of time could have passed since the last
  2238  			// check on whether we should stop the replication process, so
  2239  			// check again here before calling Raft.
  2240  			select {
  2241  			case <-stopCh:
  2242  				return
  2243  			default:
  2244  			}
  2245  
  2246  			// If we have ACL auth-methods to delete, make this call directly
  2247  			// to Raft.
  2248  			if len(toDelete) > 0 {
  2249  				args := structs.ACLAuthMethodDeleteRequest{Names: toDelete}
  2250  				_, _, err := s.raftApply(structs.ACLAuthMethodsDeleteRequestType, &args)
  2251  
  2252  				// If the error was because we lost leadership while calling
  2253  				// Raft, avoid logging as this can be confusing to operators.
  2254  				if err != nil {
  2255  					if err != raft.ErrLeadershipLost {
  2256  						s.logger.Error("failed to delete ACL auth-methods", "error", err)
  2257  					}
  2258  					if s.replicationBackoffContinue(stopCh) {
  2259  						continue
  2260  					} else {
  2261  						return
  2262  					}
  2263  				}
  2264  			}
  2265  
  2266  			// Fetch any outdated auth-methods.
  2267  			var fetched []*structs.ACLAuthMethod
  2268  			if len(toUpdate) > 0 {
  2269  				req := structs.ACLAuthMethodsGetRequest{
  2270  					Names: toUpdate,
  2271  					QueryOptions: structs.QueryOptions{
  2272  						Region:        s.config.AuthoritativeRegion,
  2273  						AuthToken:     s.ReplicationToken(),
  2274  						AllowStale:    true,
  2275  						MinQueryIndex: resp.Index - 1,
  2276  					},
  2277  				}
  2278  				var reply structs.ACLAuthMethodsGetResponse
  2279  				if err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLGetAuthMethodsRPCMethod, &req, &reply); err != nil {
  2280  					s.logger.Error("failed to fetch ACL auth-methods from authoritative region", "error", err)
  2281  					if s.replicationBackoffContinue(stopCh) {
  2282  						continue
  2283  					} else {
  2284  						return
  2285  					}
  2286  				}
  2287  				for _, aclAuthMethod := range reply.AuthMethods {
  2288  					fetched = append(fetched, aclAuthMethod)
  2289  				}
  2290  			}
  2291  
  2292  			// Update local auth-methods.
  2293  			if len(fetched) > 0 {
  2294  				args := structs.ACLAuthMethodUpsertRequest{
  2295  					AuthMethods: fetched,
  2296  				}
  2297  
  2298  				// Perform the upsert directly via Raft.
  2299  				_, _, err := s.raftApply(structs.ACLAuthMethodsUpsertRequestType, &args)
  2300  				if err != nil {
  2301  					s.logger.Error("failed to update ACL auth-methods", "error", err)
  2302  					if s.replicationBackoffContinue(stopCh) {
  2303  						continue
  2304  					} else {
  2305  						return
  2306  					}
  2307  				}
  2308  			}
  2309  
  2310  			// Update the minimum query index; the next request blocks until there is a change.
  2311  			req.MinQueryIndex = resp.Index
  2312  		}
  2313  	}
  2314  }
  2315  
  2316  // diffACLAuthMethods is used to perform a two-way diff between the local ACL
  2317  // auth-methods and the remote auth-methods to determine which ones need to be
  2318  // deleted or updated. The returned slices contain ACL auth-method names.
  2319  func diffACLAuthMethods(
  2320  	store *state.StateStore, minIndex uint64, remoteList []*structs.ACLAuthMethodStub) (
  2321  	delete []string, update []string) {
  2322  
  2323  	// The local ACL auth-method tracking is keyed by the name and the value is
  2324  	// the hash of the auth-method.
  2325  	local := make(map[string][]byte)
  2326  
  2327  	// The remote ACL auth-method tracking is keyed by the name; the value is
  2328  	// an empty struct as we already have the full object.
  2329  	remote := make(map[string]struct{})
  2330  
  2331  	// Read all the ACL auth-methods currently held within our local state.
  2332  	// The panic below will only trigger if a developer has made a mistake
  2333  	// when naming the index to use.
  2334  	iter, err := store.GetACLAuthMethods(nil)
  2335  	if err != nil {
  2336  		panic(fmt.Sprintf("failed to iterate local ACL auth-methods: %v", err))
  2337  	}
  2338  
  2339  	// Iterate the local ACL auth-methods and add them to our tracking of
  2340  	// local auth-methods
  2341  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  2342  		aclAuthMethod := raw.(*structs.ACLAuthMethod)
  2343  		local[aclAuthMethod.Name] = aclAuthMethod.Hash
  2344  	}
  2345  
  2346  	// Iterate over the remote ACL auth-methods.
  2347  	for _, remoteACLAuthMethod := range remoteList {
  2348  		remote[remoteACLAuthMethod.Name] = struct{}{}
  2349  
  2350  		// Identify whether the ACL auth-method is within the local state. If
  2351  		// it is not, add this to our update list.
  2352  		if localHash, ok := local[remoteACLAuthMethod.Name]; !ok {
  2353  			update = append(update, remoteACLAuthMethod.Name)
  2354  
  2355  			// Check if ACL auth-method is newer remotely and there is a hash
  2356  			// mismatch.
  2357  		} else if remoteACLAuthMethod.ModifyIndex > minIndex && !bytes.Equal(localHash, remoteACLAuthMethod.Hash) {
  2358  			update = append(update, remoteACLAuthMethod.Name)
  2359  		}
  2360  	}
  2361  
  2362  	// If we have ACL auth-methods within state which are no longer present in
  2363  	// the authoritative region we should delete them.
  2364  	for localACLAuthMethod := range local {
  2365  		if _, ok := remote[localACLAuthMethod]; !ok {
  2366  			delete = append(delete, localACLAuthMethod)
  2367  		}
  2368  	}
  2369  	return
  2370  }
  2371  
  2372  // replicateACLBindingRules is used to replicate ACL binding rules from the
  2373  // authoritative region to this region. The loop should only be run on the
  2374  // leader within the federated region.
  2375  func (s *Server) replicateACLBindingRules(stopCh chan struct{}) {
  2376  
  2377  	// Generate our request object. We only need to do this once and reuse it
  2378  	// for every RPC request. The MinQueryIndex is updated after every
  2379  	// successful replication loop, so the next query acts as a blocking query
  2380  	// and only returns upon a change in the authoritative region.
  2381  	req := structs.ACLBindingRulesListRequest{
  2382  		QueryOptions: structs.QueryOptions{
  2383  			AllowStale: true,
  2384  			Region:     s.config.AuthoritativeRegion,
  2385  		},
  2386  	}
  2387  
  2388  	// Create our replication rate limiter for ACL binding rules and log a
  2389  	// lovely message to indicate the process is starting.
  2390  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  2391  	s.logger.Debug("starting ACL Binding Rules replication from authoritative region",
  2392  		"authoritative_region", req.Region)
  2393  
  2394  	// Enter the main ACL binding rules replication loop that will only exit
  2395  	// when the stopCh is closed.
  2396  	//
  2397  	// Any error encountered will use the replicationBackoffContinue function
  2398  	// which handles replication backoff and shutdown coordination in the event
  2399  	// of an error inside the loop.
  2400  	for {
  2401  		select {
  2402  		case <-stopCh:
  2403  			return
  2404  		default:
  2405  
  2406  			// Rate limit how often we attempt replication. It is OK to ignore
  2407  			// the error as the context will never be cancelled and the limit
  2408  			// parameters are controlled internally.
  2409  			_ = limiter.Wait(context.Background())
  2410  
  2411  			if !ServersMeetMinimumVersion(
  2412  				s.serf.Members(), s.Region(), minACLBindingRuleVersion, true) {
  2413  				s.logger.Trace(
  2414  					"all servers must be upgraded to 1.5.0 or later before ACL Binding Rules can be replicated")
  2415  				if s.replicationBackoffContinue(stopCh) {
  2416  					continue
  2417  				} else {
  2418  					return
  2419  				}
  2420  			}
  2421  
  2422  			// Set the replication token on each replication iteration so that
  2423  			// it is always current and can handle agent SIGHUP reloads.
  2424  			req.AuthToken = s.ReplicationToken()
  2425  
  2426  			var resp structs.ACLBindingRulesListResponse
  2427  
  2428  			// Make the list RPC request to the authoritative region, so we
  2429  			// capture the latest ACL binding rules listing.
  2430  			err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLListBindingRulesRPCMethod, &req, &resp)
  2431  			if err != nil {
  2432  				s.logger.Error("failed to fetch ACL binding rules from authoritative region", "error", err)
  2433  				if s.replicationBackoffContinue(stopCh) {
  2434  					continue
  2435  				} else {
  2436  					return
  2437  				}
  2438  			}
  2439  
  2440  			// Perform a two-way diff on the ACL binding rules.
  2441  			toDelete, toUpdate := diffACLBindingRules(s.State(), req.MinQueryIndex, resp.ACLBindingRules)
  2442  
  2443  			// A significant amount of time could have passed since the last
  2444  			// check on whether we should stop the replication process, so
  2445  			// check again here before calling Raft.
  2446  			select {
  2447  			case <-stopCh:
  2448  				return
  2449  			default:
  2450  			}
  2451  
  2452  			// If we have ACL binding rules to delete, make this call directly
  2453  			// to Raft.
  2454  			if len(toDelete) > 0 {
  2455  				args := structs.ACLBindingRulesDeleteRequest{ACLBindingRuleIDs: toDelete}
  2456  				_, _, err := s.raftApply(structs.ACLBindingRulesDeleteRequestType, &args)
  2457  
  2458  				// If the error was because we lost leadership while calling
  2459  				// Raft, avoid logging as this can be confusing to operators.
  2460  				if err != nil {
  2461  					if err != raft.ErrLeadershipLost {
  2462  						s.logger.Error("failed to delete ACL binding rules", "error", err)
  2463  					}
  2464  					if s.replicationBackoffContinue(stopCh) {
  2465  						continue
  2466  					} else {
  2467  						return
  2468  					}
  2469  				}
  2470  			}
  2471  
  2472  			// Fetch any outdated binding rules.
  2473  			var fetched []*structs.ACLBindingRule
  2474  			if len(toUpdate) > 0 {
  2475  				req := structs.ACLBindingRulesRequest{
  2476  					ACLBindingRuleIDs: toUpdate,
  2477  					QueryOptions: structs.QueryOptions{
  2478  						Region:        s.config.AuthoritativeRegion,
  2479  						AuthToken:     s.ReplicationToken(),
  2480  						AllowStale:    true,
  2481  						MinQueryIndex: resp.Index - 1,
  2482  					},
  2483  				}
  2484  				var reply structs.ACLBindingRulesResponse
  2485  				if err := s.forwardRegion(s.config.AuthoritativeRegion, structs.ACLGetBindingRulesRPCMethod, &req, &reply); err != nil {
  2486  					s.logger.Error("failed to fetch ACL binding rules from authoritative region", "error", err)
  2487  					if s.replicationBackoffContinue(stopCh) {
  2488  						continue
  2489  					} else {
  2490  						return
  2491  					}
  2492  				}
  2493  				for _, aclBindingRule := range reply.ACLBindingRules {
  2494  					fetched = append(fetched, aclBindingRule)
  2495  				}
  2496  			}
  2497  
  2498  			// Update local binding rules.
  2499  			if len(fetched) > 0 {
  2500  				args := structs.ACLBindingRulesUpsertRequest{
  2501  					ACLBindingRules:         fetched,
  2502  					AllowMissingAuthMethods: true,
  2503  				}
  2504  
  2505  				// Perform the upsert directly via Raft.
  2506  				_, _, err := s.raftApply(structs.ACLBindingRulesUpsertRequestType, &args)
  2507  				if err != nil {
  2508  					s.logger.Error("failed to update ACL binding rules", "error", err)
  2509  					if s.replicationBackoffContinue(stopCh) {
  2510  						continue
  2511  					} else {
  2512  						return
  2513  					}
  2514  				}
  2515  			}
  2516  
  2517  			// Update the minimum query index; the next request blocks until there is a change.
  2518  			req.MinQueryIndex = resp.Index
  2519  		}
  2520  	}
  2521  }
  2522  
  2523  // diffACLBindingRules is used to perform a two-way diff between the local ACL
  2524  // binding rules and the remote binding rules to determine which ones need to be
  2525  // deleted or updated. The returned slices contain ACL binding rule IDs.
  2526  func diffACLBindingRules(
  2527  	store *state.StateStore, minIndex uint64, remoteList []*structs.ACLBindingRuleListStub) (
  2528  	delete []string, update []string) {
  2529  
  2530  	// The local ACL binding rule tracking is keyed by the rule ID and the
  2531  	// value is the hash of the binding rule.
  2532  	local := make(map[string][]byte)
  2533  
  2534  	// The remote ACL binding rule tracking is keyed by the rule ID; the value
  2535  	// is an empty struct as we already have the full object.
  2536  	remote := make(map[string]struct{})
  2537  
  2538  	// Read all the ACL binding rules currently held within our local state.
  2539  	// The panic below will only trigger if a developer has made a mistake
  2540  	// when naming the index to use.
  2541  	iter, err := store.GetACLBindingRules(nil)
  2542  	if err != nil {
  2543  		panic(fmt.Sprintf("failed to iterate local ACL binding rules: %v", err))
  2544  	}
  2545  
  2546  	// Iterate the local ACL binding rules and add them to our tracking of
  2547  	// local binding rules.
  2548  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  2549  		aclBindingRule := raw.(*structs.ACLBindingRule)
  2550  		local[aclBindingRule.ID] = aclBindingRule.Hash
  2551  	}
  2552  
  2553  	// Iterate over the remote ACL binding rules.
  2554  	for _, remoteACLBindingRule := range remoteList {
  2555  		remote[remoteACLBindingRule.ID] = struct{}{}
  2556  
  2557  		// Identify whether the ACL binding rule is within the local state. If
  2558  		// it is not, add this to our update list.
  2559  		if localHash, ok := local[remoteACLBindingRule.ID]; !ok {
  2560  			update = append(update, remoteACLBindingRule.ID)
  2561  
  2562  			// Check if the ACL binding rule is newer remotely and there is a
  2563  			// hash mismatch.
  2564  		} else if remoteACLBindingRule.ModifyIndex > minIndex && !bytes.Equal(localHash, remoteACLBindingRule.Hash) {
  2565  			update = append(update, remoteACLBindingRule.ID)
  2566  		}
  2567  	}
  2568  
  2569  	// If we have ACL binding rules within state which are no longer present in
  2570  	// the authoritative region we should delete them.
  2571  	for localACLBindingRules := range local {
  2572  		if _, ok := remote[localACLBindingRules]; !ok {
  2573  			delete = append(delete, localACLBindingRules)
  2574  		}
  2575  	}
  2576  	return
  2577  }
  2578  
  2579  // replicationBackoffContinue should be used when a replication loop encounters
  2580  // an error and wants to wait until either the backoff time has been met, or
  2581  // the stopCh has been closed. The boolean indicates whether the replication
  2582  // process should continue.
  2583  //
  2584  // Typical use:
  2585  //
  2586  //	if s.replicationBackoffContinue(stopCh) {
  2587  //		continue
  2588  //	} else {
  2589  //		return
  2590  //	}
  2591  func (s *Server) replicationBackoffContinue(stopCh chan struct{}) bool {
  2592  
  2593  	timer, timerStopFn := helper.NewSafeTimer(s.config.ReplicationBackoff)
  2594  	defer timerStopFn()
  2595  
  2596  	select {
  2597  	case <-timer.C:
  2598  		return true
  2599  	case <-stopCh:
  2600  		return false
  2601  	}
  2602  }
  2603  
  2604  // getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
  2605  func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
  2606  	state := s.fsm.State()
  2607  	_, config, err := state.AutopilotConfig()
  2608  	if err != nil {
  2609  		s.logger.Named("autopilot").Error("failed to get autopilot config", "error", err)
  2610  		return nil
  2611  	}
  2612  	if config != nil {
  2613  		return config
  2614  	}
  2615  
  2616  	if !ServersMeetMinimumVersion(s.Members(), AllRegions, minAutopilotVersion, false) {
  2617  		s.logger.Named("autopilot").Warn("can't initialize until all servers are above minimum version", "min_version", minAutopilotVersion)
  2618  		return nil
  2619  	}
  2620  
  2621  	config = s.config.AutopilotConfig
  2622  	req := structs.AutopilotSetConfigRequest{Config: *config}
  2623  	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
  2624  		s.logger.Named("autopilot").Error("failed to initialize config", "error", err)
  2625  		return nil
  2626  	}
  2627  
  2628  	return config
  2629  }
  2630  
  2631  // getOrCreateSchedulerConfig is used to get the scheduler config. We create a default
  2632  // config if it doesn't already exist for bootstrapping an empty cluster
  2633  func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration {
  2634  	state := s.fsm.State()
  2635  	_, config, err := state.SchedulerConfig()
  2636  	if err != nil {
  2637  		s.logger.Named("core").Error("failed to get scheduler config", "error", err)
  2638  		return nil
  2639  	}
  2640  	if config != nil {
  2641  		return config
  2642  	}
  2643  	if !ServersMeetMinimumVersion(s.Members(), s.Region(), minSchedulerConfigVersion, false) {
  2644  		s.logger.Named("core").Warn("can't initialize scheduler config until all servers are above minimum version", "min_version", minSchedulerConfigVersion)
  2645  		return nil
  2646  	}
  2647  
  2648  	req := structs.SchedulerSetConfigRequest{Config: s.config.DefaultSchedulerConfig}
  2649  	if _, _, err = s.raftApply(structs.SchedulerConfigRequestType, req); err != nil {
  2650  		s.logger.Named("core").Error("failed to initialize config", "error", err)
  2651  		return nil
  2652  	}
  2653  
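        	// Note that config is still nil at this point; callers such as
        	// handleEvalBrokerStateChange treat a nil scheduler configuration as
        	// the default bootstrap configuration.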
  2654  	return config
  2655  }
  2656  
  2657  var minVersionKeyring = version.Must(version.NewVersion("1.4.0"))
  2658  
  2659  // initializeKeyring creates the first root key if the leader doesn't
  2660  // already have one. The metadata will be replicated via raft and then
  2661  // the followers will get the key material from their own key
  2662  // replication.
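        // It returns early if an active root key already exists, and waits until
        // all servers in the region meet minVersionKeyring before creating one.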
  2663  func (s *Server) initializeKeyring(stopCh <-chan struct{}) {
  2664  
  2665  	logger := s.logger.Named("keyring")
  2666  
  2667  	store := s.fsm.State()
  2668  	keyMeta, err := store.GetActiveRootKeyMeta(nil)
  2669  	if err != nil {
  2670  		logger.Error("failed to get active key", "error", err)
  2671  		return
  2672  	}
  2673  	if keyMeta != nil {
  2674  		return
  2675  	}
  2676  
  2677  	logger.Trace("verifying cluster is ready to initialize keyring")
  2678  	for {
  2679  		select {
  2680  		case <-stopCh:
  2681  			return
  2682  		default:
  2683  		}
  2684  
  2685  		if ServersMeetMinimumVersion(s.serf.Members(), s.Region(), minVersionKeyring, true) {
  2686  			break
  2687  		}
  2688  	}
  2689  	// we might have lost leadership during the version check
  2690  	if !s.IsLeader() {
  2691  		return
  2692  	}
  2693  
  2694  	logger.Trace("initializing keyring")
  2695  
  2696  	rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM)
  2697  	if err != nil {
  2698  		logger.Error("could not initialize keyring", "error", err)
  2699  		return
  2700  	}
  2701  	rootKey.Meta.SetActive()
  2702  
  2703  	err = s.encrypter.AddKey(rootKey)
  2704  	if err != nil {
  2705  		logger.Error("could not add initial key to keyring", "error", err)
  2706  		return
  2707  	}
  2708  
  2709  	if _, _, err = s.raftApply(structs.RootKeyMetaUpsertRequestType,
  2710  		structs.KeyringUpdateRootKeyMetaRequest{
  2711  			RootKeyMeta: rootKey.Meta,
  2712  		}); err != nil {
  2713  		logger.Error("could not initialize keyring", "error", err)
  2714  		return
  2715  	}
  2716  
  2717  	logger.Info("initialized keyring", "id", rootKey.Meta.KeyID)
  2718  }
  2719  
  2720  func (s *Server) generateClusterID() (string, error) {
  2721  	if !ServersMeetMinimumVersion(s.Members(), AllRegions, minClusterIDVersion, false) {
  2722  		s.logger.Named("core").Warn("cannot initialize cluster ID until all servers are above minimum version", "min_version", minClusterIDVersion)
  2723  		return "", fmt.Errorf("cluster ID cannot be created until all servers are above minimum version %s", minClusterIDVersion)
  2724  	}
  2725  
  2726  	newMeta := structs.ClusterMetadata{ClusterID: uuid.Generate(), CreateTime: time.Now().UnixNano()}
  2727  	if _, _, err := s.raftApply(structs.ClusterMetadataRequestType, newMeta); err != nil {
  2728  		s.logger.Named("core").Error("failed to create cluster ID", "error", err)
  2729  		return "", fmt.Errorf("failed to create cluster ID: %w", err)
  2730  	}
  2731  
  2732  	s.logger.Named("core").Info("established cluster id", "cluster_id", newMeta.ClusterID, "create_time", newMeta.CreateTime)
  2733  	return newMeta.ClusterID, nil
  2734  }
  2735  
  2736  // handleEvalBrokerStateChange handles changing the evalBroker and blockedEvals
  2737  // enabled status based on the passed scheduler configuration. The boolean
  2738  // response indicates whether the caller needs to call restoreEvals() due to
  2739  // the brokers being enabled. It is for use when the change must take the
  2740  // scheduler configuration into account. This is not needed when calling
  2741  // revokeLeadership, as the configuration doesn't matter, and we need to ensure
  2742  // the brokers are stopped.
  2743  //
  2744  // The function checks that the server is the leader and uses a mutex to
  2745  // avoid potential timing problems. Consider the following sequence:
  2746  //   - operator updates the configuration via the API
  2747  //   - the RPC handler applies the change via Raft
  2748  //   - leadership transitions with a write barrier
  2749  //   - the RPC handler calls this function to enact the change
  2750  //
  2751  // The mutex also protects against a situation where leadership is revoked
  2752  // while this function is being called, ensuring the correct series of
  2753  // actions occurs so that state stays consistent.
  2754  func (s *Server) handleEvalBrokerStateChange(schedConfig *structs.SchedulerConfiguration) bool {
  2755  
  2756  	// Grab the lock first. Once we have this we can be sure to run everything
  2757  	// needed before any leader transition can attempt to modify the state.
  2758  	s.brokerLock.Lock()
  2759  	defer s.brokerLock.Unlock()
  2760  
  2761  	// If we are no longer the leader, exit early.
  2762  	if !s.IsLeader() {
  2763  		return false
  2764  	}
  2765  
  2766  	// enableBrokers tracks whether the evalBroker and blockedEvals processes
  2767  	// should be enabled. It allows us to answer this question whether using a
  2768  	// persisted Raft configuration or the default bootstrap config.
  2770  	var enableBrokers, restoreEvals bool
  2771  
  2772  	// The scheduler config can only be persisted to Raft once quorum has been
  2773  	// established. If this is a fresh cluster, we need to use the default
  2774  	// scheduler config, otherwise we can use the persisted object.
  2775  	switch schedConfig {
  2776  	case nil:
  2777  		enableBrokers = !s.config.DefaultSchedulerConfig.PauseEvalBroker
  2778  	default:
  2779  		enableBrokers = !schedConfig.PauseEvalBroker
  2780  	}
  2781  
  2782  	// If the evalBroker status is changing, set the new state.
  2783  	if enableBrokers != s.evalBroker.Enabled() {
  2784  		s.logger.Info("eval broker status modified", "paused", !enableBrokers)
  2785  		s.evalBroker.SetEnabled(enableBrokers)
  2786  		restoreEvals = enableBrokers
  2787  	}
  2788  
  2789  	// If the blockedEvals status is changing, set the new state.
  2790  	if enableBrokers != s.blockedEvals.Enabled() {
  2791  		s.logger.Info("blocked evals status modified", "paused", !enableBrokers)
  2792  		s.blockedEvals.SetEnabled(enableBrokers)
  2793  		restoreEvals = enableBrokers
  2794  
  2795  		if enableBrokers {
  2796  			s.blockedEvals.SetTimetable(s.fsm.TimeTable())
  2797  		}
  2798  	}
  2799  
  2800  	return restoreEvals
  2801  }