gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/leader.go (about)

     1  package nomad
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"math/rand"
     8  	"net"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"golang.org/x/time/rate"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	log "github.com/hashicorp/go-hclog"
    17  	memdb "github.com/hashicorp/go-memdb"
    18  	version "github.com/hashicorp/go-version"
    19  	"github.com/hashicorp/nomad/helper/uuid"
    20  	"github.com/hashicorp/nomad/nomad/state"
    21  	"github.com/hashicorp/nomad/nomad/structs"
    22  	"github.com/hashicorp/raft"
    23  	"github.com/hashicorp/serf/serf"
    24  	"github.com/pkg/errors"
    25  )
    26  
    27  const (
    28  	// failedEvalUnblockInterval is the interval at which failed evaluations are
    29  	// unblocked to re-enter the scheduler. A failed evaluation occurs under
    30  	// high contention when the schedulers plan does not make progress.
    31  	failedEvalUnblockInterval = 1 * time.Minute
    32  
    33  	// replicationRateLimit is used to rate limit how often data is replicated
    34  	// between the authoritative region and the local region
    35  	replicationRateLimit rate.Limit = 10.0
    36  
    37  	// barrierWriteTimeout is used to give Raft a chance to process a
    38  	// possible loss of leadership event if we are unable to get a barrier
    39  	// while leader.
    40  	barrierWriteTimeout = 2 * time.Minute
    41  )
    42  
    43  var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))
    44  
    45  var minSchedulerConfigVersion = version.Must(version.NewVersion("0.9.0"))
    46  
    47  var minClusterIDVersion = version.Must(version.NewVersion("0.10.4"))
    48  
    49  // monitorLeadership is used to monitor if we acquire or lose our role
    50  // as the leader in the Raft cluster. There is some work the leader is
    51  // expected to do, so we must react to changes
    52  func (s *Server) monitorLeadership() {
    53  	var weAreLeaderCh chan struct{}
    54  	var leaderLoop sync.WaitGroup
    55  
    56  	leaderCh := s.raft.LeaderCh()
    57  
    58  	leaderStep := func(isLeader bool) {
    59  		if isLeader {
    60  			if weAreLeaderCh != nil {
    61  				s.logger.Error("attempted to start the leader loop while running")
    62  				return
    63  			}
    64  
    65  			weAreLeaderCh = make(chan struct{})
    66  			leaderLoop.Add(1)
    67  			go func(ch chan struct{}) {
    68  				defer leaderLoop.Done()
    69  				s.leaderLoop(ch)
    70  			}(weAreLeaderCh)
    71  			s.logger.Info("cluster leadership acquired")
    72  			return
    73  		}
    74  
    75  		if weAreLeaderCh == nil {
    76  			s.logger.Error("attempted to stop the leader loop while not running")
    77  			return
    78  		}
    79  
    80  		s.logger.Debug("shutting down leader loop")
    81  		close(weAreLeaderCh)
    82  		leaderLoop.Wait()
    83  		weAreLeaderCh = nil
    84  		s.logger.Info("cluster leadership lost")
    85  	}
    86  
    87  	wasLeader := false
    88  	for {
    89  		select {
    90  		case isLeader := <-leaderCh:
    91  			if wasLeader != isLeader {
    92  				wasLeader = isLeader
    93  				// normal case where we went through a transition
    94  				leaderStep(isLeader)
    95  			} else if wasLeader && isLeader {
    96  				// Server lost but then gained leadership immediately.
    97  				// During this time, this server may have received
    98  				// Raft transitions that haven't been applied to the FSM
    99  				// yet.
   100  				// Ensure that that FSM caught up and eval queues are refreshed
   101  				s.logger.Warn("cluster leadership lost and gained leadership immediately.  Could indicate network issues, memory paging, or high CPU load.")
   102  
   103  				leaderStep(false)
   104  				leaderStep(true)
   105  			} else {
   106  				// Server gained but lost leadership immediately
   107  				// before it reacted; nothing to do, move on
   108  				s.logger.Warn("cluster leadership gained and lost leadership immediately.  Could indicate network issues, memory paging, or high CPU load.")
   109  			}
   110  		case <-s.shutdownCh:
   111  			if weAreLeaderCh != nil {
   112  				leaderStep(false)
   113  			}
   114  			return
   115  		}
   116  	}
   117  }
   118  
   119  // leaderLoop runs as long as we are the leader to run various
   120  // maintenance activities
   121  func (s *Server) leaderLoop(stopCh chan struct{}) {
   122  	var reconcileCh chan serf.Member
   123  	establishedLeader := false
   124  
   125  RECONCILE:
   126  	// Setup a reconciliation timer
   127  	reconcileCh = nil
   128  	interval := time.After(s.config.ReconcileInterval)
   129  
   130  	// Apply a raft barrier to ensure our FSM is caught up
   131  	start := time.Now()
   132  	barrier := s.raft.Barrier(barrierWriteTimeout)
   133  	if err := barrier.Error(); err != nil {
   134  		s.logger.Error("failed to wait for barrier", "error", err)
   135  		goto WAIT
   136  	}
   137  	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
   138  
   139  	// Check if we need to handle initial leadership actions
   140  	if !establishedLeader {
   141  		if err := s.establishLeadership(stopCh); err != nil {
   142  			s.logger.Error("failed to establish leadership", "error", err)
   143  
   144  			// Immediately revoke leadership since we didn't successfully
   145  			// establish leadership.
   146  			if err := s.revokeLeadership(); err != nil {
   147  				s.logger.Error("failed to revoke leadership", "error", err)
   148  			}
   149  
   150  			goto WAIT
   151  		}
   152  
   153  		establishedLeader = true
   154  		defer func() {
   155  			if err := s.revokeLeadership(); err != nil {
   156  				s.logger.Error("failed to revoke leadership", "error", err)
   157  			}
   158  		}()
   159  	}
   160  
   161  	// Reconcile any missing data
   162  	if err := s.reconcile(); err != nil {
   163  		s.logger.Error("failed to reconcile", "error", err)
   164  		goto WAIT
   165  	}
   166  
   167  	// Initial reconcile worked, now we can process the channel
   168  	// updates
   169  	reconcileCh = s.reconcileCh
   170  
   171  	// Poll the stop channel to give it priority so we don't waste time
   172  	// trying to perform the other operations if we have been asked to shut
   173  	// down.
   174  	select {
   175  	case <-stopCh:
   176  		return
   177  	default:
   178  	}
   179  
   180  WAIT:
   181  	// Wait until leadership is lost
   182  	for {
   183  		select {
   184  		case <-stopCh:
   185  			return
   186  		case <-s.shutdownCh:
   187  			return
   188  		case <-interval:
   189  			goto RECONCILE
   190  		case member := <-reconcileCh:
   191  			s.reconcileMember(member)
   192  		}
   193  	}
   194  }
   195  
   196  // establishLeadership is invoked once we become leader and are able
   197  // to invoke an initial barrier. The barrier is used to ensure any
   198  // previously inflight transactions have been committed and that our
   199  // state is up-to-date.
   200  func (s *Server) establishLeadership(stopCh chan struct{}) error {
   201  	defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())
   202  
   203  	// Generate a leader ACL token. This will allow the leader to issue work
   204  	// that requires a valid ACL token.
   205  	s.setLeaderAcl(uuid.Generate())
   206  
   207  	// Disable workers to free half the cores for use in the plan queue and
   208  	// evaluation broker
   209  	for _, w := range s.pausableWorkers() {
   210  		w.SetPause(true)
   211  	}
   212  
   213  	// Initialize and start the autopilot routine
   214  	s.getOrCreateAutopilotConfig()
   215  	s.autopilot.Start()
   216  
   217  	// Initialize scheduler configuration
   218  	s.getOrCreateSchedulerConfig()
   219  
   220  	// Initialize the ClusterID
   221  	_, _ = s.ClusterID()
   222  	// todo: use cluster ID for stuff, later!
   223  
   224  	// Enable the plan queue, since we are now the leader
   225  	s.planQueue.SetEnabled(true)
   226  
   227  	// Start the plan evaluator
   228  	go s.planApply()
   229  
   230  	// Enable the eval broker, since we are now the leader
   231  	s.evalBroker.SetEnabled(true)
   232  
   233  	// Enable the blocked eval tracker, since we are now the leader
   234  	s.blockedEvals.SetEnabled(true)
   235  	s.blockedEvals.SetTimetable(s.fsm.TimeTable())
   236  
   237  	// Enable the deployment watcher, since we are now the leader
   238  	s.deploymentWatcher.SetEnabled(true, s.State())
   239  
   240  	// Enable the NodeDrainer
   241  	s.nodeDrainer.SetEnabled(true, s.State())
   242  
   243  	// Enable the volume watcher, since we are now the leader
   244  	s.volumeWatcher.SetEnabled(true, s.State())
   245  
   246  	// Restore the eval broker state
   247  	if err := s.restoreEvals(); err != nil {
   248  		return err
   249  	}
   250  
   251  	// Activate the vault client
   252  	s.vault.SetActive(true)
   253  
   254  	// Enable the periodic dispatcher, since we are now the leader.
   255  	s.periodicDispatcher.SetEnabled(true)
   256  
   257  	// Activate RPC now that local FSM caught up with Raft (as evident by Barrier call success)
   258  	// and all leader related components (e.g. broker queue) are enabled.
   259  	// Auxiliary processes (e.g. background, bookkeeping, and cleanup tasks can start after)
   260  	s.setConsistentReadReady()
   261  
   262  	// Further clean ups and follow up that don't block RPC consistency
   263  
   264  	// Restore the periodic dispatcher state
   265  	if err := s.restorePeriodicDispatcher(); err != nil {
   266  		return err
   267  	}
   268  
   269  	// Scheduler periodic jobs
   270  	go s.schedulePeriodic(stopCh)
   271  
   272  	// Reap any failed evaluations
   273  	go s.reapFailedEvaluations(stopCh)
   274  
   275  	// Reap any duplicate blocked evaluations
   276  	go s.reapDupBlockedEvaluations(stopCh)
   277  
   278  	// Periodically unblock failed allocations
   279  	go s.periodicUnblockFailedEvals(stopCh)
   280  
   281  	// Periodically publish job summary metrics
   282  	go s.publishJobSummaryMetrics(stopCh)
   283  
   284  	// Periodically publish job status metrics
   285  	go s.publishJobStatusMetrics(stopCh)
   286  
   287  	// Setup the heartbeat timers. This is done both when starting up or when
   288  	// a leader fail over happens. Since the timers are maintained by the leader
   289  	// node, effectively this means all the timers are renewed at the time of failover.
   290  	// The TTL contract is that the session will not be expired before the TTL,
   291  	// so expiring it later is allowable.
   292  	//
   293  	// This MUST be done after the initial barrier to ensure the latest Nodes
   294  	// are available to be initialized. Otherwise initialization may use stale
   295  	// data.
   296  	if err := s.initializeHeartbeatTimers(); err != nil {
   297  		s.logger.Error("heartbeat timer setup failed", "error", err)
   298  		return err
   299  	}
   300  
   301  	// Start replication of ACLs and Policies if they are enabled,
   302  	// and we are not the authoritative region.
   303  	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
   304  		go s.replicateACLPolicies(stopCh)
   305  		go s.replicateACLTokens(stopCh)
   306  	}
   307  
   308  	// Setup any enterprise systems required.
   309  	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
   310  		return err
   311  	}
   312  
   313  	// Cleanup orphaned Vault token accessors
   314  	if err := s.revokeVaultAccessorsOnRestore(); err != nil {
   315  		return err
   316  	}
   317  
   318  	// Cleanup orphaned Service Identity token accessors
   319  	if err := s.revokeSITokenAccessorsOnRestore(); err != nil {
   320  		return err
   321  	}
   322  
   323  	return nil
   324  }
   325  
   326  // restoreEvals is used to restore pending evaluations into the eval broker and
   327  // blocked evaluations into the blocked eval tracker. The broker and blocked
   328  // eval tracker is maintained only by the leader, so it must be restored anytime
   329  // a leadership transition takes place.
   330  func (s *Server) restoreEvals() error {
   331  	// Get an iterator over every evaluation
   332  	ws := memdb.NewWatchSet()
   333  	iter, err := s.fsm.State().Evals(ws)
   334  	if err != nil {
   335  		return fmt.Errorf("failed to get evaluations: %v", err)
   336  	}
   337  
   338  	for {
   339  		raw := iter.Next()
   340  		if raw == nil {
   341  			break
   342  		}
   343  		eval := raw.(*structs.Evaluation)
   344  
   345  		if eval.ShouldEnqueue() {
   346  			s.evalBroker.Enqueue(eval)
   347  		} else if eval.ShouldBlock() {
   348  			s.blockedEvals.Block(eval)
   349  		}
   350  	}
   351  	return nil
   352  }
   353  
   354  // revokeVaultAccessorsOnRestore is used to restore Vault accessors that should be
   355  // revoked.
   356  func (s *Server) revokeVaultAccessorsOnRestore() error {
   357  	// An accessor should be revoked if its allocation or node is terminal
   358  	ws := memdb.NewWatchSet()
   359  	state := s.fsm.State()
   360  	iter, err := state.VaultAccessors(ws)
   361  	if err != nil {
   362  		return fmt.Errorf("failed to get vault accessors: %v", err)
   363  	}
   364  
   365  	var revoke []*structs.VaultAccessor
   366  	for {
   367  		raw := iter.Next()
   368  		if raw == nil {
   369  			break
   370  		}
   371  
   372  		va := raw.(*structs.VaultAccessor)
   373  
   374  		// Check the allocation
   375  		alloc, err := state.AllocByID(ws, va.AllocID)
   376  		if err != nil {
   377  			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
   378  		}
   379  		if alloc == nil || alloc.Terminated() {
   380  			// No longer running and should be revoked
   381  			revoke = append(revoke, va)
   382  			continue
   383  		}
   384  
   385  		// Check the node
   386  		node, err := state.NodeByID(ws, va.NodeID)
   387  		if err != nil {
   388  			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
   389  		}
   390  		if node == nil || node.TerminalStatus() {
   391  			// Node is terminal so any accessor from it should be revoked
   392  			revoke = append(revoke, va)
   393  			continue
   394  		}
   395  	}
   396  
   397  	if len(revoke) != 0 {
   398  		s.logger.Info("revoking vault accessors after becoming leader", "accessors", len(revoke))
   399  
   400  		if err := s.vault.MarkForRevocation(revoke); err != nil {
   401  			return fmt.Errorf("failed to revoke tokens: %v", err)
   402  		}
   403  	}
   404  
   405  	return nil
   406  }
   407  
   408  // revokeSITokenAccessorsOnRestore is used to revoke Service Identity token
   409  // accessors on behalf of allocs that are now gone / terminal.
   410  func (s *Server) revokeSITokenAccessorsOnRestore() error {
   411  	ws := memdb.NewWatchSet()
   412  	fsmState := s.fsm.State()
   413  	iter, err := fsmState.SITokenAccessors(ws)
   414  	if err != nil {
   415  		return errors.Wrap(err, "failed to get SI token accessors")
   416  	}
   417  
   418  	var toRevoke []*structs.SITokenAccessor
   419  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
   420  		accessor := raw.(*structs.SITokenAccessor)
   421  
   422  		// Check the allocation
   423  		alloc, err := fsmState.AllocByID(ws, accessor.AllocID)
   424  		if err != nil {
   425  			return errors.Wrapf(err, "failed to lookup alloc %q", accessor.AllocID)
   426  		}
   427  		if alloc == nil || alloc.Terminated() {
   428  			// no longer running and associated accessors should be revoked
   429  			toRevoke = append(toRevoke, accessor)
   430  			continue
   431  		}
   432  
   433  		// Check the node
   434  		node, err := fsmState.NodeByID(ws, accessor.NodeID)
   435  		if err != nil {
   436  			return errors.Wrapf(err, "failed to lookup node %q", accessor.NodeID)
   437  		}
   438  		if node == nil || node.TerminalStatus() {
   439  			// node is terminal and associated accessors should be revoked
   440  			toRevoke = append(toRevoke, accessor)
   441  			continue
   442  		}
   443  	}
   444  
   445  	if len(toRevoke) > 0 {
   446  		s.logger.Info("revoking consul accessors after becoming leader", "accessors", len(toRevoke))
   447  		s.consulACLs.MarkForRevocation(toRevoke)
   448  	}
   449  
   450  	return nil
   451  }
   452  
   453  // restorePeriodicDispatcher is used to restore all periodic jobs into the
   454  // periodic dispatcher. It also determines if a periodic job should have been
   455  // created during the leadership transition and force runs them. The periodic
   456  // dispatcher is maintained only by the leader, so it must be restored anytime a
   457  // leadership transition takes place.
   458  func (s *Server) restorePeriodicDispatcher() error {
   459  	logger := s.logger.Named("periodic")
   460  	ws := memdb.NewWatchSet()
   461  	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
   462  	if err != nil {
   463  		return fmt.Errorf("failed to get periodic jobs: %v", err)
   464  	}
   465  
   466  	now := time.Now()
   467  	for i := iter.Next(); i != nil; i = iter.Next() {
   468  		job := i.(*structs.Job)
   469  
   470  		// We skip adding parameterized jobs because they themselves aren't
   471  		// tracked, only the dispatched children are.
   472  		if job.IsParameterized() {
   473  			continue
   474  		}
   475  
   476  		if err := s.periodicDispatcher.Add(job); err != nil {
   477  			logger.Error("failed to add job to periodic dispatcher", "error", err)
   478  			continue
   479  		}
   480  
   481  		// We do not need to force run the job since it isn't active.
   482  		if !job.IsPeriodicActive() {
   483  			continue
   484  		}
   485  
   486  		// If the periodic job has never been launched before, launch will hold
   487  		// the time the periodic job was added. Otherwise it has the last launch
   488  		// time of the periodic job.
   489  		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
   490  		if err != nil {
   491  			return fmt.Errorf("failed to get periodic launch time: %v", err)
   492  		}
   493  		if launch == nil {
   494  			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
   495  				job.ID, job.Namespace)
   496  		}
   497  
   498  		// nextLaunch is the next launch that should occur.
   499  		nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
   500  		if err != nil {
   501  			logger.Error("failed to determine next periodic launch for job", "job", job.NamespacedID(), "error", err)
   502  			continue
   503  		}
   504  
   505  		// We skip force launching the job if  there should be no next launch
   506  		// (the zero case) or if the next launch time is in the future. If it is
   507  		// in the future, it will be handled by the periodic dispatcher.
   508  		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
   509  			continue
   510  		}
   511  
   512  		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
   513  			logger.Error("force run of periodic job failed", "job", job.NamespacedID(), "error", err)
   514  			return fmt.Errorf("force run of periodic job %q failed: %v", job.NamespacedID(), err)
   515  		}
   516  		logger.Debug("periodic job force runned during leadership establishment", "job", job.NamespacedID())
   517  	}
   518  
   519  	return nil
   520  }
   521  
   522  // schedulePeriodic is used to do periodic job dispatch while we are leader
   523  func (s *Server) schedulePeriodic(stopCh chan struct{}) {
   524  	evalGC := time.NewTicker(s.config.EvalGCInterval)
   525  	defer evalGC.Stop()
   526  	nodeGC := time.NewTicker(s.config.NodeGCInterval)
   527  	defer nodeGC.Stop()
   528  	jobGC := time.NewTicker(s.config.JobGCInterval)
   529  	defer jobGC.Stop()
   530  	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
   531  	defer deploymentGC.Stop()
   532  	csiPluginGC := time.NewTicker(s.config.CSIPluginGCInterval)
   533  	defer csiPluginGC.Stop()
   534  	csiVolumeClaimGC := time.NewTicker(s.config.CSIVolumeClaimGCInterval)
   535  	defer csiVolumeClaimGC.Stop()
   536  
   537  	// getLatest grabs the latest index from the state store. It returns true if
   538  	// the index was retrieved successfully.
   539  	getLatest := func() (uint64, bool) {
   540  		snapshotIndex, err := s.fsm.State().LatestIndex()
   541  		if err != nil {
   542  			s.logger.Error("failed to determine state store's index", "error", err)
   543  			return 0, false
   544  		}
   545  
   546  		return snapshotIndex, true
   547  	}
   548  
   549  	for {
   550  
   551  		select {
   552  		case <-evalGC.C:
   553  			if index, ok := getLatest(); ok {
   554  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
   555  			}
   556  		case <-nodeGC.C:
   557  			if index, ok := getLatest(); ok {
   558  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
   559  			}
   560  		case <-jobGC.C:
   561  			if index, ok := getLatest(); ok {
   562  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
   563  			}
   564  		case <-deploymentGC.C:
   565  			if index, ok := getLatest(); ok {
   566  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
   567  			}
   568  		case <-csiPluginGC.C:
   569  			if index, ok := getLatest(); ok {
   570  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobCSIPluginGC, index))
   571  			}
   572  		case <-csiVolumeClaimGC.C:
   573  			if index, ok := getLatest(); ok {
   574  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobCSIVolumeClaimGC, index))
   575  			}
   576  
   577  		case <-stopCh:
   578  			return
   579  		}
   580  	}
   581  }
   582  
   583  // coreJobEval returns an evaluation for a core job
   584  func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
   585  	return &structs.Evaluation{
   586  		ID:          uuid.Generate(),
   587  		Namespace:   "-",
   588  		Priority:    structs.CoreJobPriority,
   589  		Type:        structs.JobTypeCore,
   590  		TriggeredBy: structs.EvalTriggerScheduled,
   591  		JobID:       job,
   592  		LeaderACL:   s.getLeaderAcl(),
   593  		Status:      structs.EvalStatusPending,
   594  		ModifyIndex: modifyIndex,
   595  	}
   596  }
   597  
   598  // reapFailedEvaluations is used to reap evaluations that
   599  // have reached their delivery limit and should be failed
   600  func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
   601  	for {
   602  		select {
   603  		case <-stopCh:
   604  			return
   605  		default:
   606  			// Scan for a failed evaluation
   607  			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
   608  			if err != nil {
   609  				return
   610  			}
   611  			if eval == nil {
   612  				continue
   613  			}
   614  
   615  			// Update the status to failed
   616  			updateEval := eval.Copy()
   617  			updateEval.Status = structs.EvalStatusFailed
   618  			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
   619  			s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())
   620  
   621  			// Create a follow-up evaluation that will be used to retry the
   622  			// scheduling for the job after the cluster is hopefully more stable
   623  			// due to the fairly large backoff.
   624  			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
   625  				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
   626  
   627  			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
   628  			updateEval.NextEval = followupEval.ID
   629  			updateEval.UpdateModifyTime()
   630  
   631  			// Update via Raft
   632  			req := structs.EvalUpdateRequest{
   633  				Evals: []*structs.Evaluation{updateEval, followupEval},
   634  			}
   635  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   636  				s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
   637  				continue
   638  			}
   639  
   640  			// Ack completion
   641  			s.evalBroker.Ack(eval.ID, token)
   642  		}
   643  	}
   644  }
   645  
   646  // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations and
   647  // should be cancelled.
   648  func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
   649  	for {
   650  		select {
   651  		case <-stopCh:
   652  			return
   653  		default:
   654  			// Scan for duplicate blocked evals.
   655  			dups := s.blockedEvals.GetDuplicates(time.Second)
   656  			if dups == nil {
   657  				continue
   658  			}
   659  
   660  			cancel := make([]*structs.Evaluation, len(dups))
   661  			for i, dup := range dups {
   662  				// Update the status to cancelled
   663  				newEval := dup.Copy()
   664  				newEval.Status = structs.EvalStatusCancelled
   665  				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
   666  				newEval.UpdateModifyTime()
   667  				cancel[i] = newEval
   668  			}
   669  
   670  			// Update via Raft
   671  			req := structs.EvalUpdateRequest{
   672  				Evals: cancel,
   673  			}
   674  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   675  				s.logger.Error("failed to update duplicate evals", "evals", log.Fmt("%#v", cancel), "error", err)
   676  				continue
   677  			}
   678  		}
   679  	}
   680  }
   681  
   682  // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
   683  func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
   684  	ticker := time.NewTicker(failedEvalUnblockInterval)
   685  	defer ticker.Stop()
   686  	for {
   687  		select {
   688  		case <-stopCh:
   689  			return
   690  		case <-ticker.C:
   691  			// Unblock the failed allocations
   692  			s.blockedEvals.UnblockFailed()
   693  		}
   694  	}
   695  }
   696  
   697  // publishJobSummaryMetrics publishes the job summaries as metrics
   698  func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
   699  	timer := time.NewTimer(0)
   700  	defer timer.Stop()
   701  
   702  	for {
   703  		select {
   704  		case <-stopCh:
   705  			return
   706  		case <-timer.C:
   707  			timer.Reset(s.config.StatsCollectionInterval)
   708  			state, err := s.State().Snapshot()
   709  			if err != nil {
   710  				s.logger.Error("failed to get state", "error", err)
   711  				continue
   712  			}
   713  			ws := memdb.NewWatchSet()
   714  			iter, err := state.JobSummaries(ws)
   715  			if err != nil {
   716  				s.logger.Error("failed to get job summaries", "error", err)
   717  				continue
   718  			}
   719  
   720  			for {
   721  				raw := iter.Next()
   722  				if raw == nil {
   723  					break
   724  				}
   725  				summary := raw.(*structs.JobSummary)
   726  				if s.config.DisableDispatchedJobSummaryMetrics {
   727  					job, err := state.JobByID(ws, summary.Namespace, summary.JobID)
   728  					if err != nil {
   729  						s.logger.Error("error getting job for summary", "error", err)
   730  						continue
   731  					}
   732  					if job.Dispatched {
   733  						continue
   734  					}
   735  				}
   736  				s.iterateJobSummaryMetrics(summary)
   737  			}
   738  		}
   739  	}
   740  }
   741  
   742  func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
   743  	for name, tgSummary := range summary.Summary {
   744  		if !s.config.DisableTaggedMetrics {
   745  			labels := []metrics.Label{
   746  				{
   747  					Name:  "job",
   748  					Value: summary.JobID,
   749  				},
   750  				{
   751  					Name:  "task_group",
   752  					Value: name,
   753  				},
   754  				{
   755  					Name:  "namespace",
   756  					Value: summary.Namespace,
   757  				},
   758  			}
   759  
   760  			if strings.Contains(summary.JobID, "/dispatch-") {
   761  				jobInfo := strings.Split(summary.JobID, "/dispatch-")
   762  				labels = append(labels, metrics.Label{
   763  					Name:  "parent_id",
   764  					Value: jobInfo[0],
   765  				}, metrics.Label{
   766  					Name:  "dispatch_id",
   767  					Value: jobInfo[1],
   768  				})
   769  			}
   770  
   771  			if strings.Contains(summary.JobID, "/periodic-") {
   772  				jobInfo := strings.Split(summary.JobID, "/periodic-")
   773  				labels = append(labels, metrics.Label{
   774  					Name:  "parent_id",
   775  					Value: jobInfo[0],
   776  				}, metrics.Label{
   777  					Name:  "periodic_id",
   778  					Value: jobInfo[1],
   779  				})
   780  			}
   781  
   782  			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
   783  				float32(tgSummary.Queued), labels)
   784  			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
   785  				float32(tgSummary.Complete), labels)
   786  			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
   787  				float32(tgSummary.Failed), labels)
   788  			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
   789  				float32(tgSummary.Running), labels)
   790  			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
   791  				float32(tgSummary.Starting), labels)
   792  			metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
   793  				float32(tgSummary.Lost), labels)
   794  		}
   795  		if s.config.BackwardsCompatibleMetrics {
   796  			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
   797  			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
   798  			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
   799  			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
   800  			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
   801  			metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
   802  		}
   803  	}
   804  }
   805  
   806  // publishJobStatusMetrics publishes the job statuses as metrics
   807  func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
   808  	timer := time.NewTimer(0)
   809  	defer timer.Stop()
   810  
   811  	for {
   812  		select {
   813  		case <-stopCh:
   814  			return
   815  		case <-timer.C:
   816  			timer.Reset(s.config.StatsCollectionInterval)
   817  			state, err := s.State().Snapshot()
   818  			if err != nil {
   819  				s.logger.Error("failed to get state", "error", err)
   820  				continue
   821  			}
   822  			ws := memdb.NewWatchSet()
   823  			iter, err := state.Jobs(ws)
   824  			if err != nil {
   825  				s.logger.Error("failed to get job statuses", "error", err)
   826  				continue
   827  			}
   828  
   829  			s.iterateJobStatusMetrics(&iter)
   830  		}
   831  	}
   832  }
   833  
   834  func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) {
   835  	var pending int64 // Sum of all jobs in 'pending' state
   836  	var running int64 // Sum of all jobs in 'running' state
   837  	var dead int64    // Sum of all jobs in 'dead' state
   838  
   839  	for {
   840  		raw := (*jobs).Next()
   841  		if raw == nil {
   842  			break
   843  		}
   844  
   845  		job := raw.(*structs.Job)
   846  
   847  		switch job.Status {
   848  		case structs.JobStatusPending:
   849  			pending++
   850  		case structs.JobStatusRunning:
   851  			running++
   852  		case structs.JobStatusDead:
   853  			dead++
   854  		}
   855  	}
   856  
   857  	metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending))
   858  	metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running))
   859  	metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead))
   860  }
   861  
   862  // revokeLeadership is invoked once we step down as leader.
   863  // This is used to cleanup any state that may be specific to a leader.
   864  func (s *Server) revokeLeadership() error {
   865  	defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())
   866  
   867  	s.resetConsistentReadReady()
   868  
   869  	// Clear the leader token since we are no longer the leader.
   870  	s.setLeaderAcl("")
   871  
   872  	// Disable autopilot
   873  	s.autopilot.Stop()
   874  
   875  	// Disable the plan queue, since we are no longer leader
   876  	s.planQueue.SetEnabled(false)
   877  
   878  	// Disable the eval broker, since it is only useful as a leader
   879  	s.evalBroker.SetEnabled(false)
   880  
   881  	// Disable the blocked eval tracker, since it is only useful as a leader
   882  	s.blockedEvals.SetEnabled(false)
   883  
   884  	// Disable the periodic dispatcher, since it is only useful as a leader
   885  	s.periodicDispatcher.SetEnabled(false)
   886  
   887  	// Disable the Vault client as it is only useful as a leader.
   888  	s.vault.SetActive(false)
   889  
   890  	// Disable the deployment watcher as it is only useful as a leader.
   891  	s.deploymentWatcher.SetEnabled(false, nil)
   892  
   893  	// Disable the node drainer
   894  	s.nodeDrainer.SetEnabled(false, nil)
   895  
   896  	// Disable the volume watcher
   897  	s.volumeWatcher.SetEnabled(false, nil)
   898  
   899  	// Disable any enterprise systems required.
   900  	if err := s.revokeEnterpriseLeadership(); err != nil {
   901  		return err
   902  	}
   903  
   904  	// Clear the heartbeat timers on either shutdown or step down,
   905  	// since we are no longer responsible for TTL expirations.
   906  	if err := s.clearAllHeartbeatTimers(); err != nil {
   907  		s.logger.Error("clearing heartbeat timers failed", "error", err)
   908  		return err
   909  	}
   910  
   911  	// Unpause our worker if we paused previously
   912  	for _, w := range s.pausableWorkers() {
   913  		w.SetPause(false)
   914  	}
   915  
   916  	return nil
   917  }
   918  
   919  // pausableWorkers returns a slice of the workers
   920  // to pause on leader transitions.
   921  //
   922  // Upon leadership establishment, pause workers to free half
   923  // the cores for use in the plan queue and evaluation broker
   924  func (s *Server) pausableWorkers() []*Worker {
   925  	n := len(s.workers)
   926  	if n <= 1 {
   927  		return []*Worker{}
   928  	}
   929  
   930  	// Disabling 3/4 of the workers frees CPU for raft and the
   931  	// plan applier which uses 1/2 the cores.
   932  	return s.workers[:3*n/4]
   933  }
   934  
   935  // reconcile is used to reconcile the differences between Serf
   936  // membership and what is reflected in our strongly consistent store.
   937  func (s *Server) reconcile() error {
   938  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
   939  	members := s.serf.Members()
   940  	for _, member := range members {
   941  		if err := s.reconcileMember(member); err != nil {
   942  			return err
   943  		}
   944  	}
   945  	return nil
   946  }
   947  
   948  // reconcileMember is used to do an async reconcile of a single serf member
   949  func (s *Server) reconcileMember(member serf.Member) error {
   950  	// Check if this is a member we should handle
   951  	valid, parts := isNomadServer(member)
   952  	if !valid || parts.Region != s.config.Region {
   953  		return nil
   954  	}
   955  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
   956  
   957  	var err error
   958  	switch member.Status {
   959  	case serf.StatusAlive:
   960  		err = s.addRaftPeer(member, parts)
   961  	case serf.StatusLeft, StatusReap:
   962  		err = s.removeRaftPeer(member, parts)
   963  	}
   964  	if err != nil {
   965  		s.logger.Error("failed to reconcile member", "member", member, "error", err)
   966  		return err
   967  	}
   968  	return nil
   969  }
   970  
   971  // addRaftPeer is used to add a new Raft peer when a Nomad server joins
   972  func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
   973  	// Check for possibility of multiple bootstrap nodes
   974  	members := s.serf.Members()
   975  	if parts.Bootstrap {
   976  		for _, member := range members {
   977  			valid, p := isNomadServer(member)
   978  			if valid && member.Name != m.Name && p.Bootstrap {
   979  				s.logger.Error("skipping adding Raft peer because an existing peer is in bootstrap mode and only one server should be in bootstrap mode",
   980  					"existing_peer", member.Name, "joining_peer", m.Name)
   981  				return nil
   982  			}
   983  		}
   984  	}
   985  
   986  	// Processing ourselves could result in trying to remove ourselves to
   987  	// fix up our address, which would make us step down. This is only
   988  	// safe to attempt if there are multiple servers available.
   989  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
   990  	configFuture := s.raft.GetConfiguration()
   991  	if err := configFuture.Error(); err != nil {
   992  		s.logger.Error("failed to get raft configuration", "error", err)
   993  		return err
   994  	}
   995  
   996  	if m.Name == s.config.NodeName {
   997  		if l := len(configFuture.Configuration().Servers); l < 3 {
   998  			s.logger.Debug("skipping self join check for peer since the cluster is too small", "peer", m.Name)
   999  			return nil
  1000  		}
  1001  	}
  1002  
  1003  	// See if it's already in the configuration. It's harmless to re-add it
  1004  	// but we want to avoid doing that if possible to prevent useless Raft
  1005  	// log entries. If the address is the same but the ID changed, remove the
  1006  	// old server before adding the new one.
  1007  	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
  1008  	if err != nil {
  1009  		return err
  1010  	}
  1011  	for _, server := range configFuture.Configuration().Servers {
  1012  		// No-op if the raft version is too low
  1013  		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
  1014  			return nil
  1015  		}
  1016  
  1017  		// If the address or ID matches an existing server, see if we need to remove the old one first
  1018  		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
  1019  			// Exit with no-op if this is being called on an existing server and both the ID and address match
  1020  			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
  1021  				return nil
  1022  			}
  1023  			future := s.raft.RemoveServer(server.ID, 0, 0)
  1024  			if server.Address == raft.ServerAddress(addr) {
  1025  				if err := future.Error(); err != nil {
  1026  					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
  1027  				}
  1028  				s.logger.Info("removed server with duplicate address", "address", server.Address)
  1029  			} else {
  1030  				if err := future.Error(); err != nil {
  1031  					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
  1032  				}
  1033  				s.logger.Info("removed server with duplicate ID", "id", server.ID)
  1034  			}
  1035  		}
  1036  	}
  1037  
  1038  	// Attempt to add as a peer
  1039  	switch {
  1040  	case minRaftProtocol >= 3:
  1041  		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
  1042  		if err := addFuture.Error(); err != nil {
  1043  			s.logger.Error("failed to add raft peer", "error", err)
  1044  			return err
  1045  		}
  1046  	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
  1047  		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
  1048  		if err := addFuture.Error(); err != nil {
  1049  			s.logger.Error("failed to add raft peer", "error", err)
  1050  			return err
  1051  		}
  1052  	default:
  1053  		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
  1054  		if err := addFuture.Error(); err != nil {
  1055  			s.logger.Error("failed to add raft peer", "error", err)
  1056  			return err
  1057  		}
  1058  	}
  1059  
  1060  	return nil
  1061  }
  1062  
  1063  // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
  1064  // or is reaped
  1065  func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
  1066  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
  1067  
  1068  	// See if it's already in the configuration. It's harmless to re-remove it
  1069  	// but we want to avoid doing that if possible to prevent useless Raft
  1070  	// log entries.
  1071  	configFuture := s.raft.GetConfiguration()
  1072  	if err := configFuture.Error(); err != nil {
  1073  		s.logger.Error("failed to get raft configuration", "error", err)
  1074  		return err
  1075  	}
  1076  
  1077  	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
  1078  	if err != nil {
  1079  		return err
  1080  	}
  1081  
  1082  	// Pick which remove API to use based on how the server was added.
  1083  	for _, server := range configFuture.Configuration().Servers {
  1084  		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
  1085  		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
  1086  			s.logger.Info("removing server by ID", "id", server.ID)
  1087  			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
  1088  			if err := future.Error(); err != nil {
  1089  				s.logger.Error("failed to remove raft peer", "id", server.ID, "error", err)
  1090  				return err
  1091  			}
  1092  			break
  1093  		} else if server.Address == raft.ServerAddress(addr) {
  1094  			// If not, use the old remove API
  1095  			s.logger.Info("removing server by address", "address", server.Address)
  1096  			future := s.raft.RemovePeer(raft.ServerAddress(addr))
  1097  			if err := future.Error(); err != nil {
  1098  				s.logger.Error("failed to remove raft peer", "address", addr, "error", err)
  1099  				return err
  1100  			}
  1101  			break
  1102  		}
  1103  	}
  1104  
  1105  	return nil
  1106  }
  1107  
  1108  // replicateACLPolicies is used to replicate ACL policies from
  1109  // the authoritative region to this region.
  1110  func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
  1111  	req := structs.ACLPolicyListRequest{
  1112  		QueryOptions: structs.QueryOptions{
  1113  			Region:     s.config.AuthoritativeRegion,
  1114  			AllowStale: true,
  1115  		},
  1116  	}
  1117  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  1118  	s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region)
  1119  
  1120  START:
  1121  	for {
  1122  		select {
  1123  		case <-stopCh:
  1124  			return
  1125  		default:
  1126  			// Rate limit how often we attempt replication
  1127  			limiter.Wait(context.Background())
  1128  
  1129  			// Fetch the list of policies
  1130  			var resp structs.ACLPolicyListResponse
  1131  			req.AuthToken = s.ReplicationToken()
  1132  			err := s.forwardRegion(s.config.AuthoritativeRegion,
  1133  				"ACL.ListPolicies", &req, &resp)
  1134  			if err != nil {
  1135  				s.logger.Error("failed to fetch policies from authoritative region", "error", err)
  1136  				goto ERR_WAIT
  1137  			}
  1138  
  1139  			// Perform a two-way diff
  1140  			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)
  1141  
  1142  			// Delete policies that should not exist
  1143  			if len(delete) > 0 {
  1144  				args := &structs.ACLPolicyDeleteRequest{
  1145  					Names: delete,
  1146  				}
  1147  				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
  1148  				if err != nil {
  1149  					s.logger.Error("failed to delete policies", "error", err)
  1150  					goto ERR_WAIT
  1151  				}
  1152  			}
  1153  
  1154  			// Fetch any outdated policies
  1155  			var fetched []*structs.ACLPolicy
  1156  			if len(update) > 0 {
  1157  				req := structs.ACLPolicySetRequest{
  1158  					Names: update,
  1159  					QueryOptions: structs.QueryOptions{
  1160  						Region:        s.config.AuthoritativeRegion,
  1161  						AuthToken:     s.ReplicationToken(),
  1162  						AllowStale:    true,
  1163  						MinQueryIndex: resp.Index - 1,
  1164  					},
  1165  				}
  1166  				var reply structs.ACLPolicySetResponse
  1167  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
  1168  					"ACL.GetPolicies", &req, &reply); err != nil {
  1169  					s.logger.Error("failed to fetch policies from authoritative region", "error", err)
  1170  					goto ERR_WAIT
  1171  				}
  1172  				for _, policy := range reply.Policies {
  1173  					fetched = append(fetched, policy)
  1174  				}
  1175  			}
  1176  
  1177  			// Update local policies
  1178  			if len(fetched) > 0 {
  1179  				args := &structs.ACLPolicyUpsertRequest{
  1180  					Policies: fetched,
  1181  				}
  1182  				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
  1183  				if err != nil {
  1184  					s.logger.Error("failed to update policies", "error", err)
  1185  					goto ERR_WAIT
  1186  				}
  1187  			}
  1188  
  1189  			// Update the minimum query index, blocks until there
  1190  			// is a change.
  1191  			req.MinQueryIndex = resp.Index
  1192  		}
  1193  	}
  1194  
  1195  ERR_WAIT:
  1196  	select {
  1197  	case <-time.After(s.config.ReplicationBackoff):
  1198  		goto START
  1199  	case <-stopCh:
  1200  		return
  1201  	}
  1202  }
  1203  
  1204  // diffACLPolicies is used to perform a two-way diff between the local
  1205  // policies and the remote policies to determine which policies need to
  1206  // be deleted or updated.
  1207  func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
  1208  	// Construct a set of the local and remote policies
  1209  	local := make(map[string][]byte)
  1210  	remote := make(map[string]struct{})
  1211  
  1212  	// Add all the local policies
  1213  	iter, err := state.ACLPolicies(nil)
  1214  	if err != nil {
  1215  		panic("failed to iterate local policies")
  1216  	}
  1217  	for {
  1218  		raw := iter.Next()
  1219  		if raw == nil {
  1220  			break
  1221  		}
  1222  		policy := raw.(*structs.ACLPolicy)
  1223  		local[policy.Name] = policy.Hash
  1224  	}
  1225  
  1226  	// Iterate over the remote policies
  1227  	for _, rp := range remoteList {
  1228  		remote[rp.Name] = struct{}{}
  1229  
  1230  		// Check if the policy is missing locally
  1231  		if localHash, ok := local[rp.Name]; !ok {
  1232  			update = append(update, rp.Name)
  1233  
  1234  			// Check if policy is newer remotely and there is a hash mis-match.
  1235  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1236  			update = append(update, rp.Name)
  1237  		}
  1238  	}
  1239  
  1240  	// Check if policy should be deleted
  1241  	for lp := range local {
  1242  		if _, ok := remote[lp]; !ok {
  1243  			delete = append(delete, lp)
  1244  		}
  1245  	}
  1246  	return
  1247  }
  1248  
  1249  // replicateACLTokens is used to replicate global ACL tokens from
  1250  // the authoritative region to this region.
  1251  func (s *Server) replicateACLTokens(stopCh chan struct{}) {
  1252  	req := structs.ACLTokenListRequest{
  1253  		GlobalOnly: true,
  1254  		QueryOptions: structs.QueryOptions{
  1255  			Region:     s.config.AuthoritativeRegion,
  1256  			AllowStale: true,
  1257  		},
  1258  	}
  1259  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  1260  	s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region)
  1261  
  1262  START:
  1263  	for {
  1264  		select {
  1265  		case <-stopCh:
  1266  			return
  1267  		default:
  1268  			// Rate limit how often we attempt replication
  1269  			limiter.Wait(context.Background())
  1270  
  1271  			// Fetch the list of tokens
  1272  			var resp structs.ACLTokenListResponse
  1273  			req.AuthToken = s.ReplicationToken()
  1274  			err := s.forwardRegion(s.config.AuthoritativeRegion,
  1275  				"ACL.ListTokens", &req, &resp)
  1276  			if err != nil {
  1277  				s.logger.Error("failed to fetch tokens from authoritative region", "error", err)
  1278  				goto ERR_WAIT
  1279  			}
  1280  
  1281  			// Perform a two-way diff
  1282  			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)
  1283  
  1284  			// Delete tokens that should not exist
  1285  			if len(delete) > 0 {
  1286  				args := &structs.ACLTokenDeleteRequest{
  1287  					AccessorIDs: delete,
  1288  				}
  1289  				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
  1290  				if err != nil {
  1291  					s.logger.Error("failed to delete tokens", "error", err)
  1292  					goto ERR_WAIT
  1293  				}
  1294  			}
  1295  
  1296  			// Fetch any outdated policies.
  1297  			var fetched []*structs.ACLToken
  1298  			if len(update) > 0 {
  1299  				req := structs.ACLTokenSetRequest{
  1300  					AccessorIDS: update,
  1301  					QueryOptions: structs.QueryOptions{
  1302  						Region:        s.config.AuthoritativeRegion,
  1303  						AuthToken:     s.ReplicationToken(),
  1304  						AllowStale:    true,
  1305  						MinQueryIndex: resp.Index - 1,
  1306  					},
  1307  				}
  1308  				var reply structs.ACLTokenSetResponse
  1309  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
  1310  					"ACL.GetTokens", &req, &reply); err != nil {
  1311  					s.logger.Error("failed to fetch tokens from authoritative region", "error", err)
  1312  					goto ERR_WAIT
  1313  				}
  1314  				for _, token := range reply.Tokens {
  1315  					fetched = append(fetched, token)
  1316  				}
  1317  			}
  1318  
  1319  			// Update local tokens
  1320  			if len(fetched) > 0 {
  1321  				args := &structs.ACLTokenUpsertRequest{
  1322  					Tokens: fetched,
  1323  				}
  1324  				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
  1325  				if err != nil {
  1326  					s.logger.Error("failed to update tokens", "error", err)
  1327  					goto ERR_WAIT
  1328  				}
  1329  			}
  1330  
  1331  			// Update the minimum query index, blocks until there
  1332  			// is a change.
  1333  			req.MinQueryIndex = resp.Index
  1334  		}
  1335  	}
  1336  
  1337  ERR_WAIT:
  1338  	select {
  1339  	case <-time.After(s.config.ReplicationBackoff):
  1340  		goto START
  1341  	case <-stopCh:
  1342  		return
  1343  	}
  1344  }
  1345  
  1346  // diffACLTokens is used to perform a two-way diff between the local
  1347  // tokens and the remote tokens to determine which tokens need to
  1348  // be deleted or updated.
  1349  func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
  1350  	// Construct a set of the local and remote policies
  1351  	local := make(map[string][]byte)
  1352  	remote := make(map[string]struct{})
  1353  
  1354  	// Add all the local global tokens
  1355  	iter, err := state.ACLTokensByGlobal(nil, true)
  1356  	if err != nil {
  1357  		panic("failed to iterate local tokens")
  1358  	}
  1359  	for {
  1360  		raw := iter.Next()
  1361  		if raw == nil {
  1362  			break
  1363  		}
  1364  		token := raw.(*structs.ACLToken)
  1365  		local[token.AccessorID] = token.Hash
  1366  	}
  1367  
  1368  	// Iterate over the remote tokens
  1369  	for _, rp := range remoteList {
  1370  		remote[rp.AccessorID] = struct{}{}
  1371  
  1372  		// Check if the token is missing locally
  1373  		if localHash, ok := local[rp.AccessorID]; !ok {
  1374  			update = append(update, rp.AccessorID)
  1375  
  1376  			// Check if policy is newer remotely and there is a hash mis-match.
  1377  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1378  			update = append(update, rp.AccessorID)
  1379  		}
  1380  	}
  1381  
  1382  	// Check if local token should be deleted
  1383  	for lp := range local {
  1384  		if _, ok := remote[lp]; !ok {
  1385  			delete = append(delete, lp)
  1386  		}
  1387  	}
  1388  	return
  1389  }
  1390  
  1391  // getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
  1392  func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
  1393  	state := s.fsm.State()
  1394  	_, config, err := state.AutopilotConfig()
  1395  	if err != nil {
  1396  		s.logger.Named("autopilot").Error("failed to get autopilot config", "error", err)
  1397  		return nil
  1398  	}
  1399  	if config != nil {
  1400  		return config
  1401  	}
  1402  
  1403  	if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion, false) {
  1404  		s.logger.Named("autopilot").Warn("can't initialize until all servers are above minimum version", "min_version", minAutopilotVersion)
  1405  		return nil
  1406  	}
  1407  
  1408  	config = s.config.AutopilotConfig
  1409  	req := structs.AutopilotSetConfigRequest{Config: *config}
  1410  	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
  1411  		s.logger.Named("autopilot").Error("failed to initialize config", "error", err)
  1412  		return nil
  1413  	}
  1414  
  1415  	return config
  1416  }
  1417  
  1418  // getOrCreateSchedulerConfig is used to get the scheduler config. We create a default
  1419  // config if it doesn't already exist for bootstrapping an empty cluster
  1420  func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration {
  1421  	state := s.fsm.State()
  1422  	_, config, err := state.SchedulerConfig()
  1423  	if err != nil {
  1424  		s.logger.Named("core").Error("failed to get scheduler config", "error", err)
  1425  		return nil
  1426  	}
  1427  	if config != nil {
  1428  		return config
  1429  	}
  1430  	if !ServersMeetMinimumVersion(s.Members(), minSchedulerConfigVersion, false) {
  1431  		s.logger.Named("core").Warn("can't initialize scheduler config until all servers are above minimum version", "min_version", minSchedulerConfigVersion)
  1432  		return nil
  1433  	}
  1434  
  1435  	req := structs.SchedulerSetConfigRequest{Config: s.config.DefaultSchedulerConfig}
  1436  	if _, _, err = s.raftApply(structs.SchedulerConfigRequestType, req); err != nil {
  1437  		s.logger.Named("core").Error("failed to initialize config", "error", err)
  1438  		return nil
  1439  	}
  1440  
  1441  	return config
  1442  }
  1443  
  1444  func (s *Server) generateClusterID() (string, error) {
  1445  	if !ServersMeetMinimumVersion(s.Members(), minClusterIDVersion, false) {
  1446  		s.logger.Named("core").Warn("cannot initialize cluster ID until all servers are above minimum version", "min_version", minClusterIDVersion)
  1447  		return "", errors.Errorf("cluster ID cannot be created until all servers are above minimum version %s", minClusterIDVersion)
  1448  	}
  1449  
  1450  	newMeta := structs.ClusterMetadata{ClusterID: uuid.Generate(), CreateTime: time.Now().UnixNano()}
  1451  	if _, _, err := s.raftApply(structs.ClusterMetadataRequestType, newMeta); err != nil {
  1452  		s.logger.Named("core").Error("failed to create cluster ID", "error", err)
  1453  		return "", errors.Wrap(err, "failed to create cluster ID")
  1454  	}
  1455  
  1456  	s.logger.Named("core").Info("established cluster id", "cluster_id", newMeta.ClusterID, "create_time", newMeta.CreateTime)
  1457  	return newMeta.ClusterID, nil
  1458  }