github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/leader.go

package nomad

import (
    "bytes"
    "context"
    "errors"
    "fmt"
    "math/rand"
    "net"
    "sync"
    "time"

    "golang.org/x/time/rate"

    "github.com/armon/go-metrics"
    memdb "github.com/hashicorp/go-memdb"
    "github.com/hashicorp/go-version"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/state"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/hashicorp/raft"
    "github.com/hashicorp/serf/serf"
)

const (
    // failedEvalUnblockInterval is the interval at which failed evaluations are
    // unblocked to re-enter the scheduler. A failed evaluation occurs under
    // high contention when the scheduler's plan does not make progress.
    failedEvalUnblockInterval = 1 * time.Minute

    // replicationRateLimit is used to rate limit how often data is replicated
    // between the authoritative region and the local region
    replicationRateLimit rate.Limit = 10.0

    // barrierWriteTimeout is used to give Raft a chance to process a
    // possible loss of leadership event if we are unable to get a barrier
    // while leader.
    barrierWriteTimeout = 2 * time.Minute
)

var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes
func (s *Server) monitorLeadership() {
    var weAreLeaderCh chan struct{}
    var leaderLoop sync.WaitGroup
    for {
        select {
        case isLeader := <-s.leaderCh:
            switch {
            case isLeader:
                if weAreLeaderCh != nil {
                    s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
                    continue
                }

                weAreLeaderCh = make(chan struct{})
                leaderLoop.Add(1)
                go func(ch chan struct{}) {
                    defer leaderLoop.Done()
                    s.leaderLoop(ch)
                }(weAreLeaderCh)
                s.logger.Printf("[INFO] nomad: cluster leadership acquired")

            default:
                if weAreLeaderCh == nil {
                    s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
                    continue
                }

                s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
                close(weAreLeaderCh)
                leaderLoop.Wait()
                weAreLeaderCh = nil
                s.logger.Printf("[INFO] nomad: cluster leadership lost")
            }

        case <-s.shutdownCh:
            return
        }
    }
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
    var reconcileCh chan serf.Member
    establishedLeader := false

RECONCILE:
    // Setup a reconciliation timer
    reconcileCh = nil
    interval := time.After(s.config.ReconcileInterval)

    // Apply a raft barrier to ensure our FSM is caught up
    start := time.Now()
    barrier := s.raft.Barrier(barrierWriteTimeout)
    if err := barrier.Error(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
        goto WAIT
    }
    metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

    // Check if we need to handle initial leadership actions
    if !establishedLeader {
        if err := s.establishLeadership(stopCh); err != nil {
            s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)

            // Immediately revoke leadership since we didn't successfully
            // establish leadership.
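            // revokeLeadership cleans up any leader-only subsystems that were
            // partially enabled before the failure, so the next reconcile
            // interval retries leadership establishment from a clean state.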
            if err := s.revokeLeadership(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
            }

            goto WAIT
        }

        establishedLeader = true
        defer func() {
            if err := s.revokeLeadership(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
            }
        }()
    }

    // Reconcile any missing data
    if err := s.reconcile(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
        goto WAIT
    }

    // Initial reconcile worked, now we can process the channel
    // updates
    reconcileCh = s.reconcileCh

    // Poll the stop channel to give it priority so we don't waste time
    // trying to perform the other operations if we have been asked to shut
    // down.
    select {
    case <-stopCh:
        return
    default:
    }

WAIT:
    // Wait until leadership is lost
    for {
        select {
        case <-stopCh:
            return
        case <-s.shutdownCh:
            return
        case <-interval:
            goto RECONCILE
        case member := <-reconcileCh:
            s.reconcileMember(member)
        }
    }
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
    defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())

    // Generate a leader ACL token. This will allow the leader to issue work
    // that requires a valid ACL token.
    s.setLeaderAcl(uuid.Generate())

    // Disable some of the workers to free cores for use in the plan queue and
    // evaluation broker
    if numWorkers := len(s.workers); numWorkers > 1 {
        // Disabling 3/4 of the workers frees CPU for raft and the
        // plan applier which uses 1/2 the cores.
        for i := 0; i < (3 * numWorkers / 4); i++ {
            s.workers[i].SetPause(true)
        }
    }

    // Initialize and start the autopilot routine
    s.getOrCreateAutopilotConfig()
    s.autopilot.Start()

    // Enable the plan queue, since we are now the leader
    s.planQueue.SetEnabled(true)

    // Start the plan evaluator
    go s.planApply()

    // Enable the eval broker, since we are now the leader
    s.evalBroker.SetEnabled(true)

    // Enable the blocked eval tracker, since we are now the leader
    s.blockedEvals.SetEnabled(true)
    s.blockedEvals.SetTimetable(s.fsm.TimeTable())

    // Enable the deployment watcher, since we are now the leader
    s.deploymentWatcher.SetEnabled(true, s.State())

    // Enable the NodeDrainer
    s.nodeDrainer.SetEnabled(true, s.State())

    // Restore the eval broker state
    if err := s.restoreEvals(); err != nil {
        return err
    }

    // Activate the vault client
    s.vault.SetActive(true)
    if err := s.restoreRevokingAccessors(); err != nil {
        return err
    }

    // Enable the periodic dispatcher, since we are now the leader.
    s.periodicDispatcher.SetEnabled(true)

    // Restore the periodic dispatcher state
    if err := s.restorePeriodicDispatcher(); err != nil {
        return err
    }

    // Schedule periodic jobs
    go s.schedulePeriodic(stopCh)

    // Reap any failed evaluations
    go s.reapFailedEvaluations(stopCh)

    // Reap any duplicate blocked evaluations
    go s.reapDupBlockedEvaluations(stopCh)

    // Periodically unblock failed evaluations
    go s.periodicUnblockFailedEvals(stopCh)

    // Periodically publish job summary metrics
    go s.publishJobSummaryMetrics(stopCh)

    // Setup the heartbeat timers. This is done both when starting up and when
    // a leader failover happens. Since the timers are maintained by the leader
    // node, effectively this means all the timers are renewed at the time of failover.
    // The TTL contract is that the session will not be expired before the TTL,
    // so expiring it later is allowable.
    //
    // This MUST be done after the initial barrier to ensure the latest Nodes
    // are available to be initialized. Otherwise initialization may use stale
    // data.
    if err := s.initializeHeartbeatTimers(); err != nil {
        s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
        return err
    }

    // COMPAT 0.4 - 0.4.1
    // Reconcile the summaries of the registered jobs. We reconcile summaries
    // only if the server is 0.4.1, since summaries are not present in 0.4 and
    // might be incorrect after upgrading to 0.4.1.
    if err := s.reconcileJobSummaries(); err != nil {
        return fmt.Errorf("unable to reconcile job summaries: %v", err)
    }

    // Start replication of ACLs and Policies if they are enabled,
    // and we are not the authoritative region.
    if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
        go s.replicateACLPolicies(stopCh)
        go s.replicateACLTokens(stopCh)
    }

    // Setup any enterprise systems required.
    if err := s.establishEnterpriseLeadership(stopCh); err != nil {
        return err
    }

    return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored anytime
// a leadership transition takes place.
func (s *Server) restoreEvals() error {
    // Get an iterator over every evaluation
    ws := memdb.NewWatchSet()
    iter, err := s.fsm.State().Evals(ws)
    if err != nil {
        return fmt.Errorf("failed to get evaluations: %v", err)
    }

    for {
        raw := iter.Next()
        if raw == nil {
            break
        }
        eval := raw.(*structs.Evaluation)

        if eval.ShouldEnqueue() {
            s.evalBroker.Enqueue(eval)
        } else if eval.ShouldBlock() {
            s.blockedEvals.Block(eval)
        }
    }
    return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
    // An accessor should be revoked if its allocation or node is terminal
    ws := memdb.NewWatchSet()
    state := s.fsm.State()
    iter, err := state.VaultAccessors(ws)
    if err != nil {
        return fmt.Errorf("failed to get vault accessors: %v", err)
    }

    var revoke []*structs.VaultAccessor
    for {
        raw := iter.Next()
        if raw == nil {
            break
        }

        va := raw.(*structs.VaultAccessor)

        // Check the allocation
        alloc, err := state.AllocByID(ws, va.AllocID)
        if err != nil {
            return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
        }
        if alloc == nil || alloc.Terminated() {
            // No longer running and should be revoked
            revoke = append(revoke, va)
            continue
        }

        // Check the node
        node, err := state.NodeByID(ws, va.NodeID)
        if err != nil {
            return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
        }
        if node == nil || node.TerminalStatus() {
            // Node is terminal so any accessor from it should be revoked
            revoke = append(revoke, va)
            continue
        }
    }

    if len(revoke) != 0 {
        if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
            return fmt.Errorf("failed to revoke tokens: %v", err)
        }
    }

    return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if a periodic job should have been
// created during the leadership transition and, if so, force runs it. The periodic
// dispatcher is maintained only by the leader, so it must be restored anytime a
// leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
    ws := memdb.NewWatchSet()
    iter, err := s.fsm.State().JobsByPeriodic(ws, true)
    if err != nil {
        return fmt.Errorf("failed to get periodic jobs: %v", err)
    }

    now := time.Now()
    for i := iter.Next(); i != nil; i = iter.Next() {
        job := i.(*structs.Job)

        // We skip adding parameterized jobs because they themselves aren't
        // tracked, only the dispatched children are.
        if job.IsParameterized() {
            continue
        }

        if err := s.periodicDispatcher.Add(job); err != nil {
            s.logger.Printf("[ERR] nomad.periodic: %v", err)
            continue
        }

        // We do not need to force run the job since it isn't active.
        if !job.IsPeriodicActive() {
            continue
        }

        // If the periodic job has never been launched before, launch will hold
        // the time the periodic job was added. Otherwise it has the last launch
        // time of the periodic job.
        launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
        if err != nil {
            return fmt.Errorf("failed to get periodic launch time: %v", err)
        }
        if launch == nil {
            return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
                job.ID, job.Namespace)
        }

        // nextLaunch is the next launch that should occur.
        nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
        if err != nil {
            s.logger.Printf("[ERR] nomad.periodic: failed to determine next periodic launch for job %s: %v", job.NamespacedID(), err)
            continue
        }

        // We skip force launching the job if there should be no next launch
        // (the zero case) or if the next launch time is in the future. If it is
        // in the future, it will be handled by the periodic dispatcher.
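        // Otherwise the launch was missed while there was no leader, so force
        // run the job now.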
        if nextLaunch.IsZero() || !nextLaunch.Before(now) {
            continue
        }

        if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
            msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
            s.logger.Printf("[ERR] nomad.periodic: %s", msg)
            return errors.New(msg)
        }
        s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
            " run during leadership establishment", job.ID)
    }

    return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
    evalGC := time.NewTicker(s.config.EvalGCInterval)
    defer evalGC.Stop()
    nodeGC := time.NewTicker(s.config.NodeGCInterval)
    defer nodeGC.Stop()
    jobGC := time.NewTicker(s.config.JobGCInterval)
    defer jobGC.Stop()
    deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
    defer deploymentGC.Stop()

    // getLatest grabs the latest index from the state store. It returns true if
    // the index was retrieved successfully.
    getLatest := func() (uint64, bool) {
        snapshotIndex, err := s.fsm.State().LatestIndex()
        if err != nil {
            s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
            return 0, false
        }

        return snapshotIndex, true
    }

    for {

        select {
        case <-evalGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
            }
        case <-nodeGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
            }
        case <-jobGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
            }
        case <-deploymentGC.C:
            if index, ok := getLatest(); ok {
                s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
            }
        case <-stopCh:
            return
        }
    }
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
    return &structs.Evaluation{
        ID:          uuid.Generate(),
        Namespace:   "-",
        Priority:    structs.CoreJobPriority,
        Type:        structs.JobTypeCore,
        TriggeredBy: structs.EvalTriggerScheduled,
        JobID:       job,
        LeaderACL:   s.getLeaderAcl(),
        Status:      structs.EvalStatusPending,
        ModifyIndex: modifyIndex,
    }
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
    for {
        select {
        case <-stopCh:
            return
        default:
            // Scan for a failed evaluation
            eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
            if err != nil {
                return
            }
            if eval == nil {
                continue
            }

            // Update the status to failed
            updateEval := eval.Copy()
            updateEval.Status = structs.EvalStatusFailed
            updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
            s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

            // Create a follow-up evaluation that will be used to retry the
            // scheduling for the job after the cluster is hopefully more stable
            // due to the fairly large backoff.
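            // The wait below is the configured baseline delay plus a random
            // jitter drawn from the configured delay range.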
            followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
                time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
            followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

            // Update via Raft
            req := structs.EvalUpdateRequest{
                Evals: []*structs.Evaluation{updateEval, followupEval},
            }
            if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
                s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
                continue
            }

            // Ack completion
            s.evalBroker.Ack(eval.ID, token)
        }
    }
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations that
// should be cancelled.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
    for {
        select {
        case <-stopCh:
            return
        default:
            // Scan for duplicate blocked evals.
            dups := s.blockedEvals.GetDuplicates(time.Second)
            if dups == nil {
                continue
            }

            cancel := make([]*structs.Evaluation, len(dups))
            for i, dup := range dups {
                // Update the status to cancelled
                newEval := dup.Copy()
                newEval.Status = structs.EvalStatusCancelled
                newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
                cancel[i] = newEval
            }

            // Update via Raft
            req := structs.EvalUpdateRequest{
                Evals: cancel,
            }
            if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
                s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
                continue
            }
        }
    }
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
    ticker := time.NewTicker(failedEvalUnblockInterval)
    defer ticker.Stop()
    for {
        select {
        case <-stopCh:
            return
        case <-ticker.C:
            // Unblock the failed evaluations
            s.blockedEvals.UnblockFailed()
        }
    }
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
    timer := time.NewTimer(0)
    defer timer.Stop()

    for {
        select {
        case <-stopCh:
            return
        case <-timer.C:
            timer.Reset(s.config.StatsCollectionInterval)
            state, err := s.State().Snapshot()
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
                continue
            }
            ws := memdb.NewWatchSet()
            iter, err := state.JobSummaries(ws)
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
                continue
            }

            for {
                raw := iter.Next()
                if raw == nil {
                    break
                }
                summary := raw.(*structs.JobSummary)
                for name, tgSummary := range summary.Summary {
                    if !s.config.DisableTaggedMetrics {
                        labels := []metrics.Label{
                            {
                                Name:  "job",
                                Value: summary.JobID,
                            },
                            {
                                Name:  "task_group",
                                Value: name,
                            },
                        }
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
                            float32(tgSummary.Queued), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
                            float32(tgSummary.Complete), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
                            float32(tgSummary.Failed), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
                            float32(tgSummary.Running), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
                            float32(tgSummary.Starting), labels)
                        metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
                            float32(tgSummary.Lost), labels)
                    }
                    if s.config.BackwardsCompatibleMetrics {
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
                        metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
                    }
                }
            }
        }
    }
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
    defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())

    // Clear the leader token since we are no longer the leader.
    s.setLeaderAcl("")

    // Disable autopilot
    s.autopilot.Stop()

    // Disable the plan queue, since we are no longer leader
    s.planQueue.SetEnabled(false)

    // Disable the eval broker, since it is only useful as a leader
    s.evalBroker.SetEnabled(false)

    // Disable the blocked eval tracker, since it is only useful as a leader
    s.blockedEvals.SetEnabled(false)

    // Disable the periodic dispatcher, since it is only useful as a leader
    s.periodicDispatcher.SetEnabled(false)

    // Disable the Vault client as it is only useful as a leader.
    s.vault.SetActive(false)

    // Disable the deployment watcher as it is only useful as a leader.
    s.deploymentWatcher.SetEnabled(false, nil)

    // Disable the node drainer
    s.nodeDrainer.SetEnabled(false, nil)

    // Disable any enterprise systems required.
    if err := s.revokeEnterpriseLeadership(); err != nil {
        return err
    }

    // Clear the heartbeat timers on either shutdown or step down,
    // since we are no longer responsible for TTL expirations.
    if err := s.clearAllHeartbeatTimers(); err != nil {
        s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
        return err
    }

    // Unpause our workers if we paused them previously
    if len(s.workers) > 1 {
        for i := 0; i < len(s.workers)/2; i++ {
            s.workers[i].SetPause(false)
        }
    }
    return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
    defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
    members := s.serf.Members()
    for _, member := range members {
        if err := s.reconcileMember(member); err != nil {
            return err
        }
    }
    return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
    // Check if this is a member we should handle
    valid, parts := isNomadServer(member)
    if !valid || parts.Region != s.config.Region {
        return nil
    }
    defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

    // Do not reconcile ourself
    if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
        return nil
    }

    var err error
    switch member.Status {
    case serf.StatusAlive:
        err = s.addRaftPeer(member, parts)
    case serf.StatusLeft, StatusReap:
        err = s.removeRaftPeer(member, parts)
    }
    if err != nil {
        s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
            member, err)
        return err
    }
    return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
    index, err := s.fsm.state.LatestIndex()
    if err != nil {
        return fmt.Errorf("unable to read latest index: %v", err)
    }
    s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

    args := &structs.GenericResponse{}
    msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
    if _, _, err = s.raftApply(msg, args); err != nil {
        return fmt.Errorf("reconciliation of job summaries failed: %v", err)
    }

    return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
    // Do not join ourselves
    if m.Name == s.config.NodeName {
        s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
        return nil
    }

    // Check for possibility of multiple bootstrap nodes
    members := s.serf.Members()
    if parts.Bootstrap {
        for _, member := range members {
            valid, p := isNomadServer(member)
            if valid && member.Name != m.Name && p.Bootstrap {
                s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
                return nil
            }
        }
    }

    // See if it's already in the configuration. It's harmless to re-add it
    // but we want to avoid doing that if possible to prevent useless Raft
    // log entries.
    addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
    configFuture := s.raft.GetConfiguration()
    if err := configFuture.Error(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
        return err
    }
    for _, server := range configFuture.Configuration().Servers {
        if server.Address == raft.ServerAddress(addr) {
            return nil
        }
    }

    // See if it's already in the configuration. It's harmless to re-add it
    // but we want to avoid doing that if possible to prevent useless Raft
    // log entries. If the address is the same but the ID changed, remove the
    // old server before adding the new one.
    minRaftProtocol, err := s.autopilot.MinRaftProtocol()
    if err != nil {
        return err
    }
    for _, server := range configFuture.Configuration().Servers {
        // No-op if the raft version is too low
        if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
            return nil
        }

        // If the address or ID matches an existing server, see if we need to remove the old one first
        if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
            // Exit with no-op if this is being called on an existing server
            if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
                return nil
            }
            future := s.raft.RemoveServer(server.ID, 0, 0)
            if server.Address == raft.ServerAddress(addr) {
                if err := future.Error(); err != nil {
                    return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
                }
                s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address)
            } else {
                if err := future.Error(); err != nil {
                    return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
                }
                s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID)
            }
        }
    }

    // Attempt to add as a peer
    switch {
    case minRaftProtocol >= 3:
        addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
        if err := addFuture.Error(); err != nil {
            s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
            return err
        }
    case minRaftProtocol == 2 && parts.RaftVersion >= 3:
        addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
        if err := addFuture.Error(); err != nil {
            s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
            return err
        }
    default:
        addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
        if err := addFuture.Error(); err != nil {
            s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
            return err
        }
    }

    return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
    addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

    // See if it's already in the configuration. It's harmless to re-remove it
    // but we want to avoid doing that if possible to prevent useless Raft
    // log entries.
    configFuture := s.raft.GetConfiguration()
    if err := configFuture.Error(); err != nil {
        s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
        return err
    }

    minRaftProtocol, err := s.autopilot.MinRaftProtocol()
    if err != nil {
        return err
    }

    // Pick which remove API to use based on how the server was added.
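    // When the cluster's minimum Raft protocol is v2 or higher the server can
    // be removed by its ID via RemoveServer; otherwise it is only known by its
    // address and must be removed via RemovePeer.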
    for _, server := range configFuture.Configuration().Servers {
        // If we understand the new add/remove APIs and the server was added by ID, use the new remove API
        if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
            s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID)
            future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
            if err := future.Error(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
                    server.ID, err)
                return err
            }
            break
        } else if server.Address == raft.ServerAddress(addr) {
            // If not, use the old remove API
            s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address)
            future := s.raft.RemovePeer(raft.ServerAddress(addr))
            if err := future.Error(); err != nil {
                s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
                    addr, err)
                return err
            }
            break
        }
    }

    return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
    req := structs.ACLPolicyListRequest{
        QueryOptions: structs.QueryOptions{
            Region:     s.config.AuthoritativeRegion,
            AllowStale: true,
        },
    }
    limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
    s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)

START:
    for {
        select {
        case <-stopCh:
            return
        default:
            // Rate limit how often we attempt replication
            limiter.Wait(context.Background())

            // Fetch the list of policies
            var resp structs.ACLPolicyListResponse
            req.AuthToken = s.ReplicationToken()
            err := s.forwardRegion(s.config.AuthoritativeRegion,
                "ACL.ListPolicies", &req, &resp)
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
                goto ERR_WAIT
            }

            // Perform a two-way diff
            delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

            // Delete policies that should not exist
            if len(delete) > 0 {
                args := &structs.ACLPolicyDeleteRequest{
                    Names: delete,
                }
                _, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
                    goto ERR_WAIT
                }
            }

            // Fetch any outdated policies
            var fetched []*structs.ACLPolicy
            if len(update) > 0 {
                req := structs.ACLPolicySetRequest{
                    Names: update,
                    QueryOptions: structs.QueryOptions{
                        Region:        s.config.AuthoritativeRegion,
                        AuthToken:     s.ReplicationToken(),
                        AllowStale:    true,
                        MinQueryIndex: resp.Index - 1,
                    },
                }
                var reply structs.ACLPolicySetResponse
                if err := s.forwardRegion(s.config.AuthoritativeRegion,
                    "ACL.GetPolicies", &req, &reply); err != nil {
                    s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
                    goto ERR_WAIT
                }
                for _, policy := range reply.Policies {
                    fetched = append(fetched, policy)
                }
            }

            // Update local policies
            if len(fetched) > 0 {
                args := &structs.ACLPolicyUpsertRequest{
                    Policies: fetched,
                }
                _, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
                    goto ERR_WAIT
                }
            }

            // Update the minimum query index, blocks until there
            // is a change.
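            // With MinQueryIndex set, the next ACL.ListPolicies request becomes
            // a blocking query against the authoritative region.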
            req.MinQueryIndex = resp.Index
        }
    }

ERR_WAIT:
    select {
    case <-time.After(s.config.ReplicationBackoff):
        goto START
    case <-stopCh:
        return
    }
}

// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
    // Construct a set of the local and remote policies
    local := make(map[string][]byte)
    remote := make(map[string]struct{})

    // Add all the local policies
    iter, err := state.ACLPolicies(nil)
    if err != nil {
        panic("failed to iterate local policies")
    }
    for {
        raw := iter.Next()
        if raw == nil {
            break
        }
        policy := raw.(*structs.ACLPolicy)
        local[policy.Name] = policy.Hash
    }

    // Iterate over the remote policies
    for _, rp := range remoteList {
        remote[rp.Name] = struct{}{}

        // Check if the policy is missing locally
        if localHash, ok := local[rp.Name]; !ok {
            update = append(update, rp.Name)

            // Check if the policy is newer remotely and there is a hash mismatch.
        } else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
            update = append(update, rp.Name)
        }
    }

    // Check if policy should be deleted
    for lp := range local {
        if _, ok := remote[lp]; !ok {
            delete = append(delete, lp)
        }
    }
    return
}

// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
    req := structs.ACLTokenListRequest{
        GlobalOnly: true,
        QueryOptions: structs.QueryOptions{
            Region:     s.config.AuthoritativeRegion,
            AllowStale: true,
        },
    }
    limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
    s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)

START:
    for {
        select {
        case <-stopCh:
            return
        default:
            // Rate limit how often we attempt replication
            limiter.Wait(context.Background())

            // Fetch the list of tokens
            var resp structs.ACLTokenListResponse
            req.AuthToken = s.ReplicationToken()
            err := s.forwardRegion(s.config.AuthoritativeRegion,
                "ACL.ListTokens", &req, &resp)
            if err != nil {
                s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
                goto ERR_WAIT
            }

            // Perform a two-way diff
            delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

            // Delete tokens that should not exist
            if len(delete) > 0 {
                args := &structs.ACLTokenDeleteRequest{
                    AccessorIDs: delete,
                }
                _, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
                    goto ERR_WAIT
                }
            }

            // Fetch any outdated tokens.
            var fetched []*structs.ACLToken
            if len(update) > 0 {
                req := structs.ACLTokenSetRequest{
                    AccessorIDS: update,
                    QueryOptions: structs.QueryOptions{
                        Region:        s.config.AuthoritativeRegion,
                        AuthToken:     s.ReplicationToken(),
                        AllowStale:    true,
                        MinQueryIndex: resp.Index - 1,
                    },
                }
                var reply structs.ACLTokenSetResponse
                if err := s.forwardRegion(s.config.AuthoritativeRegion,
                    "ACL.GetTokens", &req, &reply); err != nil {
                    s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
                    goto ERR_WAIT
                }
                for _, token := range reply.Tokens {
                    fetched = append(fetched, token)
                }
            }

            // Update local tokens
            if len(fetched) > 0 {
                args := &structs.ACLTokenUpsertRequest{
                    Tokens: fetched,
                }
                _, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
                if err != nil {
                    s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
                    goto ERR_WAIT
                }
            }

            // Update the minimum query index, blocks until there
            // is a change.
            req.MinQueryIndex = resp.Index
        }
    }

ERR_WAIT:
    select {
    case <-time.After(s.config.ReplicationBackoff):
        goto START
    case <-stopCh:
        return
    }
}

// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
    // Construct a set of the local and remote tokens
    local := make(map[string][]byte)
    remote := make(map[string]struct{})

    // Add all the local global tokens
    iter, err := state.ACLTokensByGlobal(nil, true)
    if err != nil {
        panic("failed to iterate local tokens")
    }
    for {
        raw := iter.Next()
        if raw == nil {
            break
        }
        token := raw.(*structs.ACLToken)
        local[token.AccessorID] = token.Hash
    }

    // Iterate over the remote tokens
    for _, rp := range remoteList {
        remote[rp.AccessorID] = struct{}{}

        // Check if the token is missing locally
        if localHash, ok := local[rp.AccessorID]; !ok {
            update = append(update, rp.AccessorID)

            // Check if the token is newer remotely and there is a hash mismatch.
        } else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
            update = append(update, rp.AccessorID)
        }
    }

    // Check if local token should be deleted
    for lp := range local {
        if _, ok := remote[lp]; !ok {
            delete = append(delete, lp)
        }
    }
    return
}

// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
    state := s.fsm.State()
    _, config, err := state.AutopilotConfig()
    if err != nil {
        s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
        return nil
    }
    if config != nil {
        return config
    }

    if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
        s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
        return nil
    }

    config = s.config.AutopilotConfig
    req := structs.AutopilotSetConfigRequest{Config: *config}
    if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
        s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
        return nil
    }

    return config
}