github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/leader.go

     1  package nomad
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"math/rand"
     9  	"net"
    10  	"strings"
    11  	"sync"
    12  	"time"
    13  
    14  	"golang.org/x/time/rate"
    15  
    16  	"github.com/armon/go-metrics"
    17  	memdb "github.com/hashicorp/go-memdb"
    18  	"github.com/hashicorp/go-version"
    19  	"github.com/hashicorp/raft"
    20  	"github.com/hashicorp/serf/serf"
    21  
    22  	"github.com/hashicorp/nomad/helper/uuid"
    23  	"github.com/hashicorp/nomad/nomad/state"
    24  	"github.com/hashicorp/nomad/nomad/structs"
    25  )
    26  
    27  const (
    28  	// failedEvalUnblockInterval is the interval at which failed evaluations are
    29  	// unblocked to re-enter the scheduler. A failed evaluation occurs under
    30  	// high contention when the scheduler's plan does not make progress.
    31  	failedEvalUnblockInterval = 1 * time.Minute
    32  
    33  	// replicationRateLimit is used to rate limit how often data is replicated
    34  	// between the authoritative region and the local region
    35  	replicationRateLimit rate.Limit = 10.0
    36  
    37  	// barrierWriteTimeout is used to give Raft a chance to process a
    38  	// possible loss of leadership event if we are unable to get a barrier
    39  	// while leader.
    40  	barrierWriteTimeout = 2 * time.Minute
    41  )
    42  
    43  var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))
    44  
    45  // monitorLeadership is used to monitor if we acquire or lose our role
    46  // as the leader in the Raft cluster. There is some work the leader is
    47  // expected to do, so we must react to changes in leadership.
    48  func (s *Server) monitorLeadership() {
    49  	var weAreLeaderCh chan struct{}
    50  	var leaderLoop sync.WaitGroup
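        	// leaderLoop tracks the running leader goroutine so that we wait for it
        	// to fully exit before leadership can be re-established below.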
    51  	for {
    52  		select {
    53  		case isLeader := <-s.leaderCh:
    54  			switch {
    55  			case isLeader:
    56  				if weAreLeaderCh != nil {
    57  					s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
    58  					continue
    59  				}
    60  
    61  				weAreLeaderCh = make(chan struct{})
    62  				leaderLoop.Add(1)
    63  				go func(ch chan struct{}) {
    64  					defer leaderLoop.Done()
    65  					s.leaderLoop(ch)
    66  				}(weAreLeaderCh)
    67  				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
    68  
    69  			default:
    70  				if weAreLeaderCh == nil {
    71  					s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
    72  					continue
    73  				}
    74  
    75  				s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
    76  				close(weAreLeaderCh)
    77  				leaderLoop.Wait()
    78  				weAreLeaderCh = nil
    79  				s.logger.Printf("[INFO] nomad: cluster leadership lost")
    80  			}
    81  
    82  		case <-s.shutdownCh:
    83  			return
    84  		}
    85  	}
    86  }
    87  
    88  // leaderLoop runs for as long as we are the leader, performing various
    89  // maintenance activities
    90  func (s *Server) leaderLoop(stopCh chan struct{}) {
    91  	var reconcileCh chan serf.Member
    92  	establishedLeader := false
    93  
    94  RECONCILE:
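        	// reconcileCh is reset to nil on each pass; a receive from a nil channel
        	// blocks forever, so member reconciliation stays disabled until the
        	// initial reconcile below succeeds.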
    95  	// Setup a reconciliation timer
    96  	reconcileCh = nil
    97  	interval := time.After(s.config.ReconcileInterval)
    98  
    99  	// Apply a raft barrier to ensure our FSM is caught up
   100  	start := time.Now()
   101  	barrier := s.raft.Barrier(barrierWriteTimeout)
   102  	if err := barrier.Error(); err != nil {
   103  		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
   104  		goto WAIT
   105  	}
   106  	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
   107  
   108  	// Check if we need to handle initial leadership actions
   109  	if !establishedLeader {
   110  		if err := s.establishLeadership(stopCh); err != nil {
   111  			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)
   112  
   113  			// Immediately revoke leadership since we didn't successfully
   114  			// establish leadership.
   115  			if err := s.revokeLeadership(); err != nil {
   116  				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
   117  			}
   118  
   119  			goto WAIT
   120  		}
   121  
   122  		establishedLeader = true
   123  		defer func() {
   124  			if err := s.revokeLeadership(); err != nil {
   125  				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
   126  			}
   127  		}()
   128  	}
   129  
   130  	// Reconcile any missing data
   131  	if err := s.reconcile(); err != nil {
   132  		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
   133  		goto WAIT
   134  	}
   135  
   136  	// Initial reconcile worked, now we can process the channel
   137  	// updates
   138  	reconcileCh = s.reconcileCh
   139  
   140  	// Poll the stop channel to give it priority so we don't waste time
   141  	// trying to perform the other operations if we have been asked to shut
   142  	// down.
   143  	select {
   144  	case <-stopCh:
   145  		return
   146  	default:
   147  	}
   148  
   149  WAIT:
   150  	// Wait until leadership is lost
   151  	for {
   152  		select {
   153  		case <-stopCh:
   154  			return
   155  		case <-s.shutdownCh:
   156  			return
   157  		case <-interval:
   158  			goto RECONCILE
   159  		case member := <-reconcileCh:
   160  			s.reconcileMember(member)
   161  		}
   162  	}
   163  }
   164  
   165  // establishLeadership is invoked once we become leader and are able
   166  // to invoke an initial barrier. The barrier is used to ensure any
   167  // previously inflight transactions have been committed and that our
   168  // state is up-to-date.
   169  func (s *Server) establishLeadership(stopCh chan struct{}) error {
   170  	defer metrics.MeasureSince([]string{"nomad", "leader", "establish_leadership"}, time.Now())
   171  
   172  	// Generate a leader ACL token. This will allow the leader to issue work
   173  	// that requires a valid ACL token.
   174  	s.setLeaderAcl(uuid.Generate())
   175  
   176  	// Pause a portion of the scheduling workers to free cores for use in the
   177  	// plan queue and evaluation broker
   178  	if numWorkers := len(s.workers); numWorkers > 1 {
   179  		// Disabling 3/4 of the workers frees CPU for raft and the
   180  		// plan applier which uses 1/2 the cores.
   181  		for i := 0; i < (3 * numWorkers / 4); i++ {
   182  			s.workers[i].SetPause(true)
   183  		}
   184  	}
   185  
   186  	// Initialize and start the autopilot routine
   187  	s.getOrCreateAutopilotConfig()
   188  	s.autopilot.Start()
   189  
   190  	// Enable the plan queue, since we are now the leader
   191  	s.planQueue.SetEnabled(true)
   192  
   193  	// Start the plan evaluator
   194  	go s.planApply()
   195  
   196  	// Enable the eval broker, since we are now the leader
   197  	s.evalBroker.SetEnabled(true)
   198  
   199  	// Enable the blocked eval tracker, since we are now the leader
   200  	s.blockedEvals.SetEnabled(true)
   201  	s.blockedEvals.SetTimetable(s.fsm.TimeTable())
   202  
   203  	// Enable the deployment watcher, since we are now the leader
   204  	s.deploymentWatcher.SetEnabled(true, s.State())
   205  
   206  	// Enable the NodeDrainer
   207  	s.nodeDrainer.SetEnabled(true, s.State())
   208  
   209  	// Restore the eval broker state
   210  	if err := s.restoreEvals(); err != nil {
   211  		return err
   212  	}
   213  
   214  	// Activate the vault client
   215  	s.vault.SetActive(true)
   216  	if err := s.restoreRevokingAccessors(); err != nil {
   217  		return err
   218  	}
   219  
   220  	// Enable the periodic dispatcher, since we are now the leader.
   221  	s.periodicDispatcher.SetEnabled(true)
   222  
   223  	// Restore the periodic dispatcher state
   224  	if err := s.restorePeriodicDispatcher(); err != nil {
   225  		return err
   226  	}
   227  
   228  	// Schedule periodic jobs
   229  	go s.schedulePeriodic(stopCh)
   230  
   231  	// Reap any failed evaluations
   232  	go s.reapFailedEvaluations(stopCh)
   233  
   234  	// Reap any duplicate blocked evaluations
   235  	go s.reapDupBlockedEvaluations(stopCh)
   236  
   237  	// Periodically unblock failed evaluations
   238  	go s.periodicUnblockFailedEvals(stopCh)
   239  
   240  	// Periodically publish job summary metrics
   241  	go s.publishJobSummaryMetrics(stopCh)
   242  
   243  	// Setup the heartbeat timers. This is done both when starting up and when
   244  	// a leader fail over happens. Since the timers are maintained by the leader
   245  	// node, effectively this means all the timers are renewed at the time of failover.
   246  	// The TTL contract is that a node will not be expired before its TTL,
   247  	// so expiring it later is allowable.
   248  	//
   249  	// This MUST be done after the initial barrier to ensure the latest Nodes
   250  	// are available to be initialized. Otherwise initialization may use stale
   251  	// data.
   252  	if err := s.initializeHeartbeatTimers(); err != nil {
   253  		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
   254  		return err
   255  	}
   256  
   257  	// COMPAT 0.4 - 0.4.1
   258  	// Reconcile the summaries of the registered jobs. We only reconcile
   259  	// summaries on servers running 0.4.1 or later: summaries did not exist
   260  	// in 0.4, so after upgrading to 0.4.1 the stored summaries may be
   261  	// missing or incorrect.
   262  	if err := s.reconcileJobSummaries(); err != nil {
   263  		return fmt.Errorf("unable to reconcile job summaries: %v", err)
   264  	}
   265  
   266  	// Start replication of ACLs and Policies if they are enabled,
   267  	// and we are not the authoritative region.
   268  	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
   269  		go s.replicateACLPolicies(stopCh)
   270  		go s.replicateACLTokens(stopCh)
   271  	}
   272  
   273  	// Setup any enterprise systems required.
   274  	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
   275  		return err
   276  	}
   277  
   278  	return nil
   279  }
   280  
   281  // restoreEvals is used to restore pending evaluations into the eval broker and
   282  // blocked evaluations into the blocked eval tracker. The broker and blocked
   283  // eval tracker are maintained only by the leader, so they must be restored any
   284  // time a leadership transition takes place.
   285  func (s *Server) restoreEvals() error {
   286  	// Get an iterator over every evaluation
   287  	ws := memdb.NewWatchSet()
   288  	iter, err := s.fsm.State().Evals(ws)
   289  	if err != nil {
   290  		return fmt.Errorf("failed to get evaluations: %v", err)
   291  	}
   292  
   293  	for {
   294  		raw := iter.Next()
   295  		if raw == nil {
   296  			break
   297  		}
   298  		eval := raw.(*structs.Evaluation)
   299  
   300  		if eval.ShouldEnqueue() {
   301  			s.evalBroker.Enqueue(eval)
   302  		} else if eval.ShouldBlock() {
   303  			s.blockedEvals.Block(eval)
   304  		}
   305  	}
   306  	return nil
   307  }
   308  
   309  // restoreRevokingAccessors is used to restore Vault accessors that should be
   310  // revoked.
   311  func (s *Server) restoreRevokingAccessors() error {
   312  	// An accessor should be revoked if its allocation or node is terminal
   313  	ws := memdb.NewWatchSet()
   314  	state := s.fsm.State()
   315  	iter, err := state.VaultAccessors(ws)
   316  	if err != nil {
   317  		return fmt.Errorf("failed to get vault accessors: %v", err)
   318  	}
   319  
   320  	var revoke []*structs.VaultAccessor
   321  	for {
   322  		raw := iter.Next()
   323  		if raw == nil {
   324  			break
   325  		}
   326  
   327  		va := raw.(*structs.VaultAccessor)
   328  
   329  		// Check the allocation
   330  		alloc, err := state.AllocByID(ws, va.AllocID)
   331  		if err != nil {
   332  			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
   333  		}
   334  		if alloc == nil || alloc.Terminated() {
   335  			// No longer running and should be revoked
   336  			revoke = append(revoke, va)
   337  			continue
   338  		}
   339  
   340  		// Check the node
   341  		node, err := state.NodeByID(ws, va.NodeID)
   342  		if err != nil {
   343  			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
   344  		}
   345  		if node == nil || node.TerminalStatus() {
   346  			// Node is terminal so any accessor from it should be revoked
   347  			revoke = append(revoke, va)
   348  			continue
   349  		}
   350  	}
   351  
   352  	if len(revoke) != 0 {
   353  		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
   354  			return fmt.Errorf("failed to revoke tokens: %v", err)
   355  		}
   356  	}
   357  
   358  	return nil
   359  }
   360  
   361  // restorePeriodicDispatcher is used to restore all periodic jobs into the
   362  // periodic dispatcher. It also determines whether any periodic launches were
   363  // missed during the leadership transition and force runs those jobs. The periodic
   364  // dispatcher is maintained only by the leader, so it must be restored anytime a
   365  // leadership transition takes place.
   366  func (s *Server) restorePeriodicDispatcher() error {
   367  	ws := memdb.NewWatchSet()
   368  	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
   369  	if err != nil {
   370  		return fmt.Errorf("failed to get periodic jobs: %v", err)
   371  	}
   372  
   373  	now := time.Now()
   374  	for i := iter.Next(); i != nil; i = iter.Next() {
   375  		job := i.(*structs.Job)
   376  
   377  		// We skip adding parameterized jobs because they themselves aren't
   378  		// tracked, only the dispatched children are.
   379  		if job.IsParameterized() {
   380  			continue
   381  		}
   382  
   383  		if err := s.periodicDispatcher.Add(job); err != nil {
   384  			s.logger.Printf("[ERR] nomad.periodic: %v", err)
   385  			continue
   386  		}
   387  
   388  		// We do not need to force run the job since it isn't active.
   389  		if !job.IsPeriodicActive() {
   390  			continue
   391  		}
   392  
   393  		// If the periodic job has never been launched before, launch will hold
   394  		// the time the periodic job was added. Otherwise it has the last launch
   395  		// time of the periodic job.
   396  		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
   397  		if err != nil {
   398  			return fmt.Errorf("failed to get periodic launch time: %v", err)
   399  		}
   400  		if launch == nil {
   401  			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
   402  				job.ID, job.Namespace)
   403  		}
   404  
   405  		// nextLaunch is the next launch that should occur.
   406  		nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
   407  		if err != nil {
   408  			s.logger.Printf("[ERR] nomad.periodic: failed to determine next periodic launch for job %s: %v", job.NamespacedID(), err)
   409  			continue
   410  		}
   411  
   412  	// We skip force launching the job if there should be no next launch
   413  		// (the zero case) or if the next launch time is in the future. If it is
   414  		// in the future, it will be handled by the periodic dispatcher.
   415  		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
   416  			continue
   417  		}
   418  
   419  		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
   420  			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
   421  			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
   422  			return errors.New(msg)
   423  		}
   424  		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
   425  			" run during leadership establishment", job.ID)
   426  	}
   427  
   428  	return nil
   429  }
   430  
   431  // schedulePeriodic is used to periodically enqueue core (GC) job evaluations while we are leader
   432  func (s *Server) schedulePeriodic(stopCh chan struct{}) {
   433  	evalGC := time.NewTicker(s.config.EvalGCInterval)
   434  	defer evalGC.Stop()
   435  	nodeGC := time.NewTicker(s.config.NodeGCInterval)
   436  	defer nodeGC.Stop()
   437  	jobGC := time.NewTicker(s.config.JobGCInterval)
   438  	defer jobGC.Stop()
   439  	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
   440  	defer deploymentGC.Stop()
   441  
   442  	// getLatest grabs the latest index from the state store. It returns true if
   443  	// the index was retrieved successfully.
   444  	getLatest := func() (uint64, bool) {
   445  		snapshotIndex, err := s.fsm.State().LatestIndex()
   446  		if err != nil {
   447  			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
   448  			return 0, false
   449  		}
   450  
   451  		return snapshotIndex, true
   452  	}
   453  
   454  	for {
   455  
   456  		select {
   457  		case <-evalGC.C:
   458  			if index, ok := getLatest(); ok {
   459  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
   460  			}
   461  		case <-nodeGC.C:
   462  			if index, ok := getLatest(); ok {
   463  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
   464  			}
   465  		case <-jobGC.C:
   466  			if index, ok := getLatest(); ok {
   467  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
   468  			}
   469  		case <-deploymentGC.C:
   470  			if index, ok := getLatest(); ok {
   471  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
   472  			}
   473  		case <-stopCh:
   474  			return
   475  		}
   476  	}
   477  }
   478  
   479  // coreJobEval returns an evaluation for a core job
   480  func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
   481  	return &structs.Evaluation{
   482  		ID:          uuid.Generate(),
   483  		Namespace:   "-",
   484  		Priority:    structs.CoreJobPriority,
   485  		Type:        structs.JobTypeCore,
   486  		TriggeredBy: structs.EvalTriggerScheduled,
   487  		JobID:       job,
   488  		LeaderACL:   s.getLeaderAcl(),
   489  		Status:      structs.EvalStatusPending,
   490  		ModifyIndex: modifyIndex,
   491  	}
   492  }
   493  
   494  // reapFailedEvaluations is used to reap evaluations that
   495  // have reached their delivery limit and should be failed
   496  func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
   497  	for {
   498  		select {
   499  		case <-stopCh:
   500  			return
   501  		default:
   502  			// Scan for a failed evaluation
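        			// Dequeue waits up to a second; if nothing is returned we simply
        			// loop again, which lets the stop channel be rechecked.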
   503  			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
   504  			if err != nil {
   505  				return
   506  			}
   507  			if eval == nil {
   508  				continue
   509  			}
   510  
   511  			// Update the status to failed
   512  			updateEval := eval.Copy()
   513  			updateEval.Status = structs.EvalStatusFailed
   514  			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
   515  			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)
   516  
   517  			// Create a follow-up evaluation that will be used to retry the
   518  			// scheduling for the job after the cluster is hopefully more stable
   519  			// due to the fairly large backoff.
   520  			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
   521  				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
   522  			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
   523  
   524  			// Update via Raft
   525  			req := structs.EvalUpdateRequest{
   526  				Evals: []*structs.Evaluation{updateEval, followupEval},
   527  			}
   528  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   529  				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
   530  				continue
   531  			}
   532  
   533  			// Ack completion
   534  			s.evalBroker.Ack(eval.ID, token)
   535  		}
   536  	}
   537  }
   538  
   539  // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations:
   540  // the duplicates are marked as cancelled.
   541  func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
   542  	for {
   543  		select {
   544  		case <-stopCh:
   545  			return
   546  		default:
   547  			// Scan for duplicate blocked evals.
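        			// GetDuplicates blocks for up to the given timeout; a nil result
        			// sends us back around the loop so the stop channel is rechecked.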
   548  			dups := s.blockedEvals.GetDuplicates(time.Second)
   549  			if dups == nil {
   550  				continue
   551  			}
   552  
   553  			cancel := make([]*structs.Evaluation, len(dups))
   554  			for i, dup := range dups {
   555  				// Update the status to cancelled
   556  				newEval := dup.Copy()
   557  				newEval.Status = structs.EvalStatusCancelled
   558  				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
   559  				cancel[i] = newEval
   560  			}
   561  
   562  			// Update via Raft
   563  			req := structs.EvalUpdateRequest{
   564  				Evals: cancel,
   565  			}
   566  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   567  				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
   568  				continue
   569  			}
   570  		}
   571  	}
   572  }
   573  
   574  // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
   575  func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
   576  	ticker := time.NewTicker(failedEvalUnblockInterval)
   577  	defer ticker.Stop()
   578  	for {
   579  		select {
   580  		case <-stopCh:
   581  			return
   582  		case <-ticker.C:
   583  			// Unblock the failed evaluations
   584  			s.blockedEvals.UnblockFailed()
   585  		}
   586  	}
   587  }
   588  
   589  // publishJobSummaryMetrics publishes the job summaries as metrics
   590  func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
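        	// The zero-duration timer fires immediately so metrics are published as
        	// soon as we become leader; each pass then resets it to the interval.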
   591  	timer := time.NewTimer(0)
   592  	defer timer.Stop()
   593  
   594  	for {
   595  		select {
   596  		case <-stopCh:
   597  			return
   598  		case <-timer.C:
   599  			timer.Reset(s.config.StatsCollectionInterval)
   600  			state, err := s.State().Snapshot()
   601  			if err != nil {
   602  				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
   603  				continue
   604  			}
   605  			ws := memdb.NewWatchSet()
   606  			iter, err := state.JobSummaries(ws)
   607  			if err != nil {
   608  				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
   609  				continue
   610  			}
   611  
   612  			for {
   613  				raw := iter.Next()
   614  				if raw == nil {
   615  					break
   616  				}
   617  				summary := raw.(*structs.JobSummary)
   618  				for name, tgSummary := range summary.Summary {
   619  					if !s.config.DisableTaggedMetrics {
   620  						labels := []metrics.Label{
   621  							{
   622  								Name:  "job",
   623  								Value: summary.JobID,
   624  							},
   625  							{
   626  								Name:  "task_group",
   627  								Value: name,
   628  							},
   629  						}
   630  
   631  						if strings.Contains(summary.JobID, "/dispatch-") {
   632  							jobInfo := strings.Split(summary.JobID, "/dispatch-")
   633  							labels = append(labels, metrics.Label{
   634  								Name:  "parent_id",
   635  								Value: jobInfo[0],
   636  							}, metrics.Label{
   637  								Name:  "dispatch_id",
   638  								Value: jobInfo[1],
   639  							})
   640  						}
   641  
   642  						if strings.Contains(summary.JobID, "/periodic-") {
   643  							jobInfo := strings.Split(summary.JobID, "/periodic-")
   644  							labels = append(labels, metrics.Label{
   645  								Name:  "parent_id",
   646  								Value: jobInfo[0],
   647  							}, metrics.Label{
   648  								Name:  "periodic_id",
   649  								Value: jobInfo[1],
   650  							})
   651  						}
   652  
   653  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
   654  							float32(tgSummary.Queued), labels)
   655  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
   656  							float32(tgSummary.Complete), labels)
   657  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
   658  							float32(tgSummary.Failed), labels)
   659  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
   660  							float32(tgSummary.Running), labels)
   661  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
   662  							float32(tgSummary.Starting), labels)
   663  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
   664  							float32(tgSummary.Lost), labels)
   665  					}
   666  					if s.config.BackwardsCompatibleMetrics {
   667  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
   668  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
   669  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
   670  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
   671  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
   672  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
   673  					}
   674  				}
   675  			}
   676  		}
   677  	}
   678  }
   679  
   680  // revokeLeadership is invoked once we step down as leader.
   681  // This is used to clean up any state that may be specific to a leader.
   682  func (s *Server) revokeLeadership() error {
   683  	defer metrics.MeasureSince([]string{"nomad", "leader", "revoke_leadership"}, time.Now())
   684  
   685  	// Clear the leader token since we are no longer the leader.
   686  	s.setLeaderAcl("")
   687  
   688  	// Disable autopilot
   689  	s.autopilot.Stop()
   690  
   691  	// Disable the plan queue, since we are no longer leader
   692  	s.planQueue.SetEnabled(false)
   693  
   694  	// Disable the eval broker, since it is only useful as a leader
   695  	s.evalBroker.SetEnabled(false)
   696  
   697  	// Disable the blocked eval tracker, since it is only useful as a leader
   698  	s.blockedEvals.SetEnabled(false)
   699  
   700  	// Disable the periodic dispatcher, since it is only useful as a leader
   701  	s.periodicDispatcher.SetEnabled(false)
   702  
   703  	// Disable the Vault client as it is only useful as a leader.
   704  	s.vault.SetActive(false)
   705  
   706  	// Disable the deployment watcher as it is only useful as a leader.
   707  	s.deploymentWatcher.SetEnabled(false, nil)
   708  
   709  	// Disable the node drainer
   710  	s.nodeDrainer.SetEnabled(false, nil)
   711  
   712  	// Disable any enterprise systems required.
   713  	if err := s.revokeEnterpriseLeadership(); err != nil {
   714  		return err
   715  	}
   716  
   717  	// Clear the heartbeat timers on either shutdown or step down,
   718  	// since we are no longer responsible for TTL expirations.
   719  	if err := s.clearAllHeartbeatTimers(); err != nil {
   720  		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
   721  		return err
   722  	}
   723  
   724  	// Unpause the workers paused in establishLeadership (match its 3/4 bound)
   725  	if numWorkers := len(s.workers); numWorkers > 1 {
   726  		for i := 0; i < (3 * numWorkers / 4); i++ {
   727  			s.workers[i].SetPause(false)
   728  		}
   729  	}
   730  	return nil
   731  }
   732  
   733  // reconcile is used to reconcile the differences between Serf
   734  // membership and what is reflected in our strongly consistent store.
   735  func (s *Server) reconcile() error {
   736  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
   737  	members := s.serf.Members()
   738  	for _, member := range members {
   739  		if err := s.reconcileMember(member); err != nil {
   740  			return err
   741  		}
   742  	}
   743  	return nil
   744  }
   745  
   746  // reconcileMember is used to do an async reconcile of a single serf member
   747  // reconcileMember is used to reconcile a single serf member with the Raft configuration
   748  	// Check if this is a member we should handle
   749  	valid, parts := isNomadServer(member)
   750  	if !valid || parts.Region != s.config.Region {
   751  		return nil
   752  	}
   753  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
   754  
   755  	var err error
   756  	switch member.Status {
   757  	case serf.StatusAlive:
   758  		err = s.addRaftPeer(member, parts)
   759  	case serf.StatusLeft, StatusReap:
   760  		err = s.removeRaftPeer(member, parts)
   761  	}
   762  	if err != nil {
   763  		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
   764  			member, err)
   765  		return err
   766  	}
   767  	return nil
   768  }
   769  
   770  // reconcileJobSummaries reconciles the summaries of all the jobs registered in
   771  // the system
   772  // COMPAT 0.4 -> 0.4.1
   773  func (s *Server) reconcileJobSummaries() error {
   774  	index, err := s.fsm.state.LatestIndex()
   775  	if err != nil {
   776  		return fmt.Errorf("unable to read latest index: %v", err)
   777  	}
   778  	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)
   779  
   780  	args := &structs.GenericResponse{}
   781  	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
   782  	if _, _, err = s.raftApply(msg, args); err != nil {
   783  		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
   784  	}
   785  
   786  	return nil
   787  }
   788  
   789  // addRaftPeer is used to add a new Raft peer when a Nomad server joins
   790  func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
   791  	// Check for possibility of multiple bootstrap nodes
   792  	members := s.serf.Members()
   793  	if parts.Bootstrap {
   794  		for _, member := range members {
   795  			valid, p := isNomadServer(member)
   796  			if valid && member.Name != m.Name && p.Bootstrap {
   797  				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
   798  				return nil
   799  			}
   800  		}
   801  	}
   802  
   803  	// Processing ourselves could result in trying to remove ourselves to
   804  	// fix up our address, which would make us step down. This is only
   805  	// safe to attempt if there are multiple servers available.
   806  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
   807  	configFuture := s.raft.GetConfiguration()
   808  	if err := configFuture.Error(); err != nil {
   809  		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
   810  		return err
   811  	}
   812  
   813  	if m.Name == s.config.NodeName {
   814  		if l := len(configFuture.Configuration().Servers); l < 3 {
   815  			s.logger.Printf("[DEBUG] nomad: skipping self join check for %q since the cluster is too small", m.Name)
   816  			return nil
   817  		}
   818  	}
   819  
   820  	// See if it's already in the configuration. It's harmless to re-add it
   821  	// but we want to avoid doing that if possible to prevent useless Raft
   822  	// log entries. If the address is the same but the ID changed, remove the
   823  	// old server before adding the new one.
   824  	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
   825  	if err != nil {
   826  		return err
   827  	}
   828  	for _, server := range configFuture.Configuration().Servers {
   829  		// No-op if the raft version is too low
   830  		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
   831  			return nil
   832  		}
   833  
   834  		// If the address or ID matches an existing server, see if we need to remove the old one first
   835  		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
   836  			// Exit with no-op if this is being called on an existing server and both the ID and address match
   837  			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
   838  				return nil
   839  			}
   840  			future := s.raft.RemoveServer(server.ID, 0, 0)
   841  			if server.Address == raft.ServerAddress(addr) {
   842  				if err := future.Error(); err != nil {
   843  					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
   844  				}
   845  				s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address)
   846  			} else {
   847  				if err := future.Error(); err != nil {
   848  					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
   849  				}
   850  				s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID)
   851  			}
   852  		}
   853  	}
   854  
   855  	// Attempt to add as a peer
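        	// With Raft protocol version 3 or higher the server is added as a
        	// non-voter, allowing autopilot to promote it once it is healthy.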
   856  	switch {
   857  	case minRaftProtocol >= 3:
   858  		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
   859  		if err := addFuture.Error(); err != nil {
   860  			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
   861  			return err
   862  		}
   863  	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
   864  		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
   865  		if err := addFuture.Error(); err != nil {
   866  			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
   867  			return err
   868  		}
   869  	default:
   870  		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
   871  		if err := addFuture.Error(); err != nil {
   872  			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
   873  			return err
   874  		}
   875  	}
   876  
   877  	return nil
   878  }
   879  
   880  // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
   881  // or is reaped
   882  func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
   883  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
   884  
   885  	// See if it's already in the configuration. It's harmless to re-remove it
   886  	// but we want to avoid doing that if possible to prevent useless Raft
   887  	// log entries.
   888  	configFuture := s.raft.GetConfiguration()
   889  	if err := configFuture.Error(); err != nil {
   890  		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
   891  		return err
   892  	}
   893  
   894  	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
   895  	if err != nil {
   896  		return err
   897  	}
   898  
   899  	// Pick which remove API to use based on how the server was added.
   900  	for _, server := range configFuture.Configuration().Servers {
   901  		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
   902  		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
   903  			s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID)
   904  			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
   905  			if err := future.Error(); err != nil {
   906  				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
   907  					server.ID, err)
   908  				return err
   909  			}
   910  			break
   911  		} else if server.Address == raft.ServerAddress(addr) {
   912  			// If not, use the old remove API
   913  			s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address)
   914  			future := s.raft.RemovePeer(raft.ServerAddress(addr))
   915  			if err := future.Error(); err != nil {
   916  				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
   917  					addr, err)
   918  				return err
   919  			}
   920  			break
   921  		}
   922  	}
   923  
   924  	return nil
   925  }
   926  
   927  // replicateACLPolicies is used to replicate ACL policies from
   928  // the authoritative region to this region.
   929  func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
   930  	req := structs.ACLPolicyListRequest{
   931  		QueryOptions: structs.QueryOptions{
   932  			Region:     s.config.AuthoritativeRegion,
   933  			AllowStale: true,
   934  		},
   935  	}
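        	// Allow roughly replicationRateLimit replication rounds per second,
        	// with an equal burst size.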
   936  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
   937  	s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)
   938  
   939  START:
   940  	for {
   941  		select {
   942  		case <-stopCh:
   943  			return
   944  		default:
   945  			// Rate limit how often we attempt replication
   946  			limiter.Wait(context.Background())
   947  
   948  			// Fetch the list of policies
   949  			var resp structs.ACLPolicyListResponse
   950  			req.AuthToken = s.ReplicationToken()
   951  			err := s.forwardRegion(s.config.AuthoritativeRegion,
   952  				"ACL.ListPolicies", &req, &resp)
   953  			if err != nil {
   954  				s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
   955  				goto ERR_WAIT
   956  			}
   957  
   958  			// Perform a two-way diff
   959  			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)
   960  
   961  			// Delete policies that should not exist
   962  			if len(delete) > 0 {
   963  				args := &structs.ACLPolicyDeleteRequest{
   964  					Names: delete,
   965  				}
   966  				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
   967  				if err != nil {
   968  					s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
   969  					goto ERR_WAIT
   970  				}
   971  			}
   972  
   973  			// Fetch any outdated policies
   974  			var fetched []*structs.ACLPolicy
   975  			if len(update) > 0 {
   976  				req := structs.ACLPolicySetRequest{
   977  					Names: update,
   978  					QueryOptions: structs.QueryOptions{
   979  						Region:        s.config.AuthoritativeRegion,
   980  						AuthToken:     s.ReplicationToken(),
   981  						AllowStale:    true,
   982  						MinQueryIndex: resp.Index - 1,
   983  					},
   984  				}
   985  				var reply structs.ACLPolicySetResponse
   986  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
   987  					"ACL.GetPolicies", &req, &reply); err != nil {
   988  					s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
   989  					goto ERR_WAIT
   990  				}
   991  				for _, policy := range reply.Policies {
   992  					fetched = append(fetched, policy)
   993  				}
   994  			}
   995  
   996  			// Update local policies
   997  			if len(fetched) > 0 {
   998  				args := &structs.ACLPolicyUpsertRequest{
   999  					Policies: fetched,
  1000  				}
  1001  				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
  1002  				if err != nil {
  1003  					s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
  1004  					goto ERR_WAIT
  1005  				}
  1006  			}
  1007  
  1008  			// Update the minimum query index; the next ListPolicies call
  1009  			// blocks until there is a change past this index.
  1010  			req.MinQueryIndex = resp.Index
  1011  		}
  1012  	}
  1013  
  1014  ERR_WAIT:
  1015  	select {
  1016  	case <-time.After(s.config.ReplicationBackoff):
  1017  		goto START
  1018  	case <-stopCh:
  1019  		return
  1020  	}
  1021  }
  1022  
  1023  // diffACLPolicies is used to perform a two-way diff between the local
  1024  // policies and the remote policies to determine which policies need to
  1025  // be deleted or updated.
  1026  func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
  1027  	// Construct a set of the local and remote policies
  1028  	local := make(map[string][]byte)
  1029  	remote := make(map[string]struct{})
  1030  
  1031  	// Add all the local policies
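        	// A nil watch set is passed since this read is not watched for changes.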
  1032  	iter, err := state.ACLPolicies(nil)
  1033  	if err != nil {
  1034  		panic("failed to iterate local policies")
  1035  	}
  1036  	for {
  1037  		raw := iter.Next()
  1038  		if raw == nil {
  1039  			break
  1040  		}
  1041  		policy := raw.(*structs.ACLPolicy)
  1042  		local[policy.Name] = policy.Hash
  1043  	}
  1044  
  1045  	// Iterate over the remote policies
  1046  	for _, rp := range remoteList {
  1047  		remote[rp.Name] = struct{}{}
  1048  
  1049  		// Check if the policy is missing locally
  1050  		if localHash, ok := local[rp.Name]; !ok {
  1051  			update = append(update, rp.Name)
  1052  
  1053  			// Check if the policy is newer remotely and there is a hash mismatch.
  1054  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1055  			update = append(update, rp.Name)
  1056  		}
  1057  	}
  1058  
  1059  	// Check if policy should be deleted
  1060  	for lp := range local {
  1061  		if _, ok := remote[lp]; !ok {
  1062  			delete = append(delete, lp)
  1063  		}
  1064  	}
  1065  	return
  1066  }
  1067  
  1068  // replicateACLTokens is used to replicate global ACL tokens from
  1069  // the authoritative region to this region.
  1070  func (s *Server) replicateACLTokens(stopCh chan struct{}) {
  1071  	req := structs.ACLTokenListRequest{
  1072  		GlobalOnly: true,
  1073  		QueryOptions: structs.QueryOptions{
  1074  			Region:     s.config.AuthoritativeRegion,
  1075  			AllowStale: true,
  1076  		},
  1077  	}
  1078  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
  1079  	s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)
  1080  
  1081  START:
  1082  	for {
  1083  		select {
  1084  		case <-stopCh:
  1085  			return
  1086  		default:
  1087  			// Rate limit how often we attempt replication
  1088  			limiter.Wait(context.Background())
  1089  
  1090  			// Fetch the list of tokens
  1091  			var resp structs.ACLTokenListResponse
  1092  			req.AuthToken = s.ReplicationToken()
  1093  			err := s.forwardRegion(s.config.AuthoritativeRegion,
  1094  				"ACL.ListTokens", &req, &resp)
  1095  			if err != nil {
  1096  				s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
  1097  				goto ERR_WAIT
  1098  			}
  1099  
  1100  			// Perform a two-way diff
  1101  			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)
  1102  
  1103  			// Delete tokens that should not exist
  1104  			if len(delete) > 0 {
  1105  				args := &structs.ACLTokenDeleteRequest{
  1106  					AccessorIDs: delete,
  1107  				}
  1108  				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
  1109  				if err != nil {
  1110  					s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
  1111  					goto ERR_WAIT
  1112  				}
  1113  			}
  1114  
  1115  			// Fetch any outdated tokens.
  1116  			var fetched []*structs.ACLToken
  1117  			if len(update) > 0 {
  1118  				req := structs.ACLTokenSetRequest{
  1119  					AccessorIDS: update,
  1120  					QueryOptions: structs.QueryOptions{
  1121  						Region:        s.config.AuthoritativeRegion,
  1122  						AuthToken:     s.ReplicationToken(),
  1123  						AllowStale:    true,
  1124  						MinQueryIndex: resp.Index - 1,
  1125  					},
  1126  				}
  1127  				var reply structs.ACLTokenSetResponse
  1128  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
  1129  					"ACL.GetTokens", &req, &reply); err != nil {
  1130  					s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
  1131  					goto ERR_WAIT
  1132  				}
  1133  				for _, token := range reply.Tokens {
  1134  					fetched = append(fetched, token)
  1135  				}
  1136  			}
  1137  
  1138  			// Update local tokens
  1139  			if len(fetched) > 0 {
  1140  				args := &structs.ACLTokenUpsertRequest{
  1141  					Tokens: fetched,
  1142  				}
  1143  				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
  1144  				if err != nil {
  1145  					s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
  1146  					goto ERR_WAIT
  1147  				}
  1148  			}
  1149  
  1150  			// Update the minimum query index; the next ListTokens call
  1151  			// blocks until there is a change past this index.
  1152  			req.MinQueryIndex = resp.Index
  1153  		}
  1154  	}
  1155  
  1156  ERR_WAIT:
  1157  	select {
  1158  	case <-time.After(s.config.ReplicationBackoff):
  1159  		goto START
  1160  	case <-stopCh:
  1161  		return
  1162  	}
  1163  }
  1164  
  1165  // diffACLTokens is used to perform a two-way diff between the local
  1166  // tokens and the remote tokens to determine which tokens need to
  1167  // be deleted or updated.
  1168  func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
  1169  	// Construct a set of the local and remote tokens
  1170  	local := make(map[string][]byte)
  1171  	remote := make(map[string]struct{})
  1172  
  1173  	// Add all the local global tokens
  1174  	iter, err := state.ACLTokensByGlobal(nil, true)
  1175  	if err != nil {
  1176  		panic("failed to iterate local tokens")
  1177  	}
  1178  	for {
  1179  		raw := iter.Next()
  1180  		if raw == nil {
  1181  			break
  1182  		}
  1183  		token := raw.(*structs.ACLToken)
  1184  		local[token.AccessorID] = token.Hash
  1185  	}
  1186  
  1187  	// Iterate over the remote tokens
  1188  	for _, rp := range remoteList {
  1189  		remote[rp.AccessorID] = struct{}{}
  1190  
  1191  		// Check if the token is missing locally
  1192  		if localHash, ok := local[rp.AccessorID]; !ok {
  1193  			update = append(update, rp.AccessorID)
  1194  
  1195  			// Check if the token is newer remotely and there is a hash mismatch.
  1196  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1197  			update = append(update, rp.AccessorID)
  1198  		}
  1199  	}
  1200  
  1201  	// Check if local token should be deleted
  1202  	for lp := range local {
  1203  		if _, ok := remote[lp]; !ok {
  1204  			delete = append(delete, lp)
  1205  		}
  1206  	}
  1207  	return
  1208  }
  1209  
  1210  // getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
  1211  func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
  1212  	state := s.fsm.State()
  1213  	_, config, err := state.AutopilotConfig()
  1214  	if err != nil {
  1215  		s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
  1216  		return nil
  1217  	}
  1218  	if config != nil {
  1219  		return config
  1220  	}
  1221  
  1222  	if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
  1223  		s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
  1224  		return nil
  1225  	}
  1226  
  1227  	config = s.config.AutopilotConfig
  1228  	req := structs.AutopilotSetConfigRequest{Config: *config}
  1229  	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
  1230  		s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
  1231  		return nil
  1232  	}
  1233  
  1234  	return config
  1235  }