github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/nomad/leader.go

package nomad

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"sync"
	"time"

	"golang.org/x/time/rate"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute

	// replicationRateLimit is used to rate limit how often data is replicated
	// between the authoritative region and the local region
	replicationRateLimit rate.Limit = 10.0

	// barrierWriteTimeout is used to give Raft a chance to process a
	// possible loss of leadership event if we are unable to get a barrier
	// while leader.
	barrierWriteTimeout = 2 * time.Minute
)

var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var weAreLeaderCh chan struct{}
	var leaderLoop sync.WaitGroup
	for {
		select {
		case isLeader := <-s.leaderCh:
			switch {
			case isLeader:
				if weAreLeaderCh != nil {
					s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
					continue
				}

				weAreLeaderCh = make(chan struct{})
				leaderLoop.Add(1)
				go func(ch chan struct{}) {
					defer leaderLoop.Done()
					s.leaderLoop(ch)
				}(weAreLeaderCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")

			default:
				if weAreLeaderCh == nil {
					s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
					continue
				}

				s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
				close(weAreLeaderCh)
				leaderLoop.Wait()
				weAreLeaderCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}

		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(barrierWriteTimeout)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)
			goto WAIT
		}
		establishedLeader = true
		defer func() {
			if err := s.revokeLeadership(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v",
					err)
			}
		}()
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

	// Poll the stop channel to give it priority so we don't waste time
	// trying to perform the other operations if we have been asked to shut
	// down.
	select {
	case <-stopCh:
		return
	default:
	}

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Generate a leader ACL token. This will allow the leader to issue work
	// that requires a valid ACL token.
	s.setLeaderAcl(uuid.Generate())

	// Disable workers to free half the cores for use in the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Initialize and start the autopilot routine
	s.getOrCreateAutopilotConfig()
	s.autopilot.Start()

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)
	s.blockedEvals.SetTimetable(s.fsm.TimeTable())

	// Enable the deployment watcher, since we are now the leader
	if err := s.deploymentWatcher.SetEnabled(true, s.State()); err != nil {
		return err
	}

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Periodically publish job summary metrics
	go s.publishJobSummaryMetrics(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}

	// COMPAT 0.4 - 0.4.1
	// Reconcile the summaries of the registered jobs. We only need to
	// reconcile summaries on a 0.4.1 server: summaries are not present in
	// 0.4, so they might be incorrect after upgrading to 0.4.1.
	if err := s.reconcileJobSummaries(); err != nil {
		return fmt.Errorf("unable to reconcile job summaries: %v", err)
	}

	// Start replication of ACLs and Policies if they are enabled,
	// and we are not the authoritative region.
	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
		go s.replicateACLPolicies(stopCh)
		go s.replicateACLTokens(stopCh)
	}

	// Setup any enterprise systems required.
	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
		return err
	}

	return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored
// anytime a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
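// Accessors tied to terminal allocations or nodes are collected and then
// revoked in one batch through the Vault client.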
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if any periodic jobs should have
// been launched during the leadership transition and force runs them. The
// periodic dispatcher is maintained only by the leader, so it must be restored
// anytime a leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)

		// We skip adding parameterized jobs because they themselves aren't
		// tracked, only the dispatched children are.
		if job.IsParameterized() {
			continue
		}

		if err := s.periodicDispatcher.Add(job); err != nil {
			return err
		}

		// We do not need to force run the job since it isn't active.
		if !job.IsPeriodicActive() {
			continue
		}

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}
		if launch == nil {
			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
				job.ID, job.Namespace)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()
	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
	defer deploymentGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true if
	// the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {
		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-deploymentGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   "-",
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		LeaderACL:   s.getLeaderAcl(),
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			updateEval := eval.Copy()
			updateEval.Status = structs.EvalStatusFailed
			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)

			// Create a follow-up evaluation that will be used to retry the
			// scheduling for the job after the cluster is hopefully more stable
			// due to the fairly large backoff.
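			// The wait below is the configured baseline delay plus random jitter
			// drawn from the follow-up delay range, so retried evaluations do not
			// all re-enter the scheduler at the same time.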
			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{updateEval, followupEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap and cancel duplicate blocked
// evaluations.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// publishJobSummaryMetrics publishes the job summaries as metrics
func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.JobSummaries(ws)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
				continue
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				summary := raw.(*structs.JobSummary)
				for name, tgSummary := range summary.Summary {
					if !s.config.DisableTaggedMetrics {
						labels := []metrics.Label{
							{
								Name:  "job",
								Value: summary.JobID,
							},
							{
								Name:  "task_group",
								Value: name,
							},
						}
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
							float32(tgSummary.Queued), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
							float32(tgSummary.Complete), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
							float32(tgSummary.Failed), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
							float32(tgSummary.Running), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
							float32(tgSummary.Starting), labels)
						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
							float32(tgSummary.Lost), labels)
					}
					if s.config.BackwardsCompatibleMetrics {
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
					}
				}
			}
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Clear the leader token since we are no longer the leader.
	s.setLeaderAcl("")

	// Disable autopilot
	s.autopilot.Stop()

	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Disable the deployment watcher as it is only useful as a leader.
	if err := s.deploymentWatcher.SetEnabled(false, nil); err != nil {
		return err
	}

	// Disable any enterprise systems required.
	if err := s.revokeEnterpriseLeadership(); err != nil {
		return err
	}

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our workers if we paused them previously
	if len(s.workers) > 1 {
		for i := 0; i < len(s.workers)/2; i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
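// Reconciliation adds or removes Raft peers so that the peer set matches the
// current Serf membership for this region.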
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourself
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
	index, err := s.fsm.state.LatestIndex()
	if err != nil {
		return fmt.Errorf("unable to read latest index: %v", err)
	}
	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

	args := &structs.GenericResponse{}
	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
	if _, _, err = s.raftApply(msg, args); err != nil {
		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
	}

	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Do not join ourselves
	if m.Name == s.config.NodeName {
		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
		return nil
	}

	// Check for possibility of multiple bootstrap nodes
	members := s.serf.Members()
	if parts.Bootstrap {
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			return nil
		}
	}

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries. If the address is the same but the ID changed, remove the
	// old server before adding the new one.
	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		// No-op if the raft version is too low
		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
			return nil
		}

		// If the address or ID matches an existing server, see if we need to remove the old one first
		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
			// Exit with no-op if this is being called on an existing server
			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
				return nil
			}
			future := s.raft.RemoveServer(server.ID, 0, 0)
			if server.Address == raft.ServerAddress(addr) {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
				}
				s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address)
			} else {
				if err := future.Error(); err != nil {
					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
				}
				s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID)
			}
		}
	}

	// Attempt to add as a peer
	switch {
	case minRaftProtocol >= 3:
		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	default:
		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
		if err := addFuture.Error(); err != nil {
			s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
			return err
		}
	}

	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}

	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
	if err != nil {
		return err
	}

	// Pick which remove API to use based on how the server was added.
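	// Servers tracked by ID (Raft protocol v2+) are removed with RemoveServer;
	// older servers are matched by address and removed with RemovePeer.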
	for _, server := range configFuture.Configuration().Servers {
		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
			s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID)
			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
					server.ID, err)
				return err
			}
			break
		} else if server.Address == raft.ServerAddress(addr) {
			// If not, use the old remove API
			s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address)
			future := s.raft.RemovePeer(raft.ServerAddress(addr))
			if err := future.Error(); err != nil {
				s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
					addr, err)
				return err
			}
			break
		}
	}

	return nil
}

// replicateACLPolicies is used to replicate ACL policies from
// the authoritative region to this region.
func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
	req := structs.ACLPolicyListRequest{
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of policies
			var resp structs.ACLPolicyListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListPolicies", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)

			// Delete policies that should not exist
			if len(delete) > 0 {
				args := &structs.ACLPolicyDeleteRequest{
					Names: delete,
				}
				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated policies
			var fetched []*structs.ACLPolicy
			if len(update) > 0 {
				req := structs.ACLPolicySetRequest{
					Names: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLPolicySetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetPolicies", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, policy := range reply.Policies {
					fetched = append(fetched, policy)
				}
			}

			// Update local policies
			if len(fetched) > 0 {
				args := &structs.ACLPolicyUpsertRequest{
					Policies: fetched,
				}
				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index, blocks until there
			// is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLPolicies is used to perform a two-way diff between the local
// policies and the remote policies to determine which policies need to
// be deleted or updated.
func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
	// Construct a set of the local and remote policies
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local policies
	iter, err := state.ACLPolicies(nil)
	if err != nil {
		panic("failed to iterate local policies")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		policy := raw.(*structs.ACLPolicy)
		local[policy.Name] = policy.Hash
	}

	// Iterate over the remote policies
	for _, rp := range remoteList {
		remote[rp.Name] = struct{}{}

		// Check if the policy is missing locally
		if localHash, ok := local[rp.Name]; !ok {
			update = append(update, rp.Name)

			// Check if policy is newer remotely and there is a hash mis-match.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.Name)
		}
	}

	// Check if policy should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// replicateACLTokens is used to replicate global ACL tokens from
// the authoritative region to this region.
func (s *Server) replicateACLTokens(stopCh chan struct{}) {
	req := structs.ACLTokenListRequest{
		GlobalOnly: true,
		QueryOptions: structs.QueryOptions{
			Region:     s.config.AuthoritativeRegion,
			AllowStale: true,
		},
	}
	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
	s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)

START:
	for {
		select {
		case <-stopCh:
			return
		default:
			// Rate limit how often we attempt replication
			limiter.Wait(context.Background())

			// Fetch the list of tokens
			var resp structs.ACLTokenListResponse
			req.AuthToken = s.ReplicationToken()
			err := s.forwardRegion(s.config.AuthoritativeRegion,
				"ACL.ListTokens", &req, &resp)
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
				goto ERR_WAIT
			}

			// Perform a two-way diff
			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)

			// Delete tokens that should not exist
			if len(delete) > 0 {
				args := &structs.ACLTokenDeleteRequest{
					AccessorIDs: delete,
				}
				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Fetch any outdated tokens.
			var fetched []*structs.ACLToken
			if len(update) > 0 {
				req := structs.ACLTokenSetRequest{
					AccessorIDS: update,
					QueryOptions: structs.QueryOptions{
						Region:        s.config.AuthoritativeRegion,
						AuthToken:     s.ReplicationToken(),
						AllowStale:    true,
						MinQueryIndex: resp.Index - 1,
					},
				}
				var reply structs.ACLTokenSetResponse
				if err := s.forwardRegion(s.config.AuthoritativeRegion,
					"ACL.GetTokens", &req, &reply); err != nil {
					s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
					goto ERR_WAIT
				}
				for _, token := range reply.Tokens {
					fetched = append(fetched, token)
				}
			}

			// Update local tokens
			if len(fetched) > 0 {
				args := &structs.ACLTokenUpsertRequest{
					Tokens: fetched,
				}
				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
				if err != nil {
					s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
					goto ERR_WAIT
				}
			}

			// Update the minimum query index, blocks until there
			// is a change.
			req.MinQueryIndex = resp.Index
		}
	}

ERR_WAIT:
	select {
	case <-time.After(s.config.ReplicationBackoff):
		goto START
	case <-stopCh:
		return
	}
}

// diffACLTokens is used to perform a two-way diff between the local
// tokens and the remote tokens to determine which tokens need to
// be deleted or updated.
func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
	// Construct a set of the local and remote tokens
	local := make(map[string][]byte)
	remote := make(map[string]struct{})

	// Add all the local global tokens
	iter, err := state.ACLTokensByGlobal(nil, true)
	if err != nil {
		panic("failed to iterate local tokens")
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		token := raw.(*structs.ACLToken)
		local[token.AccessorID] = token.Hash
	}

	// Iterate over the remote tokens
	for _, rp := range remoteList {
		remote[rp.AccessorID] = struct{}{}

		// Check if the token is missing locally
		if localHash, ok := local[rp.AccessorID]; !ok {
			update = append(update, rp.AccessorID)

			// Check if the token is newer remotely and there is a hash mis-match.
		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
			update = append(update, rp.AccessorID)
		}
	}

	// Check if local token should be deleted
	for lp := range local {
		if _, ok := remote[lp]; !ok {
			delete = append(delete, lp)
		}
	}
	return
}

// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig {
	state := s.fsm.State()
	_, config, err := state.AutopilotConfig()
	if err != nil {
		s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
		return nil
	}
	if config != nil {
		return config
	}

	if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
		s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
		return nil
	}

	config = s.config.AutopilotConfig
	req := structs.AutopilotSetConfigRequest{Config: *config}
	if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
		s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
		return nil
	}

	return config
}