github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/leader.go

     1  package nomad
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"math/rand"
     9  	"net"
    10  	"sync"
    11  	"time"
    12  
    13  	"golang.org/x/time/rate"
    14  
    15  	"github.com/armon/go-metrics"
    16  	memdb "github.com/hashicorp/go-memdb"
    17  	"github.com/hashicorp/nomad/helper/uuid"
    18  	"github.com/hashicorp/nomad/nomad/state"
    19  	"github.com/hashicorp/nomad/nomad/structs"
    20  	"github.com/hashicorp/raft"
    21  	"github.com/hashicorp/serf/serf"
    22  )
    23  
    24  const (
    25  	// failedEvalUnblockInterval is the interval at which failed evaluations are
    26  	// unblocked to re-enter the scheduler. A failed evaluation occurs under
    27  	// high contention when the scheduler's plan does not make progress.
    28  	failedEvalUnblockInterval = 1 * time.Minute
    29  
    30  	// replicationRateLimit is used to rate limit how often data is replicated
    31  	// between the authoritative region and the local region
    32  	replicationRateLimit rate.Limit = 10.0
    33  
    34  	// barrierWriteTimeout is used to give Raft a chance to process a
    35  	// possible loss of leadership event if we are unable to get a barrier
    36  	// while leader.
    37  	barrierWriteTimeout = 2 * time.Minute
    38  )
    39  
    40  // monitorLeadership is used to monitor if we acquire or lose our role
    41  // as the leader in the Raft cluster. There is some work the leader is
    42  // expected to do, so we must react to changes
    43  func (s *Server) monitorLeadership() {
    44  	var weAreLeaderCh chan struct{}
    45  	var leaderLoop sync.WaitGroup
    46  	for {
    47  		select {
    48  		case isLeader := <-s.leaderCh:
    49  			switch {
    50  			case isLeader:
    51  				if weAreLeaderCh != nil {
    52  					s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running")
    53  					continue
    54  				}
    55  
    56  				weAreLeaderCh = make(chan struct{})
    57  				leaderLoop.Add(1)
    58  				go func(ch chan struct{}) {
    59  					defer leaderLoop.Done()
    60  					s.leaderLoop(ch)
    61  				}(weAreLeaderCh)
    62  				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
    63  
    64  			default:
    65  				if weAreLeaderCh == nil {
    66  					s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running")
    67  					continue
    68  				}
    69  
    70  				s.logger.Printf("[DEBUG] nomad: shutting down leader loop")
    71  				close(weAreLeaderCh)
    72  				leaderLoop.Wait()
    73  				weAreLeaderCh = nil
    74  				s.logger.Printf("[INFO] nomad: cluster leadership lost")
    75  			}
    76  
    77  		case <-s.shutdownCh:
    78  			return
    79  		}
    80  	}
    81  }
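        // The leader loop runs in its own goroutine and is tracked by a
        // WaitGroup, so when leadership is lost the handler above blocks on
        // leaderLoop.Wait() until the loop, including its deferred
        // revokeLeadership, has fully finished before a new loop can start.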
    82  
    83  // leaderLoop runs as long as we are the leader to perform various
    84  // maintenance activities
    85  func (s *Server) leaderLoop(stopCh chan struct{}) {
    86  	var reconcileCh chan serf.Member
    87  	establishedLeader := false
    88  
    89  RECONCILE:
    90  	// Setup a reconciliation timer
    91  	reconcileCh = nil
    92  	interval := time.After(s.config.ReconcileInterval)
    93  
    94  	// Apply a raft barrier to ensure our FSM is caught up
    95  	start := time.Now()
    96  	barrier := s.raft.Barrier(barrierWriteTimeout)
    97  	if err := barrier.Error(); err != nil {
    98  		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
    99  		goto WAIT
   100  	}
   101  	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
   102  
   103  	// Check if we need to handle initial leadership actions
   104  	if !establishedLeader {
   105  		if err := s.establishLeadership(stopCh); err != nil {
   106  			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err)
   107  			goto WAIT
   108  		}
   109  		establishedLeader = true
   110  		defer func() {
   111  			if err := s.revokeLeadership(); err != nil {
   112  				s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err)
   113  			}
   114  		}()
   115  	}
   116  
   117  	// Reconcile any missing data
   118  	if err := s.reconcile(); err != nil {
   119  		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
   120  		goto WAIT
   121  	}
   122  
   123  	// Initial reconcile worked, now we can process the channel
   124  	// updates
   125  	reconcileCh = s.reconcileCh
   126  
   127  	// Poll the stop channel to give it priority so we don't waste time
   128  	// trying to perform the other operations if we have been asked to shut
   129  	// down.
   130  	select {
   131  	case <-stopCh:
   132  		return
   133  	default:
   134  	}
   135  
   136  WAIT:
   137  	// Wait until leadership is lost
   138  	for {
   139  		select {
   140  		case <-stopCh:
   141  			return
   142  		case <-s.shutdownCh:
   143  			return
   144  		case <-interval:
   145  			goto RECONCILE
   146  		case member := <-reconcileCh:
   147  			s.reconcileMember(member)
   148  		}
   149  	}
   150  }
   151  
   152  // establishLeadership is invoked once we become leader and are able
   153  // to invoke an initial barrier. The barrier is used to ensure any
   154  // previously inflight transactions have been committed and that our
   155  // state is up-to-date.
   156  func (s *Server) establishLeadership(stopCh chan struct{}) error {
   157  	// Generate a leader ACL token. This will allow the leader to issue work
   158  	// that requires a valid ACL token.
   159  	s.setLeaderAcl(uuid.Generate())
   160  
   161  	// Pause most of the scheduling workers to free cores for the plan
   162  	// queue and evaluation broker
   163  	if numWorkers := len(s.workers); numWorkers > 1 {
   164  		// Disabling 3/4 of the workers frees CPU for raft and the
   165  		// plan applier which uses 1/2 the cores.
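        		// For example, with eight workers this pauses workers[0] through
        		// workers[5], leaving two scheduling workers active on the leader.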
   166  		for i := 0; i < (3 * numWorkers / 4); i++ {
   167  			s.workers[i].SetPause(true)
   168  		}
   169  	}
   170  
   171  	// Enable the plan queue, since we are now the leader
   172  	s.planQueue.SetEnabled(true)
   173  
   174  	// Start the plan evaluator
   175  	go s.planApply()
   176  
   177  	// Enable the eval broker, since we are now the leader
   178  	s.evalBroker.SetEnabled(true)
   179  
   180  	// Enable the blocked eval tracker, since we are now the leader
   181  	s.blockedEvals.SetEnabled(true)
   182  	s.blockedEvals.SetTimetable(s.fsm.TimeTable())
   183  
   184  	// Enable the deployment watcher, since we are now the leader
   185  	if err := s.deploymentWatcher.SetEnabled(true, s.State()); err != nil {
   186  		return err
   187  	}
   188  
   189  	// Restore the eval broker state
   190  	if err := s.restoreEvals(); err != nil {
   191  		return err
   192  	}
   193  
   194  	// Activate the vault client
   195  	s.vault.SetActive(true)
   196  	if err := s.restoreRevokingAccessors(); err != nil {
   197  		return err
   198  	}
   199  
   200  	// Enable the periodic dispatcher, since we are now the leader.
   201  	s.periodicDispatcher.SetEnabled(true)
   202  
   203  	// Restore the periodic dispatcher state
   204  	if err := s.restorePeriodicDispatcher(); err != nil {
   205  		return err
   206  	}
   207  
   208  	// Schedule periodic jobs
   209  	go s.schedulePeriodic(stopCh)
   210  
   211  	// Reap any failed evaluations
   212  	go s.reapFailedEvaluations(stopCh)
   213  
   214  	// Reap any duplicate blocked evaluations
   215  	go s.reapDupBlockedEvaluations(stopCh)
   216  
   217  	// Periodically unblock failed evaluations
   218  	go s.periodicUnblockFailedEvals(stopCh)
   219  
   220  	// Periodically publish job summary metrics
   221  	go s.publishJobSummaryMetrics(stopCh)
   222  
   223  	// Setup the heartbeat timers. This is done both when starting up and when
   224  	// a leader failover happens. Since the timers are maintained by the leader
   225  	// node, effectively this means all the timers are renewed at the time of failover.
   226  	// The TTL contract is that a node's heartbeat will not be expired before the
   227  	// TTL, so expiring it later is allowable.
   228  	//
   229  	// This MUST be done after the initial barrier to ensure the latest Nodes
   230  	// are available to be initialized. Otherwise initialization may use stale
   231  	// data.
   232  	if err := s.initializeHeartbeatTimers(); err != nil {
   233  		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
   234  		return err
   235  	}
   236  
   237  	// COMPAT 0.4 - 0.4.1
   238  	// Reconcile the summaries of the registered jobs. We only need to do
   239  	// this when upgrading from 0.4 to 0.4.1: summaries do not exist in 0.4,
   240  	// so after the upgrade they may be missing or incorrect and must be
   241  	// rebuilt.
   242  	if err := s.reconcileJobSummaries(); err != nil {
   243  		return fmt.Errorf("unable to reconcile job summaries: %v", err)
   244  	}
   245  
   246  	// Start replication of ACLs and Policies if they are enabled,
   247  	// and we are not the authoritative region.
   248  	if s.config.ACLEnabled && s.config.Region != s.config.AuthoritativeRegion {
   249  		go s.replicateACLPolicies(stopCh)
   250  		go s.replicateACLTokens(stopCh)
   251  	}
   252  
   253  	// Setup any enterprise systems required.
   254  	if err := s.establishEnterpriseLeadership(stopCh); err != nil {
   255  		return err
   256  	}
   257  
   258  	return nil
   259  }
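        // Everything started in establishLeadership watches stopCh, which
        // monitorLeadership closes once leadership is lost; revokeLeadership,
        // deferred in leaderLoop, then disables the leader-only subsystems
        // enabled above.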
   260  
   261  // restoreEvals is used to restore pending evaluations into the eval broker and
   262  // blocked evaluations into the blocked eval tracker. The broker and blocked
   263  // eval tracker are maintained only by the leader, so they must be restored
   264  // any time a leadership transition takes place.
   265  func (s *Server) restoreEvals() error {
   266  	// Get an iterator over every evaluation
   267  	ws := memdb.NewWatchSet()
   268  	iter, err := s.fsm.State().Evals(ws)
   269  	if err != nil {
   270  		return fmt.Errorf("failed to get evaluations: %v", err)
   271  	}
   272  
   273  	for {
   274  		raw := iter.Next()
   275  		if raw == nil {
   276  			break
   277  		}
   278  		eval := raw.(*structs.Evaluation)
   279  
   280  		if eval.ShouldEnqueue() {
   281  			s.evalBroker.Enqueue(eval)
   282  		} else if eval.ShouldBlock() {
   283  			s.blockedEvals.Block(eval)
   284  		}
   285  	}
   286  	return nil
   287  }
   288  
   289  // restoreRevokingAccessors is used to restore Vault accessors that should be
   290  // revoked.
   291  func (s *Server) restoreRevokingAccessors() error {
   292  	// An accessor should be revoked if its allocation or node is terminal
   293  	ws := memdb.NewWatchSet()
   294  	state := s.fsm.State()
   295  	iter, err := state.VaultAccessors(ws)
   296  	if err != nil {
   297  		return fmt.Errorf("failed to get vault accessors: %v", err)
   298  	}
   299  
   300  	var revoke []*structs.VaultAccessor
   301  	for {
   302  		raw := iter.Next()
   303  		if raw == nil {
   304  			break
   305  		}
   306  
   307  		va := raw.(*structs.VaultAccessor)
   308  
   309  		// Check the allocation
   310  		alloc, err := state.AllocByID(ws, va.AllocID)
   311  		if err != nil {
   312  			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
   313  		}
   314  		if alloc == nil || alloc.Terminated() {
   315  			// No longer running and should be revoked
   316  			revoke = append(revoke, va)
   317  			continue
   318  		}
   319  
   320  		// Check the node
   321  		node, err := state.NodeByID(ws, va.NodeID)
   322  		if err != nil {
   323  			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
   324  		}
   325  		if node == nil || node.TerminalStatus() {
   326  			// Node is terminal so any accessor from it should be revoked
   327  			revoke = append(revoke, va)
   328  			continue
   329  		}
   330  	}
   331  
   332  	if len(revoke) != 0 {
   333  		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
   334  			return fmt.Errorf("failed to revoke tokens: %v", err)
   335  		}
   336  	}
   337  
   338  	return nil
   339  }
   340  
   341  // restorePeriodicDispatcher is used to restore all periodic jobs into the
   342  // periodic dispatcher. It also determines which periodic jobs should have
   343  // launched during the leadership transition and force runs them. The periodic
   344  // dispatcher is maintained only by the leader, so it must be restored any time
   345  // a leadership transition takes place.
   346  func (s *Server) restorePeriodicDispatcher() error {
   347  	ws := memdb.NewWatchSet()
   348  	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
   349  	if err != nil {
   350  		return fmt.Errorf("failed to get periodic jobs: %v", err)
   351  	}
   352  
   353  	now := time.Now()
   354  	for i := iter.Next(); i != nil; i = iter.Next() {
   355  		job := i.(*structs.Job)
   356  
   357  		// We skip adding parameterized jobs because they themselves aren't
   358  		// tracked, only the dispatched children are.
   359  		if job.IsParameterized() {
   360  			continue
   361  		}
   362  
   363  		if err := s.periodicDispatcher.Add(job); err != nil {
   364  			return err
   365  		}
   366  
   367  		// We do not need to force run the job since it isn't active.
   368  		if !job.IsPeriodicActive() {
   369  			continue
   370  		}
   371  
   372  		// If the periodic job has never been launched before, launch will hold
   373  		// the time the periodic job was added. Otherwise it has the last launch
   374  		// time of the periodic job.
   375  		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
   376  		if err != nil {
   377  			return fmt.Errorf("failed to get periodic launch time: %v", err)
   378  		}
   379  		if launch == nil {
   380  			return fmt.Errorf("no recorded periodic launch time for job %q in namespace %q",
   381  				job.ID, job.Namespace)
   382  		}
   383  
   384  		// nextLaunch is the next launch that should occur.
   385  		nextLaunch := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation()))
   386  
   387  		// We skip force launching the job if there should be no next launch
   388  		// (the zero case) or if the next launch time is in the future. If it is
   389  		// in the future, it will be handled by the periodic dispatcher.
   390  		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
   391  			continue
   392  		}
   393  
   394  		if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil {
   395  			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
   396  			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
   397  			return errors.New(msg)
   398  		}
   399  		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
   400  			" run during leadership establishment", job.ID)
   401  	}
   402  
   403  	return nil
   404  }
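        // As an illustration of the force-run check above: if a job runs every
        // hour and its last recorded launch was 90 minutes before this leadership
        // transition, nextLaunch lands roughly 30 minutes in the past, so the job
        // is force run once here and later launches are left to the dispatcher.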
   405  
   406  // schedulePeriodic is used to do periodic job dispatch while we are leader
   407  func (s *Server) schedulePeriodic(stopCh chan struct{}) {
   408  	evalGC := time.NewTicker(s.config.EvalGCInterval)
   409  	defer evalGC.Stop()
   410  	nodeGC := time.NewTicker(s.config.NodeGCInterval)
   411  	defer nodeGC.Stop()
   412  	jobGC := time.NewTicker(s.config.JobGCInterval)
   413  	defer jobGC.Stop()
   414  	deploymentGC := time.NewTicker(s.config.DeploymentGCInterval)
   415  	defer deploymentGC.Stop()
   416  
   417  	// getLatest grabs the latest index from the state store. It returns true if
   418  	// the index was retrieved successfully.
   419  	getLatest := func() (uint64, bool) {
   420  		snapshotIndex, err := s.fsm.State().LatestIndex()
   421  		if err != nil {
   422  			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
   423  			return 0, false
   424  		}
   425  
   426  		return snapshotIndex, true
   427  	}
   428  
   429  	for {
   430  
   431  		select {
   432  		case <-evalGC.C:
   433  			if index, ok := getLatest(); ok {
   434  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
   435  			}
   436  		case <-nodeGC.C:
   437  			if index, ok := getLatest(); ok {
   438  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
   439  			}
   440  		case <-jobGC.C:
   441  			if index, ok := getLatest(); ok {
   442  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
   443  			}
   444  		case <-deploymentGC.C:
   445  			if index, ok := getLatest(); ok {
   446  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobDeploymentGC, index))
   447  			}
   448  		case <-stopCh:
   449  			return
   450  		}
   451  	}
   452  }
   453  
   454  // coreJobEval returns an evaluation for a core job
   455  func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
   456  	return &structs.Evaluation{
   457  		ID:          uuid.Generate(),
   458  		Namespace:   "-",
   459  		Priority:    structs.CoreJobPriority,
   460  		Type:        structs.JobTypeCore,
   461  		TriggeredBy: structs.EvalTriggerScheduled,
   462  		JobID:       job,
   463  		LeaderACL:   s.getLeaderAcl(),
   464  		Status:      structs.EvalStatusPending,
   465  		ModifyIndex: modifyIndex,
   466  	}
   467  }
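        // Core job evaluations are enqueued directly into the leader's eval
        // broker rather than being written through Raft, for example:
        //
        //	s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
        //
        // as done in schedulePeriodic above.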
   468  
   469  // reapFailedEvaluations is used to reap evaluations that
   470  // have reached their delivery limit and should be failed
   471  func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
   472  	for {
   473  		select {
   474  		case <-stopCh:
   475  			return
   476  		default:
   477  			// Scan for a failed evaluation
   478  			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
   479  			if err != nil {
   480  				return
   481  			}
   482  			if eval == nil {
   483  				continue
   484  			}
   485  
   486  			// Update the status to failed
   487  			updateEval := eval.Copy()
   488  			updateEval.Status = structs.EvalStatusFailed
   489  			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
   490  			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval)
   491  
   492  			// Create a follow-up evaluation that will be used to retry the
   493  			// scheduling for the job after the cluster is hopefully more stable
   494  			// due to the fairly large backoff.
   495  			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
   496  				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
   497  			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
   498  
   499  			// Update via Raft
   500  			req := structs.EvalUpdateRequest{
   501  				Evals: []*structs.Evaluation{updateEval, followupEval},
   502  			}
   503  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   504  				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err)
   505  				continue
   506  			}
   507  
   508  			// Ack completion
   509  			s.evalBroker.Ack(eval.ID, token)
   510  		}
   511  	}
   512  }
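        // The follow-up evaluation above waits EvalFailedFollowupBaselineDelay
        // plus a random delay of up to EvalFailedFollowupDelayRange; with a
        // hypothetical 1 minute baseline and 5 minute range, for instance, the
        // follow-up becomes eligible between 1 and 6 minutes later.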
   513  
   514  // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations,
   515  // cancelling them since a blocked evaluation already exists for the same job.
   516  func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
   517  	for {
   518  		select {
   519  		case <-stopCh:
   520  			return
   521  		default:
   522  			// Scan for duplicate blocked evals.
   523  			dups := s.blockedEvals.GetDuplicates(time.Second)
   524  			if dups == nil {
   525  				continue
   526  			}
   527  
   528  			cancel := make([]*structs.Evaluation, len(dups))
   529  			for i, dup := range dups {
   530  				// Update the status to cancelled
   531  				newEval := dup.Copy()
   532  				newEval.Status = structs.EvalStatusCancelled
   533  				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
   534  				cancel[i] = newEval
   535  			}
   536  
   537  			// Update via Raft
   538  			req := structs.EvalUpdateRequest{
   539  				Evals: cancel,
   540  			}
   541  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   542  				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
   543  				continue
   544  			}
   545  		}
   546  	}
   547  }
   548  
   549  // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
   550  func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
   551  	ticker := time.NewTicker(failedEvalUnblockInterval)
   552  	defer ticker.Stop()
   553  	for {
   554  		select {
   555  		case <-stopCh:
   556  			return
   557  		case <-ticker.C:
   558  			// Unblock the failed evaluations
   559  			s.blockedEvals.UnblockFailed()
   560  		}
   561  	}
   562  }
   563  
   564  // publishJobSummaryMetrics publishes the job summaries as metrics
   565  func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
   566  	timer := time.NewTimer(0)
   567  	defer timer.Stop()
   568  
   569  	for {
   570  		select {
   571  		case <-stopCh:
   572  			return
   573  		case <-timer.C:
   574  			timer.Reset(s.config.StatsCollectionInterval)
   575  			state, err := s.State().Snapshot()
   576  			if err != nil {
   577  				s.logger.Printf("[ERR] nomad: failed to get state: %v", err)
   578  				continue
   579  			}
   580  			ws := memdb.NewWatchSet()
   581  			iter, err := state.JobSummaries(ws)
   582  			if err != nil {
   583  				s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err)
   584  				continue
   585  			}
   586  
   587  			for {
   588  				raw := iter.Next()
   589  				if raw == nil {
   590  					break
   591  				}
   592  				summary := raw.(*structs.JobSummary)
   593  				for name, tgSummary := range summary.Summary {
   594  					if !s.config.DisableTaggedMetrics {
   595  						labels := []metrics.Label{
   596  							{
   597  								Name:  "job",
   598  								Value: summary.JobID,
   599  							},
   600  							{
   601  								Name:  "task_group",
   602  								Value: name,
   603  							},
   604  						}
   605  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "queued"},
   606  							float32(tgSummary.Queued), labels)
   607  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "complete"},
   608  							float32(tgSummary.Complete), labels)
   609  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "failed"},
   610  							float32(tgSummary.Failed), labels)
   611  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "running"},
   612  							float32(tgSummary.Running), labels)
   613  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "starting"},
   614  							float32(tgSummary.Starting), labels)
   615  						metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
   616  							float32(tgSummary.Lost), labels)
   617  					}
   618  					if s.config.BackwardsCompatibleMetrics {
   619  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "queued"}, float32(tgSummary.Queued))
   620  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "complete"}, float32(tgSummary.Complete))
   621  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "failed"}, float32(tgSummary.Failed))
   622  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "running"}, float32(tgSummary.Running))
   623  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "starting"}, float32(tgSummary.Starting))
   624  						metrics.SetGauge([]string{"nomad", "job_summary", summary.JobID, name, "lost"}, float32(tgSummary.Lost))
   625  					}
   626  				}
   627  			}
   628  		}
   629  	}
   630  }
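        // The loop above emits each task group summary as tagged gauges
        // (nomad.job_summary.queued and friends, labeled with job and
        // task_group) unless DisableTaggedMetrics is set, and as flattened
        // gauges of the form nomad.job_summary.<job>.<task_group>.queued when
        // BackwardsCompatibleMetrics is set.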
   631  
   632  // revokeLeadership is invoked once we step down as leader.
   633  // This is used to cleanup any state that may be specific to a leader.
   634  func (s *Server) revokeLeadership() error {
   635  	// Clear the leader token since we are no longer the leader.
   636  	s.setLeaderAcl("")
   637  
   638  	// Disable the plan queue, since we are no longer leader
   639  	s.planQueue.SetEnabled(false)
   640  
   641  	// Disable the eval broker, since it is only useful as a leader
   642  	s.evalBroker.SetEnabled(false)
   643  
   644  	// Disable the blocked eval tracker, since it is only useful as a leader
   645  	s.blockedEvals.SetEnabled(false)
   646  
   647  	// Disable the periodic dispatcher, since it is only useful as a leader
   648  	s.periodicDispatcher.SetEnabled(false)
   649  
   650  	// Disable the Vault client as it is only useful as a leader.
   651  	s.vault.SetActive(false)
   652  
   653  	// Disable the deployment watcher as it is only useful as a leader.
   654  	if err := s.deploymentWatcher.SetEnabled(false, nil); err != nil {
   655  		return err
   656  	}
   657  
   658  	// Disable any enterprise systems required.
   659  	if err := s.revokeEnterpriseLeadership(); err != nil {
   660  		return err
   661  	}
   662  
   663  	// Clear the heartbeat timers on either shutdown or step down,
   664  	// since we are no longer responsible for TTL expirations.
   665  	if err := s.clearAllHeartbeatTimers(); err != nil {
   666  		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
   667  		return err
   668  	}
   669  
   670  	// Unpause the workers that were paused in establishLeadership
   671  	if len(s.workers) > 1 {
   672  		for i := 0; i < 3*len(s.workers)/4; i++ {
   673  			s.workers[i].SetPause(false)
   674  		}
   675  	}
   676  	return nil
   677  }
   678  
   679  // reconcile is used to reconcile the differences between Serf
   680  // membership and what is reflected in our strongly consistent store.
   681  func (s *Server) reconcile() error {
   682  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
   683  	members := s.serf.Members()
   684  	for _, member := range members {
   685  		if err := s.reconcileMember(member); err != nil {
   686  			return err
   687  		}
   688  	}
   689  	return nil
   690  }
   691  
   692  // reconcileMember is used to reconcile a single serf member
   693  func (s *Server) reconcileMember(member serf.Member) error {
   694  	// Check if this is a member we should handle
   695  	valid, parts := isNomadServer(member)
   696  	if !valid || parts.Region != s.config.Region {
   697  		return nil
   698  	}
   699  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
   700  
   701  	// Do not reconcile ourself
   702  	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
   703  		return nil
   704  	}
   705  
   706  	var err error
   707  	switch member.Status {
   708  	case serf.StatusAlive:
   709  		err = s.addRaftPeer(member, parts)
   710  	case serf.StatusLeft, StatusReap:
   711  		err = s.removeRaftPeer(member, parts)
   712  	}
   713  	if err != nil {
   714  		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
   715  			member, err)
   716  		return err
   717  	}
   718  	return nil
   719  }
   720  
   721  // reconcileJobSummaries reconciles the summaries of all the jobs registered in
   722  // the system
   723  // COMPAT 0.4 -> 0.4.1
   724  func (s *Server) reconcileJobSummaries() error {
   725  	index, err := s.fsm.state.LatestIndex()
   726  	if err != nil {
   727  		return fmt.Errorf("unable to read latest index: %v", err)
   728  	}
   729  	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)
   730  
   731  	args := &structs.GenericResponse{}
   732  	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
   733  	if _, _, err = s.raftApply(msg, args); err != nil {
   734  		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
   735  	}
   736  
   737  	return nil
   738  }
   739  
   740  // addRaftPeer is used to add a new Raft peer when a Nomad server joins
   741  func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
   742  	// Do not join ourselves
   743  	if m.Name == s.config.NodeName {
   744  		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
   745  		return nil
   746  	}
   747  
   748  	// Check for possibility of multiple bootstrap nodes
   749  	if parts.Bootstrap {
   750  		members := s.serf.Members()
   751  		for _, member := range members {
   752  			valid, p := isNomadServer(member)
   753  			if valid && member.Name != m.Name && p.Bootstrap {
   754  				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
   755  				return nil
   756  			}
   757  		}
   758  	}
   759  
   760  	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
   761  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
   762  
   763  	// See if it's already in the configuration. It's harmless to re-add it
   764  	// but we want to avoid doing that if possible to prevent useless Raft
   765  	// log entries.
   766  	configFuture := s.raft.GetConfiguration()
   767  	if err := configFuture.Error(); err != nil {
   768  		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
   769  		return err
   770  	}
   771  	for _, server := range configFuture.Configuration().Servers {
   772  		if server.Address == raft.ServerAddress(addr) {
   773  			return nil
   774  		}
   775  	}
   776  
   777  	// Attempt to add as a peer
   778  	addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
   779  	if err := addFuture.Error(); err != nil {
   780  		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
   781  		return err
   782  	} else if err == nil {
   783  	}
   784  
   785  	s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
   786  }
   787  
   788  // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
   789  // or is reaped
   790  func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
   791  	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
   792  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
   793  
   794  	// See if it's already in the configuration. It's harmless to re-remove it
   795  	// but we want to avoid doing that if possible to prevent useless Raft
   796  	// log entries.
   797  	configFuture := s.raft.GetConfiguration()
   798  	if err := configFuture.Error(); err != nil {
   799  		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
   800  		return err
   801  	}
   802  	for _, server := range configFuture.Configuration().Servers {
   803  		if server.Address == raft.ServerAddress(addr) {
   804  			goto REMOVE
   805  		}
   806  	}
   807  	return nil
   808  
   809  REMOVE:
   810  	// Attempt to remove as a peer.
   811  	future := s.raft.RemovePeer(raft.ServerAddress(addr))
   812  	if err := future.Error(); err != nil {
   813  		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
   814  			parts, err)
   815  		return err
   816  	}
   817  	return nil
   818  }
   819  
   820  // replicateACLPolicies is used to replicate ACL policies from
   821  // the authoritative region to this region.
   822  func (s *Server) replicateACLPolicies(stopCh chan struct{}) {
   823  	req := structs.ACLPolicyListRequest{
   824  		QueryOptions: structs.QueryOptions{
   825  			Region:     s.config.AuthoritativeRegion,
   826  			AllowStale: true,
   827  		},
   828  	}
   829  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
   830  	s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region)
   831  
   832  START:
   833  	for {
   834  		select {
   835  		case <-stopCh:
   836  			return
   837  		default:
   838  			// Rate limit how often we attempt replication
   839  			limiter.Wait(context.Background())
   840  
   841  			// Fetch the list of policies
   842  			var resp structs.ACLPolicyListResponse
   843  			req.AuthToken = s.ReplicationToken()
   844  			err := s.forwardRegion(s.config.AuthoritativeRegion,
   845  				"ACL.ListPolicies", &req, &resp)
   846  			if err != nil {
   847  				s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
   848  				goto ERR_WAIT
   849  			}
   850  
   851  			// Perform a two-way diff
   852  			delete, update := diffACLPolicies(s.State(), req.MinQueryIndex, resp.Policies)
   853  
   854  			// Delete policies that should not exist
   855  			if len(delete) > 0 {
   856  				args := &structs.ACLPolicyDeleteRequest{
   857  					Names: delete,
   858  				}
   859  				_, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args)
   860  				if err != nil {
   861  					s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err)
   862  					goto ERR_WAIT
   863  				}
   864  			}
   865  
   866  			// Fetch any outdated policies
   867  			var fetched []*structs.ACLPolicy
   868  			if len(update) > 0 {
   869  				req := structs.ACLPolicySetRequest{
   870  					Names: update,
   871  					QueryOptions: structs.QueryOptions{
   872  						Region:        s.config.AuthoritativeRegion,
   873  						AuthToken:     s.ReplicationToken(),
   874  						AllowStale:    true,
   875  						MinQueryIndex: resp.Index - 1,
   876  					},
   877  				}
   878  				var reply structs.ACLPolicySetResponse
   879  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
   880  					"ACL.GetPolicies", &req, &reply); err != nil {
   881  					s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err)
   882  					goto ERR_WAIT
   883  				}
   884  				for _, policy := range reply.Policies {
   885  					fetched = append(fetched, policy)
   886  				}
   887  			}
   888  
   889  			// Update local policies
   890  			if len(fetched) > 0 {
   891  				args := &structs.ACLPolicyUpsertRequest{
   892  					Policies: fetched,
   893  				}
   894  				_, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args)
   895  				if err != nil {
   896  					s.logger.Printf("[ERR] nomad: failed to update policies: %v", err)
   897  					goto ERR_WAIT
   898  				}
   899  			}
   900  
   901  			// Update the minimum query index so the next list
   902  			// query blocks until there is a change.
   903  			req.MinQueryIndex = resp.Index
   904  		}
   905  	}
   906  
   907  ERR_WAIT:
   908  	select {
   909  	case <-time.After(s.config.ReplicationBackoff):
   910  		goto START
   911  	case <-stopCh:
   912  		return
   913  	}
   914  }
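        // Replication is paced in two ways: the limiter caps attempts at
        // replicationRateLimit per second, and carrying resp.Index forward as
        // MinQueryIndex makes each subsequent ACL.ListPolicies call a blocking
        // query, so the loop mostly sits idle until the authoritative region's
        // ACL state changes.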
   915  
   916  // diffACLPolicies is used to perform a two-way diff between the local
   917  // policies and the remote policies to determine which policies need to
   918  // be deleted or updated.
   919  func diffACLPolicies(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLPolicyListStub) (delete []string, update []string) {
   920  	// Construct a set of the local and remote policies
   921  	local := make(map[string][]byte)
   922  	remote := make(map[string]struct{})
   923  
   924  	// Add all the local policies
   925  	iter, err := state.ACLPolicies(nil)
   926  	if err != nil {
   927  		panic("failed to iterate local policies")
   928  	}
   929  	for {
   930  		raw := iter.Next()
   931  		if raw == nil {
   932  			break
   933  		}
   934  		policy := raw.(*structs.ACLPolicy)
   935  		local[policy.Name] = policy.Hash
   936  	}
   937  
   938  	// Iterate over the remote policies
   939  	for _, rp := range remoteList {
   940  		remote[rp.Name] = struct{}{}
   941  
   942  		// Check if the policy is missing locally
   943  		if localHash, ok := local[rp.Name]; !ok {
   944  			update = append(update, rp.Name)
   945  
   946  			// Check if the policy is newer remotely and there is a hash mismatch.
   947  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
   948  			update = append(update, rp.Name)
   949  		}
   950  	}
   951  
   952  	// Check if policy should be deleted
   953  	for lp := range local {
   954  		if _, ok := remote[lp]; !ok {
   955  			delete = append(delete, lp)
   956  		}
   957  	}
   958  	return
   959  }
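        // For example, with local policies {readonly, ops} and a remote list of
        // {ops, deploy} where the remote "ops" has a ModifyIndex above minIndex
        // and a different hash, the diff returns delete=["readonly"] and
        // update=["ops", "deploy"].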
   960  
   961  // replicateACLTokens is used to replicate global ACL tokens from
   962  // the authoritative region to this region.
   963  func (s *Server) replicateACLTokens(stopCh chan struct{}) {
   964  	req := structs.ACLTokenListRequest{
   965  		GlobalOnly: true,
   966  		QueryOptions: structs.QueryOptions{
   967  			Region:     s.config.AuthoritativeRegion,
   968  			AllowStale: true,
   969  		},
   970  	}
   971  	limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit))
   972  	s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region)
   973  
   974  START:
   975  	for {
   976  		select {
   977  		case <-stopCh:
   978  			return
   979  		default:
   980  			// Rate limit how often we attempt replication
   981  			limiter.Wait(context.Background())
   982  
   983  			// Fetch the list of tokens
   984  			var resp structs.ACLTokenListResponse
   985  			req.AuthToken = s.ReplicationToken()
   986  			err := s.forwardRegion(s.config.AuthoritativeRegion,
   987  				"ACL.ListTokens", &req, &resp)
   988  			if err != nil {
   989  				s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
   990  				goto ERR_WAIT
   991  			}
   992  
   993  			// Perform a two-way diff
   994  			delete, update := diffACLTokens(s.State(), req.MinQueryIndex, resp.Tokens)
   995  
   996  			// Delete tokens that should not exist
   997  			if len(delete) > 0 {
   998  				args := &structs.ACLTokenDeleteRequest{
   999  					AccessorIDs: delete,
  1000  				}
  1001  				_, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args)
  1002  				if err != nil {
  1003  					s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err)
  1004  					goto ERR_WAIT
  1005  				}
  1006  			}
  1007  
  1008  			// Fetch any outdated tokens.
  1009  			var fetched []*structs.ACLToken
  1010  			if len(update) > 0 {
  1011  				req := structs.ACLTokenSetRequest{
  1012  					AccessorIDS: update,
  1013  					QueryOptions: structs.QueryOptions{
  1014  						Region:        s.config.AuthoritativeRegion,
  1015  						AuthToken:     s.ReplicationToken(),
  1016  						AllowStale:    true,
  1017  						MinQueryIndex: resp.Index - 1,
  1018  					},
  1019  				}
  1020  				var reply structs.ACLTokenSetResponse
  1021  				if err := s.forwardRegion(s.config.AuthoritativeRegion,
  1022  					"ACL.GetTokens", &req, &reply); err != nil {
  1023  					s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err)
  1024  					goto ERR_WAIT
  1025  				}
  1026  				for _, token := range reply.Tokens {
  1027  					fetched = append(fetched, token)
  1028  				}
  1029  			}
  1030  
  1031  			// Update local tokens
  1032  			if len(fetched) > 0 {
  1033  				args := &structs.ACLTokenUpsertRequest{
  1034  					Tokens: fetched,
  1035  				}
  1036  				_, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args)
  1037  				if err != nil {
  1038  					s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err)
  1039  					goto ERR_WAIT
  1040  				}
  1041  			}
  1042  
  1043  			// Update the minimum query index so the next list
  1044  			// query blocks until there is a change.
  1045  			req.MinQueryIndex = resp.Index
  1046  		}
  1047  	}
  1048  
  1049  ERR_WAIT:
  1050  	select {
  1051  	case <-time.After(s.config.ReplicationBackoff):
  1052  		goto START
  1053  	case <-stopCh:
  1054  		return
  1055  	}
  1056  }
  1057  
  1058  // diffACLTokens is used to perform a two-way diff between the local
  1059  // tokens and the remote tokens to determine which tokens need to
  1060  // be deleted or updated.
  1061  func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*structs.ACLTokenListStub) (delete []string, update []string) {
  1062  	// Construct a set of the local and remote tokens
  1063  	local := make(map[string][]byte)
  1064  	remote := make(map[string]struct{})
  1065  
  1066  	// Add all the local global tokens
  1067  	iter, err := state.ACLTokensByGlobal(nil, true)
  1068  	if err != nil {
  1069  		panic("failed to iterate local tokens")
  1070  	}
  1071  	for {
  1072  		raw := iter.Next()
  1073  		if raw == nil {
  1074  			break
  1075  		}
  1076  		token := raw.(*structs.ACLToken)
  1077  		local[token.AccessorID] = token.Hash
  1078  	}
  1079  
  1080  	// Iterate over the remote tokens
  1081  	for _, rp := range remoteList {
  1082  		remote[rp.AccessorID] = struct{}{}
  1083  
  1084  		// Check if the token is missing locally
  1085  		if localHash, ok := local[rp.AccessorID]; !ok {
  1086  			update = append(update, rp.AccessorID)
  1087  
  1088  			// Check if the token is newer remotely and there is a hash mismatch.
  1089  		} else if rp.ModifyIndex > minIndex && !bytes.Equal(localHash, rp.Hash) {
  1090  			update = append(update, rp.AccessorID)
  1091  		}
  1092  	}
  1093  
  1094  	// Check if local token should be deleted
  1095  	for lp := range local {
  1096  		if _, ok := remote[lp]; !ok {
  1097  			delete = append(delete, lp)
  1098  		}
  1099  	}
  1100  	return
  1101  }