github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/nomad/leader.go

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/armon/go-metrics"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/raft"
    12  	"github.com/hashicorp/serf/serf"
    13  )
    14  
    15  const (
    16  	// failedEvalUnblockInterval is the interval at which failed evaluations are
    17  	// unblocked to re-enter the scheduler. A failed evaluation occurs under
     18  	// high contention when the scheduler's plan does not make progress.
    19  	failedEvalUnblockInterval = 1 * time.Minute
    20  )
    21  
    22  // monitorLeadership is used to monitor if we acquire or lose our role
    23  // as the leader in the Raft cluster. There is some work the leader is
     24  // expected to do, so we must react to changes.
    25  func (s *Server) monitorLeadership() {
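         	// stopCh is created each time we gain leadership and closed when we
         	// lose it, which terminates leaderLoop and the goroutines it started.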
    26  	var stopCh chan struct{}
    27  	for {
    28  		select {
    29  		case isLeader := <-s.leaderCh:
    30  			if isLeader {
    31  				stopCh = make(chan struct{})
    32  				go s.leaderLoop(stopCh)
    33  				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
    34  			} else if stopCh != nil {
    35  				close(stopCh)
    36  				stopCh = nil
    37  				s.logger.Printf("[INFO] nomad: cluster leadership lost")
    38  			}
    39  		case <-s.shutdownCh:
    40  			return
    41  		}
    42  	}
    43  }
    44  
    45  // leaderLoop runs as long as we are the leader to run various
     46  // maintenance activities
    47  func (s *Server) leaderLoop(stopCh chan struct{}) {
    48  	// Ensure we revoke leadership on stepdown
    49  	defer s.revokeLeadership()
    50  
    51  	var reconcileCh chan serf.Member
    52  	establishedLeader := false
    53  
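         	// leaderLoop alternates between two phases: RECONCILE applies a raft
         	// barrier, performs one-time leadership setup, and reconciles Serf
         	// membership with Raft; any failure falls through to WAIT, which retries
         	// on the next reconcile interval and handles member updates until
         	// leadership is lost.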
    54  RECONCILE:
    55  	// Setup a reconciliation timer
    56  	reconcileCh = nil
    57  	interval := time.After(s.config.ReconcileInterval)
    58  
    59  	// Apply a raft barrier to ensure our FSM is caught up
    60  	start := time.Now()
    61  	barrier := s.raft.Barrier(0)
    62  	if err := barrier.Error(); err != nil {
    63  		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
    64  		goto WAIT
    65  	}
    66  	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
    67  
    68  	// Check if we need to handle initial leadership actions
    69  	if !establishedLeader {
    70  		if err := s.establishLeadership(stopCh); err != nil {
    71  			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
    72  				err)
    73  			goto WAIT
    74  		}
    75  		establishedLeader = true
    76  	}
    77  
    78  	// Reconcile any missing data
    79  	if err := s.reconcile(); err != nil {
    80  		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
    81  		goto WAIT
    82  	}
    83  
    84  	// Initial reconcile worked, now we can process the channel
    85  	// updates
    86  	reconcileCh = s.reconcileCh
    87  
    88  WAIT:
    89  	// Wait until leadership is lost
    90  	for {
    91  		select {
    92  		case <-stopCh:
    93  			return
    94  		case <-s.shutdownCh:
    95  			return
    96  		case <-interval:
    97  			goto RECONCILE
    98  		case member := <-reconcileCh:
    99  			s.reconcileMember(member)
   100  		}
   101  	}
   102  }
   103  
   104  // establishLeadership is invoked once we become leader and are able
   105  // to invoke an initial barrier. The barrier is used to ensure any
   106  // previously inflight transactions have been committed and that our
   107  // state is up-to-date.
   108  func (s *Server) establishLeadership(stopCh chan struct{}) error {
    109  	// Disable some of the workers to free cores for use by the plan queue and
    110  	// evaluation broker
   111  	if numWorkers := len(s.workers); numWorkers > 1 {
   112  		// Disabling 3/4 of the workers frees CPU for raft and the
   113  		// plan applier which uses 1/2 the cores.
   114  		for i := 0; i < (3 * numWorkers / 4); i++ {
   115  			s.workers[i].SetPause(true)
   116  		}
   117  	}
   118  
   119  	// Enable the plan queue, since we are now the leader
   120  	s.planQueue.SetEnabled(true)
   121  
   122  	// Start the plan evaluator
   123  	go s.planApply()
   124  
   125  	// Enable the eval broker, since we are now the leader
   126  	s.evalBroker.SetEnabled(true)
   127  
   128  	// Enable the blocked eval tracker, since we are now the leader
   129  	s.blockedEvals.SetEnabled(true)
   130  
   131  	// Restore the eval broker state
   132  	if err := s.restoreEvals(); err != nil {
   133  		return err
   134  	}
   135  
   136  	// Activate the vault client
   137  	s.vault.SetActive(true)
   138  	if err := s.restoreRevokingAccessors(); err != nil {
   139  		return err
   140  	}
   141  
   142  	// Enable the periodic dispatcher, since we are now the leader.
   143  	s.periodicDispatcher.SetEnabled(true)
   144  	s.periodicDispatcher.Start()
   145  
   146  	// Restore the periodic dispatcher state
   147  	if err := s.restorePeriodicDispatcher(); err != nil {
   148  		return err
   149  	}
   150  
    151  	// Schedule periodic jobs
   152  	go s.schedulePeriodic(stopCh)
   153  
   154  	// Reap any failed evaluations
   155  	go s.reapFailedEvaluations(stopCh)
   156  
   157  	// Reap any duplicate blocked evaluations
   158  	go s.reapDupBlockedEvaluations(stopCh)
   159  
    160  	// Periodically unblock failed evaluations
   161  	go s.periodicUnblockFailedEvals(stopCh)
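         	// The background tasks started with stopCh above all exit once
         	// leadership is lost and monitorLeadership closes the channel.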
   162  
    163  	// Setup the heartbeat timers. This is done both when starting up and when
    164  	// a leader failover happens. Since the timers are maintained by the leader
    165  	// node, this effectively means all the timers are renewed at the time of failover.
    166  	// The TTL contract is that a node will not be expired before its TTL,
    167  	// so expiring it later is allowable.
   168  	//
   169  	// This MUST be done after the initial barrier to ensure the latest Nodes
   170  	// are available to be initialized. Otherwise initialization may use stale
   171  	// data.
   172  	if err := s.initializeHeartbeatTimers(); err != nil {
   173  		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
   174  		return err
   175  	}
   176  
    177  	// COMPAT 0.4 - 0.4.1
    178  	// Reconcile the summaries of the registered jobs. This is only needed on
    179  	// 0.4.1 servers: job summaries do not exist in 0.4, so after upgrading to
    180  	// 0.4.1 they may be missing or incorrect and must be rebuilt before they
    181  	// can be trusted.
   182  	if err := s.reconcileJobSummaries(); err != nil {
   183  		return fmt.Errorf("unable to reconcile job summaries: %v", err)
   184  	}
   185  	return nil
   186  }
   187  
   188  // restoreEvals is used to restore pending evaluations into the eval broker and
   189  // blocked evaluations into the blocked eval tracker. The broker and blocked
   190  // eval tracker is maintained only by the leader, so it must be restored anytime
    191  // eval tracker are maintained only by the leader, so they must be restored anytime
   192  func (s *Server) restoreEvals() error {
   193  	// Get an iterator over every evaluation
   194  	iter, err := s.fsm.State().Evals()
   195  	if err != nil {
   196  		return fmt.Errorf("failed to get evaluations: %v", err)
   197  	}
   198  
   199  	for {
   200  		raw := iter.Next()
   201  		if raw == nil {
   202  			break
   203  		}
   204  		eval := raw.(*structs.Evaluation)
   205  
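         		// Pending evaluations are re-enqueued into the broker, while blocked
         		// evaluations are re-registered with the blocked eval tracker.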
   206  		if eval.ShouldEnqueue() {
   207  			s.evalBroker.Enqueue(eval)
   208  		} else if eval.ShouldBlock() {
   209  			s.blockedEvals.Block(eval)
   210  		}
   211  	}
   212  	return nil
   213  }
   214  
   215  // restoreRevokingAccessors is used to restore Vault accessors that should be
   216  // revoked.
   217  func (s *Server) restoreRevokingAccessors() error {
   218  	// An accessor should be revoked if its allocation or node is terminal
   219  	state := s.fsm.State()
   220  	iter, err := state.VaultAccessors()
   221  	if err != nil {
   222  		return fmt.Errorf("failed to get vault accessors: %v", err)
   223  	}
   224  
   225  	var revoke []*structs.VaultAccessor
   226  	for {
   227  		raw := iter.Next()
   228  		if raw == nil {
   229  			break
   230  		}
   231  
   232  		va := raw.(*structs.VaultAccessor)
   233  
   234  		// Check the allocation
   235  		alloc, err := state.AllocByID(va.AllocID)
   236  		if err != nil {
    237  			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
   238  		}
   239  		if alloc == nil || alloc.Terminated() {
   240  			// No longer running and should be revoked
   241  			revoke = append(revoke, va)
   242  			continue
   243  		}
   244  
   245  		// Check the node
   246  		node, err := state.NodeByID(va.NodeID)
   247  		if err != nil {
   248  			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
   249  		}
   250  		if node == nil || node.TerminalStatus() {
   251  			// Node is terminal so any accessor from it should be revoked
   252  			revoke = append(revoke, va)
   253  			continue
   254  		}
   255  	}
   256  
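         	// Revoke the accessors in a single batched call rather than one request
         	// per accessor.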
   257  	if len(revoke) != 0 {
   258  		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
   259  			return fmt.Errorf("failed to revoke tokens: %v", err)
   260  		}
   261  	}
   262  
   263  	return nil
   264  }
   265  
   266  // restorePeriodicDispatcher is used to restore all periodic jobs into the
    267  // periodic dispatcher. It also determines if any periodic jobs should have
    268  // launched during the leadership transition and force runs them. The periodic
   269  // dispatcher is maintained only by the leader, so it must be restored anytime a
   270  // leadership transition takes place.
   271  func (s *Server) restorePeriodicDispatcher() error {
   272  	iter, err := s.fsm.State().JobsByPeriodic(true)
   273  	if err != nil {
   274  		return fmt.Errorf("failed to get periodic jobs: %v", err)
   275  	}
   276  
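         	// Re-add every periodic job to the dispatcher, force-running any job
         	// whose next launch was missed while there was no leader.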
   277  	now := time.Now()
   278  	for i := iter.Next(); i != nil; i = iter.Next() {
   279  		job := i.(*structs.Job)
   280  		s.periodicDispatcher.Add(job)
   281  
   282  		// If the periodic job has never been launched before, launch will hold
   283  		// the time the periodic job was added. Otherwise it has the last launch
   284  		// time of the periodic job.
   285  		launch, err := s.fsm.State().PeriodicLaunchByID(job.ID)
   286  		if err != nil || launch == nil {
    287  			return fmt.Errorf("failed to get periodic launch time for job %q: %v", job.ID, err)
   288  		}
   289  
   290  		// nextLaunch is the next launch that should occur.
   291  		nextLaunch := job.Periodic.Next(launch.Launch)
   292  
    293  		// We skip force launching the job if there should be no next launch
   294  		// (the zero case) or if the next launch time is in the future. If it is
   295  		// in the future, it will be handled by the periodic dispatcher.
   296  		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
   297  			continue
   298  		}
   299  
   300  		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
   301  			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
   302  			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
   303  			return errors.New(msg)
   304  		}
   305  		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
   306  			" run during leadership establishment", job.ID)
   307  	}
   308  
   309  	return nil
   310  }
   311  
   312  // schedulePeriodic is used to do periodic job dispatch while we are leader
   313  func (s *Server) schedulePeriodic(stopCh chan struct{}) {
   314  	evalGC := time.NewTicker(s.config.EvalGCInterval)
   315  	defer evalGC.Stop()
   316  	nodeGC := time.NewTicker(s.config.NodeGCInterval)
   317  	defer nodeGC.Stop()
   318  	jobGC := time.NewTicker(s.config.JobGCInterval)
   319  	defer jobGC.Stop()
   320  
   321  	// getLatest grabs the latest index from the state store. It returns true if
   322  	// the index was retrieved successfully.
   323  	getLatest := func() (uint64, bool) {
   324  		snapshotIndex, err := s.fsm.State().LatestIndex()
   325  		if err != nil {
   326  			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
   327  			return 0, false
   328  		}
   329  
   330  		return snapshotIndex, true
   331  	}
   332  
   333  	for {
   334  
   335  		select {
   336  		case <-evalGC.C:
   337  			if index, ok := getLatest(); ok {
   338  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
   339  			}
   340  		case <-nodeGC.C:
   341  			if index, ok := getLatest(); ok {
   342  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
   343  			}
   344  		case <-jobGC.C:
   345  			if index, ok := getLatest(); ok {
   346  				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
   347  			}
   348  		case <-stopCh:
   349  			return
   350  		}
   351  	}
   352  }
   353  
   354  // coreJobEval returns an evaluation for a core job
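         // The modify index captures the state store's latest index at the time the
         // core job was scheduled (see schedulePeriodic above).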
   355  func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
   356  	return &structs.Evaluation{
   357  		ID:          structs.GenerateUUID(),
   358  		Priority:    structs.CoreJobPriority,
   359  		Type:        structs.JobTypeCore,
   360  		TriggeredBy: structs.EvalTriggerScheduled,
   361  		JobID:       job,
   362  		Status:      structs.EvalStatusPending,
   363  		ModifyIndex: modifyIndex,
   364  	}
   365  }
   366  
   367  // reapFailedEvaluations is used to reap evaluations that
   368  // have reached their delivery limit and should be failed
   369  func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
   370  	for {
   371  		select {
   372  		case <-stopCh:
   373  			return
   374  		default:
   375  			// Scan for a failed evaluation
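         			// The one second timeout keeps Dequeue from blocking indefinitely so
         			// the loop can notice stopCh promptly.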
   376  			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
   377  			if err != nil {
   378  				return
   379  			}
   380  			if eval == nil {
   381  				continue
   382  			}
   383  
   384  			// Update the status to failed
   385  			newEval := eval.Copy()
   386  			newEval.Status = structs.EvalStatusFailed
   387  			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
   388  			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)
   389  
   390  			// Update via Raft
   391  			req := structs.EvalUpdateRequest{
   392  				Evals: []*structs.Evaluation{newEval},
   393  			}
   394  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   395  				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
   396  				continue
   397  			}
   398  
   399  			// Ack completion
   400  			s.evalBroker.Ack(eval.ID, token)
   401  		}
   402  	}
   403  }
   404  
    405  // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations so
    406  // that they can be cancelled.
   407  func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
   408  	for {
   409  		select {
   410  		case <-stopCh:
   411  			return
   412  		default:
   413  			// Scan for duplicate blocked evals.
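         			// A nil result means no duplicates were reported within the timeout;
         			// loop around so stopCh is checked again.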
   414  			dups := s.blockedEvals.GetDuplicates(time.Second)
   415  			if dups == nil {
   416  				continue
   417  			}
   418  
   419  			cancel := make([]*structs.Evaluation, len(dups))
   420  			for i, dup := range dups {
   421  				// Update the status to cancelled
   422  				newEval := dup.Copy()
   423  				newEval.Status = structs.EvalStatusCancelled
   424  				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
   425  				cancel[i] = newEval
   426  			}
   427  
   428  			// Update via Raft
   429  			req := structs.EvalUpdateRequest{
   430  				Evals: cancel,
   431  			}
   432  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   433  				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
   434  				continue
   435  			}
   436  		}
   437  	}
   438  }
   439  
   440  // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
   441  func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
   442  	ticker := time.NewTicker(failedEvalUnblockInterval)
   443  	defer ticker.Stop()
   444  	for {
   445  		select {
   446  		case <-stopCh:
   447  			return
   448  		case <-ticker.C:
    449  			// Unblock the failed evaluations
   450  			s.blockedEvals.UnblockFailed()
   451  		}
   452  	}
   453  }
   454  
   455  // revokeLeadership is invoked once we step down as leader.
   456  // This is used to cleanup any state that may be specific to a leader.
   457  func (s *Server) revokeLeadership() error {
   458  	// Disable the plan queue, since we are no longer leader
   459  	s.planQueue.SetEnabled(false)
   460  
   461  	// Disable the eval broker, since it is only useful as a leader
   462  	s.evalBroker.SetEnabled(false)
   463  
   464  	// Disable the blocked eval tracker, since it is only useful as a leader
   465  	s.blockedEvals.SetEnabled(false)
   466  
   467  	// Disable the periodic dispatcher, since it is only useful as a leader
   468  	s.periodicDispatcher.SetEnabled(false)
   469  
   470  	// Disable the Vault client as it is only useful as a leader.
   471  	s.vault.SetActive(false)
   472  
   473  	// Clear the heartbeat timers on either shutdown or step down,
   474  	// since we are no longer responsible for TTL expirations.
   475  	if err := s.clearAllHeartbeatTimers(); err != nil {
   476  		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
   477  		return err
   478  	}
   479  
    480  	// Unpause the 3/4 of the workers that establishLeadership paused.
    481  	if numWorkers := len(s.workers); numWorkers > 1 {
    482  		for i := 0; i < (3 * numWorkers / 4); i++ {
   483  			s.workers[i].SetPause(false)
   484  		}
   485  	}
   486  	return nil
   487  }
   488  
   489  // reconcile is used to reconcile the differences between Serf
   490  // membership and what is reflected in our strongly consistent store.
   491  func (s *Server) reconcile() error {
   492  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
   493  	members := s.serf.Members()
   494  	for _, member := range members {
   495  		if err := s.reconcileMember(member); err != nil {
   496  			return err
   497  		}
   498  	}
   499  	return nil
   500  }
   501  
    502  // reconcileMember is used to reconcile a single serf member
   503  func (s *Server) reconcileMember(member serf.Member) error {
   504  	// Check if this is a member we should handle
   505  	valid, parts := isNomadServer(member)
   506  	if !valid || parts.Region != s.config.Region {
   507  		return nil
   508  	}
   509  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
   510  
   511  	// Do not reconcile ourself
   512  	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
   513  		return nil
   514  	}
   515  
   516  	var err error
   517  	switch member.Status {
   518  	case serf.StatusAlive:
   519  		err = s.addRaftPeer(member, parts)
   520  	case serf.StatusLeft, StatusReap:
   521  		err = s.removeRaftPeer(member, parts)
   522  	}
   523  	if err != nil {
   524  		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
   525  			member, err)
   526  		return err
   527  	}
   528  	return nil
   529  }
   530  
   531  // reconcileJobSummaries reconciles the summaries of all the jobs registered in
   532  // the system
   533  // COMPAT 0.4 -> 0.4.1
   534  func (s *Server) reconcileJobSummaries() error {
    535  	index, err := s.fsm.State().LatestIndex()
   536  	if err != nil {
   537  		return fmt.Errorf("unable to read latest index: %v", err)
   538  	}
   539  	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)
   540  
   541  	args := &structs.GenericResponse{}
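         	// ORing in IgnoreUnknownTypeFlag allows servers that do not understand
         	// this message type to skip the log entry rather than fail on it.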
   542  	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
   543  	if _, _, err = s.raftApply(msg, args); err != nil {
   544  		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
   545  	}
   546  
   547  	return nil
   548  }
   549  
   550  // addRaftPeer is used to add a new Raft peer when a Nomad server joins
   551  func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
   552  	// Check for possibility of multiple bootstrap nodes
   553  	if parts.Bootstrap {
   554  		members := s.serf.Members()
   555  		for _, member := range members {
   556  			valid, p := isNomadServer(member)
   557  			if valid && member.Name != m.Name && p.Bootstrap {
   558  				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
   559  				return nil
   560  			}
   561  		}
   562  	}
   563  
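         	// AddPeer is tolerant of repeats: raft.ErrKnownPeer below simply means
         	// this server is already in the peer set and is not treated as an error.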
   564  	// Attempt to add as a peer
   565  	future := s.raft.AddPeer(parts.Addr.String())
   566  	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
   567  		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
   568  		return err
   569  	} else if err == nil {
   570  		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
   571  	}
   572  	return nil
   573  }
   574  
   575  // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
   576  // or is reaped
   577  func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
   578  	// Attempt to remove as peer
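         	// raft.ErrUnknownPeer is tolerated below since it simply means the peer
         	// was already removed.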
   579  	future := s.raft.RemovePeer(parts.Addr.String())
   580  	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
   581  		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
   582  			parts, err)
   583  		return err
   584  	} else if err == nil {
   585  		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
   586  	}
   587  	return nil
   588  }