github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/leader.go

     1  package nomad
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"time"
     7  
     8  	"github.com/armon/go-metrics"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/raft"
    11  	"github.com/hashicorp/serf/serf"
    12  )
    13  
    14  const (
    15  	// failedEvalUnblockInterval is the interval at which failed evaluations are
    16  	// unblocked to re-enter the scheduler. A failed evaluation occurs under
     17  	// high contention when the scheduler's plan does not make progress.
    18  	failedEvalUnblockInterval = 1 * time.Minute
    19  )
    20  
    21  // monitorLeadership is used to monitor if we acquire or lose our role
    22  // as the leader in the Raft cluster. There is some work the leader is
     23  // expected to do, so we must react to leadership changes.
    24  func (s *Server) monitorLeadership() {
    25  	var stopCh chan struct{}
    26  	for {
    27  		select {
    28  		case isLeader := <-s.leaderCh:
    29  			if isLeader {
    30  				stopCh = make(chan struct{})
    31  				go s.leaderLoop(stopCh)
    32  				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
    33  			} else if stopCh != nil {
    34  				close(stopCh)
    35  				stopCh = nil
    36  				s.logger.Printf("[INFO] nomad: cluster leadership lost")
    37  			}
    38  		case <-s.shutdownCh:
    39  			return
    40  		}
    41  	}
    42  }
    43  
    44  // leaderLoop runs as long as we are the leader to run various
     45  // maintenance activities
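         // The loop alternates between the RECONCILE block, which re-establishes
         // leader-only state and reconciles Serf membership, and the WAIT block,
         // which blocks until the reconcile interval fires or leadership is lost.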
    46  func (s *Server) leaderLoop(stopCh chan struct{}) {
    47  	// Ensure we revoke leadership on stepdown
    48  	defer s.revokeLeadership()
    49  
    50  	var reconcileCh chan serf.Member
    51  	establishedLeader := false
    52  
    53  RECONCILE:
    54  	// Setup a reconciliation timer
    55  	reconcileCh = nil
    56  	interval := time.After(s.config.ReconcileInterval)
    57  
    58  	// Apply a raft barrier to ensure our FSM is caught up
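         	// Note: a zero timeout asks raft to block until the barrier is applied
         	// rather than giving up after a deadline.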
    59  	start := time.Now()
    60  	barrier := s.raft.Barrier(0)
    61  	if err := barrier.Error(); err != nil {
    62  		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
    63  		goto WAIT
    64  	}
    65  	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)
    66  
    67  	// Check if we need to handle initial leadership actions
    68  	if !establishedLeader {
    69  		if err := s.establishLeadership(stopCh); err != nil {
    70  			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
    71  				err)
    72  			goto WAIT
    73  		}
    74  		establishedLeader = true
    75  	}
    76  
    77  	// Reconcile any missing data
    78  	if err := s.reconcile(); err != nil {
    79  		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
    80  		goto WAIT
    81  	}
    82  
    83  	// Initial reconcile worked, now we can process the channel
    84  	// updates
    85  	reconcileCh = s.reconcileCh
    86  
    87  WAIT:
    88  	// Wait until leadership is lost
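         	// reconcileCh is nil until the initial reconcile above succeeds; receiving
         	// from a nil channel blocks forever, so member events are ignored until then.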
    89  	for {
    90  		select {
    91  		case <-stopCh:
    92  			return
    93  		case <-s.shutdownCh:
    94  			return
    95  		case <-interval:
    96  			goto RECONCILE
    97  		case member := <-reconcileCh:
    98  			s.reconcileMember(member)
    99  		}
   100  	}
   101  }
   102  
   103  // establishLeadership is invoked once we become leader and are able
   104  // to invoke an initial barrier. The barrier is used to ensure any
   105  // previously inflight transactions have been committed and that our
   106  // state is up-to-date.
   107  func (s *Server) establishLeadership(stopCh chan struct{}) error {
    108  	// Pause a portion of the scheduling workers to free cores for the plan
    109  	// queue and evaluation broker
   110  	if numWorkers := len(s.workers); numWorkers > 1 {
   111  		// Disabling 3/4 of the workers frees CPU for raft and the
   112  		// plan applier which uses 1/2 the cores.
   113  		for i := 0; i < (3 * numWorkers / 4); i++ {
   114  			s.workers[i].SetPause(true)
   115  		}
   116  	}
   117  
   118  	// Enable the plan queue, since we are now the leader
   119  	s.planQueue.SetEnabled(true)
   120  
   121  	// Start the plan evaluator
   122  	go s.planApply()
   123  
   124  	// Enable the eval broker, since we are now the leader
   125  	s.evalBroker.SetEnabled(true)
   126  
   127  	// Enable the blocked eval tracker, since we are now the leader
   128  	s.blockedEvals.SetEnabled(true)
   129  
   130  	// Restore the eval broker state
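         	// The broker holds its state in memory only, so pending and blocked
         	// evaluations are re-read from the state store on every transition.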
   131  	if err := s.restoreEvals(); err != nil {
   132  		return err
   133  	}
   134  
   135  	// Enable the periodic dispatcher, since we are now the leader.
   136  	s.periodicDispatcher.SetEnabled(true)
   137  	s.periodicDispatcher.Start()
   138  
   139  	// Restore the periodic dispatcher state
   140  	if err := s.restorePeriodicDispatcher(); err != nil {
   141  		return err
   142  	}
   143  
    144  	// Schedule periodic jobs
   145  	go s.schedulePeriodic(stopCh)
   146  
   147  	// Reap any failed evaluations
   148  	go s.reapFailedEvaluations(stopCh)
   149  
   150  	// Reap any duplicate blocked evaluations
   151  	go s.reapDupBlockedEvaluations(stopCh)
   152  
    153  	// Periodically unblock failed evaluations
   154  	go s.periodicUnblockFailedEvals(stopCh)
   155  
    156  	// Setup the heartbeat timers. This is done both when starting up and when
    157  	// a leader failover happens. Since the timers are maintained by the leader
    158  	// node, effectively this means all the timers are renewed at the time of failover.
    159  	// The TTL contract is that the node will not be expired before the TTL,
    160  	// so expiring it later is allowable.
   161  	//
   162  	// This MUST be done after the initial barrier to ensure the latest Nodes
   163  	// are available to be initialized. Otherwise initialization may use stale
   164  	// data.
   165  	if err := s.initializeHeartbeatTimers(); err != nil {
   166  		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
   167  		return err
   168  	}
   169  	return nil
   170  }
   171  
   172  // restoreEvals is used to restore pending evaluations into the eval broker and
   173  // blocked evaluations into the blocked eval tracker. The broker and blocked
    174  // eval tracker are maintained only by the leader, so they must be restored anytime
   175  // a leadership transition takes place.
   176  func (s *Server) restoreEvals() error {
   177  	// Get an iterator over every evaluation
   178  	iter, err := s.fsm.State().Evals()
   179  	if err != nil {
   180  		return fmt.Errorf("failed to get evaluations: %v", err)
   181  	}
   182  
   183  	for {
   184  		raw := iter.Next()
   185  		if raw == nil {
   186  			break
   187  		}
   188  		eval := raw.(*structs.Evaluation)
   189  
   190  		if eval.ShouldEnqueue() {
   191  			s.evalBroker.Enqueue(eval)
   192  		} else if eval.ShouldBlock() {
   193  			s.blockedEvals.Block(eval)
   194  		}
   195  	}
   196  	return nil
   197  }
   198  
   199  // restorePeriodicDispatcher is used to restore all periodic jobs into the
    200  // periodic dispatcher. It also determines whether a periodic job should have
    201  // launched during the leadership transition and, if so, force runs it. The periodic
   202  // dispatcher is maintained only by the leader, so it must be restored anytime a
   203  // leadership transition takes place.
   204  func (s *Server) restorePeriodicDispatcher() error {
   205  	iter, err := s.fsm.State().JobsByPeriodic(true)
   206  	if err != nil {
   207  		return fmt.Errorf("failed to get periodic jobs: %v", err)
   208  	}
   209  
   210  	now := time.Now()
   211  	for i := iter.Next(); i != nil; i = iter.Next() {
   212  		job := i.(*structs.Job)
   213  		s.periodicDispatcher.Add(job)
   214  
   215  		// If the periodic job has never been launched before, launch will hold
   216  		// the time the periodic job was added. Otherwise it has the last launch
   217  		// time of the periodic job.
   218  		launch, err := s.fsm.State().PeriodicLaunchByID(job.ID)
   219  		if err != nil || launch == nil {
   220  			return fmt.Errorf("failed to get periodic launch time: %v", err)
   221  		}
   222  
   223  		// nextLaunch is the next launch that should occur.
   224  		nextLaunch := job.Periodic.Next(launch.Launch)
   225  
    226  		// We skip force launching the job if there should be no next launch
   227  		// (the zero case) or if the next launch time is in the future. If it is
   228  		// in the future, it will be handled by the periodic dispatcher.
   229  		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
   230  			continue
   231  		}
   232  
   233  		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
   234  			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
   235  			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
   236  			return errors.New(msg)
   237  		}
   238  		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
   239  			" run during leadership establishment", job.ID)
   240  	}
   241  
   242  	return nil
   243  }
   244  
   245  // schedulePeriodic is used to do periodic job dispatch while we are leader
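         // Each tick enqueues a core-job evaluation (see coreJobEval below); these are
         // picked up by the scheduler workers and handed to the core scheduler, which
         // carries out the actual garbage collection.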
   246  func (s *Server) schedulePeriodic(stopCh chan struct{}) {
   247  	evalGC := time.NewTicker(s.config.EvalGCInterval)
   248  	defer evalGC.Stop()
   249  	nodeGC := time.NewTicker(s.config.NodeGCInterval)
   250  	defer nodeGC.Stop()
   251  	jobGC := time.NewTicker(s.config.JobGCInterval)
   252  	defer jobGC.Stop()
   253  
   254  	for {
   255  		select {
   256  		case <-evalGC.C:
   257  			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
   258  		case <-nodeGC.C:
   259  			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC))
   260  		case <-jobGC.C:
   261  			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC))
   262  		case <-stopCh:
   263  			return
   264  		}
   265  	}
   266  }
   267  
   268  // coreJobEval returns an evaluation for a core job
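         // The eval carries the reserved core-job priority, and its JobID names the
         // kind of garbage collection to run (eval, node, or job GC).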
   269  func (s *Server) coreJobEval(job string) *structs.Evaluation {
   270  	return &structs.Evaluation{
   271  		ID:          structs.GenerateUUID(),
   272  		Priority:    structs.CoreJobPriority,
   273  		Type:        structs.JobTypeCore,
   274  		TriggeredBy: structs.EvalTriggerScheduled,
   275  		JobID:       job,
   276  		Status:      structs.EvalStatusPending,
   277  		ModifyIndex: s.raft.AppliedIndex(),
   278  	}
   279  }
   280  
   281  // reapFailedEvaluations is used to reap evaluations that
   282  // have reached their delivery limit and should be failed
   283  func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
   284  	for {
   285  		select {
   286  		case <-stopCh:
   287  			return
   288  		default:
   289  			// Scan for a failed evaluation
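         			// Dequeue blocks for up to a second so the stopCh case above is
         			// re-checked regularly; a nil eval just means the wait timed out.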
   290  			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
   291  			if err != nil {
   292  				return
   293  			}
   294  			if eval == nil {
   295  				continue
   296  			}
   297  
   298  			// Update the status to failed
   299  			newEval := eval.Copy()
   300  			newEval.Status = structs.EvalStatusFailed
   301  			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
   302  			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)
   303  
   304  			// Update via Raft
   305  			req := structs.EvalUpdateRequest{
   306  				Evals: []*structs.Evaluation{newEval},
   307  			}
   308  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   309  				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
   310  				continue
   311  			}
   312  
   313  			// Ack completion
   314  			s.evalBroker.Ack(eval.ID, token)
   315  		}
   316  	}
   317  }
   318  
    319  // reapDupBlockedEvaluations is used to reap duplicate blocked evaluations;
    320  // the duplicates are cancelled because an existing blocked eval covers the job.
   321  func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
   322  	for {
   323  		select {
   324  		case <-stopCh:
   325  			return
   326  		default:
   327  			// Scan for duplicate blocked evals.
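         			// GetDuplicates waits up to a second for duplicates to accumulate;
         			// a nil result means none arrived, so loop and re-check stopCh.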
   328  			dups := s.blockedEvals.GetDuplicates(time.Second)
   329  			if dups == nil {
   330  				continue
   331  			}
   332  
   333  			cancel := make([]*structs.Evaluation, len(dups))
   334  			for i, dup := range dups {
   335  				// Update the status to cancelled
   336  				newEval := dup.Copy()
   337  				newEval.Status = structs.EvalStatusCancelled
   338  				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
   339  				cancel[i] = newEval
   340  			}
   341  
   342  			// Update via Raft
   343  			req := structs.EvalUpdateRequest{
   344  				Evals: cancel,
   345  			}
   346  			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
   347  				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
   348  				continue
   349  			}
   350  		}
   351  	}
   352  }
   353  
   354  // periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
   355  func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
    356  	ticker := time.NewTicker(failedEvalUnblockInterval)
   357  	defer ticker.Stop()
   358  	for {
   359  		select {
   360  		case <-stopCh:
   361  			return
   362  		case <-ticker.C:
    363  			// Unblock the failed evaluations
   364  			s.blockedEvals.UnblockFailed()
   365  		}
   366  	}
   367  }
   368  
   369  // revokeLeadership is invoked once we step down as leader.
    370  // This is used to clean up any state that may be specific to a leader.
   371  func (s *Server) revokeLeadership() error {
   372  	// Disable the plan queue, since we are no longer leader
   373  	s.planQueue.SetEnabled(false)
   374  
   375  	// Disable the eval broker, since it is only useful as a leader
   376  	s.evalBroker.SetEnabled(false)
   377  
   378  	// Disable the blocked eval tracker, since it is only useful as a leader
   379  	s.blockedEvals.SetEnabled(false)
   380  
   381  	// Disable the periodic dispatcher, since it is only useful as a leader
   382  	s.periodicDispatcher.SetEnabled(false)
   383  
   384  	// Clear the heartbeat timers on either shutdown or step down,
   385  	// since we are no longer responsible for TTL expirations.
   386  	if err := s.clearAllHeartbeatTimers(); err != nil {
   387  		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
   388  		return err
   389  	}
   390  
    391  	// Unpause the workers we paused previously (the same 3/4 fraction paused in establishLeadership)
    392  	if numWorkers := len(s.workers); numWorkers > 1 {
    393  		for i := 0; i < (3 * numWorkers / 4); i++ {
   394  			s.workers[i].SetPause(false)
   395  		}
   396  	}
   397  	return nil
   398  }
   399  
   400  // reconcile is used to reconcile the differences between Serf
   401  // membership and what is reflected in our strongly consistent store.
   402  func (s *Server) reconcile() error {
   403  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
   404  	members := s.serf.Members()
   405  	for _, member := range members {
   406  		if err := s.reconcileMember(member); err != nil {
   407  			return err
   408  		}
   409  	}
   410  	return nil
   411  }
   412  
    413  // reconcileMember is used to reconcile a single serf member against the raft peer set
   414  func (s *Server) reconcileMember(member serf.Member) error {
   415  	// Check if this is a member we should handle
   416  	valid, parts := isNomadServer(member)
   417  	if !valid || parts.Region != s.config.Region {
   418  		return nil
   419  	}
   420  	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
   421  
   422  	// Do not reconcile ourself
   423  	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
   424  		return nil
   425  	}
   426  
   427  	var err error
   428  	switch member.Status {
   429  	case serf.StatusAlive:
   430  		err = s.addRaftPeer(member, parts)
   431  	case serf.StatusLeft, StatusReap:
   432  		err = s.removeRaftPeer(member, parts)
   433  	}
   434  	if err != nil {
   435  		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
   436  			member, err)
   437  		return err
   438  	}
   439  	return nil
   440  }
   441  
   442  // addRaftPeer is used to add a new Raft peer when a Nomad server joins
   443  func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
   444  	// Check for possibility of multiple bootstrap nodes
   445  	if parts.Bootstrap {
   446  		members := s.serf.Members()
   447  		for _, member := range members {
   448  			valid, p := isNomadServer(member)
   449  			if valid && member.Name != m.Name && p.Bootstrap {
   450  				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
   451  				return nil
   452  			}
   453  		}
   454  	}
   455  
   456  	// Attempt to add as a peer
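         	// raft.ErrKnownPeer is tolerated below: it simply means this server is
         	// already part of the peer set.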
   457  	future := s.raft.AddPeer(parts.Addr.String())
   458  	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
   459  		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
   460  		return err
   461  	} else if err == nil {
   462  		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
   463  	}
   464  	return nil
   465  }
   466  
   467  // removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
   468  // or is reaped
   469  func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
   470  	// Attempt to remove as peer
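         	// raft.ErrUnknownPeer is tolerated below: it simply means this server has
         	// already been removed from the peer set.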
   471  	future := s.raft.RemovePeer(parts.Addr.String())
   472  	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
   473  		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
   474  			parts, err)
   475  		return err
   476  	} else if err == nil {
   477  		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
   478  	}
   479  	return nil
   480  }