github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/leader.go

package nomad

import (
	"errors"
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}
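
// For context, s.leaderCh above is assumed to be fed by the raft library's
// leadership notifications. A minimal sketch of that wiring (hypothetical
// setup code; the real wiring lives in the server setup, not in this file):
//
//	leaderCh := make(chan bool, 1)
//	raftConfig := raft.DefaultConfig()
//	raftConfig.NotifyCh = leaderCh // raft sends true on gain, false on loss
//	// ... construct the raft instance with raftConfig, keep leaderCh on s.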

// leaderLoop runs as long as we are the leader to run various
// maintenance activities.
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}
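
// For readers unused to goto, the RECONCILE/WAIT flow above is equivalent in
// shape to this nested-loop sketch (illustrative only, not a drop-in
// replacement; the shutdown and member-event cases are elided):
//
//	for {
//		// RECONCILE: barrier, establishLeadership, reconcile ...
//		for waiting := true; waiting; {
//			select {
//			case <-interval:
//				waiting = false // interval fired: back to RECONCILE
//			case <-stopCh:
//				return
//			}
//		}
//	}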

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// If we have multiple workers, disable one to free processing
	// for the plan queue and evaluation broker
	if len(s.workers) > 1 {
		s.workers[0].SetPause(true)
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Restore the eval broker state
	if err := s.restoreEvalBroker(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)
	s.periodicDispatcher.Start()

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}
	return nil
}
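
// SetPause above is assumed to make the paused worker block before its next
// dequeue until it is unpaused again on revocation. A minimal sketch of that
// contract (hypothetical names; the real implementation lives in worker.go):
//
//	func (w *Worker) checkPaused() {
//		w.pauseLock.Lock()
//		for w.paused {
//			w.pauseCond.Wait() // released by SetPause(false)
//		}
//		w.pauseLock.Unlock()
//	}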

// restoreEvalBroker is used to restore all pending evaluations
// into the eval broker. The broker is maintained only by the leader,
// so it must be restored anytime a leadership transition takes place.
func (s *Server) restoreEvalBroker() error {
	// Get an iterator over every evaluation
	iter, err := s.fsm.State().Evals()
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if !eval.ShouldEnqueue() {
			continue
		}

		if err := s.evalBroker.Enqueue(eval); err != nil {
			return fmt.Errorf("failed to enqueue evaluation %s: %v", eval.ID, err)
		}
	}
	return nil
}
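
// ShouldEnqueue gates the restore above. The assumed behavior, in sketch
// form, is that only evaluations still awaiting scheduling are re-queued,
// while terminal evaluations are skipped:
//
//	func (e *Evaluation) ShouldEnqueue() bool {
//		return e.Status == EvalStatusPending
//	}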

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if any periodic jobs should have
// been created during the leadership transition and force runs them. The
// periodic dispatcher is maintained only by the leader, so it must be
// restored anytime a leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	iter, err := s.fsm.State().JobsByPeriodic(true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)
		s.periodicDispatcher.Add(job)

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(job.ID)
		if err != nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}
		if launch == nil {
			return fmt.Errorf("no recorded periodic launch time for job %q", job.ID)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch)

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}
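
// Worked example with hypothetical values: a job on an hourly schedule last
// launched at 03:00, with the leadership transition happening at 05:30. Then
// nextLaunch = Periodic.Next(03:00) = 04:00, which is before now, so the job
// is force-run once to cover the missed window; the 06:00 run is then
// scheduled normally by the dispatcher.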

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()

	for {
		select {
		case <-evalGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
		case <-nodeGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC))
		case <-jobGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC))
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: s.raft.AppliedIndex(),
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			newEval := eval.Copy()
			newEval.Status = structs.EvalStatusFailed
			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{newEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}
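
// The broker contract relied on above, sketched under assumed semantics:
//
//	eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
//	// eval == nil: the timeout elapsed with nothing to deliver; poll again.
//	// token: an ownership handle that must accompany Ack so the broker can
//	// verify the caller still holds this delivery before removing it.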

// revokeLeadership is invoked once we step down as leader.
// This is used to clean up any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our worker if we paused previously
	if len(s.workers) > 1 {
		s.workers[0].SetPause(false)
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to reconcile a single serf member against the
// Raft peer set
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourselves
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}
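
// isNomadServer (defined elsewhere in this package) filters serf members by
// their gossip tags. A sketch of the assumed shape of its results:
//
//	valid, parts := isNomadServer(member)
//	// valid: the member's tags identify it as a Nomad server
//	// parts: parsed tag data such as Region, Bootstrap, and the raft Addr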

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// Attempt to add as a peer
	future := s.raft.AddPeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	}
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// Attempt to remove as peer
	future := s.raft.RemovePeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
	}
	return nil
}