github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/leader.go

package nomad

import (
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes
func (s *Server) monitorLeadership() {
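	// stopCh is non-nil only while we hold leadership; closing it signals
	// leaderLoop to exit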
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

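	// reconcileCh stays nil until the initial reconcile below succeeds; a
	// receive from a nil channel blocks, so member updates are deferred
	// until then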
	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
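	// (a zero timeout places no limit on how long we wait for the barrier)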
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// If we have multiple workers, disable one to free processing
	// for the plan queue and evaluation broker
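	// (the paused worker is resumed in revokeLeadership when we step down)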
	if len(s.workers) > 1 {
		s.workers[0].SetPause(true)
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Restore the eval broker state
	if err := s.restoreEvalBroker(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leadership failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that a node will not be expired before its TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}
	return nil
}

// restoreEvalBroker is used to restore all pending evaluations
// into the eval broker. The broker is maintained only by the leader,
// so it must be restored anytime a leadership transition takes place.
func (s *Server) restoreEvalBroker() error {
	// Get an iterator over every evaluation
	iter, err := s.fsm.State().Evals()
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

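		// Only re-enqueue evaluations that still need to be processed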
		if !eval.ShouldEnqueue() {
			continue
		}

		if err := s.evalBroker.Enqueue(eval); err != nil {
			return fmt.Errorf("failed to enqueue evaluation %s: %v", eval.ID, err)
		}
	}
	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
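// Each tick enqueues a core-job evaluation (eval GC or node GC) into the
// eval broker.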
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()

	for {
		select {
		case <-evalGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
		case <-nodeGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC))
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: s.raft.AppliedIndex(),
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
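			// (Dequeue waits up to a second, so this loop does not busy-spin)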
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			newEval := eval.Copy()
			newEval.Status = structs.EvalStatusFailed
			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{newEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to clean up any state that may be specific to a leader.
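// It runs via the deferred call in leaderLoop, so it also covers shutting
// down while we are the leader.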
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our worker if we paused previously
	if len(s.workers) > 1 {
		s.workers[0].SetPause(false)
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to reconcile a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourselves
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
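	// Alive members are added as Raft peers; members that have left or been
	// reaped are removed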
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// Attempt to add as a peer
	future := s.raft.AddPeer(parts.Addr.String())
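	// raft.ErrKnownPeer means the peer is already part of the cluster and is
	// not treated as a failure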
	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	}
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// Attempt to remove as peer
	future := s.raft.RemovePeer(parts.Addr.String())
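	// raft.ErrUnknownPeer means the peer was already removed, which is fine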
	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
	}
	return nil
}