// github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/leader.go

package nomad

import (
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader, performing various
// maintenance activities.
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Set up a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}
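// The leaderCh consumed by monitorLeadership is assumed to be wired to
// Raft's leadership notifications when the server is constructed (the
// actual wiring lives in server.go, not in this file). A minimal sketch
// of that pattern, with illustrative names only:
//
//	leaderCh := make(chan bool, 1)
//	raftConfig := raft.DefaultConfig()
//	raftConfig.NotifyCh = leaderCh // raft sends true on gain, false on loss
//	// ...create the raft instance with raftConfig, store leaderCh on the
//	// Server, then run: go s.monitorLeadership()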
// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// If we have multiple workers, disable one to free processing
	// for the plan queue and evaluation broker
	if len(s.workers) > 1 {
		s.workers[0].SetPause(true)
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Restore the eval broker state
	if err := s.restoreEvalBroker(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Set up the heartbeat timers. This is done both when starting up and
	// when a leader failover happens. Since the timers are maintained by
	// the leader node, this effectively means all the timers are renewed
	// at the time of failover. The TTL contract is that the session will
	// not be expired before the TTL, so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest
	// Nodes are available to be initialized. Otherwise initialization may
	// use stale data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}
	return nil
}

// restoreEvalBroker is used to restore all pending evaluations
// into the eval broker. The broker is maintained only by the leader,
// so it must be restored anytime a leadership transition takes place.
func (s *Server) restoreEvalBroker() error {
	// Get an iterator over every evaluation
	iter, err := s.fsm.State().Evals()
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if !eval.ShouldEnqueue() {
			continue
		}

		if err := s.evalBroker.Enqueue(eval); err != nil {
			return fmt.Errorf("failed to enqueue evaluation %s: %v", eval.ID, err)
		}
	}
	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()

	for {
		select {
		case <-evalGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
		case <-nodeGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC))
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: s.raft.AppliedIndex(),
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			newEval := eval.Copy()
			newEval.Status = structs.EvalStatusFailed
			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{newEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}
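// For context on the reap loop above: an evaluation lands on failedQueue
// once it has been dequeued and Nacked more times than the configured
// delivery limit. A rough, test-style sketch of that lifecycle, assuming
// the broker semantics in eval_broker.go (names here are illustrative):
//
//	eval := mock.Eval()
//	s.evalBroker.Enqueue(eval)
//	out, token, _ := s.evalBroker.Dequeue([]string{eval.Type}, time.Second)
//	s.evalBroker.Nack(out.ID, token) // repeated Nacks past EvalDeliveryLimit...
//	// ...route the eval onto failedQueue, where reapFailedEvaluations
//	// marks it EvalStatusFailed via Raft and Acks it.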
// revokeLeadership is invoked once we step down as leader.
// This is used to clean up any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our worker if we paused previously
	if len(s.workers) > 1 {
		s.workers[0].SetPause(false)
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourselves
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// Attempt to add as a peer
	future := s.raft.AddPeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	}
	return nil
}
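// reconcileMember relies on isNomadServer (defined in util.go) to decode a
// Serf member into serverParts. A sketch of the member shape it expects,
// with tag names assumed from this version's gossip metadata:
//
//	m := serf.Member{
//		Name:   "node1.global",
//		Status: serf.StatusAlive,
//		Tags: map[string]string{
//			"role":   "nomad",
//			"region": "global",
//			"port":   "4647",
//		},
//	}
//	ok, parts := isNomadServer(m) // ok == true, parts.Region == "global"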
// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// Attempt to remove as a peer
	future := s.raft.RemovePeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
	}
	return nil
}
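// Note that both addRaftPeer and removeRaftPeer treat "already in the
// desired state" as success: raft.ErrKnownPeer and raft.ErrUnknownPeer
// are swallowed rather than returned, so reconciliation is idempotent and
// safe to repeat on every Serf event. (This is the pre-v1 hashicorp/raft
// peer API; later raft releases replace AddPeer/RemovePeer with
// AddVoter/RemoveServer.)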