github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/leader.go

package nomad

import (
	"errors"
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes
func (s *Server) monitorLeadership() {
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// If we have multiple workers, disable one to free processing
	// for the plan queue and evaluation broker
	if len(s.workers) > 1 {
		s.workers[0].SetPause(true)
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Restore the eval broker state
	if err := s.restoreEvalBroker(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
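	// Like the plan queue and eval broker above, the dispatcher is a
	// leader-only component; restorePeriodicDispatcher below rebuilds its
	// tracked jobs from state and force runs any launches missed during the
	// transition.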
	s.periodicDispatcher.SetEnabled(true)
	s.periodicDispatcher.Start()

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}
	return nil
}

// restoreEvalBroker is used to restore all pending evaluations
// into the eval broker. The broker is maintained only by the leader,
// so it must be restored anytime a leadership transition takes place.
func (s *Server) restoreEvalBroker() error {
	// Get an iterator over every evaluation
	iter, err := s.fsm.State().Evals()
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if !eval.ShouldEnqueue() {
			continue
		}

		if err := s.evalBroker.Enqueue(eval); err != nil {
			return fmt.Errorf("failed to enqueue evaluation %s: %v", eval.ID, err)
		}
	}
	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if periodic jobs should have been
// created during the leadership transition and force runs them. The periodic
// dispatcher is maintained only by the leader, so it must be restored anytime a
// leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	iter, err := s.fsm.State().JobsByPeriodic(true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)
		s.periodicDispatcher.Add(job)

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(job.ID)
		if err != nil || launch == nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch)

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
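		//
		// For example (illustrative times, not from the source): if a job
		// last launched at 08:00 and its periodic spec puts the next launch
		// at 09:00, a leader elected at 08:30 skips it and lets the
		// dispatcher fire it at 09:00, while a leader elected at 09:30 force
		// runs it because that launch was missed.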
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()

	for {
		select {
		case <-evalGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
		case <-nodeGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC))
		case <-jobGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC))
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: s.raft.AppliedIndex(),
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			newEval := eval.Copy()
			newEval.Status = structs.EvalStatusFailed
			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{newEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
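	// The next leader re-creates these timers in establishLeadership via
	// initializeHeartbeatTimers, which effectively renews every node's TTL
	// at failover; per the TTL contract noted there, expiring later than the
	// TTL is allowable, expiring earlier is not.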
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause our worker if we paused previously
	if len(s.workers) > 1 {
		s.workers[0].SetPause(false)
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourself
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// Attempt to add as a peer
	future := s.raft.AddPeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	}
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// Attempt to remove as peer
	future := s.raft.RemovePeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
	}
	return nil
}
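// Note (added commentary, not part of the upstream file): monitorLeadership
// is the entry point for everything above. The server is assumed to start it
// once during setup, roughly:
//
//	go s.monitorLeadership()
//
// after which leaderCh notifications from Raft drive leaderLoop,
// establishLeadership, the reconcile helpers, and finally revokeLeadership on
// step down.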