github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/leader.go

package nomad

import (
	"errors"
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}

// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Disable workers to free CPU for use in the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)
	s.periodicDispatcher.Start()

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}
	return nil
}

// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored
// anytime a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	iter, err := s.fsm.State().Evals()
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if any periodic jobs should have been
// launched during the leadership transition and force runs them. The periodic
// dispatcher is maintained only by the leader, so it must be restored anytime a
// leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	iter, err := s.fsm.State().JobsByPeriodic(true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)
		s.periodicDispatcher.Add(job)

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(job.ID)
		if err != nil || launch == nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch)

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}

// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()

	for {
		select {
		case <-evalGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC))
		case <-nodeGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC))
		case <-jobGC.C:
			s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC))
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: s.raft.AppliedIndex(),
	}
}

// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			newEval := eval.Copy()
			newEval.Status = structs.EvalStatusFailed
			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{newEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations and
// cancel them.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	// Use a ticker so the unblock fires on every interval, not just once.
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause the workers we paused when establishing leadership, using the
	// same 3/4 bound so none are left paused after stepping down.
	if numWorkers := len(s.workers); numWorkers > 1 {
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourself
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// Attempt to add as a peer
	future := s.raft.AddPeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrKnownPeer {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	}
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// Attempt to remove as peer
	future := s.raft.RemovePeer(parts.Addr.String())
	if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	} else if err == nil {
		s.logger.Printf("[INFO] nomad: removed server '%s' as peer", m.Name)
	}
	return nil
}