github.com/ncodes/nomad@v0.5.7-0.20170403112158-97adf4a74fb3/nomad/leader.go

package nomad

import (
	"context"
	"errors"
	"fmt"
	"net"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/ncodes/nomad/nomad/structs"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

const (
	// failedEvalUnblockInterval is the interval at which failed evaluations are
	// unblocked to re-enter the scheduler. A failed evaluation occurs under
	// high contention when the scheduler's plan does not make progress.
	failedEvalUnblockInterval = 1 * time.Minute
)

// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes.
func (s *Server) monitorLeadership() {
	var stopCh chan struct{}
	for {
		select {
		case isLeader := <-s.leaderCh:
			if isLeader {
				stopCh = make(chan struct{})
				go s.leaderLoop(stopCh)
				s.logger.Printf("[INFO] nomad: cluster leadership acquired")
			} else if stopCh != nil {
				close(stopCh)
				stopCh = nil
				s.logger.Printf("[INFO] nomad: cluster leadership lost")
			}
		case <-s.shutdownCh:
			return
		}
	}
}

// leaderLoop runs as long as we are the leader to run various
// maintenance activities
func (s *Server) leaderLoop(stopCh chan struct{}) {
	// Ensure we revoke leadership on stepdown
	defer s.revokeLeadership()

	var reconcileCh chan serf.Member
	establishedLeader := false

RECONCILE:
	// Setup a reconciliation timer
	reconcileCh = nil
	interval := time.After(s.config.ReconcileInterval)

	// Apply a raft barrier to ensure our FSM is caught up
	start := time.Now()
	barrier := s.raft.Barrier(0)
	if err := barrier.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err)
		goto WAIT
	}
	metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start)

	// Check if we need to handle initial leadership actions
	if !establishedLeader {
		if err := s.establishLeadership(stopCh); err != nil {
			s.logger.Printf("[ERR] nomad: failed to establish leadership: %v",
				err)
			goto WAIT
		}
		establishedLeader = true
	}

	// Reconcile any missing data
	if err := s.reconcile(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err)
		goto WAIT
	}

	// Initial reconcile worked, now we can process the channel
	// updates
	reconcileCh = s.reconcileCh

WAIT:
	// Wait until leadership is lost
	for {
		select {
		case <-stopCh:
			return
		case <-s.shutdownCh:
			return
		case <-interval:
			goto RECONCILE
		case member := <-reconcileCh:
			s.reconcileMember(member)
		}
	}
}
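// wireLeaderNotifications is an illustrative sketch, not part of the original
// file: it shows how the leadership channel consumed by monitorLeadership is
// typically fed via hashicorp/raft's NotifyCh mechanism. The actual wiring for
// this server is done during server setup elsewhere in the package, so treat
// the names here as assumptions for illustration only.
func wireLeaderNotifications(raftConfig *raft.Config) <-chan bool {
	// raft sends true on this channel when leadership is acquired and
	// false when it is lost; monitorLeadership reacts to both.
	notifyCh := make(chan bool, 1)
	raftConfig.NotifyCh = notifyCh
	return notifyCh
}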
// establishLeadership is invoked once we become leader and are able
// to invoke an initial barrier. The barrier is used to ensure any
// previously inflight transactions have been committed and that our
// state is up-to-date.
func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Disable most of the workers to free cores for use by the plan queue and
	// evaluation broker
	if numWorkers := len(s.workers); numWorkers > 1 {
		// Disabling 3/4 of the workers frees CPU for raft and the
		// plan applier which uses 1/2 the cores.
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(true)
		}
	}

	// Enable the plan queue, since we are now the leader
	s.planQueue.SetEnabled(true)

	// Start the plan evaluator
	go s.planApply()

	// Enable the eval broker, since we are now the leader
	s.evalBroker.SetEnabled(true)

	// Enable the blocked eval tracker, since we are now the leader
	s.blockedEvals.SetEnabled(true)

	// Restore the eval broker state
	if err := s.restoreEvals(); err != nil {
		return err
	}

	// Activate the vault client
	s.vault.SetActive(true)
	if err := s.restoreRevokingAccessors(); err != nil {
		return err
	}

	// Enable the periodic dispatcher, since we are now the leader.
	s.periodicDispatcher.SetEnabled(true)
	s.periodicDispatcher.Start()

	// Restore the periodic dispatcher state
	if err := s.restorePeriodicDispatcher(); err != nil {
		return err
	}

	// Schedule periodic jobs
	go s.schedulePeriodic(stopCh)

	// Reap any failed evaluations
	go s.reapFailedEvaluations(stopCh)

	// Reap any duplicate blocked evaluations
	go s.reapDupBlockedEvaluations(stopCh)

	// Periodically unblock failed evaluations
	go s.periodicUnblockFailedEvals(stopCh)

	// Setup the heartbeat timers. This is done both when starting up and when
	// a leader failover happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
	// The TTL contract is that the session will not be expired before the TTL,
	// so expiring it later is allowable.
	//
	// This MUST be done after the initial barrier to ensure the latest Nodes
	// are available to be initialized. Otherwise initialization may use stale
	// data.
	if err := s.initializeHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err)
		return err
	}

	// COMPAT 0.4 - 0.4.1
	// Reconcile the summaries of the registered jobs. We only reconcile
	// summaries if the server is 0.4.1, since summaries are not present in
	// 0.4 and might therefore be incorrect after upgrading to 0.4.1.
	if err := s.reconcileJobSummaries(); err != nil {
		return fmt.Errorf("unable to reconcile job summaries: %v", err)
	}
	return nil
}
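// pausedWorkerCount is an illustrative helper, not part of the original file.
// It makes the arithmetic in establishLeadership (and the matching unpause in
// revokeLeadership) explicit: with more than one worker, three quarters of
// them (rounded down) are paused so Raft and the plan applier keep enough CPU.
// For example, 8 workers leave 2 running and 3 workers leave 1 running.
func pausedWorkerCount(numWorkers int) int {
	if numWorkers <= 1 {
		// A single worker is never paused.
		return 0
	}
	return 3 * numWorkers / 4
}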
// restoreEvals is used to restore pending evaluations into the eval broker and
// blocked evaluations into the blocked eval tracker. The broker and blocked
// eval tracker are maintained only by the leader, so they must be restored
// anytime a leadership transition takes place.
func (s *Server) restoreEvals() error {
	// Get an iterator over every evaluation
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().Evals(ws)
	if err != nil {
		return fmt.Errorf("failed to get evaluations: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		eval := raw.(*structs.Evaluation)

		if eval.ShouldEnqueue() {
			s.evalBroker.Enqueue(eval)
		} else if eval.ShouldBlock() {
			s.blockedEvals.Block(eval)
		}
	}
	return nil
}

// restoreRevokingAccessors is used to restore Vault accessors that should be
// revoked.
func (s *Server) restoreRevokingAccessors() error {
	// An accessor should be revoked if its allocation or node is terminal
	ws := memdb.NewWatchSet()
	state := s.fsm.State()
	iter, err := state.VaultAccessors(ws)
	if err != nil {
		return fmt.Errorf("failed to get vault accessors: %v", err)
	}

	var revoke []*structs.VaultAccessor
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		va := raw.(*structs.VaultAccessor)

		// Check the allocation
		alloc, err := state.AllocByID(ws, va.AllocID)
		if err != nil {
			return fmt.Errorf("failed to lookup allocation %q: %v", va.AllocID, err)
		}
		if alloc == nil || alloc.Terminated() {
			// No longer running and should be revoked
			revoke = append(revoke, va)
			continue
		}

		// Check the node
		node, err := state.NodeByID(ws, va.NodeID)
		if err != nil {
			return fmt.Errorf("failed to lookup node %q: %v", va.NodeID, err)
		}
		if node == nil || node.TerminalStatus() {
			// Node is terminal so any accessor from it should be revoked
			revoke = append(revoke, va)
			continue
		}
	}

	if len(revoke) != 0 {
		if err := s.vault.RevokeTokens(context.Background(), revoke, true); err != nil {
			return fmt.Errorf("failed to revoke tokens: %v", err)
		}
	}

	return nil
}

// restorePeriodicDispatcher is used to restore all periodic jobs into the
// periodic dispatcher. It also determines if a periodic job should have been
// created during the leadership transition and, if so, force runs it. The
// periodic dispatcher is maintained only by the leader, so it must be restored
// anytime a leadership transition takes place.
func (s *Server) restorePeriodicDispatcher() error {
	ws := memdb.NewWatchSet()
	iter, err := s.fsm.State().JobsByPeriodic(ws, true)
	if err != nil {
		return fmt.Errorf("failed to get periodic jobs: %v", err)
	}

	now := time.Now()
	for i := iter.Next(); i != nil; i = iter.Next() {
		job := i.(*structs.Job)
		s.periodicDispatcher.Add(job)

		// If the periodic job has never been launched before, launch will hold
		// the time the periodic job was added. Otherwise it has the last launch
		// time of the periodic job.
		launch, err := s.fsm.State().PeriodicLaunchByID(ws, job.ID)
		if err != nil || launch == nil {
			return fmt.Errorf("failed to get periodic launch time: %v", err)
		}

		// nextLaunch is the next launch that should occur.
		nextLaunch := job.Periodic.Next(launch.Launch)

		// We skip force launching the job if there should be no next launch
		// (the zero case) or if the next launch time is in the future. If it is
		// in the future, it will be handled by the periodic dispatcher.
		if nextLaunch.IsZero() || !nextLaunch.Before(now) {
			continue
		}

		if _, err := s.periodicDispatcher.ForceRun(job.ID); err != nil {
			msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err)
			s.logger.Printf("[ERR] nomad.periodic: %s", msg)
			return errors.New(msg)
		}
		s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+
			" run during leadership establishment", job.ID)
	}

	return nil
}
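// shouldForceRun is an illustrative helper, not part of the original file. It
// captures the per-job decision restorePeriodicDispatcher makes above: a
// launch was missed, and should be forced now, only when the computed next
// launch is non-zero and strictly before the current time. A zero or future
// next launch is left to the periodic dispatcher itself.
func shouldForceRun(nextLaunch, now time.Time) bool {
	return !nextLaunch.IsZero() && nextLaunch.Before(now)
}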
// schedulePeriodic is used to do periodic job dispatch while we are leader
func (s *Server) schedulePeriodic(stopCh chan struct{}) {
	evalGC := time.NewTicker(s.config.EvalGCInterval)
	defer evalGC.Stop()
	nodeGC := time.NewTicker(s.config.NodeGCInterval)
	defer nodeGC.Stop()
	jobGC := time.NewTicker(s.config.JobGCInterval)
	defer jobGC.Stop()

	// getLatest grabs the latest index from the state store. It returns true if
	// the index was retrieved successfully.
	getLatest := func() (uint64, bool) {
		snapshotIndex, err := s.fsm.State().LatestIndex()
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err)
			return 0, false
		}

		return snapshotIndex, true
	}

	for {
		select {
		case <-evalGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
			}
		case <-nodeGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobNodeGC, index))
			}
		case <-jobGC.C:
			if index, ok := getLatest(); ok {
				s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobJobGC, index))
			}
		case <-stopCh:
			return
		}
	}
}

// coreJobEval returns an evaluation for a core job
func (s *Server) coreJobEval(job string, modifyIndex uint64) *structs.Evaluation {
	return &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerScheduled,
		JobID:       job,
		Status:      structs.EvalStatusPending,
		ModifyIndex: modifyIndex,
	}
}
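// forceEvalGC is an illustrative sketch, not part of the original file,
// showing how coreJobEval composes with the eval broker: build a core GC
// evaluation at the state store's latest index and enqueue it, mirroring the
// ticker cases in schedulePeriodic above.
func (s *Server) forceEvalGC() error {
	index, err := s.fsm.State().LatestIndex()
	if err != nil {
		return fmt.Errorf("failed to determine state store's index: %v", err)
	}
	s.evalBroker.Enqueue(s.coreJobEval(structs.CoreJobEvalGC, index))
	return nil
}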
// reapFailedEvaluations is used to reap evaluations that
// have reached their delivery limit and should be failed
func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for a failed evaluation
			eval, token, err := s.evalBroker.Dequeue([]string{failedQueue}, time.Second)
			if err != nil {
				return
			}
			if eval == nil {
				continue
			}

			// Update the status to failed
			newEval := eval.Copy()
			newEval.Status = structs.EvalStatusFailed
			newEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
			s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", newEval)

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: []*structs.Evaluation{newEval},
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update failed eval %#v: %v", newEval, err)
				continue
			}

			// Ack completion
			s.evalBroker.Ack(eval.ID, token)
		}
	}
}

// reapDupBlockedEvaluations is used to reap duplicate blocked evaluations,
// which should be cancelled.
func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// Scan for duplicate blocked evals.
			dups := s.blockedEvals.GetDuplicates(time.Second)
			if dups == nil {
				continue
			}

			cancel := make([]*structs.Evaluation, len(dups))
			for i, dup := range dups {
				// Update the status to cancelled
				newEval := dup.Copy()
				newEval.Status = structs.EvalStatusCancelled
				newEval.StatusDescription = fmt.Sprintf("existing blocked evaluation exists for job %q", newEval.JobID)
				cancel[i] = newEval
			}

			// Update via Raft
			req := structs.EvalUpdateRequest{
				Evals: cancel,
			}
			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
				s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err)
				continue
			}
		}
	}
}

// periodicUnblockFailedEvals periodically unblocks failed, blocked evaluations.
func (s *Server) periodicUnblockFailedEvals(stopCh chan struct{}) {
	ticker := time.NewTicker(failedEvalUnblockInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// Unblock the failed evaluations
			s.blockedEvals.UnblockFailed()
		}
	}
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
	// Disable the plan queue, since we are no longer leader
	s.planQueue.SetEnabled(false)

	// Disable the eval broker, since it is only useful as a leader
	s.evalBroker.SetEnabled(false)

	// Disable the blocked eval tracker, since it is only useful as a leader
	s.blockedEvals.SetEnabled(false)

	// Disable the periodic dispatcher, since it is only useful as a leader
	s.periodicDispatcher.SetEnabled(false)

	// Disable the Vault client as it is only useful as a leader.
	s.vault.SetActive(false)

	// Clear the heartbeat timers on either shutdown or step down,
	// since we are no longer responsible for TTL expirations.
	if err := s.clearAllHeartbeatTimers(); err != nil {
		s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err)
		return err
	}

	// Unpause the workers we paused previously. This must mirror the 3/4
	// fraction paused in establishLeadership so every paused worker resumes.
	if numWorkers := len(s.workers); numWorkers > 1 {
		for i := 0; i < (3 * numWorkers / 4); i++ {
			s.workers[i].SetPause(false)
		}
	}
	return nil
}

// reconcile is used to reconcile the differences between Serf
// membership and what is reflected in our strongly consistent store.
func (s *Server) reconcile() error {
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcile"}, time.Now())
	members := s.serf.Members()
	for _, member := range members {
		if err := s.reconcileMember(member); err != nil {
			return err
		}
	}
	return nil
}

// reconcileMember is used to do an async reconcile of a single serf member
func (s *Server) reconcileMember(member serf.Member) error {
	// Check if this is a member we should handle
	valid, parts := isNomadServer(member)
	if !valid || parts.Region != s.config.Region {
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

	// Do not reconcile ourself
	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
		return nil
	}

	var err error
	switch member.Status {
	case serf.StatusAlive:
		err = s.addRaftPeer(member, parts)
	case serf.StatusLeft, StatusReap:
		err = s.removeRaftPeer(member, parts)
	}
	if err != nil {
		s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v",
			member, err)
		return err
	}
	return nil
}

// reconcileJobSummaries reconciles the summaries of all the jobs registered in
// the system
// COMPAT 0.4 -> 0.4.1
func (s *Server) reconcileJobSummaries() error {
	index, err := s.fsm.state.LatestIndex()
	if err != nil {
		return fmt.Errorf("unable to read latest index: %v", err)
	}
	s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index)

	args := &structs.GenericResponse{}
	msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag
	if _, _, err = s.raftApply(msg, args); err != nil {
		return fmt.Errorf("reconciliation of job summaries failed: %v", err)
	}

	return nil
}
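// compatRaftMessage is an illustrative helper, not part of the original file.
// It names the pattern reconcileJobSummaries uses above: OR-ing
// IgnoreUnknownTypeFlag into a message type marks the Raft log entry as safe
// to skip for servers that do not recognize the type, which lets older servers
// ignore the entry instead of failing during apply.
func compatRaftMessage(t structs.MessageType) structs.MessageType {
	return t | structs.IgnoreUnknownTypeFlag
}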
// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
	// Do not join ourselves
	if m.Name == s.config.NodeName {
		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
		return nil
	}

	// Check for possibility of multiple bootstrap nodes
	if parts.Bootstrap {
		members := s.serf.Members()
		for _, member := range members {
			valid, p := isNomadServer(member)
			if valid && member.Name != m.Name && p.Bootstrap {
				s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
				return nil
			}
		}
	}

	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-add it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			return nil
		}
	}

	// Attempt to add as a peer
	addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
	if err := addFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
		return err
	}
	s.logger.Printf("[INFO] nomad: added raft peer: %v", parts)
	return nil
}

// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
	// TODO (alexdadgar) - This will need to be changed once we support node IDs.
	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()

	// See if it's already in the configuration. It's harmless to re-remove it
	// but we want to avoid doing that if possible to prevent useless Raft
	// log entries.
	configFuture := s.raft.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == raft.ServerAddress(addr) {
			goto REMOVE
		}
	}
	return nil

REMOVE:
	// Attempt to remove as a peer.
	future := s.raft.RemovePeer(raft.ServerAddress(addr))
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v",
			parts, err)
		return err
	}
	return nil
}
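// raftAddrForMember is an illustrative helper, not part of the original file.
// It factors out the address translation both addRaftPeer and removeRaftPeer
// perform above: the serf member advertises its IP, the parsed serverParts
// carry the Raft port, and the pair is rendered as a raft.ServerAddress.
func raftAddrForMember(m serf.Member, parts *serverParts) raft.ServerAddress {
	return raft.ServerAddress((&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String())
}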