// github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/periodic.go (about)

package nomad

import (
	"container/heap"
	"context"
	"fmt"
	"log"
	"strconv"
	"strings"
	"sync"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports on running children.
	dispatcher JobEvalDispatcher
	// enabled is toggled by SetEnabled; Add and Remove are no-ops while it is
	// false.
	enabled bool

	// tracked holds the periodic jobs currently managed, keyed by namespace
	// and job ID.
	tracked map[structs.NamespacedID]*structs.Job
	// heap orders the tracked jobs by their next launch time.
	heap *periodicHeap

	// updateCh is a one-slot buffered channel that Add and Remove signal
	// (without blocking) to make the run loop recompute the next launch.
	updateCh chan struct{}
	// stopFn cancels the context handed to the run goroutine.
	stopFn context.CancelFunc
	logger *log.Logger
	// l guards the fields above.
	l sync.RWMutex
}

// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job and creates an evaluation
	// for it and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}

// DispatchJob creates an evaluation for the passed job and commits both the
// evaluation and the job to the raft log. It returns the eval.
47 func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) { 48 // Commit this update via Raft 49 job.SetSubmitTime() 50 req := structs.JobRegisterRequest{ 51 Job: job, 52 WriteRequest: structs.WriteRequest{ 53 Namespace: job.Namespace, 54 }, 55 } 56 fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req) 57 if err, ok := fsmErr.(error); ok && err != nil { 58 return nil, err 59 } 60 if err != nil { 61 return nil, err 62 } 63 64 // Create a new evaluation 65 eval := &structs.Evaluation{ 66 ID: uuid.Generate(), 67 Namespace: job.Namespace, 68 Priority: job.Priority, 69 Type: job.Type, 70 TriggeredBy: structs.EvalTriggerPeriodicJob, 71 JobID: job.ID, 72 JobModifyIndex: index, 73 Status: structs.EvalStatusPending, 74 } 75 update := &structs.EvalUpdateRequest{ 76 Evals: []*structs.Evaluation{eval}, 77 } 78 79 // Commit this evaluation via Raft 80 // XXX: There is a risk of partial failure where the JobRegister succeeds 81 // but that the EvalUpdate does not. 82 _, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update) 83 if err != nil { 84 return nil, err 85 } 86 87 // Update its indexes. 88 eval.CreateIndex = evalIndex 89 eval.ModifyIndex = evalIndex 90 return eval, nil 91 } 92 93 // RunningChildren checks whether the passed job has any running children. 94 func (s *Server) RunningChildren(job *structs.Job) (bool, error) { 95 state, err := s.fsm.State().Snapshot() 96 if err != nil { 97 return false, err 98 } 99 100 ws := memdb.NewWatchSet() 101 prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix) 102 iter, err := state.JobsByIDPrefix(ws, job.Namespace, prefix) 103 if err != nil { 104 return false, err 105 } 106 107 var child *structs.Job 108 for i := iter.Next(); i != nil; i = iter.Next() { 109 child = i.(*structs.Job) 110 111 // Ensure the job is actually a child. 112 if child.ParentID != job.ID { 113 continue 114 } 115 116 // Get the childs evaluations. 
117 evals, err := state.EvalsByJob(ws, child.Namespace, child.ID) 118 if err != nil { 119 return false, err 120 } 121 122 // Check if any of the evals are active or have running allocations. 123 for _, eval := range evals { 124 if !eval.TerminalStatus() { 125 return true, nil 126 } 127 128 allocs, err := state.AllocsByEval(ws, eval.ID) 129 if err != nil { 130 return false, err 131 } 132 133 for _, alloc := range allocs { 134 if !alloc.TerminalStatus() { 135 return true, nil 136 } 137 } 138 } 139 } 140 141 // There are no evals or allocations that aren't terminal. 142 return false, nil 143 } 144 145 // NewPeriodicDispatch returns a periodic dispatcher that is used to track and 146 // launch periodic jobs. 147 func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch { 148 return &PeriodicDispatch{ 149 dispatcher: dispatcher, 150 tracked: make(map[structs.NamespacedID]*structs.Job), 151 heap: NewPeriodicHeap(), 152 updateCh: make(chan struct{}, 1), 153 logger: logger, 154 } 155 } 156 157 // SetEnabled is used to control if the periodic dispatcher is enabled. It 158 // should only be enabled on the active leader. Disabling an active dispatcher 159 // will stop any launched go routine and flush the dispatcher. 160 func (p *PeriodicDispatch) SetEnabled(enabled bool) { 161 p.l.Lock() 162 defer p.l.Unlock() 163 wasRunning := p.enabled 164 p.enabled = enabled 165 166 // If we are transitioning from enabled to disabled, stop the daemon and 167 // flush. 168 if !enabled && wasRunning { 169 p.stopFn() 170 p.flush() 171 } else if enabled && !wasRunning { 172 // If we are transitioning from disabled to enabled, run the daemon. 173 ctx, cancel := context.WithCancel(context.Background()) 174 p.stopFn = cancel 175 go p.run(ctx) 176 } 177 } 178 179 // Tracked returns the set of tracked job IDs. 
180 func (p *PeriodicDispatch) Tracked() []*structs.Job { 181 p.l.RLock() 182 defer p.l.RUnlock() 183 tracked := make([]*structs.Job, len(p.tracked)) 184 i := 0 185 for _, job := range p.tracked { 186 tracked[i] = job 187 i++ 188 } 189 return tracked 190 } 191 192 // Add begins tracking of a periodic job. If it is already tracked, it acts as 193 // an update to the jobs periodic spec. The method returns whether the job was 194 // added and any error that may have occurred. 195 func (p *PeriodicDispatch) Add(job *structs.Job) error { 196 p.l.Lock() 197 defer p.l.Unlock() 198 199 // Do nothing if not enabled 200 if !p.enabled { 201 return nil 202 } 203 204 // If we were tracking a job and it has been disabled, made non-periodic, 205 // stopped or is parameterized, remove it 206 disabled := !job.IsPeriodicActive() 207 208 tuple := structs.NamespacedID{ 209 ID: job.ID, 210 Namespace: job.Namespace, 211 } 212 _, tracked := p.tracked[tuple] 213 if disabled { 214 if tracked { 215 p.removeLocked(tuple) 216 } 217 218 // If the job is disabled and we aren't tracking it, do nothing. 219 return nil 220 } 221 222 // Add or update the job. 223 p.tracked[tuple] = job 224 next, err := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation())) 225 if err != nil { 226 return fmt.Errorf("failed adding job %s: %v", job.NamespacedID(), err) 227 } 228 if tracked { 229 if err := p.heap.Update(job, next); err != nil { 230 return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err) 231 } 232 p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q (%s)", job.ID, job.Namespace) 233 } else { 234 if err := p.heap.Push(job, next); err != nil { 235 return fmt.Errorf("failed to add job %v: %v", job.ID, err) 236 } 237 p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q (%s)", job.ID, job.Namespace) 238 } 239 240 // Signal an update. 
241 select { 242 case p.updateCh <- struct{}{}: 243 default: 244 } 245 246 return nil 247 } 248 249 // Remove stops tracking the passed job. If the job is not tracked, it is a 250 // no-op. 251 func (p *PeriodicDispatch) Remove(namespace, jobID string) error { 252 p.l.Lock() 253 defer p.l.Unlock() 254 return p.removeLocked(structs.NamespacedID{ 255 ID: jobID, 256 Namespace: namespace, 257 }) 258 } 259 260 // Remove stops tracking the passed job. If the job is not tracked, it is a 261 // no-op. It assumes this is called while a lock is held. 262 func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error { 263 // Do nothing if not enabled 264 if !p.enabled { 265 return nil 266 } 267 268 job, tracked := p.tracked[jobID] 269 if !tracked { 270 return nil 271 } 272 273 delete(p.tracked, jobID) 274 if err := p.heap.Remove(job); err != nil { 275 return fmt.Errorf("failed to remove tracked job %q (%s): %v", jobID.ID, jobID.Namespace, err) 276 } 277 278 // Signal an update. 279 select { 280 case p.updateCh <- struct{}{}: 281 default: 282 } 283 284 p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q (%s)", jobID.ID, jobID.Namespace) 285 return nil 286 } 287 288 // ForceRun causes the periodic job to be evaluated immediately and returns the 289 // subsequent eval. 290 func (p *PeriodicDispatch) ForceRun(namespace, jobID string) (*structs.Evaluation, error) { 291 p.l.Lock() 292 293 // Do nothing if not enabled 294 if !p.enabled { 295 p.l.Unlock() 296 return nil, fmt.Errorf("periodic dispatch disabled") 297 } 298 299 tuple := structs.NamespacedID{ 300 ID: jobID, 301 Namespace: namespace, 302 } 303 job, tracked := p.tracked[tuple] 304 if !tracked { 305 p.l.Unlock() 306 return nil, fmt.Errorf("can't force run non-tracked job %q (%s)", jobID, namespace) 307 } 308 309 p.l.Unlock() 310 return p.createEval(job, time.Now().In(job.Periodic.GetLocation())) 311 } 312 313 // shouldRun returns whether the long lived run function should run. 
314 func (p *PeriodicDispatch) shouldRun() bool { 315 p.l.RLock() 316 defer p.l.RUnlock() 317 return p.enabled 318 } 319 320 // run is a long-lived function that waits till a job's periodic spec is met and 321 // then creates an evaluation to run the job. 322 func (p *PeriodicDispatch) run(ctx context.Context) { 323 var launchCh <-chan time.Time 324 for p.shouldRun() { 325 job, launch := p.nextLaunch() 326 if launch.IsZero() { 327 launchCh = nil 328 } else { 329 launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation())) 330 launchCh = time.After(launchDur) 331 p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%s) in %s", job.ID, job.Namespace, launchDur) 332 } 333 334 select { 335 case <-ctx.Done(): 336 return 337 case <-p.updateCh: 338 continue 339 case <-launchCh: 340 p.dispatch(job, launch) 341 } 342 } 343 } 344 345 // dispatch creates an evaluation for the job and updates its next launchtime 346 // based on the passed launch time. 347 func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) { 348 p.l.Lock() 349 350 nextLaunch, err := job.Periodic.Next(launchTime) 351 if err != nil { 352 p.logger.Printf("[ERR] nomad.periodic: failed to parse next periodic launch for job %s: %v", job.NamespacedID(), err) 353 } else if err := p.heap.Update(job, nextLaunch); err != nil { 354 p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %s: %v", job.NamespacedID(), err) 355 } 356 357 // If the job prohibits overlapping and there are running children, we skip 358 // the launch. 
359 if job.Periodic.ProhibitOverlap { 360 running, err := p.dispatcher.RunningChildren(job) 361 if err != nil { 362 msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+ 363 " periodic job %q (%s) has running children: %v", job.ID, job.Namespace, err) 364 p.logger.Println(msg) 365 p.l.Unlock() 366 return 367 } 368 369 if running { 370 msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+ 371 " periodic job %q (%s) because job prohibits overlap", job.ID, job.Namespace) 372 p.logger.Println(msg) 373 p.l.Unlock() 374 return 375 } 376 } 377 378 p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%v) at %v", job.ID, job.Namespace, launchTime) 379 p.l.Unlock() 380 p.createEval(job, launchTime) 381 } 382 383 // nextLaunch returns the next job to launch and when it should be launched. If 384 // the next job can't be determined, an error is returned. If the dispatcher is 385 // stopped, a nil job will be returned. 386 func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) { 387 // If there is nothing wait for an update. 388 p.l.RLock() 389 defer p.l.RUnlock() 390 if p.heap.Length() == 0 { 391 return nil, time.Time{} 392 } 393 394 nextJob := p.heap.Peek() 395 if nextJob == nil { 396 return nil, time.Time{} 397 } 398 399 return nextJob.job, nextJob.next 400 } 401 402 // createEval instantiates a job based on the passed periodic job and submits an 403 // evaluation for it. This should not be called with the lock held. 
404 func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) { 405 derived, err := p.deriveJob(periodicJob, time) 406 if err != nil { 407 return nil, err 408 } 409 410 eval, err := p.dispatcher.DispatchJob(derived) 411 if err != nil { 412 p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q (%s): %v", 413 periodicJob.ID, periodicJob.Namespace, err) 414 return nil, err 415 } 416 417 return eval, nil 418 } 419 420 // deriveJob instantiates a new job based on the passed periodic job and the 421 // launch time. 422 func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) ( 423 derived *structs.Job, err error) { 424 425 // Have to recover in case the job copy panics. 426 defer func() { 427 if r := recover(); r != nil { 428 p.logger.Printf("[ERR] nomad.periodic: deriving job from"+ 429 " periodic job %q (%s) failed; deregistering from periodic runner: %v", 430 periodicJob.ID, periodicJob.Namespace, r) 431 432 p.Remove(periodicJob.Namespace, periodicJob.ID) 433 derived = nil 434 err = fmt.Errorf("Failed to create a copy of the periodic job %q (%s): %v", 435 periodicJob.ID, periodicJob.Namespace, r) 436 } 437 }() 438 439 // Create a copy of the periodic job, give it a derived ID/Name and make it 440 // non-periodic. 441 derived = periodicJob.Copy() 442 derived.ParentID = periodicJob.ID 443 derived.ID = p.derivedJobID(periodicJob, time) 444 derived.Name = derived.ID 445 derived.Periodic = nil 446 return 447 } 448 449 // deriveJobID returns a job ID based on the parent periodic job and the launch 450 // time. 451 func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string { 452 return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix()) 453 } 454 455 // LaunchTime returns the launch time of the job. This is only valid for 456 // jobs created by PeriodicDispatch and will otherwise return an error. 
457 func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) { 458 index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix) 459 if index == -1 { 460 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 461 } 462 463 launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):]) 464 if err != nil { 465 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 466 } 467 468 return time.Unix(int64(launch), 0), nil 469 } 470 471 // flush clears the state of the PeriodicDispatcher 472 func (p *PeriodicDispatch) flush() { 473 p.updateCh = make(chan struct{}, 1) 474 p.tracked = make(map[structs.NamespacedID]*structs.Job) 475 p.heap = NewPeriodicHeap() 476 p.stopFn = nil 477 } 478 479 // periodicHeap wraps a heap and gives operations other than Push/Pop. 480 type periodicHeap struct { 481 index map[structs.NamespacedID]*periodicJob 482 heap periodicHeapImp 483 } 484 485 type periodicJob struct { 486 job *structs.Job 487 next time.Time 488 index int 489 } 490 491 func NewPeriodicHeap() *periodicHeap { 492 return &periodicHeap{ 493 index: make(map[structs.NamespacedID]*periodicJob), 494 heap: make(periodicHeapImp, 0), 495 } 496 } 497 498 func (p *periodicHeap) Push(job *structs.Job, next time.Time) error { 499 tuple := structs.NamespacedID{ 500 ID: job.ID, 501 Namespace: job.Namespace, 502 } 503 if _, ok := p.index[tuple]; ok { 504 return fmt.Errorf("job %q (%s) already exists", job.ID, job.Namespace) 505 } 506 507 pJob := &periodicJob{job, next, 0} 508 p.index[tuple] = pJob 509 heap.Push(&p.heap, pJob) 510 return nil 511 } 512 513 func (p *periodicHeap) Pop() *periodicJob { 514 if len(p.heap) == 0 { 515 return nil 516 } 517 518 pJob := heap.Pop(&p.heap).(*periodicJob) 519 tuple := structs.NamespacedID{ 520 ID: pJob.job.ID, 521 Namespace: pJob.job.Namespace, 522 } 523 delete(p.index, tuple) 524 return pJob 525 } 526 527 func (p *periodicHeap) Peek() *periodicJob { 528 if len(p.heap) == 0 
{ 529 return nil 530 } 531 532 return p.heap[0] 533 } 534 535 func (p *periodicHeap) Contains(job *structs.Job) bool { 536 tuple := structs.NamespacedID{ 537 ID: job.ID, 538 Namespace: job.Namespace, 539 } 540 _, ok := p.index[tuple] 541 return ok 542 } 543 544 func (p *periodicHeap) Update(job *structs.Job, next time.Time) error { 545 tuple := structs.NamespacedID{ 546 ID: job.ID, 547 Namespace: job.Namespace, 548 } 549 if pJob, ok := p.index[tuple]; ok { 550 // Need to update the job as well because its spec can change. 551 pJob.job = job 552 pJob.next = next 553 heap.Fix(&p.heap, pJob.index) 554 return nil 555 } 556 557 return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace) 558 } 559 560 func (p *periodicHeap) Remove(job *structs.Job) error { 561 tuple := structs.NamespacedID{ 562 ID: job.ID, 563 Namespace: job.Namespace, 564 } 565 if pJob, ok := p.index[tuple]; ok { 566 heap.Remove(&p.heap, pJob.index) 567 delete(p.index, tuple) 568 return nil 569 } 570 571 return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace) 572 } 573 574 func (p *periodicHeap) Length() int { 575 return len(p.heap) 576 } 577 578 type periodicHeapImp []*periodicJob 579 580 func (h periodicHeapImp) Len() int { return len(h) } 581 582 func (h periodicHeapImp) Less(i, j int) bool { 583 // Two zero times should return false. 584 // Otherwise, zero is "greater" than any other time. 585 // (To sort it at the end of the list.) 586 // Sort such that zero times are at the end of the list. 
587 iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero() 588 if iZero && jZero { 589 return false 590 } else if iZero { 591 return false 592 } else if jZero { 593 return true 594 } 595 596 return h[i].next.Before(h[j].next) 597 } 598 599 func (h periodicHeapImp) Swap(i, j int) { 600 h[i], h[j] = h[j], h[i] 601 h[i].index = i 602 h[j].index = j 603 } 604 605 func (h *periodicHeapImp) Push(x interface{}) { 606 n := len(*h) 607 job := x.(*periodicJob) 608 job.index = n 609 *h = append(*h, job) 610 } 611 612 func (h *periodicHeapImp) Pop() interface{} { 613 old := *h 614 n := len(old) 615 job := old[n-1] 616 job.index = -1 // for safety 617 *h = old[0 : n-1] 618 return job 619 }