github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/nomad/periodic.go (about) 1 package nomad 2 3 import ( 4 "container/heap" 5 "context" 6 "fmt" 7 "strconv" 8 "strings" 9 "sync" 10 "time" 11 12 log "github.com/hashicorp/go-hclog" 13 memdb "github.com/hashicorp/go-memdb" 14 15 "github.com/hashicorp/nomad/helper/uuid" 16 "github.com/hashicorp/nomad/nomad/structs" 17 ) 18 19 // PeriodicDispatch is used to track and launch periodic jobs. It maintains the 20 // set of periodic jobs and creates derived jobs and evaluations per 21 // instantiation which is determined by the periodic spec. 22 type PeriodicDispatch struct { 23 dispatcher JobEvalDispatcher 24 enabled bool 25 26 tracked map[structs.NamespacedID]*structs.Job 27 heap *periodicHeap 28 29 updateCh chan struct{} 30 stopFn context.CancelFunc 31 logger log.Logger 32 l sync.RWMutex 33 } 34 35 // JobEvalDispatcher is an interface to submit jobs and have evaluations created 36 // for them. 37 type JobEvalDispatcher interface { 38 // DispatchJob takes a job a new, untracked job and creates an evaluation 39 // for it and returns the eval. 40 DispatchJob(job *structs.Job) (*structs.Evaluation, error) 41 42 // RunningChildren returns whether the passed job has any running children. 43 RunningChildren(job *structs.Job) (bool, error) 44 } 45 46 // DispatchJob creates an evaluation for the passed job and commits both the 47 // evaluation and the job to the raft log. It returns the eval. 48 func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) { 49 // Commit this update via Raft 50 job.SetSubmitTime() 51 req := structs.JobRegisterRequest{ 52 Job: job, 53 WriteRequest: structs.WriteRequest{ 54 Namespace: job.Namespace, 55 }, 56 } 57 fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req) 58 if err, ok := fsmErr.(error); ok && err != nil { 59 return nil, err 60 } 61 if err != nil { 62 return nil, err 63 } 64 65 // Create a new evaluation 66 eval := &structs.Evaluation{ 67 ID: uuid.Generate(), 68 Namespace: job.Namespace, 69 Priority: job.Priority, 70 Type: job.Type, 71 TriggeredBy: structs.EvalTriggerPeriodicJob, 72 JobID: job.ID, 73 JobModifyIndex: index, 74 Status: structs.EvalStatusPending, 75 } 76 update := &structs.EvalUpdateRequest{ 77 Evals: []*structs.Evaluation{eval}, 78 } 79 80 // Commit this evaluation via Raft 81 // XXX: There is a risk of partial failure where the JobRegister succeeds 82 // but that the EvalUpdate does not. 83 _, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update) 84 if err != nil { 85 return nil, err 86 } 87 88 // Update its indexes. 89 eval.CreateIndex = evalIndex 90 eval.ModifyIndex = evalIndex 91 return eval, nil 92 } 93 94 // RunningChildren checks whether the passed job has any running children. 95 func (s *Server) RunningChildren(job *structs.Job) (bool, error) { 96 state, err := s.fsm.State().Snapshot() 97 if err != nil { 98 return false, err 99 } 100 101 ws := memdb.NewWatchSet() 102 prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix) 103 iter, err := state.JobsByIDPrefix(ws, job.Namespace, prefix) 104 if err != nil { 105 return false, err 106 } 107 108 var child *structs.Job 109 for i := iter.Next(); i != nil; i = iter.Next() { 110 child = i.(*structs.Job) 111 112 // Ensure the job is actually a child. 113 if child.ParentID != job.ID { 114 continue 115 } 116 117 // Get the childs evaluations. 118 evals, err := state.EvalsByJob(ws, child.Namespace, child.ID) 119 if err != nil { 120 return false, err 121 } 122 123 // Check if any of the evals are active or have running allocations. 124 for _, eval := range evals { 125 if !eval.TerminalStatus() { 126 return true, nil 127 } 128 129 allocs, err := state.AllocsByEval(ws, eval.ID) 130 if err != nil { 131 return false, err 132 } 133 134 for _, alloc := range allocs { 135 if !alloc.TerminalStatus() { 136 return true, nil 137 } 138 } 139 } 140 } 141 142 // There are no evals or allocations that aren't terminal. 143 return false, nil 144 } 145 146 // NewPeriodicDispatch returns a periodic dispatcher that is used to track and 147 // launch periodic jobs. 148 func NewPeriodicDispatch(logger log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch { 149 return &PeriodicDispatch{ 150 dispatcher: dispatcher, 151 tracked: make(map[structs.NamespacedID]*structs.Job), 152 heap: NewPeriodicHeap(), 153 updateCh: make(chan struct{}, 1), 154 logger: logger.Named("periodic"), 155 } 156 } 157 158 // SetEnabled is used to control if the periodic dispatcher is enabled. It 159 // should only be enabled on the active leader. Disabling an active dispatcher 160 // will stop any launched go routine and flush the dispatcher. 161 func (p *PeriodicDispatch) SetEnabled(enabled bool) { 162 p.l.Lock() 163 defer p.l.Unlock() 164 wasRunning := p.enabled 165 p.enabled = enabled 166 167 // If we are transitioning from enabled to disabled, stop the daemon and 168 // flush. 169 if !enabled && wasRunning { 170 p.stopFn() 171 p.flush() 172 } else if enabled && !wasRunning { 173 // If we are transitioning from disabled to enabled, run the daemon. 174 ctx, cancel := context.WithCancel(context.Background()) 175 p.stopFn = cancel 176 go p.run(ctx, p.updateCh) 177 } 178 } 179 180 // Tracked returns the set of tracked job IDs. 181 func (p *PeriodicDispatch) Tracked() []*structs.Job { 182 p.l.RLock() 183 defer p.l.RUnlock() 184 tracked := make([]*structs.Job, len(p.tracked)) 185 i := 0 186 for _, job := range p.tracked { 187 tracked[i] = job 188 i++ 189 } 190 return tracked 191 } 192 193 // Add begins tracking of a periodic job. If it is already tracked, it acts as 194 // an update to the jobs periodic spec. The method returns whether the job was 195 // added and any error that may have occurred. 196 func (p *PeriodicDispatch) Add(job *structs.Job) error { 197 p.l.Lock() 198 defer p.l.Unlock() 199 200 // Do nothing if not enabled 201 if !p.enabled { 202 return nil 203 } 204 205 // If we were tracking a job and it has been disabled, made non-periodic, 206 // stopped or is parameterized, remove it 207 disabled := !job.IsPeriodicActive() 208 209 tuple := structs.NamespacedID{ 210 ID: job.ID, 211 Namespace: job.Namespace, 212 } 213 _, tracked := p.tracked[tuple] 214 if disabled { 215 if tracked { 216 p.removeLocked(tuple) 217 } 218 219 // If the job is disabled and we aren't tracking it, do nothing. 220 return nil 221 } 222 223 // Add or update the job. 224 p.tracked[tuple] = job 225 next, err := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation())) 226 if err != nil { 227 return fmt.Errorf("failed adding job %s: %v", job.NamespacedID(), err) 228 } 229 if tracked { 230 if err := p.heap.Update(job, next); err != nil { 231 return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err) 232 } 233 p.logger.Debug("updated periodic job", "job", job.NamespacedID()) 234 } else { 235 if err := p.heap.Push(job, next); err != nil { 236 return fmt.Errorf("failed to add job %v: %v", job.ID, err) 237 } 238 p.logger.Debug("registered periodic job", "job", job.NamespacedID()) 239 } 240 241 // Signal an update. 242 select { 243 case p.updateCh <- struct{}{}: 244 default: 245 } 246 247 return nil 248 } 249 250 // Remove stops tracking the passed job. If the job is not tracked, it is a 251 // no-op. 252 func (p *PeriodicDispatch) Remove(namespace, jobID string) error { 253 p.l.Lock() 254 defer p.l.Unlock() 255 return p.removeLocked(structs.NamespacedID{ 256 ID: jobID, 257 Namespace: namespace, 258 }) 259 } 260 261 // Remove stops tracking the passed job. If the job is not tracked, it is a 262 // no-op. It assumes this is called while a lock is held. 263 func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error { 264 // Do nothing if not enabled 265 if !p.enabled { 266 return nil 267 } 268 269 job, tracked := p.tracked[jobID] 270 if !tracked { 271 return nil 272 } 273 274 delete(p.tracked, jobID) 275 if err := p.heap.Remove(job); err != nil { 276 return fmt.Errorf("failed to remove tracked job %q (%s): %v", jobID.ID, jobID.Namespace, err) 277 } 278 279 // Signal an update. 280 select { 281 case p.updateCh <- struct{}{}: 282 default: 283 } 284 285 p.logger.Debug("deregistered periodic job", "job", job.NamespacedID()) 286 return nil 287 } 288 289 // ForceRun causes the periodic job to be evaluated immediately and returns the 290 // subsequent eval. 291 func (p *PeriodicDispatch) ForceRun(namespace, jobID string) (*structs.Evaluation, error) { 292 p.l.Lock() 293 294 // Do nothing if not enabled 295 if !p.enabled { 296 p.l.Unlock() 297 return nil, fmt.Errorf("periodic dispatch disabled") 298 } 299 300 tuple := structs.NamespacedID{ 301 ID: jobID, 302 Namespace: namespace, 303 } 304 job, tracked := p.tracked[tuple] 305 if !tracked { 306 p.l.Unlock() 307 return nil, fmt.Errorf("can't force run non-tracked job %q (%s)", jobID, namespace) 308 } 309 310 p.l.Unlock() 311 return p.createEval(job, time.Now().In(job.Periodic.GetLocation())) 312 } 313 314 // shouldRun returns whether the long lived run function should run. 315 func (p *PeriodicDispatch) shouldRun() bool { 316 p.l.RLock() 317 defer p.l.RUnlock() 318 return p.enabled 319 } 320 321 // run is a long-lived function that waits till a job's periodic spec is met and 322 // then creates an evaluation to run the job. 323 func (p *PeriodicDispatch) run(ctx context.Context, updateCh <-chan struct{}) { 324 var launchCh <-chan time.Time 325 for p.shouldRun() { 326 job, launch := p.nextLaunch() 327 if launch.IsZero() { 328 launchCh = nil 329 } else { 330 launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation())) 331 launchCh = time.After(launchDur) 332 p.logger.Debug("scheduled periodic job launch", "launch_delay", launchDur, "job", job.NamespacedID()) 333 } 334 335 select { 336 case <-ctx.Done(): 337 return 338 case <-updateCh: 339 continue 340 case <-launchCh: 341 p.dispatch(job, launch) 342 } 343 } 344 } 345 346 // dispatch creates an evaluation for the job and updates its next launchtime 347 // based on the passed launch time. 348 func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) { 349 p.l.Lock() 350 351 nextLaunch, err := job.Periodic.Next(launchTime) 352 if err != nil { 353 p.logger.Error("failed to parse next periodic launch", "job", job.NamespacedID(), "error", err) 354 } else if err := p.heap.Update(job, nextLaunch); err != nil { 355 p.logger.Error("failed to update next launch of periodic job", "job", job.NamespacedID(), "error", err) 356 } 357 358 // If the job prohibits overlapping and there are running children, we skip 359 // the launch. 360 if job.Periodic.ProhibitOverlap { 361 running, err := p.dispatcher.RunningChildren(job) 362 if err != nil { 363 p.logger.Error("failed to determine if periodic job has running children", "job", job.NamespacedID(), "error", err) 364 p.l.Unlock() 365 return 366 } 367 368 if running { 369 p.logger.Debug("skipping launch of periodic job because job prohibits overlap", "job", job.NamespacedID()) 370 p.l.Unlock() 371 return 372 } 373 } 374 375 p.logger.Debug(" launching job", "job", job.NamespacedID(), "launch_time", launchTime) 376 p.l.Unlock() 377 p.createEval(job, launchTime) 378 } 379 380 // nextLaunch returns the next job to launch and when it should be launched. If 381 // the next job can't be determined, an error is returned. If the dispatcher is 382 // stopped, a nil job will be returned. 383 func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) { 384 // If there is nothing wait for an update. 385 p.l.RLock() 386 defer p.l.RUnlock() 387 if p.heap.Length() == 0 { 388 return nil, time.Time{} 389 } 390 391 nextJob := p.heap.Peek() 392 if nextJob == nil { 393 return nil, time.Time{} 394 } 395 396 return nextJob.job, nextJob.next 397 } 398 399 // createEval instantiates a job based on the passed periodic job and submits an 400 // evaluation for it. This should not be called with the lock held. 401 func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) { 402 derived, err := p.deriveJob(periodicJob, time) 403 if err != nil { 404 return nil, err 405 } 406 407 eval, err := p.dispatcher.DispatchJob(derived) 408 if err != nil { 409 p.logger.Error("failed to dispatch job", "job", periodicJob.NamespacedID(), "error", err) 410 return nil, err 411 } 412 413 return eval, nil 414 } 415 416 // deriveJob instantiates a new job based on the passed periodic job and the 417 // launch time. 418 func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) ( 419 derived *structs.Job, err error) { 420 421 // Have to recover in case the job copy panics. 422 defer func() { 423 if r := recover(); r != nil { 424 p.logger.Error("deriving child job from periodic job failed; deregistering from periodic runner", 425 "job", periodicJob.NamespacedID(), "error", r) 426 427 p.Remove(periodicJob.Namespace, periodicJob.ID) 428 derived = nil 429 err = fmt.Errorf("Failed to create a copy of the periodic job %q (%s): %v", 430 periodicJob.ID, periodicJob.Namespace, r) 431 } 432 }() 433 434 // Create a copy of the periodic job, give it a derived ID/Name and make it 435 // non-periodic. 436 derived = periodicJob.Copy() 437 derived.ParentID = periodicJob.ID 438 derived.ID = p.derivedJobID(periodicJob, time) 439 derived.Name = derived.ID 440 derived.Periodic = nil 441 return 442 } 443 444 // deriveJobID returns a job ID based on the parent periodic job and the launch 445 // time. 446 func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string { 447 return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix()) 448 } 449 450 // LaunchTime returns the launch time of the job. This is only valid for 451 // jobs created by PeriodicDispatch and will otherwise return an error. 452 func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) { 453 index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix) 454 if index == -1 { 455 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 456 } 457 458 launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):]) 459 if err != nil { 460 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 461 } 462 463 return time.Unix(int64(launch), 0), nil 464 } 465 466 // flush clears the state of the PeriodicDispatcher 467 func (p *PeriodicDispatch) flush() { 468 p.updateCh = make(chan struct{}, 1) 469 p.tracked = make(map[structs.NamespacedID]*structs.Job) 470 p.heap = NewPeriodicHeap() 471 p.stopFn = nil 472 } 473 474 // periodicHeap wraps a heap and gives operations other than Push/Pop. 475 type periodicHeap struct { 476 index map[structs.NamespacedID]*periodicJob 477 heap periodicHeapImp 478 } 479 480 type periodicJob struct { 481 job *structs.Job 482 next time.Time 483 index int 484 } 485 486 func NewPeriodicHeap() *periodicHeap { 487 return &periodicHeap{ 488 index: make(map[structs.NamespacedID]*periodicJob), 489 heap: make(periodicHeapImp, 0), 490 } 491 } 492 493 func (p *periodicHeap) Push(job *structs.Job, next time.Time) error { 494 tuple := structs.NamespacedID{ 495 ID: job.ID, 496 Namespace: job.Namespace, 497 } 498 if _, ok := p.index[tuple]; ok { 499 return fmt.Errorf("job %q (%s) already exists", job.ID, job.Namespace) 500 } 501 502 pJob := &periodicJob{job, next, 0} 503 p.index[tuple] = pJob 504 heap.Push(&p.heap, pJob) 505 return nil 506 } 507 508 func (p *periodicHeap) Pop() *periodicJob { 509 if len(p.heap) == 0 { 510 return nil 511 } 512 513 pJob := heap.Pop(&p.heap).(*periodicJob) 514 tuple := structs.NamespacedID{ 515 ID: pJob.job.ID, 516 Namespace: pJob.job.Namespace, 517 } 518 delete(p.index, tuple) 519 return pJob 520 } 521 522 func (p *periodicHeap) Peek() *periodicJob { 523 if len(p.heap) == 0 { 524 return nil 525 } 526 527 return p.heap[0] 528 } 529 530 func (p *periodicHeap) Contains(job *structs.Job) bool { 531 tuple := structs.NamespacedID{ 532 ID: job.ID, 533 Namespace: job.Namespace, 534 } 535 _, ok := p.index[tuple] 536 return ok 537 } 538 539 func (p *periodicHeap) Update(job *structs.Job, next time.Time) error { 540 tuple := structs.NamespacedID{ 541 ID: job.ID, 542 Namespace: job.Namespace, 543 } 544 if pJob, ok := p.index[tuple]; ok { 545 // Need to update the job as well because its spec can change. 546 pJob.job = job 547 pJob.next = next 548 heap.Fix(&p.heap, pJob.index) 549 return nil 550 } 551 552 return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace) 553 } 554 555 func (p *periodicHeap) Remove(job *structs.Job) error { 556 tuple := structs.NamespacedID{ 557 ID: job.ID, 558 Namespace: job.Namespace, 559 } 560 if pJob, ok := p.index[tuple]; ok { 561 heap.Remove(&p.heap, pJob.index) 562 delete(p.index, tuple) 563 return nil 564 } 565 566 return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace) 567 } 568 569 func (p *periodicHeap) Length() int { 570 return len(p.heap) 571 } 572 573 type periodicHeapImp []*periodicJob 574 575 func (h periodicHeapImp) Len() int { return len(h) } 576 577 func (h periodicHeapImp) Less(i, j int) bool { 578 // Two zero times should return false. 579 // Otherwise, zero is "greater" than any other time. 580 // (To sort it at the end of the list.) 581 // Sort such that zero times are at the end of the list. 582 iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero() 583 if iZero && jZero { 584 return false 585 } else if iZero { 586 return false 587 } else if jZero { 588 return true 589 } 590 591 return h[i].next.Before(h[j].next) 592 } 593 594 func (h periodicHeapImp) Swap(i, j int) { 595 h[i], h[j] = h[j], h[i] 596 h[i].index = i 597 h[j].index = j 598 } 599 600 func (h *periodicHeapImp) Push(x interface{}) { 601 n := len(*h) 602 job := x.(*periodicJob) 603 job.index = n 604 *h = append(*h, job) 605 } 606 607 func (h *periodicHeapImp) Pop() interface{} { 608 old := *h 609 n := len(old) 610 job := old[n-1] 611 job.index = -1 // for safety 612 *h = old[0 : n-1] 613 return job 614 }