// Source: github.com/aminovpavel/nomad@v0.11.8/nomad/periodic.go

package nomad

import (
	"container/heap"
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports whether a periodic job
	// still has running children.
	dispatcher JobEvalDispatcher
	// enabled is toggled through SetEnabled; most methods are no-ops while it
	// is false.
	enabled bool

	// tracked holds the periodic jobs currently known to the dispatcher,
	// keyed by namespace and job ID.
	tracked map[structs.NamespacedID]*structs.Job
	// heap orders the tracked jobs by their next launch time.
	heap *periodicHeap

	// updateCh is signalled (non-blocking) when the tracked set changes so
	// the run loop can recompute the next launch.
	updateCh chan struct{}
	// stopFn cancels the context handed to the run goroutine.
	stopFn context.CancelFunc
	logger log.Logger
	// l guards the fields above.
	l sync.RWMutex
}

// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job, creates an evaluation for it
	// and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}

// DispatchJob creates an evaluation for the passed job and commits both the
// evaluation and the job to the raft log. It returns the eval.
48 func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) { 49 // Commit this update via Raft 50 job.SetSubmitTime() 51 req := structs.JobRegisterRequest{ 52 Job: job, 53 WriteRequest: structs.WriteRequest{ 54 Namespace: job.Namespace, 55 }, 56 } 57 fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req) 58 if err, ok := fsmErr.(error); ok && err != nil { 59 return nil, err 60 } 61 if err != nil { 62 return nil, err 63 } 64 65 // Create a new evaluation 66 now := time.Now().UTC().UnixNano() 67 eval := &structs.Evaluation{ 68 ID: uuid.Generate(), 69 Namespace: job.Namespace, 70 Priority: job.Priority, 71 Type: job.Type, 72 TriggeredBy: structs.EvalTriggerPeriodicJob, 73 JobID: job.ID, 74 JobModifyIndex: index, 75 Status: structs.EvalStatusPending, 76 CreateTime: now, 77 ModifyTime: now, 78 } 79 update := &structs.EvalUpdateRequest{ 80 Evals: []*structs.Evaluation{eval}, 81 } 82 83 // Commit this evaluation via Raft 84 // XXX: There is a risk of partial failure where the JobRegister succeeds 85 // but that the EvalUpdate does not. 86 _, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update) 87 if err != nil { 88 return nil, err 89 } 90 91 // Update its indexes. 92 eval.CreateIndex = evalIndex 93 eval.ModifyIndex = evalIndex 94 return eval, nil 95 } 96 97 // RunningChildren checks whether the passed job has any running children. 98 func (s *Server) RunningChildren(job *structs.Job) (bool, error) { 99 state, err := s.fsm.State().Snapshot() 100 if err != nil { 101 return false, err 102 } 103 104 ws := memdb.NewWatchSet() 105 prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix) 106 iter, err := state.JobsByIDPrefix(ws, job.Namespace, prefix) 107 if err != nil { 108 return false, err 109 } 110 111 var child *structs.Job 112 for i := iter.Next(); i != nil; i = iter.Next() { 113 child = i.(*structs.Job) 114 115 // Ensure the job is actually a child. 
116 if child.ParentID != job.ID { 117 continue 118 } 119 120 // Get the childs evaluations. 121 evals, err := state.EvalsByJob(ws, child.Namespace, child.ID) 122 if err != nil { 123 return false, err 124 } 125 126 // Check if any of the evals are active or have running allocations. 127 for _, eval := range evals { 128 if !eval.TerminalStatus() { 129 return true, nil 130 } 131 132 allocs, err := state.AllocsByEval(ws, eval.ID) 133 if err != nil { 134 return false, err 135 } 136 137 for _, alloc := range allocs { 138 if !alloc.TerminalStatus() { 139 return true, nil 140 } 141 } 142 } 143 } 144 145 // There are no evals or allocations that aren't terminal. 146 return false, nil 147 } 148 149 // NewPeriodicDispatch returns a periodic dispatcher that is used to track and 150 // launch periodic jobs. 151 func NewPeriodicDispatch(logger log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch { 152 return &PeriodicDispatch{ 153 dispatcher: dispatcher, 154 tracked: make(map[structs.NamespacedID]*structs.Job), 155 heap: NewPeriodicHeap(), 156 updateCh: make(chan struct{}, 1), 157 logger: logger.Named("periodic"), 158 } 159 } 160 161 // SetEnabled is used to control if the periodic dispatcher is enabled. It 162 // should only be enabled on the active leader. Disabling an active dispatcher 163 // will stop any launched go routine and flush the dispatcher. 164 func (p *PeriodicDispatch) SetEnabled(enabled bool) { 165 p.l.Lock() 166 defer p.l.Unlock() 167 wasRunning := p.enabled 168 p.enabled = enabled 169 170 // If we are transitioning from enabled to disabled, stop the daemon and 171 // flush. 172 if !enabled && wasRunning { 173 p.stopFn() 174 p.flush() 175 } else if enabled && !wasRunning { 176 // If we are transitioning from disabled to enabled, run the daemon. 177 ctx, cancel := context.WithCancel(context.Background()) 178 p.stopFn = cancel 179 go p.run(ctx, p.updateCh) 180 } 181 } 182 183 // Tracked returns the set of tracked job IDs. 
184 func (p *PeriodicDispatch) Tracked() []*structs.Job { 185 p.l.RLock() 186 defer p.l.RUnlock() 187 tracked := make([]*structs.Job, len(p.tracked)) 188 i := 0 189 for _, job := range p.tracked { 190 tracked[i] = job 191 i++ 192 } 193 return tracked 194 } 195 196 // Add begins tracking of a periodic job. If it is already tracked, it acts as 197 // an update to the jobs periodic spec. The method returns whether the job was 198 // added and any error that may have occurred. 199 func (p *PeriodicDispatch) Add(job *structs.Job) error { 200 p.l.Lock() 201 defer p.l.Unlock() 202 203 // Do nothing if not enabled 204 if !p.enabled { 205 return nil 206 } 207 208 // If we were tracking a job and it has been disabled, made non-periodic, 209 // stopped or is parameterized, remove it 210 disabled := !job.IsPeriodicActive() 211 212 tuple := structs.NamespacedID{ 213 ID: job.ID, 214 Namespace: job.Namespace, 215 } 216 _, tracked := p.tracked[tuple] 217 if disabled { 218 if tracked { 219 p.removeLocked(tuple) 220 } 221 222 // If the job is disabled and we aren't tracking it, do nothing. 223 return nil 224 } 225 226 // Add or update the job. 227 p.tracked[tuple] = job 228 next, err := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation())) 229 if err != nil { 230 return fmt.Errorf("failed adding job %s: %v", job.NamespacedID(), err) 231 } 232 if tracked { 233 if err := p.heap.Update(job, next); err != nil { 234 return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err) 235 } 236 p.logger.Debug("updated periodic job", "job", job.NamespacedID()) 237 } else { 238 if err := p.heap.Push(job, next); err != nil { 239 return fmt.Errorf("failed to add job %v: %v", job.ID, err) 240 } 241 p.logger.Debug("registered periodic job", "job", job.NamespacedID()) 242 } 243 244 // Signal an update. 245 select { 246 case p.updateCh <- struct{}{}: 247 default: 248 } 249 250 return nil 251 } 252 253 // Remove stops tracking the passed job. 
If the job is not tracked, it is a 254 // no-op. 255 func (p *PeriodicDispatch) Remove(namespace, jobID string) error { 256 p.l.Lock() 257 defer p.l.Unlock() 258 return p.removeLocked(structs.NamespacedID{ 259 ID: jobID, 260 Namespace: namespace, 261 }) 262 } 263 264 // Remove stops tracking the passed job. If the job is not tracked, it is a 265 // no-op. It assumes this is called while a lock is held. 266 func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error { 267 // Do nothing if not enabled 268 if !p.enabled { 269 return nil 270 } 271 272 job, tracked := p.tracked[jobID] 273 if !tracked { 274 return nil 275 } 276 277 delete(p.tracked, jobID) 278 if err := p.heap.Remove(job); err != nil { 279 return fmt.Errorf("failed to remove tracked job %q (%s): %v", jobID.ID, jobID.Namespace, err) 280 } 281 282 // Signal an update. 283 select { 284 case p.updateCh <- struct{}{}: 285 default: 286 } 287 288 p.logger.Debug("deregistered periodic job", "job", job.NamespacedID()) 289 return nil 290 } 291 292 // ForceRun causes the periodic job to be evaluated immediately and returns the 293 // subsequent eval. 294 func (p *PeriodicDispatch) ForceRun(namespace, jobID string) (*structs.Evaluation, error) { 295 p.l.Lock() 296 297 // Do nothing if not enabled 298 if !p.enabled { 299 p.l.Unlock() 300 return nil, fmt.Errorf("periodic dispatch disabled") 301 } 302 303 tuple := structs.NamespacedID{ 304 ID: jobID, 305 Namespace: namespace, 306 } 307 job, tracked := p.tracked[tuple] 308 if !tracked { 309 p.l.Unlock() 310 return nil, fmt.Errorf("can't force run non-tracked job %q (%s)", jobID, namespace) 311 } 312 313 p.l.Unlock() 314 return p.createEval(job, time.Now().In(job.Periodic.GetLocation())) 315 } 316 317 // shouldRun returns whether the long lived run function should run. 
318 func (p *PeriodicDispatch) shouldRun() bool { 319 p.l.RLock() 320 defer p.l.RUnlock() 321 return p.enabled 322 } 323 324 // run is a long-lived function that waits till a job's periodic spec is met and 325 // then creates an evaluation to run the job. 326 func (p *PeriodicDispatch) run(ctx context.Context, updateCh <-chan struct{}) { 327 var launchCh <-chan time.Time 328 for p.shouldRun() { 329 job, launch := p.nextLaunch() 330 if launch.IsZero() { 331 launchCh = nil 332 } else { 333 launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation())) 334 launchCh = time.After(launchDur) 335 p.logger.Debug("scheduled periodic job launch", "launch_delay", launchDur, "job", job.NamespacedID()) 336 } 337 338 select { 339 case <-ctx.Done(): 340 return 341 case <-updateCh: 342 continue 343 case <-launchCh: 344 p.dispatch(job, launch) 345 } 346 } 347 } 348 349 // dispatch creates an evaluation for the job and updates its next launchtime 350 // based on the passed launch time. 351 func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) { 352 p.l.Lock() 353 354 nextLaunch, err := job.Periodic.Next(launchTime) 355 if err != nil { 356 p.logger.Error("failed to parse next periodic launch", "job", job.NamespacedID(), "error", err) 357 } else if err := p.heap.Update(job, nextLaunch); err != nil { 358 p.logger.Error("failed to update next launch of periodic job", "job", job.NamespacedID(), "error", err) 359 } 360 361 // If the job prohibits overlapping and there are running children, we skip 362 // the launch. 
363 if job.Periodic.ProhibitOverlap { 364 running, err := p.dispatcher.RunningChildren(job) 365 if err != nil { 366 p.logger.Error("failed to determine if periodic job has running children", "job", job.NamespacedID(), "error", err) 367 p.l.Unlock() 368 return 369 } 370 371 if running { 372 p.logger.Debug("skipping launch of periodic job because job prohibits overlap", "job", job.NamespacedID()) 373 p.l.Unlock() 374 return 375 } 376 } 377 378 p.logger.Debug(" launching job", "job", job.NamespacedID(), "launch_time", launchTime) 379 p.l.Unlock() 380 p.createEval(job, launchTime) 381 } 382 383 // nextLaunch returns the next job to launch and when it should be launched. If 384 // the next job can't be determined, an error is returned. If the dispatcher is 385 // stopped, a nil job will be returned. 386 func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) { 387 // If there is nothing wait for an update. 388 p.l.RLock() 389 defer p.l.RUnlock() 390 if p.heap.Length() == 0 { 391 return nil, time.Time{} 392 } 393 394 nextJob := p.heap.Peek() 395 if nextJob == nil { 396 return nil, time.Time{} 397 } 398 399 return nextJob.job, nextJob.next 400 } 401 402 // createEval instantiates a job based on the passed periodic job and submits an 403 // evaluation for it. This should not be called with the lock held. 404 func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) { 405 derived, err := p.deriveJob(periodicJob, time) 406 if err != nil { 407 return nil, err 408 } 409 410 eval, err := p.dispatcher.DispatchJob(derived) 411 if err != nil { 412 p.logger.Error("failed to dispatch job", "job", periodicJob.NamespacedID(), "error", err) 413 return nil, err 414 } 415 416 return eval, nil 417 } 418 419 // deriveJob instantiates a new job based on the passed periodic job and the 420 // launch time. 
421 func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) ( 422 derived *structs.Job, err error) { 423 424 // Have to recover in case the job copy panics. 425 defer func() { 426 if r := recover(); r != nil { 427 p.logger.Error("deriving child job from periodic job failed; deregistering from periodic runner", 428 "job", periodicJob.NamespacedID(), "error", r) 429 430 p.Remove(periodicJob.Namespace, periodicJob.ID) 431 derived = nil 432 err = fmt.Errorf("Failed to create a copy of the periodic job %q (%s): %v", 433 periodicJob.ID, periodicJob.Namespace, r) 434 } 435 }() 436 437 // Create a copy of the periodic job, give it a derived ID/Name and make it 438 // non-periodic. 439 derived = periodicJob.Copy() 440 derived.ParentID = periodicJob.ID 441 derived.ID = p.derivedJobID(periodicJob, time) 442 derived.Name = derived.ID 443 derived.Periodic = nil 444 return 445 } 446 447 // deriveJobID returns a job ID based on the parent periodic job and the launch 448 // time. 449 func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string { 450 return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix()) 451 } 452 453 // LaunchTime returns the launch time of the job. This is only valid for 454 // jobs created by PeriodicDispatch and will otherwise return an error. 
455 func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) { 456 index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix) 457 if index == -1 { 458 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 459 } 460 461 launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):]) 462 if err != nil { 463 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 464 } 465 466 return time.Unix(int64(launch), 0), nil 467 } 468 469 // flush clears the state of the PeriodicDispatcher 470 func (p *PeriodicDispatch) flush() { 471 p.updateCh = make(chan struct{}, 1) 472 p.tracked = make(map[structs.NamespacedID]*structs.Job) 473 p.heap = NewPeriodicHeap() 474 p.stopFn = nil 475 } 476 477 // periodicHeap wraps a heap and gives operations other than Push/Pop. 478 type periodicHeap struct { 479 index map[structs.NamespacedID]*periodicJob 480 heap periodicHeapImp 481 } 482 483 type periodicJob struct { 484 job *structs.Job 485 next time.Time 486 index int 487 } 488 489 func NewPeriodicHeap() *periodicHeap { 490 return &periodicHeap{ 491 index: make(map[structs.NamespacedID]*periodicJob), 492 heap: make(periodicHeapImp, 0), 493 } 494 } 495 496 func (p *periodicHeap) Push(job *structs.Job, next time.Time) error { 497 tuple := structs.NamespacedID{ 498 ID: job.ID, 499 Namespace: job.Namespace, 500 } 501 if _, ok := p.index[tuple]; ok { 502 return fmt.Errorf("job %q (%s) already exists", job.ID, job.Namespace) 503 } 504 505 pJob := &periodicJob{job, next, 0} 506 p.index[tuple] = pJob 507 heap.Push(&p.heap, pJob) 508 return nil 509 } 510 511 func (p *periodicHeap) Pop() *periodicJob { 512 if len(p.heap) == 0 { 513 return nil 514 } 515 516 pJob := heap.Pop(&p.heap).(*periodicJob) 517 tuple := structs.NamespacedID{ 518 ID: pJob.job.ID, 519 Namespace: pJob.job.Namespace, 520 } 521 delete(p.index, tuple) 522 return pJob 523 } 524 525 func (p *periodicHeap) Peek() *periodicJob { 526 if len(p.heap) == 0 
{ 527 return nil 528 } 529 530 return p.heap[0] 531 } 532 533 func (p *periodicHeap) Contains(job *structs.Job) bool { 534 tuple := structs.NamespacedID{ 535 ID: job.ID, 536 Namespace: job.Namespace, 537 } 538 _, ok := p.index[tuple] 539 return ok 540 } 541 542 func (p *periodicHeap) Update(job *structs.Job, next time.Time) error { 543 tuple := structs.NamespacedID{ 544 ID: job.ID, 545 Namespace: job.Namespace, 546 } 547 if pJob, ok := p.index[tuple]; ok { 548 // Need to update the job as well because its spec can change. 549 pJob.job = job 550 pJob.next = next 551 heap.Fix(&p.heap, pJob.index) 552 return nil 553 } 554 555 return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace) 556 } 557 558 func (p *periodicHeap) Remove(job *structs.Job) error { 559 tuple := structs.NamespacedID{ 560 ID: job.ID, 561 Namespace: job.Namespace, 562 } 563 if pJob, ok := p.index[tuple]; ok { 564 heap.Remove(&p.heap, pJob.index) 565 delete(p.index, tuple) 566 return nil 567 } 568 569 return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace) 570 } 571 572 func (p *periodicHeap) Length() int { 573 return len(p.heap) 574 } 575 576 type periodicHeapImp []*periodicJob 577 578 func (h periodicHeapImp) Len() int { return len(h) } 579 580 func (h periodicHeapImp) Less(i, j int) bool { 581 // Two zero times should return false. 582 // Otherwise, zero is "greater" than any other time. 583 // (To sort it at the end of the list.) 584 // Sort such that zero times are at the end of the list. 
585 iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero() 586 if iZero && jZero { 587 return false 588 } else if iZero { 589 return false 590 } else if jZero { 591 return true 592 } 593 594 return h[i].next.Before(h[j].next) 595 } 596 597 func (h periodicHeapImp) Swap(i, j int) { 598 h[i], h[j] = h[j], h[i] 599 h[i].index = i 600 h[j].index = j 601 } 602 603 func (h *periodicHeapImp) Push(x interface{}) { 604 n := len(*h) 605 job := x.(*periodicJob) 606 job.index = n 607 *h = append(*h, job) 608 } 609 610 func (h *periodicHeapImp) Pop() interface{} { 611 old := *h 612 n := len(old) 613 job := old[n-1] 614 job.index = -1 // for safety 615 *h = old[0 : n-1] 616 return job 617 }