// Source: github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/periodic.go

package nomad

import (
	"container/heap"
	"fmt"
	"log"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports on running children.
	dispatcher JobEvalDispatcher
	// enabled gates Add/Remove/ForceRun; it is toggled through SetEnabled.
	enabled bool
	// running is set by Start and cleared when the dispatcher is disabled.
	running bool

	// tracked maps a periodic job's ID to the job being tracked.
	tracked map[string]*structs.Job
	// heap orders the tracked jobs by their next launch time.
	heap *periodicHeap

	// updateCh wakes the run loop after the tracked set changes.
	updateCh chan struct{}
	// stopCh is closed to ask the run loop to exit.
	stopCh chan struct{}
	// waitCh is closed by the run loop once it has exited.
	waitCh chan struct{}
	logger *log.Logger
	// l guards the fields above.
	l sync.RWMutex
}

// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job and creates an evaluation
	// for it and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}

// DispatchJob creates an evaluation for the passed job and commits both the
// evaluation and the job to the raft log. It returns the eval.
46 func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) { 47 // Commit this update via Raft 48 req := structs.JobRegisterRequest{Job: job} 49 _, index, err := s.raftApply(structs.JobRegisterRequestType, req) 50 if err != nil { 51 return nil, err 52 } 53 54 // Create a new evaluation 55 eval := &structs.Evaluation{ 56 ID: structs.GenerateUUID(), 57 Priority: job.Priority, 58 Type: job.Type, 59 TriggeredBy: structs.EvalTriggerPeriodicJob, 60 JobID: job.ID, 61 JobModifyIndex: index, 62 Status: structs.EvalStatusPending, 63 } 64 update := &structs.EvalUpdateRequest{ 65 Evals: []*structs.Evaluation{eval}, 66 } 67 68 // Commit this evaluation via Raft 69 // XXX: There is a risk of partial failure where the JobRegister succeeds 70 // but that the EvalUpdate does not. 71 _, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update) 72 if err != nil { 73 return nil, err 74 } 75 76 // Update its indexes. 77 eval.CreateIndex = evalIndex 78 eval.ModifyIndex = evalIndex 79 return eval, nil 80 } 81 82 // RunningChildren checks whether the passed job has any running children. 83 func (s *Server) RunningChildren(job *structs.Job) (bool, error) { 84 state, err := s.fsm.State().Snapshot() 85 if err != nil { 86 return false, err 87 } 88 89 prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix) 90 iter, err := state.JobsByIDPrefix(prefix) 91 if err != nil { 92 return false, err 93 } 94 95 var child *structs.Job 96 for i := iter.Next(); i != nil; i = iter.Next() { 97 child = i.(*structs.Job) 98 99 // Ensure the job is actually a child. 100 if child.ParentID != job.ID { 101 continue 102 } 103 104 // Get the childs evaluations. 105 evals, err := state.EvalsByJob(child.ID) 106 if err != nil { 107 return false, err 108 } 109 110 // Check if any of the evals are active or have running allocations. 
111 for _, eval := range evals { 112 if !eval.TerminalStatus() { 113 return true, nil 114 } 115 116 allocs, err := state.AllocsByEval(eval.ID) 117 if err != nil { 118 return false, err 119 } 120 121 for _, alloc := range allocs { 122 if !alloc.TerminalStatus() { 123 return true, nil 124 } 125 } 126 } 127 } 128 129 // There are no evals or allocations that aren't terminal. 130 return false, nil 131 } 132 133 // NewPeriodicDispatch returns a periodic dispatcher that is used to track and 134 // launch periodic jobs. 135 func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch { 136 return &PeriodicDispatch{ 137 dispatcher: dispatcher, 138 tracked: make(map[string]*structs.Job), 139 heap: NewPeriodicHeap(), 140 updateCh: make(chan struct{}, 1), 141 stopCh: make(chan struct{}), 142 waitCh: make(chan struct{}), 143 logger: logger, 144 } 145 } 146 147 // SetEnabled is used to control if the periodic dispatcher is enabled. It 148 // should only be enabled on the active leader. Disabling an active dispatcher 149 // will stop any launched go routine and flush the dispatcher. 150 func (p *PeriodicDispatch) SetEnabled(enabled bool) { 151 p.l.Lock() 152 p.enabled = enabled 153 p.l.Unlock() 154 if !enabled { 155 if p.running { 156 close(p.stopCh) 157 <-p.waitCh 158 p.running = false 159 } 160 p.Flush() 161 } 162 } 163 164 // Start begins the goroutine that creates derived jobs and evals. 165 func (p *PeriodicDispatch) Start() { 166 p.l.Lock() 167 p.running = true 168 p.l.Unlock() 169 go p.run() 170 } 171 172 // Tracked returns the set of tracked job IDs. 173 func (p *PeriodicDispatch) Tracked() []*structs.Job { 174 p.l.RLock() 175 defer p.l.RUnlock() 176 tracked := make([]*structs.Job, len(p.tracked)) 177 i := 0 178 for _, job := range p.tracked { 179 tracked[i] = job 180 i++ 181 } 182 return tracked 183 } 184 185 // Add begins tracking of a periodic job. If it is already tracked, it acts as 186 // an update to the jobs periodic spec. 
187 func (p *PeriodicDispatch) Add(job *structs.Job) error { 188 p.l.Lock() 189 defer p.l.Unlock() 190 191 // Do nothing if not enabled 192 if !p.enabled { 193 return nil 194 } 195 196 // If we were tracking a job and it has been disabled or made non-periodic remove it. 197 disabled := !job.IsPeriodic() || !job.Periodic.Enabled 198 _, tracked := p.tracked[job.ID] 199 if disabled { 200 if tracked { 201 p.removeLocked(job.ID) 202 } 203 204 // If the job is disabled and we aren't tracking it, do nothing. 205 return nil 206 } 207 208 // Add or update the job. 209 p.tracked[job.ID] = job 210 next := job.Periodic.Next(time.Now().UTC()) 211 if tracked { 212 if err := p.heap.Update(job, next); err != nil { 213 return fmt.Errorf("failed to update job %v launch time: %v", job.ID, err) 214 } 215 p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q", job.ID) 216 } else { 217 if err := p.heap.Push(job, next); err != nil { 218 return fmt.Errorf("failed to add job %v: %v", job.ID, err) 219 } 220 p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q", job.ID) 221 } 222 223 // Signal an update. 224 if p.running { 225 select { 226 case p.updateCh <- struct{}{}: 227 default: 228 } 229 } 230 231 return nil 232 } 233 234 // Remove stops tracking the passed job. If the job is not tracked, it is a 235 // no-op. 236 func (p *PeriodicDispatch) Remove(jobID string) error { 237 p.l.Lock() 238 defer p.l.Unlock() 239 return p.removeLocked(jobID) 240 } 241 242 // Remove stops tracking the passed job. If the job is not tracked, it is a 243 // no-op. It assumes this is called while a lock is held. 
244 func (p *PeriodicDispatch) removeLocked(jobID string) error { 245 // Do nothing if not enabled 246 if !p.enabled { 247 return nil 248 } 249 250 job, tracked := p.tracked[jobID] 251 if !tracked { 252 return nil 253 } 254 255 delete(p.tracked, jobID) 256 if err := p.heap.Remove(job); err != nil { 257 return fmt.Errorf("failed to remove tracked job %v: %v", jobID, err) 258 } 259 260 // Signal an update. 261 if p.running { 262 select { 263 case p.updateCh <- struct{}{}: 264 default: 265 } 266 } 267 268 p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q", jobID) 269 return nil 270 } 271 272 // ForceRun causes the periodic job to be evaluated immediately and returns the 273 // subsequent eval. 274 func (p *PeriodicDispatch) ForceRun(jobID string) (*structs.Evaluation, error) { 275 p.l.Lock() 276 277 // Do nothing if not enabled 278 if !p.enabled { 279 p.l.Unlock() 280 return nil, fmt.Errorf("periodic dispatch disabled") 281 } 282 283 job, tracked := p.tracked[jobID] 284 if !tracked { 285 p.l.Unlock() 286 return nil, fmt.Errorf("can't force run non-tracked job %v", jobID) 287 } 288 289 p.l.Unlock() 290 return p.createEval(job, time.Now().UTC()) 291 } 292 293 // shouldRun returns whether the long lived run function should run. 294 func (p *PeriodicDispatch) shouldRun() bool { 295 p.l.RLock() 296 defer p.l.RUnlock() 297 return p.enabled && p.running 298 } 299 300 // run is a long-lived function that waits till a job's periodic spec is met and 301 // then creates an evaluation to run the job. 
302 func (p *PeriodicDispatch) run() { 303 defer close(p.waitCh) 304 var launchCh <-chan time.Time 305 for p.shouldRun() { 306 job, launch := p.nextLaunch() 307 if launch.IsZero() { 308 launchCh = nil 309 } else { 310 launchDur := launch.Sub(time.Now().UTC()) 311 launchCh = time.After(launchDur) 312 p.logger.Printf("[DEBUG] nomad.periodic: launching job %q in %s", job.ID, launchDur) 313 } 314 315 select { 316 case <-p.stopCh: 317 return 318 case <-p.updateCh: 319 continue 320 case <-launchCh: 321 p.dispatch(job, launch) 322 } 323 } 324 } 325 326 // dispatch creates an evaluation for the job and updates its next launchtime 327 // based on the passed launch time. 328 func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) { 329 p.l.Lock() 330 331 nextLaunch := job.Periodic.Next(launchTime) 332 if err := p.heap.Update(job, nextLaunch); err != nil { 333 p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %q: %v", job.ID, err) 334 } 335 336 // If the job prohibits overlapping and there are running children, we skip 337 // the launch. 338 if job.Periodic.ProhibitOverlap { 339 running, err := p.dispatcher.RunningChildren(job) 340 if err != nil { 341 msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+ 342 " periodic job %q has running children: %v", job.ID, err) 343 p.logger.Println(msg) 344 p.l.Unlock() 345 return 346 } 347 348 if running { 349 msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+ 350 " periodic job %q because job prohibits overlap", job.ID) 351 p.logger.Println(msg) 352 p.l.Unlock() 353 return 354 } 355 } 356 357 p.logger.Printf("[DEBUG] nomad.periodic: launching job %v at %v", job.ID, launchTime) 358 p.l.Unlock() 359 p.createEval(job, launchTime) 360 } 361 362 // nextLaunch returns the next job to launch and when it should be launched. If 363 // the next job can't be determined, an error is returned. If the dispatcher is 364 // stopped, a nil job will be returned. 
365 func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) { 366 // If there is nothing wait for an update. 367 p.l.RLock() 368 defer p.l.RUnlock() 369 if p.heap.Length() == 0 { 370 return nil, time.Time{} 371 } 372 373 nextJob := p.heap.Peek() 374 if nextJob == nil { 375 return nil, time.Time{} 376 } 377 378 return nextJob.job, nextJob.next 379 } 380 381 // createEval instantiates a job based on the passed periodic job and submits an 382 // evaluation for it. This should not be called with the lock held. 383 func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) { 384 derived, err := p.deriveJob(periodicJob, time) 385 if err != nil { 386 return nil, err 387 } 388 389 eval, err := p.dispatcher.DispatchJob(derived) 390 if err != nil { 391 p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q: %v", periodicJob.ID, err) 392 return nil, err 393 } 394 395 return eval, nil 396 } 397 398 // deriveJob instantiates a new job based on the passed periodic job and the 399 // launch time. 400 func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) ( 401 derived *structs.Job, err error) { 402 403 // Have to recover in case the job copy panics. 404 defer func() { 405 if r := recover(); r != nil { 406 p.logger.Printf("[ERR] nomad.periodic: deriving job from"+ 407 " periodic job %v failed; deregistering from periodic runner: %v", 408 periodicJob.ID, r) 409 p.Remove(periodicJob.ID) 410 derived = nil 411 err = fmt.Errorf("Failed to create a copy of the periodic job %v: %v", periodicJob.ID, r) 412 } 413 }() 414 415 // Create a copy of the periodic job, give it a derived ID/Name and make it 416 // non-periodic. 
417 derived = periodicJob.Copy() 418 derived.ParentID = periodicJob.ID 419 derived.ID = p.derivedJobID(periodicJob, time) 420 derived.Name = derived.ID 421 derived.Periodic = nil 422 return 423 } 424 425 // deriveJobID returns a job ID based on the parent periodic job and the launch 426 // time. 427 func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string { 428 return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix()) 429 } 430 431 // LaunchTime returns the launch time of the job. This is only valid for 432 // jobs created by PeriodicDispatch and will otherwise return an error. 433 func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) { 434 index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix) 435 if index == -1 { 436 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 437 } 438 439 launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):]) 440 if err != nil { 441 return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID) 442 } 443 444 return time.Unix(int64(launch), 0), nil 445 } 446 447 // Flush clears the state of the PeriodicDispatcher 448 func (p *PeriodicDispatch) Flush() { 449 p.l.Lock() 450 defer p.l.Unlock() 451 p.stopCh = make(chan struct{}) 452 p.updateCh = make(chan struct{}, 1) 453 p.waitCh = make(chan struct{}) 454 p.tracked = make(map[string]*structs.Job) 455 p.heap = NewPeriodicHeap() 456 } 457 458 // periodicHeap wraps a heap and gives operations other than Push/Pop. 
459 type periodicHeap struct { 460 index map[string]*periodicJob 461 heap periodicHeapImp 462 } 463 464 type periodicJob struct { 465 job *structs.Job 466 next time.Time 467 index int 468 } 469 470 func NewPeriodicHeap() *periodicHeap { 471 return &periodicHeap{ 472 index: make(map[string]*periodicJob), 473 heap: make(periodicHeapImp, 0), 474 } 475 } 476 477 func (p *periodicHeap) Push(job *structs.Job, next time.Time) error { 478 if _, ok := p.index[job.ID]; ok { 479 return fmt.Errorf("job %v already exists", job.ID) 480 } 481 482 pJob := &periodicJob{job, next, 0} 483 p.index[job.ID] = pJob 484 heap.Push(&p.heap, pJob) 485 return nil 486 } 487 488 func (p *periodicHeap) Pop() *periodicJob { 489 if len(p.heap) == 0 { 490 return nil 491 } 492 493 pJob := heap.Pop(&p.heap).(*periodicJob) 494 delete(p.index, pJob.job.ID) 495 return pJob 496 } 497 498 func (p *periodicHeap) Peek() *periodicJob { 499 if len(p.heap) == 0 { 500 return nil 501 } 502 503 return p.heap[0] 504 } 505 506 func (p *periodicHeap) Contains(job *structs.Job) bool { 507 _, ok := p.index[job.ID] 508 return ok 509 } 510 511 func (p *periodicHeap) Update(job *structs.Job, next time.Time) error { 512 if pJob, ok := p.index[job.ID]; ok { 513 // Need to update the job as well because its spec can change. 514 pJob.job = job 515 pJob.next = next 516 heap.Fix(&p.heap, pJob.index) 517 return nil 518 } 519 520 return fmt.Errorf("heap doesn't contain job %v", job.ID) 521 } 522 523 func (p *periodicHeap) Remove(job *structs.Job) error { 524 if pJob, ok := p.index[job.ID]; ok { 525 heap.Remove(&p.heap, pJob.index) 526 delete(p.index, job.ID) 527 return nil 528 } 529 530 return fmt.Errorf("heap doesn't contain job %v", job.ID) 531 } 532 533 func (p *periodicHeap) Length() int { 534 return len(p.heap) 535 } 536 537 type periodicHeapImp []*periodicJob 538 539 func (h periodicHeapImp) Len() int { return len(h) } 540 541 func (h periodicHeapImp) Less(i, j int) bool { 542 // Two zero times should return false. 
543 // Otherwise, zero is "greater" than any other time. 544 // (To sort it at the end of the list.) 545 // Sort such that zero times are at the end of the list. 546 iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero() 547 if iZero && jZero { 548 return false 549 } else if iZero { 550 return false 551 } else if jZero { 552 return true 553 } 554 555 return h[i].next.Before(h[j].next) 556 } 557 558 func (h periodicHeapImp) Swap(i, j int) { 559 h[i], h[j] = h[j], h[i] 560 h[i].index = i 561 h[j].index = j 562 } 563 564 func (h *periodicHeapImp) Push(x interface{}) { 565 n := len(*h) 566 job := x.(*periodicJob) 567 job.index = n 568 *h = append(*h, job) 569 } 570 571 func (h *periodicHeapImp) Pop() interface{} { 572 old := *h 573 n := len(old) 574 job := old[n-1] 575 job.index = -1 // for safety 576 *h = old[0 : n-1] 577 return job 578 }