github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/periodic.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"log"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports whether a job still has
	// running children.
	dispatcher JobEvalDispatcher
	// enabled gates Add, Remove and ForceRun; running records that Start has
	// been called. The run loop only continues while both are true.
	enabled    bool
	running    bool

	// tracked maps a job ID to its periodic job; heap holds the same jobs
	// ordered by their next launch time.
	tracked map[string]*structs.Job
	heap    *periodicHeap

	// updateCh wakes the run loop after Add/Remove, stopCh is closed by
	// SetEnabled(false) to stop it, and waitCh is closed by the run loop when
	// it exits.
	updateCh chan struct{}
	stopCh   chan struct{}
	waitCh   chan struct{}
	logger   *log.Logger
	// l guards the fields above.
	l        sync.RWMutex
}
    32  
// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job, creates an evaluation for it
	// and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}
    43  
    44  // DispatchJob creates an evaluation for the passed job and commits both the
    45  // evaluation and the job to the raft log. It returns the eval.
    46  func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) {
    47  	// Commit this update via Raft
    48  	req := structs.JobRegisterRequest{Job: job}
    49  	_, index, err := s.raftApply(structs.JobRegisterRequestType, req)
    50  	if err != nil {
    51  		return nil, err
    52  	}
    53  
    54  	// Create a new evaluation
    55  	eval := &structs.Evaluation{
    56  		ID:             structs.GenerateUUID(),
    57  		Priority:       job.Priority,
    58  		Type:           job.Type,
    59  		TriggeredBy:    structs.EvalTriggerPeriodicJob,
    60  		JobID:          job.ID,
    61  		JobModifyIndex: index,
    62  		Status:         structs.EvalStatusPending,
    63  	}
    64  	update := &structs.EvalUpdateRequest{
    65  		Evals: []*structs.Evaluation{eval},
    66  	}
    67  
    68  	// Commit this evaluation via Raft
    69  	// XXX: There is a risk of partial failure where the JobRegister succeeds
    70  	// but that the EvalUpdate does not.
    71  	_, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update)
    72  	if err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	// Update its indexes.
    77  	eval.CreateIndex = evalIndex
    78  	eval.ModifyIndex = evalIndex
    79  	return eval, nil
    80  }
    81  
    82  // RunningChildren checks whether the passed job has any running children.
    83  func (s *Server) RunningChildren(job *structs.Job) (bool, error) {
    84  	state, err := s.fsm.State().Snapshot()
    85  	if err != nil {
    86  		return false, err
    87  	}
    88  
    89  	prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix)
    90  	iter, err := state.JobsByIDPrefix(prefix)
    91  	if err != nil {
    92  		return false, err
    93  	}
    94  
    95  	var child *structs.Job
    96  	for i := iter.Next(); i != nil; i = iter.Next() {
    97  		child = i.(*structs.Job)
    98  
    99  		// Ensure the job is actually a child.
   100  		if child.ParentID != job.ID {
   101  			continue
   102  		}
   103  
   104  		// Get the childs evaluations.
   105  		evals, err := state.EvalsByJob(child.ID)
   106  		if err != nil {
   107  			return false, err
   108  		}
   109  
   110  		// Check if any of the evals are active or have running allocations.
   111  		for _, eval := range evals {
   112  			if !eval.TerminalStatus() {
   113  				return true, nil
   114  			}
   115  
   116  			allocs, err := state.AllocsByEval(eval.ID)
   117  			if err != nil {
   118  				return false, err
   119  			}
   120  
   121  			for _, alloc := range allocs {
   122  				if !alloc.TerminalStatus() {
   123  					return true, nil
   124  				}
   125  			}
   126  		}
   127  	}
   128  
   129  	// There are no evals or allocations that aren't terminal.
   130  	return false, nil
   131  }
   132  
   133  // NewPeriodicDispatch returns a periodic dispatcher that is used to track and
   134  // launch periodic jobs.
   135  func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
   136  	return &PeriodicDispatch{
   137  		dispatcher: dispatcher,
   138  		tracked:    make(map[string]*structs.Job),
   139  		heap:       NewPeriodicHeap(),
   140  		updateCh:   make(chan struct{}, 1),
   141  		stopCh:     make(chan struct{}),
   142  		waitCh:     make(chan struct{}),
   143  		logger:     logger,
   144  	}
   145  }
   146  
   147  // SetEnabled is used to control if the periodic dispatcher is enabled. It
   148  // should only be enabled on the active leader. Disabling an active dispatcher
   149  // will stop any launched go routine and flush the dispatcher.
   150  func (p *PeriodicDispatch) SetEnabled(enabled bool) {
   151  	p.l.Lock()
   152  	p.enabled = enabled
   153  	p.l.Unlock()
   154  	if !enabled {
   155  		if p.running {
   156  			close(p.stopCh)
   157  			<-p.waitCh
   158  			p.running = false
   159  		}
   160  		p.Flush()
   161  	}
   162  }
   163  
   164  // Start begins the goroutine that creates derived jobs and evals.
   165  func (p *PeriodicDispatch) Start() {
   166  	p.l.Lock()
   167  	p.running = true
   168  	p.l.Unlock()
   169  	go p.run()
   170  }
   171  
   172  // Tracked returns the set of tracked job IDs.
   173  func (p *PeriodicDispatch) Tracked() []*structs.Job {
   174  	p.l.RLock()
   175  	defer p.l.RUnlock()
   176  	tracked := make([]*structs.Job, len(p.tracked))
   177  	i := 0
   178  	for _, job := range p.tracked {
   179  		tracked[i] = job
   180  		i++
   181  	}
   182  	return tracked
   183  }
   184  
   185  // Add begins tracking of a periodic job. If it is already tracked, it acts as
   186  // an update to the jobs periodic spec.
   187  func (p *PeriodicDispatch) Add(job *structs.Job) error {
   188  	p.l.Lock()
   189  	defer p.l.Unlock()
   190  
   191  	// Do nothing if not enabled
   192  	if !p.enabled {
   193  		return nil
   194  	}
   195  
   196  	// If we were tracking a job and it has been disabled or made non-periodic remove it.
   197  	disabled := !job.IsPeriodic() || !job.Periodic.Enabled
   198  	_, tracked := p.tracked[job.ID]
   199  	if disabled {
   200  		if tracked {
   201  			p.removeLocked(job.ID)
   202  		}
   203  
   204  		// If the job is disabled and we aren't tracking it, do nothing.
   205  		return nil
   206  	}
   207  
   208  	// Add or update the job.
   209  	p.tracked[job.ID] = job
   210  	next := job.Periodic.Next(time.Now().UTC())
   211  	if tracked {
   212  		if err := p.heap.Update(job, next); err != nil {
   213  			return fmt.Errorf("failed to update job %v launch time: %v", job.ID, err)
   214  		}
   215  		p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q", job.ID)
   216  	} else {
   217  		if err := p.heap.Push(job, next); err != nil {
   218  			return fmt.Errorf("failed to add job %v: %v", job.ID, err)
   219  		}
   220  		p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q", job.ID)
   221  	}
   222  
   223  	// Signal an update.
   224  	if p.running {
   225  		select {
   226  		case p.updateCh <- struct{}{}:
   227  		default:
   228  		}
   229  	}
   230  
   231  	return nil
   232  }
   233  
   234  // Remove stops tracking the passed job. If the job is not tracked, it is a
   235  // no-op.
   236  func (p *PeriodicDispatch) Remove(jobID string) error {
   237  	p.l.Lock()
   238  	defer p.l.Unlock()
   239  	return p.removeLocked(jobID)
   240  }
   241  
   242  // Remove stops tracking the passed job. If the job is not tracked, it is a
   243  // no-op. It assumes this is called while a lock is held.
   244  func (p *PeriodicDispatch) removeLocked(jobID string) error {
   245  	// Do nothing if not enabled
   246  	if !p.enabled {
   247  		return nil
   248  	}
   249  
   250  	job, tracked := p.tracked[jobID]
   251  	if !tracked {
   252  		return nil
   253  	}
   254  
   255  	delete(p.tracked, jobID)
   256  	if err := p.heap.Remove(job); err != nil {
   257  		return fmt.Errorf("failed to remove tracked job %v: %v", jobID, err)
   258  	}
   259  
   260  	// Signal an update.
   261  	if p.running {
   262  		select {
   263  		case p.updateCh <- struct{}{}:
   264  		default:
   265  		}
   266  	}
   267  
   268  	p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q", jobID)
   269  	return nil
   270  }
   271  
   272  // ForceRun causes the periodic job to be evaluated immediately and returns the
   273  // subsequent eval.
   274  func (p *PeriodicDispatch) ForceRun(jobID string) (*structs.Evaluation, error) {
   275  	p.l.Lock()
   276  
   277  	// Do nothing if not enabled
   278  	if !p.enabled {
   279  		p.l.Unlock()
   280  		return nil, fmt.Errorf("periodic dispatch disabled")
   281  	}
   282  
   283  	job, tracked := p.tracked[jobID]
   284  	if !tracked {
   285  		p.l.Unlock()
   286  		return nil, fmt.Errorf("can't force run non-tracked job %v", jobID)
   287  	}
   288  
   289  	p.l.Unlock()
   290  	return p.createEval(job, time.Now().UTC())
   291  }
   292  
   293  // shouldRun returns whether the long lived run function should run.
   294  func (p *PeriodicDispatch) shouldRun() bool {
   295  	p.l.RLock()
   296  	defer p.l.RUnlock()
   297  	return p.enabled && p.running
   298  }
   299  
   300  // run is a long-lived function that waits till a job's periodic spec is met and
   301  // then creates an evaluation to run the job.
   302  func (p *PeriodicDispatch) run() {
   303  	defer close(p.waitCh)
   304  	var launchCh <-chan time.Time
   305  	for p.shouldRun() {
   306  		job, launch := p.nextLaunch()
   307  		if launch.IsZero() {
   308  			launchCh = nil
   309  		} else {
   310  			launchDur := launch.Sub(time.Now().UTC())
   311  			launchCh = time.After(launchDur)
   312  			p.logger.Printf("[DEBUG] nomad.periodic: launching job %q in %s", job.ID, launchDur)
   313  		}
   314  
   315  		select {
   316  		case <-p.stopCh:
   317  			return
   318  		case <-p.updateCh:
   319  			continue
   320  		case <-launchCh:
   321  			p.dispatch(job, launch)
   322  		}
   323  	}
   324  }
   325  
   326  // dispatch creates an evaluation for the job and updates its next launchtime
   327  // based on the passed launch time.
   328  func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {
   329  	p.l.Lock()
   330  
   331  	nextLaunch := job.Periodic.Next(launchTime)
   332  	if err := p.heap.Update(job, nextLaunch); err != nil {
   333  		p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %q: %v", job.ID, err)
   334  	}
   335  
   336  	// If the job prohibits overlapping and there are running children, we skip
   337  	// the launch.
   338  	if job.Periodic.ProhibitOverlap {
   339  		running, err := p.dispatcher.RunningChildren(job)
   340  		if err != nil {
   341  			msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+
   342  				" periodic job %q has running children: %v", job.ID, err)
   343  			p.logger.Println(msg)
   344  			p.l.Unlock()
   345  			return
   346  		}
   347  
   348  		if running {
   349  			msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+
   350  				" periodic job %q because job prohibits overlap", job.ID)
   351  			p.logger.Println(msg)
   352  			p.l.Unlock()
   353  			return
   354  		}
   355  	}
   356  
   357  	p.logger.Printf("[DEBUG] nomad.periodic: launching job %v at %v", job.ID, launchTime)
   358  	p.l.Unlock()
   359  	p.createEval(job, launchTime)
   360  }
   361  
   362  // nextLaunch returns the next job to launch and when it should be launched. If
   363  // the next job can't be determined, an error is returned. If the dispatcher is
   364  // stopped, a nil job will be returned.
   365  func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) {
   366  	// If there is nothing wait for an update.
   367  	p.l.RLock()
   368  	defer p.l.RUnlock()
   369  	if p.heap.Length() == 0 {
   370  		return nil, time.Time{}
   371  	}
   372  
   373  	nextJob := p.heap.Peek()
   374  	if nextJob == nil {
   375  		return nil, time.Time{}
   376  	}
   377  
   378  	return nextJob.job, nextJob.next
   379  }
   380  
   381  // createEval instantiates a job based on the passed periodic job and submits an
   382  // evaluation for it. This should not be called with the lock held.
   383  func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) {
   384  	derived, err := p.deriveJob(periodicJob, time)
   385  	if err != nil {
   386  		return nil, err
   387  	}
   388  
   389  	eval, err := p.dispatcher.DispatchJob(derived)
   390  	if err != nil {
   391  		p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q: %v", periodicJob.ID, err)
   392  		return nil, err
   393  	}
   394  
   395  	return eval, nil
   396  }
   397  
   398  // deriveJob instantiates a new job based on the passed periodic job and the
   399  // launch time.
   400  func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (
   401  	derived *structs.Job, err error) {
   402  
   403  	// Have to recover in case the job copy panics.
   404  	defer func() {
   405  		if r := recover(); r != nil {
   406  			p.logger.Printf("[ERR] nomad.periodic: deriving job from"+
   407  				" periodic job %v failed; deregistering from periodic runner: %v",
   408  				periodicJob.ID, r)
   409  			p.Remove(periodicJob.ID)
   410  			derived = nil
   411  			err = fmt.Errorf("Failed to create a copy of the periodic job %v: %v", periodicJob.ID, r)
   412  		}
   413  	}()
   414  
   415  	// Create a copy of the periodic job, give it a derived ID/Name and make it
   416  	// non-periodic.
   417  	derived = periodicJob.Copy()
   418  	derived.ParentID = periodicJob.ID
   419  	derived.ID = p.derivedJobID(periodicJob, time)
   420  	derived.Name = derived.ID
   421  	derived.Periodic = nil
   422  	return
   423  }
   424  
   425  // deriveJobID returns a job ID based on the parent periodic job and the launch
   426  // time.
   427  func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string {
   428  	return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix())
   429  }
   430  
   431  // LaunchTime returns the launch time of the job. This is only valid for
   432  // jobs created by PeriodicDispatch and will otherwise return an error.
   433  func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) {
   434  	index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix)
   435  	if index == -1 {
   436  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   437  	}
   438  
   439  	launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):])
   440  	if err != nil {
   441  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   442  	}
   443  
   444  	return time.Unix(int64(launch), 0), nil
   445  }
   446  
   447  // Flush clears the state of the PeriodicDispatcher
   448  func (p *PeriodicDispatch) Flush() {
   449  	p.l.Lock()
   450  	defer p.l.Unlock()
   451  	p.stopCh = make(chan struct{})
   452  	p.updateCh = make(chan struct{}, 1)
   453  	p.waitCh = make(chan struct{})
   454  	p.tracked = make(map[string]*structs.Job)
   455  	p.heap = NewPeriodicHeap()
   456  }
   457  
// periodicHeap wraps a heap and gives operations other than Push/Pop.
type periodicHeap struct {
	// index maps a job ID to its heap entry so Update and Remove can locate
	// the entry without scanning the heap.
	index map[string]*periodicJob
	// heap is the underlying container/heap implementation.
	heap  periodicHeapImp
}
   463  
// periodicJob is a heap entry: a job, its next launch time, and its current
// position in the heap slice.
type periodicJob struct {
	job   *structs.Job
	next  time.Time
	// index is kept up to date by periodicHeapImp's Swap, Push and Pop.
	index int
}
   469  
   470  func NewPeriodicHeap() *periodicHeap {
   471  	return &periodicHeap{
   472  		index: make(map[string]*periodicJob),
   473  		heap:  make(periodicHeapImp, 0),
   474  	}
   475  }
   476  
   477  func (p *periodicHeap) Push(job *structs.Job, next time.Time) error {
   478  	if _, ok := p.index[job.ID]; ok {
   479  		return fmt.Errorf("job %v already exists", job.ID)
   480  	}
   481  
   482  	pJob := &periodicJob{job, next, 0}
   483  	p.index[job.ID] = pJob
   484  	heap.Push(&p.heap, pJob)
   485  	return nil
   486  }
   487  
   488  func (p *periodicHeap) Pop() *periodicJob {
   489  	if len(p.heap) == 0 {
   490  		return nil
   491  	}
   492  
   493  	pJob := heap.Pop(&p.heap).(*periodicJob)
   494  	delete(p.index, pJob.job.ID)
   495  	return pJob
   496  }
   497  
   498  func (p *periodicHeap) Peek() *periodicJob {
   499  	if len(p.heap) == 0 {
   500  		return nil
   501  	}
   502  
   503  	return p.heap[0]
   504  }
   505  
   506  func (p *periodicHeap) Contains(job *structs.Job) bool {
   507  	_, ok := p.index[job.ID]
   508  	return ok
   509  }
   510  
   511  func (p *periodicHeap) Update(job *structs.Job, next time.Time) error {
   512  	if pJob, ok := p.index[job.ID]; ok {
   513  		// Need to update the job as well because its spec can change.
   514  		pJob.job = job
   515  		pJob.next = next
   516  		heap.Fix(&p.heap, pJob.index)
   517  		return nil
   518  	}
   519  
   520  	return fmt.Errorf("heap doesn't contain job %v", job.ID)
   521  }
   522  
   523  func (p *periodicHeap) Remove(job *structs.Job) error {
   524  	if pJob, ok := p.index[job.ID]; ok {
   525  		heap.Remove(&p.heap, pJob.index)
   526  		delete(p.index, job.ID)
   527  		return nil
   528  	}
   529  
   530  	return fmt.Errorf("heap doesn't contain job %v", job.ID)
   531  }
   532  
   533  func (p *periodicHeap) Length() int {
   534  	return len(p.heap)
   535  }
   536  
// periodicHeapImp is the slice that backs periodicHeap. Its methods implement
// the interface expected by container/heap, ordered by next launch time.
type periodicHeapImp []*periodicJob
   538  
   539  func (h periodicHeapImp) Len() int { return len(h) }
   540  
   541  func (h periodicHeapImp) Less(i, j int) bool {
   542  	// Two zero times should return false.
   543  	// Otherwise, zero is "greater" than any other time.
   544  	// (To sort it at the end of the list.)
   545  	// Sort such that zero times are at the end of the list.
   546  	iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero()
   547  	if iZero && jZero {
   548  		return false
   549  	} else if iZero {
   550  		return false
   551  	} else if jZero {
   552  		return true
   553  	}
   554  
   555  	return h[i].next.Before(h[j].next)
   556  }
   557  
   558  func (h periodicHeapImp) Swap(i, j int) {
   559  	h[i], h[j] = h[j], h[i]
   560  	h[i].index = i
   561  	h[j].index = j
   562  }
   563  
   564  func (h *periodicHeapImp) Push(x interface{}) {
   565  	n := len(*h)
   566  	job := x.(*periodicJob)
   567  	job.index = n
   568  	*h = append(*h, job)
   569  }
   570  
   571  func (h *periodicHeapImp) Pop() interface{} {
   572  	old := *h
   573  	n := len(old)
   574  	job := old[n-1]
   575  	job.index = -1 // for safety
   576  	*h = old[0 : n-1]
   577  	return job
   578  }