github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/periodic.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"log"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	memdb "github.com/hashicorp/go-memdb"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  )
    15  
// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports whether a job has running
	// children.
	dispatcher JobEvalDispatcher
	// enabled is set through SetEnabled; Add and Remove are no-ops and
	// ForceRun returns an error while it is false.
	enabled    bool
	// running is set by Start and cleared by SetEnabled(false) once the run
	// loop has exited.
	running    bool

	// tracked holds the tracked periodic jobs keyed by job ID.
	tracked map[string]*structs.Job
	// heap orders the tracked jobs by their next launch time.
	heap    *periodicHeap

	// updateCh (buffer of one) signals the run loop that the heap changed.
	updateCh chan struct{}
	// stopCh is closed by SetEnabled(false) to stop the run loop.
	stopCh   chan struct{}
	// waitCh is closed by the run loop when it returns.
	waitCh   chan struct{}
	logger   *log.Logger
	// l is taken by the methods that read or modify the dispatcher's state.
	l        sync.RWMutex
}
    33  
// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job, creates an evaluation for it
	// and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}
    44  
    45  // DispatchJob creates an evaluation for the passed job and commits both the
    46  // evaluation and the job to the raft log. It returns the eval.
    47  func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) {
    48  	// Commit this update via Raft
    49  	job.SetSubmitTime()
    50  	req := structs.JobRegisterRequest{Job: job}
    51  	_, index, err := s.raftApply(structs.JobRegisterRequestType, req)
    52  	if err != nil {
    53  		return nil, err
    54  	}
    55  
    56  	// Create a new evaluation
    57  	eval := &structs.Evaluation{
    58  		ID:             structs.GenerateUUID(),
    59  		Priority:       job.Priority,
    60  		Type:           job.Type,
    61  		TriggeredBy:    structs.EvalTriggerPeriodicJob,
    62  		JobID:          job.ID,
    63  		JobModifyIndex: index,
    64  		Status:         structs.EvalStatusPending,
    65  	}
    66  	update := &structs.EvalUpdateRequest{
    67  		Evals: []*structs.Evaluation{eval},
    68  	}
    69  
    70  	// Commit this evaluation via Raft
    71  	// XXX: There is a risk of partial failure where the JobRegister succeeds
    72  	// but that the EvalUpdate does not.
    73  	_, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  
    78  	// Update its indexes.
    79  	eval.CreateIndex = evalIndex
    80  	eval.ModifyIndex = evalIndex
    81  	return eval, nil
    82  }
    83  
    84  // RunningChildren checks whether the passed job has any running children.
    85  func (s *Server) RunningChildren(job *structs.Job) (bool, error) {
    86  	state, err := s.fsm.State().Snapshot()
    87  	if err != nil {
    88  		return false, err
    89  	}
    90  
    91  	ws := memdb.NewWatchSet()
    92  	prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix)
    93  	iter, err := state.JobsByIDPrefix(ws, prefix)
    94  	if err != nil {
    95  		return false, err
    96  	}
    97  
    98  	var child *structs.Job
    99  	for i := iter.Next(); i != nil; i = iter.Next() {
   100  		child = i.(*structs.Job)
   101  
   102  		// Ensure the job is actually a child.
   103  		if child.ParentID != job.ID {
   104  			continue
   105  		}
   106  
   107  		// Get the childs evaluations.
   108  		evals, err := state.EvalsByJob(ws, child.ID)
   109  		if err != nil {
   110  			return false, err
   111  		}
   112  
   113  		// Check if any of the evals are active or have running allocations.
   114  		for _, eval := range evals {
   115  			if !eval.TerminalStatus() {
   116  				return true, nil
   117  			}
   118  
   119  			allocs, err := state.AllocsByEval(ws, eval.ID)
   120  			if err != nil {
   121  				return false, err
   122  			}
   123  
   124  			for _, alloc := range allocs {
   125  				if !alloc.TerminalStatus() {
   126  					return true, nil
   127  				}
   128  			}
   129  		}
   130  	}
   131  
   132  	// There are no evals or allocations that aren't terminal.
   133  	return false, nil
   134  }
   135  
   136  // NewPeriodicDispatch returns a periodic dispatcher that is used to track and
   137  // launch periodic jobs.
   138  func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
   139  	return &PeriodicDispatch{
   140  		dispatcher: dispatcher,
   141  		tracked:    make(map[string]*structs.Job),
   142  		heap:       NewPeriodicHeap(),
   143  		updateCh:   make(chan struct{}, 1),
   144  		stopCh:     make(chan struct{}),
   145  		waitCh:     make(chan struct{}),
   146  		logger:     logger,
   147  	}
   148  }
   149  
   150  // SetEnabled is used to control if the periodic dispatcher is enabled. It
   151  // should only be enabled on the active leader. Disabling an active dispatcher
   152  // will stop any launched go routine and flush the dispatcher.
   153  func (p *PeriodicDispatch) SetEnabled(enabled bool) {
   154  	p.l.Lock()
   155  	p.enabled = enabled
   156  	p.l.Unlock()
   157  	if !enabled {
   158  		if p.running {
   159  			close(p.stopCh)
   160  			<-p.waitCh
   161  			p.running = false
   162  		}
   163  		p.Flush()
   164  	}
   165  }
   166  
   167  // Start begins the goroutine that creates derived jobs and evals.
   168  func (p *PeriodicDispatch) Start() {
   169  	p.l.Lock()
   170  	p.running = true
   171  	p.l.Unlock()
   172  	go p.run()
   173  }
   174  
   175  // Tracked returns the set of tracked job IDs.
   176  func (p *PeriodicDispatch) Tracked() []*structs.Job {
   177  	p.l.RLock()
   178  	defer p.l.RUnlock()
   179  	tracked := make([]*structs.Job, len(p.tracked))
   180  	i := 0
   181  	for _, job := range p.tracked {
   182  		tracked[i] = job
   183  		i++
   184  	}
   185  	return tracked
   186  }
   187  
   188  // Add begins tracking of a periodic job. If it is already tracked, it acts as
   189  // an update to the jobs periodic spec.
   190  func (p *PeriodicDispatch) Add(job *structs.Job) error {
   191  	p.l.Lock()
   192  	defer p.l.Unlock()
   193  
   194  	// Do nothing if not enabled
   195  	if !p.enabled {
   196  		return nil
   197  	}
   198  
   199  	// If we were tracking a job and it has been disabled or made non-periodic remove it.
   200  	disabled := !job.IsPeriodic() || !job.Periodic.Enabled
   201  	_, tracked := p.tracked[job.ID]
   202  	if disabled {
   203  		if tracked {
   204  			p.removeLocked(job.ID)
   205  		}
   206  
   207  		// If the job is disabled and we aren't tracking it, do nothing.
   208  		return nil
   209  	}
   210  
   211  	// Check if the job is also a parameterized job. If it is, then we do not want to
   212  	// treat it as a periodic job but only its dispatched children.
   213  	if job.IsParameterized() {
   214  		return nil
   215  	}
   216  
   217  	// Add or update the job.
   218  	p.tracked[job.ID] = job
   219  	next := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation()))
   220  	if tracked {
   221  		if err := p.heap.Update(job, next); err != nil {
   222  			return fmt.Errorf("failed to update job %v launch time: %v", job.ID, err)
   223  		}
   224  		p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q", job.ID)
   225  	} else {
   226  		if err := p.heap.Push(job, next); err != nil {
   227  			return fmt.Errorf("failed to add job %v: %v", job.ID, err)
   228  		}
   229  		p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q", job.ID)
   230  	}
   231  
   232  	// Signal an update.
   233  	if p.running {
   234  		select {
   235  		case p.updateCh <- struct{}{}:
   236  		default:
   237  		}
   238  	}
   239  
   240  	return nil
   241  }
   242  
   243  // Remove stops tracking the passed job. If the job is not tracked, it is a
   244  // no-op.
   245  func (p *PeriodicDispatch) Remove(jobID string) error {
   246  	p.l.Lock()
   247  	defer p.l.Unlock()
   248  	return p.removeLocked(jobID)
   249  }
   250  
   251  // Remove stops tracking the passed job. If the job is not tracked, it is a
   252  // no-op. It assumes this is called while a lock is held.
   253  func (p *PeriodicDispatch) removeLocked(jobID string) error {
   254  	// Do nothing if not enabled
   255  	if !p.enabled {
   256  		return nil
   257  	}
   258  
   259  	job, tracked := p.tracked[jobID]
   260  	if !tracked {
   261  		return nil
   262  	}
   263  
   264  	delete(p.tracked, jobID)
   265  	if err := p.heap.Remove(job); err != nil {
   266  		return fmt.Errorf("failed to remove tracked job %v: %v", jobID, err)
   267  	}
   268  
   269  	// Signal an update.
   270  	if p.running {
   271  		select {
   272  		case p.updateCh <- struct{}{}:
   273  		default:
   274  		}
   275  	}
   276  
   277  	p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q", jobID)
   278  	return nil
   279  }
   280  
   281  // ForceRun causes the periodic job to be evaluated immediately and returns the
   282  // subsequent eval.
   283  func (p *PeriodicDispatch) ForceRun(jobID string) (*structs.Evaluation, error) {
   284  	p.l.Lock()
   285  
   286  	// Do nothing if not enabled
   287  	if !p.enabled {
   288  		p.l.Unlock()
   289  		return nil, fmt.Errorf("periodic dispatch disabled")
   290  	}
   291  
   292  	job, tracked := p.tracked[jobID]
   293  	if !tracked {
   294  		p.l.Unlock()
   295  		return nil, fmt.Errorf("can't force run non-tracked job %v", jobID)
   296  	}
   297  
   298  	p.l.Unlock()
   299  	return p.createEval(job, time.Now().In(job.Periodic.GetLocation()))
   300  }
   301  
   302  // shouldRun returns whether the long lived run function should run.
   303  func (p *PeriodicDispatch) shouldRun() bool {
   304  	p.l.RLock()
   305  	defer p.l.RUnlock()
   306  	return p.enabled && p.running
   307  }
   308  
   309  // run is a long-lived function that waits till a job's periodic spec is met and
   310  // then creates an evaluation to run the job.
   311  func (p *PeriodicDispatch) run() {
   312  	defer close(p.waitCh)
   313  	var launchCh <-chan time.Time
   314  	for p.shouldRun() {
   315  		job, launch := p.nextLaunch()
   316  		if launch.IsZero() {
   317  			launchCh = nil
   318  		} else {
   319  			launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation()))
   320  			launchCh = time.After(launchDur)
   321  			p.logger.Printf("[DEBUG] nomad.periodic: launching job %q in %s", job.ID, launchDur)
   322  		}
   323  
   324  		select {
   325  		case <-p.stopCh:
   326  			return
   327  		case <-p.updateCh:
   328  			continue
   329  		case <-launchCh:
   330  			p.dispatch(job, launch)
   331  		}
   332  	}
   333  }
   334  
   335  // dispatch creates an evaluation for the job and updates its next launchtime
   336  // based on the passed launch time.
   337  func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {
   338  	p.l.Lock()
   339  
   340  	nextLaunch := job.Periodic.Next(launchTime)
   341  	if err := p.heap.Update(job, nextLaunch); err != nil {
   342  		p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %q: %v", job.ID, err)
   343  	}
   344  
   345  	// If the job prohibits overlapping and there are running children, we skip
   346  	// the launch.
   347  	if job.Periodic.ProhibitOverlap {
   348  		running, err := p.dispatcher.RunningChildren(job)
   349  		if err != nil {
   350  			msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+
   351  				" periodic job %q has running children: %v", job.ID, err)
   352  			p.logger.Println(msg)
   353  			p.l.Unlock()
   354  			return
   355  		}
   356  
   357  		if running {
   358  			msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+
   359  				" periodic job %q because job prohibits overlap", job.ID)
   360  			p.logger.Println(msg)
   361  			p.l.Unlock()
   362  			return
   363  		}
   364  	}
   365  
   366  	p.logger.Printf("[DEBUG] nomad.periodic: launching job %v at %v", job.ID, launchTime)
   367  	p.l.Unlock()
   368  	p.createEval(job, launchTime)
   369  }
   370  
   371  // nextLaunch returns the next job to launch and when it should be launched. If
   372  // the next job can't be determined, an error is returned. If the dispatcher is
   373  // stopped, a nil job will be returned.
   374  func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) {
   375  	// If there is nothing wait for an update.
   376  	p.l.RLock()
   377  	defer p.l.RUnlock()
   378  	if p.heap.Length() == 0 {
   379  		return nil, time.Time{}
   380  	}
   381  
   382  	nextJob := p.heap.Peek()
   383  	if nextJob == nil {
   384  		return nil, time.Time{}
   385  	}
   386  
   387  	return nextJob.job, nextJob.next
   388  }
   389  
   390  // createEval instantiates a job based on the passed periodic job and submits an
   391  // evaluation for it. This should not be called with the lock held.
   392  func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) {
   393  	derived, err := p.deriveJob(periodicJob, time)
   394  	if err != nil {
   395  		return nil, err
   396  	}
   397  
   398  	eval, err := p.dispatcher.DispatchJob(derived)
   399  	if err != nil {
   400  		p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q: %v", periodicJob.ID, err)
   401  		return nil, err
   402  	}
   403  
   404  	return eval, nil
   405  }
   406  
   407  // deriveJob instantiates a new job based on the passed periodic job and the
   408  // launch time.
   409  func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (
   410  	derived *structs.Job, err error) {
   411  
   412  	// Have to recover in case the job copy panics.
   413  	defer func() {
   414  		if r := recover(); r != nil {
   415  			p.logger.Printf("[ERR] nomad.periodic: deriving job from"+
   416  				" periodic job %v failed; deregistering from periodic runner: %v",
   417  				periodicJob.ID, r)
   418  			p.Remove(periodicJob.ID)
   419  			derived = nil
   420  			err = fmt.Errorf("Failed to create a copy of the periodic job %v: %v", periodicJob.ID, r)
   421  		}
   422  	}()
   423  
   424  	// Create a copy of the periodic job, give it a derived ID/Name and make it
   425  	// non-periodic.
   426  	derived = periodicJob.Copy()
   427  	derived.ParentID = periodicJob.ID
   428  	derived.ID = p.derivedJobID(periodicJob, time)
   429  	derived.Name = derived.ID
   430  	derived.Periodic = nil
   431  	return
   432  }
   433  
   434  // deriveJobID returns a job ID based on the parent periodic job and the launch
   435  // time.
   436  func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string {
   437  	return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix())
   438  }
   439  
   440  // LaunchTime returns the launch time of the job. This is only valid for
   441  // jobs created by PeriodicDispatch and will otherwise return an error.
   442  func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) {
   443  	index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix)
   444  	if index == -1 {
   445  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   446  	}
   447  
   448  	launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):])
   449  	if err != nil {
   450  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   451  	}
   452  
   453  	return time.Unix(int64(launch), 0), nil
   454  }
   455  
   456  // Flush clears the state of the PeriodicDispatcher
   457  func (p *PeriodicDispatch) Flush() {
   458  	p.l.Lock()
   459  	defer p.l.Unlock()
   460  	p.stopCh = make(chan struct{})
   461  	p.updateCh = make(chan struct{}, 1)
   462  	p.waitCh = make(chan struct{})
   463  	p.tracked = make(map[string]*structs.Job)
   464  	p.heap = NewPeriodicHeap()
   465  }
   466  
// periodicHeap wraps a heap and gives operations other than Push/Pop.
type periodicHeap struct {
	// index maps a job ID to that job's heap entry.
	index map[string]*periodicJob
	// heap holds the entries and is manipulated through container/heap.
	heap  periodicHeapImp
}
   472  
// periodicJob is a heap entry pairing a job with its next launch time.
type periodicJob struct {
	job   *structs.Job
	// next is the time the heap orders entries by; a zero time sorts last.
	next  time.Time
	// index is the entry's position in the heap slice, kept current by
	// periodicHeapImp's Swap and Push and set to -1 by its Pop.
	index int
}
   478  
   479  func NewPeriodicHeap() *periodicHeap {
   480  	return &periodicHeap{
   481  		index: make(map[string]*periodicJob),
   482  		heap:  make(periodicHeapImp, 0),
   483  	}
   484  }
   485  
   486  func (p *periodicHeap) Push(job *structs.Job, next time.Time) error {
   487  	if _, ok := p.index[job.ID]; ok {
   488  		return fmt.Errorf("job %v already exists", job.ID)
   489  	}
   490  
   491  	pJob := &periodicJob{job, next, 0}
   492  	p.index[job.ID] = pJob
   493  	heap.Push(&p.heap, pJob)
   494  	return nil
   495  }
   496  
   497  func (p *periodicHeap) Pop() *periodicJob {
   498  	if len(p.heap) == 0 {
   499  		return nil
   500  	}
   501  
   502  	pJob := heap.Pop(&p.heap).(*periodicJob)
   503  	delete(p.index, pJob.job.ID)
   504  	return pJob
   505  }
   506  
   507  func (p *periodicHeap) Peek() *periodicJob {
   508  	if len(p.heap) == 0 {
   509  		return nil
   510  	}
   511  
   512  	return p.heap[0]
   513  }
   514  
   515  func (p *periodicHeap) Contains(job *structs.Job) bool {
   516  	_, ok := p.index[job.ID]
   517  	return ok
   518  }
   519  
   520  func (p *periodicHeap) Update(job *structs.Job, next time.Time) error {
   521  	if pJob, ok := p.index[job.ID]; ok {
   522  		// Need to update the job as well because its spec can change.
   523  		pJob.job = job
   524  		pJob.next = next
   525  		heap.Fix(&p.heap, pJob.index)
   526  		return nil
   527  	}
   528  
   529  	return fmt.Errorf("heap doesn't contain job %v", job.ID)
   530  }
   531  
   532  func (p *periodicHeap) Remove(job *structs.Job) error {
   533  	if pJob, ok := p.index[job.ID]; ok {
   534  		heap.Remove(&p.heap, pJob.index)
   535  		delete(p.index, job.ID)
   536  		return nil
   537  	}
   538  
   539  	return fmt.Errorf("heap doesn't contain job %v", job.ID)
   540  }
   541  
   542  func (p *periodicHeap) Length() int {
   543  	return len(p.heap)
   544  }
   545  
// periodicHeapImp is the slice of heap entries that periodicHeap passes to
// container/heap; its methods provide the Len/Less/Swap/Push/Pop operations.
type periodicHeapImp []*periodicJob
   547  
   548  func (h periodicHeapImp) Len() int { return len(h) }
   549  
   550  func (h periodicHeapImp) Less(i, j int) bool {
   551  	// Two zero times should return false.
   552  	// Otherwise, zero is "greater" than any other time.
   553  	// (To sort it at the end of the list.)
   554  	// Sort such that zero times are at the end of the list.
   555  	iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero()
   556  	if iZero && jZero {
   557  		return false
   558  	} else if iZero {
   559  		return false
   560  	} else if jZero {
   561  		return true
   562  	}
   563  
   564  	return h[i].next.Before(h[j].next)
   565  }
   566  
   567  func (h periodicHeapImp) Swap(i, j int) {
   568  	h[i], h[j] = h[j], h[i]
   569  	h[i].index = i
   570  	h[j].index = j
   571  }
   572  
   573  func (h *periodicHeapImp) Push(x interface{}) {
   574  	n := len(*h)
   575  	job := x.(*periodicJob)
   576  	job.index = n
   577  	*h = append(*h, job)
   578  }
   579  
   580  func (h *periodicHeapImp) Pop() interface{} {
   581  	old := *h
   582  	n := len(old)
   583  	job := old[n-1]
   584  	job.index = -1 // for safety
   585  	*h = old[0 : n-1]
   586  	return job
   587  }