github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/periodic.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"context"
     6  	"fmt"
     7  	"log"
     8  	"strconv"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	memdb "github.com/hashicorp/go-memdb"
    14  	"github.com/hashicorp/nomad/helper/uuid"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports whether a periodic job has
	// running children.
	dispatcher JobEvalDispatcher
	// enabled records whether the dispatcher is currently enabled; it is
	// toggled by SetEnabled.
	enabled    bool

	// tracked holds the periodic jobs being tracked, keyed by namespace and
	// job ID.
	tracked map[structs.NamespacedID]*structs.Job
	// heap orders the tracked jobs by their next launch time.
	heap    *periodicHeap

	// updateCh is signalled (without blocking) when a job is added, updated
	// or removed so the run loop re-evaluates the next launch.
	updateCh chan struct{}
	// stopFn cancels the context handed to the run goroutine.
	stopFn   context.CancelFunc
	logger   *log.Logger
	// l is taken by the methods that read or modify enabled, tracked and
	// heap.
	l        sync.RWMutex
}
    33  
// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job, creates an evaluation for it
	// and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}
    44  
    45  // DispatchJob creates an evaluation for the passed job and commits both the
    46  // evaluation and the job to the raft log. It returns the eval.
    47  func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) {
    48  	// Commit this update via Raft
    49  	job.SetSubmitTime()
    50  	req := structs.JobRegisterRequest{
    51  		Job: job,
    52  		WriteRequest: structs.WriteRequest{
    53  			Namespace: job.Namespace,
    54  		},
    55  	}
    56  	fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req)
    57  	if err, ok := fsmErr.(error); ok && err != nil {
    58  		return nil, err
    59  	}
    60  	if err != nil {
    61  		return nil, err
    62  	}
    63  
    64  	// Create a new evaluation
    65  	eval := &structs.Evaluation{
    66  		ID:             uuid.Generate(),
    67  		Namespace:      job.Namespace,
    68  		Priority:       job.Priority,
    69  		Type:           job.Type,
    70  		TriggeredBy:    structs.EvalTriggerPeriodicJob,
    71  		JobID:          job.ID,
    72  		JobModifyIndex: index,
    73  		Status:         structs.EvalStatusPending,
    74  	}
    75  	update := &structs.EvalUpdateRequest{
    76  		Evals: []*structs.Evaluation{eval},
    77  	}
    78  
    79  	// Commit this evaluation via Raft
    80  	// XXX: There is a risk of partial failure where the JobRegister succeeds
    81  	// but that the EvalUpdate does not.
    82  	_, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update)
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  
    87  	// Update its indexes.
    88  	eval.CreateIndex = evalIndex
    89  	eval.ModifyIndex = evalIndex
    90  	return eval, nil
    91  }
    92  
    93  // RunningChildren checks whether the passed job has any running children.
    94  func (s *Server) RunningChildren(job *structs.Job) (bool, error) {
    95  	state, err := s.fsm.State().Snapshot()
    96  	if err != nil {
    97  		return false, err
    98  	}
    99  
   100  	ws := memdb.NewWatchSet()
   101  	prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix)
   102  	iter, err := state.JobsByIDPrefix(ws, job.Namespace, prefix)
   103  	if err != nil {
   104  		return false, err
   105  	}
   106  
   107  	var child *structs.Job
   108  	for i := iter.Next(); i != nil; i = iter.Next() {
   109  		child = i.(*structs.Job)
   110  
   111  		// Ensure the job is actually a child.
   112  		if child.ParentID != job.ID {
   113  			continue
   114  		}
   115  
   116  		// Get the childs evaluations.
   117  		evals, err := state.EvalsByJob(ws, child.Namespace, child.ID)
   118  		if err != nil {
   119  			return false, err
   120  		}
   121  
   122  		// Check if any of the evals are active or have running allocations.
   123  		for _, eval := range evals {
   124  			if !eval.TerminalStatus() {
   125  				return true, nil
   126  			}
   127  
   128  			allocs, err := state.AllocsByEval(ws, eval.ID)
   129  			if err != nil {
   130  				return false, err
   131  			}
   132  
   133  			for _, alloc := range allocs {
   134  				if !alloc.TerminalStatus() {
   135  					return true, nil
   136  				}
   137  			}
   138  		}
   139  	}
   140  
   141  	// There are no evals or allocations that aren't terminal.
   142  	return false, nil
   143  }
   144  
   145  // NewPeriodicDispatch returns a periodic dispatcher that is used to track and
   146  // launch periodic jobs.
   147  func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
   148  	return &PeriodicDispatch{
   149  		dispatcher: dispatcher,
   150  		tracked:    make(map[structs.NamespacedID]*structs.Job),
   151  		heap:       NewPeriodicHeap(),
   152  		updateCh:   make(chan struct{}, 1),
   153  		logger:     logger,
   154  	}
   155  }
   156  
   157  // SetEnabled is used to control if the periodic dispatcher is enabled. It
   158  // should only be enabled on the active leader. Disabling an active dispatcher
   159  // will stop any launched go routine and flush the dispatcher.
   160  func (p *PeriodicDispatch) SetEnabled(enabled bool) {
   161  	p.l.Lock()
   162  	defer p.l.Unlock()
   163  	wasRunning := p.enabled
   164  	p.enabled = enabled
   165  
   166  	// If we are transitioning from enabled to disabled, stop the daemon and
   167  	// flush.
   168  	if !enabled && wasRunning {
   169  		p.stopFn()
   170  		p.flush()
   171  	} else if enabled && !wasRunning {
   172  		// If we are transitioning from disabled to enabled, run the daemon.
   173  		ctx, cancel := context.WithCancel(context.Background())
   174  		p.stopFn = cancel
   175  		go p.run(ctx)
   176  	}
   177  }
   178  
   179  // Tracked returns the set of tracked job IDs.
   180  func (p *PeriodicDispatch) Tracked() []*structs.Job {
   181  	p.l.RLock()
   182  	defer p.l.RUnlock()
   183  	tracked := make([]*structs.Job, len(p.tracked))
   184  	i := 0
   185  	for _, job := range p.tracked {
   186  		tracked[i] = job
   187  		i++
   188  	}
   189  	return tracked
   190  }
   191  
   192  // Add begins tracking of a periodic job. If it is already tracked, it acts as
   193  // an update to the jobs periodic spec. The method returns whether the job was
   194  // added and any error that may have occurred.
   195  func (p *PeriodicDispatch) Add(job *structs.Job) error {
   196  	p.l.Lock()
   197  	defer p.l.Unlock()
   198  
   199  	// Do nothing if not enabled
   200  	if !p.enabled {
   201  		return nil
   202  	}
   203  
   204  	// If we were tracking a job and it has been disabled, made non-periodic,
   205  	// stopped or is parameterized, remove it
   206  	disabled := !job.IsPeriodicActive()
   207  
   208  	tuple := structs.NamespacedID{
   209  		ID:        job.ID,
   210  		Namespace: job.Namespace,
   211  	}
   212  	_, tracked := p.tracked[tuple]
   213  	if disabled {
   214  		if tracked {
   215  			p.removeLocked(tuple)
   216  		}
   217  
   218  		// If the job is disabled and we aren't tracking it, do nothing.
   219  		return nil
   220  	}
   221  
   222  	// Add or update the job.
   223  	p.tracked[tuple] = job
   224  	next, err := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation()))
   225  	if err != nil {
   226  		return fmt.Errorf("failed adding job %s: %v", job.NamespacedID(), err)
   227  	}
   228  	if tracked {
   229  		if err := p.heap.Update(job, next); err != nil {
   230  			return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err)
   231  		}
   232  		p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q (%s)", job.ID, job.Namespace)
   233  	} else {
   234  		if err := p.heap.Push(job, next); err != nil {
   235  			return fmt.Errorf("failed to add job %v: %v", job.ID, err)
   236  		}
   237  		p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q (%s)", job.ID, job.Namespace)
   238  	}
   239  
   240  	// Signal an update.
   241  	select {
   242  	case p.updateCh <- struct{}{}:
   243  	default:
   244  	}
   245  
   246  	return nil
   247  }
   248  
   249  // Remove stops tracking the passed job. If the job is not tracked, it is a
   250  // no-op.
   251  func (p *PeriodicDispatch) Remove(namespace, jobID string) error {
   252  	p.l.Lock()
   253  	defer p.l.Unlock()
   254  	return p.removeLocked(structs.NamespacedID{
   255  		ID:        jobID,
   256  		Namespace: namespace,
   257  	})
   258  }
   259  
   260  // Remove stops tracking the passed job. If the job is not tracked, it is a
   261  // no-op. It assumes this is called while a lock is held.
   262  func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error {
   263  	// Do nothing if not enabled
   264  	if !p.enabled {
   265  		return nil
   266  	}
   267  
   268  	job, tracked := p.tracked[jobID]
   269  	if !tracked {
   270  		return nil
   271  	}
   272  
   273  	delete(p.tracked, jobID)
   274  	if err := p.heap.Remove(job); err != nil {
   275  		return fmt.Errorf("failed to remove tracked job %q (%s): %v", jobID.ID, jobID.Namespace, err)
   276  	}
   277  
   278  	// Signal an update.
   279  	select {
   280  	case p.updateCh <- struct{}{}:
   281  	default:
   282  	}
   283  
   284  	p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q (%s)", jobID.ID, jobID.Namespace)
   285  	return nil
   286  }
   287  
   288  // ForceRun causes the periodic job to be evaluated immediately and returns the
   289  // subsequent eval.
   290  func (p *PeriodicDispatch) ForceRun(namespace, jobID string) (*structs.Evaluation, error) {
   291  	p.l.Lock()
   292  
   293  	// Do nothing if not enabled
   294  	if !p.enabled {
   295  		p.l.Unlock()
   296  		return nil, fmt.Errorf("periodic dispatch disabled")
   297  	}
   298  
   299  	tuple := structs.NamespacedID{
   300  		ID:        jobID,
   301  		Namespace: namespace,
   302  	}
   303  	job, tracked := p.tracked[tuple]
   304  	if !tracked {
   305  		p.l.Unlock()
   306  		return nil, fmt.Errorf("can't force run non-tracked job %q (%s)", jobID, namespace)
   307  	}
   308  
   309  	p.l.Unlock()
   310  	return p.createEval(job, time.Now().In(job.Periodic.GetLocation()))
   311  }
   312  
   313  // shouldRun returns whether the long lived run function should run.
   314  func (p *PeriodicDispatch) shouldRun() bool {
   315  	p.l.RLock()
   316  	defer p.l.RUnlock()
   317  	return p.enabled
   318  }
   319  
   320  // run is a long-lived function that waits till a job's periodic spec is met and
   321  // then creates an evaluation to run the job.
   322  func (p *PeriodicDispatch) run(ctx context.Context) {
   323  	var launchCh <-chan time.Time
   324  	for p.shouldRun() {
   325  		job, launch := p.nextLaunch()
   326  		if launch.IsZero() {
   327  			launchCh = nil
   328  		} else {
   329  			launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation()))
   330  			launchCh = time.After(launchDur)
   331  			p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%s) in %s", job.ID, job.Namespace, launchDur)
   332  		}
   333  
   334  		select {
   335  		case <-ctx.Done():
   336  			return
   337  		case <-p.updateCh:
   338  			continue
   339  		case <-launchCh:
   340  			p.dispatch(job, launch)
   341  		}
   342  	}
   343  }
   344  
   345  // dispatch creates an evaluation for the job and updates its next launchtime
   346  // based on the passed launch time.
   347  func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {
   348  	p.l.Lock()
   349  
   350  	nextLaunch, err := job.Periodic.Next(launchTime)
   351  	if err != nil {
   352  		p.logger.Printf("[ERR] nomad.periodic: failed to parse next periodic launch for job %s: %v", job.NamespacedID(), err)
   353  	} else if err := p.heap.Update(job, nextLaunch); err != nil {
   354  		p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %s: %v", job.NamespacedID(), err)
   355  	}
   356  
   357  	// If the job prohibits overlapping and there are running children, we skip
   358  	// the launch.
   359  	if job.Periodic.ProhibitOverlap {
   360  		running, err := p.dispatcher.RunningChildren(job)
   361  		if err != nil {
   362  			msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+
   363  				" periodic job %q (%s) has running children: %v", job.ID, job.Namespace, err)
   364  			p.logger.Println(msg)
   365  			p.l.Unlock()
   366  			return
   367  		}
   368  
   369  		if running {
   370  			msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+
   371  				" periodic job %q (%s) because job prohibits overlap", job.ID, job.Namespace)
   372  			p.logger.Println(msg)
   373  			p.l.Unlock()
   374  			return
   375  		}
   376  	}
   377  
   378  	p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%v) at %v", job.ID, job.Namespace, launchTime)
   379  	p.l.Unlock()
   380  	p.createEval(job, launchTime)
   381  }
   382  
   383  // nextLaunch returns the next job to launch and when it should be launched. If
   384  // the next job can't be determined, an error is returned. If the dispatcher is
   385  // stopped, a nil job will be returned.
   386  func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) {
   387  	// If there is nothing wait for an update.
   388  	p.l.RLock()
   389  	defer p.l.RUnlock()
   390  	if p.heap.Length() == 0 {
   391  		return nil, time.Time{}
   392  	}
   393  
   394  	nextJob := p.heap.Peek()
   395  	if nextJob == nil {
   396  		return nil, time.Time{}
   397  	}
   398  
   399  	return nextJob.job, nextJob.next
   400  }
   401  
   402  // createEval instantiates a job based on the passed periodic job and submits an
   403  // evaluation for it. This should not be called with the lock held.
   404  func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) {
   405  	derived, err := p.deriveJob(periodicJob, time)
   406  	if err != nil {
   407  		return nil, err
   408  	}
   409  
   410  	eval, err := p.dispatcher.DispatchJob(derived)
   411  	if err != nil {
   412  		p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q (%s): %v",
   413  			periodicJob.ID, periodicJob.Namespace, err)
   414  		return nil, err
   415  	}
   416  
   417  	return eval, nil
   418  }
   419  
   420  // deriveJob instantiates a new job based on the passed periodic job and the
   421  // launch time.
   422  func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (
   423  	derived *structs.Job, err error) {
   424  
   425  	// Have to recover in case the job copy panics.
   426  	defer func() {
   427  		if r := recover(); r != nil {
   428  			p.logger.Printf("[ERR] nomad.periodic: deriving job from"+
   429  				" periodic job %q (%s) failed; deregistering from periodic runner: %v",
   430  				periodicJob.ID, periodicJob.Namespace, r)
   431  
   432  			p.Remove(periodicJob.Namespace, periodicJob.ID)
   433  			derived = nil
   434  			err = fmt.Errorf("Failed to create a copy of the periodic job %q (%s): %v",
   435  				periodicJob.ID, periodicJob.Namespace, r)
   436  		}
   437  	}()
   438  
   439  	// Create a copy of the periodic job, give it a derived ID/Name and make it
   440  	// non-periodic.
   441  	derived = periodicJob.Copy()
   442  	derived.ParentID = periodicJob.ID
   443  	derived.ID = p.derivedJobID(periodicJob, time)
   444  	derived.Name = derived.ID
   445  	derived.Periodic = nil
   446  	return
   447  }
   448  
   449  // deriveJobID returns a job ID based on the parent periodic job and the launch
   450  // time.
   451  func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string {
   452  	return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix())
   453  }
   454  
   455  // LaunchTime returns the launch time of the job. This is only valid for
   456  // jobs created by PeriodicDispatch and will otherwise return an error.
   457  func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) {
   458  	index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix)
   459  	if index == -1 {
   460  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   461  	}
   462  
   463  	launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):])
   464  	if err != nil {
   465  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   466  	}
   467  
   468  	return time.Unix(int64(launch), 0), nil
   469  }
   470  
   471  // flush clears the state of the PeriodicDispatcher
   472  func (p *PeriodicDispatch) flush() {
   473  	p.updateCh = make(chan struct{}, 1)
   474  	p.tracked = make(map[structs.NamespacedID]*structs.Job)
   475  	p.heap = NewPeriodicHeap()
   476  	p.stopFn = nil
   477  }
   478  
// periodicHeap wraps a heap and gives operations other than Push/Pop.
type periodicHeap struct {
	// index maps a job's namespace and ID to its heap entry so entries can be
	// updated or removed without searching the heap.
	index map[structs.NamespacedID]*periodicJob
	// heap is the underlying container/heap implementation.
	heap  periodicHeapImp
}
   484  
// periodicJob is an entry in the periodic heap.
type periodicJob struct {
	// job is the periodic job this entry schedules.
	job   *structs.Job
	// next is the job's next launch time; the heap is ordered by it.
	next  time.Time
	// index is the entry's position in the heap slice. It is maintained by
	// the heap implementation's Push and Swap and set to -1 by its Pop.
	index int
}
   490  
   491  func NewPeriodicHeap() *periodicHeap {
   492  	return &periodicHeap{
   493  		index: make(map[structs.NamespacedID]*periodicJob),
   494  		heap:  make(periodicHeapImp, 0),
   495  	}
   496  }
   497  
   498  func (p *periodicHeap) Push(job *structs.Job, next time.Time) error {
   499  	tuple := structs.NamespacedID{
   500  		ID:        job.ID,
   501  		Namespace: job.Namespace,
   502  	}
   503  	if _, ok := p.index[tuple]; ok {
   504  		return fmt.Errorf("job %q (%s) already exists", job.ID, job.Namespace)
   505  	}
   506  
   507  	pJob := &periodicJob{job, next, 0}
   508  	p.index[tuple] = pJob
   509  	heap.Push(&p.heap, pJob)
   510  	return nil
   511  }
   512  
   513  func (p *periodicHeap) Pop() *periodicJob {
   514  	if len(p.heap) == 0 {
   515  		return nil
   516  	}
   517  
   518  	pJob := heap.Pop(&p.heap).(*periodicJob)
   519  	tuple := structs.NamespacedID{
   520  		ID:        pJob.job.ID,
   521  		Namespace: pJob.job.Namespace,
   522  	}
   523  	delete(p.index, tuple)
   524  	return pJob
   525  }
   526  
   527  func (p *periodicHeap) Peek() *periodicJob {
   528  	if len(p.heap) == 0 {
   529  		return nil
   530  	}
   531  
   532  	return p.heap[0]
   533  }
   534  
   535  func (p *periodicHeap) Contains(job *structs.Job) bool {
   536  	tuple := structs.NamespacedID{
   537  		ID:        job.ID,
   538  		Namespace: job.Namespace,
   539  	}
   540  	_, ok := p.index[tuple]
   541  	return ok
   542  }
   543  
   544  func (p *periodicHeap) Update(job *structs.Job, next time.Time) error {
   545  	tuple := structs.NamespacedID{
   546  		ID:        job.ID,
   547  		Namespace: job.Namespace,
   548  	}
   549  	if pJob, ok := p.index[tuple]; ok {
   550  		// Need to update the job as well because its spec can change.
   551  		pJob.job = job
   552  		pJob.next = next
   553  		heap.Fix(&p.heap, pJob.index)
   554  		return nil
   555  	}
   556  
   557  	return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace)
   558  }
   559  
   560  func (p *periodicHeap) Remove(job *structs.Job) error {
   561  	tuple := structs.NamespacedID{
   562  		ID:        job.ID,
   563  		Namespace: job.Namespace,
   564  	}
   565  	if pJob, ok := p.index[tuple]; ok {
   566  		heap.Remove(&p.heap, pJob.index)
   567  		delete(p.index, tuple)
   568  		return nil
   569  	}
   570  
   571  	return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace)
   572  }
   573  
   574  func (p *periodicHeap) Length() int {
   575  	return len(p.heap)
   576  }
   577  
// periodicHeapImp is the slice of entries that container/heap operates on; its
// methods below provide the ordering and the push/pop primitives.
type periodicHeapImp []*periodicJob
   579  
   580  func (h periodicHeapImp) Len() int { return len(h) }
   581  
   582  func (h periodicHeapImp) Less(i, j int) bool {
   583  	// Two zero times should return false.
   584  	// Otherwise, zero is "greater" than any other time.
   585  	// (To sort it at the end of the list.)
   586  	// Sort such that zero times are at the end of the list.
   587  	iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero()
   588  	if iZero && jZero {
   589  		return false
   590  	} else if iZero {
   591  		return false
   592  	} else if jZero {
   593  		return true
   594  	}
   595  
   596  	return h[i].next.Before(h[j].next)
   597  }
   598  
   599  func (h periodicHeapImp) Swap(i, j int) {
   600  	h[i], h[j] = h[j], h[i]
   601  	h[i].index = i
   602  	h[j].index = j
   603  }
   604  
   605  func (h *periodicHeapImp) Push(x interface{}) {
   606  	n := len(*h)
   607  	job := x.(*periodicJob)
   608  	job.index = n
   609  	*h = append(*h, job)
   610  }
   611  
   612  func (h *periodicHeapImp) Pop() interface{} {
   613  	old := *h
   614  	n := len(old)
   615  	job := old[n-1]
   616  	job.index = -1 // for safety
   617  	*h = old[0 : n-1]
   618  	return job
   619  }