github.com/bigcommerce/nomad@v0.9.3-bc/nomad/periodic.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"context"
     6  	"fmt"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	log "github.com/hashicorp/go-hclog"
    13  	memdb "github.com/hashicorp/go-memdb"
    14  
    15  	"github.com/hashicorp/nomad/helper/uuid"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
    19  // PeriodicDispatch is used to track and launch periodic jobs. It maintains the
    20  // set of periodic jobs and creates derived jobs and evaluations per
    21  // instantiation which is determined by the periodic spec.
    22  type PeriodicDispatch struct {
    23  	dispatcher JobEvalDispatcher
    24  	enabled    bool
    25  
    26  	tracked map[structs.NamespacedID]*structs.Job
    27  	heap    *periodicHeap
    28  
    29  	updateCh chan struct{}
    30  	stopFn   context.CancelFunc
    31  	logger   log.Logger
    32  	l        sync.RWMutex
    33  }
    34  
    35  // JobEvalDispatcher is an interface to submit jobs and have evaluations created
    36  // for them.
    37  type JobEvalDispatcher interface {
    38  	// DispatchJob takes a job a new, untracked job and creates an evaluation
    39  	// for it and returns the eval.
    40  	DispatchJob(job *structs.Job) (*structs.Evaluation, error)
    41  
    42  	// RunningChildren returns whether the passed job has any running children.
    43  	RunningChildren(job *structs.Job) (bool, error)
    44  }
    45  
    46  // DispatchJob creates an evaluation for the passed job and commits both the
    47  // evaluation and the job to the raft log. It returns the eval.
    48  func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) {
    49  	// Commit this update via Raft
    50  	job.SetSubmitTime()
    51  	req := structs.JobRegisterRequest{
    52  		Job: job,
    53  		WriteRequest: structs.WriteRequest{
    54  			Namespace: job.Namespace,
    55  		},
    56  	}
    57  	fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req)
    58  	if err, ok := fsmErr.(error); ok && err != nil {
    59  		return nil, err
    60  	}
    61  	if err != nil {
    62  		return nil, err
    63  	}
    64  
    65  	// Create a new evaluation
    66  	eval := &structs.Evaluation{
    67  		ID:             uuid.Generate(),
    68  		Namespace:      job.Namespace,
    69  		Priority:       job.Priority,
    70  		Type:           job.Type,
    71  		TriggeredBy:    structs.EvalTriggerPeriodicJob,
    72  		JobID:          job.ID,
    73  		JobModifyIndex: index,
    74  		Status:         structs.EvalStatusPending,
    75  	}
    76  	update := &structs.EvalUpdateRequest{
    77  		Evals: []*structs.Evaluation{eval},
    78  	}
    79  
    80  	// Commit this evaluation via Raft
    81  	// XXX: There is a risk of partial failure where the JobRegister succeeds
    82  	// but that the EvalUpdate does not.
    83  	_, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update)
    84  	if err != nil {
    85  		return nil, err
    86  	}
    87  
    88  	// Update its indexes.
    89  	eval.CreateIndex = evalIndex
    90  	eval.ModifyIndex = evalIndex
    91  	return eval, nil
    92  }
    93  
    94  // RunningChildren checks whether the passed job has any running children.
    95  func (s *Server) RunningChildren(job *structs.Job) (bool, error) {
    96  	state, err := s.fsm.State().Snapshot()
    97  	if err != nil {
    98  		return false, err
    99  	}
   100  
   101  	ws := memdb.NewWatchSet()
   102  	prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix)
   103  	iter, err := state.JobsByIDPrefix(ws, job.Namespace, prefix)
   104  	if err != nil {
   105  		return false, err
   106  	}
   107  
   108  	var child *structs.Job
   109  	for i := iter.Next(); i != nil; i = iter.Next() {
   110  		child = i.(*structs.Job)
   111  
   112  		// Ensure the job is actually a child.
   113  		if child.ParentID != job.ID {
   114  			continue
   115  		}
   116  
   117  		// Get the childs evaluations.
   118  		evals, err := state.EvalsByJob(ws, child.Namespace, child.ID)
   119  		if err != nil {
   120  			return false, err
   121  		}
   122  
   123  		// Check if any of the evals are active or have running allocations.
   124  		for _, eval := range evals {
   125  			if !eval.TerminalStatus() {
   126  				return true, nil
   127  			}
   128  
   129  			allocs, err := state.AllocsByEval(ws, eval.ID)
   130  			if err != nil {
   131  				return false, err
   132  			}
   133  
   134  			for _, alloc := range allocs {
   135  				if !alloc.TerminalStatus() {
   136  					return true, nil
   137  				}
   138  			}
   139  		}
   140  	}
   141  
   142  	// There are no evals or allocations that aren't terminal.
   143  	return false, nil
   144  }
   145  
   146  // NewPeriodicDispatch returns a periodic dispatcher that is used to track and
   147  // launch periodic jobs.
   148  func NewPeriodicDispatch(logger log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
   149  	return &PeriodicDispatch{
   150  		dispatcher: dispatcher,
   151  		tracked:    make(map[structs.NamespacedID]*structs.Job),
   152  		heap:       NewPeriodicHeap(),
   153  		updateCh:   make(chan struct{}, 1),
   154  		logger:     logger.Named("periodic"),
   155  	}
   156  }
   157  
   158  // SetEnabled is used to control if the periodic dispatcher is enabled. It
   159  // should only be enabled on the active leader. Disabling an active dispatcher
   160  // will stop any launched go routine and flush the dispatcher.
   161  func (p *PeriodicDispatch) SetEnabled(enabled bool) {
   162  	p.l.Lock()
   163  	defer p.l.Unlock()
   164  	wasRunning := p.enabled
   165  	p.enabled = enabled
   166  
   167  	// If we are transitioning from enabled to disabled, stop the daemon and
   168  	// flush.
   169  	if !enabled && wasRunning {
   170  		p.stopFn()
   171  		p.flush()
   172  	} else if enabled && !wasRunning {
   173  		// If we are transitioning from disabled to enabled, run the daemon.
   174  		ctx, cancel := context.WithCancel(context.Background())
   175  		p.stopFn = cancel
   176  		go p.run(ctx, p.updateCh)
   177  	}
   178  }
   179  
   180  // Tracked returns the set of tracked job IDs.
   181  func (p *PeriodicDispatch) Tracked() []*structs.Job {
   182  	p.l.RLock()
   183  	defer p.l.RUnlock()
   184  	tracked := make([]*structs.Job, len(p.tracked))
   185  	i := 0
   186  	for _, job := range p.tracked {
   187  		tracked[i] = job
   188  		i++
   189  	}
   190  	return tracked
   191  }
   192  
   193  // Add begins tracking of a periodic job. If it is already tracked, it acts as
   194  // an update to the jobs periodic spec. The method returns whether the job was
   195  // added and any error that may have occurred.
   196  func (p *PeriodicDispatch) Add(job *structs.Job) error {
   197  	p.l.Lock()
   198  	defer p.l.Unlock()
   199  
   200  	// Do nothing if not enabled
   201  	if !p.enabled {
   202  		return nil
   203  	}
   204  
   205  	// If we were tracking a job and it has been disabled, made non-periodic,
   206  	// stopped or is parameterized, remove it
   207  	disabled := !job.IsPeriodicActive()
   208  
   209  	tuple := structs.NamespacedID{
   210  		ID:        job.ID,
   211  		Namespace: job.Namespace,
   212  	}
   213  	_, tracked := p.tracked[tuple]
   214  	if disabled {
   215  		if tracked {
   216  			p.removeLocked(tuple)
   217  		}
   218  
   219  		// If the job is disabled and we aren't tracking it, do nothing.
   220  		return nil
   221  	}
   222  
   223  	// Add or update the job.
   224  	p.tracked[tuple] = job
   225  	next, err := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation()))
   226  	if err != nil {
   227  		return fmt.Errorf("failed adding job %s: %v", job.NamespacedID(), err)
   228  	}
   229  	if tracked {
   230  		if err := p.heap.Update(job, next); err != nil {
   231  			return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err)
   232  		}
   233  		p.logger.Debug("updated periodic job", "job", job.NamespacedID())
   234  	} else {
   235  		if err := p.heap.Push(job, next); err != nil {
   236  			return fmt.Errorf("failed to add job %v: %v", job.ID, err)
   237  		}
   238  		p.logger.Debug("registered periodic job", "job", job.NamespacedID())
   239  	}
   240  
   241  	// Signal an update.
   242  	select {
   243  	case p.updateCh <- struct{}{}:
   244  	default:
   245  	}
   246  
   247  	return nil
   248  }
   249  
   250  // Remove stops tracking the passed job. If the job is not tracked, it is a
   251  // no-op.
   252  func (p *PeriodicDispatch) Remove(namespace, jobID string) error {
   253  	p.l.Lock()
   254  	defer p.l.Unlock()
   255  	return p.removeLocked(structs.NamespacedID{
   256  		ID:        jobID,
   257  		Namespace: namespace,
   258  	})
   259  }
   260  
   261  // Remove stops tracking the passed job. If the job is not tracked, it is a
   262  // no-op. It assumes this is called while a lock is held.
   263  func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error {
   264  	// Do nothing if not enabled
   265  	if !p.enabled {
   266  		return nil
   267  	}
   268  
   269  	job, tracked := p.tracked[jobID]
   270  	if !tracked {
   271  		return nil
   272  	}
   273  
   274  	delete(p.tracked, jobID)
   275  	if err := p.heap.Remove(job); err != nil {
   276  		return fmt.Errorf("failed to remove tracked job %q (%s): %v", jobID.ID, jobID.Namespace, err)
   277  	}
   278  
   279  	// Signal an update.
   280  	select {
   281  	case p.updateCh <- struct{}{}:
   282  	default:
   283  	}
   284  
   285  	p.logger.Debug("deregistered periodic job", "job", job.NamespacedID())
   286  	return nil
   287  }
   288  
   289  // ForceRun causes the periodic job to be evaluated immediately and returns the
   290  // subsequent eval.
   291  func (p *PeriodicDispatch) ForceRun(namespace, jobID string) (*structs.Evaluation, error) {
   292  	p.l.Lock()
   293  
   294  	// Do nothing if not enabled
   295  	if !p.enabled {
   296  		p.l.Unlock()
   297  		return nil, fmt.Errorf("periodic dispatch disabled")
   298  	}
   299  
   300  	tuple := structs.NamespacedID{
   301  		ID:        jobID,
   302  		Namespace: namespace,
   303  	}
   304  	job, tracked := p.tracked[tuple]
   305  	if !tracked {
   306  		p.l.Unlock()
   307  		return nil, fmt.Errorf("can't force run non-tracked job %q (%s)", jobID, namespace)
   308  	}
   309  
   310  	p.l.Unlock()
   311  	return p.createEval(job, time.Now().In(job.Periodic.GetLocation()))
   312  }
   313  
   314  // shouldRun returns whether the long lived run function should run.
   315  func (p *PeriodicDispatch) shouldRun() bool {
   316  	p.l.RLock()
   317  	defer p.l.RUnlock()
   318  	return p.enabled
   319  }
   320  
   321  // run is a long-lived function that waits till a job's periodic spec is met and
   322  // then creates an evaluation to run the job.
   323  func (p *PeriodicDispatch) run(ctx context.Context, updateCh <-chan struct{}) {
   324  	var launchCh <-chan time.Time
   325  	for p.shouldRun() {
   326  		job, launch := p.nextLaunch()
   327  		if launch.IsZero() {
   328  			launchCh = nil
   329  		} else {
   330  			launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation()))
   331  			launchCh = time.After(launchDur)
   332  			p.logger.Debug("scheduled periodic job launch", "launch_delay", launchDur, "job", job.NamespacedID())
   333  		}
   334  
   335  		select {
   336  		case <-ctx.Done():
   337  			return
   338  		case <-updateCh:
   339  			continue
   340  		case <-launchCh:
   341  			p.dispatch(job, launch)
   342  		}
   343  	}
   344  }
   345  
   346  // dispatch creates an evaluation for the job and updates its next launchtime
   347  // based on the passed launch time.
   348  func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {
   349  	p.l.Lock()
   350  
   351  	nextLaunch, err := job.Periodic.Next(launchTime)
   352  	if err != nil {
   353  		p.logger.Error("failed to parse next periodic launch", "job", job.NamespacedID(), "error", err)
   354  	} else if err := p.heap.Update(job, nextLaunch); err != nil {
   355  		p.logger.Error("failed to update next launch of periodic job", "job", job.NamespacedID(), "error", err)
   356  	}
   357  
   358  	// If the job prohibits overlapping and there are running children, we skip
   359  	// the launch.
   360  	if job.Periodic.ProhibitOverlap {
   361  		running, err := p.dispatcher.RunningChildren(job)
   362  		if err != nil {
   363  			p.logger.Error("failed to determine if periodic job has running children", "job", job.NamespacedID(), "error", err)
   364  			p.l.Unlock()
   365  			return
   366  		}
   367  
   368  		if running {
   369  			p.logger.Debug("skipping launch of periodic job because job prohibits overlap", "job", job.NamespacedID())
   370  			p.l.Unlock()
   371  			return
   372  		}
   373  	}
   374  
   375  	p.logger.Debug(" launching job", "job", job.NamespacedID(), "launch_time", launchTime)
   376  	p.l.Unlock()
   377  	p.createEval(job, launchTime)
   378  }
   379  
   380  // nextLaunch returns the next job to launch and when it should be launched. If
   381  // the next job can't be determined, an error is returned. If the dispatcher is
   382  // stopped, a nil job will be returned.
   383  func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) {
   384  	// If there is nothing wait for an update.
   385  	p.l.RLock()
   386  	defer p.l.RUnlock()
   387  	if p.heap.Length() == 0 {
   388  		return nil, time.Time{}
   389  	}
   390  
   391  	nextJob := p.heap.Peek()
   392  	if nextJob == nil {
   393  		return nil, time.Time{}
   394  	}
   395  
   396  	return nextJob.job, nextJob.next
   397  }
   398  
   399  // createEval instantiates a job based on the passed periodic job and submits an
   400  // evaluation for it. This should not be called with the lock held.
   401  func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) {
   402  	derived, err := p.deriveJob(periodicJob, time)
   403  	if err != nil {
   404  		return nil, err
   405  	}
   406  
   407  	eval, err := p.dispatcher.DispatchJob(derived)
   408  	if err != nil {
   409  		p.logger.Error("failed to dispatch job", "job", periodicJob.NamespacedID(), "error", err)
   410  		return nil, err
   411  	}
   412  
   413  	return eval, nil
   414  }
   415  
   416  // deriveJob instantiates a new job based on the passed periodic job and the
   417  // launch time.
   418  func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (
   419  	derived *structs.Job, err error) {
   420  
   421  	// Have to recover in case the job copy panics.
   422  	defer func() {
   423  		if r := recover(); r != nil {
   424  			p.logger.Error("deriving child job from periodic job failed; deregistering from periodic runner",
   425  				"job", periodicJob.NamespacedID(), "error", r)
   426  
   427  			p.Remove(periodicJob.Namespace, periodicJob.ID)
   428  			derived = nil
   429  			err = fmt.Errorf("Failed to create a copy of the periodic job %q (%s): %v",
   430  				periodicJob.ID, periodicJob.Namespace, r)
   431  		}
   432  	}()
   433  
   434  	// Create a copy of the periodic job, give it a derived ID/Name and make it
   435  	// non-periodic.
   436  	derived = periodicJob.Copy()
   437  	derived.ParentID = periodicJob.ID
   438  	derived.ID = p.derivedJobID(periodicJob, time)
   439  	derived.Name = derived.ID
   440  	derived.Periodic = nil
   441  	return
   442  }
   443  
   444  // deriveJobID returns a job ID based on the parent periodic job and the launch
   445  // time.
   446  func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string {
   447  	return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix())
   448  }
   449  
   450  // LaunchTime returns the launch time of the job. This is only valid for
   451  // jobs created by PeriodicDispatch and will otherwise return an error.
   452  func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) {
   453  	index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix)
   454  	if index == -1 {
   455  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   456  	}
   457  
   458  	launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):])
   459  	if err != nil {
   460  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   461  	}
   462  
   463  	return time.Unix(int64(launch), 0), nil
   464  }
   465  
   466  // flush clears the state of the PeriodicDispatcher
   467  func (p *PeriodicDispatch) flush() {
   468  	p.updateCh = make(chan struct{}, 1)
   469  	p.tracked = make(map[structs.NamespacedID]*structs.Job)
   470  	p.heap = NewPeriodicHeap()
   471  	p.stopFn = nil
   472  }
   473  
   474  // periodicHeap wraps a heap and gives operations other than Push/Pop.
   475  type periodicHeap struct {
   476  	index map[structs.NamespacedID]*periodicJob
   477  	heap  periodicHeapImp
   478  }
   479  
// periodicJob is an entry in the periodic heap.
type periodicJob struct {
	// job is the periodic job this entry refers to.
	job *structs.Job
	// next is the job's next launch time.
	next time.Time
	// index is the entry's position in the heap slice; it is maintained by
	// the heap implementation's Push, Pop and Swap.
	index int
}
   485  
   486  func NewPeriodicHeap() *periodicHeap {
   487  	return &periodicHeap{
   488  		index: make(map[structs.NamespacedID]*periodicJob),
   489  		heap:  make(periodicHeapImp, 0),
   490  	}
   491  }
   492  
   493  func (p *periodicHeap) Push(job *structs.Job, next time.Time) error {
   494  	tuple := structs.NamespacedID{
   495  		ID:        job.ID,
   496  		Namespace: job.Namespace,
   497  	}
   498  	if _, ok := p.index[tuple]; ok {
   499  		return fmt.Errorf("job %q (%s) already exists", job.ID, job.Namespace)
   500  	}
   501  
   502  	pJob := &periodicJob{job, next, 0}
   503  	p.index[tuple] = pJob
   504  	heap.Push(&p.heap, pJob)
   505  	return nil
   506  }
   507  
   508  func (p *periodicHeap) Pop() *periodicJob {
   509  	if len(p.heap) == 0 {
   510  		return nil
   511  	}
   512  
   513  	pJob := heap.Pop(&p.heap).(*periodicJob)
   514  	tuple := structs.NamespacedID{
   515  		ID:        pJob.job.ID,
   516  		Namespace: pJob.job.Namespace,
   517  	}
   518  	delete(p.index, tuple)
   519  	return pJob
   520  }
   521  
   522  func (p *periodicHeap) Peek() *periodicJob {
   523  	if len(p.heap) == 0 {
   524  		return nil
   525  	}
   526  
   527  	return p.heap[0]
   528  }
   529  
   530  func (p *periodicHeap) Contains(job *structs.Job) bool {
   531  	tuple := structs.NamespacedID{
   532  		ID:        job.ID,
   533  		Namespace: job.Namespace,
   534  	}
   535  	_, ok := p.index[tuple]
   536  	return ok
   537  }
   538  
   539  func (p *periodicHeap) Update(job *structs.Job, next time.Time) error {
   540  	tuple := structs.NamespacedID{
   541  		ID:        job.ID,
   542  		Namespace: job.Namespace,
   543  	}
   544  	if pJob, ok := p.index[tuple]; ok {
   545  		// Need to update the job as well because its spec can change.
   546  		pJob.job = job
   547  		pJob.next = next
   548  		heap.Fix(&p.heap, pJob.index)
   549  		return nil
   550  	}
   551  
   552  	return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace)
   553  }
   554  
   555  func (p *periodicHeap) Remove(job *structs.Job) error {
   556  	tuple := structs.NamespacedID{
   557  		ID:        job.ID,
   558  		Namespace: job.Namespace,
   559  	}
   560  	if pJob, ok := p.index[tuple]; ok {
   561  		heap.Remove(&p.heap, pJob.index)
   562  		delete(p.index, tuple)
   563  		return nil
   564  	}
   565  
   566  	return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace)
   567  }
   568  
   569  func (p *periodicHeap) Length() int {
   570  	return len(p.heap)
   571  }
   572  
// periodicHeapImp is the slice of entries that is passed to the
// container/heap functions.
type periodicHeapImp []*periodicJob
   574  
   575  func (h periodicHeapImp) Len() int { return len(h) }
   576  
   577  func (h periodicHeapImp) Less(i, j int) bool {
   578  	// Two zero times should return false.
   579  	// Otherwise, zero is "greater" than any other time.
   580  	// (To sort it at the end of the list.)
   581  	// Sort such that zero times are at the end of the list.
   582  	iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero()
   583  	if iZero && jZero {
   584  		return false
   585  	} else if iZero {
   586  		return false
   587  	} else if jZero {
   588  		return true
   589  	}
   590  
   591  	return h[i].next.Before(h[j].next)
   592  }
   593  
   594  func (h periodicHeapImp) Swap(i, j int) {
   595  	h[i], h[j] = h[j], h[i]
   596  	h[i].index = i
   597  	h[j].index = j
   598  }
   599  
   600  func (h *periodicHeapImp) Push(x interface{}) {
   601  	n := len(*h)
   602  	job := x.(*periodicJob)
   603  	job.index = n
   604  	*h = append(*h, job)
   605  }
   606  
   607  func (h *periodicHeapImp) Pop() interface{} {
   608  	old := *h
   609  	n := len(old)
   610  	job := old[n-1]
   611  	job.index = -1 // for safety
   612  	*h = old[0 : n-1]
   613  	return job
   614  }