gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/periodic.go (about)

     1  package nomad
     2  
     3  import (
     4  	"container/heap"
     5  	"context"
     6  	"fmt"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	log "github.com/hashicorp/go-hclog"
    13  	memdb "github.com/hashicorp/go-memdb"
    14  
    15  	"github.com/hashicorp/nomad/helper/uuid"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
// PeriodicDispatch is used to track and launch periodic jobs. It maintains the
// set of periodic jobs and creates derived jobs and evaluations per
// instantiation which is determined by the periodic spec.
type PeriodicDispatch struct {
	// dispatcher submits derived jobs and reports whether a periodic job
	// has running children.
	dispatcher JobEvalDispatcher
	// enabled gates Add, Remove, ForceRun and the run loop; it is toggled
	// through SetEnabled.
	enabled    bool

	// tracked holds the periodic jobs being managed, keyed by namespace and
	// job ID.
	tracked map[structs.NamespacedID]*structs.Job
	// heap orders the tracked jobs by their next launch time.
	heap    *periodicHeap

	// updateCh is signaled (without blocking) when a job is added, updated
	// or removed so the run loop re-reads the next launch.
	updateCh chan struct{}
	// stopFn cancels the context handed to the run loop; it is set while
	// the dispatcher is enabled and cleared by flush.
	stopFn   context.CancelFunc
	logger   log.Logger
	// l is held by the methods that read or modify enabled, tracked and
	// heap.
	l        sync.RWMutex
}
    34  
// JobEvalDispatcher is an interface to submit jobs and have evaluations created
// for them.
type JobEvalDispatcher interface {
	// DispatchJob takes a new, untracked job, creates an evaluation for it
	// and returns the eval.
	DispatchJob(job *structs.Job) (*structs.Evaluation, error)

	// RunningChildren returns whether the passed job has any running children.
	RunningChildren(job *structs.Job) (bool, error)
}
    45  
    46  // DispatchJob creates an evaluation for the passed job and commits both the
    47  // evaluation and the job to the raft log. It returns the eval.
    48  func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) {
    49  	// Commit this update via Raft
    50  	job.SetSubmitTime()
    51  	req := structs.JobRegisterRequest{
    52  		Job: job,
    53  		WriteRequest: structs.WriteRequest{
    54  			Namespace: job.Namespace,
    55  		},
    56  	}
    57  	fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req)
    58  	if err, ok := fsmErr.(error); ok && err != nil {
    59  		return nil, err
    60  	}
    61  	if err != nil {
    62  		return nil, err
    63  	}
    64  
    65  	// Create a new evaluation
    66  	now := time.Now().UTC().UnixNano()
    67  	eval := &structs.Evaluation{
    68  		ID:             uuid.Generate(),
    69  		Namespace:      job.Namespace,
    70  		Priority:       job.Priority,
    71  		Type:           job.Type,
    72  		TriggeredBy:    structs.EvalTriggerPeriodicJob,
    73  		JobID:          job.ID,
    74  		JobModifyIndex: index,
    75  		Status:         structs.EvalStatusPending,
    76  		CreateTime:     now,
    77  		ModifyTime:     now,
    78  	}
    79  	update := &structs.EvalUpdateRequest{
    80  		Evals: []*structs.Evaluation{eval},
    81  	}
    82  
    83  	// Commit this evaluation via Raft
    84  	// XXX: There is a risk of partial failure where the JobRegister succeeds
    85  	// but that the EvalUpdate does not.
    86  	_, evalIndex, err := s.raftApply(structs.EvalUpdateRequestType, update)
    87  	if err != nil {
    88  		return nil, err
    89  	}
    90  
    91  	// Update its indexes.
    92  	eval.CreateIndex = evalIndex
    93  	eval.ModifyIndex = evalIndex
    94  	return eval, nil
    95  }
    96  
    97  // RunningChildren checks whether the passed job has any running children.
    98  func (s *Server) RunningChildren(job *structs.Job) (bool, error) {
    99  	state, err := s.fsm.State().Snapshot()
   100  	if err != nil {
   101  		return false, err
   102  	}
   103  
   104  	ws := memdb.NewWatchSet()
   105  	prefix := fmt.Sprintf("%s%s", job.ID, structs.PeriodicLaunchSuffix)
   106  	iter, err := state.JobsByIDPrefix(ws, job.Namespace, prefix)
   107  	if err != nil {
   108  		return false, err
   109  	}
   110  
   111  	var child *structs.Job
   112  	for i := iter.Next(); i != nil; i = iter.Next() {
   113  		child = i.(*structs.Job)
   114  
   115  		// Ensure the job is actually a child.
   116  		if child.ParentID != job.ID {
   117  			continue
   118  		}
   119  
   120  		// Get the childs evaluations.
   121  		evals, err := state.EvalsByJob(ws, child.Namespace, child.ID)
   122  		if err != nil {
   123  			return false, err
   124  		}
   125  
   126  		// Check if any of the evals are active or have running allocations.
   127  		for _, eval := range evals {
   128  			if !eval.TerminalStatus() {
   129  				return true, nil
   130  			}
   131  
   132  			allocs, err := state.AllocsByEval(ws, eval.ID)
   133  			if err != nil {
   134  				return false, err
   135  			}
   136  
   137  			for _, alloc := range allocs {
   138  				if !alloc.TerminalStatus() {
   139  					return true, nil
   140  				}
   141  			}
   142  		}
   143  	}
   144  
   145  	// There are no evals or allocations that aren't terminal.
   146  	return false, nil
   147  }
   148  
   149  // NewPeriodicDispatch returns a periodic dispatcher that is used to track and
   150  // launch periodic jobs.
   151  func NewPeriodicDispatch(logger log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch {
   152  	return &PeriodicDispatch{
   153  		dispatcher: dispatcher,
   154  		tracked:    make(map[structs.NamespacedID]*structs.Job),
   155  		heap:       NewPeriodicHeap(),
   156  		updateCh:   make(chan struct{}, 1),
   157  		logger:     logger.Named("periodic"),
   158  	}
   159  }
   160  
   161  // SetEnabled is used to control if the periodic dispatcher is enabled. It
   162  // should only be enabled on the active leader. Disabling an active dispatcher
   163  // will stop any launched go routine and flush the dispatcher.
   164  func (p *PeriodicDispatch) SetEnabled(enabled bool) {
   165  	p.l.Lock()
   166  	defer p.l.Unlock()
   167  	wasRunning := p.enabled
   168  	p.enabled = enabled
   169  
   170  	// If we are transitioning from enabled to disabled, stop the daemon and
   171  	// flush.
   172  	if !enabled && wasRunning {
   173  		p.stopFn()
   174  		p.flush()
   175  	} else if enabled && !wasRunning {
   176  		// If we are transitioning from disabled to enabled, run the daemon.
   177  		ctx, cancel := context.WithCancel(context.Background())
   178  		p.stopFn = cancel
   179  		go p.run(ctx, p.updateCh)
   180  	}
   181  }
   182  
   183  // Tracked returns the set of tracked job IDs.
   184  func (p *PeriodicDispatch) Tracked() []*structs.Job {
   185  	p.l.RLock()
   186  	defer p.l.RUnlock()
   187  	tracked := make([]*structs.Job, len(p.tracked))
   188  	i := 0
   189  	for _, job := range p.tracked {
   190  		tracked[i] = job
   191  		i++
   192  	}
   193  	return tracked
   194  }
   195  
   196  // Add begins tracking of a periodic job. If it is already tracked, it acts as
   197  // an update to the jobs periodic spec. The method returns whether the job was
   198  // added and any error that may have occurred.
   199  func (p *PeriodicDispatch) Add(job *structs.Job) error {
   200  	p.l.Lock()
   201  	defer p.l.Unlock()
   202  
   203  	// Do nothing if not enabled
   204  	if !p.enabled {
   205  		return nil
   206  	}
   207  
   208  	// If we were tracking a job and it has been disabled, made non-periodic,
   209  	// stopped or is parameterized, remove it
   210  	disabled := !job.IsPeriodicActive()
   211  
   212  	tuple := structs.NamespacedID{
   213  		ID:        job.ID,
   214  		Namespace: job.Namespace,
   215  	}
   216  	_, tracked := p.tracked[tuple]
   217  	if disabled {
   218  		if tracked {
   219  			p.removeLocked(tuple)
   220  		}
   221  
   222  		// If the job is disabled and we aren't tracking it, do nothing.
   223  		return nil
   224  	}
   225  
   226  	// Add or update the job.
   227  	p.tracked[tuple] = job
   228  	next, err := job.Periodic.Next(time.Now().In(job.Periodic.GetLocation()))
   229  	if err != nil {
   230  		return fmt.Errorf("failed adding job %s: %v", job.NamespacedID(), err)
   231  	}
   232  	if tracked {
   233  		if err := p.heap.Update(job, next); err != nil {
   234  			return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err)
   235  		}
   236  		p.logger.Debug("updated periodic job", "job", job.NamespacedID())
   237  	} else {
   238  		if err := p.heap.Push(job, next); err != nil {
   239  			return fmt.Errorf("failed to add job %v: %v", job.ID, err)
   240  		}
   241  		p.logger.Debug("registered periodic job", "job", job.NamespacedID())
   242  	}
   243  
   244  	// Signal an update.
   245  	select {
   246  	case p.updateCh <- struct{}{}:
   247  	default:
   248  	}
   249  
   250  	return nil
   251  }
   252  
   253  // Remove stops tracking the passed job. If the job is not tracked, it is a
   254  // no-op.
   255  func (p *PeriodicDispatch) Remove(namespace, jobID string) error {
   256  	p.l.Lock()
   257  	defer p.l.Unlock()
   258  	return p.removeLocked(structs.NamespacedID{
   259  		ID:        jobID,
   260  		Namespace: namespace,
   261  	})
   262  }
   263  
   264  // Remove stops tracking the passed job. If the job is not tracked, it is a
   265  // no-op. It assumes this is called while a lock is held.
   266  func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error {
   267  	// Do nothing if not enabled
   268  	if !p.enabled {
   269  		return nil
   270  	}
   271  
   272  	job, tracked := p.tracked[jobID]
   273  	if !tracked {
   274  		return nil
   275  	}
   276  
   277  	delete(p.tracked, jobID)
   278  	if err := p.heap.Remove(job); err != nil {
   279  		return fmt.Errorf("failed to remove tracked job %q (%s): %v", jobID.ID, jobID.Namespace, err)
   280  	}
   281  
   282  	// Signal an update.
   283  	select {
   284  	case p.updateCh <- struct{}{}:
   285  	default:
   286  	}
   287  
   288  	p.logger.Debug("deregistered periodic job", "job", job.NamespacedID())
   289  	return nil
   290  }
   291  
   292  // ForceRun causes the periodic job to be evaluated immediately and returns the
   293  // subsequent eval.
   294  func (p *PeriodicDispatch) ForceRun(namespace, jobID string) (*structs.Evaluation, error) {
   295  	p.l.Lock()
   296  
   297  	// Do nothing if not enabled
   298  	if !p.enabled {
   299  		p.l.Unlock()
   300  		return nil, fmt.Errorf("periodic dispatch disabled")
   301  	}
   302  
   303  	tuple := structs.NamespacedID{
   304  		ID:        jobID,
   305  		Namespace: namespace,
   306  	}
   307  	job, tracked := p.tracked[tuple]
   308  	if !tracked {
   309  		p.l.Unlock()
   310  		return nil, fmt.Errorf("can't force run non-tracked job %q (%s)", jobID, namespace)
   311  	}
   312  
   313  	p.l.Unlock()
   314  	return p.createEval(job, time.Now().In(job.Periodic.GetLocation()))
   315  }
   316  
   317  // shouldRun returns whether the long lived run function should run.
   318  func (p *PeriodicDispatch) shouldRun() bool {
   319  	p.l.RLock()
   320  	defer p.l.RUnlock()
   321  	return p.enabled
   322  }
   323  
   324  // run is a long-lived function that waits till a job's periodic spec is met and
   325  // then creates an evaluation to run the job.
   326  func (p *PeriodicDispatch) run(ctx context.Context, updateCh <-chan struct{}) {
   327  	var launchCh <-chan time.Time
   328  	for p.shouldRun() {
   329  		job, launch := p.nextLaunch()
   330  		if launch.IsZero() {
   331  			launchCh = nil
   332  		} else {
   333  			launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation()))
   334  			launchCh = time.After(launchDur)
   335  			p.logger.Debug("scheduled periodic job launch", "launch_delay", launchDur, "job", job.NamespacedID())
   336  		}
   337  
   338  		select {
   339  		case <-ctx.Done():
   340  			return
   341  		case <-updateCh:
   342  			continue
   343  		case <-launchCh:
   344  			p.dispatch(job, launch)
   345  		}
   346  	}
   347  }
   348  
   349  // dispatch creates an evaluation for the job and updates its next launchtime
   350  // based on the passed launch time.
   351  func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) {
   352  	p.l.Lock()
   353  
   354  	nextLaunch, err := job.Periodic.Next(launchTime)
   355  	if err != nil {
   356  		p.logger.Error("failed to parse next periodic launch", "job", job.NamespacedID(), "error", err)
   357  	} else if err := p.heap.Update(job, nextLaunch); err != nil {
   358  		p.logger.Error("failed to update next launch of periodic job", "job", job.NamespacedID(), "error", err)
   359  	}
   360  
   361  	// If the job prohibits overlapping and there are running children, we skip
   362  	// the launch.
   363  	if job.Periodic.ProhibitOverlap {
   364  		running, err := p.dispatcher.RunningChildren(job)
   365  		if err != nil {
   366  			p.logger.Error("failed to determine if periodic job has running children", "job", job.NamespacedID(), "error", err)
   367  			p.l.Unlock()
   368  			return
   369  		}
   370  
   371  		if running {
   372  			p.logger.Debug("skipping launch of periodic job because job prohibits overlap", "job", job.NamespacedID())
   373  			p.l.Unlock()
   374  			return
   375  		}
   376  	}
   377  
   378  	p.logger.Debug(" launching job", "job", job.NamespacedID(), "launch_time", launchTime)
   379  	p.l.Unlock()
   380  	p.createEval(job, launchTime)
   381  }
   382  
   383  // nextLaunch returns the next job to launch and when it should be launched. If
   384  // the next job can't be determined, an error is returned. If the dispatcher is
   385  // stopped, a nil job will be returned.
   386  func (p *PeriodicDispatch) nextLaunch() (*structs.Job, time.Time) {
   387  	// If there is nothing wait for an update.
   388  	p.l.RLock()
   389  	defer p.l.RUnlock()
   390  	if p.heap.Length() == 0 {
   391  		return nil, time.Time{}
   392  	}
   393  
   394  	nextJob := p.heap.Peek()
   395  	if nextJob == nil {
   396  		return nil, time.Time{}
   397  	}
   398  
   399  	return nextJob.job, nextJob.next
   400  }
   401  
   402  // createEval instantiates a job based on the passed periodic job and submits an
   403  // evaluation for it. This should not be called with the lock held.
   404  func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) (*structs.Evaluation, error) {
   405  	derived, err := p.deriveJob(periodicJob, time)
   406  	if err != nil {
   407  		return nil, err
   408  	}
   409  
   410  	eval, err := p.dispatcher.DispatchJob(derived)
   411  	if err != nil {
   412  		p.logger.Error("failed to dispatch job", "job", periodicJob.NamespacedID(), "error", err)
   413  		return nil, err
   414  	}
   415  
   416  	return eval, nil
   417  }
   418  
   419  // deriveJob instantiates a new job based on the passed periodic job and the
   420  // launch time.
   421  func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (
   422  	derived *structs.Job, err error) {
   423  
   424  	// Have to recover in case the job copy panics.
   425  	defer func() {
   426  		if r := recover(); r != nil {
   427  			p.logger.Error("deriving child job from periodic job failed; deregistering from periodic runner",
   428  				"job", periodicJob.NamespacedID(), "error", r)
   429  
   430  			p.Remove(periodicJob.Namespace, periodicJob.ID)
   431  			derived = nil
   432  			err = fmt.Errorf("Failed to create a copy of the periodic job %q (%s): %v",
   433  				periodicJob.ID, periodicJob.Namespace, r)
   434  		}
   435  	}()
   436  
   437  	// Create a copy of the periodic job, give it a derived ID/Name and make it
   438  	// non-periodic.
   439  	derived = periodicJob.Copy()
   440  	derived.ParentID = periodicJob.ID
   441  	derived.ID = p.derivedJobID(periodicJob, time)
   442  	derived.Name = derived.ID
   443  	derived.Periodic = nil
   444  	return
   445  }
   446  
   447  // deriveJobID returns a job ID based on the parent periodic job and the launch
   448  // time.
   449  func (p *PeriodicDispatch) derivedJobID(periodicJob *structs.Job, time time.Time) string {
   450  	return fmt.Sprintf("%s%s%d", periodicJob.ID, structs.PeriodicLaunchSuffix, time.Unix())
   451  }
   452  
   453  // LaunchTime returns the launch time of the job. This is only valid for
   454  // jobs created by PeriodicDispatch and will otherwise return an error.
   455  func (p *PeriodicDispatch) LaunchTime(jobID string) (time.Time, error) {
   456  	index := strings.LastIndex(jobID, structs.PeriodicLaunchSuffix)
   457  	if index == -1 {
   458  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   459  	}
   460  
   461  	launch, err := strconv.Atoi(jobID[index+len(structs.PeriodicLaunchSuffix):])
   462  	if err != nil {
   463  		return time.Time{}, fmt.Errorf("couldn't parse launch time from eval: %v", jobID)
   464  	}
   465  
   466  	return time.Unix(int64(launch), 0), nil
   467  }
   468  
   469  // flush clears the state of the PeriodicDispatcher
   470  func (p *PeriodicDispatch) flush() {
   471  	p.updateCh = make(chan struct{}, 1)
   472  	p.tracked = make(map[structs.NamespacedID]*structs.Job)
   473  	p.heap = NewPeriodicHeap()
   474  	p.stopFn = nil
   475  }
   476  
// periodicHeap wraps a heap and gives operations other than Push/Pop.
type periodicHeap struct {
	// index maps a job's namespace and ID to its heap entry so entries can
	// be found for Contains, Update and Remove.
	index map[structs.NamespacedID]*periodicJob
	// heap is the underlying container/heap implementation.
	heap  periodicHeapImp
}
   482  
// periodicJob is a heap entry pairing a job with its next launch time.
type periodicJob struct {
	job   *structs.Job
	// next is the job's next launch time; the heap orders entries by it.
	next  time.Time
	// index is the entry's position in the heap slice. It is maintained by
	// the heap's Swap and Push and set to -1 once the entry is popped.
	index int
}
   488  
   489  func NewPeriodicHeap() *periodicHeap {
   490  	return &periodicHeap{
   491  		index: make(map[structs.NamespacedID]*periodicJob),
   492  		heap:  make(periodicHeapImp, 0),
   493  	}
   494  }
   495  
   496  func (p *periodicHeap) Push(job *structs.Job, next time.Time) error {
   497  	tuple := structs.NamespacedID{
   498  		ID:        job.ID,
   499  		Namespace: job.Namespace,
   500  	}
   501  	if _, ok := p.index[tuple]; ok {
   502  		return fmt.Errorf("job %q (%s) already exists", job.ID, job.Namespace)
   503  	}
   504  
   505  	pJob := &periodicJob{job, next, 0}
   506  	p.index[tuple] = pJob
   507  	heap.Push(&p.heap, pJob)
   508  	return nil
   509  }
   510  
   511  func (p *periodicHeap) Pop() *periodicJob {
   512  	if len(p.heap) == 0 {
   513  		return nil
   514  	}
   515  
   516  	pJob := heap.Pop(&p.heap).(*periodicJob)
   517  	tuple := structs.NamespacedID{
   518  		ID:        pJob.job.ID,
   519  		Namespace: pJob.job.Namespace,
   520  	}
   521  	delete(p.index, tuple)
   522  	return pJob
   523  }
   524  
   525  func (p *periodicHeap) Peek() *periodicJob {
   526  	if len(p.heap) == 0 {
   527  		return nil
   528  	}
   529  
   530  	return p.heap[0]
   531  }
   532  
   533  func (p *periodicHeap) Contains(job *structs.Job) bool {
   534  	tuple := structs.NamespacedID{
   535  		ID:        job.ID,
   536  		Namespace: job.Namespace,
   537  	}
   538  	_, ok := p.index[tuple]
   539  	return ok
   540  }
   541  
   542  func (p *periodicHeap) Update(job *structs.Job, next time.Time) error {
   543  	tuple := structs.NamespacedID{
   544  		ID:        job.ID,
   545  		Namespace: job.Namespace,
   546  	}
   547  	if pJob, ok := p.index[tuple]; ok {
   548  		// Need to update the job as well because its spec can change.
   549  		pJob.job = job
   550  		pJob.next = next
   551  		heap.Fix(&p.heap, pJob.index)
   552  		return nil
   553  	}
   554  
   555  	return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace)
   556  }
   557  
   558  func (p *periodicHeap) Remove(job *structs.Job) error {
   559  	tuple := structs.NamespacedID{
   560  		ID:        job.ID,
   561  		Namespace: job.Namespace,
   562  	}
   563  	if pJob, ok := p.index[tuple]; ok {
   564  		heap.Remove(&p.heap, pJob.index)
   565  		delete(p.index, tuple)
   566  		return nil
   567  	}
   568  
   569  	return fmt.Errorf("heap doesn't contain job %q (%s)", job.ID, job.Namespace)
   570  }
   571  
   572  func (p *periodicHeap) Length() int {
   573  	return len(p.heap)
   574  }
   575  
   576  type periodicHeapImp []*periodicJob
   577  
   578  func (h periodicHeapImp) Len() int { return len(h) }
   579  
   580  func (h periodicHeapImp) Less(i, j int) bool {
   581  	// Two zero times should return false.
   582  	// Otherwise, zero is "greater" than any other time.
   583  	// (To sort it at the end of the list.)
   584  	// Sort such that zero times are at the end of the list.
   585  	iZero, jZero := h[i].next.IsZero(), h[j].next.IsZero()
   586  	if iZero && jZero {
   587  		return false
   588  	} else if iZero {
   589  		return false
   590  	} else if jZero {
   591  		return true
   592  	}
   593  
   594  	return h[i].next.Before(h[j].next)
   595  }
   596  
   597  func (h periodicHeapImp) Swap(i, j int) {
   598  	h[i], h[j] = h[j], h[i]
   599  	h[i].index = i
   600  	h[j].index = j
   601  }
   602  
   603  func (h *periodicHeapImp) Push(x interface{}) {
   604  	n := len(*h)
   605  	job := x.(*periodicJob)
   606  	job.index = n
   607  	*h = append(*h, job)
   608  }
   609  
   610  func (h *periodicHeapImp) Pop() interface{} {
   611  	old := *h
   612  	n := len(old)
   613  	job := old[n-1]
   614  	job.index = -1 // for safety
   615  	*h = old[0 : n-1]
   616  	return job
   617  }