github.com/hugh712/snapd@v0.0.0-20200910133618-1a99902bd583/overlord/state/taskrunner.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2016 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package state
    21  
    22  import (
    23  	"sync"
    24  	"time"
    25  
    26  	"gopkg.in/tomb.v2"
    27  
    28  	"github.com/snapcore/snapd/logger"
    29  )
    30  
    31  // HandlerFunc is the type of function for the handlers
    32  type HandlerFunc func(task *Task, tomb *tomb.Tomb) error
    33  
    34  // Retry is returned from a handler to signal that is ok to rerun the
    35  // task at a later point. It's to be used also when a task goroutine
    36  // is asked to stop through its tomb. After can be used to indicate
    37  // how much to postpone the retry, 0 (the default) means at the next
    38  // ensure pass and is what should be used if stopped through its tomb.
    39  // Reason is an optional explanation of the conflict.
    40  type Retry struct {
    41  	After  time.Duration
    42  	Reason string
    43  }
    44  
    45  func (r *Retry) Error() string {
    46  	return "task should be retried"
    47  }
    48  
    49  type blockedFunc func(t *Task, running []*Task) bool
    50  
    51  // TaskRunner controls the running of goroutines to execute known task kinds.
    52  type TaskRunner struct {
    53  	state *State
    54  
    55  	// locking
    56  	mu       sync.Mutex
    57  	handlers map[string]handlerPair
    58  	optional []optionalHandler
    59  	cleanups map[string]HandlerFunc
    60  	stopped  bool
    61  
    62  	blocked     []blockedFunc
    63  	someBlocked bool
    64  
    65  	// optional callback executed on task errors
    66  	taskErrorCallback func(err error)
    67  
    68  	// go-routines lifecycle
    69  	tombs map[string]*tomb.Tomb
    70  }
    71  
    72  type handlerPair struct {
    73  	do, undo HandlerFunc
    74  }
    75  
    76  type optionalHandler struct {
    77  	match func(t *Task) bool
    78  	handlerPair
    79  }
    80  
    81  // NewTaskRunner creates a new TaskRunner
    82  func NewTaskRunner(s *State) *TaskRunner {
    83  	return &TaskRunner{
    84  		state:    s,
    85  		handlers: make(map[string]handlerPair),
    86  		cleanups: make(map[string]HandlerFunc),
    87  		tombs:    make(map[string]*tomb.Tomb),
    88  	}
    89  }
    90  
    91  // OnTaskError sets an error callback executed when any task errors out.
    92  func (r *TaskRunner) OnTaskError(f func(err error)) {
    93  	r.taskErrorCallback = f
    94  }
    95  
    96  // AddHandler registers the functions to concurrently call for doing and
    97  // undoing tasks of the given kind. The undo handler may be nil.
    98  func (r *TaskRunner) AddHandler(kind string, do, undo HandlerFunc) {
    99  	r.mu.Lock()
   100  	defer r.mu.Unlock()
   101  
   102  	r.handlers[kind] = handlerPair{do, undo}
   103  }
   104  
   105  // AddOptionalHandler register functions for doing and undoing tasks that match
   106  // the given predicate if no explicit handler was registered for the task kind.
   107  func (r *TaskRunner) AddOptionalHandler(match func(t *Task) bool, do, undo HandlerFunc) {
   108  	r.optional = append(r.optional, optionalHandler{match, handlerPair{do, undo}})
   109  }
   110  
   111  func (r *TaskRunner) handlerPair(t *Task) handlerPair {
   112  	if handler, ok := r.handlers[t.Kind()]; ok {
   113  		return handler
   114  	}
   115  	for _, h := range r.optional {
   116  		if h.match(t) {
   117  			return h.handlerPair
   118  		}
   119  	}
   120  	return handlerPair{}
   121  }
   122  
   123  // KnownTaskKinds returns all tasks kinds handled by this runner.
   124  func (r *TaskRunner) KnownTaskKinds() []string {
   125  	kinds := make([]string, 0, len(r.handlers))
   126  	for h := range r.handlers {
   127  		kinds = append(kinds, h)
   128  	}
   129  	return kinds
   130  }
   131  
   132  // AddCleanup registers a function to be called after the change completes,
   133  // for cleaning up data left behind by tasks of the specified kind.
   134  // The provided function will be called no matter what the final status of the
   135  // task is. This mechanism enables keeping data around for a potential undo
   136  // until there's no more chance of the task being undone.
   137  //
   138  // The cleanup function is run concurrently with other cleanup functions,
   139  // despite any wait ordering between the tasks. If it returns an error,
   140  // it will be retried later.
   141  //
   142  // The handler for tasks of the provided kind must have been previously
   143  // registered before AddCleanup is called for it.
   144  func (r *TaskRunner) AddCleanup(kind string, cleanup HandlerFunc) {
   145  	r.mu.Lock()
   146  	defer r.mu.Unlock()
   147  	if _, ok := r.handlers[kind]; !ok {
   148  		panic("internal error: attempted to register cleanup for unknown task kind")
   149  	}
   150  	r.cleanups[kind] = cleanup
   151  }
   152  
   153  // SetBlocked sets a predicate function to decide whether to block a task from running based on the current running tasks. It can be used to control task serialisation.
   154  func (r *TaskRunner) SetBlocked(pred func(t *Task, running []*Task) bool) {
   155  	r.mu.Lock()
   156  	defer r.mu.Unlock()
   157  
   158  	r.blocked = []blockedFunc{pred}
   159  }
   160  
   161  // AddBlocked adds a predicate function to decide whether to block a task from running based on the current running tasks. It can be used to control task serialisation. All added predicates are considered in turn until one returns true, or none.
   162  func (r *TaskRunner) AddBlocked(pred func(t *Task, running []*Task) bool) {
   163  	r.mu.Lock()
   164  	defer r.mu.Unlock()
   165  
   166  	r.blocked = append(r.blocked, pred)
   167  }
   168  
   169  // run must be called with the state lock in place
   170  func (r *TaskRunner) run(t *Task) {
   171  	var handler HandlerFunc
   172  	var accuRuntime func(dur time.Duration)
   173  	switch t.Status() {
   174  	case DoStatus:
   175  		t.SetStatus(DoingStatus)
   176  		fallthrough
   177  	case DoingStatus:
   178  		handler = r.handlerPair(t).do
   179  		accuRuntime = t.accumulateDoingTime
   180  
   181  	case UndoStatus:
   182  		t.SetStatus(UndoingStatus)
   183  		fallthrough
   184  	case UndoingStatus:
   185  		handler = r.handlerPair(t).undo
   186  		accuRuntime = t.accumulateUndoingTime
   187  
   188  	default:
   189  		panic("internal error: attempted to run task in status " + t.Status().String())
   190  	}
   191  	if handler == nil {
   192  		panic("internal error: attempted to run task with nil handler for status " + t.Status().String())
   193  	}
   194  
   195  	t.At(time.Time{}) // clear schedule
   196  	tomb := &tomb.Tomb{}
   197  	r.tombs[t.ID()] = tomb
   198  	tomb.Go(func() error {
   199  		// Capture the error result with tomb.Kill so we can
   200  		// use tomb.Err uniformily to consider both it or a
   201  		// overriding previous Kill reason.
   202  		t0 := time.Now()
   203  		tomb.Kill(handler(t, tomb))
   204  		t1 := time.Now()
   205  
   206  		// Locks must be acquired in the same order everywhere.
   207  		r.mu.Lock()
   208  		defer r.mu.Unlock()
   209  		r.state.Lock()
   210  		defer r.state.Unlock()
   211  		accuRuntime(t1.Sub(t0))
   212  
   213  		delete(r.tombs, t.ID())
   214  
   215  		// some tasks were blocked, now there's chance the
   216  		// blocked predicate will change its value
   217  		if r.someBlocked {
   218  			r.state.EnsureBefore(0)
   219  		}
   220  
   221  		err := tomb.Err()
   222  		switch err.(type) {
   223  		case nil:
   224  			// we are ok
   225  		case *Retry:
   226  			// preserve
   227  		default:
   228  			if r.stopped {
   229  				// we are shutting down, errors might be due
   230  				// to cancellations, to be safe retry
   231  				err = &Retry{}
   232  			}
   233  		}
   234  
   235  		switch x := err.(type) {
   236  		case *Retry:
   237  			// Handler asked to be called again later.
   238  			// TODO Allow postponing retries past the next Ensure.
   239  			if t.Status() == AbortStatus {
   240  				// Would work without it but might take two ensures.
   241  				r.tryUndo(t)
   242  			} else if x.After != 0 {
   243  				t.At(timeNow().Add(x.After))
   244  			}
   245  		case nil:
   246  			var next []*Task
   247  			switch t.Status() {
   248  			case DoingStatus:
   249  				t.SetStatus(DoneStatus)
   250  				fallthrough
   251  			case DoneStatus:
   252  				next = t.HaltTasks()
   253  			case AbortStatus:
   254  				// It was actually Done if it got here.
   255  				t.SetStatus(UndoStatus)
   256  				r.state.EnsureBefore(0)
   257  			case UndoingStatus:
   258  				t.SetStatus(UndoneStatus)
   259  				fallthrough
   260  			case UndoneStatus:
   261  				next = t.WaitTasks()
   262  			}
   263  			if len(next) > 0 {
   264  				r.state.EnsureBefore(0)
   265  			}
   266  		default:
   267  			r.abortLanes(t.Change(), t.Lanes())
   268  			t.SetStatus(ErrorStatus)
   269  			t.Errorf("%s", err)
   270  			// ensure the error is available in the global log too
   271  			logger.Noticef("[change %s %q task] failed: %v", t.Change().ID(), t.Summary(), err)
   272  			if r.taskErrorCallback != nil {
   273  				r.taskErrorCallback(err)
   274  			}
   275  		}
   276  
   277  		return nil
   278  	})
   279  }
   280  
   281  func (r *TaskRunner) clean(t *Task) {
   282  	if !t.Change().IsReady() {
   283  		// Whole Change is not ready so don't run cleanups yet.
   284  		return
   285  	}
   286  
   287  	cleanup, ok := r.cleanups[t.Kind()]
   288  	if !ok {
   289  		t.SetClean()
   290  		return
   291  	}
   292  
   293  	tomb := &tomb.Tomb{}
   294  	r.tombs[t.ID()] = tomb
   295  	tomb.Go(func() error {
   296  		tomb.Kill(cleanup(t, tomb))
   297  
   298  		// Locks must be acquired in the same order everywhere.
   299  		r.mu.Lock()
   300  		defer r.mu.Unlock()
   301  		r.state.Lock()
   302  		defer r.state.Unlock()
   303  
   304  		delete(r.tombs, t.ID())
   305  
   306  		if tomb.Err() != nil {
   307  			logger.Debugf("Cleaning task %s: %s", t.ID(), tomb.Err())
   308  		} else {
   309  			t.SetClean()
   310  		}
   311  		return nil
   312  	})
   313  }
   314  
   315  func (r *TaskRunner) abortLanes(chg *Change, lanes []int) {
   316  	chg.AbortLanes(lanes)
   317  	ensureScheduled := false
   318  	for _, t := range chg.Tasks() {
   319  		status := t.Status()
   320  		if status == AbortStatus {
   321  			if tb, ok := r.tombs[t.ID()]; ok {
   322  				tb.Kill(nil)
   323  			}
   324  		}
   325  		if !ensureScheduled && !status.Ready() {
   326  			ensureScheduled = true
   327  			r.state.EnsureBefore(0)
   328  		}
   329  	}
   330  }
   331  
   332  // tryUndo replaces the status of a knowingly aborted task.
   333  func (r *TaskRunner) tryUndo(t *Task) {
   334  	if t.Status() == AbortStatus && r.handlerPair(t).undo == nil {
   335  		// Cannot undo but it was stopped in flight.
   336  		// Hold so it doesn't look like it finished.
   337  		t.SetStatus(HoldStatus)
   338  		if len(t.WaitTasks()) > 0 {
   339  			r.state.EnsureBefore(0)
   340  		}
   341  	} else {
   342  		t.SetStatus(UndoStatus)
   343  		r.state.EnsureBefore(0)
   344  	}
   345  }
   346  
   347  // Ensure starts new goroutines for all known tasks with no pending
   348  // dependencies.
   349  // Note that Ensure will lock the state.
   350  func (r *TaskRunner) Ensure() error {
   351  	r.mu.Lock()
   352  	defer r.mu.Unlock()
   353  
   354  	if r.stopped {
   355  		// we are stopping, don't run another ensure
   356  		return nil
   357  	}
   358  
   359  	// Locks must be acquired in the same order everywhere.
   360  	r.state.Lock()
   361  	defer r.state.Unlock()
   362  
   363  	r.someBlocked = false
   364  	running := make([]*Task, 0, len(r.tombs))
   365  	for tid := range r.tombs {
   366  		t := r.state.Task(tid)
   367  		if t != nil {
   368  			running = append(running, t)
   369  		}
   370  	}
   371  
   372  	ensureTime := timeNow()
   373  	nextTaskTime := time.Time{}
   374  ConsiderTasks:
   375  	for _, t := range r.state.Tasks() {
   376  		handlers := r.handlerPair(t)
   377  		if handlers.do == nil {
   378  			// Handled by a different runner instance.
   379  			continue
   380  		}
   381  
   382  		tb := r.tombs[t.ID()]
   383  
   384  		if t.Status() == AbortStatus {
   385  			if tb != nil {
   386  				tb.Kill(nil)
   387  				continue
   388  			}
   389  			r.tryUndo(t)
   390  		}
   391  
   392  		if tb != nil {
   393  			// Already being handled.
   394  			continue
   395  		}
   396  
   397  		status := t.Status()
   398  		if status.Ready() {
   399  			if !t.IsClean() {
   400  				r.clean(t)
   401  			}
   402  			continue
   403  		}
   404  
   405  		if mustWait(t) {
   406  			// Dependencies still unhandled.
   407  			continue
   408  		}
   409  
   410  		if status == UndoStatus && handlers.undo == nil {
   411  			// Although this has no dependencies itself, it must have waited
   412  			// above too since follow up tasks may have handlers again.
   413  			// Cannot undo. Revert to done status.
   414  			t.SetStatus(DoneStatus)
   415  			if len(t.WaitTasks()) > 0 {
   416  				r.state.EnsureBefore(0)
   417  			}
   418  			continue
   419  		}
   420  
   421  		// skip tasks scheduled for later and also track the earliest one
   422  		tWhen := t.AtTime()
   423  		if !tWhen.IsZero() && ensureTime.Before(tWhen) {
   424  			if nextTaskTime.IsZero() || nextTaskTime.After(tWhen) {
   425  				nextTaskTime = tWhen
   426  			}
   427  			continue
   428  		}
   429  
   430  		// check if any of the blocked predicates returns true
   431  		// and skip the task if so
   432  		for _, blocked := range r.blocked {
   433  			if blocked(t, running) {
   434  				r.someBlocked = true
   435  				continue ConsiderTasks
   436  			}
   437  		}
   438  
   439  		logger.Debugf("Running task %s on %s: %s", t.ID(), t.Status(), t.Summary())
   440  		r.run(t)
   441  
   442  		running = append(running, t)
   443  	}
   444  
   445  	// schedule next Ensure no later than the next task time
   446  	if !nextTaskTime.IsZero() {
   447  		r.state.EnsureBefore(nextTaskTime.Sub(ensureTime))
   448  	}
   449  
   450  	return nil
   451  }
   452  
   453  // mustWait returns whether task t must wait for other tasks to be done.
   454  func mustWait(t *Task) bool {
   455  	switch t.Status() {
   456  	case DoStatus:
   457  		for _, wt := range t.WaitTasks() {
   458  			if wt.Status() != DoneStatus {
   459  				return true
   460  			}
   461  		}
   462  	case UndoStatus:
   463  		for _, ht := range t.HaltTasks() {
   464  			if !ht.Status().Ready() {
   465  				return true
   466  			}
   467  		}
   468  	}
   469  	return false
   470  }
   471  
   472  // wait expects to be called with th r.mu lock held
   473  func (r *TaskRunner) wait() {
   474  	for len(r.tombs) > 0 {
   475  		for _, t := range r.tombs {
   476  			r.mu.Unlock()
   477  			t.Wait()
   478  			r.mu.Lock()
   479  			break
   480  		}
   481  	}
   482  }
   483  
   484  // Stop kills all concurrent activities and returns after that's done.
   485  func (r *TaskRunner) Stop() {
   486  	r.mu.Lock()
   487  	defer r.mu.Unlock()
   488  
   489  	r.stopped = true
   490  
   491  	for _, tb := range r.tombs {
   492  		tb.Kill(nil)
   493  	}
   494  
   495  	r.wait()
   496  }
   497  
   498  // Wait waits for all concurrent activities and returns after that's done.
   499  func (r *TaskRunner) Wait() {
   500  	r.mu.Lock()
   501  	defer r.mu.Unlock()
   502  
   503  	r.wait()
   504  }
   505  
   506  // StopKinds kills all concurrent tasks of the given kinds and returns
   507  // after that's done.
   508  func (r *TaskRunner) StopKinds(kind ...string) {
   509  	r.mu.Lock()
   510  	defer r.mu.Unlock()
   511  
   512  	kinds := make(map[string]bool, len(kind))
   513  	for _, k := range kind {
   514  		kinds[k] = true
   515  	}
   516  
   517  	var tombs []*tomb.Tomb
   518  	// Locks must be acquired in the same order everywhere:
   519  	// r.mu, r.state
   520  	r.state.Lock()
   521  	for tid, tb := range r.tombs {
   522  		task := r.state.Task(tid)
   523  		if task == nil || !kinds[task.Kind()] {
   524  			continue
   525  		}
   526  		tombs = append(tombs, tb)
   527  		tb.Kill(nil)
   528  	}
   529  	r.state.Unlock()
   530  
   531  	for _, tb := range tombs {
   532  		r.mu.Unlock()
   533  		tb.Wait()
   534  		r.mu.Lock()
   535  	}
   536  }