github.com/rigado/snapd@v2.42.5-go-mod+incompatible/overlord/state/taskrunner.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2016 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package state
    21  
    22  import (
    23  	"sync"
    24  	"time"
    25  
    26  	"gopkg.in/tomb.v2"
    27  
    28  	"github.com/snapcore/snapd/logger"
    29  )
    30  
// HandlerFunc is the type of function for the do, undo and cleanup
// handlers registered with a TaskRunner.
//
// The runner invokes a handler in its own goroutine, passing the task being
// processed and the tomb tracking that goroutine. The runner kills the tomb
// (with a nil reason) when the task is aborted or the runner is stopped, so
// a long running handler can watch it to notice that it should give up.
//
// Returning nil reports success, returning a *Retry asks for the task to be
// run again later, and any other error marks the task as failed.
type HandlerFunc func(task *Task, tomb *tomb.Tomb) error
    33  
    34  // Retry is returned from a handler to signal that is ok to rerun the
    35  // task at a later point. It's to be used also when a task goroutine
    36  // is asked to stop through its tomb. After can be used to indicate
    37  // how much to postpone the retry, 0 (the default) means at the next
    38  // ensure pass and is what should be used if stopped through its tomb.
    39  // Reason is an optional explanation of the conflict.
    40  type Retry struct {
    41  	After  time.Duration
    42  	Reason string
    43  }
    44  
    45  func (r *Retry) Error() string {
    46  	return "task should be retried"
    47  }
    48  
// blockedFunc is a predicate consulted by Ensure before starting a task.
// It receives the candidate task t and the tasks considered running in the
// current ensure pass; returning true keeps t from being started in that
// pass.
type blockedFunc func(t *Task, running []*Task) bool
    50  
// TaskRunner controls the running of goroutines to execute known task kinds.
//
// Handlers are registered per task kind (AddHandler), or through a matching
// predicate (AddOptionalHandler); Ensure then starts one goroutine per
// runnable task and the runner keeps track of it until the handler returns.
type TaskRunner struct {
	// state is the State whose tasks this runner executes. Its lock is
	// always acquired after mu.
	state *State

	// locking
	//
	// mu serialises access to the runner's own bookkeeping; AddHandler,
	// AddCleanup, SetBlocked, AddBlocked, Ensure, Stop, Wait and StopKinds
	// all take it, as do the task goroutines once their handler returned.
	// handlers maps a task kind to its do/undo pair, optional holds the
	// predicate based fallbacks, cleanups maps a task kind to its cleanup
	// function, and stopped is set by Stop to prevent further Ensure passes.
	mu       sync.Mutex
	handlers map[string]handlerPair
	optional []optionalHandler
	cleanups map[string]HandlerFunc
	stopped  bool

	// blocked is the list of predicates that can keep a task from being
	// started. someBlocked records whether the last Ensure pass skipped
	// a task because of them, in which case a finishing task requests
	// a new ensure pass.
	blocked     []blockedFunc
	someBlocked bool

	// go-routines lifecycle
	//
	// tombs maps the ID of each task that currently has a handler or
	// cleanup goroutine to the tomb tracking that goroutine.
	tombs map[string]*tomb.Tomb
}
    68  
// handlerPair groups the handler that performs a task (do) with the one
// that reverts it (undo). Either may be nil; a nil do means the runner has
// no handler for the task, a nil undo means the task cannot be undone.
type handlerPair struct {
	do, undo HandlerFunc
}
    72  
// optionalHandler is a fallback handler pair selected by predicate rather
// than by task kind. It is only consulted for tasks whose kind has no
// explicitly registered handler.
type optionalHandler struct {
	// match reports whether the embedded pair should handle the task.
	match func(t *Task) bool
	handlerPair
}
    77  
    78  // NewTaskRunner creates a new TaskRunner
    79  func NewTaskRunner(s *State) *TaskRunner {
    80  	return &TaskRunner{
    81  		state:    s,
    82  		handlers: make(map[string]handlerPair),
    83  		cleanups: make(map[string]HandlerFunc),
    84  		tombs:    make(map[string]*tomb.Tomb),
    85  	}
    86  }
    87  
    88  // AddHandler registers the functions to concurrently call for doing and
    89  // undoing tasks of the given kind. The undo handler may be nil.
    90  func (r *TaskRunner) AddHandler(kind string, do, undo HandlerFunc) {
    91  	r.mu.Lock()
    92  	defer r.mu.Unlock()
    93  
    94  	r.handlers[kind] = handlerPair{do, undo}
    95  }
    96  
    97  // AddOptionalHandler register functions for doing and undoing tasks that match
    98  // the given predicate if no explicit handler was registered for the task kind.
    99  func (r *TaskRunner) AddOptionalHandler(match func(t *Task) bool, do, undo HandlerFunc) {
   100  	r.optional = append(r.optional, optionalHandler{match, handlerPair{do, undo}})
   101  }
   102  
   103  func (r *TaskRunner) handlerPair(t *Task) handlerPair {
   104  	if handler, ok := r.handlers[t.Kind()]; ok {
   105  		return handler
   106  	}
   107  	for _, h := range r.optional {
   108  		if h.match(t) {
   109  			return h.handlerPair
   110  		}
   111  	}
   112  	return handlerPair{}
   113  }
   114  
   115  // KnownTaskKinds returns all tasks kinds handled by this runner.
   116  func (r *TaskRunner) KnownTaskKinds() []string {
   117  	kinds := make([]string, 0, len(r.handlers))
   118  	for h := range r.handlers {
   119  		kinds = append(kinds, h)
   120  	}
   121  	return kinds
   122  }
   123  
   124  // AddCleanup registers a function to be called after the change completes,
   125  // for cleaning up data left behind by tasks of the specified kind.
   126  // The provided function will be called no matter what the final status of the
   127  // task is. This mechanism enables keeping data around for a potential undo
   128  // until there's no more chance of the task being undone.
   129  //
   130  // The cleanup function is run concurrently with other cleanup functions,
   131  // despite any wait ordering between the tasks. If it returns an error,
   132  // it will be retried later.
   133  //
   134  // The handler for tasks of the provided kind must have been previously
   135  // registered before AddCleanup is called for it.
   136  func (r *TaskRunner) AddCleanup(kind string, cleanup HandlerFunc) {
   137  	r.mu.Lock()
   138  	defer r.mu.Unlock()
   139  	if _, ok := r.handlers[kind]; !ok {
   140  		panic("internal error: attempted to register cleanup for unknown task kind")
   141  	}
   142  	r.cleanups[kind] = cleanup
   143  }
   144  
   145  // SetBlocked sets a predicate function to decide whether to block a task from running based on the current running tasks. It can be used to control task serialisation.
   146  func (r *TaskRunner) SetBlocked(pred func(t *Task, running []*Task) bool) {
   147  	r.mu.Lock()
   148  	defer r.mu.Unlock()
   149  
   150  	r.blocked = []blockedFunc{pred}
   151  }
   152  
   153  // AddBlocked adds a predicate function to decide whether to block a task from running based on the current running tasks. It can be used to control task serialisation. All added predicates are considered in turn until one returns true, or none.
   154  func (r *TaskRunner) AddBlocked(pred func(t *Task, running []*Task) bool) {
   155  	r.mu.Lock()
   156  	defer r.mu.Unlock()
   157  
   158  	r.blocked = append(r.blocked, pred)
   159  }
   160  
// run starts a goroutine that executes the do or the undo handler of t,
// depending on the task status, and records the goroutine's tomb in
// r.tombs until the handler has returned and its result was processed.
//
// run must be called with the state lock in place. It reads the handler
// tables and writes r.tombs without taking r.mu itself, so the caller is
// expected to hold r.mu as well (Ensure does).
//
// It panics if t is not in Do, Doing, Undo or Undoing status, or if there
// is no handler for the operation to perform.
func (r *TaskRunner) run(t *Task) {
	var handler HandlerFunc
	// accuRuntime records how long the handler ran, against either the
	// doing or the undoing time of the task.
	var accuRuntime func(dur time.Duration)
	switch t.Status() {
	case DoStatus:
		// A pending task becomes an in-progress one as it is started.
		t.SetStatus(DoingStatus)
		fallthrough
	case DoingStatus:
		handler = r.handlerPair(t).do
		accuRuntime = t.accumulateDoingTime

	case UndoStatus:
		t.SetStatus(UndoingStatus)
		fallthrough
	case UndoingStatus:
		handler = r.handlerPair(t).undo
		accuRuntime = t.accumulateUndoingTime

	default:
		panic("internal error: attempted to run task in status " + t.Status().String())
	}
	if handler == nil {
		panic("internal error: attempted to run task with nil handler for status " + t.Status().String())
	}

	t.At(time.Time{}) // clear schedule
	tomb := &tomb.Tomb{}
	r.tombs[t.ID()] = tomb
	tomb.Go(func() error {
		// Capture the error result with tomb.Kill so we can
		// use tomb.Err uniformly to consider both it or a
		// overriding previous Kill reason.
		//
		// The handler itself runs without r.mu or the state lock held.
		t0 := time.Now()
		tomb.Kill(handler(t, tomb))
		t1 := time.Now()

		// Locks must be acquired in the same order everywhere:
		// r.mu first, then the state lock.
		r.mu.Lock()
		defer r.mu.Unlock()
		r.state.Lock()
		defer r.state.Unlock()
		accuRuntime(t1.Sub(t0))

		// The goroutine is about to finish, stop tracking it.
		delete(r.tombs, t.ID())

		// some tasks were blocked, now there's chance the
		// blocked predicate will change its value
		if r.someBlocked {
			r.state.EnsureBefore(0)
		}

		// First pass over the result: when shutting down, turn real
		// errors into a plain retry.
		err := tomb.Err()
		switch err.(type) {
		case nil:
			// we are ok
		case *Retry:
			// preserve
		default:
			if r.stopped {
				// we are shutting down, errors might be due
				// to cancellations, to be safe retry
				err = &Retry{}
			}
		}

		// Second pass: update the task according to the final result.
		switch x := err.(type) {
		case *Retry:
			// Handler asked to be called again later.
			// TODO Allow postponing retries past the next Ensure.
			if t.Status() == AbortStatus {
				// Would work without it but might take two ensures.
				r.tryUndo(t)
			} else if x.After != 0 {
				// Leave the status alone and only schedule the
				// task for the requested time.
				t.At(timeNow().Add(x.After))
			}
		case nil:
			// The handler succeeded. next collects the tasks that may
			// have become runnable as a consequence.
			var next []*Task
			switch t.Status() {
			case DoingStatus:
				t.SetStatus(DoneStatus)
				fallthrough
			case DoneStatus:
				next = t.HaltTasks()
			case AbortStatus:
				// It was actually Done if it got here.
				t.SetStatus(UndoStatus)
				r.state.EnsureBefore(0)
			case UndoingStatus:
				t.SetStatus(UndoneStatus)
				fallthrough
			case UndoneStatus:
				next = t.WaitTasks()
			}
			if len(next) > 0 {
				r.state.EnsureBefore(0)
			}
		default:
			// The handler failed: abort the lanes the task is part of
			// and record the error on the task.
			r.abortLanes(t.Change(), t.Lanes())
			t.SetStatus(ErrorStatus)
			t.Errorf("%s", err)
		}

		// The outcome lives in the task; the tomb itself always
		// finishes cleanly.
		return nil
	})
}
   267  
   268  func (r *TaskRunner) clean(t *Task) {
   269  	if !t.Change().IsReady() {
   270  		// Whole Change is not ready so don't run cleanups yet.
   271  		return
   272  	}
   273  
   274  	cleanup, ok := r.cleanups[t.Kind()]
   275  	if !ok {
   276  		t.SetClean()
   277  		return
   278  	}
   279  
   280  	tomb := &tomb.Tomb{}
   281  	r.tombs[t.ID()] = tomb
   282  	tomb.Go(func() error {
   283  		tomb.Kill(cleanup(t, tomb))
   284  
   285  		// Locks must be acquired in the same order everywhere.
   286  		r.mu.Lock()
   287  		defer r.mu.Unlock()
   288  		r.state.Lock()
   289  		defer r.state.Unlock()
   290  
   291  		delete(r.tombs, t.ID())
   292  
   293  		if tomb.Err() != nil {
   294  			logger.Debugf("Cleaning task %s: %s", t.ID(), tomb.Err())
   295  		} else {
   296  			t.SetClean()
   297  		}
   298  		return nil
   299  	})
   300  }
   301  
   302  func (r *TaskRunner) abortLanes(chg *Change, lanes []int) {
   303  	chg.AbortLanes(lanes)
   304  	ensureScheduled := false
   305  	for _, t := range chg.Tasks() {
   306  		status := t.Status()
   307  		if status == AbortStatus {
   308  			if tb, ok := r.tombs[t.ID()]; ok {
   309  				tb.Kill(nil)
   310  			}
   311  		}
   312  		if !ensureScheduled && !status.Ready() {
   313  			ensureScheduled = true
   314  			r.state.EnsureBefore(0)
   315  		}
   316  	}
   317  }
   318  
   319  // tryUndo replaces the status of a knowingly aborted task.
   320  func (r *TaskRunner) tryUndo(t *Task) {
   321  	if t.Status() == AbortStatus && r.handlerPair(t).undo == nil {
   322  		// Cannot undo but it was stopped in flight.
   323  		// Hold so it doesn't look like it finished.
   324  		t.SetStatus(HoldStatus)
   325  		if len(t.WaitTasks()) > 0 {
   326  			r.state.EnsureBefore(0)
   327  		}
   328  	} else {
   329  		t.SetStatus(UndoStatus)
   330  		r.state.EnsureBefore(0)
   331  	}
   332  }
   333  
// Ensure starts new goroutines for all known tasks with no pending
// dependencies.
//
// In a single pass over all tasks of the state it also kills the goroutines
// of tasks that got aborted, moves aborted tasks that are not running
// towards undo, and starts cleanups for tasks in a ready status. Tasks
// scheduled for a later time or rejected by a blocked predicate are left
// for a later pass. Ensure does nothing once the runner was stopped.
//
// Note that Ensure will lock the state. It always returns nil.
func (r *TaskRunner) Ensure() error {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.stopped {
		// we are stopping, don't run another ensure
		return nil
	}

	// Locks must be acquired in the same order everywhere:
	// r.mu first, then the state lock.
	r.state.Lock()
	defer r.state.Unlock()

	// Collect the tasks that currently have a goroutine; this is what the
	// blocked predicates get to see. Tasks started below are added to it.
	r.someBlocked = false
	running := make([]*Task, 0, len(r.tombs))
	for tid := range r.tombs {
		t := r.state.Task(tid)
		if t != nil {
			running = append(running, t)
		}
	}

	ensureTime := timeNow()
	// nextTaskTime is the earliest time any skipped task is scheduled
	// for; it stays zero if there is no such task.
	nextTaskTime := time.Time{}
ConsiderTasks:
	for _, t := range r.state.Tasks() {
		handlers := r.handlerPair(t)
		if handlers.do == nil {
			// Handled by a different runner instance.
			continue
		}

		// tb is non-nil if the task already has a goroutine.
		tb := r.tombs[t.ID()]

		if t.Status() == AbortStatus {
			if tb != nil {
				// Ask the running handler to stop; its goroutine
				// deals with the aborted task once it returns.
				tb.Kill(nil)
				continue
			}
			// Not running: change the status right away. The task
			// is then considered further below with its new status.
			r.tryUndo(t)
		}

		if tb != nil {
			// Already being handled.
			continue
		}

		status := t.Status()
		if status.Ready() {
			// Nothing left to run, except possibly the cleanup.
			if !t.IsClean() {
				r.clean(t)
			}
			continue
		}

		if mustWait(t) {
			// Dependencies still unhandled.
			continue
		}

		if status == UndoStatus && handlers.undo == nil {
			// Although this has no dependencies itself, it must have waited
			// above too since follow up tasks may have handlers again.
			// Cannot undo. Revert to done status.
			t.SetStatus(DoneStatus)
			if len(t.WaitTasks()) > 0 {
				r.state.EnsureBefore(0)
			}
			continue
		}

		// skip tasks scheduled for later and also track the earliest one
		tWhen := t.AtTime()
		if !tWhen.IsZero() && ensureTime.Before(tWhen) {
			if nextTaskTime.IsZero() || nextTaskTime.After(tWhen) {
				nextTaskTime = tWhen
			}
			continue
		}

		// check if any of the blocked predicates returns true
		// and skip the task if so
		for _, blocked := range r.blocked {
			if blocked(t, running) {
				// Remembered so that a finishing task requests
				// a new ensure pass.
				r.someBlocked = true
				continue ConsiderTasks
			}
		}

		logger.Debugf("Running task %s on %s: %s", t.ID(), t.Status(), t.Summary())
		r.run(t)

		// Tasks considered later in this pass must see this one as
		// running too.
		running = append(running, t)
	}

	// schedule next Ensure no later than the next task time
	if !nextTaskTime.IsZero() {
		r.state.EnsureBefore(nextTaskTime.Sub(ensureTime))
	}

	return nil
}
   439  
   440  // mustWait returns whether task t must wait for other tasks to be done.
   441  func mustWait(t *Task) bool {
   442  	switch t.Status() {
   443  	case DoStatus:
   444  		for _, wt := range t.WaitTasks() {
   445  			if wt.Status() != DoneStatus {
   446  				return true
   447  			}
   448  		}
   449  	case UndoStatus:
   450  		for _, ht := range t.HaltTasks() {
   451  			if !ht.Status().Ready() {
   452  				return true
   453  			}
   454  		}
   455  	}
   456  	return false
   457  }
   458  
   459  // wait expects to be called with th r.mu lock held
   460  func (r *TaskRunner) wait() {
   461  	for len(r.tombs) > 0 {
   462  		for _, t := range r.tombs {
   463  			r.mu.Unlock()
   464  			t.Wait()
   465  			r.mu.Lock()
   466  			break
   467  		}
   468  	}
   469  }
   470  
   471  // Stop kills all concurrent activities and returns after that's done.
   472  func (r *TaskRunner) Stop() {
   473  	r.mu.Lock()
   474  	defer r.mu.Unlock()
   475  
   476  	r.stopped = true
   477  
   478  	for _, tb := range r.tombs {
   479  		tb.Kill(nil)
   480  	}
   481  
   482  	r.wait()
   483  }
   484  
   485  // Wait waits for all concurrent activities and returns after that's done.
   486  func (r *TaskRunner) Wait() {
   487  	r.mu.Lock()
   488  	defer r.mu.Unlock()
   489  
   490  	r.wait()
   491  }
   492  
   493  // StopKinds kills all concurrent tasks of the given kinds and returns
   494  // after that's done.
   495  func (r *TaskRunner) StopKinds(kind ...string) {
   496  	r.mu.Lock()
   497  	defer r.mu.Unlock()
   498  
   499  	kinds := make(map[string]bool, len(kind))
   500  	for _, k := range kind {
   501  		kinds[k] = true
   502  	}
   503  
   504  	var tombs []*tomb.Tomb
   505  	// Locks must be acquired in the same order everywhere:
   506  	// r.mu, r.state
   507  	r.state.Lock()
   508  	for tid, tb := range r.tombs {
   509  		task := r.state.Task(tid)
   510  		if task == nil || !kinds[task.Kind()] {
   511  			continue
   512  		}
   513  		tombs = append(tombs, tb)
   514  		tb.Kill(nil)
   515  	}
   516  	r.state.Unlock()
   517  
   518  	for _, tb := range tombs {
   519  		r.mu.Unlock()
   520  		tb.Wait()
   521  		r.mu.Lock()
   522  	}
   523  }