github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/util/stop/stopper.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package stop
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"net/http"
    17  	"sort"
    18  	"strings"
    19  	"sync"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/settings"
    24  	"github.com/cockroachdb/cockroach/pkg/util/caller"
    25  	"github.com/cockroachdb/cockroach/pkg/util/log"
    26  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    27  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    29  	"github.com/cockroachdb/errors"
    30  	opentracing "github.com/opentracing/opentracing-go"
    31  )
    32  
// asyncTaskNamePrefix is prepended by RunAsyncTask to the caller-supplied
// task name. The prefixed name is what gets counted in the stopper's task
// map and what is handed to the tracing span forked for the task.
const asyncTaskNamePrefix = "[async] "

// ErrThrottled is returned from RunLimitedAsyncTask in the event that there
// is no more capacity for async tasks, as limited by the semaphore.
var ErrThrottled = errors.New("throttled on async limiting semaphore")

// ErrUnavailable indicates that the server is quiescing and is unable to
// process new work. It is the error returned by the Run* methods when the
// stopper refuses to start a task.
var ErrUnavailable = &roachpb.NodeUnavailableError{}
    42  
    43  func register(s *Stopper) {
    44  	trackedStoppers.Lock()
    45  	trackedStoppers.stoppers = append(trackedStoppers.stoppers, s)
    46  	trackedStoppers.Unlock()
    47  }
    48  
    49  func unregister(s *Stopper) {
    50  	trackedStoppers.Lock()
    51  	defer trackedStoppers.Unlock()
    52  	sl := trackedStoppers.stoppers
    53  	for i, tracked := range sl {
    54  		if tracked == s {
    55  			trackedStoppers.stoppers = sl[:i+copy(sl[i:], sl[i+1:])]
    56  			return
    57  		}
    58  	}
    59  	panic("attempt to unregister untracked stopper")
    60  }
    61  
// trackedStoppers is the package-level registry of stoppers that have been
// added via register() and not yet removed via unregister(). HandleDebug
// iterates over it to report the tasks each stopper is running.
var trackedStoppers struct {
	syncutil.Mutex
	stoppers []*Stopper // accessed only with the embedded mutex held
}
    66  
    67  // HandleDebug responds with the list of stopper tasks actively running.
    68  func HandleDebug(w http.ResponseWriter, r *http.Request) {
    69  	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
    70  	trackedStoppers.Lock()
    71  	defer trackedStoppers.Unlock()
    72  	for _, s := range trackedStoppers.stoppers {
    73  		s.mu.Lock()
    74  		fmt.Fprintf(w, "%p: %d tasks\n%s", s, s.mu.numTasks, s.runningTasksLocked())
    75  		s.mu.Unlock()
    76  	}
    77  }
    78  
// Closer is an interface for objects to attach to the stopper to
// be closed once the stopper completes. Objects are attached with
// Stopper.AddCloser.
type Closer interface {
	// Close is called by the Stopper once all of its workers have finished,
	// or immediately from AddCloser if the stopper is already stopping.
	Close()
}
    84  
    85  // CloserFn is type that allows any function to be a Closer.
    86  type CloserFn func()
    87  
    88  // Close implements the Closer interface.
    89  func (f CloserFn) Close() {
    90  	f()
    91  }
    92  
// A Stopper provides a channel-based mechanism to stop an arbitrary
// array of workers. Each worker is registered with the stopper via
// the RunWorker() method. The system further allows execution of functions
// through RunTask() and RunAsyncTask().
//
// Stopping occurs in two phases: the first is the request to stop, which moves
// the stopper into a quiescing phase. While quiescing, calls to RunTask() &
// RunAsyncTask() don't execute the function passed in and return ErrUnavailable.
// When all outstanding tasks have been completed, the stopper
// closes its stopper channel, which signals all live workers that it's safe to
// shut down. When all workers have shutdown, the stopper is complete.
//
// An arbitrary list of objects implementing the Closer interface may
// be added to the stopper via AddCloser(), to be closed after the
// stopper has stopped.
type Stopper struct {
	quiescer chan struct{}     // Closed when quiescing
	stopper  chan struct{}     // Closed when stopping
	stopped  chan struct{}     // Closed when stopped completely
	onPanic  func(interface{}) // called with recover() on panic on any goroutine
	stop     sync.WaitGroup    // Incremented for outstanding workers
	mu       struct {
		syncutil.Mutex
		quiesce   *sync.Cond     // Condition variable to wait for outstanding tasks
		quiescing bool           // true once Quiesce() has run (Stop() calls it)
		numTasks  int            // number of outstanding tasks
		tasks     TaskMap        // outstanding task count keyed by task name
		closers   []Closer       // closed by Stop() once all workers are done
		idAlloc   int            // next key to use in qCancels/sCancels
		qCancels  map[int]func() // context cancel funcs invoked by Quiesce()
		sCancels  map[int]func() // context cancel funcs invoked by Stop()

		stopCalled bool // turns all but first call to Stop into noop
	}
}
   128  
// An Option can be passed to NewStopper.
//
// The interface's only method is unexported, so options can only be
// implemented inside this package; use constructors such as OnPanic to
// obtain one.
type Option interface {
	// apply configures the given Stopper. NewStopper calls it once per
	// supplied option, in argument order.
	apply(*Stopper)
}
   133  
   134  type optionPanicHandler func(interface{})
   135  
   136  func (oph optionPanicHandler) apply(stopper *Stopper) {
   137  	stopper.onPanic = oph
   138  }
   139  
   140  // OnPanic is an option which lets the Stopper recover from all panics using
   141  // the provided panic handler.
   142  //
   143  // When Stop() is invoked during stack unwinding, OnPanic is also invoked, but
   144  // Stop() may not have carried out its duties.
   145  func OnPanic(handler func(interface{})) Option {
   146  	return optionPanicHandler(handler)
   147  }
   148  
   149  // NewStopper returns an instance of Stopper.
   150  func NewStopper(options ...Option) *Stopper {
   151  	s := &Stopper{
   152  		quiescer: make(chan struct{}),
   153  		stopper:  make(chan struct{}),
   154  		stopped:  make(chan struct{}),
   155  	}
   156  
   157  	s.mu.tasks = TaskMap{}
   158  	s.mu.qCancels = map[int]func(){}
   159  	s.mu.sCancels = map[int]func(){}
   160  
   161  	for _, opt := range options {
   162  		opt.apply(s)
   163  	}
   164  
   165  	s.mu.quiesce = sync.NewCond(&s.mu)
   166  	register(s)
   167  	return s
   168  }
   169  
   170  // Recover is used internally by Stopper to provide a hook for recovery of
   171  // panics on goroutines started by the Stopper. It can also be invoked
   172  // explicitly (via "defer s.Recover()") on goroutines that are created outside
   173  // of Stopper.
   174  func (s *Stopper) Recover(ctx context.Context) {
   175  	if r := recover(); r != nil {
   176  		if s.onPanic != nil {
   177  			s.onPanic(r)
   178  			return
   179  		}
   180  		if sv := settings.TODO(); sv != nil {
   181  			log.ReportPanic(ctx, sv, r, 1)
   182  		}
   183  		panic(r)
   184  	}
   185  }
   186  
   187  // RunWorker runs the supplied function as a "worker" to be stopped
   188  // by the stopper. The function <f> is run in a goroutine.
   189  func (s *Stopper) RunWorker(ctx context.Context, f func(context.Context)) {
   190  	s.stop.Add(1)
   191  	go func() {
   192  		// Remove any associated span; we need to ensure this because the
   193  		// worker may run longer than the caller which presumably closes
   194  		// any spans it has created.
   195  		ctx = opentracing.ContextWithSpan(ctx, nil)
   196  		defer s.Recover(ctx)
   197  		defer s.stop.Done()
   198  		f(ctx)
   199  	}()
   200  }
   201  
   202  // AddCloser adds an object to close after the stopper has been stopped.
   203  //
   204  // WARNING: memory resources acquired by this method will stay around for
   205  // the lifetime of the Stopper. Use with care to avoid leaking memory.
   206  func (s *Stopper) AddCloser(c Closer) {
   207  	s.mu.Lock()
   208  	defer s.mu.Unlock()
   209  	select {
   210  	case <-s.stopper:
   211  		// Close immediately.
   212  		c.Close()
   213  	default:
   214  		s.mu.closers = append(s.mu.closers, c)
   215  	}
   216  }
   217  
   218  // WithCancelOnQuiesce returns a child context which is canceled when the
   219  // returned cancel function is called or when the Stopper begins to quiesce,
   220  // whichever happens first.
   221  //
   222  // Canceling this context releases resources associated with it, so code should
   223  // call cancel as soon as the operations running in this Context complete.
   224  func (s *Stopper) WithCancelOnQuiesce(ctx context.Context) (context.Context, func()) {
   225  	return s.withCancel(ctx, s.mu.qCancels, s.quiescer)
   226  }
   227  
   228  // WithCancelOnStop returns a child context which is canceled when the
   229  // returned cancel function is called or when the Stopper begins to stop,
   230  // whichever happens first.
   231  //
   232  // Canceling this context releases resources associated with it, so code should
   233  // call cancel as soon as the operations running in this Context complete.
   234  func (s *Stopper) WithCancelOnStop(ctx context.Context) (context.Context, func()) {
   235  	return s.withCancel(ctx, s.mu.sCancels, s.stopper)
   236  }
   237  
   238  func (s *Stopper) withCancel(
   239  	ctx context.Context, cancels map[int]func(), cancelCh chan struct{},
   240  ) (context.Context, func()) {
   241  	var cancel func()
   242  	ctx, cancel = context.WithCancel(ctx)
   243  	s.mu.Lock()
   244  	defer s.mu.Unlock()
   245  	select {
   246  	case <-cancelCh:
   247  		// Cancel immediately.
   248  		cancel()
   249  		return ctx, func() {}
   250  	default:
   251  		id := s.mu.idAlloc
   252  		s.mu.idAlloc++
   253  		cancels[id] = cancel
   254  		return ctx, func() {
   255  			cancel()
   256  			s.mu.Lock()
   257  			defer s.mu.Unlock()
   258  			delete(cancels, id)
   259  		}
   260  	}
   261  }
   262  
   263  // RunTask adds one to the count of tasks left to quiesce in the system.
   264  // Any worker which is a "first mover" when starting tasks must call this method
   265  // before starting work on a new task. First movers include goroutines launched
   266  // to do periodic work and the kv/db.go gateway which accepts external client
   267  // requests.
   268  //
   269  // taskName is used as the "operation" field of the span opened for this task
   270  // and is visible in traces. It's also part of reports printed by stoppers
   271  // waiting to stop. The convention is
   272  // <package name>.<struct name>: <succinct description of the task's action>
   273  //
   274  // Returns an error to indicate that the system is currently quiescing and
   275  // function f was not called.
   276  func (s *Stopper) RunTask(ctx context.Context, taskName string, f func(context.Context)) error {
   277  	if !s.runPrelude(taskName) {
   278  		return ErrUnavailable
   279  	}
   280  
   281  	// Call f.
   282  	defer s.Recover(ctx)
   283  	defer s.runPostlude(taskName)
   284  
   285  	f(ctx)
   286  	return nil
   287  }
   288  
   289  // RunTaskWithErr is like RunTask(), but takes in a callback that can return an
   290  // error. The error is returned to the caller.
   291  func (s *Stopper) RunTaskWithErr(
   292  	ctx context.Context, taskName string, f func(context.Context) error,
   293  ) error {
   294  	if !s.runPrelude(taskName) {
   295  		return ErrUnavailable
   296  	}
   297  
   298  	// Call f.
   299  	defer s.Recover(ctx)
   300  	defer s.runPostlude(taskName)
   301  
   302  	return f(ctx)
   303  }
   304  
   305  // RunAsyncTask is like RunTask, except the callback is run in a goroutine. The
   306  // method doesn't block for the callback to finish execution.
   307  func (s *Stopper) RunAsyncTask(
   308  	ctx context.Context, taskName string, f func(context.Context),
   309  ) error {
   310  	taskName = asyncTaskNamePrefix + taskName
   311  	if !s.runPrelude(taskName) {
   312  		return ErrUnavailable
   313  	}
   314  
   315  	ctx, span := tracing.ForkCtxSpan(ctx, taskName)
   316  
   317  	// Call f.
   318  	go func() {
   319  		defer s.Recover(ctx)
   320  		defer s.runPostlude(taskName)
   321  		defer tracing.FinishSpan(span)
   322  
   323  		f(ctx)
   324  	}()
   325  	return nil
   326  }
   327  
   328  // RunLimitedAsyncTask runs function f in a goroutine, using the given
   329  // channel as a semaphore to limit the number of tasks that are run
   330  // concurrently to the channel's capacity. If wait is true, blocks
   331  // until the semaphore is available in order to push back on callers
   332  // that may be trying to create many tasks. If wait is false, returns
   333  // immediately with an error if the semaphore is not
   334  // available. It is the caller's responsibility to ensure that sem is
   335  // closed when the stopper is quiesced. For quotapools which live for the
   336  // lifetime of the stopper, it is generally best to register the sem with the
   337  // stopper using AddCloser.
   338  func (s *Stopper) RunLimitedAsyncTask(
   339  	ctx context.Context, taskName string, sem *quotapool.IntPool, wait bool, f func(context.Context),
   340  ) (err error) {
   341  	// Wait for permission to run from the semaphore.
   342  	var alloc *quotapool.IntAlloc
   343  	if wait {
   344  		alloc, err = sem.Acquire(ctx, 1)
   345  	} else {
   346  		alloc, err = sem.TryAcquire(ctx, 1)
   347  	}
   348  	if errors.Is(err, quotapool.ErrNotEnoughQuota) {
   349  		err = ErrThrottled
   350  	} else if quotapool.HasErrClosed(err) {
   351  		err = ErrUnavailable
   352  	}
   353  	if err != nil {
   354  		return err
   355  	}
   356  	defer func() {
   357  		// If the err is non-nil then we know that we did not start the async task
   358  		// and thus we need to release the acquired quota. If it is nil then we
   359  		// did start the task and it will release the quota.
   360  		if err != nil {
   361  			alloc.Release()
   362  		}
   363  	}()
   364  
   365  	// Check for canceled context: it's possible to get the semaphore even
   366  	// if the context is canceled.
   367  	if ctx.Err() != nil {
   368  		return ctx.Err()
   369  	}
   370  	if !s.runPrelude(taskName) {
   371  		return ErrUnavailable
   372  	}
   373  
   374  	ctx, span := tracing.ForkCtxSpan(ctx, taskName)
   375  
   376  	go func() {
   377  		defer s.Recover(ctx)
   378  		defer s.runPostlude(taskName)
   379  		defer alloc.Release()
   380  		defer tracing.FinishSpan(span)
   381  
   382  		f(ctx)
   383  	}()
   384  	return nil
   385  }
   386  
   387  func (s *Stopper) runPrelude(taskName string) bool {
   388  	s.mu.Lock()
   389  	defer s.mu.Unlock()
   390  	if s.mu.quiescing {
   391  		return false
   392  	}
   393  	s.mu.numTasks++
   394  	s.mu.tasks[taskName]++
   395  	return true
   396  }
   397  
   398  func (s *Stopper) runPostlude(taskName string) {
   399  	s.mu.Lock()
   400  	defer s.mu.Unlock()
   401  	s.mu.numTasks--
   402  	s.mu.tasks[taskName]--
   403  	s.mu.quiesce.Broadcast()
   404  }
   405  
   406  // NumTasks returns the number of active tasks.
   407  func (s *Stopper) NumTasks() int {
   408  	s.mu.Lock()
   409  	defer s.mu.Unlock()
   410  	return s.mu.numTasks
   411  }
   412  
   413  // A TaskMap is returned by RunningTasks().
   414  type TaskMap map[string]int
   415  
   416  // String implements fmt.Stringer and returns a sorted multi-line listing of
   417  // the TaskMap.
   418  func (tm TaskMap) String() string {
   419  	var lines []string
   420  	for location, num := range tm {
   421  		lines = append(lines, fmt.Sprintf("%-6d %s", num, location))
   422  	}
   423  	sort.Sort(sort.Reverse(sort.StringSlice(lines)))
   424  	return strings.Join(lines, "\n")
   425  }
   426  
   427  // RunningTasks returns a map containing the count of running tasks keyed by
   428  // call site.
   429  func (s *Stopper) RunningTasks() TaskMap {
   430  	s.mu.Lock()
   431  	defer s.mu.Unlock()
   432  	return s.runningTasksLocked()
   433  }
   434  
   435  func (s *Stopper) runningTasksLocked() TaskMap {
   436  	m := TaskMap{}
   437  	for k := range s.mu.tasks {
   438  		if s.mu.tasks[k] == 0 {
   439  			continue
   440  		}
   441  		m[k] = s.mu.tasks[k]
   442  	}
   443  	return m
   444  }
   445  
// Stop signals all live workers to stop and then waits for each to
// confirm it has stopped.
//
// The regular shutdown sequence is: quiesce (wait for outstanding tasks),
// cancel the contexts handed out by WithCancelOnStop, close the channel
// returned by ShouldStop, wait for all workers, run the registered closers
// and finally close the channel returned by IsStopped. Only the first call
// does any work; later calls return immediately.
func (s *Stopper) Stop(ctx context.Context) {
	// Atomically read and set the flag so that exactly one caller proceeds.
	s.mu.Lock()
	stopCalled := s.mu.stopCalled
	s.mu.stopCalled = true
	s.mu.Unlock()

	if stopCalled {
		return
	}

	// Deferred calls run in reverse order: the stopper is removed from the
	// registry of tracked stoppers first, then any panic is handed to Recover.
	defer s.Recover(ctx)
	defer unregister(s)

	if log.V(1) {
		file, line, _ := caller.Lookup(1)
		log.Infof(ctx,
			"stop has been called from %s:%d, stopping or quiescing all running tasks", file, line)
	}
	// Don't bother doing stuff cleanly if we're panicking, that would likely
	// block. Instead, best effort only. This cleans up the stack traces,
	// avoids stalls and helps some tests in `./cli` finish cleanly (where
	// panics happen on purpose).
	//
	// Note that recover() can only observe a panic here when Stop itself is
	// the deferred call (e.g. "defer stopper.Stop(ctx)").
	if r := recover(); r != nil {
		// Nothing below waits: quiescing and the closers run on their own
		// goroutines and the signalling channels are closed right away.
		go s.Quiesce(ctx)
		close(s.stopper)
		close(s.stopped)
		s.mu.Lock()
		for _, c := range s.mu.closers {
			go c.Close()
		}
		s.mu.Unlock()
		// Re-raise so the original panic keeps propagating (through the
		// deferred Recover above).
		panic(r)
	}

	// Phase one: refuse new tasks and wait for the outstanding ones.
	s.Quiesce(ctx)
	// Phase two: tell workers to shut down. The cancel functions and the
	// channel close happen under the lock, which withCancel also takes before
	// checking the channel and registering a new cancel function.
	s.mu.Lock()
	for _, cancel := range s.mu.sCancels {
		cancel()
	}
	close(s.stopper)
	s.mu.Unlock()

	// Wait for every worker started via RunWorker to return.
	s.stop.Wait()
	// The closers run with s.mu held (until the function returns).
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, c := range s.mu.closers {
		c.Close()
	}
	close(s.stopped)
}
   498  
   499  // ShouldQuiesce returns a channel which will be closed when Stop() has been
   500  // invoked and outstanding tasks should begin to quiesce.
   501  func (s *Stopper) ShouldQuiesce() <-chan struct{} {
   502  	if s == nil {
   503  		// A nil stopper will never signal ShouldQuiesce, but will also never panic.
   504  		return nil
   505  	}
   506  	return s.quiescer
   507  }
   508  
   509  // ShouldStop returns a channel which will be closed when Stop() has been
   510  // invoked and outstanding tasks have quiesced.
   511  func (s *Stopper) ShouldStop() <-chan struct{} {
   512  	if s == nil {
   513  		// A nil stopper will never signal ShouldStop, but will also never panic.
   514  		return nil
   515  	}
   516  	return s.stopper
   517  }
   518  
   519  // IsStopped returns a channel which will be closed after Stop() has
   520  // been invoked to full completion, meaning all workers have completed
   521  // and all closers have been closed.
   522  func (s *Stopper) IsStopped() <-chan struct{} {
   523  	if s == nil {
   524  		return nil
   525  	}
   526  	return s.stopped
   527  }
   528  
   529  // Quiesce moves the stopper to state quiescing and waits until all
   530  // tasks complete. This is used from Stop() and unittests.
   531  func (s *Stopper) Quiesce(ctx context.Context) {
   532  	defer s.Recover(ctx)
   533  	s.mu.Lock()
   534  	defer s.mu.Unlock()
   535  	for _, cancel := range s.mu.qCancels {
   536  		cancel()
   537  	}
   538  	if !s.mu.quiescing {
   539  		log.Infof(ctx, "quiescing")
   540  		s.mu.quiescing = true
   541  		close(s.quiescer)
   542  	}
   543  	for s.mu.numTasks > 0 {
   544  		t := time.AfterFunc(5*time.Second, func() {
   545  			// If we're waiting for 5+s without a task terminating, log the ones
   546  			// that remain.
   547  			log.Infof(ctx, "quiescing; tasks left:\n%s", s.RunningTasks())
   548  		})
   549  		// Unlock s.mu, wait for the signal, and lock s.mu.
   550  		s.mu.quiesce.Wait()
   551  		t.Stop()
   552  	}
   553  }