github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/pkg/workerpool/pool_impl.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package workerpool
    15  
    16  import (
    17  	"context"
    18  	"sync"
    19  	"sync/atomic"
    20  	"time"
    21  
    22  	"github.com/pingcap/log"
    23  
    24  	"github.com/pingcap/errors"
    25  	"github.com/pingcap/failpoint"
    26  	cerrors "github.com/pingcap/ticdc/pkg/errors"
    27  	"github.com/pingcap/ticdc/pkg/notify"
    28  	"go.uber.org/zap"
    29  	"golang.org/x/sync/errgroup"
    30  )
    31  
    32  const (
    33  	workerPoolDefaultClockSourceInterval = time.Millisecond * 100
    34  )
    35  
    36  type defaultPoolImpl struct {
    37  	// assume the hasher to be the trivial hasher for now
    38  	hasher Hasher
    39  	// do not resize this slice after creating the pool
    40  	workers []*worker
    41  	// used to generate handler IDs, must be accessed atomically
    42  	nextHandlerID int64
    43  }
    44  
    45  // NewDefaultWorkerPool creates a new WorkerPool that uses the default implementation
    46  func NewDefaultWorkerPool(numWorkers int) WorkerPool {
    47  	return newDefaultPoolImpl(&defaultHasher{}, numWorkers)
    48  }
    49  
    50  func newDefaultPoolImpl(hasher Hasher, numWorkers int) *defaultPoolImpl {
    51  	workers := make([]*worker, numWorkers)
    52  	for i := 0; i < numWorkers; i++ {
    53  		workers[i] = newWorker()
    54  	}
    55  	return &defaultPoolImpl{
    56  		hasher:  hasher,
    57  		workers: workers,
    58  	}
    59  }
    60  
    61  func (p *defaultPoolImpl) Run(ctx context.Context) error {
    62  	errg, ctx := errgroup.WithContext(ctx)
    63  
    64  	for _, worker := range p.workers {
    65  		workerFinal := worker
    66  		errg.Go(func() error {
    67  			err := workerFinal.run(ctx)
    68  			if err != nil {
    69  				return errors.Trace(err)
    70  			}
    71  			return nil
    72  		})
    73  	}
    74  
    75  	return errg.Wait()
    76  }
    77  
    78  func (p *defaultPoolImpl) RegisterEvent(f func(ctx context.Context, event interface{}) error) EventHandle {
    79  	handler := &defaultEventHandle{
    80  		f:     f,
    81  		errCh: make(chan error, 1),
    82  		id:    atomic.AddInt64(&p.nextHandlerID, 1) - 1,
    83  	}
    84  
    85  	workerID := p.hasher.Hash(handler) % int64(len(p.workers))
    86  	p.workers[workerID].addHandle(handler)
    87  	handler.worker = p.workers[workerID]
    88  
    89  	return handler
    90  }
    91  
    92  type defaultEventHandle struct {
    93  	// the function to be run each time the event is triggered
    94  	f func(ctx context.Context, event interface{}) error
    95  	// whether this handle has been cancelled, must be accessed atomically
    96  	isCancelled int32
    97  	// channel for the error returned by f
    98  	errCh chan error
    99  	// the worker that the handle is associated with
   100  	worker *worker
   101  	// identifier for this handle. No significant usage for now.
   102  	// Might be used to support consistent hashing in the future,
   103  	// so that the pool can be resized efficiently.
   104  	id int64
   105  
   106  	// whether there is a valid timer handler, must be accessed atomically
   107  	hasTimer int32
   108  	// the time when timer was triggered the last time
   109  	lastTimer time.Time
   110  	// minimum interval between two timer calls
   111  	timerInterval time.Duration
   112  	// the handler for the timer
   113  	timerHandler func(ctx context.Context) error
   114  
   115  	// whether this is a valid errorHandler, must be accessed atomically
   116  	hasErrorHandler int32
   117  	// the error handler, called when the handle meets an error (which is returned by f)
   118  	errorHandler func(err error)
   119  }
   120  
   121  func (h *defaultEventHandle) AddEvent(ctx context.Context, event interface{}) error {
   122  	if atomic.LoadInt32(&h.isCancelled) == 1 {
   123  		return cerrors.ErrWorkerPoolHandleCancelled.GenWithStackByArgs()
   124  	}
   125  
   126  	failpoint.Inject("addEventDelayPoint", func() {})
   127  
   128  	task := task{
   129  		handle: h,
   130  		f: func(ctx1 context.Context) error {
   131  			return h.f(ctx, event)
   132  		},
   133  	}
   134  
   135  	select {
   136  	case <-ctx.Done():
   137  		return errors.Trace(ctx.Err())
   138  	case h.worker.taskCh <- task:
   139  	}
   140  	return nil
   141  }
   142  
   143  func (h *defaultEventHandle) SetTimer(ctx context.Context, interval time.Duration, f func(ctx context.Context) error) EventHandle {
   144  	// mark the timer handler function as invalid
   145  	atomic.StoreInt32(&h.hasTimer, 0)
   146  	// wait for `hasTimer` to take effect, otherwise we might have a data race, if there was a previous handler.
   147  	h.worker.synchronize()
   148  
   149  	h.timerInterval = interval
   150  	h.timerHandler = func(ctx1 context.Context) error {
   151  		return f(ctx)
   152  	}
   153  	// mark the timer handler function as valid
   154  	atomic.StoreInt32(&h.hasTimer, 1)
   155  
   156  	return h
   157  }
   158  
   159  func (h *defaultEventHandle) Unregister() {
   160  	if !atomic.CompareAndSwapInt32(&h.isCancelled, 0, 1) {
   161  		// already cancelled
   162  		return
   163  	}
   164  
   165  	failpoint.Inject("unregisterDelayPoint", func() {})
   166  
   167  	// call synchronize so that all function executions related to this handle will be
   168  	// linearized BEFORE Unregister.
   169  	h.worker.synchronize()
   170  
   171  	h.doCancel(cerrors.ErrWorkerPoolHandleCancelled.GenWithStackByArgs())
   172  }
   173  
   174  // callers of doCancel need to check h.isCancelled first.
   175  // DO NOT call doCancel multiple times on the same handle.
   176  func (h *defaultEventHandle) doCancel(err error) {
   177  	h.worker.removeHandle(h)
   178  
   179  	if atomic.LoadInt32(&h.hasErrorHandler) == 1 {
   180  		h.errorHandler(err)
   181  	}
   182  
   183  	h.errCh <- err
   184  	close(h.errCh)
   185  }
   186  
   187  func (h *defaultEventHandle) ErrCh() <-chan error {
   188  	return h.errCh
   189  }
   190  
   191  func (h *defaultEventHandle) OnExit(f func(err error)) EventHandle {
   192  	atomic.StoreInt32(&h.hasErrorHandler, 0)
   193  	h.worker.synchronize()
   194  	h.errorHandler = f
   195  	atomic.StoreInt32(&h.hasErrorHandler, 1)
   196  	return h
   197  }
   198  
   199  func (h *defaultEventHandle) HashCode() int64 {
   200  	return h.id
   201  }
   202  
   203  func (h *defaultEventHandle) cancelWithErr(err error) {
   204  	if !atomic.CompareAndSwapInt32(&h.isCancelled, 0, 1) {
   205  		// already cancelled
   206  		return
   207  	}
   208  
   209  	h.doCancel(err)
   210  }
   211  
   212  func (h *defaultEventHandle) durationSinceLastTimer() time.Duration {
   213  	return time.Since(h.lastTimer)
   214  }
   215  
   216  func (h *defaultEventHandle) doTimer(ctx context.Context) error {
   217  	if atomic.LoadInt32(&h.hasTimer) == 0 {
   218  		return nil
   219  	}
   220  
   221  	if h.durationSinceLastTimer() < h.timerInterval {
   222  		return nil
   223  	}
   224  
   225  	err := h.timerHandler(ctx)
   226  	if err != nil {
   227  		return errors.Trace(err)
   228  	}
   229  
   230  	h.lastTimer = time.Now()
   231  
   232  	return nil
   233  }
   234  
   235  type task struct {
   236  	handle *defaultEventHandle
   237  	f      func(ctx context.Context) error
   238  }
   239  
   240  type worker struct {
   241  	taskCh       chan task
   242  	handles      map[*defaultEventHandle]struct{}
   243  	handleRWLock sync.RWMutex
   244  	// A message is passed to handleCancelCh when we need to wait for the
   245  	// current execution of handler to finish. Should be BLOCKING.
   246  	handleCancelCh chan struct{}
   247  	// must be accessed atomically
   248  	isRunning int32
   249  	// notifies exits of run()
   250  	stopNotifier notify.Notifier
   251  }
   252  
   253  func newWorker() *worker {
   254  	return &worker{
   255  		taskCh:         make(chan task, 128),
   256  		handles:        make(map[*defaultEventHandle]struct{}),
   257  		handleCancelCh: make(chan struct{}), // this channel must be unbuffered, i.e. blocking
   258  	}
   259  }
   260  
   261  func (w *worker) run(ctx context.Context) error {
   262  	ticker := time.NewTicker(workerPoolDefaultClockSourceInterval)
   263  	atomic.StoreInt32(&w.isRunning, 1)
   264  	defer func() {
   265  		ticker.Stop()
   266  		atomic.StoreInt32(&w.isRunning, 0)
   267  		w.stopNotifier.Notify()
   268  	}()
   269  
   270  	for {
   271  		select {
   272  		case <-ctx.Done():
   273  			return errors.Trace(ctx.Err())
   274  		case task := <-w.taskCh:
   275  			if atomic.LoadInt32(&task.handle.isCancelled) == 1 {
   276  				// ignored cancelled handle
   277  				continue
   278  			}
   279  
   280  			err := task.f(ctx)
   281  			if err != nil {
   282  				task.handle.cancelWithErr(err)
   283  			}
   284  		case <-ticker.C:
   285  			var handleErrs []struct {
   286  				h *defaultEventHandle
   287  				e error
   288  			}
   289  
   290  			w.handleRWLock.RLock()
   291  			for handle := range w.handles {
   292  				if atomic.LoadInt32(&handle.isCancelled) == 1 {
   293  					// ignored cancelled handle
   294  					continue
   295  				}
   296  				err := handle.doTimer(ctx)
   297  				if err != nil {
   298  					handleErrs = append(handleErrs, struct {
   299  						h *defaultEventHandle
   300  						e error
   301  					}{handle, err})
   302  				}
   303  			}
   304  			w.handleRWLock.RUnlock()
   305  
   306  			// cancelWithErr must be called out side of the loop above,
   307  			// to avoid deadlock.
   308  			for _, handleErr := range handleErrs {
   309  				handleErr.h.cancelWithErr(handleErr.e)
   310  			}
   311  		case <-w.handleCancelCh:
   312  		}
   313  	}
   314  }
   315  
   316  // synchronize waits for the worker to loop at least once, or to exit.
   317  func (w *worker) synchronize() {
   318  	if atomic.LoadInt32(&w.isRunning) == 0 {
   319  		return
   320  	}
   321  
   322  	receiver, err := w.stopNotifier.NewReceiver(time.Millisecond * 100)
   323  	if err != nil {
   324  		if cerrors.ErrOperateOnClosedNotifier.Equal(errors.Cause(err)) {
   325  			return
   326  		}
   327  		log.Panic("unexpected error", zap.Error(err))
   328  	}
   329  	defer receiver.Stop()
   330  
   331  	startTime := time.Now()
   332  	for {
   333  		workerHasFinishedLoop := false
   334  		select {
   335  		case w.handleCancelCh <- struct{}{}:
   336  			workerHasFinishedLoop = true
   337  		case <-receiver.C:
   338  		}
   339  		if workerHasFinishedLoop || atomic.LoadInt32(&w.isRunning) == 0 {
   340  			break
   341  		}
   342  
   343  		if time.Since(startTime) > time.Second*10 {
   344  			// likely the workerpool has deadlocked, or there is a bug in the event handlers.
   345  			log.Warn("synchronize is taking too long, report a bug", zap.Duration("elapsed", time.Since(startTime)))
   346  		}
   347  	}
   348  }
   349  
   350  func (w *worker) addHandle(handle *defaultEventHandle) {
   351  	w.handleRWLock.Lock()
   352  	defer w.handleRWLock.Unlock()
   353  
   354  	w.handles[handle] = struct{}{}
   355  }
   356  
   357  func (w *worker) removeHandle(handle *defaultEventHandle) {
   358  	w.handleRWLock.Lock()
   359  	defer w.handleRWLock.Unlock()
   360  
   361  	delete(w.handles, handle)
   362  }