github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/sink/dmlsink/txn/worker.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package txn

import (
	"context"
	"fmt"
	"time"

	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/cdc/sink/metrics/txn"
	"github.com/pingcap/tiflow/cdc/sink/tablesink/state"
	"github.com/pingcap/tiflow/pkg/causality"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
)

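// worker flushes transactions that have passed conflict detection to the backend,
// and runs the post-transaction callbacks after each successful flush.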
type worker struct {
	ctx         context.Context
	changefeed  string
	workerCount int

	ID      int
	backend backend

	// Metrics.
	metricConflictDetectDuration prometheus.Observer
	metricQueueDuration          prometheus.Observer
	metricTxnWorkerFlushDuration prometheus.Observer
	metricTxnWorkerTotalDuration prometheus.Observer
	metricTxnWorkerHandledRows   prometheus.Counter

	// Fields only used in the background loop.
	flushInterval            time.Duration
	hasPending               bool
	postTxnExecutedCallbacks []func()
}

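// newWorker creates a worker bound to the given changefeed and backend.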
func newWorker(ctx context.Context, changefeedID model.ChangeFeedID,
	ID int, backend backend, workerCount int,
) *worker {
	wid := fmt.Sprintf("%d", ID)

	return &worker{
		ctx:         ctx,
		changefeed:  fmt.Sprintf("%s.%s", changefeedID.Namespace, changefeedID.ID),
		workerCount: workerCount,

		ID:      ID,
		backend: backend,

		metricConflictDetectDuration: txn.ConflictDetectDuration.WithLabelValues(changefeedID.Namespace, changefeedID.ID),
		metricQueueDuration:          txn.QueueDuration.WithLabelValues(changefeedID.Namespace, changefeedID.ID),
		metricTxnWorkerFlushDuration: txn.WorkerFlushDuration.WithLabelValues(changefeedID.Namespace, changefeedID.ID, wid),
		metricTxnWorkerTotalDuration: txn.WorkerTotalDuration.WithLabelValues(changefeedID.Namespace, changefeedID.ID, wid),
		metricTxnWorkerHandledRows:   txn.WorkerHandledRows.WithLabelValues(changefeedID.Namespace, changefeedID.ID, wid),

		flushInterval:            backend.MaxFlushInterval(),
		hasPending:               false,
		postTxnExecutedCallbacks: make([]func(), 0, 1024),
	}
}

// runLoop receives transactions from txnCh, batches them, and flushes them to the backend.
func (w *worker) runLoop(txnCh <-chan causality.TxnWithNotifier[*txnEvent]) error {
	defer func() {
		if err := w.backend.Close(); err != nil {
			log.Info("Transaction dmlSink backend close fail",
				zap.String("changefeedID", w.changefeed),
				zap.Int("workerID", w.ID),
				zap.Error(err))
		}
	}()
	log.Info("Transaction dmlSink worker starts",
		zap.String("changefeedID", w.changefeed),
		zap.Int("workerID", w.ID))

	start := time.Now()
	for {
		select {
		case <-w.ctx.Done():
			log.Info("Transaction dmlSink worker exits as canceled",
				zap.String("changefeedID", w.changefeed),
				zap.Int("workerID", w.ID))
			return nil
		case txn := <-txnCh:
			// Keep receiving from txnCh until there is no more data or the backend
			// reports that it should be flushed. If neither happens, wait up to
			// flushInterval before flushing, to avoid many small flushes.
			if txn.TxnEvent != nil {
				needFlush := w.onEvent(txn.TxnEvent, txn.PostTxnExecuted)
				if !needFlush {
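					// The timer bounds how long this batch can wait for more
					// transactions before it is flushed.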
					delay := time.NewTimer(w.flushInterval)
					for !needFlush {
						select {
						case txn := <-txnCh:
							needFlush = w.onEvent(txn.TxnEvent, txn.PostTxnExecuted)
						case <-delay.C:
							needFlush = true
						}
					}
					// Release resources promptly
					if !delay.Stop() {
						select {
						case <-delay.C:
						default:
						}
					}
				}
				// needFlush must be true here, so we can flush now.
				if err := w.doFlush(); err != nil {
					log.Error("Transaction dmlSink worker exits unexpectedly",
						zap.String("changefeedID", w.changefeed),
						zap.Int("workerID", w.ID),
						zap.Error(err))
					return err
				}
				// We record the total time to calculate the worker busy ratio.
				// The total time is recorded after flushing so that it and the
				// flush time are measured over the same window.
				w.metricTxnWorkerTotalDuration.Observe(time.Since(start).Seconds())
				start = time.Now()
			}
		}
	}
}

// onEvent is called when a new event is received.
// It returns true if the backend needs to be flushed.
func (w *worker) onEvent(txn *txnEvent, postTxnExecuted func()) bool {
	w.hasPending = true

	if txn.GetTableSinkState() != state.TableSinkSinking {
		// The table that the event belongs to is stopping, so it's safe
		// to drop the event directly.
		txn.Callback()
		// Still append the callback to the pending list so that the conflict
		// detector is notified on the next flush.
		w.postTxnExecutedCallbacks = append(w.postTxnExecutedCallbacks, postTxnExecuted)
		return false
	}

	w.metricConflictDetectDuration.Observe(txn.conflictResolved.Sub(txn.start).Seconds())
	w.metricQueueDuration.Observe(time.Since(txn.start).Seconds())
	w.metricTxnWorkerHandledRows.Add(float64(len(txn.Event.Rows)))
	w.postTxnExecutedCallbacks = append(w.postTxnExecutedCallbacks, postTxnExecuted)
	return w.backend.OnTxnEvent(txn.TxnCallbackableEvent)
}

// doFlush flushes the backend and, on success, invokes and clears the pending
// post-transaction callbacks.
func (w *worker) doFlush() error {
	if w.hasPending {
		start := time.Now()
		defer func() {
			w.metricTxnWorkerFlushDuration.Observe(time.Since(start).Seconds())
		}()
		if err := w.backend.Flush(w.ctx); err != nil {
			log.Warn("Transaction dmlSink backend flush fail",
				zap.String("changefeedID", w.changefeed),
				zap.Int("workerID", w.ID),
				zap.Error(err))
			return err
		}
		// Flushed successfully; call the callbacks to notify the conflict detector.
		for _, postTxnExecuted := range w.postTxnExecutedCallbacks {
			postTxnExecuted()
		}
		w.postTxnExecutedCallbacks = w.postTxnExecutedCallbacks[:0]
		if cap(w.postTxnExecutedCallbacks) > 1024 {
			// Shrink the buffer if it has grown too large.
			w.postTxnExecutedCallbacks = make([]func(), 0, 1024)
		}
	}

	w.hasPending = false
	return nil
}