github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/dml_worker.go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package syncer

import (
	"strings"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	"github.com/pingcap/tiflow/dm/pkg/utils"
	"github.com/pingcap/tiflow/dm/syncer/dbconn"
	"github.com/pingcap/tiflow/dm/syncer/metrics"
	"github.com/pingcap/tiflow/pkg/sqlmodel"
	"go.uber.org/zap"
)

// DMLWorker is used to sync DML jobs to the downstream.
type DMLWorker struct {
	compact       bool
	batch         int
	workerCount   int
	chanSize      int
	multipleRows  bool
	toDBConns     []*dbconn.DBConn
	syncCtx       *tcontext.Context
	logger        log.Logger
	metricProxies *metrics.Proxies

	// label values for metricProxies
	task   string
	source string
	worker string

	// callback funcs
	// TODO: refine callback funcs
	successFunc          func(int, int, []*job)
	fatalFunc            func(*job, error)
	lagFunc              func(*job, int)
	updateJobMetricsFunc func(bool, string, *job)

	// channels
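	// inCh receives DML and control (flush/asyncFlush/conflict) jobs from
	// the upstream pipeline; flushCh hands flush jobs back to the caller
	// once the executors have processed them, e.g. for checkpoint flushing.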
	inCh    chan *job
	flushCh chan *job
}

// dmlWorkerWrap creates and runs a DMLWorker instance and returns the flush job channel.
func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job {
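	// Sizing note: each DML queue gets a buffer of QueueSize/2; when the
	// compactor is enabled it buffers jobs ahead of this worker, so the
	// queues are halved again (a heuristic to bound total buffered jobs).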
	chanSize := syncer.cfg.QueueSize / 2
	if syncer.cfg.Compact {
		chanSize /= 2
	}
	dmlWorker := &DMLWorker{
		compact:              syncer.cfg.Compact,
		batch:                syncer.cfg.Batch,
		workerCount:          syncer.cfg.WorkerCount,
		chanSize:             chanSize,
		multipleRows:         syncer.cfg.MultipleRows,
		task:                 syncer.cfg.Name,
		source:               syncer.cfg.SourceID,
		worker:               syncer.cfg.WorkerName,
		logger:               syncer.tctx.Logger.WithFields(zap.String("component", "dml_worker")),
		successFunc:          syncer.successFunc,
		fatalFunc:            syncer.fatalFunc,
		lagFunc:              syncer.updateReplicationJobTS,
		updateJobMetricsFunc: syncer.updateJobMetrics,
		syncCtx:              syncer.syncCtx, // this ctx can be used to cancel all the workers
		metricProxies:        syncer.metricsProxies,
		toDBConns:            syncer.toDBConns,
		inCh:                 inCh,
		flushCh:              make(chan *job),
	}

	go func() {
		dmlWorker.run()
		dmlWorker.close()
	}()
	return dmlWorker.flushCh
}

// close closes the outer flush channel.
func (w *DMLWorker) close() {
	close(w.flushCh)
}

// run distributes jobs to the DML queues by queueBucket.
func (w *DMLWorker) run() {
	jobChs := make([]chan *job, w.workerCount)

	for i := 0; i < w.workerCount; i++ {
		jobChs[i] = make(chan *job, w.chanSize)
		go w.executeJobs(i, jobChs[i])
	}

	defer func() {
		for i := 0; i < w.workerCount; i++ {
			close(jobChs[i])
		}
	}()

	queueBucketMapping := make([]string, w.workerCount)
	for i := 0; i < w.workerCount; i++ {
		queueBucketMapping[i] = queueBucketName(i)
	}
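	// Dispatch protocol: flush is fanned out to every queue, waits for all
	// executors (flushWg), then is handed back on flushCh; asyncFlush is
	// forwarded on flushCh without waiting; conflict only waits, so that
	// conflicting DMLs are serialized, and is not forwarded.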
	for j := range w.inCh {
		w.metricProxies.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh)))
		switch j.tp {
		case flush:
			w.updateJobMetricsFunc(false, adminQueueName, j)
			w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping)
			j.flushWg.Wait()
			w.updateJobMetricsFunc(true, adminQueueName, j)
			w.flushCh <- j
		case asyncFlush:
			w.updateJobMetricsFunc(false, adminQueueName, j)
			w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping)
			w.flushCh <- j
		case conflict:
			w.updateJobMetricsFunc(false, adminQueueName, j)
			w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping)
			j.flushWg.Wait()
			w.updateJobMetricsFunc(true, adminQueueName, j)
		default:
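			// DML jobs are routed by a hash of their queue key (typically
			// the row identity), so changes to the same key always land in
			// the same bucket and execute in binlog order in one executor.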
			queueBucket := int(utils.GenHashKey(j.dmlQueueKey)) % w.workerCount
			w.updateJobMetricsFunc(false, queueBucketMapping[queueBucket], j)
			startTime := time.Now()
			w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dmlQueueKey))
			jobChs[queueBucket] <- j
			w.metricProxies.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds())
		}
	}
}

func (w *DMLWorker) sendJobToAllDmlQueue(j *job, jobChs []chan *job, queueBucketMapping []string) {
	// send the control job to every DML queue so each executor flushes its pending batch
	for i, jobCh := range jobChs {
		startTime := time.Now()
		jobCh <- j
		w.metricProxies.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[i], w.source).Observe(time.Since(startTime).Seconds())
	}
}

// executeJobs executes jobs in the same queueBucket.
// All the jobs received should be executed consecutively.
func (w *DMLWorker) executeJobs(queueID int, jobCh chan *job) {
	jobs := make([]*job, 0, w.batch)
	workerJobIdx := dmlWorkerJobIdx(queueID)
	queueBucket := queueBucketName(queueID)
	for j := range jobCh {
		w.metricProxies.QueueSizeGauge.WithLabelValues(w.task, queueBucket, w.source).Set(float64(len(jobCh)))

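		// Batching: plain DML jobs accumulate in `jobs`; the batch is
		// executed once it reaches w.batch, once the channel is drained,
		// or when a control job (flush/asyncFlush/conflict) arrives.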
		if j.tp != flush && j.tp != asyncFlush && j.tp != conflict {
			if len(jobs) == 0 {
				// set job TS when receiving the first job of this batch.
				w.lagFunc(j, workerJobIdx)
			}
			jobs = append(jobs, j)
			if len(jobs) < w.batch && len(jobCh) > 0 {
				continue
			}
		}

		failpoint.Inject("syncDMLBatchNotFull", func() {
			if len(jobCh) == 0 && len(jobs) < w.batch {
				w.logger.Info("execute not full job queue")
			}
		})

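		// Execute whatever has been batched; for control jobs, mark flushWg
		// done afterwards so the dispatcher (or the async flush waiter)
		// knows this executor has caught up.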
		w.executeBatchJobs(queueID, jobs)
		if j.tp == conflict || j.tp == flush || j.tp == asyncFlush {
			j.flushWg.Done()
		}

		jobs = jobs[0:0]
		if len(jobCh) == 0 {
			failpoint.Inject("noJobInQueueLog", func() {
				w.logger.Debug("no job in queue, update lag to zero", zap.Int(
					"workerJobIdx", workerJobIdx), zap.Int64("current ts", time.Now().Unix()))
			})
			w.lagFunc(nil, workerJobIdx)
		}
	}
}

// executeBatchJobs executes jobs in batches.
func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) {
	var (
		affect  int
		queries []string
		args    [][]interface{}
		db      = w.toDBConns[queueID]
		err     error
		dmls    = make([]*sqlmodel.RowChange, 0, len(jobs))
	)

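	// Error attribution: with one query per job, the failing job is looked
	// up as jobs[affect]. When the query and job counts diverge (safe-mode
	// or multi-row SQL), the failing job cannot be pinpointed, so a
	// synthetic job covering the batch's whole binlog range is reported.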
	defer func() {
		if err == nil {
			w.successFunc(queueID, len(dmls), jobs)
		} else {
			if len(queries) == len(jobs) {
				w.fatalFunc(jobs[affect], err)
			} else {
				w.logger.Warn("length of queries not equals length of jobs, cannot determine which job failed", zap.Int("queries", len(queries)), zap.Int("jobs", len(jobs)))
				newJob := job{
					startLocation:   jobs[0].startLocation,
					currentLocation: jobs[len(jobs)-1].currentLocation,
				}
				w.fatalFunc(&newJob, err)
			}
		}
	}()

	if len(jobs) == 0 {
		return
	}
	failpoint.Inject("failSecondJob", func() {
		if failExecuteSQLForTest && failOnceForTest.CAS(false, true) {
			w.logger.Info("trigger failSecondJob")
			err = terror.ErrDBExecuteFailed.Delegate(errors.New("failSecondJob"), "mock")
			failpoint.Return()
		}
	})

	queries, args = w.genSQLs(jobs)
	failpoint.Inject("BlockExecuteSQLs", func(v failpoint.Value) {
		t := v.(int) // sleep time
		w.logger.Info("BlockExecuteSQLs", zap.Any("job", jobs[0]), zap.Int("sleep time", t))
		for _, query := range queries {
			if strings.Contains(query, "UPDATE") && strings.Contains(query, "MetricsProxies") {
				t = 10
				w.logger.Info("BlockExecuteSQLs block for update sleep 10s for MetricsProxies it test", zap.Any("query", query))
			}
		}
		time.Sleep(time.Second * time.Duration(t))
	})
	failpoint.Inject("WaitUserCancel", func(v failpoint.Value) {
		t := v.(int)
		time.Sleep(time.Duration(t) * time.Second)
	})
	// Execute SQLs on the sync context so they can run to completion as much as possible,
	// with a timeout of maxDMLConnectionDuration to make sure DMLs can be replicated to the downstream even if the latency is high.
	// If users need to quit this ASAP, we can support `pause-task`/`stop-task --force` in the future.
	ctx, cancel := w.syncCtx.WithTimeout(maxDMLConnectionDuration)
	defer cancel()
	affect, err = db.ExecuteSQL(ctx, w.metricProxies, queries, args...)
	failpoint.Inject("SafeModeExit", func(val failpoint.Value) {
		if intVal, ok := val.(int); ok && intVal == 4 && len(jobs) > 0 {
			w.logger.Warn("fail to exec DML", zap.String("failpoint", "SafeModeExit"))
			affect, err = 0, terror.ErrDBExecuteFailed.Delegate(errors.New("SafeModeExit"), "mock")
		}
	})

	failpoint.Inject("ErrorOnLastDML", func(_ failpoint.Value) {
		if len(queries) > len(jobs) {
			w.logger.Error("error on last queries", zap.Int("queries", len(queries)), zap.Int("jobs", len(jobs)))
			affect, err = len(queries)-1, terror.ErrDBExecuteFailed.Delegate(errors.New("ErrorOnLastDML"), "mock")
		}
	})

	if w.judgeKeyNotFound(affect, jobs) {
		// throw an error if needed in the future.
		// err = terror.ErrDBExecuteFailed.Delegate(errors.New("key not found"), "mock")
		w.logger.Warn("no matching record is found to update/delete, ER_KEY_NOT_FOUND", zap.Int("affect", affect), zap.Int("jobs", len(jobs)), zap.Stringer("start from", jobs[0].startLocation), zap.Stringer("end at", jobs[len(jobs)-1].currentLocation))
	}
}

// genSQLs generates SQLs in single-row mode or multiple-rows mode.
func (w *DMLWorker) genSQLs(jobs []*job) ([]string, [][]interface{}) {
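	// In multiple-rows mode, genDMLsWithSameOp merges adjacent row changes
	// of the same type into multi-row statements (e.g. INSERT ... VALUES
	// (...), (...)), so the query count may no longer match the job count.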
	if w.multipleRows {
		return genDMLsWithSameOp(jobs)
	}

	queries := make([]string, 0, len(jobs))
	args := make([][]interface{}, 0, len(jobs))
	for _, j := range jobs {
		var query string
		var arg []interface{}
		appendQueryAndArg := func() {
			queries = append(queries, query)
			args = append(args, arg)
		}

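		// Safe mode rewrites DML to be idempotent, so re-applied binlog
		// events (e.g. after resuming from a checkpoint) cannot fail:
		// INSERT becomes REPLACE, and UPDATE becomes DELETE + REPLACE.
		// For illustration, a safe-mode update yields roughly:
		//	DELETE FROM `db`.`tbl` WHERE `id` = ?
		//	REPLACE INTO `db`.`tbl` (`id`,`name`) VALUES (?,?)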
		switch j.dml.Type() {
		case sqlmodel.RowChangeInsert:
			if j.safeMode {
				query, arg = j.dml.GenSQL(sqlmodel.DMLReplace)
			} else {
				query, arg = j.dml.GenSQL(sqlmodel.DMLInsert)
			}

		case sqlmodel.RowChangeUpdate:
			if j.safeMode {
				query, arg = j.dml.GenSQL(sqlmodel.DMLDelete)
				appendQueryAndArg()
				query, arg = j.dml.GenSQL(sqlmodel.DMLReplace)
			} else {
				query, arg = j.dml.GenSQL(sqlmodel.DMLUpdate)
			}

		case sqlmodel.RowChangeDelete:
			query, arg = j.dml.GenSQL(sqlmodel.DMLDelete)
		}

		appendQueryAndArg()
	}
	return queries, args
}

func (w *DMLWorker) judgeKeyNotFound(affect int, jobs []*job) bool {
	// TODO: support compact and multiple rows.
	// In compact mode, we need to calculate the expected affected rows based on the compacted jobs,
	// while in multiple-rows mode, we need to calculate the affected rows based on the SQL type.
	if w.compact || w.multipleRows {
		return false
	}
	for _, j := range jobs {
		if j.safeMode {
			return false
		}
	}
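	// Without compaction, multi-row statements, or safe mode, each job maps
	// to exactly one statement that should affect exactly one row, so a
	// smaller total means some UPDATE/DELETE matched no existing row.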
	return affect < len(jobs)
}