// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package syncer

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/go-mysql-org/go-mysql/mysql"
	"github.com/go-mysql-org/go-mysql/replication"
	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/tidb/pkg/parser/model"
	"github.com/pingcap/tidb/pkg/util/filter"
	cdcmodel "github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/dm/config"
	"github.com/pingcap/tiflow/dm/config/dbconfig"
	"github.com/pingcap/tiflow/dm/pb"
	"github.com/pingcap/tiflow/dm/pkg/binlog"
	"github.com/pingcap/tiflow/dm/pkg/conn"
	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
	"github.com/pingcap/tiflow/dm/pkg/gtid"
	"github.com/pingcap/tiflow/dm/pkg/log"
	"github.com/pingcap/tiflow/dm/pkg/schema"
	"github.com/pingcap/tiflow/dm/pkg/terror"
	"github.com/pingcap/tiflow/dm/pkg/utils"
	"github.com/pingcap/tiflow/dm/relay"
	"github.com/pingcap/tiflow/dm/syncer/binlogstream"
	"github.com/pingcap/tiflow/dm/syncer/dbconn"
	"github.com/pingcap/tiflow/dm/syncer/metrics"
	"github.com/pingcap/tiflow/dm/unit"
	"github.com/pingcap/tiflow/pkg/sqlmodel"
	"go.uber.org/atomic"
	"go.uber.org/zap"
)

const (
	validatorStatusInterval = time.Minute

	moreColumnInBinlogMsg            = "binlog has more columns than current table"
	tableWithoutPrimaryKeyMsg        = "no primary key"
	tableNotSyncedOrDropped          = "table is not synced or dropped"
	downstreamPKColumnOutOfBoundsMsg = "primary key column of downstream table out of range of binlog event row"
)

type validateTableInfo struct {
	targetTable         *filter.Table
	srcTableInfo        *model.TableInfo
	downstreamTableInfo *schema.DownstreamTableInfo

	message string
}

type rowChangeJobType int

func (r rowChangeJobType) String() string {
	switch r {
	case rowInsert:
		return "row-insert"
	case rowUpdated:
		return "row-update"
	case rowDeleted:
		return "row-delete"
	case flushCheckpoint:
		return "flush"
	default:
		return "unknown"
	}
}

const (
	rowInsert rowChangeJobType = iota
	rowUpdated
	rowDeleted
	flushCheckpoint

	rowChangeTypeCount  = 3
	errorStateTypeCount = 4 // pb.ValidateErrorState_*

	validatorDmctlOpTimeout = 5 * time.Second
)

// to make unit tests easier, we define it as a var so we can change it.
var markErrorRowDelay = config.DefaultValidatorRowErrorDelay

// tableChangeJob is the pending change of one table.
// binlog changes are clustered into table changes, and the validator
// validates changes at table granularity, one table at a time.
type tableChangeJob struct {
	jobs map[string]*rowValidationJob
}

func newTableChangeJob() *tableChangeJob {
	return &tableChangeJob{jobs: make(map[string]*rowValidationJob)}
}

// addOrUpdate returns true if the job is a newly added row job.
func (tc *tableChangeJob) addOrUpdate(job *rowValidationJob) bool {
	if val, ok := tc.jobs[job.Key]; ok {
		val.row = job.row
		val.size = job.size
		val.Tp = job.Tp
		val.FirstValidateTS = 0
		val.FailedCnt = 0 // clear failed count
		return false
	}
	tc.jobs[job.Key] = job
	return true
}
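
// A quick sketch of the merge semantics (illustrative, not from the original
// source): if an insert job for a key is pending and an update for the same
// row arrives, addOrUpdate overwrites the pending job in place and resets its
// failure bookkeeping instead of queueing a second job:
//
//	tc := newTableChangeJob()
//	tc.addOrUpdate(&rowValidationJob{Key: "1\tfoo", Tp: rowInsert})  // true: newly added
//	tc.addOrUpdate(&rowValidationJob{Key: "1\tfoo", Tp: rowUpdated}) // false: merged in place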

// rowValidationJob is the change of a single row.
type rowValidationJob struct {
	Key string
	Tp  rowChangeJobType
	row *sqlmodel.RowChange

	// estimated memory size taken by this row; for now we use the binlog size
	// of the row to estimate it. the memory taken by a row change job is
	// larger than this size.
	size int32
	wg   *sync.WaitGroup
	// timestamp of the first validation of this row; reset when row changes are merged.
	// it's also reset when the job is loaded from meta, in case the validator stopped
	// for a long time and those failed row changes would be marked as error rows immediately.
	FirstValidateTS int64
	FailedCnt       int
}

type tableValidateStatus struct {
	source  filter.Table
	target  filter.Table
	stage   pb.Stage // either Running or Stopped
	message string
}

func (vs *tableValidateStatus) String() string {
	return fmt.Sprintf("source=%s, target=%s, stage=%s, message=%s",
		vs.source, vs.target, vs.stage, vs.message)
}

func (vs *tableValidateStatus) stopped(msg string) {
	vs.stage = pb.Stage_Stopped
	vs.message = msg
}

// DataValidator is used to continuously validate incremental data migrated to downstream by dm.
// the validator can be started when there's a syncer unit in the subtask and the validation
// mode is not none; it's terminated when the subtask is terminated.
// the stage of the validator is independent of the subtask: pausing/resuming the subtask
// doesn't affect the stage of the validator.
//
// the validator can be in the running or stopped stage:
// - running when it's started with the subtask or started later on the fly.
// - stopped when "validation stop" is executed.
//
// for each subtask, before it's closed/killed, only one DataValidator object is created;
// "dmctl validation stop/start" calls Stop and Start on the same object.
type DataValidator struct {
	// used to sync Stop and Start operations.
	sync.RWMutex

	cfg    *config.SubTaskConfig
	syncer *Syncer
	// whether the validator starts together with the subtask
	startWithSubtask bool

	wg           sync.WaitGroup
	errProcessWg sync.WaitGroup
	errChan      chan error
	ctx          context.Context
	cancel       context.CancelFunc
	tctx         *tcontext.Context

	L                  log.Logger
	fromDB             *conn.BaseDB
	toDB               *conn.BaseDB
	upstreamTZ         *time.Location
	timezone           *time.Location
	syncCfg            replication.BinlogSyncerConfig
	streamerController *binlogstream.StreamerController
	persistHelper      *validatorPersistHelper

	validateInterval time.Duration
	checkInterval    time.Duration
	cutOverLocation  atomic.Pointer[binlog.Location]

	workers   []*validateWorker
	workerCnt int

	// whether we have started to mark failed rows as error rows.
	// if it's false, we don't mark failed row changes as errors, to reduce false positives.
	// it's set to true once the validator has reached the progress of the syncer, or after markErrorRowDelay.
	markErrorStarted atomic.Bool

	// the fields below are guarded by stateMutex
	stateMutex  sync.RWMutex
	stage       pb.Stage // only Running or Stopped is allowed for validator
	flushedLoc  *binlog.Location
	result      pb.ProcessResult
	tableStatus map[string]*tableValidateStatus

	processedRowCounts   []atomic.Int64 // all processed row counts since the validator started
	pendingRowCounts     []atomic.Int64
	newErrorRowCount     atomic.Int64
	processedBinlogSize  atomic.Int64
	pendingRowSize       atomic.Int64 // accumulation of rowValidationJob.size
	lastFlushTime        time.Time
	location             *binlog.Location
	loadedPendingChanges map[string]*tableChangeJob

	vmetric *metrics.ValidatorMetrics
}

func NewContinuousDataValidator(cfg *config.SubTaskConfig, syncerObj *Syncer, startWithSubtask bool) *DataValidator {
	v := &DataValidator{
		cfg:              cfg,
		syncer:           syncerObj,
		startWithSubtask: startWithSubtask,
		vmetric:          metrics.NewValidatorMetrics(cfg.Name, cfg.SourceID),
	}
	v.L = log.With(zap.String("task", cfg.Name), zap.String("unit", "continuous validator"))

	v.setStage(pb.Stage_Stopped)
	v.workerCnt = cfg.ValidatorCfg.WorkerCount
	v.processedRowCounts = make([]atomic.Int64, rowChangeTypeCount)
	v.validateInterval = cfg.ValidatorCfg.ValidateInterval.Duration
	v.checkInterval = cfg.ValidatorCfg.CheckInterval.Duration
	v.persistHelper = newValidatorCheckpointHelper(v)
	v.pendingRowCounts = make([]atomic.Int64, rowChangeTypeCount)

	return v
}
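
// A minimal lifecycle sketch (illustrative only; the real wiring lives in the
// subtask code): the same object is reused across "dmctl validation
// stop/start" cycles.
//
//	v := NewContinuousDataValidator(cfg, syncerObj, false)
//	v.Start(pb.Stage_Running) // spawns doValidate and the helper goroutines
//	// ...
//	v.Stop() // waits for the goroutines and removes the task's metric labels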

// reset state on start/restart.
func (v *DataValidator) reset() {
	v.errChan = make(chan error, 10)
	v.workers = []*validateWorker{}

	v.markErrorStarted.Store(false)
	v.resetResult()
	for i := range v.processedRowCounts {
		v.processedRowCounts[i].Store(0)
	}
	for i := range v.pendingRowCounts {
		v.pendingRowCounts[i].Store(0)
	}
	v.newErrorRowCount.Store(0)
	v.processedBinlogSize.Store(0)
	v.pendingRowSize.Store(0)
	v.initTableStatus(map[string]*tableValidateStatus{})
}

func (v *DataValidator) initialize() error {
	v.ctx, v.cancel = context.WithCancel(context.Background())
	v.tctx = tcontext.NewContext(v.ctx, v.L)
	v.reset()

	newCtx, cancelFunc := v.tctx.WithTimeout(unit.DefaultInitTimeout)
	defer cancelFunc()

	var err error
	defer func() {
		if err == nil {
			return
		}
		dbconn.CloseBaseDB(newCtx, v.fromDB)
		dbconn.CloseBaseDB(newCtx, v.toDB)
		v.cancel()
	}()

	dbCfg := v.cfg.From
	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDMLConnectionTimeout).SetMaxIdleConns(1)
	v.fromDB, err = conn.GetUpstreamDB(&dbCfg)
	if err != nil {
		return err
	}

	dbCfg = v.cfg.To
	// worker count + checkpoint connection; other concurrent accesses can create connections on the fly
	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDMLConnectionTimeout).SetMaxIdleConns(v.workerCnt + 1)
	v.toDB, err = conn.GetDownstreamDB(&dbCfg)
	if err != nil {
		return err
	}

	if err = v.persistHelper.init(newCtx); err != nil {
		return err
	}

	var defaultUpstreamTZ string
	failpoint.Inject("ValidatorMockUpstreamTZ", func() {
		defaultUpstreamTZ = "UTC"
	})
	v.upstreamTZ, _, err = str2TimezoneOrFromDB(newCtx, defaultUpstreamTZ, conn.UpstreamDBConfig(&v.cfg.From))
	if err != nil {
		return err
	}
	v.timezone, _, err = str2TimezoneOrFromDB(newCtx, v.cfg.Timezone, conn.DownstreamDBConfig(&v.cfg.To))
	if err != nil {
		return err
	}

	v.syncCfg, err = subtaskCfg2BinlogSyncerCfg(v.cfg, v.timezone, v.syncer.baList)
	if err != nil {
		return err
	}

	v.streamerController = binlogstream.NewStreamerController(
		v.syncCfg,
		v.cfg.EnableGTID,
		&dbconn.UpStreamConn{BaseDB: v.fromDB},
		v.cfg.RelayDir,
		v.timezone,
		nil,
		v.L,
	)
	return nil
}

func (v *DataValidator) routineWrapper(fn func()) {
	defer func() {
		if err := recover(); err != nil {
			v.L.Error("panic", zap.Any("err", err))
			v.sendError(terror.ErrValidatorPanic.Generate(err))
		}
	}()

	fn()
}

func (v *DataValidator) Start(expect pb.Stage) {
	v.Lock()
	defer v.Unlock()

	v.L.Info("starting", zap.Any("cfg", v.cfg.ValidatorCfg),
		zap.String("start-time", v.cfg.ValidatorCfg.StartTime),
		zap.Bool("start with subtask", v.startWithSubtask),
		zap.Any("expect", expect))
	if v.Stage() == pb.Stage_Running {
		v.L.Info("already started")
		return
	}

	if expect != pb.Stage_Running {
		v.L.Info("expect stage is not running", zap.Any("expect", expect))
		return
	}

	if err := v.initialize(); err != nil {
		v.fillResult(err)
		return
	}

	v.wg.Add(1)
	go v.routineWrapper(v.doValidate)

	v.wg.Add(1)
	go v.routineWrapper(v.printStatusRoutine)

	v.wg.Add(1)
	go utils.GoLogWrapper(v.L, v.markErrorStartedRoutine)

	// routineWrapper relies on errorProcessRoutine to handle panic errors,
	// so errorProcessRoutine itself is wrapped with the plain log wrapper.
	v.errProcessWg.Add(1)
	go utils.GoLogWrapper(v.L, v.errorProcessRoutine)

	v.setStage(pb.Stage_Running)
	v.L.Info("started")
}

func (v *DataValidator) markErrorStartedRoutine() {
	defer v.wg.Done()

	select {
	case <-v.ctx.Done():
	case <-time.After(markErrorRowDelay):
		if !v.markErrorStarted.Load() {
			v.L.Info("mark markErrorStarted=true after error row delay")
			v.markErrorStarted.Store(true)
		}
	}
}
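
// note: markErrorStarted has two triggers: the validator catching up with the
// syncer (see waitSyncerSynced) or the fallback delay above elapsing,
// whichever happens first.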

func (v *DataValidator) printStatusRoutine() {
	defer v.wg.Done()
	var (
		prevProcessedBinlogSize = v.processedBinlogSize.Load()
		prevTime                = time.Now()
	)
	for {
		select {
		case <-v.ctx.Done():
			return
		case <-time.After(validatorStatusInterval):
			processed := v.getProcessedRowCounts()
			pending := []int64{
				v.pendingRowCounts[rowInsert].Load(),
				v.pendingRowCounts[rowUpdated].Load(),
				v.pendingRowCounts[rowDeleted].Load(),
			}
			currProcessedBinlogSize := v.processedBinlogSize.Load()
			currTime := time.Now()
			interval := time.Since(prevTime)
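			// the processed byte delta is shifted right by 20 bits (bytes -> MiB)
			// before dividing by the elapsed seconds.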
			speed := float64((currProcessedBinlogSize-prevProcessedBinlogSize)>>20) / interval.Seconds()
			prevProcessedBinlogSize = currProcessedBinlogSize
			prevTime = currTime
			counts, err := v.getErrorRowCount(validatorDmctlOpTimeout)
			if err == nil {
				v.vmetric.ErrorCount.Set(float64(counts[pb.ValidateErrorState_NewErr]))
			} else {
				v.L.Warn("failed to get error row count", zap.Error(err))
			}
			v.L.Info("validator status",
				zap.Int64s("processed(i, u, d)", processed),
				zap.Int64s("pending(i, u, d)", pending),
				zap.Int64("new error rows(not flushed)", v.newErrorRowCount.Load()),
				zap.String("binlog process speed", fmt.Sprintf("%.2f MB/s", speed)),
			)
		}
	}
}

func (v *DataValidator) fillResult(err error) {
	// when we meet a non-retryable error we call stopInner, which cancels v.ctx;
	// don't set IsCanceled in this case
	isCanceled := false
	if v.getResultErrCnt() == 0 {
		select {
		case <-v.ctx.Done():
			isCanceled = true
		default:
		}
	}

	var processErr *pb.ProcessError
	if utils.IsContextCanceledError(err) {
		v.L.Info("filter out context cancelled error", log.ShortError(err))
	} else {
		v.L.Error("error during validation", zap.Error(err))
		processErr = unit.NewProcessError(err)
	}
	v.addResultError(processErr, isCanceled)
}

func (v *DataValidator) errorProcessRoutine() {
	defer v.errProcessWg.Done()

	var (
		stopped bool
		wg      sync.WaitGroup
	)

	for err := range v.errChan {
		v.fillResult(err)

		if errors.Cause(err) != context.Canceled && !stopped {
			stopped = true
			wg.Add(1)
			go func() {
				defer wg.Done()
				v.stopInner()
			}()
		}
	}
	wg.Wait()
}

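// waitSyncerSynced blocks until the syncer's flushed global checkpoint has
// reached currLoc, polling every checkInterval. the first time the validator
// catches up with the syncer it also flips markErrorStarted, so failed rows
// can be marked as error rows from then on.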
func (v *DataValidator) waitSyncerSynced(currLoc binlog.Location) error {
	syncLoc := v.syncer.getFlushedGlobalPoint()
	cmp := binlog.CompareLocation(currLoc, syncLoc, v.cfg.EnableGTID)
	if cmp >= 0 && !v.markErrorStarted.Load() {
		v.markErrorStarted.Store(true)
		v.L.Info("validator progress reached syncer")
	}
	if cmp <= 0 {
		return nil
	}

	for {
		select {
		case <-v.ctx.Done():
			return v.ctx.Err()
		case <-time.After(v.checkInterval):
			syncLoc = v.syncer.getFlushedGlobalPoint()
			cmp = binlog.CompareLocation(currLoc, syncLoc, v.cfg.EnableGTID)
			if cmp <= 0 {
				return nil
			}
			v.L.Debug("wait syncer synced", zap.Reflect("loc", currLoc))
		}
	}
}

func (v *DataValidator) updateValidatorBinlogMetric(currLoc binlog.Location) {
	v.vmetric.BinlogPos.Set(float64(currLoc.Position.Pos))
	index, err := utils.GetFilenameIndex(currLoc.Position.Name)
	if err != nil {
		v.L.Warn("fail to record validator binlog file index", zap.Error(err))
	} else {
		v.vmetric.BinlogFile.Set(float64(index))
	}
}

func (v *DataValidator) updateValidatorBinlogLag(currLoc binlog.Location) {
	syncerLoc := v.syncer.getFlushedGlobalPoint()
	index, err := utils.GetFilenameIndex(currLoc.Position.Name)
	if err != nil {
		v.L.Warn("fail to record validator binlog file index", zap.Error(err))
	}
	if syncerLoc.Position.Name == currLoc.Position.Name {
		// same file: record the log pos latency
		v.vmetric.LogPosLatency.Set(float64(syncerLoc.Position.Pos - currLoc.Position.Pos))
		v.vmetric.LogFileLatency.Set(float64(0))
	} else {
		var syncerLogIdx int64
		v.vmetric.LogPosLatency.Set(float64(0))
		syncerLogIdx, err = utils.GetFilenameIndex(syncerLoc.Position.Name)
		if err == nil {
			v.vmetric.LogFileLatency.Set(float64(syncerLogIdx - index))
		} else {
			v.vmetric.LogFileLatency.Set(float64(0))
			v.L.Warn("fail to get syncer's log file index", zap.Error(err))
		}
	}
}

func (v *DataValidator) waitSyncerRunning() error {
	if v.syncer.IsRunning() {
		return nil
	}
	v.L.Info("wait until syncer running")
	for {
		select {
		case <-v.ctx.Done():
			return v.ctx.Err()
		case <-time.After(v.checkInterval):
			if v.syncer.IsRunning() {
				v.L.Info("syncer is running, wait finished")
				return nil
			}
		}
	}
}

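// getInitialBinlogPosition picks the start location by precedence: the
// configured StartTime (resolved to a position via the remote binlog pos
// finder), then the syncer's initial executed location when starting together
// with the subtask, then the syncer's flushed global checkpoint.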
func (v *DataValidator) getInitialBinlogPosition() (binlog.Location, error) {
	var location binlog.Location
	timeStr := v.cfg.ValidatorCfg.StartTime
	switch {
	case timeStr != "":
		// already checked when it was set, no need to check it again
		t, _ := utils.ParseStartTimeInLoc(timeStr, v.upstreamTZ)
		finder := binlog.NewRemoteBinlogPosFinder(v.tctx, v.fromDB, v.syncCfg, v.cfg.EnableGTID)
		loc, posTp, err := finder.FindByTimestamp(t.Unix())
		if err != nil {
			v.L.Error("fail to find binlog position by timestamp",
				zap.Time("time", t), zap.Error(err))
			return location, err
		}
		v.L.Info("find binlog pos by timestamp", zap.String("time", timeStr),
			zap.Any("loc", loc), zap.Stringer("pos type", posTp))

		if posTp == binlog.AboveUpperBoundBinlogPos {
			return location, terror.ErrConfigStartTimeTooLate.Generate(timeStr)
		}
		location = *loc
		v.L.Info("do validate from timestamp", zap.Any("loc", location))
	case v.startWithSubtask:
		// in an extreme case, this loc may still not be the first binlog location of this task:
		//   the syncer synced some binlog and flushed its checkpoint, but the validator had no
		//   chance to run before a fail-over
		location = v.syncer.getInitExecutedLoc()
		v.L.Info("do validate from init executed loc of syncer", zap.Any("loc", location))
	default:
		location = v.syncer.getFlushedGlobalPoint()
		v.L.Info("do validate from current loc of syncer", zap.Any("loc", location))
	}
	return location, nil
}

// doValidate: runs in a separate goroutine.
func (v *DataValidator) doValidate() {
	defer v.wg.Done()

	if err := v.waitSyncerRunning(); err != nil {
		// no need to wrap it in the error list, since err can only be context.Canceled.
		v.sendError(err)
		return
	}

	if err := v.loadPersistedData(); err != nil {
		v.sendError(terror.ErrValidatorLoadPersistedData.Delegate(err))
		return
	}

	var location binlog.Location
	if v.location != nil {
		location = *v.location
		v.L.Info("do validate from checkpoint", zap.Any("loc", location))
	} else {
		// validator always uses remote binlog streamer now.
		var err error
		location, err = v.getInitialBinlogPosition()
		if err != nil {
			v.sendError(err)
			return
		}
		// when relay log is enabled, the binlog name may contain a uuid suffix, so we need to extract the real name
		location.Position.Name = utils.ExtractRealName(location.Position.Name)
		// persist the current location to make sure we start from the same location
		// if a fail-over happens before we flush the checkpoint and data.
		err = v.persistHelper.persist(v.tctx, location)
		if err != nil {
			v.sendError(terror.ErrValidatorPersistData.Delegate(err))
			return
		}
	}
	// for tests: some fields in streamerController are mocked, so we cannot call Start
	if v.streamerController.IsClosed() {
		err := v.streamerController.Start(v.tctx, location)
		if err != nil {
			v.sendError(terror.Annotate(err, "fail to start streamer controller"))
			return
		}
	}

	v.startValidateWorkers()
	defer func() {
		for _, worker := range v.workers {
			worker.close()
		}
	}()

	// we don't flush checkpoint&data on exit, since the checkpoint and pending data may not correspond with each other.
	locationForFlush := location.CloneWithFlavor(v.cfg.Flavor)
	v.lastFlushTime = time.Now()
	for {
		e, _, err := v.streamerController.GetEvent(v.tctx)
		if err != nil {
			switch {
			case err == context.Canceled:
				return
			case err == context.DeadlineExceeded:
				v.L.Info("deadline exceeded when fetching binlog event")
				continue
			case isDuplicateServerIDError(err):
				// if the server id is already used, we need to use a new server id
				v.L.Info("server id is already used by another slave, will change to a new server id and get event again")
				err1 := v.streamerController.UpdateServerIDAndResetReplication(v.tctx, locationForFlush)
				if err1 != nil {
					v.sendError(terror.Annotate(err1, "fail to UpdateServerIDAndResetReplication"))
					return
				}
				continue
			case err == relay.ErrorMaybeDuplicateEvent:
				continue
			case isConnectionRefusedError(err):
				v.sendError(terror.ErrValidatorGetEvent.Delegate(err))
				return
			default:
				if v.streamerController.CanRetry(err) {
					err = v.streamerController.ResetReplicationSyncer(v.tctx, locationForFlush)
					if err != nil {
						v.sendError(terror.Annotate(err, "fail to reset replication"))
						return
					}
					continue
				}
				v.sendError(terror.ErrValidatorGetEvent.Delegate(err))
				return
			}
		}

		currEndLoc := v.streamerController.GetCurEndLocation()
		locationForFlush = v.streamerController.GetTxnEndLocation()

		// wait until the syncer has synced the current event
		err = v.waitSyncerSynced(currEndLoc)
		if err != nil {
			// no need to wrap it in the error list, since err can only be context.Canceled.
			v.sendError(err)
			return
		}
		failpoint.Inject("mockValidatorDelay", func(val failpoint.Value) {
			if sec, ok := val.(int); ok {
				v.L.Info("mock validator delay", zap.Int("second", sec))
				time.Sleep(time.Duration(sec) * time.Second)
			}
		})
		// update validator metrics
		v.updateValidatorBinlogMetric(currEndLoc)
		v.updateValidatorBinlogLag(currEndLoc)
		v.processedBinlogSize.Add(int64(e.Header.EventSize))

		switch ev := e.Event.(type) {
		case *replication.RowsEvent:
			if err = v.processRowsEvent(e.Header, ev); err != nil {
				v.L.Warn("failed to process event", zap.Reflect("error", err))
				v.sendError(terror.ErrValidatorProcessRowEvent.Delegate(err))
				return
			}
		case *replication.XIDEvent:
			if err = v.checkAndPersistCheckpointAndData(locationForFlush); err != nil {
				v.sendError(terror.ErrValidatorPersistData.Delegate(err))
				return
			}
		case *replication.QueryEvent:
			if err = v.checkAndPersistCheckpointAndData(locationForFlush); err != nil {
				v.sendError(terror.ErrValidatorPersistData.Delegate(err))
				return
			}
		case *replication.GenericEvent:
			if e.Header.EventType == replication.HEARTBEAT_EVENT {
				if err = v.checkAndPersistCheckpointAndData(locationForFlush); err != nil {
					v.sendError(terror.ErrValidatorPersistData.Delegate(err))
					return
				}
			}
		}
	}
}

func (v *DataValidator) Stop() {
	v.stopInner()
	v.errProcessWg.Wait()
	metrics.RemoveValidatorLabelValuesWithTask(v.cfg.Name)
}

func (v *DataValidator) stopInner() {
	v.Lock()
	defer v.Unlock()
	v.L.Info("stopping")
	if v.Stage() != pb.Stage_Running {
		v.L.Warn("not started")
		return
	}

	v.cancel()
	v.streamerController.Close()
	v.fromDB.Close()
	v.toDB.Close()

	v.wg.Wait()
	// we want to record all errors, so we need to wait for all error sender goroutines to stop
	// before closing this error chan.
	close(v.errChan)

	v.setStage(pb.Stage_Stopped)
	v.L.Info("stopped")
}

func (v *DataValidator) startValidateWorkers() {
	v.wg.Add(v.workerCnt)
	v.workers = make([]*validateWorker, v.workerCnt)
	for i := 0; i < v.workerCnt; i++ {
		worker := newValidateWorker(v, i)
		v.workers[i] = worker
		// the worker handles panics in validateTableChange so they show up in `dmctl validation status`;
		// other panics are just logged.
		go utils.GoLogWrapper(v.L, func() {
			defer v.wg.Done()
			worker.run()
		})
	}

	for _, tblChange := range v.loadedPendingChanges {
		for key, row := range tblChange.jobs {
			v.dispatchRowChange(key, row)
		}
	}
}

func (v *DataValidator) dispatchRowChange(key string, row *rowValidationJob) {
	hashVal := int(utils.GenHashKey(key)) % v.workerCnt
	v.workers[hashVal].rowChangeCh <- row

	v.L.Debug("dispatch row change job", zap.Any("table", row.row.GetSourceTable()),
		zap.Stringer("type", row.Tp), zap.String("key", key), zap.Int("worker id", hashVal))
}
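
// note that dispatch is keyed on the row key rather than the table, so all
// changes of the same row always land on the same worker and per-row ordering
// is preserved even with workerCnt > 1. a sketch of the routing (the key value
// here is hypothetical):
//
//	workerID := int(utils.GenHashKey("1\tfoo")) % v.workerCnt // stable per key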

func (v *DataValidator) genValidateTableInfo(sourceTable *filter.Table, columnCount int) (*validateTableInfo, error) {
	targetTable := v.syncer.route(sourceTable)
	// there are 2 cases where the tracker may drop a table:
	// 1. on checkpoint rollback, the tracker may recreate tables and drop non-needed ones
	// 2. on operate-schema set/remove
	// in case 1, we add another layer of synchronization to make sure we don't get a dropped table during recreation;
	// 	non-needed tables are not validated.
	// in case 2, the validator should be paused
	res := &validateTableInfo{targetTable: targetTable}
	var (
		tableInfo *model.TableInfo
		err       error
	)
	tableInfo, err = v.syncer.getTrackedTableInfo(sourceTable)
	if err != nil {
		switch {
		case schema.IsTableNotExists(err):
			// not a table that needs to be synced
			res.message = tableNotSyncedOrDropped
			return res, nil
		case terror.ErrSchemaTrackerIsClosed.Equal(err):
			// the schema tracker is closed;
			// try to get the table schema from the checkpoint
			tableInfo = v.syncer.getTableInfoFromCheckpoint(sourceTable)
			if tableInfo == nil {
				// getting the table schema from the checkpoint failed
				return res, errors.Annotate(err, "fail to get table info from checkpoint")
			}
		default:
			return res, err
		}
	}
	if len(tableInfo.Columns) < columnCount {
		res.message = moreColumnInBinlogMsg
		return res, nil
	}

	tableID := utils.GenTableID(targetTable)
	downstreamTableInfo, err := v.syncer.getDownStreamTableInfo(v.tctx, tableID, tableInfo)
	if err != nil {
		// todo: it might be a connection error (return the error), or the downstream table might not exist (set state to stopped).
		return res, err
	}
	pk := downstreamTableInfo.WhereHandle.UniqueNotNullIdx
	if pk == nil {
		res.message = tableWithoutPrimaryKeyMsg
		return res, nil
	}
	// offsets of pk columns are adjusted using the source table info; they should stay in range of ev.ColumnCount.
	for _, col := range pk.Columns {
		if col.Offset >= columnCount {
			res.message = downstreamPKColumnOutOfBoundsMsg
			return res, nil
		}
	}
	// if the current table info has more columns, clone it and strip the extra columns
	if len(tableInfo.Columns) > columnCount {
		tableInfo = tableInfo.Clone()
		tableInfo.Columns = tableInfo.Columns[:columnCount]
	}

	res.srcTableInfo = tableInfo
	res.downstreamTableInfo = downstreamTableInfo
	return res, nil
}

func (v *DataValidator) processRowsEvent(header *replication.EventHeader, ev *replication.RowsEvent) error {
	sourceTable := &filter.Table{
		Schema: string(ev.Table.Schema),
		Name:   string(ev.Table.Table),
	}

	failpoint.Inject("ValidatorPanic", func() {})

	if err := checkLogColumns(ev.SkippedColumns); err != nil {
		return terror.Annotate(err, sourceTable.String())
	}

	needSkip, err := v.syncer.skipRowsEvent(sourceTable, header.EventType)
	if err != nil {
		return err
	}
	if needSkip {
		return nil
	}

	fullTableName := sourceTable.String()
	state, ok := v.getTableStatus(fullTableName)
	if ok && state.stage == pb.Stage_Stopped {
		return nil
	}

	validateTbl, err := v.genValidateTableInfo(sourceTable, int(ev.ColumnCount))
	if err != nil {
		return terror.Annotate(err, "failed to get table info")
	}

	targetTable := validateTbl.targetTable
	if state == nil {
		state = &tableValidateStatus{
			source: *sourceTable,
			target: *targetTable,
			stage:  pb.Stage_Running,
		}

		v.L.Info("put table status", zap.Stringer("state", state))
		v.putTableStatus(fullTableName, state)
	}
	if validateTbl.message != "" {
		v.L.Warn("stop validating table", zap.String("table", sourceTable.String()),
			zap.String("reason", validateTbl.message))
		state.stopped(validateTbl.message)
		return nil
	}

	tableInfo, downstreamTableInfo := validateTbl.srcTableInfo, validateTbl.downstreamTableInfo

	changeType := getRowChangeType(header.EventType)

	step := 1
	if changeType == rowUpdated {
		step = 2
	}
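	// for update events, ev.Rows holds before/after image pairs back to back,
	// e.g. [before1, after1, before2, after2, ...], hence the step of 2 above.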
	estimatedRowSize := int32(header.EventSize) / int32(len(ev.Rows))
	for i := 0; i < len(ev.Rows); i += step {
		var beforeImage, afterImage []interface{}
		switch changeType {
		case rowInsert:
			afterImage = ev.Rows[i]
		case rowUpdated:
			beforeImage, afterImage = ev.Rows[i], ev.Rows[i+1]
		default: // rowDeleted
			beforeImage = ev.Rows[i]
		}

		rowChange := sqlmodel.NewRowChange(
			&cdcmodel.TableName{Schema: sourceTable.Schema, Table: sourceTable.Name},
			&cdcmodel.TableName{Schema: targetTable.Schema, Table: targetTable.Name},
			beforeImage, afterImage,
			tableInfo, downstreamTableInfo.TableInfo,
			nil,
		)
		rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle)
		size := estimatedRowSize
		if changeType == rowUpdated && rowChange.IsIdentityUpdated() {
			delRow, insRow := rowChange.SplitUpdate()
			delRowKey := genRowKey(delRow)
			v.dispatchRowChange(delRowKey, &rowValidationJob{Key: delRowKey, Tp: rowDeleted, row: delRow, size: size})
			v.processedRowCounts[rowDeleted].Inc()

			insRowKey := genRowKey(insRow)
			v.dispatchRowChange(insRowKey, &rowValidationJob{Key: insRowKey, Tp: rowInsert, row: insRow, size: size})
			v.processedRowCounts[rowInsert].Inc()
		} else {
			rowKey := genRowKey(rowChange)
			if changeType == rowUpdated {
				size *= 2
			}
			v.dispatchRowChange(rowKey, &rowValidationJob{Key: rowKey, Tp: changeType, row: rowChange, size: size})
			v.processedRowCounts[changeType].Inc()
		}
	}
	return nil
}

func (v *DataValidator) checkAndPersistCheckpointAndData(loc binlog.Location) error {
	metaFlushInterval := v.cfg.ValidatorCfg.MetaFlushInterval.Duration
	cutOverLocation := v.cutOverLocation.Load()
	needCutOver := cutOverLocation != nil && binlog.CompareLocation(*cutOverLocation, loc, v.cfg.EnableGTID) <= 0
	if time.Since(v.lastFlushTime) > metaFlushInterval || needCutOver {
		if needCutOver {
			v.cutOverLocation.Store(nil)
		}
		v.lastFlushTime = time.Now()
		if err := v.persistCheckpointAndData(loc); err != nil {
			v.L.Warn("failed to flush checkpoint", zap.Error(err))
			if isRetryableValidateError(err) {
				return nil
			}
			return err
		}
	}
	return nil
}

func (v *DataValidator) persistCheckpointAndData(loc binlog.Location) error {
	var wg sync.WaitGroup
	wg.Add(v.workerCnt)
	flushJob := &rowValidationJob{
		Tp: flushCheckpoint,
		wg: &wg,
	}
	for i, worker := range v.workers {
		v.L.Debug("dispatch flush job", zap.Int("worker id", i))
		worker.rowChangeCh <- flushJob
	}
	wg.Wait()

	v.L.Info("persist checkpoint and intermediate data",
		zap.Int64("pending size", v.getPendingRowSize()),
		zap.Int64("pending count", v.getAllPendingRowCount()),
		zap.Int64("new error", v.newErrorRowCount.Load()))

	err := v.persistHelper.persist(v.tctx, loc)
	if err != nil {
		return err
	}

	// reset errors after save
	for _, worker := range v.workers {
		worker.resetErrorRows()
	}
	v.newErrorRowCount.Store(0)
	v.setFlushedLoc(&loc)
	return nil
}
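
// the flush above acts as a barrier: every worker acknowledges the flush job
// through the shared WaitGroup, so by the time persistHelper.persist runs no
// worker is still mutating the pending data being persisted.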

func (v *DataValidator) loadPersistedData() error {
	data, err := v.persistHelper.loadPersistedDataRetry(v.tctx)
	if err != nil {
		return err
	}
	// table info of pending changes is not persisted in order to save space, so we need to init it after load.
	pendingChanges := make(map[string]*tableChangeJob)
	for _, tblChange := range data.pendingChanges {
		// todo: if the table was dropped since the last run, we should skip rows related to this table & update the table status
		// see https://github.com/pingcap/tiflow/pull/4881#discussion_r834093316
		sourceTable := tblChange.sourceTable
		validateTbl, err2 := v.genValidateTableInfo(sourceTable, tblChange.columnCount)
		if err2 != nil {
			return terror.Annotate(err2, "failed to get table info on load")
		}
		if validateTbl.message != "" {
			return errors.New("failed to get table info " + validateTbl.message)
		}
		pendingTblChange := newTableChangeJob()
		// aggregate by target table, just as the worker does.
		pendingChanges[validateTbl.targetTable.String()] = pendingTblChange
		for _, row := range tblChange.rows {
			var beforeImage, afterImage []interface{}
			switch row.Tp {
			case rowInsert:
				afterImage = row.Data
			case rowUpdated:
				// set both to row.Data, since we only save one image on persist to save space
				beforeImage, afterImage = row.Data, row.Data
			default:
				// rowDeleted
				beforeImage = row.Data
			}
			pendingTblChange.jobs[row.Key] = &rowValidationJob{
				Key: row.Key,
				Tp:  row.Tp,
				row: sqlmodel.NewRowChange(
					&cdcmodel.TableName{Schema: sourceTable.Schema, Table: sourceTable.Name},
					&cdcmodel.TableName{Schema: validateTbl.targetTable.Schema, Table: validateTbl.targetTable.Name},
					beforeImage, afterImage,
					validateTbl.srcTableInfo, validateTbl.downstreamTableInfo.TableInfo,
					nil,
				),
				size:      row.Size,
				FailedCnt: row.FailedCnt,
			}
		}
	}

	v.location = data.checkpoint
	v.setProcessedRowCounts(data.processedRowCounts)
	v.loadedPendingChanges = pendingChanges
	v.persistHelper.setRevision(data.rev)
	v.initTableStatus(data.tableStatus)

	return nil
}

func (v *DataValidator) incrErrorRowCount(cnt int) {
	v.newErrorRowCount.Add(int64(cnt))
}

func (v *DataValidator) getWorkers() []*validateWorker {
	return v.workers
}

func (v *DataValidator) Started() bool {
	v.stateMutex.RLock()
	defer v.stateMutex.RUnlock()
	return v.stage == pb.Stage_Running
}

func (v *DataValidator) Stage() pb.Stage {
	v.stateMutex.RLock()
	defer v.stateMutex.RUnlock()
	return v.stage
}

func (v *DataValidator) setStage(stage pb.Stage) {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	v.stage = stage
}

func (v *DataValidator) getFlushedLoc() *binlog.Location {
	v.stateMutex.RLock()
	defer v.stateMutex.RUnlock()
	return v.flushedLoc
}

func (v *DataValidator) setFlushedLoc(loc *binlog.Location) {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	if loc == nil {
		v.flushedLoc = nil
		return
	}
	clone := loc.Clone()
	v.flushedLoc = &clone
}

func (v *DataValidator) getResult() pb.ProcessResult {
	v.stateMutex.RLock()
	defer v.stateMutex.RUnlock()
	return v.result
}

func (v *DataValidator) addResultError(err *pb.ProcessError, cancelled bool) {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	if err != nil {
		v.result.Errors = append(v.result.Errors, err)
	}
	v.result.IsCanceled = cancelled
}

func (v *DataValidator) getResultErrCnt() int {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	return len(v.result.Errors)
}

func (v *DataValidator) resetResult() {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	v.result.Reset()
}

func (v *DataValidator) initTableStatus(m map[string]*tableValidateStatus) {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	v.tableStatus = m
}

func (v *DataValidator) getTableStatus(fullTableName string) (*tableValidateStatus, bool) {
	v.stateMutex.RLock()
	defer v.stateMutex.RUnlock()
	res, ok := v.tableStatus[fullTableName]
	return res, ok
}

// getTableStatusMap returns a snapshot of the current table status.
func (v *DataValidator) getTableStatusMap() map[string]*tableValidateStatus {
	v.stateMutex.RLock()
	defer v.stateMutex.RUnlock()
	tblStatus := make(map[string]*tableValidateStatus)
	for key, tblStat := range v.tableStatus {
		stat := &tableValidateStatus{}
		*stat = *tblStat // copy the struct value
		tblStatus[key] = stat
	}
	return tblStatus
}

func (v *DataValidator) putTableStatus(name string, status *tableValidateStatus) {
	v.stateMutex.Lock()
	defer v.stateMutex.Unlock()
	v.tableStatus[name] = status
}

func (v *DataValidator) isMarkErrorStarted() bool {
	return v.markErrorStarted.Load()
}

func (v *DataValidator) getProcessedRowCounts() []int64 {
	return []int64{
		v.processedRowCounts[rowInsert].Load(),
		v.processedRowCounts[rowUpdated].Load(),
		v.processedRowCounts[rowDeleted].Load(),
	}
}

func (v *DataValidator) setProcessedRowCounts(counts []int64) {
	v.processedRowCounts[rowInsert].Store(counts[rowInsert])
	v.processedRowCounts[rowUpdated].Store(counts[rowUpdated])
	v.processedRowCounts[rowDeleted].Store(counts[rowDeleted])
}

func (v *DataValidator) addPendingRowCount(tp rowChangeJobType, cnt int64) {
	v.pendingRowCounts[tp].Add(cnt)
}

func (v *DataValidator) getAllPendingRowCount() int64 {
	return v.pendingRowCounts[rowInsert].Load() +
		v.pendingRowCounts[rowUpdated].Load() +
		v.pendingRowCounts[rowDeleted].Load()
}

func (v *DataValidator) addPendingRowSize(size int64) {
	v.pendingRowSize.Add(size)
}

func (v *DataValidator) getPendingRowSize() int64 {
	return v.pendingRowSize.Load()
}

func (v *DataValidator) sendError(err error) {
	v.errChan <- err
}

func (v *DataValidator) getNewErrorRowCount() int64 {
	return v.newErrorRowCount.Load()
}

// getRowChangeType should be called only when the event type is RowsEvent.
func getRowChangeType(t replication.EventType) rowChangeJobType {
	switch t {
	case replication.WRITE_ROWS_EVENTv0, replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2:
		return rowInsert
	case replication.UPDATE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2:
		return rowUpdated
	default:
		// replication.DELETE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2
		return rowDeleted
	}
}

func genRowKey(row *sqlmodel.RowChange) string {
	vals := row.RowStrIdentity()
	return genRowKeyByString(vals)
}

func genRowKeyByString(pkValues []string) string {
	// in the scenario below the generated key may not be unique, but it's rare:
	// suppose a table with a multi-column primary key (v1, v2);
	// for the case below, the generated keys are the same:
	// 	 (aaa\t, bbb) and (aaa, \tbbb) both join to "aaa\t\tbbb"
	join := strings.Join(pkValues, "\t")
	// if the key is too long, we need to make sure it can be stored in the database
	if len(join) > maxRowKeyLength {
		sum := sha256.Sum256([]byte(join))
		return hex.EncodeToString(sum[:])
	}
	return join
}
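
// for example, pk values ["1", "foo"] join to the key "1\tfoo"; once the
// joined string exceeds maxRowKeyLength it's replaced by its hex-encoded
// sha256 digest (64 characters), which keeps keys short enough to store in
// the meta database.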

func (v *DataValidator) GetValidatorTableStatus(filterStatus pb.Stage) []*pb.ValidationTableStatus {
	tblStatus := v.getTableStatusMap()

	result := make([]*pb.ValidationTableStatus, 0)
	for _, tblStat := range tblStatus {
		returnAll := filterStatus == pb.Stage_InvalidStage
		if returnAll || tblStat.stage == filterStatus {
			result = append(result, &pb.ValidationTableStatus{
				Source:   v.cfg.SourceID,
				SrcTable: tblStat.source.String(),
				DstTable: tblStat.target.String(),
				Stage:    tblStat.stage,
				Message:  tblStat.message,
			})
		}
	}
	return result
}

func (v *DataValidator) GetValidatorError(errState pb.ValidateErrorState) ([]*pb.ValidationError, error) {
	// todo: validation errors in workers cannot be returned,
	// because the errID is only allocated when the error rows are flushed;
	// users cannot handle error rows without an errID
	var (
		toDB  *conn.BaseDB
		err   error
		dbCfg dbconfig.DBConfig
	)
	ctx, cancel := context.WithTimeout(context.Background(), validatorDmctlOpTimeout)
	tctx := tcontext.NewContext(ctx, v.L)
	defer cancel()
	failpoint.Inject("MockValidationQuery", func() {
		toDB = v.persistHelper.db
		failpoint.Return(v.persistHelper.loadError(tctx, toDB, errState))
	})
	dbCfg = v.cfg.To
	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetMaxIdleConns(1)
	toDB, err = conn.GetDownstreamDB(&dbCfg)
	if err != nil {
		v.L.Warn("failed to create downstream db", zap.Error(err))
		return nil, err
	}
	defer dbconn.CloseBaseDB(tctx, toDB)
	ret, err := v.persistHelper.loadError(tctx, toDB, errState)
	if err != nil {
		v.L.Warn("fail to load validator error", zap.Error(err))
		return nil, err
	}
	return ret, nil
}

func (v *DataValidator) OperateValidatorError(validateOp pb.ValidationErrOp, errID uint64, isAll bool) error {
	var (
		toDB  *conn.BaseDB
		err   error
		dbCfg dbconfig.DBConfig
	)
	ctx, cancel := context.WithTimeout(context.Background(), validatorDmctlOpTimeout)
	tctx := tcontext.NewContext(ctx, v.L)
	defer cancel()
	failpoint.Inject("MockValidationQuery", func() {
		toDB = v.persistHelper.db
		failpoint.Return(v.persistHelper.operateError(tctx, toDB, validateOp, errID, isAll))
	})
	dbCfg = v.cfg.To
	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetMaxIdleConns(1)
	toDB, err = conn.GetDownstreamDB(&dbCfg)
	if err != nil {
		return err
	}
	defer dbconn.CloseBaseDB(tctx, toDB)
	return v.persistHelper.operateError(tctx, toDB, validateOp, errID, isAll)
}

func (v *DataValidator) UpdateValidator(req *pb.UpdateValidationWorkerRequest) error {
	var (
		pos = mysql.Position{}
		gs  mysql.GTIDSet
		err error
	)
	if len(req.BinlogPos) > 0 {
		pos, err = binlog.PositionFromPosStr(req.BinlogPos)
		if err != nil {
			return err
		}
	}
	if len(req.BinlogGTID) > 0 {
		gs, err = gtid.ParserGTID(v.cfg.Flavor, req.BinlogGTID)
		if err != nil {
			return err
		}
	}
	cutOverLocation := binlog.NewLocation(pos, gs)
	v.cutOverLocation.Store(&cutOverLocation)
	v.syncer.cutOverLocation.Store(&cutOverLocation)
	return nil
}
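
// UpdateValidator is typically driven by dmctl; a hedged example of setting a
// cut-over point (the task name and position are hypothetical, flag names as
// documented for DM's continuous data validation):
//
//	dmctl validation update my-task --cutover-binlog-pos '(mysql-bin.000001, 2345)'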

func (v *DataValidator) getErrorRowCount(timeout time.Duration) ([errorStateTypeCount]int64, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	tctx := tcontext.NewContext(ctx, v.L)

	// use a separate db to get the error count, since the validator may be stopped or initializing
	dbCfg := v.cfg.To
	dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetMaxIdleConns(1)
	countMap := map[pb.ValidateErrorState]int64{}
	toDB, err := conn.GetDownstreamDB(&dbCfg)
	if err != nil {
		v.L.Warn("failed to create downstream db", zap.Error(err))
	} else {
		defer dbconn.CloseBaseDB(tctx, toDB)
		countMap, err = v.persistHelper.loadErrorCount(tctx, toDB)
		if err != nil {
			v.L.Warn("failed to load error count", zap.Error(err))
		}
	}
	var errorRowCount [errorStateTypeCount]int64
	errorRowCount[pb.ValidateErrorState_NewErr] = countMap[pb.ValidateErrorState_NewErr]
	errorRowCount[pb.ValidateErrorState_IgnoredErr] = countMap[pb.ValidateErrorState_IgnoredErr]
	errorRowCount[pb.ValidateErrorState_ResolvedErr] = countMap[pb.ValidateErrorState_ResolvedErr]

	errorRowCount[pb.ValidateErrorState_NewErr] += v.newErrorRowCount.Load()

	return errorRowCount, err
}

func (v *DataValidator) GetValidatorStatus() *pb.ValidationStatus {
	var extraMsg string
	errorRowCount, err := v.getErrorRowCount(validatorDmctlOpTimeout)
	if err != nil {
		// nolint:nilerr
		extraMsg = fmt.Sprintf(" (failed to load error count from meta db: %s)", err.Error())
	}
	// if we printed these states in a structured way, there would be at least 9 lines for each subtask,
	// which is hard to read, so we print them on one line.
	template := "insert/update/delete: %d/%d/%d"
	processedRowCounts := v.getProcessedRowCounts()
	processedRows := fmt.Sprintf(template, processedRowCounts[rowInsert],
		processedRowCounts[rowUpdated], processedRowCounts[rowDeleted])
	pendingRows := fmt.Sprintf(template, v.pendingRowCounts[rowInsert].Load(),
		v.pendingRowCounts[rowUpdated].Load(), v.pendingRowCounts[rowDeleted].Load())
	errorRows := fmt.Sprintf("new/ignored/resolved: %d/%d/%d%s",
		errorRowCount[pb.ValidateErrorState_NewErr], errorRowCount[pb.ValidateErrorState_IgnoredErr],
		errorRowCount[pb.ValidateErrorState_ResolvedErr], extraMsg)

	result := v.getResult()
	returnedResult := &result
	if !result.IsCanceled && len(result.Errors) == 0 {
		// no need to show the result if the validator is running normally
		returnedResult = nil
	}

	flushedLoc := v.getFlushedLoc()
	var validatorBinlog, validatorBinlogGtid string
	if flushedLoc != nil {
		validatorBinlog = flushedLoc.Position.String()
		if flushedLoc.GetGTID() != nil {
			validatorBinlogGtid = flushedLoc.GetGTID().String()
		}
	}
	var cutoverBinlogPos, cutoverBinlogGTID string
	if cutOverLoc := v.cutOverLocation.Load(); cutOverLoc != nil {
		cutoverBinlogPos = cutOverLoc.Position.String()
		if cutOverLoc.GetGTID() != nil {
			cutoverBinlogGTID = cutOverLoc.GetGTID().String()
		}
	}

	return &pb.ValidationStatus{
		Task:                v.cfg.Name,
		Source:              v.cfg.SourceID,
		Mode:                v.cfg.ValidatorCfg.Mode,
		Stage:               v.Stage(),
		Result:              returnedResult,
		ValidatorBinlog:     validatorBinlog,
		ValidatorBinlogGtid: validatorBinlogGtid,
		ProcessedRowsStatus: processedRows,
		PendingRowsStatus:   pendingRows,
		ErrorRowsStatus:     errorRows,
		CutoverBinlogPos:    cutoverBinlogPos,
		CutoverBinlogGtid:   cutoverBinlogGTID,
	}
}