github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/data_validator.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package syncer 15 16 import ( 17 "context" 18 "crypto/sha256" 19 "encoding/hex" 20 "fmt" 21 "strings" 22 "sync" 23 "time" 24 25 "github.com/go-mysql-org/go-mysql/mysql" 26 "github.com/go-mysql-org/go-mysql/replication" 27 "github.com/pingcap/errors" 28 "github.com/pingcap/failpoint" 29 "github.com/pingcap/tidb/pkg/parser/model" 30 "github.com/pingcap/tidb/pkg/util/filter" 31 cdcmodel "github.com/pingcap/tiflow/cdc/model" 32 "github.com/pingcap/tiflow/dm/config" 33 "github.com/pingcap/tiflow/dm/config/dbconfig" 34 "github.com/pingcap/tiflow/dm/pb" 35 "github.com/pingcap/tiflow/dm/pkg/binlog" 36 "github.com/pingcap/tiflow/dm/pkg/conn" 37 tcontext "github.com/pingcap/tiflow/dm/pkg/context" 38 "github.com/pingcap/tiflow/dm/pkg/gtid" 39 "github.com/pingcap/tiflow/dm/pkg/log" 40 "github.com/pingcap/tiflow/dm/pkg/schema" 41 "github.com/pingcap/tiflow/dm/pkg/terror" 42 "github.com/pingcap/tiflow/dm/pkg/utils" 43 "github.com/pingcap/tiflow/dm/relay" 44 "github.com/pingcap/tiflow/dm/syncer/binlogstream" 45 "github.com/pingcap/tiflow/dm/syncer/dbconn" 46 "github.com/pingcap/tiflow/dm/syncer/metrics" 47 "github.com/pingcap/tiflow/dm/unit" 48 "github.com/pingcap/tiflow/pkg/sqlmodel" 49 "go.uber.org/atomic" 50 "go.uber.org/zap" 51 ) 52 53 const ( 54 validatorStatusInterval = time.Minute 55 56 moreColumnInBinlogMsg = "binlog has more columns than current table" 57 tableWithoutPrimaryKeyMsg = "no primary key" 58 tableNotSyncedOrDropped = "table is not synced or dropped" 59 downstreamPKColumnOutOfBoundsMsg = "primary key column of downstream table out of range of binlog event row" 60 ) 61 62 type validateTableInfo struct { 63 targetTable *filter.Table 64 srcTableInfo *model.TableInfo 65 downstreamTableInfo *schema.DownstreamTableInfo 66 67 message string 68 } 69 70 type rowChangeJobType int 71 72 func (r rowChangeJobType) String() string { 73 switch r { 74 case rowInsert: 75 return "row-insert" 76 case rowUpdated: 77 return "row-update" 78 case rowDeleted: 79 return "row-delete" 80 case flushCheckpoint: 81 return "flush" 82 default: 83 return "unknown" 84 } 85 } 86 87 const ( 88 rowInsert rowChangeJobType = iota 89 rowUpdated 90 rowDeleted 91 flushCheckpoint 92 93 rowChangeTypeCount = 3 94 errorStateTypeCount = 4 // pb.ValidateErrorState_* 95 96 validatorDmctlOpTimeout = 5 * time.Second 97 ) 98 99 // to make ut easier, we define it as a var, so we can change it. 100 var markErrorRowDelay = config.DefaultValidatorRowErrorDelay 101 102 // change of table 103 // binlog changes are clustered into table changes 104 // the validator validates changes of table-grain at a time. 105 type tableChangeJob struct { 106 jobs map[string]*rowValidationJob 107 } 108 109 func newTableChangeJob() *tableChangeJob { 110 return &tableChangeJob{jobs: make(map[string]*rowValidationJob)} 111 } 112 113 // return true if it's new added row job. 
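// if the key already exists, the stored job keeps its identity but its row image, size
// and type are overwritten and its failure counters are reset, so consecutive changes
// to the same row collapse into one pending job. A rough illustration (a hypothetical
// table keyed by id, keys produced by genRowKey):
//
//	addOrUpdate(job for INSERT id=1) // returns true: a new pending job
//	addOrUpdate(job for UPDATE id=1) // returns false: merged into the existing job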
func (tc *tableChangeJob) addOrUpdate(job *rowValidationJob) bool {
    if val, ok := tc.jobs[job.Key]; ok {
        val.row = job.row
        val.size = job.size
        val.Tp = job.Tp
        val.FirstValidateTS = 0
        val.FailedCnt = 0 // clear failed count
        return false
    }
    tc.jobs[job.Key] = job
    return true
}

// change of a row.
type rowValidationJob struct {
    Key string
    Tp  rowChangeJobType
    row *sqlmodel.RowChange

    // estimated memory size taken by this row; we use the binlog size of the row to estimate it for now.
    // the actual memory taken by a row change job is larger than this size.
    size int32
    wg   *sync.WaitGroup
    // timestamp of the first validation of this row. It is reset when row changes are merged.
    // if the job is loaded from meta, it's reset too, in case the validator was stopped for a long time
    // and those failed row changes would otherwise be marked as error rows immediately.
    FirstValidateTS int64
    FailedCnt       int
}

type tableValidateStatus struct {
    source  filter.Table
    target  filter.Table
    stage   pb.Stage // either Running or Stopped
    message string
}

func (vs *tableValidateStatus) String() string {
    return fmt.Sprintf("source=%s, target=%s, stage=%s, message=%s",
        vs.source, vs.target, vs.stage, vs.message)
}

func (vs *tableValidateStatus) stopped(msg string) {
    vs.stage = pb.Stage_Stopped
    vs.message = msg
}

// DataValidator is used to continuously validate incremental data migrated to downstream by DM.
// The validator can be started when there's a syncer unit in the subtask and the validation mode is not none;
// it's terminated when the subtask is terminated.
// The stage of the validator is independent of the subtask: pausing/resuming the subtask doesn't affect the validator's stage.
//
// The validator can be in the running or stopped stage:
//   - running when it's started with the subtask or started later on the fly.
//   - stopped when "validation stop" is executed.
//
// For each subtask, before it's closed/killed, only one DataValidator object is created;
// "dmctl validation stop/start" calls Stop and Start on the same object.
type DataValidator struct {
    // used to sync Stop and Start operations.
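    // Start and stopInner acquire this write lock, so a stop waits for an in-flight
    // start to finish (and vice versa); status reads/writes go through stateMutex below.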
174 sync.RWMutex 175 176 cfg *config.SubTaskConfig 177 syncer *Syncer 178 // whether validator starts together with subtask 179 startWithSubtask bool 180 181 wg sync.WaitGroup 182 errProcessWg sync.WaitGroup 183 errChan chan error 184 ctx context.Context 185 cancel context.CancelFunc 186 tctx *tcontext.Context 187 188 L log.Logger 189 fromDB *conn.BaseDB 190 toDB *conn.BaseDB 191 upstreamTZ *time.Location 192 timezone *time.Location 193 syncCfg replication.BinlogSyncerConfig 194 streamerController *binlogstream.StreamerController 195 persistHelper *validatorPersistHelper 196 197 validateInterval time.Duration 198 checkInterval time.Duration 199 cutOverLocation atomic.Pointer[binlog.Location] 200 201 workers []*validateWorker 202 workerCnt int 203 204 // whether we start to mark failed rows as error rows 205 // if it's false, we don't mark failed row change as error to reduce false-positive 206 // it's set to true when validator reached the progress of syncer once or after markErrorRowDelay 207 markErrorStarted atomic.Bool 208 209 // fields in this field block are guarded by stateMutex 210 stateMutex sync.RWMutex 211 stage pb.Stage // only Running or Stopped is allowed for validator 212 flushedLoc *binlog.Location 213 result pb.ProcessResult 214 tableStatus map[string]*tableValidateStatus 215 216 processedRowCounts []atomic.Int64 // all processed row count since the beginning of validator 217 pendingRowCounts []atomic.Int64 218 newErrorRowCount atomic.Int64 219 processedBinlogSize atomic.Int64 220 pendingRowSize atomic.Int64 // accumulation of rowValidationJob.size 221 lastFlushTime time.Time 222 location *binlog.Location 223 loadedPendingChanges map[string]*tableChangeJob 224 225 vmetric *metrics.ValidatorMetrics 226 } 227 228 func NewContinuousDataValidator(cfg *config.SubTaskConfig, syncerObj *Syncer, startWithSubtask bool) *DataValidator { 229 v := &DataValidator{ 230 cfg: cfg, 231 syncer: syncerObj, 232 startWithSubtask: startWithSubtask, 233 vmetric: metrics.NewValidatorMetrics(cfg.Name, cfg.SourceID), 234 } 235 v.L = log.With(zap.String("task", cfg.Name), zap.String("unit", "continuous validator")) 236 237 v.setStage(pb.Stage_Stopped) 238 v.workerCnt = cfg.ValidatorCfg.WorkerCount 239 v.processedRowCounts = make([]atomic.Int64, rowChangeTypeCount) 240 v.validateInterval = cfg.ValidatorCfg.ValidateInterval.Duration 241 v.checkInterval = cfg.ValidatorCfg.CheckInterval.Duration 242 v.persistHelper = newValidatorCheckpointHelper(v) 243 v.pendingRowCounts = make([]atomic.Int64, rowChangeTypeCount) 244 245 return v 246 } 247 248 // reset state on start/restart. 
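// it recreates the error channel, clears the worker list, zeroes all in-memory counters
// (processed/pending row counts, new error rows, processed binlog size, pending row size)
// and re-initializes the table status map; persisted checkpoint and meta data are untouched.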
249 func (v *DataValidator) reset() { 250 v.errChan = make(chan error, 10) 251 v.workers = []*validateWorker{} 252 253 v.markErrorStarted.Store(false) 254 v.resetResult() 255 for i := range v.processedRowCounts { 256 v.processedRowCounts[i].Store(0) 257 } 258 for i := range v.pendingRowCounts { 259 v.pendingRowCounts[i].Store(0) 260 } 261 v.newErrorRowCount.Store(0) 262 v.processedBinlogSize.Store(0) 263 v.pendingRowSize.Store(0) 264 v.initTableStatus(map[string]*tableValidateStatus{}) 265 } 266 267 func (v *DataValidator) initialize() error { 268 v.ctx, v.cancel = context.WithCancel(context.Background()) 269 v.tctx = tcontext.NewContext(v.ctx, v.L) 270 v.reset() 271 272 newCtx, cancelFunc := v.tctx.WithTimeout(unit.DefaultInitTimeout) 273 defer cancelFunc() 274 275 var err error 276 defer func() { 277 if err == nil { 278 return 279 } 280 dbconn.CloseBaseDB(newCtx, v.fromDB) 281 dbconn.CloseBaseDB(newCtx, v.toDB) 282 v.cancel() 283 }() 284 285 dbCfg := v.cfg.From 286 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDMLConnectionTimeout).SetMaxIdleConns(1) 287 v.fromDB, err = conn.GetUpstreamDB(&dbCfg) 288 if err != nil { 289 return err 290 } 291 292 dbCfg = v.cfg.To 293 // worker count + checkpoint connection, others concurrent access can create it on the fly 294 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetReadTimeout(maxDMLConnectionTimeout).SetMaxIdleConns(v.workerCnt + 1) 295 v.toDB, err = conn.GetDownstreamDB(&dbCfg) 296 if err != nil { 297 return err 298 } 299 300 if err = v.persistHelper.init(newCtx); err != nil { 301 return err 302 } 303 304 var defaultUpstreamTZ string 305 failpoint.Inject("ValidatorMockUpstreamTZ", func() { 306 defaultUpstreamTZ = "UTC" 307 }) 308 v.upstreamTZ, _, err = str2TimezoneOrFromDB(newCtx, defaultUpstreamTZ, conn.UpstreamDBConfig(&v.cfg.From)) 309 if err != nil { 310 return err 311 } 312 v.timezone, _, err = str2TimezoneOrFromDB(newCtx, v.cfg.Timezone, conn.DownstreamDBConfig(&v.cfg.To)) 313 if err != nil { 314 return err 315 } 316 317 v.syncCfg, err = subtaskCfg2BinlogSyncerCfg(v.cfg, v.timezone, v.syncer.baList) 318 if err != nil { 319 return err 320 } 321 322 v.streamerController = binlogstream.NewStreamerController( 323 v.syncCfg, 324 v.cfg.EnableGTID, 325 &dbconn.UpStreamConn{BaseDB: v.fromDB}, 326 v.cfg.RelayDir, 327 v.timezone, 328 nil, 329 v.L, 330 ) 331 return nil 332 } 333 334 func (v *DataValidator) routineWrapper(fn func()) { 335 defer func() { 336 if err := recover(); err != nil { 337 v.L.Error("panic", zap.Any("err", err)) 338 v.sendError(terror.ErrValidatorPanic.Generate(err)) 339 } 340 }() 341 342 fn() 343 } 344 345 func (v *DataValidator) Start(expect pb.Stage) { 346 v.Lock() 347 defer v.Unlock() 348 349 v.L.Info("starting", zap.Any("cfg", v.cfg.ValidatorCfg), 350 zap.String("start-time", v.cfg.ValidatorCfg.StartTime), 351 zap.Bool("start with subtask", v.startWithSubtask), 352 zap.Any("expect", expect)) 353 if v.Stage() == pb.Stage_Running { 354 v.L.Info("already started") 355 return 356 } 357 358 if expect != pb.Stage_Running { 359 v.L.Info("expect stage is not running", zap.Any("expect", expect)) 360 return 361 } 362 363 if err := v.initialize(); err != nil { 364 v.fillResult(err) 365 return 366 } 367 368 v.wg.Add(1) 369 go v.routineWrapper(v.doValidate) 370 371 v.wg.Add(1) 372 go v.routineWrapper(v.printStatusRoutine) 373 374 v.wg.Add(1) 375 go utils.GoLogWrapper(v.L, v.markErrorStartedRoutine) 376 377 // routineWrapper relies on errorProcessRoutine to handle panic errors, 378 // so just wrap it using a common 
wrapper. 379 v.errProcessWg.Add(1) 380 go utils.GoLogWrapper(v.L, v.errorProcessRoutine) 381 382 v.setStage(pb.Stage_Running) 383 v.L.Info("started") 384 } 385 386 func (v *DataValidator) markErrorStartedRoutine() { 387 defer v.wg.Done() 388 389 select { 390 case <-v.ctx.Done(): 391 case <-time.After(markErrorRowDelay): 392 if !v.markErrorStarted.Load() { 393 v.L.Info("mark markErrorStarted=true after error row delay") 394 v.markErrorStarted.Store(true) 395 } 396 } 397 } 398 399 func (v *DataValidator) printStatusRoutine() { 400 defer v.wg.Done() 401 var ( 402 prevProcessedBinlogSize = v.processedBinlogSize.Load() 403 prevTime = time.Now() 404 ) 405 for { 406 select { 407 case <-v.ctx.Done(): 408 return 409 case <-time.After(validatorStatusInterval): 410 processed := v.getProcessedRowCounts() 411 pending := []int64{ 412 v.pendingRowCounts[rowInsert].Load(), 413 v.pendingRowCounts[rowUpdated].Load(), 414 v.pendingRowCounts[rowDeleted].Load(), 415 } 416 currProcessedBinlogSize := v.processedBinlogSize.Load() 417 currTime := time.Now() 418 interval := time.Since(prevTime) 419 speed := float64((currProcessedBinlogSize-prevProcessedBinlogSize)>>20) / interval.Seconds() 420 prevProcessedBinlogSize = currProcessedBinlogSize 421 prevTime = currTime 422 counts, err := v.getErrorRowCount(validatorDmctlOpTimeout) 423 if err == nil { 424 v.vmetric.ErrorCount.Set(float64(counts[pb.ValidateErrorState_NewErr])) 425 } else { 426 v.L.Warn("failed to get error row count", zap.Error(err)) 427 } 428 v.L.Info("validator status", 429 zap.Int64s("processed(i, u, d)", processed), 430 zap.Int64s("pending(i, u, d)", pending), 431 zap.Int64("new error rows(not flushed)", v.newErrorRowCount.Load()), 432 zap.String("binlog process speed", fmt.Sprintf("%.2f MB/s", speed)), 433 ) 434 } 435 } 436 } 437 438 func (v *DataValidator) fillResult(err error) { 439 // when met a non-retryable error, we'll call stopInner, then v.ctx is cancelled, 440 // don't set IsCanceled in this case 441 isCanceled := false 442 if v.getResultErrCnt() == 0 { 443 select { 444 case <-v.ctx.Done(): 445 isCanceled = true 446 default: 447 } 448 } 449 450 var processErr *pb.ProcessError 451 if utils.IsContextCanceledError(err) { 452 v.L.Info("filter out context cancelled error", log.ShortError(err)) 453 } else { 454 v.L.Error("error during validation", zap.Error(err)) 455 processErr = unit.NewProcessError(err) 456 } 457 v.addResultError(processErr, isCanceled) 458 } 459 460 func (v *DataValidator) errorProcessRoutine() { 461 defer v.errProcessWg.Done() 462 463 var ( 464 stopped bool 465 wg sync.WaitGroup 466 ) 467 468 for err := range v.errChan { 469 v.fillResult(err) 470 471 if errors.Cause(err) != context.Canceled && !stopped { 472 stopped = true 473 wg.Add(1) 474 go func() { 475 defer wg.Done() 476 v.stopInner() 477 }() 478 } 479 } 480 wg.Wait() 481 } 482 483 func (v *DataValidator) waitSyncerSynced(currLoc binlog.Location) error { 484 syncLoc := v.syncer.getFlushedGlobalPoint() 485 cmp := binlog.CompareLocation(currLoc, syncLoc, v.cfg.EnableGTID) 486 if cmp >= 0 && !v.markErrorStarted.Load() { 487 v.markErrorStarted.Store(true) 488 v.L.Info("validator progress reached syncer") 489 } 490 if cmp <= 0 { 491 return nil 492 } 493 494 for { 495 select { 496 case <-v.ctx.Done(): 497 return v.ctx.Err() 498 case <-time.After(v.checkInterval): 499 syncLoc = v.syncer.getFlushedGlobalPoint() 500 cmp = binlog.CompareLocation(currLoc, syncLoc, v.cfg.EnableGTID) 501 if cmp <= 0 { 502 return nil 503 } 504 v.L.Debug("wait syncer synced", zap.Reflect("loc", 
currLoc))
        }
    }
}

func (v *DataValidator) updateValidatorBinlogMetric(currLoc binlog.Location) {
    v.vmetric.BinlogPos.Set(float64(currLoc.Position.Pos))
    index, err := utils.GetFilenameIndex(currLoc.Position.Name)
    if err != nil {
        v.L.Warn("fail to record validator binlog file index", zap.Error(err))
    } else {
        v.vmetric.BinlogFile.Set(float64(index))
    }
}

func (v *DataValidator) updateValidatorBinlogLag(currLoc binlog.Location) {
    syncerLoc := v.syncer.getFlushedGlobalPoint()
    index, err := utils.GetFilenameIndex(currLoc.Position.Name)
    if err != nil {
        v.L.Warn("fail to record validator binlog file index", zap.Error(err))
    }
    if syncerLoc.Position.Name == currLoc.Position.Name {
        // same file: record the log pos latency
        v.vmetric.LogPosLatency.Set(float64(syncerLoc.Position.Pos - currLoc.Position.Pos))
        v.vmetric.LogFileLatency.Set(float64(0))
    } else {
        var syncerLogIdx int64
        v.vmetric.LogPosLatency.Set(float64(0))
        syncerLogIdx, err = utils.GetFilenameIndex(syncerLoc.Position.Name)
        if err == nil {
            v.vmetric.LogFileLatency.Set(float64(syncerLogIdx - index))
        } else {
            v.vmetric.LogFileLatency.Set(float64(0))
            v.L.Warn("fail to get syncer's log file index", zap.Error(err))
        }
    }
}

func (v *DataValidator) waitSyncerRunning() error {
    if v.syncer.IsRunning() {
        return nil
    }
    v.L.Info("wait until syncer running")
    for {
        select {
        case <-v.ctx.Done():
            return v.ctx.Err()
        case <-time.After(v.checkInterval):
            if v.syncer.IsRunning() {
                v.L.Info("syncer is running, wait finished")
                return nil
            }
        }
    }
}

func (v *DataValidator) getInitialBinlogPosition() (binlog.Location, error) {
    var location binlog.Location
    timeStr := v.cfg.ValidatorCfg.StartTime
    switch {
    case timeStr != "":
        // already checked when it was set, no need to check it again
        t, _ := utils.ParseStartTimeInLoc(timeStr, v.upstreamTZ)
        finder := binlog.NewRemoteBinlogPosFinder(v.tctx, v.fromDB, v.syncCfg, v.cfg.EnableGTID)
        loc, posTp, err := finder.FindByTimestamp(t.Unix())
        if err != nil {
            v.L.Error("fail to find binlog position by timestamp",
                zap.Time("time", t), zap.Error(err))
            return location, err
        }
        v.L.Info("find binlog pos by timestamp", zap.String("time", timeStr),
            zap.Any("loc", loc), zap.Stringer("pos type", posTp))

        if posTp == binlog.AboveUpperBoundBinlogPos {
            return location, terror.ErrConfigStartTimeTooLate.Generate(timeStr)
        }
        location = *loc
        v.L.Info("do validate from timestamp", zap.Any("loc", location))
    case v.startWithSubtask:
        // in an extreme case, this loc may still not be the first binlog location of this task:
        // the syncer synced some binlog and flushed its checkpoint, but the validator had no chance to run before a fail-over
        location = v.syncer.getInitExecutedLoc()
        v.L.Info("do validate from init executed loc of syncer", zap.Any("loc", location))
    default:
        location = v.syncer.getFlushedGlobalPoint()
        v.L.Info("do validate from current loc of syncer", zap.Any("loc", location))
    }
    return location, nil
}

// doValidate runs in a separate goroutine.
func (v *DataValidator) doValidate() {
    defer v.wg.Done()

    if err := v.waitSyncerRunning(); err != nil {
        // no need to wrap it in error_list, since err can only be context.Canceled.
        v.sendError(err)
        return
    }

    if err := v.loadPersistedData(); err != nil {
        v.sendError(terror.ErrValidatorLoadPersistedData.Delegate(err))
        return
    }

    var location binlog.Location
    if v.location != nil {
        location = *v.location
        v.L.Info("do validate from checkpoint", zap.Any("loc", location))
    } else {
        // validator always uses remote binlog streamer now.
        var err error
        location, err = v.getInitialBinlogPosition()
        if err != nil {
            v.sendError(err)
            return
        }
        // when relay log is enabled, the binlog name may contain a uuid suffix, so we need to extract the real name
        location.Position.Name = utils.ExtractRealName(location.Position.Name)
        // persist the current location to make sure we start from the same location
        // if a fail-over happens before we flush checkpoint and data.
        err = v.persistHelper.persist(v.tctx, location)
        if err != nil {
            v.sendError(terror.ErrValidatorPersistData.Delegate(err))
            return
        }
    }
    // this is for tests: some fields in streamerController are mocked, so we cannot call Start
    if v.streamerController.IsClosed() {
        err := v.streamerController.Start(v.tctx, location)
        if err != nil {
            v.sendError(terror.Annotate(err, "fail to start streamer controller"))
            return
        }
    }

    v.startValidateWorkers()
    defer func() {
        for _, worker := range v.workers {
            worker.close()
        }
    }()

    // we don't flush checkpoint & data on exit, since the checkpoint and pending data may not correspond with each other.
    locationForFlush := location.CloneWithFlavor(v.cfg.Flavor)
    v.lastFlushTime = time.Now()
    for {
        e, _, err := v.streamerController.GetEvent(v.tctx)
        if err != nil {
            switch {
            case err == context.Canceled:
                return
            case err == context.DeadlineExceeded:
                v.L.Info("deadline exceeded when fetching binlog event")
                continue
            case isDuplicateServerIDError(err):
                // if the server id is already used, we need to use a new server id
                v.L.Info("server id is already used by another slave, will change to a new server id and get event again")
                err1 := v.streamerController.UpdateServerIDAndResetReplication(v.tctx, locationForFlush)
                if err1 != nil {
                    v.sendError(terror.Annotate(err1, "fail to update UpdateServerIDAndResetReplication"))
                    return
                }
                continue
            case err == relay.ErrorMaybeDuplicateEvent:
                continue
            case isConnectionRefusedError(err):
                v.sendError(terror.ErrValidatorGetEvent.Delegate(err))
                return
            default:
                if v.streamerController.CanRetry(err) {
                    err = v.streamerController.ResetReplicationSyncer(v.tctx, locationForFlush)
                    if err != nil {
                        v.sendError(terror.Annotate(err, "fail to reset replication"))
                        return
                    }
                    continue
                }
                v.sendError(terror.ErrValidatorGetEvent.Delegate(err))
                return
            }
        }

        currEndLoc := v.streamerController.GetCurEndLocation()
        locationForFlush = v.streamerController.GetTxnEndLocation()

        // wait until the syncer has synced the current event
        err = v.waitSyncerSynced(currEndLoc)
        if err != nil {
            // no need to wrap it in error_list, since err can only be context.Canceled.
694 v.sendError(err) 695 return 696 } 697 failpoint.Inject("mockValidatorDelay", func(val failpoint.Value) { 698 if sec, ok := val.(int); ok { 699 v.L.Info("mock validator delay", zap.Int("second", sec)) 700 time.Sleep(time.Duration(sec) * time.Second) 701 } 702 }) 703 // update validator metric 704 v.updateValidatorBinlogMetric(currEndLoc) 705 v.updateValidatorBinlogLag(currEndLoc) 706 v.processedBinlogSize.Add(int64(e.Header.EventSize)) 707 708 switch ev := e.Event.(type) { 709 case *replication.RowsEvent: 710 if err = v.processRowsEvent(e.Header, ev); err != nil { 711 v.L.Warn("failed to process event: ", zap.Reflect("error", err)) 712 v.sendError(terror.ErrValidatorProcessRowEvent.Delegate(err)) 713 return 714 } 715 case *replication.XIDEvent: 716 if err = v.checkAndPersistCheckpointAndData(locationForFlush); err != nil { 717 v.sendError(terror.ErrValidatorPersistData.Delegate(err)) 718 return 719 } 720 case *replication.QueryEvent: 721 if err = v.checkAndPersistCheckpointAndData(locationForFlush); err != nil { 722 v.sendError(terror.ErrValidatorPersistData.Delegate(err)) 723 return 724 } 725 case *replication.GenericEvent: 726 if e.Header.EventType == replication.HEARTBEAT_EVENT { 727 if err = v.checkAndPersistCheckpointAndData(locationForFlush); err != nil { 728 v.sendError(terror.ErrValidatorPersistData.Delegate(err)) 729 return 730 } 731 } 732 } 733 } 734 } 735 736 func (v *DataValidator) Stop() { 737 v.stopInner() 738 v.errProcessWg.Wait() 739 metrics.RemoveValidatorLabelValuesWithTask(v.cfg.Name) 740 } 741 742 func (v *DataValidator) stopInner() { 743 v.Lock() 744 defer v.Unlock() 745 v.L.Info("stopping") 746 if v.Stage() != pb.Stage_Running { 747 v.L.Warn("not started") 748 return 749 } 750 751 v.cancel() 752 v.streamerController.Close() 753 v.fromDB.Close() 754 v.toDB.Close() 755 756 v.wg.Wait() 757 // we want to record all errors, so we need to wait all error sender goroutines to stop 758 // before closing this error chan. 759 close(v.errChan) 760 761 v.setStage(pb.Stage_Stopped) 762 v.L.Info("stopped") 763 } 764 765 func (v *DataValidator) startValidateWorkers() { 766 v.wg.Add(v.workerCnt) 767 v.workers = make([]*validateWorker, v.workerCnt) 768 for i := 0; i < v.workerCnt; i++ { 769 worker := newValidateWorker(v, i) 770 v.workers[i] = worker 771 // worker handles panic in validateTableChange, so we can see it in `dmctl validation status`, 772 // for other panics we just log it. 773 go utils.GoLogWrapper(v.L, func() { 774 defer v.wg.Done() 775 worker.run() 776 }) 777 } 778 779 for _, tblChange := range v.loadedPendingChanges { 780 for key, row := range tblChange.jobs { 781 v.dispatchRowChange(key, row) 782 } 783 } 784 } 785 786 func (v *DataValidator) dispatchRowChange(key string, row *rowValidationJob) { 787 hashVal := int(utils.GenHashKey(key)) % v.workerCnt 788 v.workers[hashVal].rowChangeCh <- row 789 790 v.L.Debug("dispatch row change job", zap.Any("table", row.row.GetSourceTable()), 791 zap.Stringer("type", row.Tp), zap.String("key", key), zap.Int("worker id", hashVal)) 792 } 793 794 func (v *DataValidator) genValidateTableInfo(sourceTable *filter.Table, columnCount int) (*validateTableInfo, error) { 795 targetTable := v.syncer.route(sourceTable) 796 // there are 2 cases tracker may drop table: 797 // 1. checkpoint rollback, tracker may recreate tables and drop non-needed tables 798 // 2. when operate-schema set/remove 799 // in case 1, we add another layer synchronization to make sure we don't get a dropped table when recreation. 
800 // for non-needed tables, we will not validate them. 801 // in case 2, validator should be paused 802 res := &validateTableInfo{targetTable: targetTable} 803 var ( 804 tableInfo *model.TableInfo 805 err error 806 ) 807 tableInfo, err = v.syncer.getTrackedTableInfo(sourceTable) 808 if err != nil { 809 switch { 810 case schema.IsTableNotExists(err): 811 // not a table need to sync 812 res.message = tableNotSyncedOrDropped 813 return res, nil 814 case terror.ErrSchemaTrackerIsClosed.Equal(err): 815 // schema tracker is closed 816 // try to get table schema from checkpoint 817 tableInfo = v.syncer.getTableInfoFromCheckpoint(sourceTable) 818 if tableInfo == nil { 819 // get table schema from checkpoint failed 820 return res, errors.Annotate(err, "fail to get table info from checkpoint") 821 } 822 default: 823 return res, err 824 } 825 } 826 if len(tableInfo.Columns) < columnCount { 827 res.message = moreColumnInBinlogMsg 828 return res, nil 829 } 830 831 tableID := utils.GenTableID(targetTable) 832 downstreamTableInfo, err := v.syncer.getDownStreamTableInfo(v.tctx, tableID, tableInfo) 833 if err != nil { 834 // todo: might be connection error, then return error, or downstream table not exists, then set state to stopped. 835 return res, err 836 } 837 pk := downstreamTableInfo.WhereHandle.UniqueNotNullIdx 838 if pk == nil { 839 res.message = tableWithoutPrimaryKeyMsg 840 return res, nil 841 } 842 // offset of pk column is adjusted using source table info, the offsets should stay in range of ev.ColumnCount. 843 for _, col := range pk.Columns { 844 if col.Offset >= columnCount { 845 res.message = downstreamPKColumnOutOfBoundsMsg 846 return res, nil 847 } 848 } 849 // if current TI has more columns, clone and strip columns 850 if len(tableInfo.Columns) > columnCount { 851 tableInfo = tableInfo.Clone() 852 tableInfo.Columns = tableInfo.Columns[:columnCount] 853 } 854 855 res.srcTableInfo = tableInfo 856 res.downstreamTableInfo = downstreamTableInfo 857 return res, nil 858 } 859 860 func (v *DataValidator) processRowsEvent(header *replication.EventHeader, ev *replication.RowsEvent) error { 861 sourceTable := &filter.Table{ 862 Schema: string(ev.Table.Schema), 863 Name: string(ev.Table.Table), 864 } 865 866 failpoint.Inject("ValidatorPanic", func() {}) 867 868 if err := checkLogColumns(ev.SkippedColumns); err != nil { 869 return terror.Annotate(err, sourceTable.String()) 870 } 871 872 needSkip, err := v.syncer.skipRowsEvent(sourceTable, header.EventType) 873 if err != nil { 874 return err 875 } 876 if needSkip { 877 return nil 878 } 879 880 fullTableName := sourceTable.String() 881 state, ok := v.getTableStatus(fullTableName) 882 if ok && state.stage == pb.Stage_Stopped { 883 return nil 884 } 885 886 validateTbl, err := v.genValidateTableInfo(sourceTable, int(ev.ColumnCount)) 887 if err != nil { 888 return terror.Annotate(err, "failed to get table info") 889 } 890 891 targetTable := validateTbl.targetTable 892 if state == nil { 893 state = &tableValidateStatus{ 894 source: *sourceTable, 895 target: *targetTable, 896 stage: pb.Stage_Running, 897 } 898 899 v.L.Info("put table status", zap.Stringer("state", state)) 900 v.putTableStatus(fullTableName, state) 901 } 902 if validateTbl.message != "" { 903 v.L.Warn("stop validating table", zap.String("table", sourceTable.String()), 904 zap.String("reason", validateTbl.message)) 905 state.stopped(validateTbl.message) 906 return nil 907 } 908 909 tableInfo, downstreamTableInfo := validateTbl.srcTableInfo, validateTbl.downstreamTableInfo 910 911 changeType := 
getRowChangeType(header.EventType) 912 913 step := 1 914 if changeType == rowUpdated { 915 step = 2 916 } 917 estimatedRowSize := int32(header.EventSize) / int32(len(ev.Rows)) 918 for i := 0; i < len(ev.Rows); i += step { 919 var beforeImage, afterImage []interface{} 920 switch changeType { 921 case rowInsert: 922 afterImage = ev.Rows[i] 923 case rowUpdated: 924 beforeImage, afterImage = ev.Rows[i], ev.Rows[i+1] 925 default: // rowDeleted 926 beforeImage = ev.Rows[i] 927 } 928 929 rowChange := sqlmodel.NewRowChange( 930 &cdcmodel.TableName{Schema: sourceTable.Schema, Table: sourceTable.Name}, 931 &cdcmodel.TableName{Schema: targetTable.Schema, Table: targetTable.Name}, 932 beforeImage, afterImage, 933 tableInfo, downstreamTableInfo.TableInfo, 934 nil, 935 ) 936 rowChange.SetWhereHandle(downstreamTableInfo.WhereHandle) 937 size := estimatedRowSize 938 if changeType == rowUpdated && rowChange.IsIdentityUpdated() { 939 delRow, insRow := rowChange.SplitUpdate() 940 delRowKey := genRowKey(delRow) 941 v.dispatchRowChange(delRowKey, &rowValidationJob{Key: delRowKey, Tp: rowDeleted, row: delRow, size: size}) 942 v.processedRowCounts[rowDeleted].Inc() 943 944 insRowKey := genRowKey(insRow) 945 v.dispatchRowChange(insRowKey, &rowValidationJob{Key: insRowKey, Tp: rowInsert, row: insRow, size: size}) 946 v.processedRowCounts[rowInsert].Inc() 947 } else { 948 rowKey := genRowKey(rowChange) 949 if changeType == rowUpdated { 950 size *= 2 951 } 952 v.dispatchRowChange(rowKey, &rowValidationJob{Key: rowKey, Tp: changeType, row: rowChange, size: size}) 953 v.processedRowCounts[changeType].Inc() 954 } 955 } 956 return nil 957 } 958 959 func (v *DataValidator) checkAndPersistCheckpointAndData(loc binlog.Location) error { 960 metaFlushInterval := v.cfg.ValidatorCfg.MetaFlushInterval.Duration 961 cutOverLocation := v.cutOverLocation.Load() 962 needCutOver := cutOverLocation != nil && binlog.CompareLocation(*cutOverLocation, loc, v.cfg.EnableGTID) <= 0 963 if time.Since(v.lastFlushTime) > metaFlushInterval || needCutOver { 964 if needCutOver { 965 v.cutOverLocation.Store(nil) 966 } 967 v.lastFlushTime = time.Now() 968 if err := v.persistCheckpointAndData(loc); err != nil { 969 v.L.Warn("failed to flush checkpoint: ", zap.Error(err)) 970 if isRetryableValidateError(err) { 971 return nil 972 } 973 return err 974 } 975 } 976 return nil 977 } 978 979 func (v *DataValidator) persistCheckpointAndData(loc binlog.Location) error { 980 var wg sync.WaitGroup 981 wg.Add(v.workerCnt) 982 flushJob := &rowValidationJob{ 983 Tp: flushCheckpoint, 984 wg: &wg, 985 } 986 for i, worker := range v.workers { 987 v.L.Debug("dispatch flush job", zap.Int("worker id", i)) 988 worker.rowChangeCh <- flushJob 989 } 990 wg.Wait() 991 992 v.L.Info("persist checkpoint and intermediate data", 993 zap.Int64("pending size", v.getPendingRowSize()), 994 zap.Int64("pending count", v.getAllPendingRowCount()), 995 zap.Int64("new error", v.newErrorRowCount.Load())) 996 997 err := v.persistHelper.persist(v.tctx, loc) 998 if err != nil { 999 return err 1000 } 1001 1002 // reset errors after save 1003 for _, worker := range v.workers { 1004 worker.resetErrorRows() 1005 } 1006 v.newErrorRowCount.Store(0) 1007 v.setFlushedLoc(&loc) 1008 return nil 1009 } 1010 1011 func (v *DataValidator) loadPersistedData() error { 1012 data, err := v.persistHelper.loadPersistedDataRetry(v.tctx) 1013 if err != nil { 1014 return err 1015 } 1016 // table info of pending change is not persisted in order to save space, so need to init them after load. 
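    // for each persisted table below, genValidateTableInfo is re-run to rebuild the source and
    // downstream table info, and every pending row is turned back into a RowChange from the single
    // saved row image (for updates, both the before and after images are set to that saved image).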
1017 pendingChanges := make(map[string]*tableChangeJob) 1018 for _, tblChange := range data.pendingChanges { 1019 // todo: if table is dropped since last run, we should skip rows related to this table & update table status 1020 // see https://github.com/pingcap/tiflow/pull/4881#discussion_r834093316 1021 sourceTable := tblChange.sourceTable 1022 validateTbl, err2 := v.genValidateTableInfo(sourceTable, tblChange.columnCount) 1023 if err2 != nil { 1024 return terror.Annotate(err2, "failed to get table info on load") 1025 } 1026 if validateTbl.message != "" { 1027 return errors.New("failed to get table info " + validateTbl.message) 1028 } 1029 pendingTblChange := newTableChangeJob() 1030 // aggregate using target table just as worker did. 1031 pendingChanges[validateTbl.targetTable.String()] = pendingTblChange 1032 for _, row := range tblChange.rows { 1033 var beforeImage, afterImage []interface{} 1034 switch row.Tp { 1035 case rowInsert: 1036 afterImage = row.Data 1037 case rowUpdated: 1038 // set both to row.Data, since we only save one image on persist in order to save space 1039 beforeImage, afterImage = row.Data, row.Data 1040 default: 1041 // rowDeleted 1042 beforeImage = row.Data 1043 } 1044 pendingTblChange.jobs[row.Key] = &rowValidationJob{ 1045 Key: row.Key, 1046 Tp: row.Tp, 1047 row: sqlmodel.NewRowChange( 1048 &cdcmodel.TableName{Schema: sourceTable.Schema, Table: sourceTable.Name}, 1049 &cdcmodel.TableName{Schema: validateTbl.targetTable.Schema, Table: validateTbl.targetTable.Name}, 1050 beforeImage, afterImage, 1051 validateTbl.srcTableInfo, validateTbl.downstreamTableInfo.TableInfo, 1052 nil, 1053 ), 1054 size: row.Size, 1055 FailedCnt: row.FailedCnt, 1056 } 1057 } 1058 } 1059 1060 v.location = data.checkpoint 1061 v.setProcessedRowCounts(data.processedRowCounts) 1062 v.loadedPendingChanges = pendingChanges 1063 v.persistHelper.setRevision(data.rev) 1064 v.initTableStatus(data.tableStatus) 1065 1066 return nil 1067 } 1068 1069 func (v *DataValidator) incrErrorRowCount(cnt int) { 1070 v.newErrorRowCount.Add(int64(cnt)) 1071 } 1072 1073 func (v *DataValidator) getWorkers() []*validateWorker { 1074 return v.workers 1075 } 1076 1077 func (v *DataValidator) Started() bool { 1078 v.stateMutex.RLock() 1079 defer v.stateMutex.RUnlock() 1080 return v.stage == pb.Stage_Running 1081 } 1082 1083 func (v *DataValidator) Stage() pb.Stage { 1084 v.stateMutex.RLock() 1085 defer v.stateMutex.RUnlock() 1086 return v.stage 1087 } 1088 1089 func (v *DataValidator) setStage(stage pb.Stage) { 1090 v.stateMutex.Lock() 1091 defer v.stateMutex.Unlock() 1092 v.stage = stage 1093 } 1094 1095 func (v *DataValidator) getFlushedLoc() *binlog.Location { 1096 v.stateMutex.RLock() 1097 defer v.stateMutex.RUnlock() 1098 return v.flushedLoc 1099 } 1100 1101 func (v *DataValidator) setFlushedLoc(loc *binlog.Location) { 1102 v.stateMutex.Lock() 1103 defer v.stateMutex.Unlock() 1104 if loc == nil { 1105 v.flushedLoc = nil 1106 return 1107 } 1108 clone := loc.Clone() 1109 v.flushedLoc = &clone 1110 } 1111 1112 func (v *DataValidator) getResult() pb.ProcessResult { 1113 v.stateMutex.RLock() 1114 defer v.stateMutex.RUnlock() 1115 return v.result 1116 } 1117 1118 func (v *DataValidator) addResultError(err *pb.ProcessError, cancelled bool) { 1119 v.stateMutex.Lock() 1120 defer v.stateMutex.Unlock() 1121 if err != nil { 1122 v.result.Errors = append(v.result.Errors, err) 1123 } 1124 v.result.IsCanceled = cancelled 1125 } 1126 1127 func (v *DataValidator) getResultErrCnt() int { 1128 v.stateMutex.Lock() 1129 defer 
v.stateMutex.Unlock() 1130 return len(v.result.Errors) 1131 } 1132 1133 func (v *DataValidator) resetResult() { 1134 v.stateMutex.Lock() 1135 defer v.stateMutex.Unlock() 1136 v.result.Reset() 1137 } 1138 1139 func (v *DataValidator) initTableStatus(m map[string]*tableValidateStatus) { 1140 v.stateMutex.Lock() 1141 defer v.stateMutex.Unlock() 1142 v.tableStatus = m 1143 } 1144 1145 func (v *DataValidator) getTableStatus(fullTableName string) (*tableValidateStatus, bool) { 1146 v.stateMutex.RLock() 1147 defer v.stateMutex.RUnlock() 1148 res, ok := v.tableStatus[fullTableName] 1149 return res, ok 1150 } 1151 1152 // return snapshot of the current table status. 1153 func (v *DataValidator) getTableStatusMap() map[string]*tableValidateStatus { 1154 v.stateMutex.RLock() 1155 defer v.stateMutex.RUnlock() 1156 tblStatus := make(map[string]*tableValidateStatus) 1157 for key, tblStat := range v.tableStatus { 1158 stat := &tableValidateStatus{} 1159 *stat = *tblStat // deep copy 1160 tblStatus[key] = stat 1161 } 1162 return tblStatus 1163 } 1164 1165 func (v *DataValidator) putTableStatus(name string, status *tableValidateStatus) { 1166 v.stateMutex.Lock() 1167 defer v.stateMutex.Unlock() 1168 v.tableStatus[name] = status 1169 } 1170 1171 func (v *DataValidator) isMarkErrorStarted() bool { 1172 return v.markErrorStarted.Load() 1173 } 1174 1175 func (v *DataValidator) getProcessedRowCounts() []int64 { 1176 return []int64{ 1177 v.processedRowCounts[rowInsert].Load(), 1178 v.processedRowCounts[rowUpdated].Load(), 1179 v.processedRowCounts[rowDeleted].Load(), 1180 } 1181 } 1182 1183 func (v *DataValidator) setProcessedRowCounts(counts []int64) { 1184 v.processedRowCounts[rowInsert].Store(counts[rowInsert]) 1185 v.processedRowCounts[rowUpdated].Store(counts[rowUpdated]) 1186 v.processedRowCounts[rowDeleted].Store(counts[rowDeleted]) 1187 } 1188 1189 func (v *DataValidator) addPendingRowCount(tp rowChangeJobType, cnt int64) { 1190 v.pendingRowCounts[tp].Add(cnt) 1191 } 1192 1193 func (v *DataValidator) getAllPendingRowCount() int64 { 1194 return v.pendingRowCounts[rowInsert].Load() + 1195 v.pendingRowCounts[rowUpdated].Load() + 1196 v.pendingRowCounts[rowDeleted].Load() 1197 } 1198 1199 func (v *DataValidator) addPendingRowSize(size int64) { 1200 v.pendingRowSize.Add(size) 1201 } 1202 1203 func (v *DataValidator) getPendingRowSize() int64 { 1204 return v.pendingRowSize.Load() 1205 } 1206 1207 func (v *DataValidator) sendError(err error) { 1208 v.errChan <- err 1209 } 1210 1211 func (v *DataValidator) getNewErrorRowCount() int64 { 1212 return v.newErrorRowCount.Load() 1213 } 1214 1215 // getRowChangeType should be called only when the event type is RowsEvent. 
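// WRITE_ROWS_EVENT v0/v1/v2 map to rowInsert, UPDATE_ROWS_EVENT v0/v1/v2 map to rowUpdated,
// and anything else (in practice DELETE_ROWS_EVENT v0/v1/v2) falls through to rowDeleted.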
1216 func getRowChangeType(t replication.EventType) rowChangeJobType { 1217 switch t { 1218 case replication.WRITE_ROWS_EVENTv0, replication.WRITE_ROWS_EVENTv1, replication.WRITE_ROWS_EVENTv2: 1219 return rowInsert 1220 case replication.UPDATE_ROWS_EVENTv0, replication.UPDATE_ROWS_EVENTv1, replication.UPDATE_ROWS_EVENTv2: 1221 return rowUpdated 1222 default: 1223 // replication.DELETE_ROWS_EVENTv0, replication.DELETE_ROWS_EVENTv1, replication.DELETE_ROWS_EVENTv2: 1224 return rowDeleted 1225 } 1226 } 1227 1228 func genRowKey(row *sqlmodel.RowChange) string { 1229 vals := row.RowStrIdentity() 1230 return genRowKeyByString(vals) 1231 } 1232 1233 func genRowKeyByString(pkValues []string) string { 1234 // in the scenario below, the generated key may not be unique, but it's rare 1235 // suppose a table with multiple column primary key: (v1, v2) 1236 // for below case, the generated key is the same: 1237 // (aaa\t, bbb) and (aaa, \tbbb), the joint values both are "aaa\t\tbbb" 1238 join := strings.Join(pkValues, "\t") 1239 // if the key is too long, need to make sure it can be stored into database 1240 if len(join) > maxRowKeyLength { 1241 sum := sha256.Sum256([]byte(join)) 1242 return hex.EncodeToString(sum[:]) 1243 } 1244 return join 1245 } 1246 1247 func (v *DataValidator) GetValidatorTableStatus(filterStatus pb.Stage) []*pb.ValidationTableStatus { 1248 tblStatus := v.getTableStatusMap() 1249 1250 result := make([]*pb.ValidationTableStatus, 0) 1251 for _, tblStat := range tblStatus { 1252 returnAll := filterStatus == pb.Stage_InvalidStage 1253 if returnAll || tblStat.stage == filterStatus { 1254 result = append(result, &pb.ValidationTableStatus{ 1255 Source: v.cfg.SourceID, 1256 SrcTable: tblStat.source.String(), 1257 DstTable: tblStat.target.String(), 1258 Stage: tblStat.stage, 1259 Message: tblStat.message, 1260 }) 1261 } 1262 } 1263 return result 1264 } 1265 1266 func (v *DataValidator) GetValidatorError(errState pb.ValidateErrorState) ([]*pb.ValidationError, error) { 1267 // todo: validation error in workers cannot be returned 1268 // because the errID is only allocated when the error rows are flushed 1269 // user cannot handle errorRows without errID 1270 var ( 1271 toDB *conn.BaseDB 1272 err error 1273 dbCfg dbconfig.DBConfig 1274 ) 1275 ctx, cancel := context.WithTimeout(context.Background(), validatorDmctlOpTimeout) 1276 tctx := tcontext.NewContext(ctx, v.L) 1277 defer cancel() 1278 failpoint.Inject("MockValidationQuery", func() { 1279 toDB = v.persistHelper.db 1280 failpoint.Return(v.persistHelper.loadError(tctx, toDB, errState)) 1281 }) 1282 dbCfg = v.cfg.To 1283 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetMaxIdleConns(1) 1284 toDB, err = conn.GetDownstreamDB(&dbCfg) 1285 if err != nil { 1286 v.L.Warn("failed to create downstream db", zap.Error(err)) 1287 return nil, err 1288 } 1289 defer dbconn.CloseBaseDB(tctx, toDB) 1290 ret, err := v.persistHelper.loadError(tctx, toDB, errState) 1291 if err != nil { 1292 v.L.Warn("fail to load validator error", zap.Error(err)) 1293 return nil, err 1294 } 1295 return ret, nil 1296 } 1297 1298 func (v *DataValidator) OperateValidatorError(validateOp pb.ValidationErrOp, errID uint64, isAll bool) error { 1299 var ( 1300 toDB *conn.BaseDB 1301 err error 1302 dbCfg dbconfig.DBConfig 1303 ) 1304 ctx, cancel := context.WithTimeout(context.Background(), validatorDmctlOpTimeout) 1305 tctx := tcontext.NewContext(ctx, v.L) 1306 defer cancel() 1307 failpoint.Inject("MockValidationQuery", func() { 1308 toDB = v.persistHelper.db 1309 
failpoint.Return(v.persistHelper.operateError(tctx, toDB, validateOp, errID, isAll)) 1310 }) 1311 dbCfg = v.cfg.To 1312 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetMaxIdleConns(1) 1313 toDB, err = conn.GetDownstreamDB(&dbCfg) 1314 if err != nil { 1315 return err 1316 } 1317 defer dbconn.CloseBaseDB(tctx, toDB) 1318 return v.persistHelper.operateError(tctx, toDB, validateOp, errID, isAll) 1319 } 1320 1321 func (v *DataValidator) UpdateValidator(req *pb.UpdateValidationWorkerRequest) error { 1322 var ( 1323 pos = mysql.Position{} 1324 gs mysql.GTIDSet 1325 err error 1326 ) 1327 if len(req.BinlogPos) > 0 { 1328 pos, err = binlog.PositionFromPosStr(req.BinlogPos) 1329 if err != nil { 1330 return err 1331 } 1332 } 1333 if len(req.BinlogGTID) > 0 { 1334 gs, err = gtid.ParserGTID(v.cfg.Flavor, req.BinlogGTID) 1335 if err != nil { 1336 return err 1337 } 1338 } 1339 cutOverLocation := binlog.NewLocation(pos, gs) 1340 v.cutOverLocation.Store(&cutOverLocation) 1341 v.syncer.cutOverLocation.Store(&cutOverLocation) 1342 return nil 1343 } 1344 1345 func (v *DataValidator) getErrorRowCount(timeout time.Duration) ([errorStateTypeCount]int64, error) { 1346 ctx, cancel := context.WithTimeout(context.Background(), timeout) 1347 defer cancel() 1348 tctx := tcontext.NewContext(ctx, v.L) 1349 1350 // use a separate db to get error count, since validator maybe stopped or initializing 1351 dbCfg := v.cfg.To 1352 dbCfg.RawDBCfg = dbconfig.DefaultRawDBConfig().SetMaxIdleConns(1) 1353 countMap := map[pb.ValidateErrorState]int64{} 1354 toDB, err := conn.GetDownstreamDB(&dbCfg) 1355 if err != nil { 1356 v.L.Warn("failed to create downstream db", zap.Error(err)) 1357 } else { 1358 defer dbconn.CloseBaseDB(tctx, toDB) 1359 countMap, err = v.persistHelper.loadErrorCount(tctx, toDB) 1360 if err != nil { 1361 v.L.Warn("failed to load error count", zap.Error(err)) 1362 } 1363 } 1364 var errorRowCount [errorStateTypeCount]int64 1365 errorRowCount[pb.ValidateErrorState_NewErr] = countMap[pb.ValidateErrorState_NewErr] 1366 errorRowCount[pb.ValidateErrorState_IgnoredErr] = countMap[pb.ValidateErrorState_IgnoredErr] 1367 errorRowCount[pb.ValidateErrorState_ResolvedErr] = countMap[pb.ValidateErrorState_ResolvedErr] 1368 1369 errorRowCount[pb.ValidateErrorState_NewErr] += v.newErrorRowCount.Load() 1370 1371 return errorRowCount, err 1372 } 1373 1374 func (v *DataValidator) GetValidatorStatus() *pb.ValidationStatus { 1375 var extraMsg string 1376 errorRowCount, err := v.getErrorRowCount(validatorDmctlOpTimeout) 1377 if err != nil { 1378 // nolint:nilerr 1379 extraMsg = fmt.Sprintf(" (failed to load error count from meta db: %s)", err.Error()) 1380 } 1381 // if we print those state in a structured way, there would be at least 9 lines for each subtask, 1382 // which is hard to read, so print them into one line. 
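    // e.g. processed/pending rows render as "insert/update/delete: 12/3/0" and error rows as
    // "new/ignored/resolved: 1/0/0" (the numbers here are only an illustration).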
1383 template := "insert/update/delete: %d/%d/%d" 1384 processedRowCounts := v.getProcessedRowCounts() 1385 processedRows := fmt.Sprintf(template, processedRowCounts[rowInsert], 1386 processedRowCounts[rowUpdated], processedRowCounts[rowDeleted]) 1387 pendingRows := fmt.Sprintf(template, v.pendingRowCounts[rowInsert].Load(), 1388 v.pendingRowCounts[rowUpdated].Load(), v.pendingRowCounts[rowDeleted].Load()) 1389 errorRows := fmt.Sprintf("new/ignored/resolved: %d/%d/%d%s", 1390 errorRowCount[pb.ValidateErrorState_NewErr], errorRowCount[pb.ValidateErrorState_IgnoredErr], 1391 errorRowCount[pb.ValidateErrorState_ResolvedErr], extraMsg) 1392 1393 result := v.getResult() 1394 returnedResult := &result 1395 if !result.IsCanceled && len(result.Errors) == 0 { 1396 // no need to show if validator is running normally 1397 returnedResult = nil 1398 } 1399 1400 flushedLoc := v.getFlushedLoc() 1401 var validatorBinlog, validatorBinlogGtid string 1402 if flushedLoc != nil { 1403 validatorBinlog = flushedLoc.Position.String() 1404 if flushedLoc.GetGTID() != nil { 1405 validatorBinlogGtid = flushedLoc.GetGTID().String() 1406 } 1407 } 1408 var cutoverBinlogPos, cutoverBinlogGTID string 1409 if cutOverLoc := v.cutOverLocation.Load(); cutOverLoc != nil { 1410 cutoverBinlogPos = cutOverLoc.Position.String() 1411 if cutOverLoc.GetGTID() != nil { 1412 cutoverBinlogGTID = cutOverLoc.GetGTID().String() 1413 } 1414 } 1415 1416 return &pb.ValidationStatus{ 1417 Task: v.cfg.Name, 1418 Source: v.cfg.SourceID, 1419 Mode: v.cfg.ValidatorCfg.Mode, 1420 Stage: v.Stage(), 1421 Result: returnedResult, 1422 ValidatorBinlog: validatorBinlog, 1423 ValidatorBinlogGtid: validatorBinlogGtid, 1424 ProcessedRowsStatus: processedRows, 1425 PendingRowsStatus: pendingRows, 1426 ErrorRowsStatus: errorRows, 1427 CutoverBinlogPos: cutoverBinlogPos, 1428 CutoverBinlogGtid: cutoverBinlogGTID, 1429 } 1430 }
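
// The function below is an illustrative sketch only and is not referenced anywhere in DM;
// it assumes the caller already has the subtask's *config.SubTaskConfig and *Syncer, and it
// shows how the lifecycle documented on DataValidator is typically driven: one validator per
// subtask, Start with the expected stage, Stop on "dmctl validation stop" or subtask close.
func exampleValidatorLifecycle(cfg *config.SubTaskConfig, syncerObj *Syncer) {
    // created together with the subtask; true means it starts along with the subtask.
    validator := NewContinuousDataValidator(cfg, syncerObj, true)

    // Start is a no-op unless the expected stage is Running; initialization errors are
    // recorded in the process result (see fillResult) rather than returned.
    validator.Start(pb.Stage_Running)

    // status and errors can be queried at any time, e.g. for "dmctl validation status".
    _ = validator.GetValidatorStatus()

    // Stop cancels the internal context, waits for all routines and closes both DBs.
    validator.Stop()
}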