github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/relay/relay.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package relay 15 16 import ( 17 "context" 18 "crypto/tls" 19 "fmt" 20 "os" 21 "path/filepath" 22 "strings" 23 "sync" 24 "time" 25 26 "github.com/go-mysql-org/go-mysql/mysql" 27 "github.com/go-mysql-org/go-mysql/replication" 28 "github.com/pingcap/errors" 29 "github.com/pingcap/failpoint" 30 "github.com/pingcap/tidb/pkg/parser" 31 "github.com/pingcap/tidb/pkg/util" 32 "github.com/pingcap/tiflow/dm/config" 33 "github.com/pingcap/tiflow/dm/config/dbconfig" 34 "github.com/pingcap/tiflow/dm/pb" 35 "github.com/pingcap/tiflow/dm/pkg/binlog" 36 "github.com/pingcap/tiflow/dm/pkg/binlog/common" 37 binlogReader "github.com/pingcap/tiflow/dm/pkg/binlog/reader" 38 "github.com/pingcap/tiflow/dm/pkg/conn" 39 tcontext "github.com/pingcap/tiflow/dm/pkg/context" 40 "github.com/pingcap/tiflow/dm/pkg/gtid" 41 "github.com/pingcap/tiflow/dm/pkg/log" 42 parserpkg "github.com/pingcap/tiflow/dm/pkg/parser" 43 pkgstreamer "github.com/pingcap/tiflow/dm/pkg/streamer" 44 "github.com/pingcap/tiflow/dm/pkg/terror" 45 "github.com/pingcap/tiflow/dm/pkg/utils" 46 "github.com/pingcap/tiflow/dm/unit" 47 "go.uber.org/atomic" 48 "go.uber.org/zap" 49 ) 50 51 // used to fill RelayLogInfo. 
// fakeRelayTaskName is the task-name placeholder written into RelayLogInfo,
// since relay itself is not a real subtask.
var fakeRelayTaskName = "relay"

const (
	// flushMetaInterval is how often doIntervalOps flushes dirty relay meta to disk.
	flushMetaInterval = 30 * time.Second
	// getMasterStatusInterval is how often doIntervalOps polls the upstream master position for metrics.
	getMasterStatusInterval = 30 * time.Second
	// trimUUIDsInterval is how often doIntervalOps trims the UUID index file.
	trimUUIDsInterval           = 1 * time.Hour
	showStatusConnectionTimeout = "1m"

	// dumpFlagSendAnnotateRowsEvent (BINLOG_SEND_ANNOTATE_ROWS_EVENT) request the MariaDB master to send Annotate_rows_log_event back.
	dumpFlagSendAnnotateRowsEvent uint16 = 0x02
)

// NewRelay creates an instance of Relay. It is a package-level variable so tests
// can substitute a mock constructor.
var NewRelay = NewRealRelay

// compile-time check that *Relay satisfies the Process interface.
var _ Process = &Relay{}

// Listener defines a binlog event listener of relay log.
type Listener interface {
	// OnEvent get called when relay processed an event successfully.
	OnEvent(e *replication.BinlogEvent)
}

// Process defines mysql-like relay log process unit.
type Process interface {
	// Init initializes the relay log unit
	Init(ctx context.Context) (err error)
	// Process run background logic of relay log unit
	Process(ctx context.Context) pb.ProcessResult
	// ActiveRelayLog returns the earliest active relay log info in this operator
	ActiveRelayLog() *pkgstreamer.RelayLogInfo
	// Reload reloads config
	Reload(newCfg *Config) error
	// Update updates config
	Update(cfg *config.SubTaskConfig) error
	// Resume resumes paused relay log process unit
	Resume(ctx context.Context, pr chan pb.ProcessResult)
	// Pause pauses a running relay log process unit
	Pause()
	// Error returns error message if having one
	Error() interface{}
	// Status returns status of relay log process unit.
	Status(sourceStatus *binlog.SourceStatus) interface{}
	// Close does some clean works
	Close()
	// IsClosed returns whether relay log process unit was closed
	IsClosed() bool
	// SaveMeta save relay meta
	SaveMeta(pos mysql.Position, gset mysql.GTIDSet) error
	// ResetMeta reset relay meta
	ResetMeta()
	// PurgeRelayDir will clear all contents under w.cfg.RelayDir
	PurgeRelayDir() error
	// RegisterListener registers a relay listener
	RegisterListener(el Listener)
	// UnRegisterListener unregisters a relay listener
	UnRegisterListener(el Listener)
	// NewReader creates a new relay reader
	NewReader(logger log.Logger, cfg *BinlogReaderConfig) *BinlogReader
	// IsActive check whether given uuid+filename is active binlog file, if true return current file offset
	IsActive(uuid, filename string) (bool, int64)
}

// Relay relays mysql binlog to local file.
type Relay struct {
	db        *conn.BaseDB                  // connection to the upstream master; set in process(), closed in Close()
	cfg       *Config
	syncerCfg replication.BinlogSyncerConfig

	meta   Meta        // relay meta (position/GTID); guarded by the embedded RWMutex below
	closed atomic.Bool // whether Close() has completed
	sync.RWMutex

	logger log.Logger

	// activeRelayLog tracks the relay log file currently being written,
	// protected by its own RWMutex independent of the outer one.
	activeRelayLog struct {
		sync.RWMutex
		info *pkgstreamer.RelayLogInfo
	}

	writer    Writer
	listeners map[Listener]struct{} // make it a set to make it easier to remove listener
}

// NewRealRelay creates an instance of Relay.
func NewRealRelay(cfg *Config) Process {
	r := &Relay{
		cfg:       cfg,
		meta:      NewLocalMeta(cfg.Flavor, cfg.RelayDir),
		logger:    log.With(zap.String("component", "relay log")),
		listeners: make(map[Listener]struct{}),
	}
	r.writer = NewFileWriter(r.logger, cfg.RelayDir)
	return r
}

// Init implements the dm.Unit interface.
// NOTE when Init encounters an error, it will make DM-worker exit when it boots up and assigned relay.
150 func (r *Relay) Init(ctx context.Context) (err error) { 151 return reportRelayLogSpaceInBackground(ctx, r.cfg.RelayDir) 152 } 153 154 // Process implements the dm.Unit interface. 155 func (r *Relay) Process(ctx context.Context) pb.ProcessResult { 156 relayExitWithErrorCounter.WithLabelValues("true").Add(0) 157 relayExitWithErrorCounter.WithLabelValues("false").Add(0) 158 errs := make([]*pb.ProcessError, 0, 1) 159 err := r.process(ctx) 160 if err != nil && errors.Cause(err) != replication.ErrSyncClosed { 161 r.logger.Error("process exit", zap.Error(err)) 162 // TODO: add specified error type instead of pb.ErrorType_UnknownError 163 processError := unit.NewProcessError(err) 164 resumable := fmt.Sprintf("%t", unit.IsResumableRelayError(processError)) 165 relayExitWithErrorCounter.WithLabelValues(resumable).Inc() 166 errs = append(errs, processError) 167 } 168 169 isCanceled := false 170 if len(errs) == 0 { 171 select { 172 case <-ctx.Done(): 173 isCanceled = true 174 default: 175 } 176 } 177 return pb.ProcessResult{ 178 IsCanceled: isCanceled, 179 Errors: errs, 180 } 181 } 182 183 func (r *Relay) process(ctx context.Context) error { 184 err := r.setSyncConfig() 185 if err != nil { 186 return err 187 } 188 189 db, err := conn.GetUpstreamDB(&r.cfg.From) 190 if err != nil { 191 return terror.WithScope(err, terror.ScopeUpstream) 192 } 193 r.db = db 194 195 if err2 := os.MkdirAll(r.cfg.RelayDir, 0o700); err2 != nil { 196 return terror.ErrRelayMkdir.Delegate(err2) 197 } 198 199 err = r.meta.Load() 200 if err != nil { 201 return err 202 } 203 204 parser2, err := conn.GetParser(tcontext.NewContext(ctx, log.L()), r.db) // refine to use user config later 205 if err != nil { 206 return err 207 } 208 209 isNew, err := isNewServer(ctx, r.meta.SubDir(), r.db, r.cfg.Flavor) 210 if err != nil { 211 return err 212 } 213 214 failpoint.Inject("NewUpstreamServer", func(_ failpoint.Value) { 215 // test a bug which caused by upstream switching 216 isNew = true 217 }) 218 219 if isNew { 
220 // re-setup meta for new server or new source 221 err = r.reSetupMeta(ctx) 222 if err != nil { 223 return err 224 } 225 } else { 226 // connected to last source 227 r.updateMetricsRelaySubDirIndex() 228 // if not a new server, try to recover the latest relay log file. 229 err = r.tryRecoverLatestFile(ctx, parser2) 230 if err != nil { 231 return err 232 } 233 234 // resuming will take the risk that upstream has purge the binlog relay is needed. 235 // when this worker is down, HA may schedule the source to other workers and forward the sync progress, 236 // and then when the source is scheduled back to this worker, we could start relay from sync checkpoint's 237 // location which is newer, and now could purge the outdated relay logs. 238 // 239 // locations in `r.cfg` is set to min needed location of subtasks (higher priority) or source config specified 240 isRelayMetaOutdated := false 241 neededBinlogName := r.cfg.BinLogName 242 neededBinlogGset, err2 := gtid.ParserGTID(r.cfg.Flavor, r.cfg.BinlogGTID) 243 if err2 != nil { 244 return err2 245 } 246 if r.cfg.EnableGTID { 247 _, metaGset := r.meta.GTID() 248 if neededBinlogGset.Contain(metaGset) && !neededBinlogGset.Equal(metaGset) { 249 isRelayMetaOutdated = true 250 } 251 } else { 252 _, metaPos := r.meta.Pos() 253 if mysql.CompareBinlogFileName(neededBinlogName, metaPos.Name) > 0 { 254 isRelayMetaOutdated = true 255 } 256 } 257 258 if isRelayMetaOutdated { 259 uuidWithSuffix := r.meta.SubDir() // only change after switch 260 err2 = r.PurgeRelayDir() 261 if err2 != nil { 262 return err2 263 } 264 r.ResetMeta() 265 266 uuid, _, err3 := utils.ParseRelaySubDir(uuidWithSuffix) 267 if err3 != nil { 268 r.logger.Error("parse suffix for UUID when relay meta outdated", zap.String("UUID", uuidWithSuffix), zap.Error(err)) 269 return err3 270 } 271 272 pos := &mysql.Position{Name: neededBinlogName, Pos: binlog.MinPosition.Pos} 273 err = r.meta.AddDir(uuid, pos, neededBinlogGset, r.cfg.UUIDSuffix) 274 if err != nil { 275 
return err 276 } 277 err = r.meta.Load() 278 if err != nil { 279 return err 280 } 281 } 282 } 283 284 reader2, err := r.setUpReader(ctx) 285 if err != nil { 286 return err 287 } 288 defer func() { 289 if reader2 != nil { 290 err = reader2.Close() 291 if err != nil { 292 r.logger.Error("fail to close binlog event reader", zap.Error(err)) 293 } 294 } 295 }() 296 297 uuid, pos := r.meta.Pos() 298 r.writer.Init(uuid, pos.Name) 299 r.logger.Info("started underlying writer", zap.String("UUID", uuid), zap.String("filename", pos.Name)) 300 defer func() { 301 err = r.writer.Close() 302 if err != nil { 303 r.logger.Error("fail to close binlog event writer", zap.Error(err)) 304 } 305 }() 306 307 readerRetry, err := NewReaderRetry(r.cfg.ReaderRetry) 308 if err != nil { 309 return err 310 } 311 312 go r.doIntervalOps(ctx) 313 314 // handles binlog events with retry mechanism. 315 // it only do the retry for some binlog reader error now. 316 for { 317 err = r.handleEvents(ctx, reader2, parser2) 318 if err == nil { 319 return nil 320 } else if !readerRetry.Check(ctx, err) { 321 return err 322 } 323 324 r.logger.Warn("receive retryable error for binlog reader", log.ShortError(err)) 325 err = reader2.Close() // close the previous reader 326 if err != nil { 327 r.logger.Error("fail to close binlog event reader", zap.Error(err)) 328 } 329 reader2, err = r.setUpReader(ctx) // setup a new one 330 if err != nil { 331 return err 332 } 333 r.logger.Info("retrying to read binlog") 334 } 335 } 336 337 // PurgeRelayDir implements the dm.Unit interface. 
338 func (r *Relay) PurgeRelayDir() error { 339 dir := r.cfg.RelayDir 340 d, err := os.Open(dir) 341 r.logger.Info("will try purge whole relay dir for new relay log", zap.String("relayDir", dir)) 342 // fail to open dir, return directly 343 if err != nil { 344 if err == os.ErrNotExist { 345 return nil 346 } 347 return err 348 } 349 defer d.Close() 350 names, err := d.Readdirnames(-1) 351 if err != nil { 352 return err 353 } 354 for _, name := range names { 355 err = os.RemoveAll(filepath.Join(dir, name)) 356 if err != nil { 357 return err 358 } 359 } 360 r.logger.Info("relay dir is purged to be ready for new relay log", zap.String("relayDir", dir)) 361 return nil 362 } 363 364 // tryRecoverLatestFile tries to recover latest relay log file with corrupt/incomplete binlog events/transactions. 365 func (r *Relay) tryRecoverLatestFile(ctx context.Context, parser2 *parser.Parser) error { 366 var ( 367 uuid, latestPos = r.meta.Pos() 368 _, latestGTID = r.meta.GTID() 369 ) 370 371 if latestPos.Compare(minCheckpoint) <= 0 { 372 r.logger.Warn("no relay log file need to recover", zap.Stringer("position", latestPos), log.WrapStringerField("gtid set", latestGTID)) 373 return nil 374 } 375 376 // setup a special writer to do the recovering 377 binlogDir := r.meta.Dir() 378 r.logger.Info("started recover", zap.String("binlog dir", binlogDir), zap.String("filename", latestPos.Name)) 379 380 // NOTE: recover a relay log file with too many binlog events may take a little long time. 
381 result, err := r.doRecovering(ctx, r.meta.Dir(), latestPos.Name, parser2) 382 if err == nil { 383 relayLogHasMore := result.LatestPos.Compare(latestPos) > 0 || 384 (result.LatestGTIDs != nil && !result.LatestGTIDs.Equal(latestGTID) && result.LatestGTIDs.Contain(latestGTID)) 385 386 if result.Truncated || relayLogHasMore { 387 r.logger.Warn("relay log file recovered", 388 zap.Stringer("from position", latestPos), zap.Stringer("to position", result.LatestPos), log.WrapStringerField("from GTID set", latestGTID), log.WrapStringerField("to GTID set", result.LatestGTIDs)) 389 390 if result.LatestGTIDs != nil { 391 dbConn, err2 := r.db.GetBaseConn(ctx) 392 if err2 != nil { 393 return err2 394 } 395 defer r.db.CloseConnWithoutErr(dbConn) 396 result.LatestGTIDs, err2 = conn.AddGSetWithPurged(ctx, result.LatestGTIDs, dbConn) 397 if err2 != nil { 398 return err2 399 } 400 latestGTID, err2 = conn.AddGSetWithPurged(ctx, latestGTID, dbConn) 401 if err2 != nil { 402 return err2 403 } 404 } 405 406 if result.LatestGTIDs != nil && !result.LatestGTIDs.Equal(latestGTID) { 407 if result.LatestGTIDs.Contain(latestGTID) { 408 r.logger.Warn("some GTIDs are missing in the meta data, this is usually due to the process was interrupted while writing the meta data. force to update GTIDs", 409 log.WrapStringerField("from GTID set", latestGTID), log.WrapStringerField("to GTID set", result.LatestGTIDs)) 410 } 411 latestGTID = result.LatestGTIDs.Clone() 412 } 413 414 err = r.SaveMeta(result.LatestPos, latestGTID) 415 if err != nil { 416 return terror.Annotatef(err, "save position %s, GTID sets %v after recovered", result.LatestPos, result.LatestGTIDs) 417 } 418 } 419 } 420 return terror.Annotatef(err, "recover for UUID %s with relay dir %s, filename %s", uuid, binlogDir, latestPos.Name) 421 } 422 423 // recoverResult represents a result for a binlog recover operation. 
type recoverResult struct {
	// if truncate trailing incomplete events during recovering in relay log
	Truncated bool
	// the latest binlog position after recover operation has done.
	LatestPos mysql.Position
	// the latest binlog GTID set after recover operation has done.
	LatestGTIDs mysql.GTIDSet
}

// doRecovering tries to recover the current binlog file.
// 1. read events from the file
// 2.
//
// a. update the position with the event's position if the transaction finished
// b. update the GTID set with the event's GTID if the transaction finished
//
// 3. truncate any incomplete events/transactions
// now, we think a transaction finished if we received a XIDEvent or DDL in QueryEvent
// NOTE: handle cases when file size > 4GB.
func (r *Relay) doRecovering(ctx context.Context, binlogDir, filename string, parser *parser.Parser) (recoverResult, error) {
	fullName := filepath.Join(binlogDir, filename)
	fs, err := os.Stat(fullName)
	// a missing file, or an empty filename, means there is nothing to recover.
	if (err != nil && os.IsNotExist(err)) || (err == nil && len(filename) == 0) {
		return recoverResult{}, nil // no file need to recover
	} else if err != nil {
		return recoverResult{}, terror.ErrRelayWriterGetFileStat.Delegate(err, fullName)
	}

	// get latest pos/GTID set for all completed transactions from the file
	latestPos, latestGTIDs, err := getTxnPosGTIDs(ctx, fullName, parser)
	if err != nil {
		return recoverResult{}, terror.Annotatef(err, "get latest pos/GTID set from %s", fullName)
	}

	// mock file truncated by recover
	failpoint.Inject("MockRecoverRelayWriter", func() {
		r.logger.Info("mock recover relay writer")
		// jump straight to the truncation path below, skipping the size check.
		failpoint.Goto("bypass")
	})

	// in most cases, we think the file is fine, so compare the size is simpler.
	if fs.Size() == latestPos {
		return recoverResult{
			Truncated:   false,
			LatestPos:   mysql.Position{Name: filename, Pos: uint32(latestPos)},
			LatestGTIDs: latestGTIDs,
		}, nil
	} else if fs.Size() < latestPos {
		// the parsed position lies beyond the physical file size: inconsistent state.
		return recoverResult{}, terror.ErrRelayWriterLatestPosGTFileSize.Generate(latestPos, fs.Size())
	}

	failpoint.Label("bypass")

	// truncate the file to drop the trailing incomplete events/transactions.
	f, err := os.OpenFile(fullName, os.O_WRONLY, 0o600)
	if err != nil {
		return recoverResult{}, terror.Annotatef(terror.ErrRelayWriterFileOperate.New(err.Error()), "open %s", fullName)
	}
	defer f.Close()
	err = f.Truncate(latestPos)
	if err != nil {
		return recoverResult{}, terror.Annotatef(terror.ErrRelayWriterFileOperate.New(err.Error()), "truncate %s to %d", fullName, latestPos)
	}

	return recoverResult{
		Truncated:   true,
		LatestPos:   mysql.Position{Name: filename, Pos: uint32(latestPos)},
		LatestGTIDs: latestGTIDs,
	}, nil
}

const (
	ignoreReasonHeartbeat      = "heartbeat event"
	ignoreReasonArtificialFlag = "artificial flag (0x0020) set"
)

// preprocessResult is the outcome of preprocessEvent for a single binlog event.
type preprocessResult struct {
	Ignore       bool          // whether the event should be ignored
	IgnoreReason string        // why the transformer ignore the event
	LogPos       uint32        // binlog event's End_log_pos or Position in RotateEvent
	NextLogName  string        // next binlog filename, only valid for RotateEvent
	GTIDSet      mysql.GTIDSet // GTIDSet got from QueryEvent and XIDEvent when RawModeEnabled not true
	CanSaveGTID  bool          // whether can save GTID into meta, true for DDL query and XIDEvent
}

// preprocessEvent classifies a binlog event: extracts its position / next log
// name / GTID set and decides whether it should be ignored or may trigger a
// meta GTID save. It never mutates the event itself.
func (r *Relay) preprocessEvent(e *replication.BinlogEvent, parser2 *parser.Parser) preprocessResult {
	result := preprocessResult{
		LogPos: e.Header.LogPos,
	}

	switch ev := e.Event.(type) {
	case *replication.PreviousGTIDsEvent:
		result.CanSaveGTID = true
	case *replication.MariadbGTIDListEvent:
		result.CanSaveGTID = true
	case *replication.RotateEvent:
		// NOTE: we need to get the first binlog filename from fake RotateEvent when using auto position
		result.LogPos = uint32(ev.Position)         // next event's position
		result.NextLogName = string(ev.NextLogName) // for RotateEvent, update binlog name
	case *replication.QueryEvent:
		// when RawModeEnabled not true, QueryEvent will be parsed.
		if parserpkg.CheckIsDDL(string(ev.Query), parser2) {
			// we only update/save GTID for DDL/XID event
			// if the query is something like `BEGIN`, we do not update/save GTID.
			result.GTIDSet = ev.GSet
			result.CanSaveGTID = true
		}
	case *replication.XIDEvent:
		// when RawModeEnabled not true, XIDEvent will be parsed.
		result.GTIDSet = ev.GSet
		result.CanSaveGTID = true // need save GTID for XID
	case *replication.GenericEvent:
		// handle some un-parsed events
		if e.Header.EventType == replication.HEARTBEAT_EVENT {
			// ignore artificial heartbeat event
			// ref: https://dev.mysql.com/doc/internals/en/heartbeat-event.html
			result.Ignore = true
			result.IgnoreReason = ignoreReasonHeartbeat
		}
	default:
		if e.Header.Flags&replication.LOG_EVENT_ARTIFICIAL_F != 0 {
			// ignore events with LOG_EVENT_ARTIFICIAL_F flag(0x0020) set
			// ref: https://dev.mysql.com/doc/internals/en/binlog-event-flag.html
			result.Ignore = true
			result.IgnoreReason = ignoreReasonArtificialFlag
		}
	}
	return result
}

// handleEvents handles binlog events, including:
// 1. read events from upstream
// 2. transform events
// 3. write events into relay log files
// 4. update metadata if needed.
func (r *Relay) handleEvents(
	ctx context.Context,
	reader2 Reader,
	parser2 *parser.Parser,
) error {
	var (
		_, lastPos  = r.meta.Pos()
		_, lastGTID = r.meta.GTID()
		err         error
		eventIndex  int // only for test
	)
	// lastGTID may be nil when meta has no GTID yet; start from an empty set of the right flavor.
	if lastGTID == nil {
		if lastGTID, err = gtid.ParserGTID(r.cfg.Flavor, ""); err != nil {
			return err
		}
	}

	firstEvent := true
	relayPosGauge := relayLogPosGauge.WithLabelValues("relay")
	relayFileGauge := relayLogFileGauge.WithLabelValues("relay")
	for {
		// 1. read events from upstream server
		readTimer := time.Now()
		rResult, err := reader2.GetEvent(ctx)

		failpoint.Inject("RelayGetEventFailed", func() {
			err = errors.New("RelayGetEventFailed")
		})
		failpoint.Inject("RelayGetEventFailedAt", func(v failpoint.Value) {
			if intVal, ok := v.(int); ok && intVal == eventIndex {
				err = errors.New("fail point triggered")
				_, gtid := r.meta.GTID()
				r.logger.Warn("failed to get event", zap.Int("event_index", eventIndex),
					zap.Any("gtid", gtid), log.ShortError(err))
				// wait backoff retry interval
				time.Sleep(1 * time.Second)
			}
		})
		if err != nil {
			switch errors.Cause(err) {
			case context.Canceled:
				// normal shutdown: not an error.
				return nil
			case replication.ErrChecksumMismatch:
				relayLogDataCorruptionCounter.Inc()
			case replication.ErrSyncClosed, replication.ErrNeedSyncAgain:
				// do nothing, but the error will be returned
			default:
				if conn.IsErrBinlogPurged(err) {
					// TODO: try auto fix GTID, and can support auto switching between upstream server later.
					cfg := r.cfg.From
					r.logger.Error("the requested binlog files have purged in the master server or the master server have switched, currently DM do no support to handle this error",
						zap.String("db host", cfg.Host), zap.Int("db port", cfg.Port), zap.Stringer("last pos", lastPos), log.ShortError(err))
					// log the status for debug
					tctx := tcontext.NewContext(ctx, r.logger)
					pos, gs, err2 := conn.GetPosAndGs(tctx, r.db, r.cfg.Flavor)
					if err2 == nil {
						r.logger.Info("current master status", zap.Stringer("position", pos), log.WrapStringerField("GTID sets", gs))
					}
				}
				binlogReadErrorCounter.Inc()
			}
			return err
		}

		binlogReadDurationHistogram.Observe(time.Since(readTimer).Seconds())
		failpoint.Inject("BlackholeReadBinlog", func(_ failpoint.Value) {
			// r.logger.Info("back hole read binlog takes effects")
			failpoint.Continue()
		})

		e := rResult.Event
		r.logger.Debug("receive binlog event with header", zap.Reflect("header", e.Header))

		// 2. transform events
		transformTimer := time.Now()
		tResult := r.preprocessEvent(e, parser2)
		binlogTransformDurationHistogram.Observe(time.Since(transformTimer).Seconds())
		// a RotateEvent pointing to a newer file advances lastPos before any write happens.
		if len(tResult.NextLogName) > 0 && tResult.NextLogName > lastPos.Name {
			lastPos = mysql.Position{
				Name: tResult.NextLogName,
				Pos:  tResult.LogPos,
			}
			r.logger.Info("rotate event", zap.Stringer("position", lastPos))
		}
		if tResult.Ignore {
			r.logger.Info("ignore event by transformer",
				zap.Reflect("header", e.Header),
				zap.String("reason", tResult.IgnoreReason))
			continue
		}

		if _, ok := e.Event.(*replication.RotateEvent); ok && utils.IsFakeRotateEvent(e.Header) {
			isNew, err2 := isNewServer(ctx, r.meta.SubDir(), r.db, r.cfg.Flavor)
			// should start from the transaction beginning when switch to a new server
			if err2 != nil {
				return err2
			}
			// upstream database switch
			// report an error, let outer logic handle it
			// should start from the transaction beginning when switch to a new server
			if isNew {
				return terror.ErrRotateEventWithDifferentServerID.Generate()
			}
		}

		if firstEvent {
			// on the first event we got, which must be a fake rotate event, save and flush meta once to make sure
			// meta file exists before binlog file exists(lastPos.Name cannot be empty now)
			// when switch from A to B then back to A, meta's been assigned to minCheckpoint, since it's taken as a new server.
			// and meta file is not created when relay resumed.
			firstEvent = false
			if err2 := r.saveAndFlushMeta(lastPos, lastGTID); err2 != nil {
				return err2
			}
		}

		// 3. save events into file
		writeTimer := time.Now()
		if ce := r.logger.Check(zap.DebugLevel, ""); ce != nil {
			r.logger.Debug("writing binlog event", zap.Reflect("header", e.Header))
		}
		wResult, err := r.writer.WriteEvent(e)
		if err != nil {
			relayLogWriteErrorCounter.Inc()
			return err
		} else if wResult.Ignore {
			r.logger.Info("ignore event by writer",
				zap.Reflect("header", e.Header),
				zap.String("reason", wResult.IgnoreReason))
			r.tryUpdateActiveRelayLog(e, lastPos.Name) // even the event ignored we still need to try this update.
			continue
		}

		// notify registered listeners only for events actually written.
		r.notify(e)

		relayLogWriteDurationHistogram.Observe(time.Since(writeTimer).Seconds())
		r.tryUpdateActiveRelayLog(e, lastPos.Name) // wrote a event, try update the current active relay log.

		// 4. update meta and metrics
		needSavePos := tResult.CanSaveGTID
		lastPos.Pos = tResult.LogPos
		lastGTID = tResult.GTIDSet
		if !r.cfg.EnableGTID {
			// if go-mysql set RawModeEnabled to true
			// then it will only parse FormatDescriptionEvent and RotateEvent
			// then check `e.Event.(type)` for `QueryEvent` and `XIDEvent` will never be true
			// so we need to update pos for all events
			// and also save pos for all events
			if e.Header.EventType != replication.ROTATE_EVENT {
				lastPos.Pos = e.Header.LogPos // for RotateEvent, lastPos updated to the next binlog file's position.
			}
			needSavePos = true
		}

		relayLogWriteSizeHistogram.Observe(float64(e.Header.EventSize))
		relayPosGauge.Set(float64(lastPos.Pos))
		if e.Header.EventType == replication.FORMAT_DESCRIPTION_EVENT {
			if index, err2 := utils.GetFilenameIndex(lastPos.Name); err2 != nil {
				r.logger.Error("parse binlog file name", zap.String("file name", lastPos.Name), log.ShortError(err2))
			} else {
				relayFileGauge.Set(float64(index))
			}
		}

		if needSavePos {
			err = r.SaveMeta(lastPos, lastGTID)
			if err != nil {
				return terror.Annotatef(err, "save position %s, GTID sets %v into meta", lastPos, lastGTID)
			}
			// eventIndex counts events since the last successful meta save (used by failpoints in tests).
			eventIndex = 0
		} else {
			eventIndex++
		}
		if tResult.NextLogName != "" && !utils.IsFakeRotateEvent(e.Header) {
			// if the binlog is rotated, we need to save and flush the next binlog filename to meta
			lastPos.Name = tResult.NextLogName
			if err := r.saveAndFlushMeta(lastPos, lastGTID); err != nil {
				return err
			}
		}
	}
}

// saveAndFlushMeta saves the given position/GTID set into meta and immediately
// flushes the meta file to disk.
func (r *Relay) saveAndFlushMeta(lastPos mysql.Position, lastGTID mysql.GTIDSet) error {
	if err := r.SaveMeta(lastPos, lastGTID); err != nil {
		return terror.Annotatef(err, "save position %s, GTID sets %v into meta", lastPos, lastGTID)
	}
	return r.FlushMeta()
}

// tryUpdateActiveRelayLog tries to update current active relay log file.
// we should to update after received/wrote a FormatDescriptionEvent because it means switched to a new relay log file.
// NOTE: we can refactor active (writer/read) relay log mechanism later.
func (r *Relay) tryUpdateActiveRelayLog(e *replication.BinlogEvent, filename string) {
	if e.Header.EventType == replication.FORMAT_DESCRIPTION_EVENT {
		r.setActiveRelayLog(filename)
		r.logger.Info("change the active relay log file", zap.String("file name", filename))
	}
}

// reSetupMeta re-setup the metadata when switching to a new upstream master server.
func (r *Relay) reSetupMeta(ctx context.Context) error {
	uuid, err := conn.GetServerUUID(tcontext.NewContext(ctx, log.L()), r.db, r.cfg.Flavor)
	if err != nil {
		return err
	}

	var newPos *mysql.Position
	var newGset mysql.GTIDSet
	var newUUIDSuffix int
	if r.cfg.UUIDSuffix > 0 {
		// if bound or rebound to a source, clear all relay log and meta
		if err = r.PurgeRelayDir(); err != nil {
			return err
		}
		r.ResetMeta()

		newUUIDSuffix = r.cfg.UUIDSuffix
		// reset the UUIDSuffix
		r.cfg.UUIDSuffix = 0

		if len(r.cfg.BinLogName) != 0 {
			newPos = &mysql.Position{Name: r.cfg.BinLogName, Pos: binlog.MinPosition.Pos}
		}
		if len(r.cfg.BinlogGTID) != 0 {
			newGset, err = gtid.ParserGTID(r.cfg.Flavor, r.cfg.BinlogGTID)
			if err != nil {
				return err
			}
		}
	}
	err = r.meta.AddDir(uuid, newPos, newGset, newUUIDSuffix)
	if err != nil {
		return err
	}
	err = r.meta.Load()
	if err != nil {
		return err
	}

	// when no start position/GTID is configured for the active mode, query the
	// upstream's current status so AdjustWithStartPos can fall back to it.
	var latestPosName, latestGTIDStr string
	if (r.cfg.EnableGTID && len(r.cfg.BinlogGTID) == 0) || (!r.cfg.EnableGTID && len(r.cfg.BinLogName) == 0) {
		tctx := tcontext.NewContext(ctx, r.logger)
		latestPos, latestGTID, err2 := conn.GetPosAndGs(tctx, r.db, r.cfg.Flavor)
		if err2 != nil {
			return err2
		}
		latestPosName = latestPos.Name
		latestGTIDStr = latestGTID.String()
	}

	// try adjust meta with start pos from config
	_, err = r.meta.AdjustWithStartPos(r.cfg.BinLogName, r.cfg.BinlogGTID, r.cfg.EnableGTID, latestPosName, latestGTIDStr)
	if err != nil {
		return err
	}

	_, pos := r.meta.Pos()
	_, gs := r.meta.GTID()
	if r.cfg.EnableGTID {
		// Adjust given gtid
		// This means we always pull the binlog from the beginning of file.
		gs, err = r.adjustGTID(ctx, gs)
		if err != nil {
			return terror.Annotate(err, "fail to adjust gtid for relay")
		}
		err = r.SaveMeta(pos, gs)
		if err != nil {
			return err
		}
	}

	r.logger.Info("adjusted meta to start pos", zap.Any("start pos", pos), zap.Stringer("start pos's binlog gtid", gs))
	r.updateMetricsRelaySubDirIndex()
	r.logger.Info("resetup meta", zap.String("uuid", uuid))

	return nil
}

// updateMetricsRelaySubDirIndex refreshes the relay sub-dir index gauge for the
// current master node after a sub-dir switch.
func (r *Relay) updateMetricsRelaySubDirIndex() {
	// when switching master server, update sub dir index metrics
	node := r.masterNode()
	uuidWithSuffix := r.meta.SubDir() // only change after switch
	_, suffix, err := utils.ParseRelaySubDir(uuidWithSuffix)
	if err != nil {
		r.logger.Error("parse suffix for UUID", zap.String("UUID", uuidWithSuffix), zap.Error(err))
		return
	}
	relaySubDirIndex.WithLabelValues(node, uuidWithSuffix).Set(float64(suffix))
}

// doIntervalOps runs periodic background tasks until ctx is canceled or the
// relay is closed: flushing dirty meta, polling master status for metrics, and
// trimming the UUID index file. Each tick takes the read lock and re-checks
// the closed flag so it never races with Close().
func (r *Relay) doIntervalOps(ctx context.Context) {
	flushTicker := time.NewTicker(flushMetaInterval)
	defer flushTicker.Stop()
	masterStatusTicker := time.NewTicker(getMasterStatusInterval)
	defer masterStatusTicker.Stop()
	trimUUIDsTicker := time.NewTicker(trimUUIDsInterval)
	defer trimUUIDsTicker.Stop()

	for {
		select {
		case <-flushTicker.C:
			r.RLock()
			if r.closed.Load() {
				r.RUnlock()
				return
			}
			if r.meta.Dirty() {
				err := r.FlushMeta()
				if err != nil {
					r.logger.Error("flush meta", zap.Error(err))
				} else {
					r.logger.Info("flush meta finished", zap.Stringer("meta", r.meta))
				}
			}
			r.RUnlock()
		case <-masterStatusTicker.C:
			r.RLock()
			if r.closed.Load() {
				r.RUnlock()
				return
			}
			ctx2, cancel2 := context.WithTimeout(ctx, conn.DefaultDBTimeout)
			tctx := tcontext.NewContext(ctx2, r.logger)
			pos, _, err := conn.GetPosAndGs(tctx, r.db, r.cfg.Flavor)
			cancel2()
			if err != nil {
				r.logger.Warn("get master status", zap.Error(err))
				r.RUnlock()
				continue
			}
			index, err := utils.GetFilenameIndex(pos.Name)
			if err != nil {
				r.logger.Error("parse binlog file name", zap.String("file name", pos.Name), log.ShortError(err))
				r.RUnlock()
				continue
			}
			relayLogFileGauge.WithLabelValues("master").Set(float64(index))
			relayLogPosGauge.WithLabelValues("master").Set(float64(pos.Pos))
			r.RUnlock()
		case <-trimUUIDsTicker.C:
			r.RLock()
			if r.closed.Load() {
				r.RUnlock()
				return
			}
			trimmed, err := r.meta.TrimUUIDIndexFile()
			if err != nil {
				r.logger.Error("trim UUIDs", zap.Error(err))
			} else if len(trimmed) > 0 {
				r.logger.Info("trim UUIDs", zap.String("UUIDs", strings.Join(trimmed, ";")))
			}
			r.RUnlock()
		case <-ctx.Done():
			return
		}
	}
}

// setUpReader setups the underlying reader used to read binlog events from the upstream master server.
func (r *Relay) setUpReader(ctx context.Context) (Reader, error) {
	ctx2, cancel := context.WithTimeout(ctx, conn.DefaultDBTimeout)
	defer cancel()

	// always use a new random serverID
	randomServerID, err := conn.GetRandomServerID(tcontext.NewContext(ctx2, log.L()), r.db)
	if err != nil {
		// should never happened unless the master has too many slave
		return nil, terror.Annotate(err, "fail to get random server id for relay reader")
	}
	r.syncerCfg.ServerID = randomServerID
	r.cfg.ServerID = randomServerID

	uuid, pos := r.meta.Pos()
	_, gs := r.meta.GTID()
	cfg := &RConfig{
		SyncConfig: r.syncerCfg,
		Pos:        pos,
		GTIDs:      gs,
		MasterID:   r.masterNode(),
		EnableGTID: r.cfg.EnableGTID,
	}

	reader2 := NewUpstreamReader(cfg)
	err = reader2.Start()
	if err != nil {
		// do not log the whole config to protect the password in `SyncConfig`.
		// and other config items should already logged before or included in `err`.
		return nil, terror.Annotatef(err, "start reader for UUID %s", uuid)
	}

	r.logger.Info("started underlying reader", zap.String("UUID", uuid))
	return reader2, nil
}

// masterNode returns the upstream master's "host:port" identifier.
func (r *Relay) masterNode() string {
	return fmt.Sprintf("%s:%d", r.cfg.From.Host, r.cfg.From.Port)
}

// IsClosed tells whether Relay unit is closed or not.
func (r *Relay) IsClosed() bool {
	return r.closed.Load()
}

// SaveMeta save relay meta and update meta in RelayLogInfo.
func (r *Relay) SaveMeta(pos mysql.Position, gset mysql.GTIDSet) error {
	return r.meta.Save(pos, gset)
}

// ResetMeta reset relay meta.
func (r *Relay) ResetMeta() {
	r.Lock()
	defer r.Unlock()
	r.meta = NewLocalMeta(r.cfg.Flavor, r.cfg.RelayDir)
}

// FlushMeta flush relay meta.
func (r *Relay) FlushMeta() error {
	return r.meta.Flush()
}

// stopSync stops syncing, now it used by Close and Pause.
func (r *Relay) stopSync() {
	if err := r.FlushMeta(); err != nil {
		r.logger.Error("flush checkpoint", zap.Error(err))
	}
}

// closeDB closes the upstream DB connection if it is open; safe to call twice.
func (r *Relay) closeDB() {
	if r.db != nil {
		r.db.Close()
		r.db = nil
	}
}

// Close implements the dm.Unit interface.
// It is idempotent: a second call returns immediately once the closed flag is set.
func (r *Relay) Close() {
	r.Lock()
	defer r.Unlock()
	if r.closed.Load() {
		return
	}
	r.logger.Info("relay unit is closing")

	r.stopSync()

	r.closeDB()

	r.closed.Store(true)
	r.logger.Info("relay unit closed")
}

// IsActive check whether given uuid+filename is the active binlog file; if true,
// it also returns the current file offset.
func (r *Relay) IsActive(uuid, filename string) (bool, int64) {
	return r.writer.IsActive(uuid, filename)
}

// Status implements the dm.Unit interface.
func (r *Relay) Status(sourceStatus *binlog.SourceStatus) interface{} {
	r.RLock()
	defer r.RUnlock()
	uuid, relayPos := r.meta.Pos()

	rs := &pb.RelayStatus{
		RelaySubDir: uuid,
		RelayBinlog: relayPos.String(),
	}
	if _, relayGTIDSet := r.meta.GTID(); relayGTIDSet != nil {
		rs.RelayBinlogGtid = relayGTIDSet.String()
	}

	if sourceStatus != nil {
		masterPos, masterGTID := sourceStatus.Location.Position, sourceStatus.Location.GetGTID()
		rs.MasterBinlog = masterPos.String()
		if masterGTID != nil { // masterGTID maybe a nil interface
			rs.MasterBinlogGtid = masterGTID.String()
		}

		if r.cfg.EnableGTID {
			// rely on sorted GTID set when String()
			rs.RelayCatchUpMaster = rs.MasterBinlogGtid == rs.RelayBinlogGtid
		} else {
			rs.RelayCatchUpMaster = masterPos.Compare(relayPos) == 0
		}
	}

	return rs
}

// Error implements the dm.Unit interface.
func (r *Relay) Error() interface{} {
	return &pb.RelayError{}
}

// Type implements the dm.Unit interface.
func (r *Relay) Type() pb.UnitType {
	return pb.UnitType_Relay
}

// IsFreshTask implements Unit.IsFreshTask.
func (r *Relay) IsFreshTask() (bool, error) {
	return true, nil
}

// Pause pauses the process, it can be resumed later.
func (r *Relay) Pause() {
	if r.IsClosed() {
		r.logger.Warn("try to pause, but already closed")
		return
	}

	r.stopSync()
}

// Resume resumes the paused process.
func (r *Relay) Resume(ctx context.Context, pr chan pb.ProcessResult) {
	// do nothing now, re-process called `Process` from outer directly
}

// Update implements Unit.Update.
func (r *Relay) Update(cfg *config.SubTaskConfig) error {
	// not support update configuration now
	return nil
}

// Reload updates config.
func (r *Relay) Reload(newCfg *Config) error {
	r.Lock()
	defer r.Unlock()
	r.logger.Info("relay unit is updating")

	// Update From
	r.cfg.From = newCfg.From

	// Update Charset
	r.cfg.Charset = newCfg.Charset

	// re-create the upstream DB connection with the new `From` settings,
	// using the read timeout suited for status queries.
	r.closeDB()
	if r.cfg.From.RawDBCfg == nil {
		r.cfg.From.RawDBCfg = dbconfig.DefaultRawDBConfig()
	}
	r.cfg.From.RawDBCfg.ReadTimeout = showStatusConnectionTimeout
	db, err := conn.GetUpstreamDB(&r.cfg.From)
	if err != nil {
		// NOTE(review): if GetUpstreamDB may return a nil db together with a
		// non-nil error, `db.Scope` below would panic — confirm its contract.
		return terror.DBErrorAdapt(err, db.Scope, terror.ErrDBDriverError)
	}
	r.db = db

	// rebuild the binlog syncer config so it picks up the new From/Charset.
	if err := r.setSyncConfig(); err != nil {
		return err
	}

	r.logger.Info("relay unit is updated")

	return nil
}

// setActiveRelayLog sets or updates the current active relay log to file.
func (r *Relay) setActiveRelayLog(filename string) {
	uuid := r.meta.SubDir()
	// parse error is deliberately discarded here (updateMetricsRelaySubDirIndex
	// logs it instead); suffix is then whatever ParseRelaySubDir returned.
	_, suffix, _ := utils.ParseRelaySubDir(uuid)
	rli := &pkgstreamer.RelayLogInfo{
		TaskName:     fakeRelayTaskName,
		SubDir:       uuid,
		SubDirSuffix: suffix,
		Filename:     filename,
	}
	r.activeRelayLog.Lock()
	r.activeRelayLog.info = rli
	r.activeRelayLog.Unlock()
}

// ActiveRelayLog returns the current active RelayLogInfo.
func (r *Relay) ActiveRelayLog() *pkgstreamer.RelayLogInfo {
	r.activeRelayLog.RLock()
	defer r.activeRelayLog.RUnlock()
	return r.activeRelayLog.info
}

// setSyncConfig (re)builds the replication.BinlogSyncerConfig used to connect
// to the upstream master from the current relay configuration, including the
// optional TLS settings, and stores it in r.syncerCfg.
func (r *Relay) setSyncConfig() error {
	var tlsConfig *tls.Config
	var err error
	if r.cfg.From.Security != nil {
		if loadErr := r.cfg.From.Security.LoadTLSContent(); loadErr != nil {
			return terror.ErrCtlLoadTLSCfg.Delegate(loadErr)
		}
		tlsConfig, err = util.NewTLSConfig(
			util.WithCAContent(r.cfg.From.Security.SSLCABytes),
			util.WithCertAndKeyContent(r.cfg.From.Security.SSLCertBytes, r.cfg.From.Security.SSLKeyBytes),
			util.WithVerifyCommonName(r.cfg.From.Security.CertAllowedCN),
			util.WithMinTLSVersion(tls.VersionTLS10),
		)
		if err != nil {
			return terror.ErrConnInvalidTLSConfig.Delegate(err)
		}
	}

	// present a recognizable, length-limited hostname ("dm-relay-<host>")
	// to the upstream; MySQL caps hostnames at 60 bytes:
	// https://github.com/mysql/mysql-server/blob/1bfe02bdad6604d54913c62614bde57a055c8332/include/my_hostname.h#L33-L42
	h, _ := os.Hostname()
	h = "dm-relay-" + h
	if len(h) > 60 {
		h = h[:60]
	}
	syncerCfg := replication.BinlogSyncerConfig{
		ServerID:  r.cfg.ServerID,
		Flavor:    r.cfg.Flavor,
		Host:      r.cfg.From.Host,
		Port:      uint16(r.cfg.From.Port),
		User:      r.cfg.From.User,
		Password:  r.cfg.From.Password,
		Charset:   r.cfg.Charset,
		TLSConfig: tlsConfig,
		Localhost: h,
	}
	common.SetDefaultReplicationCfg(&syncerCfg, common.MaxBinlogSyncerReconnect)

	// non-GTID relay reads events in raw mode (go-mysql `RawModeEnabled`).
	if !r.cfg.EnableGTID {
		syncerCfg.RawModeEnabled = true
	}

	if r.cfg.Flavor == mysql.MariaDBFlavor {
		// request MariaDB to send Annotate_rows_log_event back.
		syncerCfg.DumpCommandFlag |= dumpFlagSendAnnotateRowsEvent
	}

	r.syncerCfg = syncerCfg
	return nil
}

// AdjustGTID implements Relay.AdjustGTID
// starting sync at returned gset will wholly fetch a binlog from beginning of the file.
1188 func (r *Relay) adjustGTID(ctx context.Context, gset mysql.GTIDSet) (mysql.GTIDSet, error) { 1189 // setup a TCP binlog reader (because no relay can be used when upgrading). 1190 syncCfg := r.syncerCfg 1191 // always use a new random serverID 1192 randomServerID, err := conn.GetRandomServerID(tcontext.NewContext(ctx, log.L()), r.db) 1193 if err != nil { 1194 return nil, terror.Annotate(err, "fail to get random server id when relay adjust gtid") 1195 } 1196 syncCfg.ServerID = randomServerID 1197 tcpReader := binlogReader.NewTCPReader(syncCfg) 1198 resultGs, err := binlogReader.GetPreviousGTIDFromGTIDSet(ctx, tcpReader, gset) 1199 if err != nil { 1200 return nil, err 1201 } 1202 1203 dbConn, err2 := r.db.GetBaseConn(ctx) 1204 if err2 != nil { 1205 return nil, err2 1206 } 1207 defer r.db.CloseConnWithoutErr(dbConn) 1208 return conn.AddGSetWithPurged(ctx, resultGs, dbConn) 1209 } 1210 1211 func (r *Relay) notify(e *replication.BinlogEvent) { 1212 r.RLock() 1213 defer r.RUnlock() 1214 for el := range r.listeners { 1215 el.OnEvent(e) 1216 } 1217 } 1218 1219 func (r *Relay) NewReader(logger log.Logger, cfg *BinlogReaderConfig) *BinlogReader { 1220 return newBinlogReader(logger, cfg, r) 1221 } 1222 1223 // RegisterListener implements Process.RegisterListener. 1224 func (r *Relay) RegisterListener(el Listener) { 1225 r.Lock() 1226 defer r.Unlock() 1227 r.listeners[el] = struct{}{} 1228 } 1229 1230 // UnRegisterListener implements Process.UnRegisterListener. 1231 func (r *Relay) UnRegisterListener(el Listener) { 1232 r.Lock() 1233 defer r.Unlock() 1234 delete(r.listeners, el) 1235 }