github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/replication/replication_manager.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package replication

import (
	"bytes"
	"container/heap"
	"fmt"
	"math"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/cdc/processor/tablepb"
	"github.com/pingcap/tiflow/cdc/redo"
	"github.com/pingcap/tiflow/cdc/scheduler/internal"
	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
	"github.com/pingcap/tiflow/pkg/spanz"
	"github.com/tikv/client-go/v2/oracle"
	"go.uber.org/zap"
)

const (
	checkpointCannotProceed = internal.CheckpointCannotProceed

	defaultSlowTableHeapSize  = 4
	logSlowTablesLagThreshold = 30 * time.Second
	logSlowTablesInterval     = 1 * time.Minute
	logMissingTableInterval   = 30 * time.Second
)

// Callback is invoked when something is done.
type Callback func()

// BurstBalance for changefeed set up or unplanned TiCDC node failure.
// TiCDC needs to balance interrupted tables as soon as possible.
type BurstBalance struct {
	AddTables    []AddTable
	RemoveTables []RemoveTable
	MoveTables   []MoveTable
}

func (b BurstBalance) String() string {
	if len(b.AddTables) != 0 {
		return fmt.Sprintf("BurstBalance, add tables: %v", b.AddTables)
	}
	if len(b.RemoveTables) != 0 {
		return fmt.Sprintf("BurstBalance, remove tables: %v", b.RemoveTables)
	}
	if len(b.MoveTables) != 0 {
		return fmt.Sprintf("BurstBalance, move tables: %v", b.MoveTables)
	}
	return "BurstBalance, no tables"
}

// MoveTable is a schedule task for moving a table.
type MoveTable struct {
	Span        tablepb.Span
	DestCapture model.CaptureID
}

func (t MoveTable) String() string {
	return fmt.Sprintf("MoveTable, span: %s, dest: %s",
		t.Span.String(), t.DestCapture)
}

// AddTable is a schedule task for adding a table.
type AddTable struct {
	Span         tablepb.Span
	CaptureID    model.CaptureID
	CheckpointTs model.Ts
}

func (t AddTable) String() string {
	return fmt.Sprintf("AddTable, span: %s, capture: %s, checkpointTs: %d",
		t.Span.String(), t.CaptureID, t.CheckpointTs)
}

// RemoveTable is a schedule task for removing a table.
type RemoveTable struct {
	Span      tablepb.Span
	CaptureID model.CaptureID
}

func (t RemoveTable) String() string {
	return fmt.Sprintf("RemoveTable, span: %s, capture: %s",
		t.Span.String(), t.CaptureID)
}

// ScheduleTask is a schedule task that wraps add/move/remove table tasks.
type ScheduleTask struct { //nolint:revive
	MoveTable    *MoveTable
	AddTable     *AddTable
	RemoveTable  *RemoveTable
	BurstBalance *BurstBalance

	Accept Callback
}

// Name returns the name of a schedule task.
func (s *ScheduleTask) Name() string {
	if s.MoveTable != nil {
		return "moveTable"
	} else if s.AddTable != nil {
		return "addTable"
	} else if s.RemoveTable != nil {
		return "removeTable"
	} else if s.BurstBalance != nil {
		return "burstBalance"
	}
	return "unknown"
}

func (s *ScheduleTask) String() string {
	if s.MoveTable != nil {
		return s.MoveTable.String()
	}
	if s.AddTable != nil {
		return s.AddTable.String()
	}
	if s.RemoveTable != nil {
		return s.RemoveTable.String()
	}
	if s.BurstBalance != nil {
		return s.BurstBalance.String()
	}
	return ""
}

// Manager manages replications and running scheduling tasks.
type Manager struct { //nolint:revive
	spans *spanz.BtreeMap[*ReplicationSet]

	runningTasks       *spanz.BtreeMap[*ScheduleTask]
	maxTaskConcurrency int

	changefeedID           model.ChangeFeedID
	slowestPuller          tablepb.Span
	slowestSink            tablepb.Span
	acceptAddTableTask     int
	acceptRemoveTableTask  int
	acceptMoveTableTask    int
	acceptBurstBalanceTask int

	slowTableHeap         SetHeap
	lastLogSlowTablesTime time.Time
	lastMissTableID       tablepb.TableID
	lastLogMissTime       time.Time
}

// NewReplicationManager returns a new replication manager.
func NewReplicationManager(
	maxTaskConcurrency int, changefeedID model.ChangeFeedID,
) *Manager {
	// degreeReadHeavy is a degree optimized for a read-heavy map with many
	// Ascend calls. There may be a large number of tables.
	const degreeReadHeavy = 256
	return &Manager{
		spans:              spanz.NewBtreeMapWithDegree[*ReplicationSet](degreeReadHeavy),
		runningTasks:       spanz.NewBtreeMap[*ScheduleTask](),
		maxTaskConcurrency: maxTaskConcurrency,
		changefeedID:       changefeedID,
	}
}

// HandleCaptureChanges handles capture changes.
func (r *Manager) HandleCaptureChanges(
	init map[model.CaptureID][]tablepb.TableStatus,
	removed map[model.CaptureID][]tablepb.TableStatus,
	checkpointTs model.Ts,
) ([]*schedulepb.Message, error) {
	if init != nil {
		if r.spans.Len() != 0 {
			log.Panic("schedulerv3: init again",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Any("init", init), zap.Any("tablesCount", r.spans.Len()))
		}
		spanStatusMap := spanz.NewBtreeMap[map[model.CaptureID]*tablepb.TableStatus]()
		for captureID, spans := range init {
			for i := range spans {
				table := spans[i]
				if _, ok := spanStatusMap.Get(table.Span); !ok {
					spanStatusMap.ReplaceOrInsert(
						table.Span, map[model.CaptureID]*tablepb.TableStatus{})
				}
				spanStatusMap.GetV(table.Span)[captureID] = &table
			}
		}
		var err error
		spanStatusMap.Ascend(func(span tablepb.Span, status map[string]*tablepb.TableStatus) bool {
			table, err1 := NewReplicationSet(span, checkpointTs, status, r.changefeedID)
			if err1 != nil {
				err = errors.Trace(err1)
				return false
			}
			r.spans.ReplaceOrInsert(table.Span, table)
			return true
		})
		if err != nil {
			return nil, errors.Trace(err)
		}
	}
	sentMsgs := make([]*schedulepb.Message, 0)
	if removed != nil {
		var err error
		r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool {
			for captureID := range removed {
				msgs, affected, err1 := table.handleCaptureShutdown(captureID)
				if err1 != nil {
					err = errors.Trace(err1)
					return false
				}
				sentMsgs = append(sentMsgs, msgs...)
				if affected {
					// Cleanup its running task.
					r.runningTasks.Delete(table.Span)
				}
			}
			return true
		})
		if err != nil {
			return nil, errors.Trace(err)
		}
	}
	return sentMsgs, nil
}

// HandleMessage handles messages sent by other captures.
func (r *Manager) HandleMessage(
	msgs []*schedulepb.Message,
) ([]*schedulepb.Message, error) {
	sentMsgs := make([]*schedulepb.Message, 0, len(msgs))
	for i := range msgs {
		msg := msgs[i]
		switch msg.MsgType {
		case schedulepb.MsgDispatchTableResponse:
			msgs, err := r.handleMessageDispatchTableResponse(msg.From, msg.DispatchTableResponse)
			if err != nil {
				return nil, errors.Trace(err)
			}
			sentMsgs = append(sentMsgs, msgs...)
		case schedulepb.MsgHeartbeatResponse:
			msgs, err := r.handleMessageHeartbeatResponse(msg.From, msg.HeartbeatResponse)
			if err != nil {
				return nil, errors.Trace(err)
			}
			sentMsgs = append(sentMsgs, msgs...)
		default:
			log.Warn("schedulerv3: ignore message",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Stringer("type", msg.MsgType), zap.Any("message", msg))
		}
	}
	return sentMsgs, nil
}

func (r *Manager) handleMessageHeartbeatResponse(
	from model.CaptureID, msg *schedulepb.HeartbeatResponse,
) ([]*schedulepb.Message, error) {
	sentMsgs := make([]*schedulepb.Message, 0)
	for _, status := range msg.Tables {
		table, ok := r.spans.Get(status.Span)
		if !ok {
			log.Info("schedulerv3: ignore table status no table found",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Any("from", from),
				zap.Any("message", status))
			continue
		}
		msgs, err := table.handleTableStatus(from, &status)
		if err != nil {
			return nil, errors.Trace(err)
		}
		if table.hasRemoved() {
			log.Info("schedulerv3: table has removed",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Any("from", from),
				zap.Int64("tableID", status.Span.TableID))
			r.spans.Delete(status.Span)
		}
		sentMsgs = append(sentMsgs, msgs...)
	}
	return sentMsgs, nil
}

func (r *Manager) handleMessageDispatchTableResponse(
	from model.CaptureID, msg *schedulepb.DispatchTableResponse,
) ([]*schedulepb.Message, error) {
	var status *tablepb.TableStatus
	switch resp := msg.Response.(type) {
	case *schedulepb.DispatchTableResponse_AddTable:
		status = resp.AddTable.Status
	case *schedulepb.DispatchTableResponse_RemoveTable:
		status = resp.RemoveTable.Status
	default:
		log.Warn("schedulerv3: ignore unknown dispatch table response",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Any("message", msg))
		return nil, nil
	}

	table, ok := r.spans.Get(status.Span)
	if !ok {
		log.Info("schedulerv3: ignore table status no table found",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Any("message", status))
		return nil, nil
	}
	msgs, err := table.handleTableStatus(from, status)
	if err != nil {
		return nil, errors.Trace(err)
	}
	if table.hasRemoved() {
		log.Info("schedulerv3: table has removed",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Int64("tableID", status.Span.TableID))
		r.spans.Delete(status.Span)
	}
	return msgs, nil
}

// HandleTasks handles schedule tasks.
func (r *Manager) HandleTasks(
	tasks []*ScheduleTask,
) ([]*schedulepb.Message, error) {
	// Check if a running task is finished.
	var toBeDeleted []tablepb.Span
	r.runningTasks.Ascend(func(span tablepb.Span, task *ScheduleTask) bool {
		if table, ok := r.spans.Get(span); ok {
			// If the table is back to Replicating or has been removed,
			// the running task is finished.
			if table.State == ReplicationSetStateReplicating || table.hasRemoved() {
				toBeDeleted = append(toBeDeleted, span)
			}
		} else {
			// No table found, remove the task.
			toBeDeleted = append(toBeDeleted, span)
		}
		return true
	})
	for _, span := range toBeDeleted {
		r.runningTasks.Delete(span)
	}

	sentMsgs := make([]*schedulepb.Message, 0)
	for _, task := range tasks {
		// Burst balance is not affected by maxTaskConcurrency.
		if task.BurstBalance != nil {
			msgs, err := r.handleBurstBalanceTasks(task.BurstBalance)
			if err != nil {
				return nil, errors.Trace(err)
			}
			sentMsgs = append(sentMsgs, msgs...)
			if task.Accept != nil {
				task.Accept()
			}
			continue
		}

		// Check if accepting one more task exceeds maxTaskConcurrency.
		if r.runningTasks.Len() == r.maxTaskConcurrency {
			log.Debug("schedulerv3: too many running task",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID))
			// Do not break, in case there are burst balance tasks
			// in the remaining tasks.
			continue
		}

		var span tablepb.Span
		if task.AddTable != nil {
			span = task.AddTable.Span
		} else if task.RemoveTable != nil {
			span = task.RemoveTable.Span
		} else if task.MoveTable != nil {
			span = task.MoveTable.Span
		}

		// Skip the task if the table is already running a task,
		// or the table has been removed.
		if _, ok := r.runningTasks.Get(span); ok {
			log.Info("schedulerv3: ignore task, already exists",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Any("task", task))
			continue
		}
		if _, ok := r.spans.Get(span); !ok && task.AddTable == nil {
			log.Info("schedulerv3: ignore task, table not found",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Any("task", task))
			continue
		}

		var msgs []*schedulepb.Message
		var err error
		if task.AddTable != nil {
			msgs, err = r.handleAddTableTask(task.AddTable)
		} else if task.RemoveTable != nil {
			msgs, err = r.handleRemoveTableTask(task.RemoveTable)
		} else if task.MoveTable != nil {
			msgs, err = r.handleMoveTableTask(task.MoveTable)
		}
		if err != nil {
			return nil, errors.Trace(err)
		}
		sentMsgs = append(sentMsgs, msgs...)
		r.runningTasks.ReplaceOrInsert(span, task)
		if task.Accept != nil {
			task.Accept()
		}
	}
	return sentMsgs, nil
}

func (r *Manager) handleAddTableTask(
	task *AddTable,
) ([]*schedulepb.Message, error) {
	r.acceptAddTableTask++
	var err error
	table, ok := r.spans.Get(task.Span)
	if !ok {
		table, err = NewReplicationSet(task.Span, task.CheckpointTs, nil, r.changefeedID)
		if err != nil {
			return nil, errors.Trace(err)
		}
		r.spans.ReplaceOrInsert(task.Span, table)
	}
	return table.handleAddTable(task.CaptureID)
}

func (r *Manager) handleRemoveTableTask(
	task *RemoveTable,
) ([]*schedulepb.Message, error) {
	r.acceptRemoveTableTask++
	table, _ := r.spans.Get(task.Span)
	if table.hasRemoved() {
		log.Info("schedulerv3: table has removed",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Int64("tableID", task.Span.TableID))
		r.spans.Delete(task.Span)
		return nil, nil
	}
	return table.handleRemoveTable()
}

func (r *Manager) handleMoveTableTask(
	task *MoveTable,
) ([]*schedulepb.Message, error) {
	r.acceptMoveTableTask++
	table, _ := r.spans.Get(task.Span)
	return table.handleMoveTable(task.DestCapture)
}

func (r *Manager) handleBurstBalanceTasks(
	task *BurstBalance,
) ([]*schedulepb.Message, error) {
	r.acceptBurstBalanceTask++
	perCapture := make(map[model.CaptureID]int)
	for _, task := range task.AddTables {
		perCapture[task.CaptureID]++
	}
	for _, task := range task.RemoveTables {
		perCapture[task.CaptureID]++
	}
	fields := make([]zap.Field, 0)
	for captureID, count := range perCapture {
		fields = append(fields, zap.Int(captureID, count))
	}
	fields = append(fields, zap.Int("addTable", len(task.AddTables)))
	fields = append(fields, zap.Int("removeTable", len(task.RemoveTables)))
	fields = append(fields, zap.Int("moveTable", len(task.MoveTables)))
	fields = append(fields, zap.String("namespace", r.changefeedID.Namespace))
	fields = append(fields, zap.String("changefeed", r.changefeedID.ID))
	log.Info("schedulerv3: handle burst balance task", fields...)

	sentMsgs := make([]*schedulepb.Message, 0, len(task.AddTables))
	for i := range task.AddTables {
		addTable := task.AddTables[i]
		if _, ok := r.runningTasks.Get(addTable.Span); ok {
			// Skip add table if the table is already running a task.
			continue
		}
		msgs, err := r.handleAddTableTask(&addTable)
		if err != nil {
			return nil, errors.Trace(err)
		}
		sentMsgs = append(sentMsgs, msgs...)
		// Just a placeholder.
		r.runningTasks.ReplaceOrInsert(addTable.Span, &ScheduleTask{})
	}
	for i := range task.RemoveTables {
		removeTable := task.RemoveTables[i]
		if _, ok := r.runningTasks.Get(removeTable.Span); ok {
			// Skip remove table if the table is already running a task.
			continue
		}
		msgs, err := r.handleRemoveTableTask(&removeTable)
		if err != nil {
			return nil, errors.Trace(err)
		}
		sentMsgs = append(sentMsgs, msgs...)
		// Just a placeholder.
		r.runningTasks.ReplaceOrInsert(removeTable.Span, &ScheduleTask{})
	}
	for i := range task.MoveTables {
		moveTable := task.MoveTables[i]
		if _, ok := r.runningTasks.Get(moveTable.Span); ok {
			// Skip move table if the table is already running a task.
			continue
		}
		msgs, err := r.handleMoveTableTask(&moveTable)
		if err != nil {
			return nil, errors.Trace(err)
		}
		sentMsgs = append(sentMsgs, msgs...)
		// Just a placeholder.
		r.runningTasks.ReplaceOrInsert(moveTable.Span, &ScheduleTask{})
	}
	return sentMsgs, nil
}

// ReplicationSets returns all tracked replication sets.
// Caller must not modify the returned map.
func (r *Manager) ReplicationSets() *spanz.BtreeMap[*ReplicationSet] {
	return r.spans
}

// RunningTasks returns running tasks.
// Caller must not modify the returned map.
func (r *Manager) RunningTasks() *spanz.BtreeMap[*ScheduleTask] {
	return r.runningTasks
}

// AdvanceCheckpoint tries to advance checkpoint and returns current checkpoint.
func (r *Manager) AdvanceCheckpoint(
	currentTables *TableRanges,
	currentPDTime time.Time,
	barrier *schedulepb.BarrierWithMinTs,
	redoMetaManager redo.MetaManager,
) (watermark schedulepb.Watermark) {
	var redoFlushedResolvedTs model.Ts
	limitBarrierWithRedo := func(watermark *schedulepb.Watermark) {
		flushedMeta := redoMetaManager.GetFlushedMeta()
		redoFlushedResolvedTs = flushedMeta.ResolvedTs
		log.Debug("owner gets flushed redo meta",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Uint64("flushedCheckpointTs", flushedMeta.CheckpointTs),
			zap.Uint64("flushedResolvedTs", flushedMeta.ResolvedTs))
		if flushedMeta.ResolvedTs < watermark.ResolvedTs {
			watermark.ResolvedTs = flushedMeta.ResolvedTs
		}

		if watermark.CheckpointTs > watermark.ResolvedTs {
			watermark.CheckpointTs = watermark.ResolvedTs
		}

		if barrier.GlobalBarrierTs > watermark.ResolvedTs {
			barrier.GlobalBarrierTs = watermark.ResolvedTs
		}
	}
	defer func() {
		if redoFlushedResolvedTs != 0 && barrier.GlobalBarrierTs > redoFlushedResolvedTs {
			log.Panic("barrierTs should never greater than redo flushed",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Uint64("barrierTs", barrier.GlobalBarrierTs),
				zap.Uint64("redoFlushedResolvedTs", redoFlushedResolvedTs))
		}
	}()

	r.slowestPuller = tablepb.Span{}
	r.slowestSink = tablepb.Span{}
	resolvedTsOfSlowestSink := model.Ts(math.MaxUint64)

	watermark = schedulepb.Watermark{
		CheckpointTs:     math.MaxUint64,
		ResolvedTs:       math.MaxUint64,
		LastSyncedTs:     0,
		PullerResolvedTs: math.MaxUint64,
	}

	cannotProceed := false
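	// Descriptive note (added comment): for every current table, walk its spans
	// in order and verify full coverage, i.e. the first span starts at the table
	// start key, the last span ends at the table end key, and adjacent spans
	// leave no hole. While walking, track the minimum sink checkpoint/resolved
	// ts, the maximum lastSyncedTs, and the minimum puller resolved ts, and
	// remember which spans are the slowest sink and puller for metrics.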
	currentTables.Iter(func(tableID model.TableID, tableStart, tableEnd tablepb.Span) bool {
		tableSpanFound, tableHasHole := false, false
		tableSpanStartFound, tableSpanEndFound := false, false
		lastSpan := tablepb.Span{}
		r.spans.AscendRange(tableStart, tableEnd,
			func(span tablepb.Span, table *ReplicationSet) bool {
				if lastSpan.TableID != 0 && !bytes.Equal(lastSpan.EndKey, span.StartKey) {
					log.Warn("schedulerv3: span hole detected, skip advance checkpoint",
						zap.String("namespace", r.changefeedID.Namespace),
						zap.String("changefeed", r.changefeedID.ID),
						zap.String("lastSpan", lastSpan.String()),
						zap.String("span", span.String()))
					tableHasHole = true
					return false
				}
				lastSpan = span
				tableSpanFound = true
				if bytes.Equal(span.StartKey, tableStart.StartKey) {
					tableSpanStartFound = true
				}
				if bytes.Equal(span.EndKey, tableEnd.StartKey) {
					tableSpanEndFound = true
				}

				// Find the minimum checkpoint ts and resolved ts.
				if watermark.CheckpointTs > table.Checkpoint.CheckpointTs ||
					(watermark.CheckpointTs == table.Checkpoint.CheckpointTs &&
						resolvedTsOfSlowestSink > table.Checkpoint.ResolvedTs) {
					watermark.CheckpointTs = table.Checkpoint.CheckpointTs
					r.slowestSink = span
					resolvedTsOfSlowestSink = table.Checkpoint.ResolvedTs
				}
				if watermark.ResolvedTs > table.Checkpoint.ResolvedTs {
					watermark.ResolvedTs = table.Checkpoint.ResolvedTs
				}

				// Find the max lastSyncedTs of all tables.
				if watermark.LastSyncedTs < table.Checkpoint.LastSyncedTs {
					watermark.LastSyncedTs = table.Checkpoint.LastSyncedTs
				}
				// Find the minimum puller resolved ts.
				if pullerCkpt, ok := table.Stats.StageCheckpoints["puller-egress"]; ok {
					if watermark.PullerResolvedTs > pullerCkpt.ResolvedTs {
						watermark.PullerResolvedTs = pullerCkpt.ResolvedTs
						r.slowestPuller = span
					}
				}

				return true
			})
		if !tableSpanFound || !tableSpanStartFound || !tableSpanEndFound || tableHasHole {
			// Cannot advance the checkpoint, since a span is missing.
			now := time.Now()
			if r.lastMissTableID != 0 && r.lastMissTableID == tableID &&
				now.Sub(r.lastLogMissTime) > logMissingTableInterval {
				log.Warn("schedulerv3: cannot advance checkpoint since missing span",
					zap.String("namespace", r.changefeedID.Namespace),
					zap.String("changefeed", r.changefeedID.ID),
					zap.Bool("tableSpanFound", tableSpanFound),
					zap.Bool("tableSpanStartFound", tableSpanStartFound),
					zap.Bool("tableSpanEndFound", tableSpanEndFound),
					zap.Bool("tableHasHole", tableHasHole),
					zap.Int64("tableID", tableID))
				r.lastLogMissTime = now
			}
			r.lastMissTableID = tableID
			cannotProceed = true
			return false
		}
		r.lastMissTableID = 0
		return true
	})
	if cannotProceed {
		if redoMetaManager.Enabled() {
			// If redo is enabled, GlobalBarrierTs should be limited by redo flushed meta.
			watermark.ResolvedTs = barrier.RedoBarrierTs
			watermark.LastSyncedTs = checkpointCannotProceed
			watermark.PullerResolvedTs = checkpointCannotProceed
			limitBarrierWithRedo(&watermark)
		}
		return schedulepb.Watermark{
			CheckpointTs:     checkpointCannotProceed,
			ResolvedTs:       checkpointCannotProceed,
			LastSyncedTs:     checkpointCannotProceed,
			PullerResolvedTs: checkpointCannotProceed,
		}
	}

	// If currentTables is empty, we should advance newResolvedTs to the global
	// barrier ts and advance newCheckpointTs to the min table barrier ts.
	if watermark.ResolvedTs == math.MaxUint64 || watermark.CheckpointTs == math.MaxUint64 {
		if watermark.CheckpointTs != watermark.ResolvedTs || currentTables.Len() != 0 {
			log.Panic("schedulerv3: newCheckpointTs and newResolvedTs should be both maxUint64 "+
				"if currentTables is empty",
				zap.String("namespace", r.changefeedID.Namespace),
				zap.String("changefeed", r.changefeedID.ID),
				zap.Uint64("newCheckpointTs", watermark.CheckpointTs),
				zap.Uint64("newResolvedTs", watermark.ResolvedTs),
				zap.Any("currentTables", currentTables))
		}
		watermark.ResolvedTs = barrier.GlobalBarrierTs
		watermark.CheckpointTs = barrier.MinTableBarrierTs
	}

	if watermark.CheckpointTs > barrier.MinTableBarrierTs {
		watermark.CheckpointTs = barrier.MinTableBarrierTs
		// TODO: add panic after we fix the bug that newCheckpointTs > minTableBarrierTs.
		// log.Panic("schedulerv3: newCheckpointTs should not be larger than minTableBarrierTs",
		// 	zap.Uint64("newCheckpointTs", watermark.CheckpointTs),
		// 	zap.Uint64("newResolvedTs", watermark.ResolvedTs),
		// 	zap.Any("currentTables", currentTables.currentTables),
		// 	zap.Any("barrier", barrier.Barrier),
		// 	zap.Any("minTableBarrierTs", barrier.MinTableBarrierTs))
	}

	// If the changefeed's checkpoint lag is larger than 30s,
	// log the 4 slowest tables every minute, which can
	// help us find the problematic tables.
	checkpointLag := currentPDTime.Sub(oracle.GetTimeFromTS(watermark.CheckpointTs))
	if checkpointLag > logSlowTablesLagThreshold &&
		time.Since(r.lastLogSlowTablesTime) > logSlowTablesInterval {
		r.logSlowTableInfo(currentPDTime)
		r.lastLogSlowTablesTime = time.Now()
	}

	if redoMetaManager.Enabled() {
		if watermark.ResolvedTs > barrier.RedoBarrierTs {
			watermark.ResolvedTs = barrier.RedoBarrierTs
		}
		redoMetaManager.UpdateMeta(watermark.CheckpointTs, watermark.ResolvedTs)
		log.Debug("owner updates redo meta",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Uint64("newCheckpointTs", watermark.CheckpointTs),
			zap.Uint64("newResolvedTs", watermark.ResolvedTs))
		limitBarrierWithRedo(&watermark)
	}

	return watermark
}

func (r *Manager) logSlowTableInfo(currentPDTime time.Time) {
	// Find the slow tables.
	r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool {
		lag := currentPDTime.Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs))
		if lag > logSlowTablesLagThreshold {
			heap.Push(&r.slowTableHeap, table)
			if r.slowTableHeap.Len() > defaultSlowTableHeapSize {
				heap.Pop(&r.slowTableHeap)
			}
		}
		return true
	})

	num := r.slowTableHeap.Len()
	for i := 0; i < num; i++ {
		table := heap.Pop(&r.slowTableHeap).(*ReplicationSet)
		log.Info("schedulerv3: slow table",
			zap.String("namespace", r.changefeedID.Namespace),
			zap.String("changefeed", r.changefeedID.ID),
			zap.Int64("tableID", table.Span.TableID),
			zap.String("tableStatus", table.State.String()),
			zap.Uint64("checkpointTs", table.Checkpoint.CheckpointTs),
			zap.Uint64("resolvedTs", table.Checkpoint.ResolvedTs),
			zap.Duration("checkpointLag", currentPDTime.
				Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs))))
	}
}

// CollectMetrics collects metrics.
func (r *Manager) CollectMetrics() {
	cf := r.changefeedID
	tableGauge.
		WithLabelValues(cf.Namespace, cf.ID).Set(float64(r.spans.Len()))
	if table, ok := r.spans.Get(r.slowestSink); ok {
		slowestTableIDGauge.
			WithLabelValues(cf.Namespace, cf.ID).Set(float64(r.slowestSink.TableID))
		slowestTableStateGauge.
			WithLabelValues(cf.Namespace, cf.ID).Set(float64(table.State))
		phyCkpTs := oracle.ExtractPhysical(table.Checkpoint.CheckpointTs)
		slowestTableCheckpointTsGauge.
			WithLabelValues(cf.Namespace, cf.ID).Set(float64(phyCkpTs))
		phyRTs := oracle.ExtractPhysical(table.Checkpoint.ResolvedTs)
		slowestTableResolvedTsGauge.
			WithLabelValues(cf.Namespace, cf.ID).Set(float64(phyRTs))

		// Slow table latency metrics.
		phyCurrentTs := oracle.ExtractPhysical(table.Stats.CurrentTs)
		for stage, checkpoint := range table.Stats.StageCheckpoints {
			// Checkpoint ts
			phyCkpTs := oracle.ExtractPhysical(checkpoint.CheckpointTs)
			slowestTableStageCheckpointTsGaugeVec.
				WithLabelValues(cf.Namespace, cf.ID, stage).Set(float64(phyCkpTs))
			checkpointLag := float64(phyCurrentTs-phyCkpTs) / 1e3
			slowestTableStageCheckpointTsLagGaugeVec.
				WithLabelValues(cf.Namespace, cf.ID, stage).Set(checkpointLag)
			slowestTableStageCheckpointTsLagHistogramVec.
				WithLabelValues(cf.Namespace, cf.ID, stage).Observe(checkpointLag)
			// Resolved ts
			phyRTs := oracle.ExtractPhysical(checkpoint.ResolvedTs)
			slowestTableStageResolvedTsGaugeVec.
				WithLabelValues(cf.Namespace, cf.ID, stage).Set(float64(phyRTs))
			resolvedTsLag := float64(phyCurrentTs-phyRTs) / 1e3
			slowestTableStageResolvedTsLagGaugeVec.
				WithLabelValues(cf.Namespace, cf.ID, stage).Set(resolvedTsLag)
			slowestTableStageResolvedTsLagHistogramVec.
				WithLabelValues(cf.Namespace, cf.ID, stage).Observe(resolvedTsLag)
		}
		// Barrier ts
		stage := "barrier"
		phyBTs := oracle.ExtractPhysical(table.Stats.BarrierTs)
		slowestTableStageResolvedTsGaugeVec.
			WithLabelValues(cf.Namespace, cf.ID, stage).Set(float64(phyBTs))
		barrierTsLag := float64(phyCurrentTs-phyBTs) / 1e3
		slowestTableStageResolvedTsLagGaugeVec.
			WithLabelValues(cf.Namespace, cf.ID, stage).Set(barrierTsLag)
		slowestTableStageResolvedTsLagHistogramVec.
			WithLabelValues(cf.Namespace, cf.ID, stage).Observe(barrierTsLag)
		// Region count
		slowestTableRegionGaugeVec.
			WithLabelValues(cf.Namespace, cf.ID).Set(float64(table.Stats.RegionCount))
	}
	metricAcceptScheduleTask := acceptScheduleTaskCounter.MustCurryWith(map[string]string{
		"namespace": cf.Namespace, "changefeed": cf.ID,
	})
	metricAcceptScheduleTask.WithLabelValues("addTable").Add(float64(r.acceptAddTableTask))
	r.acceptAddTableTask = 0
	metricAcceptScheduleTask.WithLabelValues("removeTable").Add(float64(r.acceptRemoveTableTask))
	r.acceptRemoveTableTask = 0
	metricAcceptScheduleTask.WithLabelValues("moveTable").Add(float64(r.acceptMoveTableTask))
	r.acceptMoveTableTask = 0
	metricAcceptScheduleTask.WithLabelValues("burstBalance").Add(float64(r.acceptBurstBalanceTask))
	r.acceptBurstBalanceTask = 0
	runningScheduleTaskGauge.
		WithLabelValues(cf.Namespace, cf.ID).Set(float64(r.runningTasks.Len()))
	var stateCounters [6]int
	r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool {
		switch table.State {
		case ReplicationSetStateUnknown:
			stateCounters[ReplicationSetStateUnknown]++
		case ReplicationSetStateAbsent:
			stateCounters[ReplicationSetStateAbsent]++
		case ReplicationSetStatePrepare:
			stateCounters[ReplicationSetStatePrepare]++
		case ReplicationSetStateCommit:
			stateCounters[ReplicationSetStateCommit]++
		case ReplicationSetStateReplicating:
			stateCounters[ReplicationSetStateReplicating]++
		case ReplicationSetStateRemoving:
			stateCounters[ReplicationSetStateRemoving]++
		}
		return true
	})
	for s, counter := range stateCounters {
		tableStateGauge.
			WithLabelValues(cf.Namespace, cf.ID, ReplicationSetState(s).String()).
			Set(float64(counter))
	}

	if table, ok := r.spans.Get(r.slowestPuller); ok {
		if pullerCkpt, ok := table.Stats.StageCheckpoints["puller-egress"]; ok {
			phyCkptTs := oracle.ExtractPhysical(pullerCkpt.ResolvedTs)
			slowestTablePullerResolvedTs.WithLabelValues(cf.Namespace, cf.ID).Set(float64(phyCkptTs))

			phyCurrentTs := oracle.ExtractPhysical(table.Stats.CurrentTs)
			lag := float64(phyCurrentTs-phyCkptTs) / 1e3
			slowestTablePullerResolvedTsLag.WithLabelValues(cf.Namespace, cf.ID).Set(lag)
		}
	}
}

// CleanMetrics cleans metrics.
func (r *Manager) CleanMetrics() {
	cf := r.changefeedID
	tableGauge.DeleteLabelValues(cf.Namespace, cf.ID)
	slowestTableIDGauge.DeleteLabelValues(cf.Namespace, cf.ID)
	slowestTableStateGauge.DeleteLabelValues(cf.Namespace, cf.ID)
	slowestTableCheckpointTsGauge.DeleteLabelValues(cf.Namespace, cf.ID)
	slowestTableResolvedTsGauge.DeleteLabelValues(cf.Namespace, cf.ID)
	runningScheduleTaskGauge.DeleteLabelValues(cf.Namespace, cf.ID)
	metricAcceptScheduleTask := acceptScheduleTaskCounter.MustCurryWith(map[string]string{
		"namespace": cf.Namespace, "changefeed": cf.ID,
	})
	metricAcceptScheduleTask.DeleteLabelValues("addTable")
	metricAcceptScheduleTask.DeleteLabelValues("removeTable")
	metricAcceptScheduleTask.DeleteLabelValues("moveTable")
	metricAcceptScheduleTask.DeleteLabelValues("burstBalance")
	var stateCounters [6]int
	for s := range stateCounters {
		tableStateGauge.
			DeleteLabelValues(cf.Namespace, cf.ID, ReplicationSetState(s).String())
	}
	slowestTableStageCheckpointTsGaugeVec.Reset()
	slowestTableStageResolvedTsGaugeVec.Reset()
	slowestTableStageCheckpointTsLagGaugeVec.Reset()
	slowestTableStageResolvedTsLagGaugeVec.Reset()
	slowestTableStageCheckpointTsLagHistogramVec.Reset()
	slowestTableStageResolvedTsLagHistogramVec.Reset()
	slowestTableRegionGaugeVec.Reset()
}

// SetReplicationSetForTests is only used in tests.
func (r *Manager) SetReplicationSetForTests(rs *ReplicationSet) {
	r.spans.ReplaceOrInsert(rs.Span, rs)
}

// GetReplicationSetForTests is only used in tests.
func (r *Manager) GetReplicationSetForTests() *spanz.BtreeMap[*ReplicationSet] {
	return r.spans
}

// TableRanges wraps current tables and their ranges.
type TableRanges struct {
	currentTables []model.TableID
	cache         struct {
		tableRange map[model.TableID]struct{ start, end tablepb.Span }
		lastGC     time.Time
	}
}

// UpdateTables updates current tables.
func (t *TableRanges) UpdateTables(currentTables []model.TableID) {
	if time.Since(t.cache.lastGC) > 10*time.Minute {
		t.cache.tableRange = make(map[model.TableID]struct {
			start tablepb.Span
			end   tablepb.Span
		})
		t.cache.lastGC = time.Now()
	}
	t.currentTables = currentTables
}

// Len returns the length of current tables.
func (t *TableRanges) Len() int {
	return len(t.currentTables)
}

// Iter iterates over current tables.
func (t *TableRanges) Iter(fn func(tableID model.TableID, tableStart, tableEnd tablepb.Span) bool) {
	for _, tableID := range t.currentTables {
		tableRange, ok := t.cache.tableRange[tableID]
		if !ok {
			tableRange.start, tableRange.end = spanz.TableIDToComparableRange(tableID)
			t.cache.tableRange[tableID] = tableRange
		}
		if !fn(tableID, tableRange.start, tableRange.end) {
			break
		}
	}
}
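
// The sketch below is NOT part of the upstream file. It is a minimal,
// illustrative example of how an owner-side caller might construct a Manager
// and submit a single AddTable task; the changefeed ID, table ID, capture ID,
// and checkpoint ts are hypothetical placeholders, and the span helper is
// assumed to be spanz.TableIDToComparableSpan from pkg/spanz.
func exampleSubmitAddTableTask() {
	// Assumed inputs; a real owner derives these from its own state.
	changefeed := model.ChangeFeedID{Namespace: "default", ID: "example-changefeed"}
	m := NewReplicationManager(10 /* maxTaskConcurrency */, changefeed)

	task := &ScheduleTask{
		AddTable: &AddTable{
			Span:         spanz.TableIDToComparableSpan(100), // hypothetical table ID
			CaptureID:    "capture-1",                        // hypothetical capture
			CheckpointTs: 0,
		},
		// Accept is invoked once the manager accepts the task.
		Accept: func() {
			log.Info("schedulerv3: example task accepted")
		},
	}
	// HandleTasks returns the messages that the transport layer would then
	// send to the target capture (e.g. a dispatch table request).
	msgs, err := m.HandleTasks([]*ScheduleTask{task})
	if err != nil {
		log.Warn("schedulerv3: example task failed", zap.Error(err))
		return
	}
	_ = msgs
}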