github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/replication/replication_manager.go

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package replication
    15  
    16  import (
    17  	"bytes"
    18  	"container/heap"
    19  	"fmt"
    20  	"math"
    21  	"time"
    22  
    23  	"github.com/pingcap/errors"
    24  	"github.com/pingcap/log"
    25  	"github.com/pingcap/tiflow/cdc/model"
    26  	"github.com/pingcap/tiflow/cdc/processor/tablepb"
    27  	"github.com/pingcap/tiflow/cdc/redo"
    28  	"github.com/pingcap/tiflow/cdc/scheduler/internal"
    29  	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
    30  	"github.com/pingcap/tiflow/pkg/spanz"
    31  	"github.com/tikv/client-go/v2/oracle"
    32  	"go.uber.org/zap"
    33  )
    34  
    35  const (
    36  	checkpointCannotProceed = internal.CheckpointCannotProceed
    37  
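        	// defaultSlowTableHeapSize is the maximum number of slow tables logged at a time.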
    38  	defaultSlowTableHeapSize  = 4
    39  	logSlowTablesLagThreshold = 30 * time.Second
    40  	logSlowTablesInterval     = 1 * time.Minute
    41  	logMissingTableInterval   = 30 * time.Second
    42  )
    43  
    44  // Callback is invoked when a schedule task is accepted.
    45  type Callback func()
    46  
    47  // BurstBalance is a schedule task for changefeed setup or an unplanned TiCDC
    48  // node failure. TiCDC needs to balance the interrupted tables as soon as possible.
    49  type BurstBalance struct {
    50  	AddTables    []AddTable
    51  	RemoveTables []RemoveTable
    52  	MoveTables   []MoveTable
    53  }
    54  
    55  func (b BurstBalance) String() string {
    56  	if len(b.AddTables) != 0 {
    57  		return fmt.Sprintf("BurstBalance, add tables: %v", b.AddTables)
    58  	}
    59  	if len(b.RemoveTables) != 0 {
    60  		return fmt.Sprintf("BurstBalance, remove tables: %v", b.RemoveTables)
    61  	}
    62  	if len(b.MoveTables) != 0 {
    63  		return fmt.Sprintf("BurstBalance, move tables: %v", b.MoveTables)
    64  	}
    65  	return "BurstBalance, no tables"
    66  }
    67  
    68  // MoveTable is a schedule task for moving a table.
    69  type MoveTable struct {
    70  	Span        tablepb.Span
    71  	DestCapture model.CaptureID
    72  }
    73  
    74  func (t MoveTable) String() string {
    75  	return fmt.Sprintf("MoveTable, span: %s, dest: %s",
    76  		t.Span.String(), t.DestCapture)
    77  }
    78  
    79  // AddTable is a schedule task for adding a table.
    80  type AddTable struct {
    81  	Span         tablepb.Span
    82  	CaptureID    model.CaptureID
    83  	CheckpointTs model.Ts
    84  }
    85  
    86  func (t AddTable) String() string {
    87  	return fmt.Sprintf("AddTable, span: %s, capture: %s, checkpointTs: %d",
    88  		t.Span.String(), t.CaptureID, t.CheckpointTs)
    89  }
    90  
    91  // RemoveTable is a schedule task for removing a table.
    92  type RemoveTable struct {
    93  	Span      tablepb.Span
    94  	CaptureID model.CaptureID
    95  }
    96  
    97  func (t RemoveTable) String() string {
    98  	return fmt.Sprintf("RemoveTable, span: %s, capture: %s",
    99  		t.Span.String(), t.CaptureID)
   100  }
   101  
   102  // ScheduleTask is a schedule task that wraps add/move/remove table tasks.
   103  type ScheduleTask struct { //nolint:revive
   104  	MoveTable    *MoveTable
   105  	AddTable     *AddTable
   106  	RemoveTable  *RemoveTable
   107  	BurstBalance *BurstBalance
   108  
   109  	Accept Callback
   110  }
   111  
   112  // Name returns the name of a schedule task.
   113  func (s *ScheduleTask) Name() string {
   114  	if s.MoveTable != nil {
   115  		return "moveTable"
   116  	} else if s.AddTable != nil {
   117  		return "addTable"
   118  	} else if s.RemoveTable != nil {
   119  		return "removeTable"
   120  	} else if s.BurstBalance != nil {
   121  		return "burstBalance"
   122  	}
   123  	return "unknown"
   124  }
   125  
   126  func (s *ScheduleTask) String() string {
   127  	if s.MoveTable != nil {
   128  		return s.MoveTable.String()
   129  	}
   130  	if s.AddTable != nil {
   131  		return s.AddTable.String()
   132  	}
   133  	if s.RemoveTable != nil {
   134  		return s.RemoveTable.String()
   135  	}
   136  	if s.BurstBalance != nil {
   137  		return s.BurstBalance.String()
   138  	}
   139  	return ""
   140  }
   141  
   142  // Manager manages replications and running scheduling tasks.
   143  type Manager struct { //nolint:revive
   144  	spans *spanz.BtreeMap[*ReplicationSet]
   145  
   146  	runningTasks       *spanz.BtreeMap[*ScheduleTask]
   147  	maxTaskConcurrency int
   148  
   149  	changefeedID           model.ChangeFeedID
   150  	slowestPuller          tablepb.Span
   151  	slowestSink            tablepb.Span
   152  	acceptAddTableTask     int
   153  	acceptRemoveTableTask  int
   154  	acceptMoveTableTask    int
   155  	acceptBurstBalanceTask int
   156  
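        	// slowTableHeap caches the slowest tables; the timestamps below throttle
        	// the periodic slow-table and missing-table logs.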
   157  	slowTableHeap         SetHeap
   158  	lastLogSlowTablesTime time.Time
   159  	lastMissTableID       tablepb.TableID
   160  	lastLogMissTime       time.Time
   161  }
   162  
   163  // NewReplicationManager returns a new replication manager.
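        //
        // A minimal usage sketch (the real driver lives elsewhere in the scheduler;
        // variable names below are illustrative only):
        //
        //	m := NewReplicationManager(10, changefeedID)
        //	msgs, err := m.HandleCaptureChanges(aliveCaptureTables, nil, checkpointTs)
        //	// ... send msgs, then feed the responses back:
        //	msgs, err = m.HandleMessage(receivedMsgs)
        //	// ... and apply scheduler decisions:
        //	msgs, err = m.HandleTasks(scheduleTasks)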
   164  func NewReplicationManager(
   165  	maxTaskConcurrency int, changefeedID model.ChangeFeedID,
   166  ) *Manager {
   167  	// degreeReadHeavy is a B-tree degree optimized for read-heavy maps with many Ascend calls.
   168  	// There may be a large number of tables.
   169  	const degreeReadHeavy = 256
   170  	return &Manager{
   171  		spans:              spanz.NewBtreeMapWithDegree[*ReplicationSet](degreeReadHeavy),
   172  		runningTasks:       spanz.NewBtreeMap[*ScheduleTask](),
   173  		maxTaskConcurrency: maxTaskConcurrency,
   174  		changefeedID:       changefeedID,
   175  	}
   176  }
   177  
   178  // HandleCaptureChanges handles capture changes.
   179  func (r *Manager) HandleCaptureChanges(
   180  	init map[model.CaptureID][]tablepb.TableStatus,
   181  	removed map[model.CaptureID][]tablepb.TableStatus,
   182  	checkpointTs model.Ts,
   183  ) ([]*schedulepb.Message, error) {
   184  	if init != nil {
   185  		if r.spans.Len() != 0 {
   186  			log.Panic("schedulerv3: init again",
   187  				zap.String("namespace", r.changefeedID.Namespace),
   188  				zap.String("changefeed", r.changefeedID.ID),
   189  				zap.Any("init", init), zap.Any("tablesCount", r.spans.Len()))
   190  		}
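        		// Group the reported table statuses by span, so that each span's
        		// replication set can be rebuilt from the statuses of all captures.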
   191  		spanStatusMap := spanz.NewBtreeMap[map[model.CaptureID]*tablepb.TableStatus]()
   192  		for captureID, spans := range init {
   193  			for i := range spans {
   194  				table := spans[i]
   195  				if _, ok := spanStatusMap.Get(table.Span); !ok {
   196  					spanStatusMap.ReplaceOrInsert(
   197  						table.Span, map[model.CaptureID]*tablepb.TableStatus{})
   198  				}
   199  				spanStatusMap.GetV(table.Span)[captureID] = &table
   200  			}
   201  		}
   202  		var err error
   203  		spanStatusMap.Ascend(func(span tablepb.Span, status map[string]*tablepb.TableStatus) bool {
   204  			table, err1 := NewReplicationSet(span, checkpointTs, status, r.changefeedID)
   205  			if err1 != nil {
   206  				err = errors.Trace(err1)
   207  				return false
   208  			}
   209  			r.spans.ReplaceOrInsert(table.Span, table)
   210  			return true
   211  		})
   212  		if err != nil {
   213  			return nil, errors.Trace(err)
   214  		}
   215  	}
   216  	sentMsgs := make([]*schedulepb.Message, 0)
   217  	if removed != nil {
   218  		var err error
   219  		r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool {
   220  			for captureID := range removed {
   221  				msgs, affected, err1 := table.handleCaptureShutdown(captureID)
   222  				if err1 != nil {
   223  					err = errors.Trace(err1)
   224  					return false
   225  				}
   226  				sentMsgs = append(sentMsgs, msgs...)
   227  				if affected {
   228  					// Cleanup its running task.
   229  					r.runningTasks.Delete(table.Span)
   230  				}
   231  			}
   232  			return true
   233  		})
   234  		if err != nil {
   235  			return nil, errors.Trace(err)
   236  		}
   237  	}
   238  	return sentMsgs, nil
   239  }
   240  
   241  // HandleMessage handles messages sent by other captures.
   242  func (r *Manager) HandleMessage(
   243  	msgs []*schedulepb.Message,
   244  ) ([]*schedulepb.Message, error) {
   245  	sentMsgs := make([]*schedulepb.Message, 0, len(msgs))
   246  	for i := range msgs {
   247  		msg := msgs[i]
   248  		switch msg.MsgType {
   249  		case schedulepb.MsgDispatchTableResponse:
   250  			msgs, err := r.handleMessageDispatchTableResponse(msg.From, msg.DispatchTableResponse)
   251  			if err != nil {
   252  				return nil, errors.Trace(err)
   253  			}
   254  			sentMsgs = append(sentMsgs, msgs...)
   255  		case schedulepb.MsgHeartbeatResponse:
   256  			msgs, err := r.handleMessageHeartbeatResponse(msg.From, msg.HeartbeatResponse)
   257  			if err != nil {
   258  				return nil, errors.Trace(err)
   259  			}
   260  			sentMsgs = append(sentMsgs, msgs...)
   261  		default:
   262  			log.Warn("schedulerv3: ignore message",
   263  				zap.String("namespace", r.changefeedID.Namespace),
   264  				zap.String("changefeed", r.changefeedID.ID),
   265  				zap.Stringer("type", msg.MsgType), zap.Any("message", msg))
   266  		}
   267  	}
   268  	return sentMsgs, nil
   269  }
   270  
   271  func (r *Manager) handleMessageHeartbeatResponse(
   272  	from model.CaptureID, msg *schedulepb.HeartbeatResponse,
   273  ) ([]*schedulepb.Message, error) {
   274  	sentMsgs := make([]*schedulepb.Message, 0)
   275  	for _, status := range msg.Tables {
   276  		table, ok := r.spans.Get(status.Span)
   277  		if !ok {
   278  			log.Info("schedulerv3: ignore table status no table found",
   279  				zap.String("namespace", r.changefeedID.Namespace),
   280  				zap.String("changefeed", r.changefeedID.ID),
   281  				zap.Any("from", from),
   282  				zap.Any("message", status))
   283  			continue
   284  		}
   285  		msgs, err := table.handleTableStatus(from, &status)
   286  		if err != nil {
   287  			return nil, errors.Trace(err)
   288  		}
   289  		if table.hasRemoved() {
   290  			log.Info("schedulerv3: table has removed",
   291  				zap.String("namespace", r.changefeedID.Namespace),
   292  				zap.String("changefeed", r.changefeedID.ID),
   293  				zap.Any("from", from),
   294  				zap.Int64("tableID", status.Span.TableID))
   295  			r.spans.Delete(status.Span)
   296  		}
   297  		sentMsgs = append(sentMsgs, msgs...)
   298  	}
   299  	return sentMsgs, nil
   300  }
   301  
   302  func (r *Manager) handleMessageDispatchTableResponse(
   303  	from model.CaptureID, msg *schedulepb.DispatchTableResponse,
   304  ) ([]*schedulepb.Message, error) {
   305  	var status *tablepb.TableStatus
   306  	switch resp := msg.Response.(type) {
   307  	case *schedulepb.DispatchTableResponse_AddTable:
   308  		status = resp.AddTable.Status
   309  	case *schedulepb.DispatchTableResponse_RemoveTable:
   310  		status = resp.RemoveTable.Status
   311  	default:
   312  		log.Warn("schedulerv3: ignore unknown dispatch table response",
   313  			zap.String("namespace", r.changefeedID.Namespace),
   314  			zap.String("changefeed", r.changefeedID.ID),
   315  			zap.Any("message", msg))
   316  		return nil, nil
   317  	}
   318  
   319  	table, ok := r.spans.Get(status.Span)
   320  	if !ok {
   321  		log.Info("schedulerv3: ignore table status no table found",
   322  			zap.String("namespace", r.changefeedID.Namespace),
   323  			zap.String("changefeed", r.changefeedID.ID),
   324  			zap.Any("message", status))
   325  		return nil, nil
   326  	}
   327  	msgs, err := table.handleTableStatus(from, status)
   328  	if err != nil {
   329  		return nil, errors.Trace(err)
   330  	}
   331  	if table.hasRemoved() {
   332  		log.Info("schedulerv3: table has removed",
   333  			zap.String("namespace", r.changefeedID.Namespace),
   334  			zap.String("changefeed", r.changefeedID.ID),
   335  			zap.Int64("tableID", status.Span.TableID))
   336  		r.spans.Delete(status.Span)
   337  	}
   338  	return msgs, nil
   339  }
   340  
   341  // HandleTasks handles schedule tasks.
   342  func (r *Manager) HandleTasks(
   343  	tasks []*ScheduleTask,
   344  ) ([]*schedulepb.Message, error) {
   345  	// Check if a running task is finished.
   346  	var toBeDeleted []tablepb.Span
   347  	r.runningTasks.Ascend(func(span tablepb.Span, task *ScheduleTask) bool {
   348  		if table, ok := r.spans.Get(span); ok {
   349  			// If table is back to Replicating or Removed,
   350  			// the running task is finished.
   351  			if table.State == ReplicationSetStateReplicating || table.hasRemoved() {
   352  				toBeDeleted = append(toBeDeleted, span)
   353  			}
   354  		} else {
   355  			// No table found, remove the task
   356  			toBeDeleted = append(toBeDeleted, span)
   357  		}
   358  		return true
   359  	})
   360  	for _, span := range toBeDeleted {
   361  		r.runningTasks.Delete(span)
   362  	}
   363  
   364  	sentMsgs := make([]*schedulepb.Message, 0)
   365  	for _, task := range tasks {
   366  		// Burst balance is not affected by maxTaskConcurrency.
   367  		if task.BurstBalance != nil {
   368  			msgs, err := r.handleBurstBalanceTasks(task.BurstBalance)
   369  			if err != nil {
   370  				return nil, errors.Trace(err)
   371  			}
   372  			sentMsgs = append(sentMsgs, msgs...)
   373  			if task.Accept != nil {
   374  				task.Accept()
   375  			}
   376  			continue
   377  		}
   378  
   379  		// Check if accepting one more task exceeds maxTaskConcurrency.
   380  		if r.runningTasks.Len() == r.maxTaskConcurrency {
   381  			log.Debug("schedulerv3: too many running task",
   382  				zap.String("namespace", r.changefeedID.Namespace),
   383  				zap.String("changefeed", r.changefeedID.ID))
   384  			// Do not use break, in case there is a burst balance task
   385  			// among the remaining tasks.
   386  			continue
   387  		}
   388  
   389  		var span tablepb.Span
   390  		if task.AddTable != nil {
   391  			span = task.AddTable.Span
   392  		} else if task.RemoveTable != nil {
   393  			span = task.RemoveTable.Span
   394  		} else if task.MoveTable != nil {
   395  			span = task.MoveTable.Span
   396  		}
   397  
   398  		// Skip the task if the table is already running a task,
   399  		// or the table has been removed.
   400  		if _, ok := r.runningTasks.Get(span); ok {
   401  			log.Info("schedulerv3: ignore task, already exists",
   402  				zap.String("namespace", r.changefeedID.Namespace),
   403  				zap.String("changefeed", r.changefeedID.ID),
   404  				zap.Any("task", task))
   405  			continue
   406  		}
   407  		if _, ok := r.spans.Get(span); !ok && task.AddTable == nil {
   408  			log.Info("schedulerv3: ignore task, table not found",
   409  				zap.String("namespace", r.changefeedID.Namespace),
   410  				zap.String("changefeed", r.changefeedID.ID),
   411  				zap.Any("task", task))
   412  			continue
   413  		}
   414  
   415  		var msgs []*schedulepb.Message
   416  		var err error
   417  		if task.AddTable != nil {
   418  			msgs, err = r.handleAddTableTask(task.AddTable)
   419  		} else if task.RemoveTable != nil {
   420  			msgs, err = r.handleRemoveTableTask(task.RemoveTable)
   421  		} else if task.MoveTable != nil {
   422  			msgs, err = r.handleMoveTableTask(task.MoveTable)
   423  		}
   424  		if err != nil {
   425  			return nil, errors.Trace(err)
   426  		}
   427  		sentMsgs = append(sentMsgs, msgs...)
   428  		r.runningTasks.ReplaceOrInsert(span, task)
   429  		if task.Accept != nil {
   430  			task.Accept()
   431  		}
   432  	}
   433  	return sentMsgs, nil
   434  }
   435  
   436  func (r *Manager) handleAddTableTask(
   437  	task *AddTable,
   438  ) ([]*schedulepb.Message, error) {
   439  	r.acceptAddTableTask++
   440  	var err error
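        	// Create the replication set lazily if the span is not tracked yet,
        	// e.g. when the table is newly added to the changefeed.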
   441  	table, ok := r.spans.Get(task.Span)
   442  	if !ok {
   443  		table, err = NewReplicationSet(task.Span, task.CheckpointTs, nil, r.changefeedID)
   444  		if err != nil {
   445  			return nil, errors.Trace(err)
   446  		}
   447  		r.spans.ReplaceOrInsert(task.Span, table)
   448  	}
   449  	return table.handleAddTable(task.CaptureID)
   450  }
   451  
   452  func (r *Manager) handleRemoveTableTask(
   453  	task *RemoveTable,
   454  ) ([]*schedulepb.Message, error) {
   455  	r.acceptRemoveTableTask++
   456  	table, _ := r.spans.Get(task.Span)
   457  	if table.hasRemoved() {
   458  		log.Info("schedulerv3: table has removed",
   459  			zap.String("namespace", r.changefeedID.Namespace),
   460  			zap.String("changefeed", r.changefeedID.ID),
   461  			zap.Int64("tableID", task.Span.TableID))
   462  		r.spans.Delete(task.Span)
   463  		return nil, nil
   464  	}
   465  	return table.handleRemoveTable()
   466  }
   467  
   468  func (r *Manager) handleMoveTableTask(
   469  	task *MoveTable,
   470  ) ([]*schedulepb.Message, error) {
   471  	r.acceptMoveTableTask++
   472  	table, _ := r.spans.Get(task.Span)
   473  	return table.handleMoveTable(task.DestCapture)
   474  }
   475  
   476  func (r *Manager) handleBurstBalanceTasks(
   477  	task *BurstBalance,
   478  ) ([]*schedulepb.Message, error) {
   479  	r.acceptBurstBalanceTask++
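        	// Count add/remove tasks per capture, only for the log below.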
   480  	perCapture := make(map[model.CaptureID]int)
   481  	for _, task := range task.AddTables {
   482  		perCapture[task.CaptureID]++
   483  	}
   484  	for _, task := range task.RemoveTables {
   485  		perCapture[task.CaptureID]++
   486  	}
   487  	fields := make([]zap.Field, 0)
   488  	for captureID, count := range perCapture {
   489  		fields = append(fields, zap.Int(captureID, count))
   490  	}
   491  	fields = append(fields, zap.Int("addTable", len(task.AddTables)))
   492  	fields = append(fields, zap.Int("removeTable", len(task.RemoveTables)))
   493  	fields = append(fields, zap.Int("moveTable", len(task.MoveTables)))
   494  	fields = append(fields, zap.String("namespace", r.changefeedID.Namespace))
   495  	fields = append(fields, zap.String("changefeed", r.changefeedID.ID))
   496  	log.Info("schedulerv3: handle burst balance task", fields...)
   497  
   498  	sentMsgs := make([]*schedulepb.Message, 0, len(task.AddTables))
   499  	for i := range task.AddTables {
   500  		addTable := task.AddTables[i]
   501  		if _, ok := r.runningTasks.Get(addTable.Span); ok {
   502  			// Skip add table if the table is already running a task.
   503  			continue
   504  		}
   505  		msgs, err := r.handleAddTableTask(&addTable)
   506  		if err != nil {
   507  			return nil, errors.Trace(err)
   508  		}
   509  		sentMsgs = append(sentMsgs, msgs...)
   510  		// Insert an empty task as a placeholder for the span's running task.
   511  		r.runningTasks.ReplaceOrInsert(addTable.Span, &ScheduleTask{})
   512  	}
   513  	for i := range task.RemoveTables {
   514  		removeTable := task.RemoveTables[i]
   515  		if _, ok := r.runningTasks.Get(removeTable.Span); ok {
   516  			// Skip remove table if the table is already running a task.
   517  			continue
   518  		}
   519  		msgs, err := r.handleRemoveTableTask(&removeTable)
   520  		if err != nil {
   521  			return nil, errors.Trace(err)
   522  		}
   523  		sentMsgs = append(sentMsgs, msgs...)
   524  		// Insert an empty task as a placeholder for the span's running task.
   525  		r.runningTasks.ReplaceOrInsert(removeTable.Span, &ScheduleTask{})
   526  	}
   527  	for i := range task.MoveTables {
   528  		moveTable := task.MoveTables[i]
   529  		if _, ok := r.runningTasks.Get(moveTable.Span); ok {
   530  			// Skip move table if the table is already running a task.
   531  			continue
   532  		}
   533  		msgs, err := r.handleMoveTableTask(&moveTable)
   534  		if err != nil {
   535  			return nil, errors.Trace(err)
   536  		}
   537  		sentMsgs = append(sentMsgs, msgs...)
   538  		// Insert an empty task as a placeholder for the span's running task.
   539  		r.runningTasks.ReplaceOrInsert(moveTable.Span, &ScheduleTask{})
   540  	}
   541  	return sentMsgs, nil
   542  }
   543  
   544  // ReplicationSets returns all tracked replication sets.
   545  // Caller must not modify the returned map.
   546  func (r *Manager) ReplicationSets() *spanz.BtreeMap[*ReplicationSet] {
   547  	return r.spans
   548  }
   549  
   550  // RunningTasks returns all running tasks.
   551  // Caller must not modify the returned map.
   552  func (r *Manager) RunningTasks() *spanz.BtreeMap[*ScheduleTask] {
   553  	return r.runningTasks
   554  }
   555  
   556  // AdvanceCheckpoint tries to advance checkpoint and returns current checkpoint.
   557  func (r *Manager) AdvanceCheckpoint(
   558  	currentTables *TableRanges,
   559  	currentPDTime time.Time,
   560  	barrier *schedulepb.BarrierWithMinTs,
   561  	redoMetaManager redo.MetaManager,
   562  ) (watermark schedulepb.Watermark) {
   563  	var redoFlushedResolvedTs model.Ts
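        	// limitBarrierWithRedo caps the resolved ts, the checkpoint ts and the
        	// global barrier ts by the resolved ts that the redo meta manager has
        	// already flushed.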
   564  	limitBarrierWithRedo := func(watermark *schedulepb.Watermark) {
   565  		flushedMeta := redoMetaManager.GetFlushedMeta()
   566  		redoFlushedResolvedTs = flushedMeta.ResolvedTs
   567  		log.Debug("owner gets flushed redo meta",
   568  			zap.String("namespace", r.changefeedID.Namespace),
   569  			zap.String("changefeed", r.changefeedID.ID),
   570  			zap.Uint64("flushedCheckpointTs", flushedMeta.CheckpointTs),
   571  			zap.Uint64("flushedResolvedTs", flushedMeta.ResolvedTs))
   572  		if flushedMeta.ResolvedTs < watermark.ResolvedTs {
   573  			watermark.ResolvedTs = flushedMeta.ResolvedTs
   574  		}
   575  
   576  		if watermark.CheckpointTs > watermark.ResolvedTs {
   577  			watermark.CheckpointTs = watermark.ResolvedTs
   578  		}
   579  
   580  		if barrier.GlobalBarrierTs > watermark.ResolvedTs {
   581  			barrier.GlobalBarrierTs = watermark.ResolvedTs
   582  		}
   583  	}
   584  	defer func() {
   585  		if redoFlushedResolvedTs != 0 && barrier.GlobalBarrierTs > redoFlushedResolvedTs {
   586  			log.Panic("barrierTs should never greater than redo flushed",
   587  				zap.String("namespace", r.changefeedID.Namespace),
   588  				zap.String("changefeed", r.changefeedID.ID),
   589  				zap.Uint64("barrierTs", barrier.GlobalBarrierTs),
   590  				zap.Uint64("redoFlushedResolvedTs", redoFlushedResolvedTs))
   591  		}
   592  	}()
   593  
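        	// Reset the slowest puller/sink trackers; they are re-identified while
        	// scanning all spans below and are exported by CollectMetrics.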
   594  	r.slowestPuller = tablepb.Span{}
   595  	r.slowestSink = tablepb.Span{}
   596  	resolvedTsOfSlowestSink := model.Ts(math.MaxUint64)
   597  
   598  	watermark = schedulepb.Watermark{
   599  		CheckpointTs:     math.MaxUint64,
   600  		ResolvedTs:       math.MaxUint64,
   601  		LastSyncedTs:     0,
   602  		PullerResolvedTs: math.MaxUint64,
   603  	}
   604  
   605  	cannotProceed := false
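        	// Walk every table's spans in order to verify that its key range is fully
        	// covered (start and end found, no holes), while collecting the minimum
        	// checkpoint/resolved/puller-resolved ts and the maximum last-synced ts.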
   606  	currentTables.Iter(func(tableID model.TableID, tableStart, tableEnd tablepb.Span) bool {
   607  		tableSpanFound, tableHasHole := false, false
   608  		tableSpanStartFound, tableSpanEndFound := false, false
   609  		lastSpan := tablepb.Span{}
   610  		r.spans.AscendRange(tableStart, tableEnd,
   611  			func(span tablepb.Span, table *ReplicationSet) bool {
   612  				if lastSpan.TableID != 0 && !bytes.Equal(lastSpan.EndKey, span.StartKey) {
   613  					log.Warn("schedulerv3: span hole detected, skip advance checkpoint",
   614  						zap.String("namespace", r.changefeedID.Namespace),
   615  						zap.String("changefeed", r.changefeedID.ID),
   616  						zap.String("lastSpan", lastSpan.String()),
   617  						zap.String("span", span.String()))
   618  					tableHasHole = true
   619  					return false
   620  				}
   621  				lastSpan = span
   622  				tableSpanFound = true
   623  				if bytes.Equal(span.StartKey, tableStart.StartKey) {
   624  					tableSpanStartFound = true
   625  				}
   626  				if bytes.Equal(span.EndKey, tableEnd.StartKey) {
   627  					tableSpanEndFound = true
   628  				}
   629  
   630  				// Find the minimum checkpoint ts and resolved ts.
   631  				if watermark.CheckpointTs > table.Checkpoint.CheckpointTs ||
   632  					(watermark.CheckpointTs == table.Checkpoint.CheckpointTs &&
   633  						resolvedTsOfSlowestSink > table.Checkpoint.ResolvedTs) {
   634  					watermark.CheckpointTs = table.Checkpoint.CheckpointTs
   635  					r.slowestSink = span
   636  					resolvedTsOfSlowestSink = table.Checkpoint.ResolvedTs
   637  				}
   638  				if watermark.ResolvedTs > table.Checkpoint.ResolvedTs {
   639  					watermark.ResolvedTs = table.Checkpoint.ResolvedTs
   640  				}
   641  
   642  				// Find the max lastSyncedTs of all tables.
   643  				if watermark.LastSyncedTs < table.Checkpoint.LastSyncedTs {
   644  					watermark.LastSyncedTs = table.Checkpoint.LastSyncedTs
   645  				}
   646  				// Find the minimum puller resolved ts.
   647  				if pullerCkpt, ok := table.Stats.StageCheckpoints["puller-egress"]; ok {
   648  					if watermark.PullerResolvedTs > pullerCkpt.ResolvedTs {
   649  						watermark.PullerResolvedTs = pullerCkpt.ResolvedTs
   650  						r.slowestPuller = span
   651  					}
   652  				}
   653  
   654  				return true
   655  			})
   656  		if !tableSpanFound || !tableSpanStartFound || !tableSpanEndFound || tableHasHole {
   657  			// Cannot advance the checkpoint, since a span is missing or there is a hole.
   658  			now := time.Now()
   659  			if r.lastMissTableID != 0 && r.lastMissTableID == tableID &&
   660  				now.Sub(r.lastLogMissTime) > logMissingTableInterval {
   661  				log.Warn("schedulerv3: cannot advance checkpoint since missing span",
   662  					zap.String("namespace", r.changefeedID.Namespace),
   663  					zap.String("changefeed", r.changefeedID.ID),
   664  					zap.Bool("tableSpanFound", tableSpanFound),
   665  					zap.Bool("tableSpanStartFound", tableSpanStartFound),
   666  					zap.Bool("tableSpanEndFound", tableSpanEndFound),
   667  					zap.Bool("tableHasHole", tableHasHole),
   668  					zap.Int64("tableID", tableID))
   669  				r.lastLogMissTime = now
   670  			}
   671  			r.lastMissTableID = tableID
   672  			cannotProceed = true
   673  			return false
   674  		}
   675  		r.lastMissTableID = 0
   676  		return true
   677  	})
   678  	if cannotProceed {
   679  		if redoMetaManager.Enabled() {
   680  			// If redo is enabled, GlobalBarrierTs should be limited by redo flushed meta.
   681  			watermark.ResolvedTs = barrier.RedoBarrierTs
   682  			watermark.LastSyncedTs = checkpointCannotProceed
   683  			watermark.PullerResolvedTs = checkpointCannotProceed
   684  			limitBarrierWithRedo(&watermark)
   685  		}
   686  		return schedulepb.Watermark{
   687  			CheckpointTs:     checkpointCannotProceed,
   688  			ResolvedTs:       checkpointCannotProceed,
   689  			LastSyncedTs:     checkpointCannotProceed,
   690  			PullerResolvedTs: checkpointCannotProceed,
   691  		}
   692  	}
   693  
   694  	// If currentTables is empty, we should advance newResolvedTs to global barrier ts and
   695  	// advance newCheckpointTs to min table barrier ts.
   696  	if watermark.ResolvedTs == math.MaxUint64 || watermark.CheckpointTs == math.MaxUint64 {
   697  		if watermark.CheckpointTs != watermark.ResolvedTs || currentTables.Len() != 0 {
   698  			log.Panic("schedulerv3: newCheckpointTs and newResolvedTs should be both maxUint64 "+
   699  				"if currentTables is empty",
   700  				zap.String("namespace", r.changefeedID.Namespace),
   701  				zap.String("changefeed", r.changefeedID.ID),
   702  				zap.Uint64("newCheckpointTs", watermark.CheckpointTs),
   703  				zap.Uint64("newResolvedTs", watermark.ResolvedTs),
   704  				zap.Any("currentTables", currentTables))
   705  		}
   706  		watermark.ResolvedTs = barrier.GlobalBarrierTs
   707  		watermark.CheckpointTs = barrier.MinTableBarrierTs
   708  	}
   709  
   710  	if watermark.CheckpointTs > barrier.MinTableBarrierTs {
   711  		watermark.CheckpointTs = barrier.MinTableBarrierTs
   712  		// TODO: add panic after we fix the bug that newCheckpointTs > minTableBarrierTs.
   713  		// log.Panic("schedulerv3: newCheckpointTs should not be larger than minTableBarrierTs",
   714  		// 	zap.Uint64("newCheckpointTs", watermark.CheckpointTs),
   715  		// 	zap.Uint64("newResolvedTs", watermark.ResolvedTs),
   716  		// 	zap.Any("currentTables", currentTables.currentTables),
   717  		// 	zap.Any("barrier", barrier.Barrier),
   718  		// 	zap.Any("minTableBarrierTs", barrier.MinTableBarrierTs))
   719  	}
   720  
   721  	// If the changefeed's checkpoint lag is larger than 30s,
   722  	// log the 4 slowest tables every minute, which can
   723  	// help us find the problematic tables.
   724  	checkpointLag := currentPDTime.Sub(oracle.GetTimeFromTS(watermark.CheckpointTs))
   725  	if checkpointLag > logSlowTablesLagThreshold &&
   726  		time.Since(r.lastLogSlowTablesTime) > logSlowTablesInterval {
   727  		r.logSlowTableInfo(currentPDTime)
   728  		r.lastLogSlowTablesTime = time.Now()
   729  	}
   730  
   731  	if redoMetaManager.Enabled() {
   732  		if watermark.ResolvedTs > barrier.RedoBarrierTs {
   733  			watermark.ResolvedTs = barrier.RedoBarrierTs
   734  		}
   735  		redoMetaManager.UpdateMeta(watermark.CheckpointTs, watermark.ResolvedTs)
   736  		log.Debug("owner updates redo meta",
   737  			zap.String("namespace", r.changefeedID.Namespace),
   738  			zap.String("changefeed", r.changefeedID.ID),
   739  			zap.Uint64("newCheckpointTs", watermark.CheckpointTs),
   740  			zap.Uint64("newResolvedTs", watermark.ResolvedTs))
   741  		limitBarrierWithRedo(&watermark)
   742  	}
   743  
   744  	return watermark
   745  }
   746  
   747  func (r *Manager) logSlowTableInfo(currentPDTime time.Time) {
   748  	// find the slow tables
   749  	r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool {
   750  		lag := currentPDTime.Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs))
   751  		if lag > logSlowTablesLagThreshold {
   752  			heap.Push(&r.slowTableHeap, table)
   753  			if r.slowTableHeap.Len() > defaultSlowTableHeapSize {
   754  				heap.Pop(&r.slowTableHeap)
   755  			}
   756  		}
   757  		return true
   758  	})
   759  
   760  	num := r.slowTableHeap.Len()
   761  	for i := 0; i < num; i++ {
   762  		table := heap.Pop(&r.slowTableHeap).(*ReplicationSet)
   763  		log.Info("schedulerv3: slow table",
   764  			zap.String("namespace", r.changefeedID.Namespace),
   765  			zap.String("changefeed", r.changefeedID.ID),
   766  			zap.Int64("tableID", table.Span.TableID),
   767  			zap.String("tableStatus", table.State.String()),
   768  			zap.Uint64("checkpointTs", table.Checkpoint.CheckpointTs),
   769  			zap.Uint64("resolvedTs", table.Checkpoint.ResolvedTs),
   770  			zap.Duration("checkpointLag", currentPDTime.
   771  				Sub(oracle.GetTimeFromTS(table.Checkpoint.CheckpointTs))))
   772  	}
   773  }
   774  
   775  // CollectMetrics collects metrics.
   776  func (r *Manager) CollectMetrics() {
   777  	cf := r.changefeedID
   778  	tableGauge.
   779  		WithLabelValues(cf.Namespace, cf.ID).Set(float64(r.spans.Len()))
   780  	if table, ok := r.spans.Get(r.slowestSink); ok {
   781  		slowestTableIDGauge.
   782  			WithLabelValues(cf.Namespace, cf.ID).Set(float64(r.slowestSink.TableID))
   783  		slowestTableStateGauge.
   784  			WithLabelValues(cf.Namespace, cf.ID).Set(float64(table.State))
   785  		phyCkpTs := oracle.ExtractPhysical(table.Checkpoint.CheckpointTs)
   786  		slowestTableCheckpointTsGauge.
   787  			WithLabelValues(cf.Namespace, cf.ID).Set(float64(phyCkpTs))
   788  		phyRTs := oracle.ExtractPhysical(table.Checkpoint.ResolvedTs)
   789  		slowestTableResolvedTsGauge.
   790  			WithLabelValues(cf.Namespace, cf.ID).Set(float64(phyRTs))
   791  
   792  		// Slow table latency metrics.
   793  		phyCurrentTs := oracle.ExtractPhysical(table.Stats.CurrentTs)
   794  		for stage, checkpoint := range table.Stats.StageCheckpoints {
   795  			// Checkpoint ts
   796  			phyCkpTs := oracle.ExtractPhysical(checkpoint.CheckpointTs)
   797  			slowestTableStageCheckpointTsGaugeVec.
   798  				WithLabelValues(cf.Namespace, cf.ID, stage).Set(float64(phyCkpTs))
   799  			checkpointLag := float64(phyCurrentTs-phyCkpTs) / 1e3
   800  			slowestTableStageCheckpointTsLagGaugeVec.
   801  				WithLabelValues(cf.Namespace, cf.ID, stage).Set(checkpointLag)
   802  			slowestTableStageCheckpointTsLagHistogramVec.
   803  				WithLabelValues(cf.Namespace, cf.ID, stage).Observe(checkpointLag)
   804  			// Resolved ts
   805  			phyRTs := oracle.ExtractPhysical(checkpoint.ResolvedTs)
   806  			slowestTableStageResolvedTsGaugeVec.
   807  				WithLabelValues(cf.Namespace, cf.ID, stage).Set(float64(phyRTs))
   808  			resolvedTsLag := float64(phyCurrentTs-phyRTs) / 1e3
   809  			slowestTableStageResolvedTsLagGaugeVec.
   810  				WithLabelValues(cf.Namespace, cf.ID, stage).Set(resolvedTsLag)
   811  			slowestTableStageResolvedTsLagHistogramVec.
   812  				WithLabelValues(cf.Namespace, cf.ID, stage).Observe(resolvedTsLag)
   813  		}
   814  		// Barrier ts
   815  		stage := "barrier"
   816  		phyBTs := oracle.ExtractPhysical(table.Stats.BarrierTs)
   817  		slowestTableStageResolvedTsGaugeVec.
   818  			WithLabelValues(cf.Namespace, cf.ID, stage).Set(float64(phyBTs))
   819  		barrierTsLag := float64(phyCurrentTs-phyBTs) / 1e3
   820  		slowestTableStageResolvedTsLagGaugeVec.
   821  			WithLabelValues(cf.Namespace, cf.ID, stage).Set(barrierTsLag)
   822  		slowestTableStageResolvedTsLagHistogramVec.
   823  			WithLabelValues(cf.Namespace, cf.ID, stage).Observe(barrierTsLag)
   824  		// Region count
   825  		slowestTableRegionGaugeVec.
   826  			WithLabelValues(cf.Namespace, cf.ID).Set(float64(table.Stats.RegionCount))
   827  	}
   828  	metricAcceptScheduleTask := acceptScheduleTaskCounter.MustCurryWith(map[string]string{
   829  		"namespace": cf.Namespace, "changefeed": cf.ID,
   830  	})
   831  	metricAcceptScheduleTask.WithLabelValues("addTable").Add(float64(r.acceptAddTableTask))
   832  	r.acceptAddTableTask = 0
   833  	metricAcceptScheduleTask.WithLabelValues("removeTable").Add(float64(r.acceptRemoveTableTask))
   834  	r.acceptRemoveTableTask = 0
   835  	metricAcceptScheduleTask.WithLabelValues("moveTable").Add(float64(r.acceptMoveTableTask))
   836  	r.acceptMoveTableTask = 0
   837  	metricAcceptScheduleTask.WithLabelValues("burstBalance").Add(float64(r.acceptBurstBalanceTask))
   838  	r.acceptBurstBalanceTask = 0
   839  	runningScheduleTaskGauge.
   840  		WithLabelValues(cf.Namespace, cf.ID).Set(float64(r.runningTasks.Len()))
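        	// Count replication sets per state; the array is indexed by ReplicationSetState values.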
   841  	var stateCounters [6]int
   842  	r.spans.Ascend(func(span tablepb.Span, table *ReplicationSet) bool {
   843  		switch table.State {
   844  		case ReplicationSetStateUnknown:
   845  			stateCounters[ReplicationSetStateUnknown]++
   846  		case ReplicationSetStateAbsent:
   847  			stateCounters[ReplicationSetStateAbsent]++
   848  		case ReplicationSetStatePrepare:
   849  			stateCounters[ReplicationSetStatePrepare]++
   850  		case ReplicationSetStateCommit:
   851  			stateCounters[ReplicationSetStateCommit]++
   852  		case ReplicationSetStateReplicating:
   853  			stateCounters[ReplicationSetStateReplicating]++
   854  		case ReplicationSetStateRemoving:
   855  			stateCounters[ReplicationSetStateRemoving]++
   856  		}
   857  		return true
   858  	})
   859  	for s, counter := range stateCounters {
   860  		tableStateGauge.
   861  			WithLabelValues(cf.Namespace, cf.ID, ReplicationSetState(s).String()).
   862  			Set(float64(counter))
   863  	}
   864  
   865  	if table, ok := r.spans.Get(r.slowestPuller); ok {
   866  		if pullerCkpt, ok := table.Stats.StageCheckpoints["puller-egress"]; ok {
   867  			phyCkptTs := oracle.ExtractPhysical(pullerCkpt.ResolvedTs)
   868  			slowestTablePullerResolvedTs.WithLabelValues(cf.Namespace, cf.ID).Set(float64(phyCkptTs))
   869  
   870  			phyCurrentTs := oracle.ExtractPhysical(table.Stats.CurrentTs)
   871  			lag := float64(phyCurrentTs-phyCkptTs) / 1e3
   872  			slowestTablePullerResolvedTsLag.WithLabelValues(cf.Namespace, cf.ID).Set(lag)
   873  		}
   874  	}
   875  }
   876  
   877  // CleanMetrics cleans metrics.
   878  func (r *Manager) CleanMetrics() {
   879  	cf := r.changefeedID
   880  	tableGauge.DeleteLabelValues(cf.Namespace, cf.ID)
   881  	slowestTableIDGauge.DeleteLabelValues(cf.Namespace, cf.ID)
   882  	slowestTableStateGauge.DeleteLabelValues(cf.Namespace, cf.ID)
   883  	slowestTableCheckpointTsGauge.DeleteLabelValues(cf.Namespace, cf.ID)
   884  	slowestTableResolvedTsGauge.DeleteLabelValues(cf.Namespace, cf.ID)
   885  	runningScheduleTaskGauge.DeleteLabelValues(cf.Namespace, cf.ID)
   886  	metricAcceptScheduleTask := acceptScheduleTaskCounter.MustCurryWith(map[string]string{
   887  		"namespace": cf.Namespace, "changefeed": cf.ID,
   888  	})
   889  	metricAcceptScheduleTask.DeleteLabelValues("addTable")
   890  	metricAcceptScheduleTask.DeleteLabelValues("removeTable")
   891  	metricAcceptScheduleTask.DeleteLabelValues("moveTable")
   892  	metricAcceptScheduleTask.DeleteLabelValues("burstBalance")
   893  	var stateCounters [6]int
   894  	for s := range stateCounters {
   895  		tableStateGauge.
   896  			DeleteLabelValues(cf.Namespace, cf.ID, ReplicationSetState(s).String())
   897  	}
   898  	slowestTableStageCheckpointTsGaugeVec.Reset()
   899  	slowestTableStageResolvedTsGaugeVec.Reset()
   900  	slowestTableStageCheckpointTsLagGaugeVec.Reset()
   901  	slowestTableStageResolvedTsLagGaugeVec.Reset()
   902  	slowestTableStageCheckpointTsLagHistogramVec.Reset()
   903  	slowestTableStageResolvedTsLagHistogramVec.Reset()
   904  	slowestTableRegionGaugeVec.Reset()
   905  }
   906  
   907  // SetReplicationSetForTests is only used in tests.
   908  func (r *Manager) SetReplicationSetForTests(rs *ReplicationSet) {
   909  	r.spans.ReplaceOrInsert(rs.Span, rs)
   910  }
   911  
   912  // GetReplicationSetForTests is only used in tests.
   913  func (r *Manager) GetReplicationSetForTests() *spanz.BtreeMap[*ReplicationSet] {
   914  	return r.spans
   915  }
   916  
   917  // TableRanges wraps current tables and their ranges.
   918  type TableRanges struct {
   919  	currentTables []model.TableID
   920  	cache         struct {
   921  		tableRange map[model.TableID]struct{ start, end tablepb.Span }
   922  		lastGC     time.Time
   923  	}
   924  }
   925  
   926  // UpdateTables updates the current tables.
   927  func (t *TableRanges) UpdateTables(currentTables []model.TableID) {
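        	// Clear the cached table ranges every 10 minutes so that entries of
        	// dropped tables do not accumulate; they are re-filled lazily in Iter.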
   928  	if time.Since(t.cache.lastGC) > 10*time.Minute {
   929  		t.cache.tableRange = make(map[model.TableID]struct {
   930  			start tablepb.Span
   931  			end   tablepb.Span
   932  		})
   933  		t.cache.lastGC = time.Now()
   934  	}
   935  	t.currentTables = currentTables
   936  }
   937  
   938  // Len returns the length of current tables.
   939  func (t *TableRanges) Len() int {
   940  	return len(t.currentTables)
   941  }
   942  
   943  // Iter iterates over the current tables.
   944  func (t *TableRanges) Iter(fn func(tableID model.TableID, tableStart, tableEnd tablepb.Span) bool) {
   945  	for _, tableID := range t.currentTables {
   946  		tableRange, ok := t.cache.tableRange[tableID]
   947  		if !ok {
   948  			tableRange.start, tableRange.end = spanz.TableIDToComparableRange(tableID)
   949  			t.cache.tableRange[tableID] = tableRange
   950  		}
   951  		if !fn(tableID, tableRange.start, tableRange.end) {
   952  			break
   953  		}
   954  	}
   955  }