github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/coordinator.go

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package v3
    15  
    16  import (
    17  	"context"
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/pingcap/errors"
    22  	"github.com/pingcap/log"
    23  	"github.com/pingcap/tiflow/cdc/model"
    24  	"github.com/pingcap/tiflow/cdc/processor/tablepb"
    25  	"github.com/pingcap/tiflow/cdc/redo"
    26  	"github.com/pingcap/tiflow/cdc/scheduler/internal"
    27  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/compat"
    28  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/keyspan"
    29  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/member"
    30  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/replication"
    31  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/scheduler"
    32  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/transport"
    33  	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
    34  	"github.com/pingcap/tiflow/pkg/config"
    35  	"github.com/pingcap/tiflow/pkg/p2p"
    36  	"github.com/pingcap/tiflow/pkg/pdutil"
    37  	"github.com/pingcap/tiflow/pkg/spanz"
    38  	"github.com/pingcap/tiflow/pkg/upstream"
    39  	"github.com/pingcap/tiflow/pkg/version"
    40  	"go.uber.org/zap"
    41  )
    42  
    43  const (
    44  	// When heavy operations (such as network IO and serialization) take too much time, the program
    45  	// should print a warning log, and if necessary, the timeout should be exposed externally through
    46  	// monitoring.
    47  	tickLogsWarnDuration    = 1 * time.Second
    48  	checkpointCannotProceed = internal.CheckpointCannotProceed
    49  	metricsInterval         = 10 * time.Second
    50  )
    51  
    52  var _ internal.Scheduler = (*coordinator)(nil)
    53  
    54  type coordinator struct {
    55  	// A mutex protecting concurrent access to the coordinator from the
    56  	// internal.Scheduler and internal.InfoProvider APIs.
    57  	mu sync.Mutex
    58  
    59  	version         string
    60  	revision        schedulepb.OwnerRevision
    61  	changefeedEpoch uint64
    62  	captureID       model.CaptureID
    63  	trans           transport.Transport
    64  	replicationM    *replication.Manager
    65  	captureM        *member.CaptureManager
    66  	schedulerM      *scheduler.Manager
    67  	reconciler      *keyspan.Reconciler
    68  	compat          *compat.Compat
    69  	pdClock         pdutil.Clock
    70  	tableRanges     replication.TableRanges
    71  	redoMetaManager redo.MetaManager
    72  
    73  	lastCollectTime time.Time
    74  	changefeedID    model.ChangeFeedID
    75  }
    76  
    77  // NewCoordinator returns a two-phase scheduler.
    78  func NewCoordinator(
    79  	ctx context.Context,
    80  	captureID model.CaptureID,
    81  	changefeedID model.ChangeFeedID,
    82  	messageServer *p2p.MessageServer,
    83  	messageRouter p2p.MessageRouter,
    84  	ownerRevision int64,
    85  	changefeedEpoch uint64,
    86  	up *upstream.Upstream,
    87  	cfg *config.SchedulerConfig,
    88  	redoMetaManager redo.MetaManager,
    89  ) (internal.Scheduler, error) {
    90  	trans, err := transport.NewTransport(
    91  		ctx, changefeedID, transport.SchedulerRole, messageServer, messageRouter)
    92  	if err != nil {
    93  		return nil, errors.Trace(err)
    94  	}
    95  	reconciler, err := keyspan.NewReconciler(changefeedID, up, cfg.ChangefeedSettings)
    96  	if err != nil {
    97  		return nil, errors.Trace(err)
    98  	}
    99  	revision := schedulepb.OwnerRevision{Revision: ownerRevision}
   100  	return &coordinator{
   101  		version:         version.ReleaseSemver(),
   102  		revision:        revision,
   103  		changefeedEpoch: changefeedEpoch,
   104  		captureID:       captureID,
   105  		trans:           trans,
   106  		replicationM: replication.NewReplicationManager(
   107  			cfg.MaxTaskConcurrency, changefeedID),
   108  		captureM:        member.NewCaptureManager(captureID, changefeedID, revision, cfg),
   109  		schedulerM:      scheduler.NewSchedulerManager(changefeedID, cfg),
   110  		reconciler:      reconciler,
   111  		changefeedID:    changefeedID,
   112  		compat:          compat.New(cfg, map[model.CaptureID]*model.CaptureInfo{}),
   113  		pdClock:         up.PDClock,
   114  		redoMetaManager: redoMetaManager,
   115  	}, nil
   116  }
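
        // A minimal construction sketch (illustrative only; ctx, the message server/router,
        // upstream, config and redo manager are assumed to be supplied by the caller,
        // typically the changefeed owner):
        //
        //	sched, err := NewCoordinator(ctx, captureID, changefeedID,
        //		messageServer, messageRouter, ownerRevision, changefeedEpoch,
        //		up, cfg, redoMetaManager)
        //	if err != nil {
        //		return errors.Trace(err)
        //	}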
   117  
   118  // Tick implements the scheduler interface.
   119  func (c *coordinator) Tick(
   120  	ctx context.Context,
   121  	// Latest global checkpoint of the changefeed
   122  	checkpointTs model.Ts,
   123  	// All tables that SHOULD be replicated (or started) at the current checkpoint.
   124  	currentTables []model.TableID,
   125  	// All captures that are alive according to the latest Etcd states.
   126  	aliveCaptures map[model.CaptureID]*model.CaptureInfo,
   127  	barrier *schedulepb.BarrierWithMinTs,
   128  ) (watermark schedulepb.Watermark, err error) {
   129  	startTime := time.Now()
   130  	defer func() {
   131  		costTime := time.Since(startTime)
   132  		if costTime > tickLogsWarnDuration {
   133  			log.Warn("scheduler tick took too long",
   134  				zap.String("namespace", c.changefeedID.Namespace),
   135  				zap.String("changefeed", c.changefeedID.ID),
   136  				zap.Duration("duration", costTime))
   137  		}
   138  	}()
   139  
   140  	c.mu.Lock()
   141  	defer c.mu.Unlock()
   142  
   143  	return c.poll(ctx, checkpointTs, currentTables, aliveCaptures, barrier)
   144  }
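
        // A hedged sketch of how a caller might consume Tick's result (the surrounding
        // owner loop and variable names are assumptions, not part of this file):
        //
        //	watermark, err := sched.Tick(ctx, checkpointTs, currentTables, aliveCaptures, barrier)
        //	if err != nil {
        //		return errors.Trace(err)
        //	}
        //	if watermark.CheckpointTs == internal.CheckpointCannotProceed {
        //		// Not enough information to advance yet; keep the previous checkpoint
        //		// and retry on the next tick.
        //	}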
   145  
   146  // MoveTable implements the scheduler interface.
   147  // FIXME: tableID should be Span.
   148  func (c *coordinator) MoveTable(tableID model.TableID, target model.CaptureID) {
   149  	c.mu.Lock()
   150  	defer c.mu.Unlock()
   151  
   152  	if !c.captureM.CheckAllCaptureInitialized() {
   153  		log.Info("schedulerv3: manual move table task ignored, "+
   154  			"since not all captures initialized",
   155  			zap.String("namespace", c.changefeedID.Namespace),
   156  			zap.String("changefeed", c.changefeedID.ID),
   157  			zap.Int64("tableID", tableID),
   158  			zap.String("targetCapture", target))
   159  		return
   160  	}
   161  
   162  	span := spanz.TableIDToComparableSpan(tableID)
   163  	c.schedulerM.MoveTable(span, target)
   164  }
   165  
   166  // Rebalance implements the scheduler interface.
   167  func (c *coordinator) Rebalance() {
   168  	c.mu.Lock()
   169  	defer c.mu.Unlock()
   170  
   171  	if !c.captureM.CheckAllCaptureInitialized() {
   172  		log.Info("schedulerv3: manual rebalance task ignored, "+
   173  			"since not all captures initialized",
   174  			zap.String("namespace", c.changefeedID.Namespace),
   175  			zap.String("changefeed", c.changefeedID.ID))
   176  		return
   177  	}
   178  
   179  	c.schedulerM.Rebalance()
   180  }
   181  
   182  // DrainCapture implements the scheduler interface.
   183  // It returns the count of tables replicating on the target capture, and an error if the request cannot be processed.
   184  func (c *coordinator) DrainCapture(target model.CaptureID) (int, error) {
   185  	c.mu.Lock()
   186  	defer c.mu.Unlock()
   187  
   188  	if !c.captureM.CheckAllCaptureInitialized() {
   189  		log.Info("schedulerv3: drain capture request ignored, "+
   190  			"since not all captures initialized",
   191  			zap.String("namespace", c.changefeedID.Namespace),
   192  			zap.String("changefeed", c.changefeedID.ID),
   193  			zap.String("target", target))
   194  		// Return count 1 to let the client retry.
   195  		return 1, nil
   196  	}
   197  
   198  	var count int
   199  	c.replicationM.ReplicationSets().Ascend(
   200  		func(_ tablepb.Span, rep *replication.ReplicationSet) bool {
   201  			if rep.Primary == target {
   202  				count++
   203  			}
   204  			return true
   205  		})
   206  
   207  	if count == 0 {
   208  		log.Info("schedulerv3: drain capture request ignored, "+
   209  			"the target capture has no replicating table",
   210  			zap.String("namespace", c.changefeedID.Namespace),
   211  			zap.String("changefeed", c.changefeedID.ID),
   212  			zap.String("target", target))
   213  		return count, nil
   214  	}
   215  
   216  	// When draining a capture, its tables need to be dispatched to captures other than
   217  	// the draining one, so there should be at least two live captures.
   218  	if len(c.captureM.Captures) <= 1 {
   219  		log.Warn("schedulerv3: drain capture request ignored, "+
   220  			"only one capture alive",
   221  			zap.String("namespace", c.changefeedID.Namespace),
   222  			zap.String("changefeed", c.changefeedID.ID),
   223  			zap.String("target", target),
   224  			zap.Int("tableCount", count))
   225  		return count, nil
   226  	}
   227  
   228  	// The drain target is the owner itself. In a rolling upgrade scenario, the owner should be
   229  	// drained last; the caller is expected to guarantee this, since it knows the draining order.
   230  	if target == c.captureID {
   231  		log.Warn("schedulerv3: drain capture request ignored, "+
   232  			"the target is the owner",
   233  			zap.String("namespace", c.changefeedID.Namespace),
   234  			zap.String("changefeed", c.changefeedID.ID),
   235  			zap.String("target", target), zap.Int("tableCount", count))
   236  		return count, nil
   237  	}
   238  
   239  	if !c.schedulerM.DrainCapture(target) {
   240  		log.Info("schedulerv3: drain capture request ignored, "+
   241  			"since a capture is already being drained",
   242  			zap.String("namespace", c.changefeedID.Namespace),
   243  			zap.String("changefeed", c.changefeedID.ID),
   244  			zap.String("target", target),
   245  			zap.Int("tableCount", count))
   246  	}
   247  
   248  	return count, nil
   249  }
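
        // A rough polling sketch for draining a capture (illustrative only; the real caller,
        // e.g. the capture drain API, owns the retry interval and stop conditions):
        //
        //	for {
        //		count, err := sched.DrainCapture(target)
        //		if err != nil {
        //			return errors.Trace(err)
        //		}
        //		if count == 0 {
        //			break // no table is replicating on the target any more
        //		}
        //		time.Sleep(time.Second)
        //	}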
   250  
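        // Close releases the transport and cleans up the metrics owned by the coordinator.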
   251  func (c *coordinator) Close(ctx context.Context) {
   252  	c.mu.Lock()
   253  	defer c.mu.Unlock()
   254  
   255  	_ = c.trans.Close()
   256  	c.captureM.CleanMetrics()
   257  	c.replicationM.CleanMetrics()
   258  	c.schedulerM.CleanMetrics()
   259  
   260  	log.Info("schedulerv3: coordinator closed",
   261  		zap.String("namespace", c.changefeedID.Namespace),
   262  		zap.String("changefeed", c.changefeedID.ID),
   263  		zap.Any("ownerRev", c.captureM.OwnerRev))
   264  }
   265  
   266  // ===========
   267  
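        // poll runs one scheduling round: it receives messages from the transport, updates the
        // capture and replication managers, reconciles table spans and generates schedule tasks,
        // advances the checkpoint, and finally sends the resulting messages back to the captures.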
   268  func (c *coordinator) poll(
   269  	ctx context.Context,
   270  	checkpointTs model.Ts,
   271  	currentTables []model.TableID,
   272  	aliveCaptures map[model.CaptureID]*model.CaptureInfo,
   273  	barrier *schedulepb.BarrierWithMinTs,
   274  ) (watermark schedulepb.Watermark, err error) {
   275  	c.maybeCollectMetrics()
   276  	if c.compat.UpdateCaptureInfo(aliveCaptures) {
   277  		spanReplicationEnabled := c.compat.CheckSpanReplicationEnabled()
   278  		log.Info("schedulerv3: compat update capture info",
   279  			zap.String("namespace", c.changefeedID.Namespace),
   280  			zap.String("changefeed", c.changefeedID.ID),
   281  			zap.Any("captures", aliveCaptures),
   282  			zap.Bool("spanReplicationEnabled", spanReplicationEnabled))
   283  	}
   284  
   285  	recvMsgs, err := c.recvMsgs(ctx)
   286  	if err != nil {
   287  		return schedulepb.Watermark{
   288  			CheckpointTs:     checkpointCannotProceed,
   289  			ResolvedTs:       checkpointCannotProceed,
   290  			LastSyncedTs:     checkpointCannotProceed,
   291  			PullerResolvedTs: checkpointCannotProceed,
   292  		}, errors.Trace(err)
   293  	}
   294  
   295  	var msgBuf []*schedulepb.Message
   296  	c.captureM.HandleMessage(recvMsgs)
   297  
   298  	msgs := c.captureM.HandleAliveCaptureUpdate(aliveCaptures)
   299  	msgBuf = append(msgBuf, msgs...)
   300  
   301  	// Handle received messages to advance the replication sets.
   302  	msgs, err = c.replicationM.HandleMessage(recvMsgs)
   303  	if err != nil {
   304  		return schedulepb.Watermark{
   305  			CheckpointTs:     checkpointCannotProceed,
   306  			ResolvedTs:       checkpointCannotProceed,
   307  			LastSyncedTs:     checkpointCannotProceed,
   308  			PullerResolvedTs: checkpointCannotProceed,
   309  		}, errors.Trace(err)
   310  	}
   311  	msgBuf = append(msgBuf, msgs...)
   312  
   313  	pdTime := time.Now()
   314  	// pdClock is only nil in unit tests.
   315  	if c.pdClock != nil {
   316  		pdTime = c.pdClock.CurrentTime()
   317  	}
   318  
   319  	c.tableRanges.UpdateTables(currentTables)
   320  	if !c.captureM.CheckAllCaptureInitialized() {
   321  		// Skip generating schedule tasks for the replication manager,
   322  		// as not all captures are initialized.
   323  		watermark = c.replicationM.AdvanceCheckpoint(&c.tableRanges, pdTime, barrier, c.redoMetaManager)
   324  		// Tick the capture manager after checkpoint calculation to take into account the
   325  		// resolvedTs in the barrier when redo is enabled.
   326  		msgs = c.captureM.Tick(c.replicationM.ReplicationSets(),
   327  			c.schedulerM.DrainingTarget(), barrier.Barrier)
   328  		msgBuf = append(msgBuf, msgs...)
   329  		return watermark, c.sendMsgs(ctx, msgBuf)
   330  	}
   331  
   332  	// Handle capture membership changes.
   333  	if changes := c.captureM.TakeChanges(); changes != nil {
   334  		msgs, err = c.replicationM.HandleCaptureChanges(
   335  			changes.Init, changes.Removed, checkpointTs)
   336  		if err != nil {
   337  			return schedulepb.Watermark{
   338  				CheckpointTs:     checkpointCannotProceed,
   339  				ResolvedTs:       checkpointCannotProceed,
   340  				LastSyncedTs:     checkpointCannotProceed,
   341  				PullerResolvedTs: checkpointCannotProceed,
   342  			}, errors.Trace(err)
   343  		}
   344  		msgBuf = append(msgBuf, msgs...)
   345  	}
   346  
   347  	// Generate schedule tasks based on the current status.
   348  	replications := c.replicationM.ReplicationSets()
   349  	runningTasks := c.replicationM.RunningTasks()
   350  	currentSpans := c.reconciler.Reconcile(
   351  		ctx, &c.tableRanges, replications, c.captureM.Captures, c.compat)
   352  	allTasks := c.schedulerM.Schedule(
   353  		checkpointTs, currentSpans, c.captureM.Captures, replications, runningTasks)
   354  
   355  	// Handle generated schedule tasks.
   356  	msgs, err = c.replicationM.HandleTasks(allTasks)
   357  	if err != nil {
   358  		return schedulepb.Watermark{
   359  			CheckpointTs:     checkpointCannotProceed,
   360  			ResolvedTs:       checkpointCannotProceed,
   361  			LastSyncedTs:     checkpointCannotProceed,
   362  			PullerResolvedTs: checkpointCannotProceed,
   363  		}, errors.Trace(err)
   364  	}
   365  	msgBuf = append(msgBuf, msgs...)
   366  
   367  	// Checkpoint calculation
   368  	watermark = c.replicationM.AdvanceCheckpoint(&c.tableRanges, pdTime, barrier, c.redoMetaManager)
   369  
   370  	// Tick the capture manager after checkpoint calculation to take into account the
   371  	// resolvedTs in the barrier when redo is enabled.
   372  	msgs = c.captureM.Tick(c.replicationM.ReplicationSets(),
   373  		c.schedulerM.DrainingTarget(), barrier.Barrier)
   374  	msgBuf = append(msgBuf, msgs...)
   375  
   376  	// Send new messages.
   377  	err = c.sendMsgs(ctx, msgBuf)
   378  	if err != nil {
   379  		return schedulepb.Watermark{
   380  			CheckpointTs:     checkpointCannotProceed,
   381  			ResolvedTs:       checkpointCannotProceed,
   382  			LastSyncedTs:     checkpointCannotProceed,
   383  			PullerResolvedTs: checkpointCannotProceed,
   384  		}, errors.Trace(err)
   385  	}
   386  
   387  	return watermark, nil
   388  }
   389  
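        // recvMsgs receives pending messages from the transport and drops those that carry a
        // stale owner revision, are addressed to another capture, or (when the peer supports
        // changefeed epochs) belong to a different changefeed epoch.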
   390  func (c *coordinator) recvMsgs(ctx context.Context) ([]*schedulepb.Message, error) {
   391  	recvMsgs, err := c.trans.Recv(ctx)
   392  	if err != nil {
   393  		return nil, errors.Trace(err)
   394  	}
   395  
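        	// Compact the messages that pass the checks below to the front of the slice in place.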
   396  	n := 0
   397  	for _, msg := range recvMsgs {
   398  		// Filter out stale messages and messages sent to the wrong capture.
   399  		if msg.Header.OwnerRevision != c.revision || msg.To != c.captureID {
   400  			// Owner revision must match and capture ID must match.
   401  			continue
   402  		}
   403  		if c.compat.CheckChangefeedEpochEnabled(msg.From) {
   404  			if msg.Header.ChangefeedEpoch.Epoch != c.changefeedEpoch {
   405  				// Changefeed epoch must match.
   406  				continue
   407  			}
   408  		}
   409  		recvMsgs[n] = msg
   410  		n++
   411  	}
   412  	c.compat.AfterTransportReceive(recvMsgs[:n])
   413  	return recvMsgs[:n], nil
   414  }
   415  
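        // sendMsgs stamps every outgoing message with the coordinator's version, owner revision,
        // changefeed epoch and the destination's processor epoch, then hands the batch to the
        // transport.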
   416  func (c *coordinator) sendMsgs(ctx context.Context, msgs []*schedulepb.Message) error {
   417  	for i := range msgs {
   418  		m := msgs[i]
   419  		// Correctness check.
   420  		if len(m.To) == 0 || m.MsgType == schedulepb.MsgUnknown {
   421  			log.Panic("invalid message: no destination or unknown message type",
   422  				zap.String("namespace", c.changefeedID.Namespace),
   423  				zap.String("changefeed", c.changefeedID.ID),
   424  				zap.Any("message", m))
   425  		}
   426  
   427  		epoch := schedulepb.ProcessorEpoch{}
   428  		if capture := c.captureM.Captures[m.To]; capture != nil {
   429  			epoch = capture.Epoch
   430  		}
   431  		m.Header = &schedulepb.Message_Header{
   432  			Version:        c.version,
   433  			OwnerRevision:  c.revision,
   434  			ProcessorEpoch: epoch,
   435  			ChangefeedEpoch: schedulepb.ChangefeedEpoch{
   436  				Epoch: c.changefeedEpoch,
   437  			},
   438  		}
   439  		m.From = c.captureID
   440  	}
   441  	c.compat.BeforeTransportSend(msgs)
   442  	return c.trans.Send(ctx, msgs)
   443  }
   444  
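        // maybeCollectMetrics collects metrics from the sub-managers at most once per
        // metricsInterval.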
   445  func (c *coordinator) maybeCollectMetrics() {
   446  	now := time.Now()
   447  	if now.Sub(c.lastCollectTime) < metricsInterval {
   448  		return
   449  	}
   450  	c.lastCollectTime = now
   451  
   452  	c.schedulerM.CollectMetrics()
   453  	c.replicationM.CollectMetrics()
   454  	c.captureM.CollectMetrics()
   455  }