github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/member/capture_manager.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package member
    15  
    16  import (
    17  	"github.com/pingcap/log"
    18  	"github.com/pingcap/tiflow/cdc/model"
    19  	"github.com/pingcap/tiflow/cdc/processor/tablepb"
    20  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/replication"
    21  	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
    22  	"github.com/pingcap/tiflow/pkg/config"
    23  	"github.com/pingcap/tiflow/pkg/spanz"
    24  	"go.uber.org/zap"
    25  )
    26  
    27  // CaptureState is the state of a capture.
    28  //
    29  //	    ┌───────────────┐ Heartbeat Resp ┌─────────────┐
    30  //	    │ Uninitialized ├───────────────>│ Initialized │
    31  //	    └──────┬────────┘                └──────┬──────┘
    32  //	           │                                │
    33  //	IsStopping │          ┌──────────┐          │ IsStopping
    34  //	           └────────> │ Stopping │ <────────┘
    35  //	                      └──────────┘
    36  type CaptureState int
    37  
    38  const (
    39  	// CaptureStateUninitialized means the capture status is unknown,
    40  	// no heartbeat response received yet.
    41  	CaptureStateUninitialized CaptureState = 1
    42  	// CaptureStateInitialized means owner has received heartbeat response.
    43  	CaptureStateInitialized CaptureState = 2
    44  	// CaptureStateStopping means the capture is removing, e.g., shutdown.
    45  	CaptureStateStopping CaptureState = 3
    46  )
    47  
    48  var captureStateMap = map[CaptureState]string{
    49  	CaptureStateUninitialized: "CaptureStateUninitialized",
    50  	CaptureStateInitialized:   "CaptureStateInitialized",
    51  	CaptureStateStopping:      "CaptureStateStopping",
    52  }
    53  
    54  func (s CaptureState) String() string {
    55  	return captureStateMap[s]
    56  }
    57  
// CaptureStatus represent capture's status.
type CaptureStatus struct {
	// OwnerRev is the owner revision given to newCaptureStatus.
	OwnerRev schedulepb.OwnerRevision
	// Epoch is the processor epoch recorded from the first heartbeat
	// response. Later responses carrying a different epoch are ignored.
	Epoch schedulepb.ProcessorEpoch
	// State is the current lifecycle state of the capture.
	State CaptureState
	// Tables is the table status list taken from the most recent accepted
	// heartbeat response.
	Tables []tablepb.TableStatus
	// ID is the capture ID.
	ID model.CaptureID
	// Addr is the capture address; it is used in logs and as a metric label.
	Addr string
	// IsOwner is the flag given to newCaptureStatus; the capture manager sets
	// it to true when the capture ID equals the manager's owner ID.
	IsOwner bool
	// changefeedID identifies the changefeed in log fields.
	changefeedID model.ChangeFeedID
}
    69  
    70  func newCaptureStatus(
    71  	rev schedulepb.OwnerRevision, id model.CaptureID, addr string, isOwner bool, changefeedID model.ChangeFeedID,
    72  ) *CaptureStatus {
    73  	return &CaptureStatus{
    74  		OwnerRev:     rev,
    75  		State:        CaptureStateUninitialized,
    76  		ID:           id,
    77  		Addr:         addr,
    78  		IsOwner:      isOwner,
    79  		changefeedID: changefeedID,
    80  	}
    81  }
    82  
    83  func (c *CaptureStatus) handleHeartbeatResponse(
    84  	resp *schedulepb.HeartbeatResponse, epoch schedulepb.ProcessorEpoch,
    85  ) {
    86  	// Check epoch for initialized captures.
    87  	if c.State != CaptureStateUninitialized && c.Epoch.Epoch != epoch.Epoch {
    88  		log.Warn("schedulerv3: ignore heartbeat response",
    89  			zap.String("namespace", c.changefeedID.Namespace),
    90  			zap.String("changefeed", c.changefeedID.ID),
    91  			zap.String("captureAddr", c.Addr),
    92  			zap.String("capture", c.ID),
    93  			zap.String("epoch", c.Epoch.Epoch),
    94  			zap.String("respEpoch", epoch.Epoch),
    95  			zap.Int64("ownerRev", c.OwnerRev.Revision))
    96  		return
    97  	}
    98  
    99  	if c.State == CaptureStateUninitialized {
   100  		c.Epoch = epoch
   101  		c.State = CaptureStateInitialized
   102  		log.Info("schedulerv3: capture initialized",
   103  			zap.String("namespace", c.changefeedID.Namespace),
   104  			zap.String("changefeed", c.changefeedID.ID),
   105  			zap.String("capture", c.ID),
   106  			zap.String("captureAddr", c.Addr))
   107  	}
   108  	if resp.Liveness == model.LivenessCaptureStopping {
   109  		c.State = CaptureStateStopping
   110  		log.Info("schedulerv3: capture stopping",
   111  			zap.String("namespace", c.changefeedID.Namespace),
   112  			zap.String("changefeed", c.changefeedID.ID),
   113  			zap.String("capture", c.ID),
   114  			zap.String("captureAddr", c.Addr))
   115  	}
   116  	c.Tables = resp.Tables
   117  }
   118  
// CaptureChanges wraps changes of captures.
type CaptureChanges struct {
	// Init holds, per capture ID, the tables of every tracked capture at the
	// moment all captures are first found to be initialized.
	Init map[model.CaptureID][]tablepb.TableStatus
	// Removed holds, per capture ID, the tables of captures that were removed
	// after initialization.
	Removed map[model.CaptureID][]tablepb.TableStatus
}
   124  
// CaptureManager manages capture status.
type CaptureManager struct {
	// OwnerRev is the owner revision supplied at construction; it is passed to
	// every CaptureStatus created by this manager.
	OwnerRev schedulepb.OwnerRevision
	// Captures holds the tracked captures keyed by capture ID. Entries are
	// added and removed by HandleAliveCaptureUpdate.
	Captures map[model.CaptureID]*CaptureStatus

	// initialized becomes true the first time HandleAliveCaptureUpdate finds
	// that no tracked capture is still uninitialized (or when set through
	// SetInitializedForTests).
	initialized bool
	// changes accumulates pending capture changes until TakeChanges hands
	// them out and clears the field.
	changes *CaptureChanges

	// A logical clock counter, for heartbeat.
	// It is incremented on every call to Tick.
	tickCounter int
	// heartbeatTick is the heartbeat period in ticks: Tick emits heartbeats
	// when tickCounter is a multiple of it.
	heartbeatTick int
	// collectStatsTick is the period in ticks at which Tick sets
	// pendingCollect.
	collectStatsTick int
	// pendingCollect is copied into the CollectStats field of the next batch
	// of heartbeats and then reset.
	pendingCollect bool

	// changefeedID identifies the changefeed in logs and metric labels.
	changefeedID model.ChangeFeedID
	// ownerID is compared with each new capture's ID to fill
	// CaptureStatus.IsOwner.
	ownerID model.CaptureID
}
   142  
   143  // NewCaptureManager returns a new capture manager.
   144  func NewCaptureManager(
   145  	ownerID model.CaptureID, changefeedID model.ChangeFeedID,
   146  	rev schedulepb.OwnerRevision, cfg *config.SchedulerConfig,
   147  ) *CaptureManager {
   148  	return &CaptureManager{
   149  		OwnerRev:         rev,
   150  		Captures:         make(map[model.CaptureID]*CaptureStatus),
   151  		heartbeatTick:    cfg.HeartbeatTick,
   152  		collectStatsTick: cfg.CollectStatsTick,
   153  
   154  		changefeedID: changefeedID,
   155  		ownerID:      ownerID,
   156  	}
   157  }
   158  
   159  // CheckAllCaptureInitialized check if all capture is initialized.
   160  func (c *CaptureManager) CheckAllCaptureInitialized() bool {
   161  	return c.initialized && c.checkAllCaptureInitialized()
   162  }
   163  
   164  func (c *CaptureManager) checkAllCaptureInitialized() bool {
   165  	for _, captureStatus := range c.Captures {
   166  		// CaptureStateStopping is also considered initialized, because when
   167  		// a capture shutdown, it becomes stopping, we need to move its tables
   168  		// to other captures.
   169  		if captureStatus.State == CaptureStateUninitialized {
   170  			return false
   171  		}
   172  	}
   173  	return len(c.Captures) != 0
   174  }
   175  
   176  // Tick advances the logical clock of capture manager and produce heartbeat when
   177  // necessary.
   178  func (c *CaptureManager) Tick(
   179  	reps *spanz.BtreeMap[*replication.ReplicationSet],
   180  	drainingCapture model.CaptureID,
   181  	barrier *schedulepb.Barrier,
   182  ) []*schedulepb.Message {
   183  	c.tickCounter++
   184  	if c.tickCounter%c.collectStatsTick == 0 {
   185  		c.pendingCollect = true
   186  	}
   187  	if c.tickCounter%c.heartbeatTick != 0 {
   188  		return nil
   189  	}
   190  	tables := make(map[model.CaptureID][]tablepb.Span)
   191  	reps.Ascend(func(span tablepb.Span, rep *replication.ReplicationSet) bool {
   192  		for captureID := range rep.Captures {
   193  			tables[captureID] = append(tables[captureID], span)
   194  		}
   195  		return true
   196  	})
   197  	msgs := make([]*schedulepb.Message, 0, len(c.Captures))
   198  	for to := range c.Captures {
   199  		msgs = append(msgs, &schedulepb.Message{
   200  			To:      to,
   201  			MsgType: schedulepb.MsgHeartbeat,
   202  			Heartbeat: &schedulepb.Heartbeat{
   203  				Spans: tables[to],
   204  				// IsStopping let the receiver capture know that it should be stopping now.
   205  				// At the moment, this is triggered by `DrainCapture` scheduler.
   206  				IsStopping:   drainingCapture == to,
   207  				CollectStats: c.pendingCollect,
   208  				Barrier:      barrier,
   209  			},
   210  		})
   211  	}
   212  	c.pendingCollect = false
   213  	return msgs
   214  }
   215  
   216  // HandleMessage handles messages sent from other captures.
   217  func (c *CaptureManager) HandleMessage(
   218  	msgs []*schedulepb.Message,
   219  ) {
   220  	for _, msg := range msgs {
   221  		if msg.MsgType == schedulepb.MsgHeartbeatResponse {
   222  			captureStatus, ok := c.Captures[msg.From]
   223  			if !ok {
   224  				log.Warn("schedulerv3: heartbeat response from unknown capture",
   225  					zap.String("namespace", c.changefeedID.Namespace),
   226  					zap.String("changefeed", c.changefeedID.ID),
   227  					zap.String("capture", msg.From))
   228  				continue
   229  			}
   230  			captureStatus.handleHeartbeatResponse(
   231  				msg.GetHeartbeatResponse(), msg.Header.ProcessorEpoch)
   232  		}
   233  	}
   234  }
   235  
   236  // HandleAliveCaptureUpdate update captures liveness.
   237  func (c *CaptureManager) HandleAliveCaptureUpdate(
   238  	aliveCaptures map[model.CaptureID]*model.CaptureInfo,
   239  ) []*schedulepb.Message {
   240  	msgs := make([]*schedulepb.Message, 0)
   241  	for id, info := range aliveCaptures {
   242  		if _, ok := c.Captures[id]; !ok {
   243  			// A new capture.
   244  			c.Captures[id] = newCaptureStatus(
   245  				c.OwnerRev, id, info.AdvertiseAddr, c.ownerID == id, c.changefeedID)
   246  			log.Info("schedulerv3: find a new capture",
   247  				zap.String("namespace", c.changefeedID.Namespace),
   248  				zap.String("changefeed", c.changefeedID.ID),
   249  				zap.String("captureAddr", info.AdvertiseAddr),
   250  				zap.String("capture", id))
   251  			msgs = append(msgs, &schedulepb.Message{
   252  				To:        id,
   253  				MsgType:   schedulepb.MsgHeartbeat,
   254  				Heartbeat: &schedulepb.Heartbeat{},
   255  			})
   256  		}
   257  	}
   258  
   259  	// Find removed captures.
   260  	for id, capture := range c.Captures {
   261  		if _, ok := aliveCaptures[id]; !ok {
   262  			log.Info("schedulerv3: removed a capture",
   263  				zap.String("namespace", c.changefeedID.Namespace),
   264  				zap.String("changefeed", c.changefeedID.ID),
   265  				zap.String("captureAddr", capture.Addr),
   266  				zap.String("capture", id))
   267  			delete(c.Captures, id)
   268  
   269  			// Only update changes after initialization.
   270  			if !c.initialized {
   271  				continue
   272  			}
   273  			if c.changes == nil {
   274  				c.changes = &CaptureChanges{}
   275  			}
   276  			if c.changes.Removed == nil {
   277  				c.changes.Removed = make(map[string][]tablepb.TableStatus)
   278  			}
   279  			c.changes.Removed[id] = capture.Tables
   280  
   281  			cf := c.changefeedID
   282  			captureTableGauge.DeleteLabelValues(cf.Namespace, cf.ID, capture.Addr)
   283  		}
   284  	}
   285  
   286  	// Check if this is the first time all captures are initialized.
   287  	if !c.initialized && c.checkAllCaptureInitialized() {
   288  		c.changes = &CaptureChanges{Init: make(map[string][]tablepb.TableStatus)}
   289  		for id, capture := range c.Captures {
   290  			c.changes.Init[id] = capture.Tables
   291  		}
   292  		log.Info("schedulerv3: all capture initialized",
   293  			zap.String("namespace", c.changefeedID.Namespace),
   294  			zap.String("changefeed", c.changefeedID.ID),
   295  			zap.Int("captureCount", len(c.Captures)))
   296  		c.initialized = true
   297  	}
   298  
   299  	return msgs
   300  }
   301  
   302  // TakeChanges takes the changes of captures that it sees so far.
   303  func (c *CaptureManager) TakeChanges() *CaptureChanges {
   304  	// Only return changes when it's initialized.
   305  	if !c.initialized {
   306  		return nil
   307  	}
   308  	changes := c.changes
   309  	c.changes = nil
   310  	return changes
   311  }
   312  
   313  // CollectMetrics collects metrics.
   314  func (c *CaptureManager) CollectMetrics() {
   315  	cf := c.changefeedID
   316  	for _, capture := range c.Captures {
   317  		captureTableGauge.
   318  			WithLabelValues(cf.Namespace, cf.ID, capture.Addr).
   319  			Set(float64(len(capture.Tables)))
   320  	}
   321  }
   322  
   323  // CleanMetrics cleans metrics.
   324  func (c *CaptureManager) CleanMetrics() {
   325  	cf := c.changefeedID
   326  	for _, capture := range c.Captures {
   327  		captureTableGauge.DeleteLabelValues(cf.Namespace, cf.ID, capture.Addr)
   328  	}
   329  }
   330  
// SetInitializedForTests is only used in tests.
// It overwrites the manager's initialized flag with init, bypassing the
// check that all captures have been initialized.
func (c *CaptureManager) SetInitializedForTests(init bool) {
	c.initialized = init
}