github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/member/capture_manager.go (about) 1 // Copyright 2022 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package member 15 16 import ( 17 "github.com/pingcap/log" 18 "github.com/pingcap/tiflow/cdc/model" 19 "github.com/pingcap/tiflow/cdc/processor/tablepb" 20 "github.com/pingcap/tiflow/cdc/scheduler/internal/v3/replication" 21 "github.com/pingcap/tiflow/cdc/scheduler/schedulepb" 22 "github.com/pingcap/tiflow/pkg/config" 23 "github.com/pingcap/tiflow/pkg/spanz" 24 "go.uber.org/zap" 25 ) 26 27 // CaptureState is the state of a capture. 28 // 29 // ┌───────────────┐ Heartbeat Resp ┌─────────────┐ 30 // │ Uninitialized ├───────────────>│ Initialized │ 31 // └──────┬────────┘ └──────┬──────┘ 32 // │ │ 33 // IsStopping │ ┌──────────┐ │ IsStopping 34 // └────────> │ Stopping │ <────────┘ 35 // └──────────┘ 36 type CaptureState int 37 38 const ( 39 // CaptureStateUninitialized means the capture status is unknown, 40 // no heartbeat response received yet. 41 CaptureStateUninitialized CaptureState = 1 42 // CaptureStateInitialized means owner has received heartbeat response. 43 CaptureStateInitialized CaptureState = 2 44 // CaptureStateStopping means the capture is removing, e.g., shutdown. 45 CaptureStateStopping CaptureState = 3 46 ) 47 48 var captureStateMap = map[CaptureState]string{ 49 CaptureStateUninitialized: "CaptureStateUninitialized", 50 CaptureStateInitialized: "CaptureStateInitialized", 51 CaptureStateStopping: "CaptureStateStopping", 52 } 53 54 func (s CaptureState) String() string { 55 return captureStateMap[s] 56 } 57 58 // CaptureStatus represent capture's status. 59 type CaptureStatus struct { 60 OwnerRev schedulepb.OwnerRevision 61 Epoch schedulepb.ProcessorEpoch 62 State CaptureState 63 Tables []tablepb.TableStatus 64 ID model.CaptureID 65 Addr string 66 IsOwner bool 67 changefeedID model.ChangeFeedID 68 } 69 70 func newCaptureStatus( 71 rev schedulepb.OwnerRevision, id model.CaptureID, addr string, isOwner bool, changefeedID model.ChangeFeedID, 72 ) *CaptureStatus { 73 return &CaptureStatus{ 74 OwnerRev: rev, 75 State: CaptureStateUninitialized, 76 ID: id, 77 Addr: addr, 78 IsOwner: isOwner, 79 changefeedID: changefeedID, 80 } 81 } 82 83 func (c *CaptureStatus) handleHeartbeatResponse( 84 resp *schedulepb.HeartbeatResponse, epoch schedulepb.ProcessorEpoch, 85 ) { 86 // Check epoch for initialized captures. 87 if c.State != CaptureStateUninitialized && c.Epoch.Epoch != epoch.Epoch { 88 log.Warn("schedulerv3: ignore heartbeat response", 89 zap.String("namespace", c.changefeedID.Namespace), 90 zap.String("changefeed", c.changefeedID.ID), 91 zap.String("captureAddr", c.Addr), 92 zap.String("capture", c.ID), 93 zap.String("epoch", c.Epoch.Epoch), 94 zap.String("respEpoch", epoch.Epoch), 95 zap.Int64("ownerRev", c.OwnerRev.Revision)) 96 return 97 } 98 99 if c.State == CaptureStateUninitialized { 100 c.Epoch = epoch 101 c.State = CaptureStateInitialized 102 log.Info("schedulerv3: capture initialized", 103 zap.String("namespace", c.changefeedID.Namespace), 104 zap.String("changefeed", c.changefeedID.ID), 105 zap.String("capture", c.ID), 106 zap.String("captureAddr", c.Addr)) 107 } 108 if resp.Liveness == model.LivenessCaptureStopping { 109 c.State = CaptureStateStopping 110 log.Info("schedulerv3: capture stopping", 111 zap.String("namespace", c.changefeedID.Namespace), 112 zap.String("changefeed", c.changefeedID.ID), 113 zap.String("capture", c.ID), 114 zap.String("captureAddr", c.Addr)) 115 } 116 c.Tables = resp.Tables 117 } 118 119 // CaptureChanges wraps changes of captures. 120 type CaptureChanges struct { 121 Init map[model.CaptureID][]tablepb.TableStatus 122 Removed map[model.CaptureID][]tablepb.TableStatus 123 } 124 125 // CaptureManager manages capture status. 126 type CaptureManager struct { 127 OwnerRev schedulepb.OwnerRevision 128 Captures map[model.CaptureID]*CaptureStatus 129 130 initialized bool 131 changes *CaptureChanges 132 133 // A logical clock counter, for heartbeat. 134 tickCounter int 135 heartbeatTick int 136 collectStatsTick int 137 pendingCollect bool 138 139 changefeedID model.ChangeFeedID 140 ownerID model.CaptureID 141 } 142 143 // NewCaptureManager returns a new capture manager. 144 func NewCaptureManager( 145 ownerID model.CaptureID, changefeedID model.ChangeFeedID, 146 rev schedulepb.OwnerRevision, cfg *config.SchedulerConfig, 147 ) *CaptureManager { 148 return &CaptureManager{ 149 OwnerRev: rev, 150 Captures: make(map[model.CaptureID]*CaptureStatus), 151 heartbeatTick: cfg.HeartbeatTick, 152 collectStatsTick: cfg.CollectStatsTick, 153 154 changefeedID: changefeedID, 155 ownerID: ownerID, 156 } 157 } 158 159 // CheckAllCaptureInitialized check if all capture is initialized. 160 func (c *CaptureManager) CheckAllCaptureInitialized() bool { 161 return c.initialized && c.checkAllCaptureInitialized() 162 } 163 164 func (c *CaptureManager) checkAllCaptureInitialized() bool { 165 for _, captureStatus := range c.Captures { 166 // CaptureStateStopping is also considered initialized, because when 167 // a capture shutdown, it becomes stopping, we need to move its tables 168 // to other captures. 169 if captureStatus.State == CaptureStateUninitialized { 170 return false 171 } 172 } 173 return len(c.Captures) != 0 174 } 175 176 // Tick advances the logical clock of capture manager and produce heartbeat when 177 // necessary. 178 func (c *CaptureManager) Tick( 179 reps *spanz.BtreeMap[*replication.ReplicationSet], 180 drainingCapture model.CaptureID, 181 barrier *schedulepb.Barrier, 182 ) []*schedulepb.Message { 183 c.tickCounter++ 184 if c.tickCounter%c.collectStatsTick == 0 { 185 c.pendingCollect = true 186 } 187 if c.tickCounter%c.heartbeatTick != 0 { 188 return nil 189 } 190 tables := make(map[model.CaptureID][]tablepb.Span) 191 reps.Ascend(func(span tablepb.Span, rep *replication.ReplicationSet) bool { 192 for captureID := range rep.Captures { 193 tables[captureID] = append(tables[captureID], span) 194 } 195 return true 196 }) 197 msgs := make([]*schedulepb.Message, 0, len(c.Captures)) 198 for to := range c.Captures { 199 msgs = append(msgs, &schedulepb.Message{ 200 To: to, 201 MsgType: schedulepb.MsgHeartbeat, 202 Heartbeat: &schedulepb.Heartbeat{ 203 Spans: tables[to], 204 // IsStopping let the receiver capture know that it should be stopping now. 205 // At the moment, this is triggered by `DrainCapture` scheduler. 206 IsStopping: drainingCapture == to, 207 CollectStats: c.pendingCollect, 208 Barrier: barrier, 209 }, 210 }) 211 } 212 c.pendingCollect = false 213 return msgs 214 } 215 216 // HandleMessage handles messages sent from other captures. 217 func (c *CaptureManager) HandleMessage( 218 msgs []*schedulepb.Message, 219 ) { 220 for _, msg := range msgs { 221 if msg.MsgType == schedulepb.MsgHeartbeatResponse { 222 captureStatus, ok := c.Captures[msg.From] 223 if !ok { 224 log.Warn("schedulerv3: heartbeat response from unknown capture", 225 zap.String("namespace", c.changefeedID.Namespace), 226 zap.String("changefeed", c.changefeedID.ID), 227 zap.String("capture", msg.From)) 228 continue 229 } 230 captureStatus.handleHeartbeatResponse( 231 msg.GetHeartbeatResponse(), msg.Header.ProcessorEpoch) 232 } 233 } 234 } 235 236 // HandleAliveCaptureUpdate update captures liveness. 237 func (c *CaptureManager) HandleAliveCaptureUpdate( 238 aliveCaptures map[model.CaptureID]*model.CaptureInfo, 239 ) []*schedulepb.Message { 240 msgs := make([]*schedulepb.Message, 0) 241 for id, info := range aliveCaptures { 242 if _, ok := c.Captures[id]; !ok { 243 // A new capture. 244 c.Captures[id] = newCaptureStatus( 245 c.OwnerRev, id, info.AdvertiseAddr, c.ownerID == id, c.changefeedID) 246 log.Info("schedulerv3: find a new capture", 247 zap.String("namespace", c.changefeedID.Namespace), 248 zap.String("changefeed", c.changefeedID.ID), 249 zap.String("captureAddr", info.AdvertiseAddr), 250 zap.String("capture", id)) 251 msgs = append(msgs, &schedulepb.Message{ 252 To: id, 253 MsgType: schedulepb.MsgHeartbeat, 254 Heartbeat: &schedulepb.Heartbeat{}, 255 }) 256 } 257 } 258 259 // Find removed captures. 260 for id, capture := range c.Captures { 261 if _, ok := aliveCaptures[id]; !ok { 262 log.Info("schedulerv3: removed a capture", 263 zap.String("namespace", c.changefeedID.Namespace), 264 zap.String("changefeed", c.changefeedID.ID), 265 zap.String("captureAddr", capture.Addr), 266 zap.String("capture", id)) 267 delete(c.Captures, id) 268 269 // Only update changes after initialization. 270 if !c.initialized { 271 continue 272 } 273 if c.changes == nil { 274 c.changes = &CaptureChanges{} 275 } 276 if c.changes.Removed == nil { 277 c.changes.Removed = make(map[string][]tablepb.TableStatus) 278 } 279 c.changes.Removed[id] = capture.Tables 280 281 cf := c.changefeedID 282 captureTableGauge.DeleteLabelValues(cf.Namespace, cf.ID, capture.Addr) 283 } 284 } 285 286 // Check if this is the first time all captures are initialized. 287 if !c.initialized && c.checkAllCaptureInitialized() { 288 c.changes = &CaptureChanges{Init: make(map[string][]tablepb.TableStatus)} 289 for id, capture := range c.Captures { 290 c.changes.Init[id] = capture.Tables 291 } 292 log.Info("schedulerv3: all capture initialized", 293 zap.String("namespace", c.changefeedID.Namespace), 294 zap.String("changefeed", c.changefeedID.ID), 295 zap.Int("captureCount", len(c.Captures))) 296 c.initialized = true 297 } 298 299 return msgs 300 } 301 302 // TakeChanges takes the changes of captures that it sees so far. 303 func (c *CaptureManager) TakeChanges() *CaptureChanges { 304 // Only return changes when it's initialized. 305 if !c.initialized { 306 return nil 307 } 308 changes := c.changes 309 c.changes = nil 310 return changes 311 } 312 313 // CollectMetrics collects metrics. 314 func (c *CaptureManager) CollectMetrics() { 315 cf := c.changefeedID 316 for _, capture := range c.Captures { 317 captureTableGauge. 318 WithLabelValues(cf.Namespace, cf.ID, capture.Addr). 319 Set(float64(len(capture.Tables))) 320 } 321 } 322 323 // CleanMetrics cleans metrics. 324 func (c *CaptureManager) CleanMetrics() { 325 cf := c.changefeedID 326 for _, capture := range c.Captures { 327 captureTableGauge.DeleteLabelValues(cf.Namespace, cf.ID, capture.Addr) 328 } 329 } 330 331 // SetInitializedForTests is only used in tests. 332 func (c *CaptureManager) SetInitializedForTests(init bool) { 333 c.initialized = init 334 }