github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/coordinator.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package v3

import (
	"context"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/cdc/processor/tablepb"
	"github.com/pingcap/tiflow/cdc/redo"
	"github.com/pingcap/tiflow/cdc/scheduler/internal"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/compat"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/keyspan"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/member"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/replication"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/scheduler"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/transport"
	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
	"github.com/pingcap/tiflow/pkg/config"
	"github.com/pingcap/tiflow/pkg/p2p"
	"github.com/pingcap/tiflow/pkg/pdutil"
	"github.com/pingcap/tiflow/pkg/spanz"
	"github.com/pingcap/tiflow/pkg/upstream"
	"github.com/pingcap/tiflow/pkg/version"
	"go.uber.org/zap"
)

const (
	// When heavy operations (such as network IO and serialization) take too much
	// time, the program should print a warning log, and if necessary the timeout
	// should be exposed externally through monitoring.
	tickLogsWarnDuration    = 1 * time.Second
	checkpointCannotProceed = internal.CheckpointCannotProceed
	metricsInterval         = 10 * time.Second
)

var _ internal.Scheduler = (*coordinator)(nil)

type coordinator struct {
	// mu guards concurrent access to the coordinator from the
	// internal.Scheduler and internal.InfoProvider APIs.
	mu sync.Mutex

	version         string
	revision        schedulepb.OwnerRevision
	changefeedEpoch uint64
	captureID       model.CaptureID
	trans           transport.Transport
	replicationM    *replication.Manager
	captureM        *member.CaptureManager
	schedulerM      *scheduler.Manager
	reconciler      *keyspan.Reconciler
	compat          *compat.Compat
	pdClock         pdutil.Clock
	tableRanges     replication.TableRanges
	redoMetaManager redo.MetaManager

	lastCollectTime time.Time
	changefeedID    model.ChangeFeedID
}
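// The constants above drive a defer/time.Since timing guard in Tick below. The
// helper that follows is an illustrative sketch only (it is not part of the
// original file, and warnIfSlow is a hypothetical name): it shows the pattern
// in isolation, measuring one call and logging a warning only when the elapsed
// time exceeds a threshold such as tickLogsWarnDuration.
func warnIfSlow(name string, threshold time.Duration, fn func()) {
	start := time.Now()
	defer func() {
		// Log only the slow case; fast calls stay silent.
		if cost := time.Since(start); cost > threshold {
			log.Warn("operation took too long",
				zap.String("operation", name),
				zap.Duration("duration", cost))
		}
	}()
	fn()
}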
// NewCoordinator returns a two-phase scheduler.
func NewCoordinator(
	ctx context.Context,
	captureID model.CaptureID,
	changefeedID model.ChangeFeedID,
	messageServer *p2p.MessageServer,
	messageRouter p2p.MessageRouter,
	ownerRevision int64,
	changefeedEpoch uint64,
	up *upstream.Upstream,
	cfg *config.SchedulerConfig,
	redoMetaManager redo.MetaManager,
) (internal.Scheduler, error) {
	trans, err := transport.NewTransport(
		ctx, changefeedID, transport.SchedulerRole, messageServer, messageRouter)
	if err != nil {
		return nil, errors.Trace(err)
	}
	reconciler, err := keyspan.NewReconciler(changefeedID, up, cfg.ChangefeedSettings)
	if err != nil {
		return nil, errors.Trace(err)
	}
	revision := schedulepb.OwnerRevision{Revision: ownerRevision}
	return &coordinator{
		version:         version.ReleaseSemver(),
		revision:        revision,
		changefeedEpoch: changefeedEpoch,
		captureID:       captureID,
		trans:           trans,
		replicationM: replication.NewReplicationManager(
			cfg.MaxTaskConcurrency, changefeedID),
		captureM:        member.NewCaptureManager(captureID, changefeedID, revision, cfg),
		schedulerM:      scheduler.NewSchedulerManager(changefeedID, cfg),
		reconciler:      reconciler,
		changefeedID:    changefeedID,
		compat:          compat.New(cfg, map[model.CaptureID]*model.CaptureInfo{}),
		pdClock:         up.PDClock,
		redoMetaManager: redoMetaManager,
	}, nil
}

// Tick implements the scheduler interface.
func (c *coordinator) Tick(
	ctx context.Context,
	// Latest global checkpoint of the changefeed.
	checkpointTs model.Ts,
	// All tables that SHOULD be replicated (or started) at the current checkpoint.
	currentTables []model.TableID,
	// All captures that are alive according to the latest etcd states.
	aliveCaptures map[model.CaptureID]*model.CaptureInfo,
	barrier *schedulepb.BarrierWithMinTs,
) (watermark schedulepb.Watermark, err error) {
	startTime := time.Now()
	defer func() {
		costTime := time.Since(startTime)
		if costTime > tickLogsWarnDuration {
			log.Warn("scheduler tick took too long",
				zap.String("namespace", c.changefeedID.Namespace),
				zap.String("changefeed", c.changefeedID.ID),
				zap.Duration("duration", costTime))
		}
	}()

	c.mu.Lock()
	defer c.mu.Unlock()

	return c.poll(ctx, checkpointTs, currentTables, aliveCaptures, barrier)
}
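// watermarkCanAdvance is a hypothetical illustration (not part of the original
// file) of how a caller of Tick might interpret the returned watermark: the
// assumption here is that a field set to checkpointCannotProceed signals that
// the checkpoint must not be advanced in this round, matching the error paths
// in poll below.
func watermarkCanAdvance(wm schedulepb.Watermark) bool {
	// Treat any "cannot proceed" field as a request to keep the old checkpoint.
	return wm.CheckpointTs != checkpointCannotProceed &&
		wm.ResolvedTs != checkpointCannotProceed
}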
// MoveTable implements the scheduler interface.
// FIXME: tableID should be Span.
func (c *coordinator) MoveTable(tableID model.TableID, target model.CaptureID) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if !c.captureM.CheckAllCaptureInitialized() {
		log.Info("schedulerv3: manual move table task ignored, "+
			"since not all captures initialized",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.Int64("tableID", tableID),
			zap.String("targetCapture", target))
		return
	}

	span := spanz.TableIDToComparableSpan(tableID)
	c.schedulerM.MoveTable(span, target)
}

// Rebalance implements the scheduler interface.
func (c *coordinator) Rebalance() {
	c.mu.Lock()
	defer c.mu.Unlock()

	if !c.captureM.CheckAllCaptureInitialized() {
		log.Info("schedulerv3: manual rebalance task ignored, "+
			"since not all captures initialized",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID))
		return
	}

	c.schedulerM.Rebalance()
}

// DrainCapture implements the scheduler interface.
// It returns the count of tables replicating on the target capture, and a nil
// error if the request was processed.
func (c *coordinator) DrainCapture(target model.CaptureID) (int, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if !c.captureM.CheckAllCaptureInitialized() {
		log.Info("schedulerv3: drain capture request ignored, "+
			"since not all captures initialized",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.String("target", target))
		// Return count 1 to let the client retry.
		return 1, nil
	}

	var count int
	c.replicationM.ReplicationSets().Ascend(
		func(_ tablepb.Span, rep *replication.ReplicationSet) bool {
			if rep.Primary == target {
				count++
			}
			return true
		})

	if count == 0 {
		log.Info("schedulerv3: drain capture request ignored, "+
			"the target capture has no replicating table",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.String("target", target))
		return count, nil
	}

	// When draining a capture, its tables need to be dispatched to captures
	// other than the draining one, so there must be at least two live captures.
	if len(c.captureM.Captures) <= 1 {
		log.Warn("schedulerv3: drain capture request ignored, "+
			"only one capture alive",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.String("target", target),
			zap.Int("tableCount", count))
		return count, nil
	}

	// The owner is the drain target. In a rolling upgrade scenario the owner
	// should be drained last; this is guaranteed by the caller, which knows the
	// draining order.
	if target == c.captureID {
		log.Warn("schedulerv3: drain capture request ignored, "+
			"the target is the owner",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.String("target", target), zap.Int("tableCount", count))
		return count, nil
	}

	if !c.schedulerM.DrainCapture(target) {
		log.Info("schedulerv3: drain capture request ignored, "+
			"since there is already a capture being drained",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.String("target", target),
			zap.Int("tableCount", count))
	}

	return count, nil
}

func (c *coordinator) Close(ctx context.Context) {
	c.mu.Lock()
	defer c.mu.Unlock()

	_ = c.trans.Close()
	c.captureM.CleanMetrics()
	c.replicationM.CleanMetrics()
	c.schedulerM.CleanMetrics()

	log.Info("schedulerv3: coordinator closed",
		zap.String("namespace", c.changefeedID.Namespace),
		zap.String("changefeed", c.changefeedID.ID),
		zap.Any("ownerRev", c.captureM.OwnerRev))
}

// ===========
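// cannotProceedWatermark is an illustrative helper, not part of the original
// file and named here only for explanation: it spells out the watermark value
// that poll returns on each of its error paths below, with every field set to
// checkpointCannotProceed so the owner knows not to advance the checkpoint in
// this round.
func cannotProceedWatermark() schedulepb.Watermark {
	return schedulepb.Watermark{
		CheckpointTs:     checkpointCannotProceed,
		ResolvedTs:       checkpointCannotProceed,
		LastSyncedTs:     checkpointCannotProceed,
		PullerResolvedTs: checkpointCannotProceed,
	}
}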
func (c *coordinator) poll(
	ctx context.Context,
	checkpointTs model.Ts,
	currentTables []model.TableID,
	aliveCaptures map[model.CaptureID]*model.CaptureInfo,
	barrier *schedulepb.BarrierWithMinTs,
) (watermark schedulepb.Watermark, err error) {
	c.maybeCollectMetrics()
	if c.compat.UpdateCaptureInfo(aliveCaptures) {
		spanReplicationEnabled := c.compat.CheckSpanReplicationEnabled()
		log.Info("schedulerv3: compat update capture info",
			zap.String("namespace", c.changefeedID.Namespace),
			zap.String("changefeed", c.changefeedID.ID),
			zap.Any("captures", aliveCaptures),
			zap.Bool("spanReplicationEnabled", spanReplicationEnabled))
	}

	recvMsgs, err := c.recvMsgs(ctx)
	if err != nil {
		return schedulepb.Watermark{
			CheckpointTs:     checkpointCannotProceed,
			ResolvedTs:       checkpointCannotProceed,
			LastSyncedTs:     checkpointCannotProceed,
			PullerResolvedTs: checkpointCannotProceed,
		}, errors.Trace(err)
	}

	var msgBuf []*schedulepb.Message
	c.captureM.HandleMessage(recvMsgs)

	msgs := c.captureM.HandleAliveCaptureUpdate(aliveCaptures)
	msgBuf = append(msgBuf, msgs...)

	// Handle received messages to advance the replication sets.
	msgs, err = c.replicationM.HandleMessage(recvMsgs)
	if err != nil {
		return schedulepb.Watermark{
			CheckpointTs:     checkpointCannotProceed,
			ResolvedTs:       checkpointCannotProceed,
			LastSyncedTs:     checkpointCannotProceed,
			PullerResolvedTs: checkpointCannotProceed,
		}, errors.Trace(err)
	}
	msgBuf = append(msgBuf, msgs...)

	pdTime := time.Now()
	// pdClock is only nil in unit tests.
	if c.pdClock != nil {
		pdTime = c.pdClock.CurrentTime()
	}

	c.tableRanges.UpdateTables(currentTables)
	if !c.captureM.CheckAllCaptureInitialized() {
		// Skip generating schedule tasks for the replication manager,
		// as not all captures are initialized.
		watermark = c.replicationM.AdvanceCheckpoint(&c.tableRanges, pdTime, barrier, c.redoMetaManager)
		// Tick the capture manager after the checkpoint calculation to take the
		// resolvedTs in the barrier into account when redo is enabled.
		msgs = c.captureM.Tick(c.replicationM.ReplicationSets(),
			c.schedulerM.DrainingTarget(), barrier.Barrier)
		msgBuf = append(msgBuf, msgs...)
		return watermark, c.sendMsgs(ctx, msgBuf)
	}

	// Handle capture membership changes.
	if changes := c.captureM.TakeChanges(); changes != nil {
		msgs, err = c.replicationM.HandleCaptureChanges(
			changes.Init, changes.Removed, checkpointTs)
		if err != nil {
			return schedulepb.Watermark{
				CheckpointTs:     checkpointCannotProceed,
				ResolvedTs:       checkpointCannotProceed,
				LastSyncedTs:     checkpointCannotProceed,
				PullerResolvedTs: checkpointCannotProceed,
			}, errors.Trace(err)
		}
		msgBuf = append(msgBuf, msgs...)
	}

	// Generate schedule tasks based on the current status.
	replications := c.replicationM.ReplicationSets()
	runningTasks := c.replicationM.RunningTasks()
	currentSpans := c.reconciler.Reconcile(
		ctx, &c.tableRanges, replications, c.captureM.Captures, c.compat)
	allTasks := c.schedulerM.Schedule(
		checkpointTs, currentSpans, c.captureM.Captures, replications, runningTasks)

	// Handle generated schedule tasks.
	msgs, err = c.replicationM.HandleTasks(allTasks)
	if err != nil {
		return schedulepb.Watermark{
			CheckpointTs:     checkpointCannotProceed,
			ResolvedTs:       checkpointCannotProceed,
			LastSyncedTs:     checkpointCannotProceed,
			PullerResolvedTs: checkpointCannotProceed,
		}, errors.Trace(err)
	}
	msgBuf = append(msgBuf, msgs...)

	// Checkpoint calculation.
	watermark = c.replicationM.AdvanceCheckpoint(&c.tableRanges, pdTime, barrier, c.redoMetaManager)

	// Tick the capture manager after the checkpoint calculation to take the
	// resolvedTs in the barrier into account when redo is enabled.
	msgs = c.captureM.Tick(c.replicationM.ReplicationSets(),
		c.schedulerM.DrainingTarget(), barrier.Barrier)
	msgBuf = append(msgBuf, msgs...)

	// Send new messages.
	err = c.sendMsgs(ctx, msgBuf)
	if err != nil {
		return schedulepb.Watermark{
			CheckpointTs:     checkpointCannotProceed,
			ResolvedTs:       checkpointCannotProceed,
			LastSyncedTs:     checkpointCannotProceed,
			PullerResolvedTs: checkpointCannotProceed,
		}, errors.Trace(err)
	}

	return watermark, nil
}

func (c *coordinator) recvMsgs(ctx context.Context) ([]*schedulepb.Message, error) {
	recvMsgs, err := c.trans.Recv(ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}

	n := 0
	for _, msg := range recvMsgs {
		// Filter stale messages and lost messages.
		if msg.Header.OwnerRevision != c.revision || msg.To != c.captureID {
			// The owner revision and the capture ID must both match.
			continue
		}
		if c.compat.CheckChangefeedEpochEnabled(msg.From) {
			if msg.Header.ChangefeedEpoch.Epoch != c.changefeedEpoch {
				// The changefeed epoch must match.
				continue
			}
		}
		recvMsgs[n] = msg
		n++
	}
	c.compat.AfterTransportReceive(recvMsgs[:n])
	return recvMsgs[:n], nil
}
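// filterInPlace is a generic sketch of the in-place compaction idiom used by
// recvMsgs above; the function itself is hypothetical and not part of the
// original file. Kept elements are copied toward the front of the same backing
// array and the slice is re-sliced to the new length, so filtering does not
// allocate a second slice.
func filterInPlace[T any](s []T, keep func(T) bool) []T {
	n := 0
	for _, v := range s {
		if keep(v) {
			// Overwrite earlier, already-consumed slots with kept elements.
			s[n] = v
			n++
		}
	}
	return s[:n]
}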
func (c *coordinator) sendMsgs(ctx context.Context, msgs []*schedulepb.Message) error {
	for i := range msgs {
		m := msgs[i]
		// Correctness check.
		if len(m.To) == 0 || m.MsgType == schedulepb.MsgUnknown {
			log.Panic("invalid message no destination or unknown message type",
				zap.String("namespace", c.changefeedID.Namespace),
				zap.String("changefeed", c.changefeedID.ID),
				zap.Any("message", m))
		}

		epoch := schedulepb.ProcessorEpoch{}
		if capture := c.captureM.Captures[m.To]; capture != nil {
			epoch = capture.Epoch
		}
		m.Header = &schedulepb.Message_Header{
			Version:        c.version,
			OwnerRevision:  c.revision,
			ProcessorEpoch: epoch,
			ChangefeedEpoch: schedulepb.ChangefeedEpoch{
				Epoch: c.changefeedEpoch,
			},
		}
		m.From = c.captureID
	}
	c.compat.BeforeTransportSend(msgs)
	return c.trans.Send(ctx, msgs)
}

func (c *coordinator) maybeCollectMetrics() {
	now := time.Now()
	if now.Sub(c.lastCollectTime) < metricsInterval {
		return
	}
	c.lastCollectTime = now

	c.schedulerM.CollectMetrics()
	c.replicationM.CollectMetrics()
	c.captureM.CollectMetrics()
}
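// throttled is an illustrative sketch of the rate-limiting guard that
// maybeCollectMetrics implements above; the helper and its name are
// hypothetical and not part of the original file. The wrapped function runs at
// most once per interval, and calls made before the interval has elapsed
// return immediately.
func throttled(last *time.Time, interval time.Duration, fn func()) {
	now := time.Now()
	if now.Sub(*last) < interval {
		// Too soon since the previous run; skip this call.
		return
	}
	*last = now
	fn()
}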