github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/agent/agent.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package agent

import (
	"context"
	"time"

	"github.com/google/uuid"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/cdc/model"
	"github.com/pingcap/tiflow/cdc/processor/tablepb"
	"github.com/pingcap/tiflow/cdc/scheduler/internal"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/compat"
	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/transport"
	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
	"github.com/pingcap/tiflow/pkg/config"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/etcd"
	"github.com/pingcap/tiflow/pkg/p2p"
	"github.com/pingcap/tiflow/pkg/version"
	"go.etcd.io/etcd/client/v3/concurrency"
	"go.uber.org/zap"
)

var _ internal.Agent = (*agent)(nil)

type agent struct {
	agentInfo
	trans  transport.Transport
	compat *compat.Compat

	tableM *tableSpanManager

	ownerInfo ownerInfo

	// Liveness of the capture.
	// It changes to LivenessCaptureStopping in the following cases:
	// 1. The capture receives a SIGTERM signal.
	// 2. The agent receives a stopping heartbeat.
	liveness *model.Liveness
}

type agentInfo struct {
	Version         string
	CaptureID       model.CaptureID
	ChangeFeedID    model.ChangeFeedID
	Epoch           schedulepb.ProcessorEpoch
	changefeedEpoch uint64
}

// resetEpoch must use a pointer receiver so that the new epoch is actually
// stored on the agentInfo.
func (a *agentInfo) resetEpoch() {
	a.Epoch = schedulepb.ProcessorEpoch{Epoch: uuid.New().String()}
}

func newAgentInfo(
	changefeedID model.ChangeFeedID, captureID model.CaptureID, changefeedEpoch uint64,
) agentInfo {
	result := agentInfo{
		Version:         version.ReleaseSemver(),
		CaptureID:       captureID,
		ChangeFeedID:    changefeedID,
		Epoch:           schedulepb.ProcessorEpoch{},
		changefeedEpoch: changefeedEpoch,
	}
	result.resetEpoch()

	return result
}

type ownerInfo struct {
	model.CaptureInfo
	Revision schedulepb.OwnerRevision
}

func newAgent(
	ctx context.Context,
	captureID model.CaptureID,
	liveness *model.Liveness,
	changeFeedID model.ChangeFeedID,
	client etcd.OwnerCaptureInfoClient,
	tableExecutor internal.TableExecutor,
	changefeedEpoch uint64,
	cfg *config.SchedulerConfig,
) (internal.Agent, error) {
	result := &agent{
		agentInfo: newAgentInfo(changeFeedID, captureID, changefeedEpoch),
		tableM:    newTableSpanManager(changeFeedID, tableExecutor),
		liveness:  liveness,
		compat:    compat.New(cfg, map[model.CaptureID]*model.CaptureInfo{}),
	}

	etcdCliCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	ownerCaptureID, err := client.GetOwnerID(etcdCliCtx)
	if err != nil {
		if err != concurrency.ErrElectionNoLeader {
			return nil, errors.Trace(err)
		}
		// We tolerate the situation where there is no owner.
		// If we are registered in Etcd, an elected Owner will have to
		// contact us before it can schedule any table.
		log.Info("schedulerv3: no owner found. We will wait for an owner to contact us.",
			zap.String("ownerCaptureID", ownerCaptureID),
			zap.String("namespace", changeFeedID.Namespace),
			zap.String("changefeed", changeFeedID.ID),
			zap.Error(err))
		return result, nil
	}
	var ownerCaptureInfo *model.CaptureInfo
	_, captures, err := client.GetCaptures(ctx)
	for _, captureInfo := range captures {
		if captureInfo.ID == ownerCaptureID {
			ownerCaptureInfo = captureInfo
			break
		}
	}
	if ownerCaptureInfo == nil {
		log.Info("schedulerv3: no owner found. We will wait for an owner to contact us.",
			zap.String("namespace", changeFeedID.Namespace),
			zap.String("changefeed", changeFeedID.ID),
			zap.Error(err))
		return result, nil
	}
	result.compat.UpdateCaptureInfo(map[model.CaptureID]*model.CaptureInfo{
		ownerCaptureID: ownerCaptureInfo,
	})

	log.Info("schedulerv3: agent owner found",
		zap.String("ownerCaptureID", ownerCaptureID),
		zap.String("captureID", captureID),
		zap.String("namespace", changeFeedID.Namespace),
		zap.String("changefeed", changeFeedID.ID))

	revision, err := client.GetOwnerRevision(etcdCliCtx, ownerCaptureID)
	if err != nil {
		if errors.ErrOwnerNotFound.Equal(err) || errors.ErrNotOwner.Equal(err) {
			// These are expected errors when no owner has been elected.
			log.Info("schedulerv3: no owner found when querying for the owner revision",
				zap.String("ownerCaptureID", ownerCaptureID),
				zap.String("captureID", captureID),
				zap.String("namespace", changeFeedID.Namespace),
				zap.String("changefeed", changeFeedID.ID),
				zap.Error(err))
			return result, nil
		}
		return nil, err
	}

	// We don't need the address, and the owner info will be updated when a
	// new owner is elected. To avoid confusion, just leave it empty.
	ownerCaptureInfo.AdvertiseAddr = ""
	result.ownerInfo = ownerInfo{
		Revision:    schedulepb.OwnerRevision{Revision: revision},
		CaptureInfo: *ownerCaptureInfo,
	}
	return result, nil
}

// NewAgent returns a new agent.
func NewAgent(ctx context.Context,
	captureID model.CaptureID,
	liveness *model.Liveness,
	changeFeedID model.ChangeFeedID,
	messageServer *p2p.MessageServer,
	messageRouter p2p.MessageRouter,
	ownerInfoClient etcd.OwnerCaptureInfoClient,
	tableExecutor internal.TableExecutor,
	changefeedEpoch uint64,
	cfg *config.SchedulerConfig,
) (internal.Agent, error) {
	result, err := newAgent(
		ctx, captureID, liveness, changeFeedID, ownerInfoClient, tableExecutor,
		changefeedEpoch, cfg)
	if err != nil {
		return nil, errors.Trace(err)
	}

	trans, err := transport.NewTransport(
		ctx, changeFeedID, transport.AgentRole, messageServer, messageRouter)
	if err != nil {
		return nil, errors.Trace(err)
	}

	result.(*agent).trans = trans
	return result, nil
}

// Tick implements the Agent interface.
func (a *agent) Tick(ctx context.Context) (*schedulepb.Barrier, error) {
	inboundMessages, err := a.recvMsgs(ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}

	outboundMessages, barrier := a.handleMessage(inboundMessages)

	responses, err := a.tableM.poll(ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}

	outboundMessages = append(outboundMessages, responses...)
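	// Dispatch-table responses produced by the table span manager are batched
	// together with the heartbeat responses generated above.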

	if err := a.sendMsgs(ctx, outboundMessages); err != nil {
		return nil, errors.Trace(err)
	}

	return barrier, nil
}

func (a *agent) handleLivenessUpdate(liveness model.Liveness) {
	currentLiveness := a.liveness.Load()
	if currentLiveness != liveness {
		ok := a.liveness.Store(liveness)
		if ok {
			log.Info("schedulerv3: agent updates liveness",
				zap.String("namespace", a.ChangeFeedID.Namespace),
				zap.String("changefeed", a.ChangeFeedID.ID),
				zap.String("old", currentLiveness.String()),
				zap.String("new", liveness.String()))
		}
	}
}

func (a *agent) handleMessage(msg []*schedulepb.Message) (result []*schedulepb.Message, barrier *schedulepb.Barrier) {
	for _, message := range msg {
		ownerCaptureID := message.GetFrom()
		header := message.GetHeader()
		ownerVersion := header.GetVersion()
		ownerRevision := header.GetOwnerRevision().Revision
		processorEpoch := header.GetProcessorEpoch()

		if !a.handleOwnerInfo(ownerCaptureID, ownerRevision, ownerVersion) {
			continue
		}

		switch message.GetMsgType() {
		case schedulepb.MsgHeartbeat:
			var reMsg *schedulepb.Message
			reMsg, barrier = a.handleMessageHeartbeat(message.GetHeartbeat())
			result = append(result, reMsg)
		case schedulepb.MsgDispatchTableRequest:
			a.handleMessageDispatchTableRequest(message.DispatchTableRequest, processorEpoch)
		default:
			log.Warn("schedulerv3: unknown message received",
				zap.String("capture", a.CaptureID),
				zap.String("namespace", a.ChangeFeedID.Namespace),
				zap.String("changefeed", a.ChangeFeedID.ID),
				zap.Any("message", message))
		}
	}
	return
}

func (a *agent) handleMessageHeartbeat(request *schedulepb.Heartbeat) (*schedulepb.Message, *schedulepb.Barrier) {
	allTables := a.tableM.getAllTableSpans()
	result := make([]tablepb.TableStatus, 0, allTables.Len())

	allTables.Ascend(func(span tablepb.Span, table *tableSpan) bool {
		status := table.getTableSpanStatus(request.CollectStats)
		if status.Checkpoint.CheckpointTs > status.Checkpoint.ResolvedTs {
			log.Warn("schedulerv3: CheckpointTs is greater than ResolvedTs",
				zap.String("namespace", a.ChangeFeedID.Namespace),
				zap.String("changefeed", a.ChangeFeedID.ID),
				zap.String("span", span.String()))
		}
		if table.task != nil && table.task.IsRemove {
			status.State = tablepb.TableStateStopping
		}
		result = append(result, status)
		return true
	})
	for _, span := range request.GetSpans() {
		if _, ok := allTables.Get(span); !ok {
			status := a.tableM.getTableSpanStatus(span, request.CollectStats)
			result = append(result, status)
		}
	}

	if request.IsStopping {
		a.handleLivenessUpdate(model.LivenessCaptureStopping)
	}
	response := &schedulepb.HeartbeatResponse{
		Tables:   result,
		Liveness: a.liveness.Load(),
	}

	message := &schedulepb.Message{
		MsgType:           schedulepb.MsgHeartbeatResponse,
		HeartbeatResponse: response,
	}

	log.Debug("schedulerv3: agent generate heartbeat response",
		zap.String("capture", a.CaptureID),
		zap.String("namespace", a.ChangeFeedID.Namespace),
		zap.String("changefeed", a.ChangeFeedID.ID),
		zap.Any("message", message))

	return message, request.GetBarrier()
}

type dispatchTableTaskStatus int32

const (
	dispatchTableTaskReceived = dispatchTableTaskStatus(iota + 1)
	dispatchTableTaskProcessed
)

type dispatchTableTask struct {
	Span       tablepb.Span
	Checkpoint tablepb.Checkpoint
	IsRemove   bool
	IsPrepare  bool
	Epoch      schedulepb.ProcessorEpoch
	status     dispatchTableTaskStatus
}

func (a *agent) handleMessageDispatchTableRequest(
	request *schedulepb.DispatchTableRequest,
	epoch schedulepb.ProcessorEpoch,
) {
	if a.Epoch != epoch {
		log.Info("schedulerv3: agent receive dispatch table request "+
			"epoch does not match, ignore it",
			zap.String("capture", a.CaptureID),
			zap.String("namespace", a.ChangeFeedID.Namespace),
			zap.String("changefeed", a.ChangeFeedID.ID),
			zap.String("epoch", epoch.Epoch),
			zap.String("expected", a.Epoch.Epoch))
		return
	}
	var (
		table *tableSpan
		task  *dispatchTableTask
		ok    bool
	)
	// Make the assumption that all tables are tracked by the agent now.
	// This should be guaranteed by the caller of the method.
	switch req := request.Request.(type) {
	case *schedulepb.DispatchTableRequest_AddTable:
		span := req.AddTable.GetSpan()
		task = &dispatchTableTask{
			Span:       span,
			Checkpoint: req.AddTable.GetCheckpoint(),
			IsRemove:   false,
			IsPrepare:  req.AddTable.GetIsSecondary(),
			Epoch:      epoch,
			status:     dispatchTableTaskReceived,
		}
		table = a.tableM.addTableSpan(span)
	case *schedulepb.DispatchTableRequest_RemoveTable:
		span := req.RemoveTable.GetSpan()
		table, ok = a.tableM.getTableSpan(span)
		if !ok {
			log.Warn("schedulerv3: agent ignore remove table request, "+
				"since the table not found",
				zap.String("capture", a.CaptureID),
				zap.String("namespace", a.ChangeFeedID.Namespace),
				zap.String("changefeed", a.ChangeFeedID.ID),
				zap.String("span", span.String()),
				zap.Any("request", request))
			return
		}
		task = &dispatchTableTask{
			Span:     span,
			IsRemove: true,
			Epoch:    epoch,
			status:   dispatchTableTaskReceived,
		}
	default:
		log.Warn("schedulerv3: agent ignore unknown dispatch table request",
			zap.String("capture", a.CaptureID),
			zap.String("namespace", a.ChangeFeedID.Namespace),
			zap.String("changefeed", a.ChangeFeedID.ID),
			zap.Any("request", request))
		return
	}
	table.injectDispatchTableTask(task)
}

// Close implements the Agent interface.
func (a *agent) Close() error {
	log.Debug("schedulerv3: agent closed",
		zap.String("capture", a.CaptureID),
		zap.String("namespace", a.ChangeFeedID.Namespace),
		zap.String("changefeed", a.ChangeFeedID.ID))
	return a.trans.Close()
}

// handleOwnerInfo returns false if the given owner's info is stale;
// otherwise it updates the stored owner info to the latest.
// id: the incoming owner's capture ID.
// revision: the incoming owner's revision as generated by the Etcd election.
// version: the incoming owner's semantic version string.
func (a *agent) handleOwnerInfo(id model.CaptureID, revision int64, version string) bool {
	if a.ownerInfo.Revision.Revision == revision {
		if a.ownerInfo.ID != id {
			// This panic will happen only if two messages have been received
			// with the same ownerRev but with different ownerIDs.
			// This should never happen unless the election via Etcd is buggy.
			log.Panic("schedulerv3: owner IDs do not match",
				zap.String("capture", a.CaptureID),
				zap.String("namespace", a.ChangeFeedID.Namespace),
				zap.String("changefeed", a.ChangeFeedID.ID),
				zap.String("expected", a.ownerInfo.ID),
				zap.String("actual", id))
		}
		return true
	}

	// The stored owner info is stale: a newer owner has been elected.
	if a.ownerInfo.Revision.Revision < revision {
		a.ownerInfo.CaptureInfo.ID = id
		a.ownerInfo.CaptureInfo.Version = version
		a.ownerInfo.Revision.Revision = revision

		a.resetEpoch()

		captureInfo := a.ownerInfo.CaptureInfo
		a.compat.UpdateCaptureInfo(map[model.CaptureID]*model.CaptureInfo{
			id: &captureInfo,
		})
		log.Info("schedulerv3: new owner in power",
			zap.String("capture", a.CaptureID),
			zap.String("namespace", a.ChangeFeedID.Namespace),
			zap.String("changefeed", a.ChangeFeedID.ID),
			zap.Any("owner", a.ownerInfo), zap.Any("agent", a))
		return true
	}

	// Stale owner heartbeat, just ignore it.
	log.Info("schedulerv3: message from staled owner",
		zap.String("capture", a.CaptureID),
		zap.String("namespace", a.ChangeFeedID.Namespace),
		zap.String("changefeed", a.ChangeFeedID.ID),
		zap.Any("staledOwner", ownerInfo{
			CaptureInfo: model.CaptureInfo{
				ID:      id,
				Version: version,
			},
			Revision: schedulepb.OwnerRevision{Revision: revision},
		}),
		zap.Any("owner", a.ownerInfo),
		zap.Any("agent", a.agentInfo))
	return false
}

func (a *agent) recvMsgs(ctx context.Context) ([]*schedulepb.Message, error) {
	messages, err := a.trans.Recv(ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}

	n := 0
	for _, msg := range messages {
		// Only keep messages that are not stale.
		if !a.handleOwnerInfo(msg.From, msg.Header.OwnerRevision.Revision, msg.Header.Version) {
			continue
		}
		// Check the changefeed epoch, drop the message on mismatch.
		if a.compat.CheckChangefeedEpochEnabled(msg.From) &&
			msg.Header.ChangefeedEpoch.Epoch != a.changefeedEpoch {
			continue
		}
		messages[n] = msg
		n++
	}
	a.compat.AfterTransportReceive(messages[:n])
	return messages[:n], nil
}

func (a *agent) sendMsgs(ctx context.Context, msgs []*schedulepb.Message) error {
	for i := range msgs {
		m := msgs[i]
		if m.MsgType == schedulepb.MsgUnknown {
			log.Panic("schedulerv3: invalid message no destination or unknown message type",
				zap.String("capture", a.CaptureID),
				zap.String("namespace", a.ChangeFeedID.Namespace),
				zap.String("changefeed", a.ChangeFeedID.ID),
				zap.Any("message", m))
		}
		m.Header = &schedulepb.Message_Header{
			Version:        a.Version,
			OwnerRevision:  a.ownerInfo.Revision,
			ProcessorEpoch: a.Epoch,
			ChangefeedEpoch: schedulepb.ChangefeedEpoch{
				Epoch: a.changefeedEpoch,
			},
		}
		m.From = a.CaptureID
		m.To = a.ownerInfo.ID
	}
	a.compat.BeforeTransportSend(msgs)
	return a.trans.Send(ctx, msgs)
}
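
// The sketch below is illustrative only and is not part of the upstream file.
// It shows one plausible way a caller could drive an agent built by NewAgent:
// call Tick on a timer until the context is cancelled, then Close. The
// tickCloser interface, the runAgentLoop helper, and the 50ms interval are
// assumptions for this example, not upstream API.

// tickCloser lists only the two agent methods exercised by the sketch; both
// are implemented by *agent above.
type tickCloser interface {
	Tick(ctx context.Context) (*schedulepb.Barrier, error)
	Close() error
}

// runAgentLoop drives the agent until ctx is cancelled or Tick fails.
func runAgentLoop(ctx context.Context, a tickCloser) error {
	defer a.Close()

	ticker := time.NewTicker(50 * time.Millisecond) // assumed polling interval
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		case <-ticker.C:
			// Each Tick receives owner messages, polls the table span manager,
			// and sends heartbeat and dispatch-table responses to the owner.
			if _, err := a.Tick(ctx); err != nil {
				return errors.Trace(err)
			}
		}
	}
}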