github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/scheduler/internal/v3/agent/agent.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package agent
    15  
    16  import (
    17  	"context"
    18  	"time"
    19  
    20  	"github.com/google/uuid"
    21  	"github.com/pingcap/log"
    22  	"github.com/pingcap/tiflow/cdc/model"
    23  	"github.com/pingcap/tiflow/cdc/processor/tablepb"
    24  	"github.com/pingcap/tiflow/cdc/scheduler/internal"
    25  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/compat"
    26  	"github.com/pingcap/tiflow/cdc/scheduler/internal/v3/transport"
    27  	"github.com/pingcap/tiflow/cdc/scheduler/schedulepb"
    28  	"github.com/pingcap/tiflow/pkg/config"
    29  	"github.com/pingcap/tiflow/pkg/errors"
    30  	"github.com/pingcap/tiflow/pkg/etcd"
    31  	"github.com/pingcap/tiflow/pkg/p2p"
    32  	"github.com/pingcap/tiflow/pkg/version"
    33  	"go.etcd.io/etcd/client/v3/concurrency"
    34  	"go.uber.org/zap"
    35  )
    36  
// Compile-time assertion that *agent satisfies the internal.Agent interface.
var _ internal.Agent = (*agent)(nil)
    38  
// agent is this package's implementation of internal.Agent.
//
// Fields:
//   - agentInfo: the agent's own identity (version, capture, changefeed,
//     epochs); it is stamped onto every outgoing message header.
//   - trans: the transport used to receive and send schedulepb messages.
//   - compat: consulted before sending and after receiving messages, and
//     kept informed about the owner's capture info.
//   - tableM: the table span manager that is polled on every Tick and that
//     receives dispatched table tasks.
//   - ownerInfo: the owner this agent currently recognizes; replies are
//     addressed to it.
type agent struct {
	agentInfo
	trans  transport.Transport
	compat *compat.Compat

	tableM *tableSpanManager

	ownerInfo ownerInfo

	// Liveness of the capture.
	// It changes to LivenessCaptureStopping in following cases:
	// 1. The capture receives a SIGTERM signal.
	// 2. The agent receives a stopping heartbeat.
	liveness *model.Liveness
}
    54  
// agentInfo describes the identity of an agent.
//
// Fields:
//   - Version: the release version string reported by this agent.
//   - CaptureID: the capture this agent runs in.
//   - ChangeFeedID: the changefeed this agent serves.
//   - Epoch: the processor epoch; dispatch table requests carrying a
//     different epoch are ignored.
//   - changefeedEpoch: the changefeed epoch; when the compatibility check is
//     enabled, received messages with a different value are dropped.
type agentInfo struct {
	Version         string
	CaptureID       model.CaptureID
	ChangeFeedID    model.ChangeFeedID
	Epoch           schedulepb.ProcessorEpoch
	changefeedEpoch uint64
}
    62  
    63  func (a agentInfo) resetEpoch() {
    64  	a.Epoch = schedulepb.ProcessorEpoch{Epoch: uuid.New().String()}
    65  }
    66  
    67  func newAgentInfo(
    68  	changefeedID model.ChangeFeedID, captureID model.CaptureID, changefeedEpoch uint64,
    69  ) agentInfo {
    70  	result := agentInfo{
    71  		Version:         version.ReleaseSemver(),
    72  		CaptureID:       captureID,
    73  		ChangeFeedID:    changefeedID,
    74  		Epoch:           schedulepb.ProcessorEpoch{},
    75  		changefeedEpoch: changefeedEpoch,
    76  	}
    77  	result.resetEpoch()
    78  
    79  	return result
    80  }
    81  
// ownerInfo identifies the owner an agent talks to: the owner's capture info
// together with the owner revision it was seen with. handleOwnerInfo compares
// revisions to tell the current owner from an outdated one.
type ownerInfo struct {
	model.CaptureInfo
	Revision schedulepb.OwnerRevision
}
    86  
    87  func newAgent(
    88  	ctx context.Context,
    89  	captureID model.CaptureID,
    90  	liveness *model.Liveness,
    91  	changeFeedID model.ChangeFeedID,
    92  	client etcd.OwnerCaptureInfoClient,
    93  	tableExecutor internal.TableExecutor,
    94  	changefeedEpoch uint64,
    95  	cfg *config.SchedulerConfig,
    96  ) (internal.Agent, error) {
    97  	result := &agent{
    98  		agentInfo: newAgentInfo(changeFeedID, captureID, changefeedEpoch),
    99  		tableM:    newTableSpanManager(changeFeedID, tableExecutor),
   100  		liveness:  liveness,
   101  		compat:    compat.New(cfg, map[model.CaptureID]*model.CaptureInfo{}),
   102  	}
   103  
   104  	etcdCliCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
   105  	defer cancel()
   106  
   107  	ownerCaptureID, err := client.GetOwnerID(etcdCliCtx)
   108  	if err != nil {
   109  		if err != concurrency.ErrElectionNoLeader {
   110  			return nil, errors.Trace(err)
   111  		}
   112  		// We tolerate the situation where there is no owner.
   113  		// If we are registered in Etcd, an elected Owner will have to
   114  		// contact us before it can schedule any table.
   115  		log.Info("schedulerv3: no owner found. We will wait for an owner to contact us.",
   116  			zap.String("ownerCaptureID", ownerCaptureID),
   117  			zap.String("namespace", changeFeedID.Namespace),
   118  			zap.String("changefeed", changeFeedID.ID),
   119  			zap.Error(err))
   120  		return result, nil
   121  	}
   122  	var ownerCaptureInfo *model.CaptureInfo
   123  	_, captures, err := client.GetCaptures(ctx)
   124  	for _, captureInfo := range captures {
   125  		if captureInfo.ID == ownerCaptureID {
   126  			ownerCaptureInfo = captureInfo
   127  			break
   128  		}
   129  	}
   130  	if ownerCaptureInfo == nil {
   131  		log.Info("schedulerv3: no owner found. We will wait for an owner to contact us.",
   132  			zap.String("namespace", changeFeedID.Namespace),
   133  			zap.String("changefeed", changeFeedID.ID),
   134  			zap.Error(err))
   135  		return result, nil
   136  	}
   137  	result.compat.UpdateCaptureInfo(map[model.CaptureID]*model.CaptureInfo{
   138  		ownerCaptureID: ownerCaptureInfo,
   139  	})
   140  
   141  	log.Info("schedulerv3: agent owner found",
   142  		zap.String("ownerCaptureID", ownerCaptureID),
   143  		zap.String("captureID", captureID),
   144  		zap.String("namespace", changeFeedID.Namespace),
   145  		zap.String("changefeed", changeFeedID.ID))
   146  
   147  	revision, err := client.GetOwnerRevision(etcdCliCtx, ownerCaptureID)
   148  	if err != nil {
   149  		if errors.ErrOwnerNotFound.Equal(err) || errors.ErrNotOwner.Equal(err) {
   150  			// These are expected errors when no owner has been elected
   151  			log.Info("schedulerv3: no owner found when querying for the owner revision",
   152  				zap.String("ownerCaptureID", ownerCaptureID),
   153  				zap.String("captureID", captureID),
   154  				zap.String("namespace", changeFeedID.Namespace),
   155  				zap.String("changefeed", changeFeedID.ID),
   156  				zap.Error(err))
   157  			return result, nil
   158  		}
   159  		return nil, err
   160  	}
   161  
   162  	// We don't need address, and owner info will be updated when there is a
   163  	// new owner elected. To avoid confusion, just leave it empty.
   164  	ownerCaptureInfo.AdvertiseAddr = ""
   165  	result.ownerInfo = ownerInfo{
   166  		Revision:    schedulepb.OwnerRevision{Revision: revision},
   167  		CaptureInfo: *ownerCaptureInfo,
   168  	}
   169  	return result, nil
   170  }
   171  
   172  // NewAgent returns a new agent.
   173  func NewAgent(ctx context.Context,
   174  	captureID model.CaptureID,
   175  	liveness *model.Liveness,
   176  	changeFeedID model.ChangeFeedID,
   177  	messageServer *p2p.MessageServer,
   178  	messageRouter p2p.MessageRouter,
   179  	ownerInfoClient etcd.OwnerCaptureInfoClient,
   180  	tableExecutor internal.TableExecutor,
   181  	changefeedEpoch uint64,
   182  	cfg *config.SchedulerConfig,
   183  ) (internal.Agent, error) {
   184  	result, err := newAgent(
   185  		ctx, captureID, liveness, changeFeedID, ownerInfoClient, tableExecutor,
   186  		changefeedEpoch, cfg)
   187  	if err != nil {
   188  		return nil, errors.Trace(err)
   189  	}
   190  
   191  	trans, err := transport.NewTransport(
   192  		ctx, changeFeedID, transport.AgentRole, messageServer, messageRouter)
   193  	if err != nil {
   194  		return nil, errors.Trace(err)
   195  	}
   196  
   197  	result.(*agent).trans = trans
   198  	return result, nil
   199  }
   200  
   201  // Tick implement agent interface
   202  func (a *agent) Tick(ctx context.Context) (*schedulepb.Barrier, error) {
   203  	inboundMessages, err := a.recvMsgs(ctx)
   204  	if err != nil {
   205  		return nil, errors.Trace(err)
   206  	}
   207  
   208  	outboundMessages, barrier := a.handleMessage(inboundMessages)
   209  
   210  	responses, err := a.tableM.poll(ctx)
   211  	if err != nil {
   212  		return nil, errors.Trace(err)
   213  	}
   214  
   215  	outboundMessages = append(outboundMessages, responses...)
   216  
   217  	if err := a.sendMsgs(ctx, outboundMessages); err != nil {
   218  		return nil, errors.Trace(err)
   219  	}
   220  
   221  	return barrier, nil
   222  }
   223  
   224  func (a *agent) handleLivenessUpdate(liveness model.Liveness) {
   225  	currentLiveness := a.liveness.Load()
   226  	if currentLiveness != liveness {
   227  		ok := a.liveness.Store(liveness)
   228  		if ok {
   229  			log.Info("schedulerv3: agent updates liveness",
   230  				zap.String("namespace", a.ChangeFeedID.Namespace),
   231  				zap.String("changefeed", a.ChangeFeedID.ID),
   232  				zap.String("old", currentLiveness.String()),
   233  				zap.String("new", liveness.String()))
   234  		}
   235  	}
   236  }
   237  
   238  func (a *agent) handleMessage(msg []*schedulepb.Message) (result []*schedulepb.Message, barrier *schedulepb.Barrier) {
   239  	for _, message := range msg {
   240  		ownerCaptureID := message.GetFrom()
   241  		header := message.GetHeader()
   242  		ownerVersion := header.GetVersion()
   243  		ownerRevision := header.GetOwnerRevision().Revision
   244  		processorEpoch := header.GetProcessorEpoch()
   245  
   246  		if !a.handleOwnerInfo(ownerCaptureID, ownerRevision, ownerVersion) {
   247  			continue
   248  		}
   249  
   250  		switch message.GetMsgType() {
   251  		case schedulepb.MsgHeartbeat:
   252  			var reMsg *schedulepb.Message
   253  			reMsg, barrier = a.handleMessageHeartbeat(message.GetHeartbeat())
   254  			result = append(result, reMsg)
   255  		case schedulepb.MsgDispatchTableRequest:
   256  			a.handleMessageDispatchTableRequest(message.DispatchTableRequest, processorEpoch)
   257  		default:
   258  			log.Warn("schedulerv3: unknown message received",
   259  				zap.String("capture", a.CaptureID),
   260  				zap.String("namespace", a.ChangeFeedID.Namespace),
   261  				zap.String("changefeed", a.ChangeFeedID.ID),
   262  				zap.Any("message", message))
   263  		}
   264  	}
   265  	return
   266  }
   267  
   268  func (a *agent) handleMessageHeartbeat(request *schedulepb.Heartbeat) (*schedulepb.Message, *schedulepb.Barrier) {
   269  	allTables := a.tableM.getAllTableSpans()
   270  	result := make([]tablepb.TableStatus, 0, allTables.Len())
   271  
   272  	allTables.Ascend(func(span tablepb.Span, table *tableSpan) bool {
   273  		status := table.getTableSpanStatus(request.CollectStats)
   274  		if status.Checkpoint.CheckpointTs > status.Checkpoint.ResolvedTs {
   275  			log.Warn("schedulerv3: CheckpointTs is greater than ResolvedTs",
   276  				zap.String("namespace", a.ChangeFeedID.Namespace),
   277  				zap.String("changefeed", a.ChangeFeedID.ID),
   278  				zap.String("span", span.String()))
   279  		}
   280  		if table.task != nil && table.task.IsRemove {
   281  			status.State = tablepb.TableStateStopping
   282  		}
   283  		result = append(result, status)
   284  		return true
   285  	})
   286  	for _, span := range request.GetSpans() {
   287  		if _, ok := allTables.Get(span); !ok {
   288  			status := a.tableM.getTableSpanStatus(span, request.CollectStats)
   289  			result = append(result, status)
   290  		}
   291  	}
   292  
   293  	if request.IsStopping {
   294  		a.handleLivenessUpdate(model.LivenessCaptureStopping)
   295  	}
   296  	response := &schedulepb.HeartbeatResponse{
   297  		Tables:   result,
   298  		Liveness: a.liveness.Load(),
   299  	}
   300  
   301  	message := &schedulepb.Message{
   302  		MsgType:           schedulepb.MsgHeartbeatResponse,
   303  		HeartbeatResponse: response,
   304  	}
   305  
   306  	log.Debug("schedulerv3: agent generate heartbeat response",
   307  		zap.String("capture", a.CaptureID),
   308  		zap.String("namespace", a.ChangeFeedID.Namespace),
   309  		zap.String("changefeed", a.ChangeFeedID.ID),
   310  		zap.Any("message", message))
   311  
   312  	return message, request.GetBarrier()
   313  }
   314  
// dispatchTableTaskStatus is the processing state of a dispatchTableTask.
type dispatchTableTaskStatus int32

// Values start at 1 (iota + 1), so the zero value matches neither state.
const (
	// dispatchTableTaskReceived is the status a task is created with when a
	// dispatch table request arrives.
	dispatchTableTaskReceived = dispatchTableTaskStatus(iota + 1)
	// dispatchTableTaskProcessed presumably marks a task that has been handled
	// by the table span; it is not set anywhere in this file.
	dispatchTableTaskProcessed
)
   321  
// dispatchTableTask is the agent-side form of a dispatch table request.
//
// Fields:
//   - Span: the table span the task applies to.
//   - Checkpoint: the checkpoint taken from an add-table request; left at its
//     zero value for remove tasks.
//   - IsRemove: true for a remove-table request, false for add-table.
//   - IsPrepare: set from the add-table request's IsSecondary flag.
//   - Epoch: the processor epoch carried by the request's message header.
//   - status: the processing state of the task.
type dispatchTableTask struct {
	Span       tablepb.Span
	Checkpoint tablepb.Checkpoint
	IsRemove   bool
	IsPrepare  bool
	Epoch      schedulepb.ProcessorEpoch
	status     dispatchTableTaskStatus
}
   330  
   331  func (a *agent) handleMessageDispatchTableRequest(
   332  	request *schedulepb.DispatchTableRequest,
   333  	epoch schedulepb.ProcessorEpoch,
   334  ) {
   335  	if a.Epoch != epoch {
   336  		log.Info("schedulerv3: agent receive dispatch table request "+
   337  			"epoch does not match, ignore it",
   338  			zap.String("capture", a.CaptureID),
   339  			zap.String("namespace", a.ChangeFeedID.Namespace),
   340  			zap.String("changefeed", a.ChangeFeedID.ID),
   341  			zap.String("epoch", epoch.Epoch),
   342  			zap.String("expected", a.Epoch.Epoch))
   343  		return
   344  	}
   345  	var (
   346  		table *tableSpan
   347  		task  *dispatchTableTask
   348  		ok    bool
   349  	)
   350  	// make the assumption that all tables are tracked by the agent now.
   351  	// this should be guaranteed by the caller of the method.
   352  	switch req := request.Request.(type) {
   353  	case *schedulepb.DispatchTableRequest_AddTable:
   354  		span := req.AddTable.GetSpan()
   355  		task = &dispatchTableTask{
   356  			Span:       span,
   357  			Checkpoint: req.AddTable.GetCheckpoint(),
   358  			IsRemove:   false,
   359  			IsPrepare:  req.AddTable.GetIsSecondary(),
   360  			Epoch:      epoch,
   361  			status:     dispatchTableTaskReceived,
   362  		}
   363  		table = a.tableM.addTableSpan(span)
   364  	case *schedulepb.DispatchTableRequest_RemoveTable:
   365  		span := req.RemoveTable.GetSpan()
   366  		table, ok = a.tableM.getTableSpan(span)
   367  		if !ok {
   368  			log.Warn("schedulerv3: agent ignore remove table request, "+
   369  				"since the table not found",
   370  				zap.String("capture", a.CaptureID),
   371  				zap.String("namespace", a.ChangeFeedID.Namespace),
   372  				zap.String("changefeed", a.ChangeFeedID.ID),
   373  				zap.String("span", span.String()),
   374  				zap.Any("request", request))
   375  			return
   376  		}
   377  		task = &dispatchTableTask{
   378  			Span:     span,
   379  			IsRemove: true,
   380  			Epoch:    epoch,
   381  			status:   dispatchTableTaskReceived,
   382  		}
   383  	default:
   384  		log.Warn("schedulerv3: agent ignore unknown dispatch table request",
   385  			zap.String("capture", a.CaptureID),
   386  			zap.String("namespace", a.ChangeFeedID.Namespace),
   387  			zap.String("changefeed", a.ChangeFeedID.ID),
   388  			zap.Any("request", request))
   389  		return
   390  	}
   391  	table.injectDispatchTableTask(task)
   392  }
   393  
   394  // Close implement agent interface
   395  func (a *agent) Close() error {
   396  	log.Debug("schedulerv3: agent closed",
   397  		zap.String("capture", a.CaptureID),
   398  		zap.String("namespace", a.ChangeFeedID.Namespace),
   399  		zap.String("changefeed", a.ChangeFeedID.ID))
   400  	return a.trans.Close()
   401  }
   402  
   403  // handleOwnerInfo return false, if the given owner's info is staled.
   404  // update owner's info to the latest otherwise.
   405  // id: the incoming owner's capture ID
   406  // revision: the incoming owner's revision as generated by Etcd election.
   407  // version: the incoming owner's semantic version string
   408  func (a *agent) handleOwnerInfo(id model.CaptureID, revision int64, version string) bool {
   409  	if a.ownerInfo.Revision.Revision == revision {
   410  		if a.ownerInfo.ID != id {
   411  			// This panic will happen only if two messages have been received
   412  			// with the same ownerRev but with different ownerIDs.
   413  			// This should never happen unless the election via Etcd is buggy.
   414  			log.Panic("schedulerv3: owner IDs do not match",
   415  				zap.String("capture", a.CaptureID),
   416  				zap.String("namespace", a.ChangeFeedID.Namespace),
   417  				zap.String("changefeed", a.ChangeFeedID.ID),
   418  				zap.String("expected", a.ownerInfo.ID),
   419  				zap.String("actual", id))
   420  		}
   421  		return true
   422  	}
   423  
   424  	// the current owner is staled
   425  	if a.ownerInfo.Revision.Revision < revision {
   426  		a.ownerInfo.CaptureInfo.ID = id
   427  		a.ownerInfo.CaptureInfo.Version = version
   428  		a.ownerInfo.Revision.Revision = revision
   429  
   430  		a.resetEpoch()
   431  
   432  		captureInfo := a.ownerInfo.CaptureInfo
   433  		a.compat.UpdateCaptureInfo(map[model.CaptureID]*model.CaptureInfo{
   434  			id: &captureInfo,
   435  		})
   436  		log.Info("schedulerv3: new owner in power",
   437  			zap.String("capture", a.CaptureID),
   438  			zap.String("namespace", a.ChangeFeedID.Namespace),
   439  			zap.String("changefeed", a.ChangeFeedID.ID),
   440  			zap.Any("owner", a.ownerInfo), zap.Any("agent", a))
   441  		return true
   442  	}
   443  
   444  	// staled owner heartbeat, just ignore it.
   445  	log.Info("schedulerv3: message from staled owner",
   446  		zap.String("capture", a.CaptureID),
   447  		zap.String("namespace", a.ChangeFeedID.Namespace),
   448  		zap.String("changefeed", a.ChangeFeedID.ID),
   449  		zap.Any("staledOwner", ownerInfo{
   450  			CaptureInfo: model.CaptureInfo{
   451  				ID:      id,
   452  				Version: version,
   453  			},
   454  			Revision: schedulepb.OwnerRevision{Revision: revision},
   455  		}),
   456  		zap.Any("owner", a.ownerInfo),
   457  		zap.Any("agent", a.agentInfo))
   458  	return false
   459  }
   460  
   461  func (a *agent) recvMsgs(ctx context.Context) ([]*schedulepb.Message, error) {
   462  	messages, err := a.trans.Recv(ctx)
   463  	if err != nil {
   464  		return nil, errors.Trace(err)
   465  	}
   466  
   467  	n := 0
   468  	for _, msg := range messages {
   469  		// only receive not staled messages
   470  		if !a.handleOwnerInfo(msg.From, msg.Header.OwnerRevision.Revision, msg.Header.Version) {
   471  			continue
   472  		}
   473  		// Check changefeed epoch, drop message if mismatch.
   474  		if a.compat.CheckChangefeedEpochEnabled(msg.From) &&
   475  			msg.Header.ChangefeedEpoch.Epoch != a.changefeedEpoch {
   476  			continue
   477  		}
   478  		messages[n] = msg
   479  		n++
   480  	}
   481  	a.compat.AfterTransportReceive(messages[:n])
   482  	return messages[:n], nil
   483  }
   484  
   485  func (a *agent) sendMsgs(ctx context.Context, msgs []*schedulepb.Message) error {
   486  	for i := range msgs {
   487  		m := msgs[i]
   488  		if m.MsgType == schedulepb.MsgUnknown {
   489  			log.Panic("schedulerv3: invalid message no destination or unknown message type",
   490  				zap.String("capture", a.CaptureID),
   491  				zap.String("namespace", a.ChangeFeedID.Namespace),
   492  				zap.String("changefeed", a.ChangeFeedID.ID),
   493  				zap.Any("message", m))
   494  		}
   495  		m.Header = &schedulepb.Message_Header{
   496  			Version:        a.Version,
   497  			OwnerRevision:  a.ownerInfo.Revision,
   498  			ProcessorEpoch: a.Epoch,
   499  			ChangefeedEpoch: schedulepb.ChangefeedEpoch{
   500  				Epoch: a.changefeedEpoch,
   501  			},
   502  		}
   503  		m.From = a.CaptureID
   504  		m.To = a.ownerInfo.ID
   505  	}
   506  	a.compat.BeforeTransportSend(msgs)
   507  	return a.trans.Send(ctx, msgs)
   508  }