github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/p2p/grpc_client.go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package p2p

import (
	"context"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/pkg/container/queue"
	cerrors "github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/p2p/internal"
	"github.com/pingcap/tiflow/pkg/security"
	"github.com/pingcap/tiflow/proto/p2p"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/atomic"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	gRPCPeer "google.golang.org/grpc/peer"
)

// grpcMessageClient is a client used to send peer messages.
// `Run` must be running before any message is sent.
type grpcMessageClient struct {
	sendCh *internal.SendChan

	topicMu sync.RWMutex
	topics  map[string]*topicEntry

	senderID NodeID

	closeCh  chan struct{}
	isClosed atomic.Bool

	// config is read-only
	config *MessageClientConfig

	// newSenderFn is used to create a new sender.
	// It can be replaced to unit test MessageClient.
	newSenderFn func(MessageClientStream) clientBatchSender[MessageEntry]

	connector clientConnector
}

type topicEntry struct {
	sentMessageMu sync.Mutex
	// sentMessages buffers messages that have been sent but not yet
	// acknowledged, so they can be replayed after a reconnect.
	sentMessages queue.ChunkQueue[*p2p.MessageEntry]

	nextSeq  atomic.Int64 // sequence number for the next message on this topic
	ack      atomic.Int64 // highest sequence acknowledged by the receiver
	lastSent atomic.Int64 // sequence of the last message handed to the stream
}

// NewGrpcMessageClient creates a new MessageClient.
// senderID is an identifier for the local node.
func NewGrpcMessageClient(senderID NodeID, config *MessageClientConfig) *grpcMessageClient {
	return &grpcMessageClient{
		sendCh:   internal.NewSendChan(int64(config.SendChannelSize)),
		topics:   make(map[string]*topicEntry),
		senderID: senderID,
		closeCh:  make(chan struct{}),
		config:   config,
		newSenderFn: func(stream MessageClientStream) clientBatchSender[MessageEntry] {
			return newClientBatchSender(stream, config.MaxBatchBytes, config.MaxBatchCount)
		},
		connector: newClientConnector(),
	}
}
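
// A minimal usage sketch (illustrative only, not part of the original file).
// The node IDs, address, and config values are placeholders, and `credential`
// is assumed to be a *security.Credential prepared by the caller:
//
//	client := NewGrpcMessageClient("client-1", &MessageClientConfig{
//		SendChannelSize:         128,
//		BatchSendInterval:       100 * time.Millisecond,
//		MaxBatchBytes:           1 << 20,
//		MaxBatchCount:           128,
//		RetryRateLimitPerSecond: 1.0,
//		DialTimeout:             5 * time.Second,
//	})
//	go func() {
//		// Run blocks until ctx is canceled or a permanent error occurs.
//		_ = client.Run(ctx, "tcp", "127.0.0.1:8300", "server-1", credential)
//	}()
//	seq, err := client.SendMessage(ctx, "my-topic", myValue)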

// Run launches background goroutines for MessageClient to work.
func (c *grpcMessageClient) Run(
	ctx context.Context, network, addr string,
	receiverID NodeID,
	credential *security.Credential,
) (ret error) {
	defer func() {
		c.isClosed.Store(true)
		close(c.closeCh)
	}()

	metricsClientCount := clientCount.With(prometheus.Labels{
		"to": addr,
	})
	metricsClientCount.Inc()
	defer metricsClientCount.Dec()

	rl := rate.NewLimiter(rate.Limit(c.config.RetryRateLimitPerSecond), 1)
	epoch := int64(0)
	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		default:
		}

		if err := rl.Wait(ctx); err != nil {
			return errors.Trace(err)
		}

		gRPCClient, release, err := c.connector.Connect(clientConnectOptions{
			network:        network,
			addr:           addr,
			credential:     credential,
			timeout:        c.config.DialTimeout,
			maxRecvMsgSize: c.config.MaxRecvMsgSize,
		})
		if err != nil {
			log.Warn("peer-message client: failed to connect to server",
				zap.Error(err))
			continue
		}

		epoch++
		streamMeta := &p2p.StreamMeta{
			SenderId:             c.senderID,
			ReceiverId:           receiverID,
			Epoch:                epoch,
			ClientVersion:        c.config.ClientVersion,
			SenderAdvertisedAddr: c.config.AdvertisedAddr,
		}

		err = c.launchStream(ctx, gRPCClient, streamMeta)
		if cerrors.ErrPeerMessageClientPermanentFail.Equal(err) {
			release()
			return errors.Trace(err)
		}
		log.Warn("peer message client detected error, restarting", zap.Error(err))
		release()
		continue
	}
}
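
// Note on the retry pacing above: rate.NewLimiter(rate.Limit(r), 1) allows
// roughly r events per second with a burst of 1, so each pass through the
// reconnect loop blocks in rl.Wait until a token is available. The same
// pattern in isolation (r and attempt are hypothetical stand-ins):
//
//	rl := rate.NewLimiter(rate.Limit(r), 1)
//	for {
//		if err := rl.Wait(ctx); err != nil {
//			return err // context canceled
//		}
//		if err := attempt(); err == nil {
//			return nil
//		}
//	}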

func (c *grpcMessageClient) launchStream(ctx context.Context, gRPCClient p2p.CDCPeerToPeerClient, meta *p2p.StreamMeta) error {
	failpoint.Inject("InjectClientPermanentFailure", func() {
		failpoint.Return(cerrors.ErrPeerMessageClientPermanentFail.GenWithStackByArgs())
	})

	cancelCtx, cancelStream := context.WithCancel(ctx)
	defer cancelStream()

	clientStream, err := gRPCClient.SendMessage(cancelCtx)
	if err != nil {
		return errors.Trace(err)
	}

	err = clientStream.Send(&p2p.MessagePacket{Meta: meta})
	if err != nil {
		return errors.Trace(err)
	}

	return errors.Trace(c.run(ctx, clientStream, cancelStream))
}
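
// Wire pattern implied by launchStream: the first MessagePacket on a fresh
// stream carries only the StreamMeta handshake, and runTx then pushes the
// actual message entries through the batch sender. A sketch of the handshake
// packet (field values are placeholders):
//
//	_ = clientStream.Send(&p2p.MessagePacket{
//		Meta: &p2p.StreamMeta{
//			SenderId:   "client-1",
//			ReceiverId: "server-1",
//			Epoch:      1, // incremented by Run on every reconnect
//		},
//	})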

func (c *grpcMessageClient) run(ctx context.Context, stream MessageClientStream, cancel func()) error {
	errg, ctx := errgroup.WithContext(ctx)

	errg.Go(func() error {
		defer cancel()
		return c.runTx(ctx, stream)
	})

	errg.Go(func() error {
		defer cancel()
		return c.runRx(ctx, stream)
	})

	return errors.Trace(errg.Wait())
}

func (c *grpcMessageClient) runTx(ctx context.Context, stream MessageClientStream) error {
	if err := c.retrySending(ctx, stream); err != nil {
		return errors.Trace(err)
	}

	peerAddr := unknownPeerLabel
	peer, ok := gRPCPeer.FromContext(stream.Context())
	if ok {
		peerAddr = peer.Addr.String()
	}
	metricsClientMessageCount := clientMessageCount.With(prometheus.Labels{
		"to": peerAddr,
	})

	ticker := time.NewTicker(c.config.BatchSendInterval)
	defer ticker.Stop()

	batchSender := c.newSenderFn(stream)

	for {
		msg, ok, err := c.sendCh.Receive(ctx, ticker.C)
		if err != nil {
			return errors.Trace(err)
		}
		if !ok {
			// `ticker` has fired and we have not received any message.
			// We try to flush whatever messages we already have.
			// The implementation of batchSender guarantees that
			// an empty flush does not send any message.
			if err := batchSender.Flush(); err != nil {
				return errors.Trace(err)
			}
			continue
		}

		c.topicMu.RLock()
		tpk, ok := c.topics[msg.Topic]
		c.topicMu.RUnlock()
		if !ok {
			// This line should never be reachable unless there is a bug in this file.
			log.Panic("topic not found. Report a bug", zap.String("topic", msg.Topic))
		}

		// We want to assert that `msg.Sequence` is continuous within a topic.
		if old := tpk.lastSent.Swap(msg.Sequence); old != initAck && msg.Sequence != old+1 {
			log.Panic("unexpected seq of message",
				zap.String("topic", msg.Topic),
				zap.Int64("seq", msg.Sequence))
		}

		tpk.sentMessageMu.Lock()
		tpk.sentMessages.Push(msg)
		tpk.sentMessageMu.Unlock()

		metricsClientMessageCount.Inc()

		log.Debug("Sending Message",
			zap.String("topic", msg.Topic),
			zap.Int64("seq", msg.Sequence))
		if err := batchSender.Append(msg); err != nil {
			return errors.Trace(err)
		}
	}
}
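
// The batching contract runTx relies on, sketched as a hypothetical interface
// (the real clientBatchSender is defined elsewhere in this package; the
// thresholds come from MessageClientConfig):
//
//	type batchSender interface {
//		// Append buffers msg, sending the accumulated batch once it
//		// would exceed MaxBatchBytes or MaxBatchCount.
//		Append(msg MessageEntry) error
//		// Flush sends whatever is buffered; an empty flush is a no-op,
//		// so the BatchSendInterval ticker never emits empty packets.
//		Flush() error
//	}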

// retrySending retries sending messages when the gRPC stream is re-established.
func (c *grpcMessageClient) retrySending(ctx context.Context, stream MessageClientStream) error {
	topicsCloned := make(map[string]*topicEntry)
	c.topicMu.RLock()
	for k, v := range c.topics {
		topicsCloned[k] = v
	}
	c.topicMu.RUnlock()

	batcher := c.newSenderFn(stream)
	for topic, tpk := range topicsCloned {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		default:
		}

		tpk.sentMessageMu.Lock()

		if queueHead, ok := tpk.sentMessages.Head(); ok {
			retryFromSeq := queueHead.Sequence
			log.Info("peer-to-peer client retrying",
				zap.String("topic", topic),
				zap.Int64("fromSeq", retryFromSeq))
		}

		for it := tpk.sentMessages.Begin(); it.Valid(); it.Next() {
			msg := it.Value()
			log.Debug("retry sending msg",
				zap.String("topic", msg.Topic),
				zap.Int64("seq", msg.Sequence))

			err := batcher.Append(&p2p.MessageEntry{
				Topic:    msg.Topic,
				Content:  msg.Content,
				Sequence: msg.Sequence,
			})
			if err != nil {
				tpk.sentMessageMu.Unlock()
				return errors.Trace(err)
			}
		}

		if err := batcher.Flush(); err != nil {
			tpk.sentMessageMu.Unlock()
			return errors.Trace(err)
		}

		tpk.sentMessageMu.Unlock()
	}

	return nil
}
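
// A worked example of the retry bookkeeping (numbers are illustrative):
//
//	// runTx has sent sequences 1..5 on topic "t": sentMessages = [1 2 3 4 5]
//	// runRx receives an ACK with LastSeq == 3:    sentMessages = [4 5]
//	// the stream breaks and is re-established:
//	// retrySending replays exactly 4 and 5 from the queue head
//
// Only the unacknowledged suffix is ever replayed, since runRx pops every
// acknowledged entry as ACKs arrive.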

func (c *grpcMessageClient) runRx(ctx context.Context, stream MessageClientStream) error {
	peerAddr := unknownPeerLabel
	peer, ok := gRPCPeer.FromContext(stream.Context())
	if ok {
		peerAddr = peer.Addr.String()
	}
	metricsClientAckCount := clientAckCount.With(prometheus.Labels{
		"from": peerAddr,
	})

	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		default:
		}

		resp, err := stream.Recv()
		if err != nil {
			return errors.Trace(err)
		}
		switch resp.GetExitReason() {
		case p2p.ExitReason_OK:
			break
		case p2p.ExitReason_CAPTURE_ID_MISMATCH:
			return cerrors.ErrPeerMessageClientPermanentFail.GenWithStackByArgs(resp.GetErrorMessage())
		default:
			return cerrors.ErrPeerMessageServerClosed.GenWithStackByArgs(resp.GetErrorMessage())
		}

		metricsClientAckCount.Inc()

		for _, ack := range resp.GetAck() {
			c.topicMu.RLock()
			tpk, ok := c.topics[ack.GetTopic()]
			c.topicMu.RUnlock()
			if !ok {
				log.Warn("Received ACK for unknown topic", zap.String("topic", ack.GetTopic()))
				continue
			}

			tpk.ack.Store(ack.GetLastSeq())
			tpk.sentMessageMu.Lock()
			tpk.sentMessages.RangeAndPop(func(msg *p2p.MessageEntry) bool {
				return msg.Sequence <= ack.GetLastSeq()
			})
			tpk.sentMessageMu.Unlock()
		}
	}
}
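
// RangeAndPop above dequeues entries from the head of sentMessages for as
// long as the callback returns true, pruning every message the ACK covers.
// An equivalent loop, sketched (Pop is a hypothetical stand-in for the
// ChunkQueue dequeue method; this file only uses Head and RangeAndPop):
//
//	for {
//		msg, ok := tpk.sentMessages.Head()
//		if !ok || msg.Sequence > ack.GetLastSeq() {
//			break
//		}
//		tpk.sentMessages.Pop() // hypothetical dequeue
//	}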

// SendMessage sends a message. It will block if the client is currently not
// ready to accept the message. Once the function returns without an error,
// the client will try its best to send the message, until `Run` is canceled.
func (c *grpcMessageClient) SendMessage(ctx context.Context, topic Topic, value interface{}) (seq Seq, ret error) {
	return c.sendMessage(ctx, topic, value, false)
}

// TrySendMessage tries to send a message. It will return ErrPeerMessageSendTryAgain
// if the client is not ready to accept the message.
func (c *grpcMessageClient) TrySendMessage(ctx context.Context, topic Topic, value interface{}) (seq Seq, ret error) {
	// FIXME (zixiong): This is a temporary way for testing client congestion.
	// This failpoint will be removed once we abstract the MessageClient as an interface.
	failpoint.Inject("ClientInjectSendMessageTryAgain", func() {
		failpoint.Return(0, cerrors.ErrPeerMessageSendTryAgain.GenWithStackByArgs())
	})

	// FIXME (zixiong): This is a temporary way for testing whether the caller can handle this error.
	failpoint.Inject("ClientInjectClosed", func() {
		failpoint.Return(0, cerrors.ErrPeerMessageClientClosed.GenWithStackByArgs())
	})

	return c.sendMessage(ctx, topic, value, true)
}
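
// How a caller might drive TrySendMessage (illustrative only; the backoff
// duration is a placeholder):
//
//	seq, err := client.TrySendMessage(ctx, "my-topic", value)
//	if cerrors.ErrPeerMessageSendTryAgain.Equal(err) {
//		// The client is congested: back off and retry later, or fall
//		// back to the blocking SendMessage.
//		time.Sleep(10 * time.Millisecond)
//	}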

func (c *grpcMessageClient) sendMessage(ctx context.Context, topic Topic, value interface{}, nonblocking bool) (seq Seq, ret error) {
	if c.isClosed.Load() {
		return 0, cerrors.ErrPeerMessageClientClosed.GenWithStackByArgs()
	}

	c.topicMu.RLock()
	tpk, ok := c.topics[topic]
	c.topicMu.RUnlock()

	if !ok {
		tpk = &topicEntry{
			sentMessages: *queue.NewChunkQueue[*p2p.MessageEntry](),
		}
		tpk.nextSeq.Store(0)
		c.topicMu.Lock()
		// Re-check under the write lock: another goroutine may have
		// registered the topic between the RUnlock above and this Lock.
		if newTpk, ok := c.topics[topic]; !ok {
			c.topics[topic] = tpk
		} else {
			tpk = newTpk
		}
		c.topicMu.Unlock()
	}

	data, err := marshalMessage(value)
	if err != nil {
		return 0, cerrors.WrapError(cerrors.ErrPeerMessageEncodeError, err)
	}

	if nonblocking {
		ok, seq := c.sendCh.SendAsync(topic, data, tpk.nextSeq.Inc)
		if !ok {
			return 0, cerrors.ErrPeerMessageSendTryAgain.GenWithStackByArgs()
		}
		return seq, nil
	}
	// blocking
	seq, err = c.sendCh.SendSync(ctx, topic, data, c.closeCh, tpk.nextSeq.Inc)
	if err != nil {
		return 0, errors.Trace(err)
	}
	return seq, nil
}

// CurrentAck returns (s, true) if all messages with sequence less than or
// equal to s have been processed by the receiver. It returns (0, false) if
// no message for `topic` has been sent.
func (c *grpcMessageClient) CurrentAck(topic Topic) (Seq, bool) {
	c.topicMu.RLock()
	defer c.topicMu.RUnlock()

	tpk, ok := c.topics[topic]
	if !ok {
		return 0, false
	}

	return tpk.ack.Load(), true
}
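
// Sketch of how a caller might block until a specific message has been
// acknowledged, by polling CurrentAck (illustrative only; the poll interval
// is arbitrary):
//
//	seq, _ := client.SendMessage(ctx, "my-topic", value)
//	for {
//		if acked, ok := client.CurrentAck("my-topic"); ok && acked >= seq {
//			break // everything up to seq has been processed by the receiver
//		}
//		time.Sleep(50 * time.Millisecond)
//	}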