github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/p2p/grpc_client.go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package p2p

import (
	"context"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tiflow/pkg/container/queue"
	cerrors "github.com/pingcap/tiflow/pkg/errors"
	"github.com/pingcap/tiflow/pkg/p2p/internal"
	"github.com/pingcap/tiflow/pkg/security"
	"github.com/pingcap/tiflow/proto/p2p"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/atomic"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
	gRPCPeer "google.golang.org/grpc/peer"
)

// grpcMessageClient is a client used to send peer messages.
// `Run` must be running before sending any message.
type grpcMessageClient struct {
	sendCh *internal.SendChan

	topicMu sync.RWMutex
	topics  map[string]*topicEntry

	senderID NodeID

	closeCh  chan struct{}
	isClosed atomic.Bool

	// config is read only
	config *MessageClientConfig

	// newSenderFn is used to create a new sender.
	// It can be replaced for unit testing MessageClient.
	newSenderFn func(MessageClientStream) clientBatchSender[MessageEntry]

	connector clientConnector
}

type topicEntry struct {
	sentMessageMu sync.Mutex
	sentMessages  queue.ChunkQueue[*p2p.MessageEntry]

	nextSeq  atomic.Int64
	ack      atomic.Int64
	lastSent atomic.Int64
}

// NewGrpcMessageClient creates a new MessageClient.
// senderID is an identifier for the local node.
func NewGrpcMessageClient(senderID NodeID, config *MessageClientConfig) *grpcMessageClient {
	return &grpcMessageClient{
		sendCh:   internal.NewSendChan(int64(config.SendChannelSize)),
		topics:   make(map[string]*topicEntry),
		senderID: senderID,
		closeCh:  make(chan struct{}),
		config:   config,
		newSenderFn: func(stream MessageClientStream) clientBatchSender[MessageEntry] {
			return newClientBatchSender(stream, config.MaxBatchBytes, config.MaxBatchCount)
		},
		connector: newClientConnector(),
	}
}
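
// Illustrative usage sketch: a typical caller constructs the client, starts its
// background loop with Run, and then sends messages on a topic. The address,
// node IDs, topic name, payload and config values below are placeholders, and
// error handling is elided.
//
//	cfg := &MessageClientConfig{ /* sizes, timeouts, batching options */ }
//	client := NewGrpcMessageClient("capture-1", cfg)
//
//	// Run must be started before any message can actually be delivered.
//	go func() {
//		_ = client.Run(ctx, "tcp", "127.0.0.1:8300", "capture-2", &security.Credential{})
//	}()
//
//	// The returned seq can later be compared with CurrentAck("my-topic") to
//	// check whether the receiver has processed the message.
//	seq, _ := client.SendMessage(ctx, "my-topic", myValue)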

// Run launches background goroutines for MessageClient to work.
func (c *grpcMessageClient) Run(
	ctx context.Context, network, addr string,
	receiverID NodeID,
	credential *security.Credential,
) (ret error) {
	defer func() {
		c.isClosed.Store(true)
		close(c.closeCh)
	}()

	metricsClientCount := clientCount.With(prometheus.Labels{
		"to": addr,
	})
	metricsClientCount.Inc()
	defer metricsClientCount.Dec()

	rl := rate.NewLimiter(rate.Limit(c.config.RetryRateLimitPerSecond), 1)
	epoch := int64(0)
	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		default:
		}

		if err := rl.Wait(ctx); err != nil {
			return errors.Trace(err)
		}

		gRPCClient, release, err := c.connector.Connect(clientConnectOptions{
			network:        network,
			addr:           addr,
			credential:     credential,
			timeout:        c.config.DialTimeout,
			maxRecvMsgSize: c.config.MaxRecvMsgSize,
		})
		if err != nil {
			log.Warn("peer-message client: failed to connect to server",
				zap.Error(err))
			continue
		}

		epoch++
		streamMeta := &p2p.StreamMeta{
			SenderId:             c.senderID,
			ReceiverId:           receiverID,
			Epoch:                epoch,
			ClientVersion:        c.config.ClientVersion,
			SenderAdvertisedAddr: c.config.AdvertisedAddr,
		}

		err = c.launchStream(ctx, gRPCClient, streamMeta)
		if cerrors.ErrPeerMessageClientPermanentFail.Equal(err) {
			release()
			return errors.Trace(err)
		}
		log.Warn("peer message client detected error, restarting", zap.Error(err))
		release()
		continue
	}
}

func (c *grpcMessageClient) launchStream(ctx context.Context, gRPCClient p2p.CDCPeerToPeerClient, meta *p2p.StreamMeta) error {
	failpoint.Inject("InjectClientPermanentFailure", func() {
		failpoint.Return(cerrors.ErrPeerMessageClientPermanentFail.GenWithStackByArgs())
	})

	cancelCtx, cancelStream := context.WithCancel(ctx)
	defer cancelStream()

	clientStream, err := gRPCClient.SendMessage(cancelCtx)
	if err != nil {
		return errors.Trace(err)
	}

	err = clientStream.Send(&p2p.MessagePacket{Meta: meta})
	if err != nil {
		return errors.Trace(err)
	}

	return errors.Trace(c.run(ctx, clientStream, cancelStream))
}

func (c *grpcMessageClient) run(ctx context.Context, stream MessageClientStream, cancel func()) error {
	errg, ctx := errgroup.WithContext(ctx)

	errg.Go(func() error {
		defer cancel()
		return c.runTx(ctx, stream)
	})

	errg.Go(func() error {
		defer cancel()
		return c.runRx(ctx, stream)
	})

	return errors.Trace(errg.Wait())
}
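
// Note on the pattern used by run above (an illustrative, self-contained
// sketch; tx and rx stand in for runTx and runRx): both goroutines defer the
// stream's cancel function, so when either side fails, the gRPC stream context
// is canceled, the blocked Send/Recv in the sibling goroutine returns, and
// errg.Wait() surfaces the first error to the caller.
//
//	errg, ctx := errgroup.WithContext(ctx)
//	errg.Go(func() error { defer cancel(); return tx(ctx) })
//	errg.Go(func() error { defer cancel(); return rx(ctx) })
//	return errg.Wait()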

func (c *grpcMessageClient) runTx(ctx context.Context, stream MessageClientStream) error {
	if err := c.retrySending(ctx, stream); err != nil {
		return errors.Trace(err)
	}

	peerAddr := unknownPeerLabel
	peer, ok := gRPCPeer.FromContext(stream.Context())
	if ok {
		peerAddr = peer.Addr.String()
	}
	metricsClientMessageCount := clientMessageCount.With(prometheus.Labels{
		"to": peerAddr,
	})

	ticker := time.NewTicker(c.config.BatchSendInterval)
	defer ticker.Stop()

	batchSender := c.newSenderFn(stream)

	for {
		msg, ok, err := c.sendCh.Receive(ctx, ticker.C)
		if err != nil {
			return errors.Trace(err)
		}
		if !ok {
			// `ticker` has fired and we have not received any message.
			// We try to flush whatever message we already have.
			// The implementation of batchSender guarantees that
			// an empty flush does not send any message.
			if err := batchSender.Flush(); err != nil {
				return errors.Trace(err)
			}
			continue
		}

		c.topicMu.RLock()
		tpk, ok := c.topics[msg.Topic]
		c.topicMu.RUnlock()
		if !ok {
			// This line should never be reachable unless there is a bug in this file.
			log.Panic("topic not found. Report a bug", zap.String("topic", msg.Topic))
		}

		// We want to assert that `msg.Sequence` is continuous within a topic.
		if old := tpk.lastSent.Swap(msg.Sequence); old != initAck && msg.Sequence != old+1 {
			log.Panic("unexpected seq of message",
				zap.String("topic", msg.Topic),
				zap.Int64("seq", msg.Sequence))
		}

		tpk.sentMessageMu.Lock()
		tpk.sentMessages.Push(msg)
		tpk.sentMessageMu.Unlock()

		metricsClientMessageCount.Inc()

		log.Debug("Sending Message",
			zap.String("topic", msg.Topic),
			zap.Int64("seq", msg.Sequence))
		if err := batchSender.Append(msg); err != nil {
			return errors.Trace(err)
		}
	}
}

// retrySending retries sending messages when the gRPC stream is re-established.
func (c *grpcMessageClient) retrySending(ctx context.Context, stream MessageClientStream) error {
	topicsCloned := make(map[string]*topicEntry)
	c.topicMu.RLock()
	for k, v := range c.topics {
		topicsCloned[k] = v
	}
	c.topicMu.RUnlock()

	batcher := c.newSenderFn(stream)
	for topic, tpk := range topicsCloned {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		default:
		}

		tpk.sentMessageMu.Lock()

		if queueHead, ok := tpk.sentMessages.Head(); ok {
			retryFromSeq := queueHead.Sequence
			log.Info("peer-to-peer client retrying",
				zap.String("topic", topic),
				zap.Int64("fromSeq", retryFromSeq))
		}

		for it := tpk.sentMessages.Begin(); it.Valid(); it.Next() {
			msg := it.Value()
			log.Debug("retry sending msg",
				zap.String("topic", msg.Topic),
				zap.Int64("seq", msg.Sequence))

			err := batcher.Append(&p2p.MessageEntry{
				Topic:    msg.Topic,
				Content:  msg.Content,
				Sequence: msg.Sequence,
			})
			if err != nil {
				tpk.sentMessageMu.Unlock()
				return errors.Trace(err)
			}
		}

		if err := batcher.Flush(); err != nil {
			tpk.sentMessageMu.Unlock()
			return errors.Trace(err)
		}

		tpk.sentMessageMu.Unlock()
	}

	return nil
}
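
// Worked example of the ack/retry invariant maintained below (illustrative;
// the concrete sequence numbers are made up): sentMessages only ever holds
// entries the server has not yet acknowledged. If a topic has buffered
// sequences [4, 5, 6, 7] and runRx receives an ack with LastSeq = 5, the
// RangeAndPop call pops 4 and 5 and keeps 6 and 7, so a later retrySending on
// a reconnected stream resends exactly the unacknowledged tail:
//
//	tpk.sentMessages.RangeAndPop(func(msg *p2p.MessageEntry) bool {
//		return msg.Sequence <= ack.GetLastSeq() // pop while acknowledged
//	})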

func (c *grpcMessageClient) runRx(ctx context.Context, stream MessageClientStream) error {
	peerAddr := unknownPeerLabel
	peer, ok := gRPCPeer.FromContext(stream.Context())
	if ok {
		peerAddr = peer.Addr.String()
	}
	metricsClientAckCount := clientAckCount.With(prometheus.Labels{
		"from": peerAddr,
	})

	for {
		select {
		case <-ctx.Done():
			return errors.Trace(ctx.Err())
		default:
		}

		resp, err := stream.Recv()
		if err != nil {
			return errors.Trace(err)
		}
		switch resp.GetExitReason() {
		case p2p.ExitReason_OK:
			break
		case p2p.ExitReason_CAPTURE_ID_MISMATCH:
			return cerrors.ErrPeerMessageClientPermanentFail.GenWithStackByArgs(resp.GetErrorMessage())
		default:
			return cerrors.ErrPeerMessageServerClosed.GenWithStackByArgs(resp.GetErrorMessage())
		}

		metricsClientAckCount.Inc()

		for _, ack := range resp.GetAck() {
			c.topicMu.RLock()
			tpk, ok := c.topics[ack.GetTopic()]
			c.topicMu.RUnlock()
			if !ok {
				log.Warn("Received ACK for unknown topic", zap.String("topic", ack.GetTopic()))
				continue
			}

			tpk.ack.Store(ack.GetLastSeq())
			tpk.sentMessageMu.Lock()
			tpk.sentMessages.RangeAndPop(func(msg *p2p.MessageEntry) bool {
				return msg.Sequence <= ack.GetLastSeq()
			})
			tpk.sentMessageMu.Unlock()
		}
	}
}

// SendMessage sends a message. It will block if the client is not ready to
// accept the message at the moment. Once the function returns without an error,
// the client will try its best to send the message, until `Run` is canceled.
func (c *grpcMessageClient) SendMessage(ctx context.Context, topic Topic, value interface{}) (seq Seq, ret error) {
	return c.sendMessage(ctx, topic, value, false)
}

// TrySendMessage tries to send a message. It will return ErrPeerMessageSendTryAgain
// if the client is not ready to accept the message.
func (c *grpcMessageClient) TrySendMessage(ctx context.Context, topic Topic, value interface{}) (seq Seq, ret error) {
	// FIXME (zixiong): This is a temporary way for testing client congestion.
	// This failpoint will be removed once we abstract the MessageClient as an interface.
	failpoint.Inject("ClientInjectSendMessageTryAgain", func() {
		failpoint.Return(0, cerrors.ErrPeerMessageSendTryAgain.GenWithStackByArgs())
	})

	// FIXME (zixiong): This is a temporary way for testing whether the caller can handle this error.
	failpoint.Inject("ClientInjectClosed", func() {
		failpoint.Return(0, cerrors.ErrPeerMessageClientClosed.GenWithStackByArgs())
	})

	return c.sendMessage(ctx, topic, value, true)
}

func (c *grpcMessageClient) sendMessage(ctx context.Context, topic Topic, value interface{}, nonblocking bool) (seq Seq, ret error) {
	if c.isClosed.Load() {
		return 0, cerrors.ErrPeerMessageClientClosed.GenWithStackByArgs()
	}

	c.topicMu.RLock()
	tpk, ok := c.topics[topic]
	c.topicMu.RUnlock()

	if !ok {
		tpk = &topicEntry{
			sentMessages: *queue.NewChunkQueue[*p2p.MessageEntry](),
		}
		tpk.nextSeq.Store(0)
		c.topicMu.Lock()
		if newTpk, ok := c.topics[topic]; !ok {
			c.topics[topic] = tpk
		} else {
			tpk = newTpk
		}
		c.topicMu.Unlock()
	}

	data, err := marshalMessage(value)
	if err != nil {
		return 0, cerrors.WrapError(cerrors.ErrPeerMessageEncodeError, err)
	}

	if nonblocking {
		ok, seq := c.sendCh.SendAsync(topic, data, tpk.nextSeq.Inc)
		if !ok {
			return 0, cerrors.ErrPeerMessageSendTryAgain.GenWithStackByArgs()
		}
		return seq, nil
	}
	// blocking
	seq, err = c.sendCh.SendSync(ctx, topic, data, c.closeCh, tpk.nextSeq.Inc)
	if err != nil {
		return 0, errors.Trace(err)
	}
	return seq, nil
}

// CurrentAck returns (s, true) if all messages with sequence less than or
// equal to s have been processed by the receiver. It returns (0, false) if
// no message for `topic` has been sent.
func (c *grpcMessageClient) CurrentAck(topic Topic) (Seq, bool) {
	c.topicMu.RLock()
	defer c.topicMu.RUnlock()

	tpk, ok := c.topics[topic]
	if !ok {
		return 0, false
	}

	return tpk.ack.Load(), true
}
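
// Illustrative usage sketch combining SendMessage with CurrentAck (the topic
// name, polling interval and variable names are placeholders): a caller that
// needs delivery confirmation can poll until the acknowledged sequence reaches
// the seq returned by SendMessage.
//
//	seq, err := client.SendMessage(ctx, "my-topic", value)
//	if err != nil {
//		return err
//	}
//	for {
//		if acked, ok := client.CurrentAck("my-topic"); ok && acked >= seq {
//			break // the receiver has processed the message
//		}
//		select {
//		case <-ctx.Done():
//			return ctx.Err()
//		case <-time.After(100 * time.Millisecond):
//		}
//	}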