github.com/wfusion/gofusion@v1.1.14/common/infra/watermill/pubsub/kafka/subscriber.go (about) 1 package kafka 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "sync" 8 "sync/atomic" 9 "time" 10 11 "github.com/IBM/sarama" 12 "github.com/pkg/errors" 13 "go.uber.org/multierr" 14 15 "github.com/wfusion/gofusion/common/infra/watermill" 16 "github.com/wfusion/gofusion/common/infra/watermill/message" 17 "github.com/wfusion/gofusion/common/utils" 18 ) 19 20 type Subscriber struct { 21 config SubscriberConfig 22 logger watermill.LoggerAdapter 23 24 closing chan struct{} 25 subscribersWg sync.WaitGroup 26 27 closed uint32 28 } 29 30 // NewSubscriber creates a new Kafka Subscriber. 31 func NewSubscriber( 32 config SubscriberConfig, 33 logger watermill.LoggerAdapter, 34 ) (sub *Subscriber, err error) { 35 config.setDefaults() 36 37 if err = config.Validate(); err != nil { 38 return 39 } 40 41 if logger == nil { 42 logger = watermill.NopLogger{} 43 } 44 45 logger = logger.With(watermill.LogFields{ 46 "subscriber_uuid": utils.ShortUUID(), 47 }) 48 49 sub = &Subscriber{ 50 config: config, 51 logger: logger, 52 53 closing: make(chan struct{}), 54 } 55 56 return 57 } 58 59 type SubscriberConfig struct { 60 // Kafka brokers list. 61 Brokers []string 62 63 // Unmarshaler is used to unmarshal messages from Kafka format into Watermill format. 64 Unmarshaler Unmarshaler 65 66 // OverwriteSaramaConfig holds additional sarama settings. 67 OverwriteSaramaConfig *sarama.Config 68 69 // Kafka consumer group. 70 // When empty, all messages from all partitions will be returned. 71 ConsumerGroup string 72 73 // How long after Nack message should be redelivered. 74 NackResendSleep time.Duration 75 76 // How long about unsuccessful reconnecting next reconnect will occur. 77 ReconnectRetrySleep time.Duration 78 79 InitializeTopicDetails *sarama.TopicDetail 80 81 // Tracer is used to trace Kafka messages. 82 // If nil, then no tracing will be used. 83 // Tracer SaramaTracer 84 } 85 86 // NoSleep can be set to SubscriberConfig.NackResendSleep and SubscriberConfig.ReconnectRetrySleep. 87 const NoSleep time.Duration = -1 88 89 func (c *SubscriberConfig) setDefaults() { 90 if c.OverwriteSaramaConfig == nil { 91 c.OverwriteSaramaConfig = DefaultSaramaSubscriberConfig() 92 } 93 if c.NackResendSleep == 0 { 94 c.NackResendSleep = time.Millisecond * 100 95 } 96 if c.ReconnectRetrySleep == 0 { 97 c.ReconnectRetrySleep = time.Second 98 } 99 } 100 101 func (c SubscriberConfig) Validate() error { 102 if len(c.Brokers) == 0 { 103 return errors.New("missing brokers") 104 } 105 if c.Unmarshaler == nil { 106 return errors.New("missing unmarshaler") 107 } 108 109 return nil 110 } 111 112 // DefaultSaramaSubscriberConfig creates default Sarama config used by Watermill. 113 // 114 // Custom config can be passed to NewSubscriber and NewPublisher. 115 // 116 // saramaConfig := DefaultSaramaSubscriberConfig() 117 // saramaConfig.Consumer.Offsets.Initial = sarama.OffsetOldest 118 // 119 // subscriberConfig.OverwriteSaramaConfig = saramaConfig 120 // 121 // subscriber, err := NewSubscriber(subscriberConfig, logger) 122 // // ... 123 func DefaultSaramaSubscriberConfig() *sarama.Config { 124 config := sarama.NewConfig() 125 config.Version = sarama.V1_0_0_0 126 config.Consumer.Return.Errors = true 127 config.ClientID = "watermill" 128 129 return config 130 } 131 132 // Subscribe subscribers for messages in Kafka. 133 // 134 // There are multiple subscribers spawned 135 func (s *Subscriber) Subscribe(ctx context.Context, topic string) (<-chan *message.Message, error) { 136 if atomic.LoadUint32(&s.closed) == 1 { 137 return nil, errors.New("subscriber closed") 138 } 139 140 s.subscribersWg.Add(1) 141 142 logFields := watermill.LogFields{ 143 "provider": "kafka", 144 "topic": topic, 145 "consumer_group": s.config.ConsumerGroup, 146 "kafka_consumer_uuid": utils.ShortUUID(), 147 } 148 s.logger.Info("[Common] watermill subscribing to Kafka topic", logFields) 149 150 // we don't want to have buffered channel to not consume message from Kafka when consumer is not consuming 151 output := make(chan *message.Message) 152 153 consumeClosed, err := s.consumeMessages(ctx, topic, output, logFields) 154 if err != nil { 155 s.subscribersWg.Done() 156 return nil, err 157 } 158 159 go func() { 160 // blocking, until s.closing is closed 161 s.handleReconnects(ctx, topic, output, consumeClosed, logFields) 162 close(output) 163 s.subscribersWg.Done() 164 }() 165 166 return output, nil 167 } 168 169 func (s *Subscriber) handleReconnects( 170 ctx context.Context, 171 topic string, 172 output chan *message.Message, 173 consumeClosed chan struct{}, 174 logFields watermill.LogFields, 175 ) { 176 for { 177 // nil channel will cause deadlock 178 if consumeClosed != nil { 179 <-consumeClosed 180 s.logger.Debug("[Common] watermill kafka consumeMessages stopped", logFields) 181 } else { 182 s.logger.Debug("[Common] watermill kafka empty consumeClosed", logFields) 183 } 184 185 select { 186 // it's important to don't exit before consumeClosed, 187 // to not trigger s.subscribersWg.Done() before consumer is closed 188 case <-s.closing: 189 s.logger.Debug("[Common] watermill kafka closing subscriber, no reconnect needed", logFields) 190 return 191 case <-ctx.Done(): 192 s.logger.Debug("[Common] watermill kafka ctx cancelled, no reconnect needed", logFields) 193 return 194 default: 195 s.logger.Debug("[Common] watermill kafka not closing, reconnecting", logFields) 196 } 197 198 s.logger.Info("[Common] watermill kafka reconnecting consumer", logFields) 199 200 var err error 201 consumeClosed, err = s.consumeMessages(ctx, topic, output, logFields) 202 if err != nil { 203 s.logger.Error("[Common] watermill kafka cannot reconnect messages consumer", err, logFields) 204 205 if s.config.ReconnectRetrySleep != NoSleep { 206 time.Sleep(s.config.ReconnectRetrySleep) 207 } 208 continue 209 } 210 } 211 } 212 213 func (s *Subscriber) consumeMessages( 214 ctx context.Context, 215 topic string, 216 output chan *message.Message, 217 logFields watermill.LogFields, 218 ) (consumeMessagesClosed chan struct{}, err error) { 219 s.logger.Info("[Common] watermill kafka starting consuming", logFields) 220 221 // Start with a client 222 client, err := sarama.NewClient(s.config.Brokers, s.config.OverwriteSaramaConfig) 223 if err != nil { 224 return nil, errors.Wrap(err, "cannot create new Sarama client") 225 } 226 227 ctx, cancel := context.WithCancel(ctx) 228 go func() { 229 select { 230 case <-s.closing: 231 s.logger.Debug("[Common] watermill kafka closing subscriber, cancelling consumeMessages", logFields) 232 cancel() 233 case <-ctx.Done(): 234 // avoid goroutine leak 235 } 236 }() 237 238 if s.config.ConsumerGroup == "" { 239 consumeMessagesClosed, err = s.consumeWithoutConsumerGroups(ctx, client, topic, output, logFields) 240 } else { 241 consumeMessagesClosed, err = s.consumeGroupMessages(ctx, client, topic, output, logFields) 242 } 243 if err != nil { 244 s.logger.Debug( 245 "[Common] watermill kafka starting consume failed, cancelling context", 246 logFields.Add(watermill.LogFields{"err": err}), 247 ) 248 cancel() 249 return nil, err 250 } 251 252 go func() { 253 <-consumeMessagesClosed 254 if err := client.Close(); err != nil { 255 s.logger.Error("[Common] watermill kafka cannot close client", err, logFields) 256 } else { 257 s.logger.Debug("[Common] watermill kafka client closed", logFields) 258 } 259 }() 260 261 return consumeMessagesClosed, nil 262 } 263 264 func (s *Subscriber) consumeGroupMessages( 265 ctx context.Context, 266 client sarama.Client, 267 topic string, 268 output chan *message.Message, 269 logFields watermill.LogFields, 270 // tracer SaramaTracer, 271 ) (chan struct{}, error) { 272 // Start a new consumer group 273 group, err := sarama.NewConsumerGroupFromClient(s.config.ConsumerGroup, client) 274 if err != nil { 275 return nil, errors.Wrap(err, "cannot create consumer group client") 276 } 277 278 groupClosed := make(chan struct{}) 279 280 handleGroupErrorsCtx, cancelHandleGroupErrors := context.WithCancel(context.Background()) 281 handleGroupErrorsDone := s.handleGroupErrors(handleGroupErrorsCtx, group, logFields) 282 283 handler := &consumerGroupHandler{ 284 ctx: ctx, 285 messageHandler: s.createMessagesHandler(output), 286 logger: s.logger, 287 closing: s.closing, 288 running: make(chan struct{}), 289 messageLogFields: logFields, 290 } 291 292 // if tracer != nil { 293 // handler = tracer.WrapConsumerGroupHandler(handler) 294 // } 295 296 go func() { 297 defer func() { 298 cancelHandleGroupErrors() 299 <-handleGroupErrorsDone 300 301 if err := group.Close(); err != nil { 302 s.logger.Error("[Common] watermill kafka group close with error", err, logFields) 303 } 304 305 s.logger.Info("[Common] watermill kafka consuming done", logFields) 306 close(groupClosed) 307 }() 308 309 ConsumeLoop: 310 for { 311 select { 312 default: 313 s.logger.Debug("[Common] watermill kafka not closing", logFields) 314 case <-s.closing: 315 s.logger.Debug("[Common] watermill kafka subscriber is closing, stopping group.Consume loop", 316 logFields) 317 break ConsumeLoop 318 case <-ctx.Done(): 319 s.logger.Debug("[Common] watermill kafka ctx was cancelled, stopping group.Consume loop", 320 logFields) 321 break ConsumeLoop 322 } 323 324 if err := group.Consume(ctx, []string{topic}, handler); err != nil { 325 if err == sarama.ErrUnknown { 326 // this is info, because it is often just noise 327 s.logger.Error("[Common] watermill kafka received unknown Sarama error", err, logFields) 328 } else { 329 s.logger.Error("[Common] watermill kafka group consume error", err, logFields) 330 } 331 332 break ConsumeLoop 333 } 334 335 // this is expected behaviour to run Consume again after it exited 336 // see: https://github.com/wfusion/gofusion/common/infra/watermill/issues/210 337 s.logger.Debug("[Common] watermill kafka consume stopped without any error, running consume again", 338 logFields) 339 } 340 }() 341 342 <-handler.running 343 return groupClosed, nil 344 } 345 346 func (s *Subscriber) handleGroupErrors( 347 ctx context.Context, 348 group sarama.ConsumerGroup, 349 logFields watermill.LogFields, 350 ) chan struct{} { 351 done := make(chan struct{}) 352 353 go func() { 354 defer close(done) 355 for { 356 select { 357 case err, ok := <-group.Errors(): 358 if !ok { 359 return 360 } 361 if err == nil { 362 continue 363 } 364 365 s.logger.Error("[Common] watermill kafka sarama internal error", err, logFields) 366 case <-ctx.Done(): 367 return 368 } 369 } 370 }() 371 372 return done 373 } 374 375 func (s *Subscriber) consumeWithoutConsumerGroups( 376 ctx context.Context, 377 client sarama.Client, 378 topic string, 379 output chan *message.Message, 380 logFields watermill.LogFields, 381 // tracer SaramaTracer, 382 ) (chan struct{}, error) { 383 consumer, err := sarama.NewConsumerFromClient(client) 384 if err != nil { 385 return nil, errors.Wrap(err, "cannot create consumer client") 386 } 387 388 // if tracer != nil { 389 // consumer = tracer.WrapConsumer(consumer) 390 // } 391 392 partitions, err := consumer.Partitions(topic) 393 if err != nil { 394 return nil, errors.Wrap(err, "cannot get partitions") 395 } 396 397 partitionConsumersWg := new(sync.WaitGroup) 398 399 for _, partition := range partitions { 400 partitionLogFields := logFields.Add(watermill.LogFields{"kafka_partition": partition}) 401 402 partitionConsumer, err := consumer.ConsumePartition(topic, 403 partition, s.config.OverwriteSaramaConfig.Consumer.Offsets.Initial) 404 if err != nil { 405 if err := client.Close(); err != nil && err != sarama.ErrClosedClient { 406 s.logger.Error("[Common] watermill kafka cannot close client", err, partitionLogFields) 407 } 408 return nil, errors.Wrap(err, "failed to start consumer for partition") 409 } 410 411 // if tracer != nil { 412 // partitionConsumer = tracer.WrapPartitionConsumer(partitionConsumer) 413 // } 414 415 messageHandler := s.createMessagesHandler(output) 416 417 partitionConsumersWg.Add(1) 418 go s.consumePartition(ctx, partitionConsumer, messageHandler, partitionConsumersWg, partitionLogFields) 419 } 420 421 closed := make(chan struct{}) 422 go func() { 423 partitionConsumersWg.Wait() 424 close(closed) 425 }() 426 427 return closed, nil 428 } 429 430 func (s *Subscriber) consumePartition( 431 ctx context.Context, 432 partitionConsumer sarama.PartitionConsumer, 433 messageHandler messageHandler, 434 partitionConsumersWg *sync.WaitGroup, 435 logFields watermill.LogFields, 436 ) { 437 defer func() { 438 if err := partitionConsumer.Close(); err != nil { 439 s.logger.Error("[Common] watermill kafka cannot close partition consumer", err, logFields) 440 } 441 partitionConsumersWg.Done() 442 s.logger.Debug("[Common] watermill kafka consumePartition stopped", logFields) 443 }() 444 445 kafkaMessages := partitionConsumer.Messages() 446 447 for { 448 select { 449 case kafkaMsg := <-kafkaMessages: 450 if kafkaMsg == nil { 451 s.logger.Debug("[Common] watermill kafkaMsg is closed, stopping consumePartition", logFields) 452 return 453 } 454 if err := messageHandler.processMessage(ctx, kafkaMsg, nil, logFields); err != nil { 455 return 456 } 457 case <-s.closing: 458 s.logger.Debug("[Common] watermill kafka subscriber is closing, stopping consumePartition", logFields) 459 return 460 461 case <-ctx.Done(): 462 s.logger.Debug("[Common] watermill kafka ctx was cancelled, stopping consumePartition", logFields) 463 return 464 } 465 } 466 } 467 468 func (s *Subscriber) createMessagesHandler(output chan *message.Message) messageHandler { 469 return messageHandler{ 470 outputChannel: output, 471 saramaCfg: s.config.OverwriteSaramaConfig, 472 unmarshaler: s.config.Unmarshaler, 473 nackResendSleep: s.config.NackResendSleep, 474 logger: s.logger, 475 closing: s.closing, 476 } 477 } 478 479 func (s *Subscriber) Close() error { 480 if !atomic.CompareAndSwapUint32(&s.closed, 0, 1) { 481 return nil 482 } 483 484 close(s.closing) 485 s.subscribersWg.Wait() 486 487 s.logger.Debug("Kafka subscriber closed", nil) 488 489 return nil 490 } 491 492 type consumerGroupHandler struct { 493 ctx context.Context 494 messageHandler messageHandler 495 logger watermill.LoggerAdapter 496 closing chan struct{} 497 messageLogFields watermill.LogFields 498 running chan struct{} 499 } 500 501 func (h *consumerGroupHandler) Setup(sess sarama.ConsumerGroupSession) error { 502 h.logger.Trace( 503 fmt.Sprintf("[Common] watermill kafka get cluster member(%s) generation(%v)", 504 sess.MemberID(), sess.GenerationID()), 505 h.messageLogFields.Copy(), 506 ) 507 h.messageAccepted() 508 return nil 509 } 510 511 func (*consumerGroupHandler) Cleanup(_ sarama.ConsumerGroupSession) error { return nil } 512 513 func (h *consumerGroupHandler) ConsumeClaim(sess sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error { 514 logFields := h.messageLogFields.Copy().Add(watermill.LogFields{ 515 "kafka_partition": claim.Partition(), 516 "kafka_initial_offset": claim.InitialOffset(), 517 }) 518 519 for kafkaMsg := range claim.Messages() { 520 h.logger.Debug("[Common] watermill kafka message claimed", logFields) 521 if err := h.messageHandler.processMessage(h.ctx, kafkaMsg, sess, logFields); err != nil { 522 return err 523 } 524 select { 525 case <-h.closing: 526 h.logger.Debug("[Common] watermill kafka subscriber is closing, stopping consumerGroupHandler", 527 logFields) 528 return nil 529 530 case <-h.ctx.Done(): 531 h.logger.Debug("[Common] watermill kafka ctx was cancelled, stopping consumerGroupHandler", logFields) 532 return nil 533 default: 534 continue 535 } 536 } 537 538 return nil 539 } 540 func (h *consumerGroupHandler) messageAccepted() { 541 select { 542 case _, ok := <-h.running: 543 if !ok { 544 return 545 } 546 default: 547 close(h.running) 548 } 549 } 550 551 type messageHandler struct { 552 outputChannel chan<- *message.Message 553 unmarshaler Unmarshaler 554 saramaCfg *sarama.Config 555 556 nackResendSleep time.Duration 557 558 logger watermill.LoggerAdapter 559 closing chan struct{} 560 } 561 562 func (h messageHandler) processMessage( 563 ctx context.Context, 564 kafkaMsg *sarama.ConsumerMessage, 565 sess sarama.ConsumerGroupSession, 566 messageLogFields watermill.LogFields, 567 ) error { 568 rawMessageID := fmt.Sprintf("partition(%v)_offset(%v)", kafkaMsg.Partition, kafkaMsg.Offset) 569 receivedMsgLogFields := messageLogFields.Add(watermill.LogFields{ 570 "kafka_partition_offset": kafkaMsg.Offset, 571 "kafka_partition": kafkaMsg.Partition, 572 "message_raw_id": rawMessageID, 573 }) 574 575 h.logger.Trace("Received message from Kafka", receivedMsgLogFields) 576 577 ctx = setPartitionToCtx(ctx, kafkaMsg.Partition) 578 ctx = setPartitionOffsetToCtx(ctx, kafkaMsg.Offset) 579 ctx = setMessageTimestampToCtx(ctx, kafkaMsg.Timestamp) 580 ctx = setMessageKeyToCtx(ctx, kafkaMsg.Key) 581 582 msg, err := h.unmarshaler.Unmarshal(kafkaMsg) 583 if err != nil { 584 // resend will make no sense, stopping consumerGroupHandler 585 return errors.Wrap(err, "message unmarshal failed") 586 } 587 588 ctx = context.WithValue(ctx, watermill.ContextKeyMessageUUID, msg.UUID) 589 ctx = context.WithValue(ctx, watermill.ContextKeyRawMessageID, rawMessageID) 590 ctx, cancelCtx := context.WithCancel(ctx) 591 defer cancelCtx() 592 593 msg.SetContext(ctx) 594 msg.Metadata[watermill.ContextKeyMessageUUID] = msg.UUID 595 msg.Metadata[watermill.ContextKeyRawMessageID] = rawMessageID 596 597 // check session has been canceled after re-balancing 598 sessionContext := context.Background() 599 if sess != nil { 600 sessionContext = sess.Context() 601 } 602 603 receivedMsgLogFields = receivedMsgLogFields.Add(watermill.LogFields{ 604 "message_uuid": msg.UUID, 605 }) 606 607 ResendLoop: 608 for { 609 select { 610 case h.outputChannel <- msg: 611 h.logger.Trace("[Common] watermill kafka message sent to consumer", receivedMsgLogFields) 612 case <-h.closing: 613 h.logger.Trace("[Common] watermill kafka closing, message discarded", receivedMsgLogFields) 614 return nil 615 case <-ctx.Done(): 616 h.logger.Trace("[Common] watermill kafka closing, ctx cancelled before sent to consumer", 617 receivedMsgLogFields) 618 return nil 619 case <-sessionContext.Done(): 620 h.logger.Trace("[Common] watermill kafka closing, session ctx cancelled before sent to consumer", 621 receivedMsgLogFields) 622 return nil 623 } 624 625 select { 626 case <-msg.Acked(): 627 if sess != nil { 628 sess.MarkMessage(kafkaMsg, "") 629 if !h.saramaCfg.Consumer.Offsets.AutoCommit.Enable { 630 sess.Commit() 631 } 632 } 633 h.logger.Trace("[Common] watermill message acked", receivedMsgLogFields) 634 break ResendLoop 635 case <-msg.Nacked(): 636 h.logger.Trace("[Common] watermill message nacked", receivedMsgLogFields) 637 638 // reset acks, etc. 639 msg = msg.Copy() 640 msg.SetContext(ctx) 641 if h.nackResendSleep != NoSleep { 642 time.Sleep(h.nackResendSleep) 643 } 644 645 continue ResendLoop 646 case <-h.closing: 647 h.logger.Trace("[Common] watermill kafka closing, message discarded before ack", receivedMsgLogFields) 648 return nil 649 case <-ctx.Done(): 650 h.logger.Trace("[Common] watermill kafka closing, ctx cancelled before ack", receivedMsgLogFields) 651 return nil 652 case <-sessionContext.Done(): 653 h.logger.Trace("[Common] watermill kafka closing, session ctx cancelled before ack", receivedMsgLogFields) 654 return nil 655 } 656 } 657 658 return nil 659 } 660 661 func (s *Subscriber) SubscribeInitialize(topic string) (err error) { 662 if s.config.InitializeTopicDetails == nil { 663 return errors.New("s.config.InitializeTopicDetails is empty, cannot SubscribeInitialize") 664 } 665 666 clusterAdmin, err := sarama.NewClusterAdmin(s.config.Brokers, s.config.OverwriteSaramaConfig) 667 if err != nil { 668 return errors.Wrap(err, "cannot create cluster admin") 669 } 670 defer func() { 671 if closeErr := clusterAdmin.Close(); closeErr != nil { 672 err = multierr.Append(err, closeErr) 673 } 674 }() 675 676 if err := clusterAdmin.CreateTopic(topic, s.config.InitializeTopicDetails, false); err != nil && 677 !strings.Contains(err.Error(), "Topic with this name already exists") { 678 return errors.Wrap(err, "cannot create topic") 679 } 680 681 s.logger.Info("[Common] watermill kafka created Kafka topic", watermill.LogFields{"topic": topic}) 682 683 return nil 684 } 685 686 type PartitionOffset map[int32]int64 687 688 func (s *Subscriber) PartitionOffset(topic string) (PartitionOffset, error) { 689 client, err := sarama.NewClient(s.config.Brokers, s.config.OverwriteSaramaConfig) 690 if err != nil { 691 return nil, errors.Wrap(err, "cannot create new Sarama client") 692 } 693 694 defer func() { 695 if closeErr := client.Close(); closeErr != nil { 696 err = multierr.Append(err, closeErr) 697 } 698 }() 699 700 partitions, err := client.Partitions(topic) 701 if err != nil { 702 return nil, errors.Wrap(err, "cannot get topic partitions") 703 } 704 705 partitionOffset := make(PartitionOffset, len(partitions)) 706 for _, partition := range partitions { 707 offset, err := client.GetOffset(topic, partition, s.config.OverwriteSaramaConfig.Consumer.Offsets.Initial) 708 if err != nil { 709 return nil, err 710 } 711 712 partitionOffset[partition] = offset 713 } 714 715 return partitionOffset, nil 716 }