github.com/Jeffail/benthos/v3@v3.65.0/lib/input/reader/kafka_cg.go

package reader

import (
	"context"
	"crypto/tls"
	"fmt"
	"io"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/message"
	"github.com/Jeffail/benthos/v3/lib/message/batch"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/types"
	"github.com/Shopify/sarama"
)

//------------------------------------------------------------------------------

// asyncMessage pairs a message batch with the acknowledgement function that
// marks its offset on the consumer group session once downstream processing
// has succeeded.
type asyncMessage struct {
	msg   types.Message
	ackFn AsyncAckFn
}

// KafkaCG is an input type that reads from a Kafka cluster, with topic
// partitions balanced across all consumers that share the same consumer
// group.
type KafkaCG struct {
	version   sarama.KafkaVersion
	tlsConf   *tls.Config
	addresses []string
	topics    []string

	commitPeriod      time.Duration
	sessionTimeout    time.Duration
	heartbeatInterval time.Duration
	rebalanceTimeout  time.Duration
	maxProcPeriod     time.Duration

	cMut          sync.Mutex
	groupCancelFn context.CancelFunc
	session       sarama.ConsumerGroupSession
	msgChan       chan asyncMessage

	mRebalanced metrics.StatCounter

	conf  KafkaBalancedConfig
	stats metrics.Type
	log   log.Modular
	mgr   types.Manager

	closeOnce  sync.Once
	closedChan chan struct{}
}

// NewKafkaCG creates a new KafkaCG input type.
func NewKafkaCG(
	conf KafkaBalancedConfig, mgr types.Manager, log log.Modular, stats metrics.Type,
) (*KafkaCG, error) {
	if conf.Batching.IsNoop() {
		conf.Batching.Count = 1
	}
	k := KafkaCG{
		conf:          conf,
		stats:         stats,
		groupCancelFn: func() {},
		log:           log,
		mgr:           mgr,
		mRebalanced:   stats.GetCounter("rebalanced"),
		closedChan:    make(chan struct{}),
	}
	if conf.TLS.Enabled {
		var err error
		if k.tlsConf, err = conf.TLS.Get(); err != nil {
			return nil, err
		}
	}
	for _, addr := range conf.Addresses {
		for _, splitAddr := range strings.Split(addr, ",") {
			if trimmed := strings.TrimSpace(splitAddr); len(trimmed) > 0 {
				k.addresses = append(k.addresses, trimmed)
			}
		}
	}
	for _, t := range conf.Topics {
		for _, splitTopics := range strings.Split(t, ",") {
			if trimmed := strings.TrimSpace(splitTopics); len(trimmed) > 0 {
				k.topics = append(k.topics, trimmed)
			}
		}
	}
	if tout := conf.CommitPeriod; len(tout) > 0 {
		var err error
		if k.commitPeriod, err = time.ParseDuration(tout); err != nil {
			return nil, fmt.Errorf("failed to parse commit period string: %v", err)
		}
	}
	if tout := conf.Group.SessionTimeout; len(tout) > 0 {
		var err error
		if k.sessionTimeout, err = time.ParseDuration(tout); err != nil {
			return nil, fmt.Errorf("failed to parse session timeout string: %v", err)
		}
	}
	if tout := conf.Group.HeartbeatInterval; len(tout) > 0 {
		var err error
		if k.heartbeatInterval, err = time.ParseDuration(tout); err != nil {
			return nil, fmt.Errorf("failed to parse heartbeat interval string: %v", err)
		}
	}
	if tout := conf.Group.RebalanceTimeout; len(tout) > 0 {
		var err error
		if k.rebalanceTimeout, err = time.ParseDuration(tout); err != nil {
			return nil, fmt.Errorf("failed to parse rebalance timeout string: %v", err)
		}
	}
	if tout := conf.MaxProcessingPeriod; len(tout) > 0 {
		var err error
		if k.maxProcPeriod, err = time.ParseDuration(tout); err != nil {
			return nil, fmt.Errorf("failed to parse max processing period string: %v", err)
		}
	}

	var err error
	if k.version, err = sarama.ParseKafkaVersion(conf.TargetVersion); err != nil {
		return nil, err
	}
	return &k, nil
}
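
// A rough construction sketch; NewKafkaBalancedConfig, the broker addresses
// and the group name below are illustrative (mgr, log and stats come from the
// caller), but comma-separated entries in Addresses and Topics really are
// split into individual values by NewKafkaCG:
//
//	conf := NewKafkaBalancedConfig()
//	conf.Addresses = []string{"localhost:9092,localhost:9093"} // two brokers
//	conf.Topics = []string{"foo,bar"}                          // two topics
//	conf.ConsumerGroup = "example_group"
//	kcg, err := NewKafkaCG(conf, mgr, log, stats)
//	if err != nil {
//		// covers bad durations, TLS settings or an unknown Kafka version
//	}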

//------------------------------------------------------------------------------

// Setup is run at the beginning of a new session, before ConsumeClaim.
func (k *KafkaCG) Setup(sesh sarama.ConsumerGroupSession) error {
	k.cMut.Lock()
	k.session = sesh
	k.cMut.Unlock()
	k.mRebalanced.Incr(1)
	return nil
}

// Cleanup is run at the end of a session, once all ConsumeClaim goroutines have
// exited but before the offsets are committed for the very last time.
func (k *KafkaCG) Cleanup(sesh sarama.ConsumerGroupSession) error {
	k.cMut.Lock()
	k.session = nil
	k.cMut.Unlock()
	return nil
}

// ConsumeClaim must start a consumer loop of ConsumerGroupClaim's Messages().
// Once the Messages() channel is closed, the Handler must finish its processing
// loop and exit.
func (k *KafkaCG) ConsumeClaim(sess sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error {
	topic, partition := claim.Topic(), claim.Partition()
	k.log.Debugf("Consuming messages from topic '%v' partition '%v'\n", topic, partition)
	defer k.log.Debugf("Stopped consuming messages from topic '%v' partition '%v'\n", topic, partition)

	ackedChan := make(chan error)

	latestOffset := claim.InitialOffset()
	batchPolicy, err := batch.NewPolicy(k.conf.Batching, k.mgr, k.log, k.stats)
	if err != nil {
		k.log.Errorf("Failed to initialise batch policy: %v, falling back to single messages.\n", err)
		fallBackConf := batch.NewPolicyConfig()
		fallBackConf.Count = 1
		if batchPolicy, err = batch.NewPolicy(fallBackConf, k.mgr, k.log, k.stats); err != nil {
			k.log.Errorf("Failed to initialise fallback batch policy: %v.\n", err)
			// The consume claim gets reopened immediately so let's try and
			// avoid a busy loop (this should never happen anyway).
			<-time.After(time.Second)
			return err
		}
	}
	defer batchPolicy.CloseAsync()

	var nextTimedBatchChan <-chan time.Time
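	// flushBatch sends the current batch, if there is one, down msgChan and
	// waits for the downstream acknowledgement to be relayed back over
	// ackedChan. A successful ack marks the given offset on the live session;
	// a false return tells the caller to stop consuming.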
	flushBatch := func(topic string, partition int32, offset int64) bool {
		nextTimedBatchChan = nil
		msg := batchPolicy.Flush()
		if msg == nil {
			return true
		}
		select {
		case k.msgChan <- asyncMessage{
			msg: msg,
			ackFn: func(ctx context.Context, res types.Response) error {
				resErr := res.Error()
				if resErr == nil {
					k.cMut.Lock()
					if k.session != nil {
						k.log.Debugf("Marking offset for topic '%v' partition '%v'.\n", topic, partition)
						k.session.MarkOffset(topic, partition, offset, "")
					} else {
						k.log.Debugf("Unable to mark offset for topic '%v' partition '%v'.\n", topic, partition)
					}
					k.cMut.Unlock()
				}
				select {
				case ackedChan <- resErr:
				case <-sess.Context().Done():
				}
				return nil
			},
		}:
			select {
			case resErr := <-ackedChan:
				if resErr != nil {
					k.log.Errorf("Received error from message batch: %v, shutting down consumer.\n", resErr)
					return false
				}
			case <-sess.Context().Done():
				return false
			}
		case <-sess.Context().Done():
			return false
		}
		return true
	}

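	// Core claim loop: flush whenever the batch policy's timer fires, add
	// incoming messages (annotated with kafka_* metadata) to the batch, and
	// exit when the claim's message channel closes or the session ends.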
	for {
		if nextTimedBatchChan == nil {
			if tNext := batchPolicy.UntilNext(); tNext >= 0 {
				nextTimedBatchChan = time.After(tNext)
			}
		}
		select {
		case <-nextTimedBatchChan:
			if !flushBatch(claim.Topic(), claim.Partition(), latestOffset+1) {
				return nil
			}
		case data, open := <-claim.Messages():
			if !open {
				return nil
			}
			latestOffset = data.Offset
			part := message.NewPart(data.Value)

			meta := part.Metadata()
			for _, hdr := range data.Headers {
				meta.Set(string(hdr.Key), string(hdr.Value))
			}

			lag := claim.HighWaterMarkOffset() - data.Offset - 1
			if lag < 0 {
				lag = 0
			}

			meta.Set("kafka_key", string(data.Key))
			meta.Set("kafka_partition", strconv.Itoa(int(data.Partition)))
			meta.Set("kafka_topic", data.Topic)
			meta.Set("kafka_offset", strconv.Itoa(int(data.Offset)))
			meta.Set("kafka_lag", strconv.FormatInt(lag, 10))
			meta.Set("kafka_timestamp_unix", strconv.FormatInt(data.Timestamp.Unix(), 10))

			if batchPolicy.Add(part) {
				if !flushBatch(claim.Topic(), claim.Partition(), latestOffset+1) {
					return nil
				}
			}
		case <-sess.Context().Done():
			return nil
		}
	}
}
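
// The three methods above satisfy the sarama.ConsumerGroupHandler interface,
// which is what allows a *KafkaCG to be passed directly to group.Consume in
// ConnectWithContext; a compile-time assertion makes this explicit:
var _ sarama.ConsumerGroupHandler = (*KafkaCG)(nil)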

//------------------------------------------------------------------------------

func (k *KafkaCG) closeGroup() {
	k.cMut.Lock()
	cancelFn := k.groupCancelFn
	k.cMut.Unlock()

	if cancelFn != nil {
		k.log.Debugln("Closing group consumers.")
		cancelFn()
	}

	k.closeOnce.Do(func() {
		close(k.closedChan)
	})
}

//------------------------------------------------------------------------------

// ConnectWithContext establishes a KafkaCG connection.
func (k *KafkaCG) ConnectWithContext(ctx context.Context) error {
	k.cMut.Lock()
	defer k.cMut.Unlock()
	if k.msgChan != nil {
		return nil
	}

	config := sarama.NewConfig()
	config.ClientID = k.conf.ClientID
	config.RackID = k.conf.RackID
	config.Net.DialTimeout = time.Second
	config.Version = k.version
	config.Consumer.Return.Errors = true
	config.Consumer.MaxProcessingTime = k.maxProcPeriod
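	// Offsets marked via session.MarkOffset in the consume loop are flushed
	// to the brokers asynchronously on the configured commit period.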
	config.Consumer.Offsets.AutoCommit.Enable = true
	config.Consumer.Offsets.AutoCommit.Interval = k.commitPeriod
	config.Consumer.Group.Session.Timeout = k.sessionTimeout
	config.Consumer.Group.Heartbeat.Interval = k.heartbeatInterval
	config.Consumer.Group.Rebalance.Timeout = k.rebalanceTimeout
	config.ChannelBufferSize = k.conf.FetchBufferCap

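	// Group requests (e.g. joins during a rebalance) can be held by the
	// broker for up to these timeouts, so the client read timeout is raised
	// to comfortably exceed them rather than dropping healthy connections.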
	if config.Net.ReadTimeout <= k.sessionTimeout {
		config.Net.ReadTimeout = k.sessionTimeout * 2
	}
	if config.Net.ReadTimeout <= k.rebalanceTimeout {
		config.Net.ReadTimeout = k.rebalanceTimeout * 2
	}

	config.Net.TLS.Enable = k.conf.TLS.Enabled
	if k.conf.TLS.Enabled {
		config.Net.TLS.Config = k.tlsConf
	}
	if k.conf.StartFromOldest {
		config.Consumer.Offsets.Initial = sarama.OffsetOldest
	}

	if err := k.conf.SASL.Apply(k.mgr, config); err != nil {
		return err
	}

	// Start a new consumer group.
	group, err := sarama.NewConsumerGroup(k.addresses, k.conf.ConsumerGroup, config)
	if err != nil {
		return err
	}

	// Drain group errors.
	go func() {
		for {
			gerr, open := <-group.Errors()
			if !open {
				return
			}
			if gerr != nil {
				k.log.Errorf("KafkaCG message recv error: %v\n", gerr)
				if cerr, ok := gerr.(*sarama.ConsumerError); ok {
					if cerr.Err == sarama.ErrUnknownMemberId {
						// Sarama doesn't seem to recover from this error.
						go k.closeGroup()
					}
				}
			}
		}
	}()

	// Handle the group session lifecycle.
	go func() {
	groupLoop:
		for {
			ctx, doneFn := context.WithCancel(context.Background())

			k.cMut.Lock()
			k.groupCancelFn = doneFn
			k.cMut.Unlock()

			k.log.Debugln("Starting consumer group")
			gerr := group.Consume(ctx, k.topics, k)
			select {
			case <-ctx.Done():
				break groupLoop
			default:
			}
			doneFn()
			if gerr != nil {
				if gerr != io.EOF {
					k.log.Errorf("KafkaCG group session error: %v\n", gerr)
				}
				break groupLoop
			}
		}
		k.log.Debugln("Closing consumer group")

		group.Close()

		k.cMut.Lock()
		if k.msgChan != nil {
			close(k.msgChan)
			k.msgChan = nil
		}
		k.cMut.Unlock()
	}()

	k.msgChan = make(chan asyncMessage)

	k.log.Infof("Receiving Kafka messages from brokers %s as group '%v'\n", k.addresses, k.conf.ConsumerGroup)
	return nil
}

// ReadWithContext attempts to read a message from a KafkaCG topic.
func (k *KafkaCG) ReadWithContext(ctx context.Context) (types.Message, AsyncAckFn, error) {
	// Snapshot msgChan under lock; the session goroutine may nil it out
	// concurrently during shutdown.
	k.cMut.Lock()
	msgChan := k.msgChan
	k.cMut.Unlock()

	if msgChan == nil {
		return nil, nil, types.ErrNotConnected
	}

	select {
	case m, open := <-msgChan:
		if !open {
			return nil, nil, types.ErrNotConnected
		}
		return m.msg, m.ackFn, nil
	case <-ctx.Done():
	}
	return nil, nil, types.ErrTimeout
}
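
// A minimal read-loop sketch, assuming the repo's lib/response package for
// the ack value; any types.Response whose Error() returns nil causes the
// batch's offset to be marked:
//
//	if err := kcg.ConnectWithContext(ctx); err != nil {
//		// handle the connection error
//	}
//	for {
//		msg, ackFn, err := kcg.ReadWithContext(ctx)
//		if err == types.ErrNotConnected {
//			break // channel closed, e.g. after CloseAsync
//		} else if err != nil {
//			continue // types.ErrTimeout: the context expired, try again
//		}
//		// ... process msg ...
//		_ = ackFn(ctx, response.NewAck()) // a nil response error marks the offset
//	}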

// CloseAsync shuts down the KafkaCG input and stops processing requests.
func (k *KafkaCG) CloseAsync() {
	go k.closeGroup()
}

// WaitForClose blocks until the KafkaCG input has closed down.
func (k *KafkaCG) WaitForClose(timeout time.Duration) error {
	select {
	case <-k.closedChan:
	case <-time.After(timeout):
		return types.ErrTimeout
	}
	return nil
}

//------------------------------------------------------------------------------