github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/group_by_value.go

package processor

import (
	"fmt"
	"time"

	"github.com/Jeffail/benthos/v3/internal/bloblang/field"
	"github.com/Jeffail/benthos/v3/internal/docs"
	"github.com/Jeffail/benthos/v3/internal/interop"
	"github.com/Jeffail/benthos/v3/internal/tracing"
	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/message"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/response"
	"github.com/Jeffail/benthos/v3/lib/types"
)

//------------------------------------------------------------------------------

func init() {
	Constructors[TypeGroupByValue] = TypeSpec{
		constructor: NewGroupByValue,
		Categories: []Category{
			CategoryComposition,
		},
		Summary: `
Splits a batch of messages into N batches, where each resulting batch contains a
group of messages determined by a
[function interpolated string](/docs/configuration/interpolation#bloblang-queries) evaluated
per message.`,
		Description: `
This allows you to group messages using arbitrary fields within their content or
metadata, process them individually, and send them to unique locations as per
their group.`,
		Footnotes: `
## Examples

If we were consuming Kafka messages and needed to group them by their key,
archive the groups, and send them to S3 with the key as part of the path we
could achieve that with the following:

` + "```yaml" + `
pipeline:
  processors:
    - group_by_value:
        value: ${! meta("kafka_key") }
    - archive:
        format: tar
    - compress:
        algorithm: gzip
output:
  aws_s3:
    bucket: TODO
    path: docs/${! meta("kafka_key") }/${! count("files") }-${! timestamp_unix_nano() }.tar.gz
` + "```" + ``,
		FieldSpecs: docs.FieldSpecs{
			docs.FieldCommon(
				"value", "The interpolated string to group based on.",
				"${! meta(\"kafka_key\") }", "${! json(\"foo.bar\") }-${! meta(\"baz\") }",
			).IsInterpolated(),
		},
		UsesBatches: true,
	}
}

//------------------------------------------------------------------------------

// GroupByValueConfig is a configuration struct containing fields for the
// GroupByValue processor, which breaks message batches down into N batches of a
// smaller size according to a function interpolated string evaluated per
// message part.
type GroupByValueConfig struct {
	Value string `json:"value" yaml:"value"`
}

// NewGroupByValueConfig returns a GroupByValueConfig with default values.
func NewGroupByValueConfig() GroupByValueConfig {
	return GroupByValueConfig{
		Value: "${! meta(\"example\") }",
	}
}

//------------------------------------------------------------------------------

// GroupByValue is a processor that breaks message batches down into N batches
// of a smaller size according to a function interpolated string evaluated per
// message part.
type GroupByValue struct {
	log   log.Modular
	stats metrics.Type

	value *field.Expression

	mCount     metrics.StatCounter
	mGroups    metrics.StatGauge
	mSent      metrics.StatCounter
	mBatchSent metrics.StatCounter
}

// NewGroupByValue returns a GroupByValue processor.
func NewGroupByValue(
	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
) (Type, error) {
	value, err := interop.NewBloblangField(mgr, conf.GroupByValue.Value)
	if err != nil {
		return nil, fmt.Errorf("failed to parse value expression: %v", err)
	}
	return &GroupByValue{
		log:   log,
		stats: stats,

		value: value,

		mCount:     stats.GetCounter("count"),
		mGroups:    stats.GetGauge("groups"),
		mSent:      stats.GetCounter("sent"),
		mBatchSent: stats.GetCounter("batch.sent"),
	}, nil
}

//------------------------------------------------------------------------------

// ProcessMessage applies the processor to a message, either creating >0
// resulting messages or a response to be sent back to the message source.
func (g *GroupByValue) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
	g.mCount.Incr(1)

	if msg.Len() == 0 {
		return nil, response.NewAck()
	}

	groupKeys := []string{}
	groupMap := map[string]types.Message{}

	spans := tracing.CreateChildSpans(TypeGroupByValue, msg)

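	// Each part is assigned to a group keyed by its interpolated value. The
	// groupKeys slice records the order in which new keys are first seen, so
	// the resulting batches are emitted deterministically in that order.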
	msg.Iter(func(i int, p types.Part) error {
		v := g.value.String(i, msg)
		spans[i].LogKV(
			"event", "grouped",
			"type", v,
		)
		spans[i].SetTag("group", v)
		if group, exists := groupMap[v]; exists {
			group.Append(p)
		} else {
			g.log.Tracef("New group formed: %v\n", v)
			groupKeys = append(groupKeys, v)
			newMsg := message.New(nil)
			newMsg.Append(p)
			groupMap[v] = newMsg
		}
		return nil
	})

	for _, s := range spans {
		s.Finish()
	}

	msgs := []types.Message{}
	for _, key := range groupKeys {
		msgs = append(msgs, groupMap[key])
	}

	g.mGroups.Set(int64(len(groupKeys)))

	if len(msgs) == 0 {
		return nil, response.NewAck()
	}

	g.mBatchSent.Incr(int64(len(msgs)))
	for _, m := range msgs {
		g.mSent.Incr(int64(m.Len()))
	}
	return msgs, nil
}
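
// The function below is an illustrative sketch rather than part of the
// original file: it shows how the processor splits a small batch into groups
// keyed on a metadata field. It assumes the package-level New constructor and
// the log.Noop/metrics.Noop helpers, and passes a nil manager for brevity;
// the function name and sample payloads are hypothetical.
func exampleGroupByValueSketch() {
	conf := NewConfig()
	conf.Type = TypeGroupByValue
	conf.GroupByValue.Value = `${! meta("kafka_key") }`

	proc, err := New(conf, nil, log.Noop(), metrics.Noop())
	if err != nil {
		panic(err)
	}

	// A batch of three parts, two of which share the same kafka_key.
	batch := message.New([][]byte{
		[]byte(`{"id":1}`),
		[]byte(`{"id":2}`),
		[]byte(`{"id":3}`),
	})
	batch.Get(0).Metadata().Set("kafka_key", "foo")
	batch.Get(1).Metadata().Set("kafka_key", "bar")
	batch.Get(2).Metadata().Set("kafka_key", "foo")

	groups, _ := proc.ProcessMessage(batch)

	// Two batches are produced, ordered by first appearance of each key:
	// "foo" (parts 1 and 3) followed by "bar" (part 2).
	for _, grp := range groups {
		fmt.Println(grp.Len()) // prints 2, then 1
	}
}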

// CloseAsync shuts down the processor and stops processing requests.
func (g *GroupByValue) CloseAsync() {
}

// WaitForClose blocks until the processor has closed down.
func (g *GroupByValue) WaitForClose(timeout time.Duration) error {
	return nil
}

//------------------------------------------------------------------------------