github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/hash_sample.go

package processor

import (
	"math"
	"time"

	"github.com/Jeffail/benthos/v3/internal/docs"
	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/response"
	"github.com/Jeffail/benthos/v3/lib/types"
	"github.com/OneOfOne/xxhash"
)

//------------------------------------------------------------------------------

func init() {
	Constructors[TypeHashSample] = TypeSpec{
		constructor: NewHashSample,
		Status:      docs.StatusDeprecated,
		Footnotes: `
## Alternatives

All functionality of this processor has been superseded by the
[bloblang](/docs/components/processors/bloblang) processor.`,
		FieldSpecs: docs.FieldSpecs{
			docs.FieldCommon("retain_min", "The lower percentage of the sample range."),
			docs.FieldCommon("retain_max", "The upper percentage of the sample range."),
			docs.FieldAdvanced("parts", "An array of message indexes within the batch to sample based on. If left empty, all messages are included. This field is only applicable when batching messages [at the input level](/docs/configuration/batching).").Array(),
		},
	}
}
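
// As an illustrative sketch (not part of this file), a config using the fields
// registered above might look like the following YAML. The pipeline/processors
// placement is assumed; the processor name and field names come from the spec:
//
//	pipeline:
//	  processors:
//	    - hash_sample:
//	        retain_min: 0
//	        retain_max: 10
//	        parts: [0]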

//------------------------------------------------------------------------------

// hashSamplingNorm is the constant factor to normalise a uint64 into the
// [0.0, 100.0] range.
const hashSamplingNorm = 100.0 / float64(math.MaxUint64)

func scaleNum(n uint64) float64 {
	return float64(n) * hashSamplingNorm
}
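
// For example, scaleNum(0) is 0.0, scaleNum(math.MaxUint64) is 100.0, and
// scaleNum(math.MaxUint64/2) is roughly 50.0, so a uniformly distributed hash
// value maps to a uniformly distributed percentage.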

//------------------------------------------------------------------------------

// HashSampleConfig contains configuration fields for the HashSample processor.
type HashSampleConfig struct {
	RetainMin float64 `json:"retain_min" yaml:"retain_min"`
	RetainMax float64 `json:"retain_max" yaml:"retain_max"`
	Parts     []int   `json:"parts" yaml:"parts"` // message parts to hash
}

// NewHashSampleConfig returns a HashSampleConfig with default values.
func NewHashSampleConfig() HashSampleConfig {
	return HashSampleConfig{
		RetainMin: 0.0,
		RetainMax: 10.0,     // retain the first [0, 10%) interval
		Parts:     []int{0}, // only consider the 1st part
	}
}
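
// With these defaults roughly 10% of messages are retained: only those whose
// first part hashes into the [0, 10) band of the normalised range pass through.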

//------------------------------------------------------------------------------

// HashSample is a processor that removes messages based on a sample factor by
// hashing their contents.
type HashSample struct {
	conf  Config
	log   log.Modular
	stats metrics.Type

	mCount     metrics.StatCounter
	mDropOOB   metrics.StatCounter
	mDropped   metrics.StatCounter
	mErr       metrics.StatCounter
	mSent      metrics.StatCounter
	mBatchSent metrics.StatCounter
}

// NewHashSample returns a HashSample processor.
func NewHashSample(
	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
) (Type, error) {
	return &HashSample{
		conf:  conf,
		log:   log,
		stats: stats,

		mCount:     stats.GetCounter("count"),
		mDropOOB:   stats.GetCounter("dropped_part_out_of_bounds"),
		mDropped:   stats.GetCounter("dropped"),
		mErr:       stats.GetCounter("error"),
		mSent:      stats.GetCounter("sent"),
		mBatchSent: stats.GetCounter("batch.sent"),
	}, nil
}

//------------------------------------------------------------------------------

// ProcessMessage applies the processor to a message, either creating >0
// resulting messages or a response to be sent back to the message source.
func (s *HashSample) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
	s.mCount.Incr(1)

	hash := xxhash.New64()

	lParts := msg.Len()
	for _, index := range s.conf.HashSample.Parts {
		if index < 0 {
			// Negative indexes count backwards from the end.
			index = lParts + index
		}

		// Check boundary of part index.
		if index < 0 || index >= lParts {
			s.mDropOOB.Incr(1)
			s.mDropped.Incr(1)
			s.log.Debugf("Cannot sample message part %v for parts count: %v\n", index, lParts)
			return nil, response.NewAck()
		}

		// Attempt to add part to hash.
		if _, err := hash.Write(msg.Get(index).Get()); err != nil {
			s.mErr.Incr(1)
			s.log.Debugf("Cannot hash message part for sampling: %v\n", err)
			return nil, response.NewAck()
		}
	}

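	// Normalise the accumulated 64-bit hash into the [0, 100] range and retain
	// the message only when it falls inside the configured half-open window
	// [retain_min, retain_max).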
	rate := scaleNum(hash.Sum64())
	if rate >= s.conf.HashSample.RetainMin && rate < s.conf.HashSample.RetainMax {
		s.mBatchSent.Incr(1)
		s.mSent.Incr(int64(msg.Len()))
		msgs := [1]types.Message{msg}
		return msgs[:], nil
	}

	s.mDropped.Incr(int64(msg.Len()))
	return nil, response.NewAck()
}
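
// As an illustrative sketch (assumed usage, not part of this file), two configs
// with complementary windows partition a stream deterministically, since the
// decision above depends only on the hashed message contents:
//
//	a := NewHashSampleConfig()
//	a.RetainMin, a.RetainMax = 0.0, 50.0 // keep messages hashing into [0, 50)
//
//	b := NewHashSampleConfig()
//	b.RetainMin, b.RetainMax = 50.0, 100.0 // keep the complementary [50, 100) band
//
// A message whose hash scales to, say, 37.2 is always retained by the first
// window and always dropped by the second, regardless of when it arrives.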

// CloseAsync shuts down the processor and stops processing requests.
func (s *HashSample) CloseAsync() {
}

// WaitForClose blocks until the processor has closed down.
func (s *HashSample) WaitForClose(timeout time.Duration) error {
	return nil
}

//------------------------------------------------------------------------------