github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/cmd/services/m3aggregator/config/aggregator.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package config
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"math"
    27  	"net"
    28  	"os"
    29  	"runtime"
    30  	"sort"
    31  	"strings"
    32  	"time"
    33  
    34  	"github.com/m3db/m3/src/aggregator/aggregation/quantile/cm"
    35  	"github.com/m3db/m3/src/aggregator/aggregator"
    36  	"github.com/m3db/m3/src/aggregator/aggregator/handler"
    37  	"github.com/m3db/m3/src/aggregator/aggregator/handler/writer"
    38  	aggclient "github.com/m3db/m3/src/aggregator/client"
    39  	aggruntime "github.com/m3db/m3/src/aggregator/runtime"
    40  	"github.com/m3db/m3/src/aggregator/sharding"
    41  	"github.com/m3db/m3/src/cluster/client"
    42  	"github.com/m3db/m3/src/cluster/kv"
    43  	"github.com/m3db/m3/src/cluster/placement"
    44  	"github.com/m3db/m3/src/cluster/services"
    45  	"github.com/m3db/m3/src/cmd/services/m3aggregator/serve"
    46  	"github.com/m3db/m3/src/metrics/aggregation"
    47  	"github.com/m3db/m3/src/metrics/policy"
    48  	"github.com/m3db/m3/src/x/clock"
    49  	"github.com/m3db/m3/src/x/config/hostid"
    50  	"github.com/m3db/m3/src/x/instrument"
    51  	xio "github.com/m3db/m3/src/x/io"
    52  	"github.com/m3db/m3/src/x/pool"
    53  	"github.com/m3db/m3/src/x/retry"
    54  	"github.com/m3db/m3/src/x/sync"
    55  )
    56  
    57  var (
    58  	errNoKVClientConfiguration = errors.New("no kv client configuration")
    59  	errEmptyJitterBucketList   = errors.New("empty jitter bucket list")
    60  )
    61  
    62  var (
    63  	defaultNumPassthroughWriters = 8
    64  	defaultHostID                = "m3aggregator_local"
    65  )
    66  
    67  // AggregatorConfiguration contains aggregator configuration.
    68  type AggregatorConfiguration struct {
    69  	// HostID is the local host ID configuration.
    70  	HostID *hostid.Configuration `yaml:"hostID"`
    71  
    72  	// InstanceID is the instance ID configuration.
    73  	InstanceID InstanceIDConfiguration `yaml:"instanceID"`
    74  
    75  	// VerboseErrors sets whether or not to use verbose errors when
    76  	// value arrives too early, late, or other bad request like operation.
    77  	VerboseErrors bool `yaml:"verboseErrors"`
    78  
    79  	// AggregationTypes configs the aggregation types.
    80  	AggregationTypes aggregation.TypesConfiguration `yaml:"aggregationTypes"`
    81  
    82  	// Common metric prefix.
    83  	MetricPrefix *string `yaml:"metricPrefix"`
    84  
    85  	// Counter metric prefix.
    86  	CounterPrefix *string `yaml:"counterPrefix"`
    87  
    88  	// Timer metric prefix.
    89  	TimerPrefix *string `yaml:"timerPrefix"`
    90  
    91  	// Gauge metric prefix.
    92  	GaugePrefix *string `yaml:"gaugePrefix"`
    93  
    94  	// Stream configuration for computing quantiles.
    95  	Stream streamConfiguration `yaml:"stream"`
    96  
    97  	// Client configuration.
    98  	Client aggclient.Configuration `yaml:"client"`
    99  
   100  	// Placement manager.
   101  	PlacementManager placementManagerConfiguration `yaml:"placementManager"`
   102  
   103  	// Hash type used for sharding.
   104  	HashType *sharding.HashType `yaml:"hashType"`
   105  
   106  	// Amount of time we buffer writes before shard cutover.
   107  	BufferDurationBeforeShardCutover time.Duration `yaml:"bufferDurationBeforeShardCutover"`
   108  
   109  	// Amount of time we buffer writes after shard cutoff.
   110  	BufferDurationAfterShardCutoff time.Duration `yaml:"bufferDurationAfterShardCutoff"`
   111  
   112  	// Amount of time we buffer timed metrics in the past.
   113  	BufferDurationForPastTimedMetric time.Duration `yaml:"bufferDurationForPastTimedMetric"`
   114  
   115  	// Amount of time we buffer timed metrics in the future.
   116  	BufferDurationForFutureTimedMetric time.Duration `yaml:"bufferDurationForFutureTimedMetric"`
   117  
   118  	// Resign timeout.
   119  	ResignTimeout time.Duration `yaml:"resignTimeout"`
   120  
   121  	// ShutdownWaitTimeout if non-zero will be how long the aggregator waits from
   122  	// receiving a shutdown signal to exit. This can make coordinating graceful
   123  	// shutdowns between two replicas safer.
   124  	ShutdownWaitTimeout time.Duration `yaml:"shutdownWaitTimeout"`
   125  
   126  	// Flush times manager.
   127  	FlushTimesManager flushTimesManagerConfiguration `yaml:"flushTimesManager"`
   128  
   129  	// Election manager.
   130  	ElectionManager electionManagerConfiguration `yaml:"electionManager"`
   131  
   132  	// Flush manager.
   133  	FlushManager flushManagerConfiguration `yaml:"flushManager"`
   134  
   135  	// Flushing handler configuration.
   136  	Flush handler.FlushConfiguration `yaml:"flush"`
   137  
   138  	// Passthrough controls the passthrough knobs.
   139  	Passthrough *passthroughConfiguration `yaml:"passthrough"`
   140  
   141  	// Forwarding configuration.
   142  	Forwarding forwardingConfiguration `yaml:"forwarding"`
   143  
   144  	// EntryTTL determines how long an entry remains alive before it may be expired due to inactivity.
   145  	EntryTTL time.Duration `yaml:"entryTTL"`
   146  
   147  	// EntryCheckInterval determines how often entries are checked for expiration.
   148  	EntryCheckInterval time.Duration `yaml:"entryCheckInterval"`
   149  
   150  	// EntryCheckBatchPercent determines the percentage of entries checked in a batch.
   151  	EntryCheckBatchPercent float64 `yaml:"entryCheckBatchPercent" validate:"min=0.0,max=1.0"`
   152  
   153  	// MaxTimerBatchSizePerWrite determines the maximum timer batch size for each batched write.
   154  	MaxTimerBatchSizePerWrite int `yaml:"maxTimerBatchSizePerWrite" validate:"min=0"`
   155  
   156  	// Default storage policies.
   157  	DefaultStoragePolicies []policy.StoragePolicy `yaml:"defaultStoragePolicies"`
   158  
   159  	// Maximum number of cached source sets.
   160  	MaxNumCachedSourceSets *int `yaml:"maxNumCachedSourceSets"`
   161  
   162  	// Whether to discard NaN aggregated values.
   163  	DiscardNaNAggregatedValues *bool `yaml:"discardNaNAggregatedValues"`
   164  
   165  	// Pool of counter elements.
   166  	CounterElemPool pool.ObjectPoolConfiguration `yaml:"counterElemPool"`
   167  
   168  	// Pool of timer elements.
   169  	TimerElemPool pool.ObjectPoolConfiguration `yaml:"timerElemPool"`
   170  
   171  	// Pool of gauge elements.
   172  	GaugeElemPool pool.ObjectPoolConfiguration `yaml:"gaugeElemPool"`
   173  
   174  	// Pool of entries.
   175  	EntryPool pool.ObjectPoolConfiguration `yaml:"entryPool"`
   176  
   177  	// AddToReset is the yaml config for aggregator.Options.AddToReset
   178  	AddToReset bool `yaml:"addToReset"`
   179  
   180  	// TimedMetricsFlushOffsetEnabled enables using FlushOffset for timed metrics.
   181  	TimedMetricsFlushOffsetEnabled bool `yaml:"timedMetricsFlushOffsetEnabled"`
   182  
   183  	// FeatureFlags are feature flags to apply.
   184  	FeatureFlags aggregator.FeatureFlagConfigurations `yaml:"featureFlags"`
   185  
   186  	// WritesIgnoreCutoffCutover allows accepting writes ignoring cutoff/cutover timestamp.
   187  	// Must be in sync with m3msg WriterConfiguration.IgnoreCutoffCutover.
   188  	WritesIgnoreCutoffCutover bool `yaml:"writesIgnoreCutoffCutover"`
   189  }
   190  
   191  // InstanceIDType is the instance ID type that defines how the
   192  // instance ID is constructed, which is then used to lookup the
   193  // aggregator instance in the placement.
   194  type InstanceIDType uint
   195  
   196  const (
   197  	// HostIDPortInstanceIDType specifies to use the host ID
   198  	// concatenated with the port to be used for lookup
   199  	// in the placement.
   200  	// NB: this is a legacy instance ID type and is how the instance
   201  	// ID used to be constructed which imposed the strange
   202  	// requirement that the instance ID in the topology used to require
   203  	// the port concat'd with the host ID).
   204  	HostIDPortInstanceIDType InstanceIDType = iota
   205  	// HostIDInstanceIDType specifies to just use the host ID
   206  	// as the instance ID for lookup in the placement.
   207  	HostIDInstanceIDType
   208  
   209  	// defaultInstanceIDType must be used as the legacy instance ID
   210  	// since the config needs to be backwards compatible and for those
   211  	// not explicitly specifying the instance ID type it will cause
   212  	// existing placements to not work with latest versions of the aggregator
   213  	// in a backwards compatible fashion.
   214  	defaultInstanceIDType = HostIDPortInstanceIDType
   215  )
   216  
   217  func (t InstanceIDType) String() string {
   218  	switch t {
   219  	case HostIDInstanceIDType:
   220  		return "host_id"
   221  	case HostIDPortInstanceIDType:
   222  		return "host_id_port"
   223  	}
   224  	return "unknown"
   225  }
   226  
   227  var validInstanceIDTypes = []InstanceIDType{
   228  	HostIDInstanceIDType,
   229  	HostIDPortInstanceIDType,
   230  }
   231  
   232  // MarshalYAML returns the YAML representation of the InstanceIDType.
   233  func (t InstanceIDType) MarshalYAML() (interface{}, error) {
   234  	return t.String(), nil
   235  }
   236  
   237  // UnmarshalYAML unmarshals a InstanceIDType into a valid type from string.
   238  func (t *InstanceIDType) UnmarshalYAML(unmarshal func(interface{}) error) error {
   239  	var str string
   240  	if err := unmarshal(&str); err != nil {
   241  		return err
   242  	}
   243  	if str == "" {
   244  		*t = defaultInstanceIDType
   245  		return nil
   246  	}
   247  	strs := make([]string, 0, len(validInstanceIDTypes))
   248  	for _, valid := range validInstanceIDTypes {
   249  		if str == valid.String() {
   250  			*t = valid
   251  			return nil
   252  		}
   253  		strs = append(strs, "'"+valid.String()+"'")
   254  	}
   255  	return fmt.Errorf(
   256  		"invalid InstanceIDType '%s' valid types are: %s", str, strings.Join(strs, ", "))
   257  }
   258  
   259  // InstanceIDConfiguration is the instance ID configuration.
   260  type InstanceIDConfiguration struct {
   261  	// InstanceIDType specifies how to construct the instance ID
   262  	// that is used for lookup of the aggregator in the placement.
   263  	InstanceIDType InstanceIDType `yaml:"type"`
   264  }
   265  
   266  // NewAggregatorOptions creates a new set of aggregator options.
   267  func (c *AggregatorConfiguration) NewAggregatorOptions(
   268  	address string,
   269  	client client.Client,
   270  	serveOpts serve.Options,
   271  	runtimeOptsManager aggruntime.OptionsManager,
   272  	clockOpts clock.Options,
   273  	instrumentOpts instrument.Options,
   274  ) (aggregator.Options, error) {
   275  	opts := aggregator.NewOptions(clockOpts).
   276  		SetInstrumentOptions(instrumentOpts).
   277  		SetRuntimeOptionsManager(runtimeOptsManager).
   278  		SetVerboseErrors(c.VerboseErrors).
   279  		SetAddToReset(c.AddToReset).
   280  		SetTimedMetricsFlushOffsetEnabled(c.TimedMetricsFlushOffsetEnabled).
   281  		SetFeatureFlagBundlesParsed(c.FeatureFlags.Parse())
   282  
   283  	rwOpts := serveOpts.RWOptions()
   284  	if rwOpts == nil {
   285  		rwOpts = xio.NewOptions()
   286  	}
   287  
   288  	// Set the aggregation types options.
   289  	aggTypesOpts, err := c.AggregationTypes.NewOptions(instrumentOpts)
   290  	if err != nil {
   291  		return nil, err
   292  	}
   293  	opts = opts.SetAggregationTypesOptions(aggTypesOpts)
   294  
   295  	// Set the prefix for metrics aggregations.
   296  	opts = setMetricPrefix(opts, c.MetricPrefix, opts.SetMetricPrefix)
   297  	opts = setMetricPrefix(opts, c.CounterPrefix, opts.SetCounterPrefix)
   298  	opts = setMetricPrefix(opts, c.TimerPrefix, opts.SetTimerPrefix)
   299  	opts = setMetricPrefix(opts, c.GaugePrefix, opts.SetGaugePrefix)
   300  
   301  	// Set stream options.
   302  	scope := instrumentOpts.MetricsScope()
   303  	iOpts := instrumentOpts.SetMetricsScope(scope.SubScope("stream"))
   304  	streamOpts, err := c.Stream.NewStreamOptions(iOpts)
   305  	if err != nil {
   306  		return nil, err
   307  	}
   308  	opts = opts.SetStreamOptions(streamOpts)
   309  
   310  	// Set administrative client.
   311  	// TODO(xichen): client retry threshold likely needs to be low for faster retries.
   312  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("client"))
   313  	adminClient, err := c.Client.NewAdminClient(
   314  		client, clock.NewOptions(), iOpts, rwOpts)
   315  	if err != nil {
   316  		return nil, err
   317  	}
   318  	if err = adminClient.Init(); err != nil {
   319  		return nil, err
   320  	}
   321  	opts = opts.SetAdminClient(adminClient)
   322  
   323  	// Set instance ID.
   324  	instanceID, err := c.newInstanceID(address)
   325  	if err != nil {
   326  		return nil, err
   327  	}
   328  
   329  	// Set placement manager.
   330  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("placement-manager"))
   331  	placementManager, err := c.PlacementManager.NewPlacementManager(client, instanceID, iOpts)
   332  	if err != nil {
   333  		return nil, err
   334  	}
   335  	opts = opts.SetPlacementManager(placementManager)
   336  
   337  	// Set sharding function.
   338  	hashType := sharding.DefaultHash
   339  	if c.HashType != nil {
   340  		hashType = *c.HashType
   341  	}
   342  	shardFn, err := hashType.ShardFn()
   343  	if err != nil {
   344  		return nil, err
   345  	}
   346  	opts = opts.SetShardFn(shardFn)
   347  
   348  	// Set buffer durations for shard cutovers and shard cutoffs.
   349  	if c.BufferDurationBeforeShardCutover != 0 {
   350  		opts = opts.SetBufferDurationBeforeShardCutover(c.BufferDurationBeforeShardCutover)
   351  	}
   352  	if c.BufferDurationAfterShardCutoff != 0 {
   353  		opts = opts.SetBufferDurationAfterShardCutoff(c.BufferDurationAfterShardCutoff)
   354  	}
   355  	if c.BufferDurationForPastTimedMetric != 0 {
   356  		opts = opts.SetBufferForPastTimedMetric(c.BufferDurationForPastTimedMetric).
   357  			SetBufferForPastTimedMetricFn(bufferForPastTimedMetricFn(c.BufferDurationForPastTimedMetric))
   358  	}
   359  	if c.BufferDurationForFutureTimedMetric != 0 {
   360  		opts = opts.SetBufferForFutureTimedMetric(c.BufferDurationForFutureTimedMetric)
   361  	}
   362  
   363  	// Set resign timeout.
   364  	if c.ResignTimeout != 0 {
   365  		opts = opts.SetResignTimeout(c.ResignTimeout)
   366  	}
   367  
   368  	// Set flush times manager.
   369  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("flush-times-manager"))
   370  	flushTimesManager, err := c.FlushTimesManager.NewFlushTimesManager(client, iOpts)
   371  	if err != nil {
   372  		return nil, err
   373  	}
   374  	opts = opts.SetFlushTimesManager(flushTimesManager)
   375  
   376  	// Set election manager.
   377  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("election-manager"))
   378  	placementNamespace := c.PlacementManager.KVConfig.Namespace
   379  	electionManager, err := c.ElectionManager.NewElectionManager(
   380  		client,
   381  		instanceID,
   382  		placementNamespace,
   383  		placementManager,
   384  		flushTimesManager,
   385  		clockOpts,
   386  		iOpts,
   387  	)
   388  	if err != nil {
   389  		return nil, err
   390  	}
   391  	opts = opts.SetElectionManager(electionManager)
   392  
   393  	// Set flush manager.
   394  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("flush-manager"))
   395  	flushManagerOpts, err := c.FlushManager.NewFlushManagerOptions(
   396  		placementManager,
   397  		electionManager,
   398  		flushTimesManager,
   399  		iOpts,
   400  		opts.BufferForPastTimedMetric(),
   401  	)
   402  	if err != nil {
   403  		return nil, err
   404  	}
   405  	flushManager := aggregator.NewFlushManager(flushManagerOpts)
   406  	opts = opts.SetFlushManager(flushManager)
   407  
   408  	// Set flushing handler.
   409  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("flush-handler"))
   410  	flushHandler, err := c.Flush.NewHandler(client, iOpts, rwOpts)
   411  	if err != nil {
   412  		return nil, err
   413  	}
   414  	opts = opts.SetFlushHandler(flushHandler)
   415  
   416  	// Set passthrough writer.
   417  	aggShardFn, err := hashType.AggregatedShardFn()
   418  	if err != nil {
   419  		return nil, err
   420  	}
   421  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("passthrough-writer"))
   422  	passthroughWriter, err := c.newPassthroughWriter(flushHandler, iOpts, aggShardFn)
   423  	if err != nil {
   424  		return nil, err
   425  	}
   426  	opts = opts.SetPassthroughWriter(passthroughWriter)
   427  
   428  	// Set max allowed forwarding delay function.
   429  	jitterEnabled := flushManagerOpts.JitterEnabled()
   430  	maxJitterFn := flushManagerOpts.MaxJitterFn()
   431  	maxAllowedForwardingDelayFn := c.Forwarding.MaxAllowedForwardingDelayFn(jitterEnabled, maxJitterFn)
   432  	opts = opts.SetMaxAllowedForwardingDelayFn(maxAllowedForwardingDelayFn)
   433  
   434  	// Set entry options.
   435  	if c.EntryTTL != 0 {
   436  		opts = opts.SetEntryTTL(c.EntryTTL)
   437  	}
   438  	if c.EntryCheckInterval != 0 {
   439  		opts = opts.SetEntryCheckInterval(c.EntryCheckInterval)
   440  	}
   441  	if c.EntryCheckBatchPercent != 0.0 {
   442  		opts = opts.SetEntryCheckBatchPercent(c.EntryCheckBatchPercent)
   443  	}
   444  	if c.MaxTimerBatchSizePerWrite != 0 {
   445  		opts = opts.SetMaxTimerBatchSizePerWrite(c.MaxTimerBatchSizePerWrite)
   446  	}
   447  
   448  	// Set default storage policies.
   449  	storagePolicies := make([]policy.StoragePolicy, len(c.DefaultStoragePolicies))
   450  	copy(storagePolicies, c.DefaultStoragePolicies)
   451  	opts = opts.SetDefaultStoragePolicies(storagePolicies)
   452  
   453  	// Set cached source sets options.
   454  	if c.MaxNumCachedSourceSets != nil {
   455  		opts = opts.SetMaxNumCachedSourceSets(*c.MaxNumCachedSourceSets)
   456  	}
   457  
   458  	// Set whether to discard NaN aggregated values.
   459  	if c.DiscardNaNAggregatedValues != nil {
   460  		opts = opts.SetDiscardNaNAggregatedValues(*c.DiscardNaNAggregatedValues)
   461  	}
   462  
   463  	// Set counter elem pool.
   464  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("counter-elem-pool"))
   465  	counterElemPoolOpts := c.CounterElemPool.NewObjectPoolOptions(iOpts)
   466  	counterElemPool := aggregator.NewCounterElemPool(counterElemPoolOpts)
   467  	opts = opts.SetCounterElemPool(counterElemPool)
   468  	// use a singleton ElemOptions to avoid allocs per elem.
   469  	elemOpts := aggregator.NewElemOptions(opts)
   470  	counterElemPool.Init(func() *aggregator.CounterElem {
   471  		return aggregator.MustNewCounterElem(aggregator.ElemData{}, elemOpts)
   472  	})
   473  
   474  	// Set timer elem pool.
   475  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("timer-elem-pool"))
   476  	timerElemPoolOpts := c.TimerElemPool.NewObjectPoolOptions(iOpts)
   477  	timerElemPool := aggregator.NewTimerElemPool(timerElemPoolOpts)
   478  	opts = opts.SetTimerElemPool(timerElemPool)
   479  	timerElemPool.Init(func() *aggregator.TimerElem {
   480  		return aggregator.MustNewTimerElem(aggregator.ElemData{}, elemOpts)
   481  	})
   482  
   483  	// Set gauge elem pool.
   484  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("gauge-elem-pool"))
   485  	gaugeElemPoolOpts := c.GaugeElemPool.NewObjectPoolOptions(iOpts)
   486  	gaugeElemPool := aggregator.NewGaugeElemPool(gaugeElemPoolOpts)
   487  	opts = opts.SetGaugeElemPool(gaugeElemPool)
   488  	gaugeElemPool.Init(func() *aggregator.GaugeElem {
   489  		return aggregator.MustNewGaugeElem(aggregator.ElemData{}, elemOpts)
   490  	})
   491  
   492  	// Set entry pool.
   493  	iOpts = instrumentOpts.SetMetricsScope(scope.SubScope("entry-pool"))
   494  	entryPoolOpts := c.EntryPool.NewObjectPoolOptions(iOpts)
   495  	entryPool := aggregator.NewEntryPool(entryPoolOpts)
   496  	runtimeOpts := runtimeOptsManager.RuntimeOptions()
   497  	opts = opts.SetEntryPool(entryPool)
   498  	// allocate metrics only once to reduce memory utilization
   499  	metrics := aggregator.NewEntryMetrics(iOpts.MetricsScope())
   500  	entryPool.Init(func() *aggregator.Entry {
   501  		return aggregator.NewEntryWithMetrics(nil, metrics, runtimeOpts, opts)
   502  	})
   503  
   504  	opts = opts.SetWritesIgnoreCutoffCutover(c.WritesIgnoreCutoffCutover)
   505  
   506  	return opts, nil
   507  }
   508  
   509  // HostIDOrDefault returns the host ID or default.
   510  func (c *AggregatorConfiguration) HostIDOrDefault() hostid.Configuration {
   511  	if c.HostID == nil {
   512  		return hostid.Configuration{
   513  			Resolver: hostid.ConfigResolver,
   514  			Value:    &defaultHostID,
   515  		}
   516  	}
   517  
   518  	return *c.HostID
   519  }
   520  
   521  func (c *AggregatorConfiguration) newInstanceID(address string) (string, error) {
   522  	var (
   523  		hostIDValue string
   524  		err         error
   525  	)
   526  	if c.HostID != nil {
   527  		hostIDValue, err = c.HostID.Resolve()
   528  	} else {
   529  		hostIDValue, err = os.Hostname()
   530  	}
   531  	if err != nil {
   532  		return "", fmt.Errorf("error determining host ID: %v", err)
   533  	}
   534  
   535  	switch c.InstanceID.InstanceIDType {
   536  	case HostIDInstanceIDType:
   537  		return hostIDValue, nil
   538  	case HostIDPortInstanceIDType:
   539  		_, port, err := net.SplitHostPort(address)
   540  		if err != nil {
   541  			return "", fmt.Errorf("error parsing server address %s: %v", address, err)
   542  		}
   543  		return net.JoinHostPort(hostIDValue, port), nil
   544  	default:
   545  		return "", fmt.Errorf("unknown instance ID type: value=%d, str=%s",
   546  			c.InstanceID.InstanceIDType, c.InstanceID.InstanceIDType.String())
   547  	}
   548  }
   549  
   550  func bufferForPastTimedMetricFn(buffer time.Duration) aggregator.BufferForPastTimedMetricFn {
   551  	return func(resolution time.Duration) time.Duration {
   552  		return buffer + resolution
   553  	}
   554  }
   555  
   556  // streamConfiguration contains configuration for quantile-related metric streams.
   557  type streamConfiguration struct {
   558  	// Error epsilon for quantile computation.
   559  	Eps float64 `yaml:"eps"`
   560  
   561  	// Initial sample pool capacity for quantile computation.
   562  	Capacity int `yaml:"capacity"`
   563  
   564  	// Insertion and compression frequency.
   565  	InsertAndCompressEvery int `yaml:"insertAndCompressEvery"`
   566  
   567  	// FlushEvery is deprecated.
   568  	FlushEvery int `yaml:"flushEvery"`
   569  
   570  	// StreamPool is deprecated.
   571  	StreamPool pool.ObjectPoolConfiguration `yaml:"streamPool"`
   572  
   573  	// SamplePool is deprecated.
   574  	SamplePool *pool.ObjectPoolConfiguration `yaml:"samplePool"`
   575  
   576  	// FloatsPool is deprecated.
   577  	FloatsPool pool.BucketizedPoolConfiguration `yaml:"floatsPool"`
   578  }
   579  
   580  func (c *streamConfiguration) NewStreamOptions(_ instrument.Options) (cm.Options, error) {
   581  	opts := cm.NewOptions().
   582  		SetEps(c.Eps).
   583  		SetCapacity(c.Capacity)
   584  
   585  	if c.InsertAndCompressEvery != 0 {
   586  		opts = opts.SetInsertAndCompressEvery(c.InsertAndCompressEvery)
   587  	}
   588  
   589  	if err := opts.Validate(); err != nil {
   590  		return nil, err
   591  	}
   592  	return opts, nil
   593  }
   594  
   595  type placementManagerConfiguration struct {
   596  	KVConfig kv.OverrideConfiguration       `yaml:"kvConfig"`
   597  	Watcher  placement.WatcherConfiguration `yaml:"placementWatcher"`
   598  }
   599  
   600  func (c placementManagerConfiguration) NewPlacementManager(
   601  	client client.Client,
   602  	instanceID string,
   603  	instrumentOpts instrument.Options,
   604  ) (aggregator.PlacementManager, error) {
   605  	kvOpts, err := c.KVConfig.NewOverrideOptions()
   606  	if err != nil {
   607  		return nil, err
   608  	}
   609  	store, err := client.Store(kvOpts)
   610  	if err != nil {
   611  		return nil, err
   612  	}
   613  	scope := instrumentOpts.MetricsScope()
   614  	iOpts := instrumentOpts.SetMetricsScope(scope.SubScope("placement-watcher"))
   615  	placementWatcherOpts := c.Watcher.NewOptions(store, iOpts)
   616  	placementManagerOpts := aggregator.NewPlacementManagerOptions().
   617  		SetInstrumentOptions(instrumentOpts).
   618  		SetInstanceID(instanceID).
   619  		SetWatcherOptions(placementWatcherOpts)
   620  	return aggregator.NewPlacementManager(placementManagerOpts), nil
   621  }
   622  
   623  type forwardingConfiguration struct {
   624  	// MaxSingleDelay is the maximum delay for a single forward step.
   625  	MaxSingleDelay time.Duration `yaml:"maxSingleDelay"`
   626  	// MaxConstDelay is the maximum delay for a forward step as a constant + resolution*numForwardedTimes.
   627  	MaxConstDelay time.Duration `yaml:"maxConstDelay"`
   628  }
   629  
   630  func (c forwardingConfiguration) MaxAllowedForwardingDelayFn(
   631  	jitterEnabled bool,
   632  	maxJitterFn aggregator.FlushJitterFn,
   633  ) aggregator.MaxAllowedForwardingDelayFn {
   634  	if v := c.MaxConstDelay; v > 0 {
   635  		return func(resolution time.Duration, numForwardedTimes int) time.Duration {
   636  			return v + (resolution * time.Duration(numForwardedTimes))
   637  		}
   638  	}
   639  
   640  	return func(resolution time.Duration, numForwardedTimes int) time.Duration {
   641  		// If jittering is enabled, we use max jitter fn to determine the initial jitter.
   642  		// Otherwise, flushing may start at any point within a resolution interval so we
   643  		// assume the full resolution interval may be used for initial jittering.
   644  		initialJitter := resolution
   645  		if jitterEnabled {
   646  			initialJitter = maxJitterFn(resolution)
   647  		}
   648  		return initialJitter + c.MaxSingleDelay*time.Duration(numForwardedTimes)
   649  	}
   650  }
   651  
   652  type flushTimesManagerConfiguration struct {
   653  	// KV Configuration.
   654  	KVConfig kv.OverrideConfiguration `yaml:"kvConfig"`
   655  
   656  	// Flush times key format.
   657  	FlushTimesKeyFmt string `yaml:"flushTimesKeyFmt" validate:"nonzero"`
   658  
   659  	// Retrier for persisting flush times.
   660  	FlushTimesPersistRetrier retry.Configuration `yaml:"flushTimesPersistRetrier"`
   661  }
   662  
   663  func (c flushTimesManagerConfiguration) NewFlushTimesManager(
   664  	client client.Client,
   665  	instrumentOpts instrument.Options,
   666  ) (aggregator.FlushTimesManager, error) {
   667  	kvOpts, err := c.KVConfig.NewOverrideOptions()
   668  	if err != nil {
   669  		return nil, err
   670  	}
   671  	store, err := client.Store(kvOpts)
   672  	if err != nil {
   673  		return nil, err
   674  	}
   675  	scope := instrumentOpts.MetricsScope()
   676  	retrier := c.FlushTimesPersistRetrier.NewRetrier(scope.SubScope("flush-times-persist-retrier"))
   677  	flushTimesManagerOpts := aggregator.NewFlushTimesManagerOptions().
   678  		SetInstrumentOptions(instrumentOpts).
   679  		SetFlushTimesKeyFmt(c.FlushTimesKeyFmt).
   680  		SetFlushTimesStore(store).
   681  		SetFlushTimesPersistRetrier(retrier)
   682  	return aggregator.NewFlushTimesManager(flushTimesManagerOpts), nil
   683  }
   684  
   685  type electionManagerConfiguration struct {
   686  	Election                   electionConfiguration  `yaml:"election"`
   687  	ServiceID                  serviceIDConfiguration `yaml:"serviceID"`
   688  	LeaderValue                string                 `yaml:"leaderValue"`
   689  	ElectionKeyFmt             string                 `yaml:"electionKeyFmt" validate:"nonzero"`
   690  	CampaignRetrier            retry.Configuration    `yaml:"campaignRetrier"`
   691  	ChangeRetrier              retry.Configuration    `yaml:"changeRetrier"`
   692  	ResignRetrier              retry.Configuration    `yaml:"resignRetrier"`
   693  	CampaignStateCheckInterval time.Duration          `yaml:"campaignStateCheckInterval"`
   694  	ShardCutoffCheckOffset     time.Duration          `yaml:"shardCutoffCheckOffset"`
   695  }
   696  
   697  func (c electionManagerConfiguration) NewElectionManager(
   698  	client client.Client,
   699  	instanceID string,
   700  	placementNamespace string,
   701  	placementManager aggregator.PlacementManager,
   702  	flushTimesManager aggregator.FlushTimesManager,
   703  	clockOpts clock.Options,
   704  	instrumentOpts instrument.Options,
   705  ) (aggregator.ElectionManager, error) {
   706  	electionOpts, err := c.Election.NewElectionOptions()
   707  	if err != nil {
   708  		return nil, err
   709  	}
   710  	serviceID := c.ServiceID.NewServiceID()
   711  	namespaceOpts := services.NewNamespaceOptions().SetPlacementNamespace(placementNamespace)
   712  	serviceOpts := services.NewOverrideOptions().SetNamespaceOptions(namespaceOpts)
   713  	svcs, err := client.Services(serviceOpts)
   714  	if err != nil {
   715  		return nil, err
   716  	}
   717  	leaderService, err := svcs.LeaderService(serviceID, electionOpts)
   718  	if err != nil {
   719  		return nil, err
   720  	}
   721  	campaignOpts, err := services.NewCampaignOptions()
   722  	if err != nil {
   723  		return nil, err
   724  	}
   725  	leaderValue := instanceID
   726  	if c.LeaderValue != "" {
   727  		leaderValue = c.LeaderValue
   728  	}
   729  	campaignOpts = campaignOpts.SetLeaderValue(leaderValue)
   730  	scope := instrumentOpts.MetricsScope()
   731  	campaignRetryOpts := c.CampaignRetrier.NewOptions(scope.SubScope("campaign-retrier"))
   732  	changeRetryOpts := c.ChangeRetrier.NewOptions(scope.SubScope("change-retrier"))
   733  	resignRetryOpts := c.ResignRetrier.NewOptions(scope.SubScope("resign-retrier"))
   734  	opts := aggregator.NewElectionManagerOptions().
   735  		SetClockOptions(clockOpts).
   736  		SetInstrumentOptions(instrumentOpts).
   737  		SetElectionOptions(electionOpts).
   738  		SetCampaignOptions(campaignOpts).
   739  		SetCampaignRetryOptions(campaignRetryOpts).
   740  		SetChangeRetryOptions(changeRetryOpts).
   741  		SetResignRetryOptions(resignRetryOpts).
   742  		SetElectionKeyFmt(c.ElectionKeyFmt).
   743  		SetLeaderService(leaderService).
   744  		SetPlacementManager(placementManager).
   745  		SetFlushTimesManager(flushTimesManager)
   746  	if c.CampaignStateCheckInterval != 0 {
   747  		opts = opts.SetCampaignStateCheckInterval(c.CampaignStateCheckInterval)
   748  	}
   749  	if c.ShardCutoffCheckOffset != 0 {
   750  		opts = opts.SetShardCutoffCheckOffset(c.ShardCutoffCheckOffset)
   751  	}
   752  	electionManager := aggregator.NewElectionManager(opts)
   753  	return electionManager, nil
   754  }
   755  
// electionConfiguration holds the election settings read from YAML.
//
// Each field is optional: NewElectionOptions only applies a field when it is
// non-zero, otherwise the value from services.NewElectionOptions is kept.
// LeaderTimeout and ResignTimeout are durations; TTLSeconds is an integer
// passed to SetTTLSecs.
type electionConfiguration struct {
	LeaderTimeout time.Duration `yaml:"leaderTimeout"`
	ResignTimeout time.Duration `yaml:"resignTimeout"`
	TTLSeconds    int           `yaml:"ttlSeconds"`
}
   761  
   762  func (c electionConfiguration) NewElectionOptions() (services.ElectionOptions, error) {
   763  	opts := services.NewElectionOptions()
   764  	if c.LeaderTimeout != 0 {
   765  		opts = opts.SetLeaderTimeout(c.LeaderTimeout)
   766  	}
   767  	if c.ResignTimeout != 0 {
   768  		opts = opts.SetResignTimeout(c.ResignTimeout)
   769  	}
   770  	if c.TTLSeconds != 0 {
   771  		opts = opts.SetTTLSecs(c.TTLSeconds)
   772  	}
   773  	return opts, nil
   774  }
   775  
// serviceIDConfiguration holds the name, environment and zone that
// NewServiceID uses to build a services.ServiceID. Fields left empty are not
// applied to the resulting service ID.
//
// TODO: move this to m3cluster.
type serviceIDConfiguration struct {
	Name        string `yaml:"name"`
	Environment string `yaml:"environment"`
	Zone        string `yaml:"zone"`
}
   782  
   783  func (c serviceIDConfiguration) NewServiceID() services.ServiceID {
   784  	sid := services.NewServiceID()
   785  	if c.Name != "" {
   786  		sid = sid.SetName(c.Name)
   787  	}
   788  	if c.Environment != "" {
   789  		sid = sid.SetEnvironment(c.Environment)
   790  	}
   791  	if c.Zone != "" {
   792  		sid = sid.SetZone(c.Zone)
   793  	}
   794  	return sid
   795  }
   796  
// flushManagerConfiguration holds the flush manager settings read from YAML.
// NewFlushManagerOptions applies each setting only when it is present
// (non-zero / non-nil), otherwise the flush manager default is kept.
type flushManagerConfiguration struct {
	// How frequently the flush manager checks for next flush.
	// Ignored when zero.
	CheckEvery time.Duration `yaml:"checkEvery"`

	// Whether jittering is enabled. A nil pointer means "not configured",
	// which is distinct from an explicit false.
	JitterEnabled *bool `yaml:"jitterEnabled"`

	// Buckets for determining max jitter amounts. When non-nil the list is
	// turned into a max jitter function via jitterBuckets.NewMaxJitterFn.
	MaxJitters []jitterBucket `yaml:"maxJitters"`

	// Number of workers per CPU. When non-zero, the flush worker pool size is
	// this value multiplied by runtime.GOMAXPROCS(0), rounded up, with a
	// minimum of one worker.
	NumWorkersPerCPU float64 `yaml:"numWorkersPerCPU" validate:"min=0.0,max=1.0"`

	// DeprecatedFlushTimesPersistEvery controlled how often flush times were
	// persisted, but is now deprecated.
	DeprecatedFlushTimesPersistEvery time.Duration `yaml:"flushTimesPersistEvery"`

	// Maximum buffer size, expressed as a duration. Ignored when zero.
	MaxBufferSize time.Duration `yaml:"maxBufferSize"`

	// Window size for a forced flush. Ignored when zero.
	ForcedFlushWindowSize time.Duration `yaml:"forcedFlushWindowSize"`
}
   820  
   821  func (c flushManagerConfiguration) NewFlushManagerOptions(
   822  	placementManager aggregator.PlacementManager,
   823  	electionManager aggregator.ElectionManager,
   824  	flushTimesManager aggregator.FlushTimesManager,
   825  	instrumentOpts instrument.Options,
   826  	bufferForPastTimedMetric time.Duration,
   827  ) (aggregator.FlushManagerOptions, error) {
   828  	opts := aggregator.NewFlushManagerOptions().
   829  		SetInstrumentOptions(instrumentOpts).
   830  		SetPlacementManager(placementManager).
   831  		SetElectionManager(electionManager).
   832  		SetFlushTimesManager(flushTimesManager).
   833  		SetBufferForPastTimedMetric(bufferForPastTimedMetric)
   834  	if c.CheckEvery != 0 {
   835  		opts = opts.SetCheckEvery(c.CheckEvery)
   836  	}
   837  	if c.JitterEnabled != nil {
   838  		opts = opts.SetJitterEnabled(*c.JitterEnabled)
   839  	}
   840  	if c.MaxJitters != nil {
   841  		maxJitterFn, err := jitterBuckets(c.MaxJitters).NewMaxJitterFn()
   842  		if err != nil {
   843  			return nil, err
   844  		}
   845  		opts = opts.SetMaxJitterFn(maxJitterFn)
   846  	}
   847  	if c.NumWorkersPerCPU != 0 {
   848  		runtimeCPU := float64(runtime.GOMAXPROCS(0))
   849  		numWorkers := c.NumWorkersPerCPU * runtimeCPU
   850  		workerPoolSize := int(math.Ceil(numWorkers))
   851  		if workerPoolSize < 1 {
   852  			workerPoolSize = 1
   853  		}
   854  		workerPool := sync.NewWorkerPool(workerPoolSize)
   855  		workerPool.Init()
   856  		opts = opts.SetWorkerPool(workerPool)
   857  	}
   858  	if c.MaxBufferSize != 0 {
   859  		opts = opts.SetMaxBufferSize(c.MaxBufferSize)
   860  	}
   861  	if c.ForcedFlushWindowSize != 0 {
   862  		opts = opts.SetForcedFlushWindowSize(c.ForcedFlushWindowSize)
   863  	}
   864  	return opts, nil
   865  }
   866  
// jitterBucket determines the max jitter percent for lists whose flush
// intervals are no more than the bucket flush interval.
//
// Despite its name, MaxJitterPercent is a fraction in the range [0.0, 1.0]:
// the max jitter for an interval is computed as MaxJitterPercent multiplied by
// that interval. FlushInterval must be non-zero.
type jitterBucket struct {
	FlushInterval    time.Duration `yaml:"flushInterval" validate:"nonzero"`
	MaxJitterPercent float64       `yaml:"maxJitterPercent" validate:"min=0.0,max=1.0"`
}
   873  
   874  type jitterBuckets []jitterBucket
   875  
   876  func (buckets jitterBuckets) NewMaxJitterFn() (aggregator.FlushJitterFn, error) {
   877  	numBuckets := len(buckets)
   878  	if numBuckets == 0 {
   879  		return nil, errEmptyJitterBucketList
   880  	}
   881  	res := make([]jitterBucket, numBuckets)
   882  	copy(res, buckets)
   883  	sort.Sort(jitterBucketsByIntervalAscending(res))
   884  
   885  	return func(interval time.Duration) time.Duration {
   886  		idx := sort.Search(numBuckets, func(i int) bool {
   887  			return res[i].FlushInterval >= interval
   888  		})
   889  		if idx == numBuckets {
   890  			idx--
   891  		}
   892  		return time.Duration(res[idx].MaxJitterPercent * float64(interval))
   893  	}, nil
   894  }
   895  
   896  type jitterBucketsByIntervalAscending []jitterBucket
   897  
   898  func (b jitterBucketsByIntervalAscending) Len() int      { return len(b) }
   899  func (b jitterBucketsByIntervalAscending) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
   900  
   901  func (b jitterBucketsByIntervalAscending) Less(i, j int) bool {
   902  	return b[i].FlushInterval < b[j].FlushInterval
   903  }
   904  
// metricPrefixSetter receives a metric prefix as raw bytes and returns the
// aggregator options to use. setMetricPrefix invokes it with the configured
// prefix when one is present.
type metricPrefixSetter func(b []byte) aggregator.Options
   906  
   907  func setMetricPrefix(
   908  	opts aggregator.Options,
   909  	str *string,
   910  	fn metricPrefixSetter,
   911  ) aggregator.Options {
   912  	if str == nil {
   913  		return opts
   914  	}
   915  	return fn([]byte(*str))
   916  }
   917  
// passthroughConfiguration contains the knobs for the pass-through
// server/writer.
type passthroughConfiguration struct {
	// Enabled controls whether the passthrough server/writer is enabled.
	// When false, passthrough writes are sent to a blackhole writer.
	Enabled bool `yaml:"enabled"`

	// NumWriters controls the number of passthrough writers used. Zero means
	// defaultNumPassthroughWriters is used instead.
	NumWriters int `yaml:"numWriters"`
}
   926  
   927  func (c *AggregatorConfiguration) newPassthroughWriter(
   928  	flushHandler handler.Handler,
   929  	iOpts instrument.Options,
   930  	shardFn sharding.AggregatedShardFn,
   931  ) (writer.Writer, error) {
   932  	// fallback gracefully
   933  	if c.Passthrough == nil || !c.Passthrough.Enabled {
   934  		iOpts.Logger().Info("passthrough writer disabled, blackholing all passthrough writes")
   935  		return writer.NewBlackholeWriter(), nil
   936  	}
   937  
   938  	count := defaultNumPassthroughWriters
   939  	if c.Passthrough.NumWriters != 0 {
   940  		count = c.Passthrough.NumWriters
   941  	}
   942  
   943  	writers := make([]writer.Writer, 0, count)
   944  	for i := 0; i < count; i++ {
   945  		writer, err := flushHandler.NewWriter(iOpts.MetricsScope())
   946  		if err != nil {
   947  			return nil, err
   948  		}
   949  		writers = append(writers, writer)
   950  	}
   951  
   952  	return writer.NewShardedWriter(writers, shardFn, iOpts)
   953  }