github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/aggregator/client/tcp_client.go

// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package client

import (
	"errors"
	"fmt"
	"math"
	"time"

	"github.com/uber-go/tally"

	"github.com/m3db/m3/src/aggregator/sharding"
	"github.com/m3db/m3/src/cluster/placement"
	"github.com/m3db/m3/src/cluster/shard"
	"github.com/m3db/m3/src/metrics/metadata"
	"github.com/m3db/m3/src/metrics/metric/aggregated"
	"github.com/m3db/m3/src/metrics/metric/id"
	"github.com/m3db/m3/src/metrics/metric/unaggregated"
	"github.com/m3db/m3/src/metrics/policy"
	"github.com/m3db/m3/src/x/clock"
	xerrors "github.com/m3db/m3/src/x/errors"
)

var (
	_ AdminClient = (*TCPClient)(nil)

	errNilPlacement = errors.New("placement is nil")
)

// TCPClient sends metrics to M3 Aggregator over a custom TCP protocol.
type TCPClient struct {
	nowFn                      clock.NowFn
	shardCutoverWarmupDuration time.Duration
	shardCutoffLingerDuration  time.Duration
	writerMgr                  instanceWriterManager
	shardFn                    sharding.ShardFn
	placementWatcher           placement.Watcher
	metrics                    tcpClientMetrics
}

// NewTCPClient returns a new Protobuf-over-TCP M3 Aggregator client.
func NewTCPClient(opts Options) (*TCPClient, error) {
	if err := opts.Validate(); err != nil {
		return nil, err
	}

	var (
		instrumentOpts   = opts.InstrumentOptions()
		writerMgr        instanceWriterManager
		placementWatcher placement.Watcher
	)

	writerMgrScope := instrumentOpts.MetricsScope().SubScope("writer-manager")
	writerMgrOpts := opts.SetInstrumentOptions(instrumentOpts.SetMetricsScope(writerMgrScope))
	writerMgr, err := newInstanceWriterManager(writerMgrOpts)
	if err != nil {
		return nil, err
	}

	onPlacementChangedFn := func(prev, curr placement.Placement) {
		writerMgr.AddInstances(curr.Instances()) // nolint: errcheck

		if prev != nil {
			writerMgr.RemoveInstances(prev.Instances()) // nolint: errcheck
		}
	}

	placementWatcher = placement.NewPlacementsWatcher(
		opts.WatcherOptions().
			SetOnPlacementChangedFn(onPlacementChangedFn))

	return &TCPClient{
		nowFn:                      opts.ClockOptions().NowFn(),
		shardCutoverWarmupDuration: opts.ShardCutoverWarmupDuration(),
		shardCutoffLingerDuration:  opts.ShardCutoffLingerDuration(),
		writerMgr:                  writerMgr,
		shardFn:                    opts.ShardFn(),
		placementWatcher:           placementWatcher,
		metrics:                    newTCPClientMetrics(instrumentOpts.MetricsScope()),
	}, nil
}

// Init initializes TCPClient.
func (c *TCPClient) Init() error {
	return c.placementWatcher.Watch()
}

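// The function below is an illustrative sketch of how a caller might construct
// the client, initialize it, write an untimed counter, and flush. It assumes an
// Options value built elsewhere (this file does not show how Options is
// constructed), and the metric ID and value are placeholders.
func exampleTCPClientUsage(opts Options) error {
	client, err := NewTCPClient(opts)
	if err != nil {
		return err
	}
	// Init starts watching the placement; writes return an error until a
	// placement is available.
	if err := client.Init(); err != nil {
		return err
	}
	defer client.Close() //nolint:errcheck

	counter := unaggregated.Counter{ID: id.RawID("example.requests"), Value: 1}
	if err := client.WriteUntimedCounter(counter, metadata.DefaultStagedMetadatas); err != nil {
		return err
	}

	// Flush pushes out anything still buffered by the instance writers.
	return client.Flush()
}
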
// WriteUntimedCounter writes untimed counter metrics.
func (c *TCPClient) WriteUntimedCounter(
	counter unaggregated.Counter,
	metadatas metadata.StagedMetadatas,
) error {
	payload := payloadUnion{
		payloadType: untimedType,
		untimed: untimedPayload{
			metric:    counter.ToUnion(),
			metadatas: metadatas,
		},
	}

	c.metrics.writeUntimedCounter.Inc(1)
	return c.write(counter.ID, c.nowFn().UnixNano(), payload)
}

// WriteUntimedBatchTimer writes untimed batch timer metrics.
func (c *TCPClient) WriteUntimedBatchTimer(
	batchTimer unaggregated.BatchTimer,
	metadatas metadata.StagedMetadatas,
) error {
	payload := payloadUnion{
		payloadType: untimedType,
		untimed: untimedPayload{
			metric:    batchTimer.ToUnion(),
			metadatas: metadatas,
		},
	}

	c.metrics.writeUntimedBatchTimer.Inc(1)
	return c.write(batchTimer.ID, c.nowFn().UnixNano(), payload)
}

// WriteUntimedGauge writes untimed gauge metrics.
func (c *TCPClient) WriteUntimedGauge(
	gauge unaggregated.Gauge,
	metadatas metadata.StagedMetadatas,
) error {
	payload := payloadUnion{
		payloadType: untimedType,
		untimed: untimedPayload{
			metric:    gauge.ToUnion(),
			metadatas: metadatas,
		},
	}

	c.metrics.writeUntimedGauge.Inc(1)
	return c.write(gauge.ID, c.nowFn().UnixNano(), payload)
}

// WriteTimed writes timed metrics.
func (c *TCPClient) WriteTimed(
	metric aggregated.Metric,
	metadata metadata.TimedMetadata,
) error {
	payload := payloadUnion{
		payloadType: timedType,
		timed: timedPayload{
			metric:   metric,
			metadata: metadata,
		},
	}

	c.metrics.writeForwarded.Inc(1)
	return c.write(metric.ID, metric.TimeNanos, payload)
}

// WritePassthrough writes passthrough metrics.
func (c *TCPClient) WritePassthrough(
	metric aggregated.Metric,
	storagePolicy policy.StoragePolicy,
) error {
	payload := payloadUnion{
		payloadType: passthroughType,
		passthrough: passthroughPayload{
			metric:        metric,
			storagePolicy: storagePolicy,
		},
	}

	c.metrics.writePassthrough.Inc(1)
	return c.write(metric.ID, metric.TimeNanos, payload)
}

// WriteTimedWithStagedMetadatas writes timed metrics with staged metadatas.
func (c *TCPClient) WriteTimedWithStagedMetadatas(
	metric aggregated.Metric,
	metadatas metadata.StagedMetadatas,
) error {
	payload := payloadUnion{
		payloadType: timedWithStagedMetadatasType,
		timedWithStagedMetadatas: timedWithStagedMetadatas{
			metric:    metric,
			metadatas: metadatas,
		},
	}

	c.metrics.writeForwarded.Inc(1)
	return c.write(metric.ID, metric.TimeNanos, payload)
}

// WriteForwarded writes forwarded metrics.
func (c *TCPClient) WriteForwarded(
	metric aggregated.ForwardedMetric,
	metadata metadata.ForwardMetadata,
) error {
	payload := payloadUnion{
		payloadType: forwardedType,
		forwarded: forwardedPayload{
			metric:   metric,
			metadata: metadata,
		},
	}

	c.metrics.writeForwarded.Inc(1)
	return c.write(metric.ID, metric.TimeNanos, payload)
}

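// Illustrative sketch: writing a timed metric with staged metadatas. Unlike the
// untimed writes above, the metric carries its own timestamp in TimeNanos, and
// that timestamp (not the wall clock) is what the client uses for the shard
// cutover/cutoff checks. The ID, value, and timestamp here are placeholders.
func exampleWriteTimedWithStagedMetadatas(c *TCPClient, timestampNanos int64) error {
	m := aggregated.Metric{
		ID:        id.RawID("example.latency"),
		TimeNanos: timestampNanos,
		Value:     42.0,
	}
	return c.WriteTimedWithStagedMetadatas(m, metadata.DefaultStagedMetadatas)
}
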
// ActivePlacement returns a copy of the currently active placement and its version.
func (c *TCPClient) ActivePlacement() (placement.Placement, int, error) {
	placement, err := c.placementWatcher.Get()
	if err != nil {
		return nil, 0, err
	}
	if placement == nil {
		return nil, 0, errNilPlacement
	}

	return placement.Clone(), placement.Version(), nil
}

// ActivePlacementVersion returns the version of the currently active placement. It is far
// less expensive than ActivePlacement, as it does not clone the placement.
func (c *TCPClient) ActivePlacementVersion() (int, error) {
	placement, err := c.placementWatcher.Get()
	if err != nil {
		return 0, err
	}
	if placement == nil {
		return 0, errNilPlacement
	}

	return placement.Version(), nil
}

// Flush flushes any remaining data buffered by the client.
func (c *TCPClient) Flush() error {
	c.metrics.flush.Inc(1)
	return c.writerMgr.Flush()
}

// Close closes the client.
func (c *TCPClient) Close() error {
	c.writerMgr.Flush()          //nolint:errcheck
	c.placementWatcher.Unwatch() //nolint:errcheck
	// writerMgr returns an error if it is closed more than once
	return c.writerMgr.Close()
}

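// write hashes the metric ID to a shard and fans the payload out to every
// placement instance that owns that shard and is writeable at the given
// timestamp; the metric is counted as dropped only if no instance accepts it.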
//nolint:gocritic
func (c *TCPClient) write(
	metricID id.RawID,
	timeNanos int64,
	payload payloadUnion,
) error {
	placement, err := c.placementWatcher.Get()
	if err != nil {
		return err
	}
	if placement == nil {
		return errNilPlacement
	}
	var (
		shardID            = c.shardFn(metricID, uint32(placement.NumShards()))
		instances          = placement.InstancesForShard(shardID)
		multiErr           = xerrors.NewMultiError()
		oneOrMoreSucceeded = false
	)
	for _, instance := range instances {
		// NB(xichen): the shard should technically always be found because the instances
		// are computed from the placement, but protect against errors here regardless.
		shard, ok := instance.Shards().Shard(shardID)
		if !ok {
			err = fmt.Errorf("instance %s does not own shard %d", instance.ID(), shardID)
			multiErr = multiErr.Add(err)
			c.metrics.shardNotOwned.Inc(1)
			continue
		}
		if !c.shouldWriteForShard(timeNanos, shard) {
			c.metrics.shardNotWriteable.Inc(1)
			continue
		}
		if err = c.writerMgr.Write(instance, shardID, payload); err != nil {
			multiErr = multiErr.Add(err)
			continue
		}

		oneOrMoreSucceeded = true
	}

	if !oneOrMoreSucceeded {
		// unrectifiable loss
		c.metrics.dropped.Inc(1)
	}

	return multiErr.FinalError()
}

func (c *TCPClient) shouldWriteForShard(nowNanos int64, shard shard.Shard) bool {
	writeEarliestNanos, writeLatestNanos := c.writeTimeRangeFor(shard)
	return nowNanos >= writeEarliestNanos && nowNanos <= writeLatestNanos
}

// writeTimeRangeFor returns the time range for writes going to a given shard.
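// For example (illustrative numbers): with a cutover warmup of 10m and a cutoff
// linger of 1m, a shard with cutoverNanos = T and a finite cutoffNanos = C
// accepts writes for timestamps in [T-10m, C+1m]; if cutoffNanos is
// math.MaxInt64 the window stays open-ended on the right.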
func (c *TCPClient) writeTimeRangeFor(shard shard.Shard) (int64, int64) {
	var (
		cutoverNanos  = shard.CutoverNanos()
		cutoffNanos   = shard.CutoffNanos()
		earliestNanos = int64(0)
		latestNanos   = int64(math.MaxInt64)
	)

	if cutoverNanos >= int64(c.shardCutoverWarmupDuration) {
		earliestNanos = cutoverNanos - int64(c.shardCutoverWarmupDuration)
	}

	if cutoffNanos <= math.MaxInt64-int64(c.shardCutoffLingerDuration) {
		latestNanos = cutoffNanos + int64(c.shardCutoffLingerDuration)
	}
	return earliestNanos, latestNanos
}

type tcpClientMetrics struct {
	writeUntimedCounter    tally.Counter
	writeUntimedBatchTimer tally.Counter
	writeUntimedGauge      tally.Counter
	writePassthrough       tally.Counter
	writeForwarded         tally.Counter
	flush                  tally.Counter
	shardNotOwned          tally.Counter
	shardNotWriteable      tally.Counter
	dropped                tally.Counter
}

func newTCPClientMetrics(
	scope tally.Scope,
) tcpClientMetrics {
	return tcpClientMetrics{
		writeUntimedCounter:    scope.Counter("writeUntimedCounter"),
		writeUntimedBatchTimer: scope.Counter("writeUntimedBatchTimer"),
		writeUntimedGauge:      scope.Counter("writeUntimedGauge"),
		writePassthrough:       scope.Counter("writePassthrough"),
		writeForwarded:         scope.Counter("writeForwarded"),
		flush:                  scope.Counter("flush"),
		shardNotOwned:          scope.Counter("shard-not-owned"),
		shardNotWriteable:      scope.Counter("shard-not-writeable"),
		dropped:                scope.Counter("dropped"),
	}
}