// github.com/m3db/m3@v1.5.0/src/aggregator/client/tcp_client.go

// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

20 21 package client 22 23 import ( 24 "errors" 25 "fmt" 26 "math" 27 "time" 28 29 "github.com/uber-go/tally" 30 31 "github.com/m3db/m3/src/aggregator/sharding" 32 "github.com/m3db/m3/src/cluster/placement" 33 "github.com/m3db/m3/src/cluster/shard" 34 "github.com/m3db/m3/src/metrics/metadata" 35 "github.com/m3db/m3/src/metrics/metric/aggregated" 36 "github.com/m3db/m3/src/metrics/metric/id" 37 "github.com/m3db/m3/src/metrics/metric/unaggregated" 38 "github.com/m3db/m3/src/metrics/policy" 39 "github.com/m3db/m3/src/x/clock" 40 xerrors "github.com/m3db/m3/src/x/errors" 41 ) 42 43 var ( 44 _ AdminClient = (*TCPClient)(nil) 45 46 errNilPlacement = errors.New("placement is nil") 47 ) 48 49 // TCPClient sends metrics to M3 Aggregator via over custom TCP protocol. 50 type TCPClient struct { 51 nowFn clock.NowFn 52 shardCutoverWarmupDuration time.Duration 53 shardCutoffLingerDuration time.Duration 54 writerMgr instanceWriterManager 55 shardFn sharding.ShardFn 56 placementWatcher placement.Watcher 57 metrics tcpClientMetrics 58 } 59 60 // NewTCPClient returns new Protobuf over TCP M3 Aggregator client. 61 func NewTCPClient(opts Options) (*TCPClient, error) { 62 if err := opts.Validate(); err != nil { 63 return nil, err 64 } 65 66 var ( 67 instrumentOpts = opts.InstrumentOptions() 68 writerMgr instanceWriterManager 69 placementWatcher placement.Watcher 70 ) 71 72 writerMgrScope := instrumentOpts.MetricsScope().SubScope("writer-manager") 73 writerMgrOpts := opts.SetInstrumentOptions(instrumentOpts.SetMetricsScope(writerMgrScope)) 74 writerMgr, err := newInstanceWriterManager(writerMgrOpts) 75 if err != nil { 76 return nil, err 77 } 78 79 onPlacementChangedFn := func(prev, curr placement.Placement) { 80 writerMgr.AddInstances(curr.Instances()) // nolint: errcheck 81 82 if prev != nil { 83 writerMgr.RemoveInstances(prev.Instances()) // nolint: errcheck 84 } 85 } 86 87 placementWatcher = placement.NewPlacementsWatcher( 88 opts.WatcherOptions(). 
89 SetOnPlacementChangedFn(onPlacementChangedFn)) 90 91 return &TCPClient{ 92 nowFn: opts.ClockOptions().NowFn(), 93 shardCutoverWarmupDuration: opts.ShardCutoverWarmupDuration(), 94 shardCutoffLingerDuration: opts.ShardCutoffLingerDuration(), 95 writerMgr: writerMgr, 96 shardFn: opts.ShardFn(), 97 placementWatcher: placementWatcher, 98 metrics: newTCPClientMetrics(instrumentOpts.MetricsScope()), 99 }, nil 100 } 101 102 // Init initializes TCPClient. 103 func (c *TCPClient) Init() error { 104 return c.placementWatcher.Watch() 105 } 106 107 // WriteUntimedCounter writes untimed counter metrics. 108 func (c *TCPClient) WriteUntimedCounter( 109 counter unaggregated.Counter, 110 metadatas metadata.StagedMetadatas, 111 ) error { 112 payload := payloadUnion{ 113 payloadType: untimedType, 114 untimed: untimedPayload{ 115 metric: counter.ToUnion(), 116 metadatas: metadatas, 117 }, 118 } 119 120 c.metrics.writeUntimedCounter.Inc(1) 121 return c.write(counter.ID, c.nowFn().UnixNano(), payload) 122 } 123 124 // WriteUntimedBatchTimer writes untimed batch timer metrics. 125 func (c *TCPClient) WriteUntimedBatchTimer( 126 batchTimer unaggregated.BatchTimer, 127 metadatas metadata.StagedMetadatas, 128 ) error { 129 payload := payloadUnion{ 130 payloadType: untimedType, 131 untimed: untimedPayload{ 132 metric: batchTimer.ToUnion(), 133 metadatas: metadatas, 134 }, 135 } 136 137 c.metrics.writeUntimedBatchTimer.Inc(1) 138 return c.write(batchTimer.ID, c.nowFn().UnixNano(), payload) 139 } 140 141 // WriteUntimedGauge writes untimed gauge metrics. 142 func (c *TCPClient) WriteUntimedGauge( 143 gauge unaggregated.Gauge, 144 metadatas metadata.StagedMetadatas, 145 ) error { 146 payload := payloadUnion{ 147 payloadType: untimedType, 148 untimed: untimedPayload{ 149 metric: gauge.ToUnion(), 150 metadatas: metadatas, 151 }, 152 } 153 154 c.metrics.writeUntimedGauge.Inc(1) 155 return c.write(gauge.ID, c.nowFn().UnixNano(), payload) 156 } 157 158 // WriteTimed writes timed metrics. 
159 func (c *TCPClient) WriteTimed( 160 metric aggregated.Metric, 161 metadata metadata.TimedMetadata, 162 ) error { 163 payload := payloadUnion{ 164 payloadType: timedType, 165 timed: timedPayload{ 166 metric: metric, 167 metadata: metadata, 168 }, 169 } 170 171 c.metrics.writeForwarded.Inc(1) 172 return c.write(metric.ID, metric.TimeNanos, payload) 173 } 174 175 // WritePassthrough writes passthrough metrics. 176 func (c *TCPClient) WritePassthrough( 177 metric aggregated.Metric, 178 storagePolicy policy.StoragePolicy, 179 ) error { 180 payload := payloadUnion{ 181 payloadType: passthroughType, 182 passthrough: passthroughPayload{ 183 metric: metric, 184 storagePolicy: storagePolicy, 185 }, 186 } 187 188 c.metrics.writePassthrough.Inc(1) 189 return c.write(metric.ID, metric.TimeNanos, payload) 190 } 191 192 // WriteTimedWithStagedMetadatas writes timed metrics with staged metadatas. 193 func (c *TCPClient) WriteTimedWithStagedMetadatas( 194 metric aggregated.Metric, 195 metadatas metadata.StagedMetadatas, 196 ) error { 197 payload := payloadUnion{ 198 payloadType: timedWithStagedMetadatasType, 199 timedWithStagedMetadatas: timedWithStagedMetadatas{ 200 metric: metric, 201 metadatas: metadatas, 202 }, 203 } 204 205 c.metrics.writeForwarded.Inc(1) 206 return c.write(metric.ID, metric.TimeNanos, payload) 207 } 208 209 // WriteForwarded writes forwarded metrics. 210 func (c *TCPClient) WriteForwarded( 211 metric aggregated.ForwardedMetric, 212 metadata metadata.ForwardMetadata, 213 ) error { 214 payload := payloadUnion{ 215 payloadType: forwardedType, 216 forwarded: forwardedPayload{ 217 metric: metric, 218 metadata: metadata, 219 }, 220 } 221 222 c.metrics.writeForwarded.Inc(1) 223 return c.write(metric.ID, metric.TimeNanos, payload) 224 } 225 226 // ActivePlacement returns a copy of the currently active placement and its version. 
227 func (c *TCPClient) ActivePlacement() (placement.Placement, int, error) { 228 placement, err := c.placementWatcher.Get() 229 if err != nil { 230 return nil, 0, err 231 } 232 if placement == nil { 233 return nil, 0, errNilPlacement 234 } 235 236 return placement.Clone(), placement.Version(), nil 237 } 238 239 // ActivePlacementVersion returns a copy of the currently active placement version. It is a far less expensive call 240 // than ActivePlacement, as it does not clone the placement. 241 func (c *TCPClient) ActivePlacementVersion() (int, error) { 242 placement, err := c.placementWatcher.Get() 243 if err != nil { 244 return 0, err 245 } 246 if placement == nil { 247 return 0, errNilPlacement 248 } 249 250 return placement.Version(), nil 251 } 252 253 // Flush flushes any remaining data buffered by the client. 254 func (c *TCPClient) Flush() error { 255 c.metrics.flush.Inc(1) 256 return c.writerMgr.Flush() 257 } 258 259 // Close closes the client. 260 func (c *TCPClient) Close() error { 261 c.writerMgr.Flush() //nolint:errcheck 262 c.placementWatcher.Unwatch() //nolint:errcheck 263 // writerMgr errors out if trying to close twice 264 return c.writerMgr.Close() 265 } 266 267 //nolint:gocritic 268 func (c *TCPClient) write( 269 metricID id.RawID, 270 timeNanos int64, 271 payload payloadUnion, 272 ) error { 273 placement, err := c.placementWatcher.Get() 274 if err != nil { 275 return err 276 } 277 if placement == nil { 278 return errNilPlacement 279 } 280 var ( 281 shardID = c.shardFn(metricID, uint32(placement.NumShards())) 282 instances = placement.InstancesForShard(shardID) 283 multiErr = xerrors.NewMultiError() 284 oneOrMoreSucceeded = false 285 ) 286 for _, instance := range instances { 287 // NB(xichen): the shard should technically always be found because the instances 288 // are computed from the placement, but protect against errors here regardless. 
289 shard, ok := instance.Shards().Shard(shardID) 290 if !ok { 291 err = fmt.Errorf("instance %s does not own shard %d", instance.ID(), shardID) 292 multiErr = multiErr.Add(err) 293 c.metrics.shardNotOwned.Inc(1) 294 continue 295 } 296 if !c.shouldWriteForShard(timeNanos, shard) { 297 c.metrics.shardNotWriteable.Inc(1) 298 continue 299 } 300 if err = c.writerMgr.Write(instance, shardID, payload); err != nil { 301 multiErr = multiErr.Add(err) 302 continue 303 } 304 305 oneOrMoreSucceeded = true 306 } 307 308 if !oneOrMoreSucceeded { 309 // unrectifiable loss 310 c.metrics.dropped.Inc(1) 311 } 312 313 return multiErr.FinalError() 314 } 315 316 func (c *TCPClient) shouldWriteForShard(nowNanos int64, shard shard.Shard) bool { 317 writeEarliestNanos, writeLatestNanos := c.writeTimeRangeFor(shard) 318 return nowNanos >= writeEarliestNanos && nowNanos <= writeLatestNanos 319 } 320 321 // writeTimeRangeFor returns the time range for writes going to a given shard. 322 func (c *TCPClient) writeTimeRangeFor(shard shard.Shard) (int64, int64) { 323 var ( 324 cutoverNanos = shard.CutoverNanos() 325 cutoffNanos = shard.CutoffNanos() 326 earliestNanos = int64(0) 327 latestNanos = int64(math.MaxInt64) 328 ) 329 330 if cutoverNanos >= int64(c.shardCutoverWarmupDuration) { 331 earliestNanos = cutoverNanos - int64(c.shardCutoverWarmupDuration) 332 } 333 334 if cutoffNanos <= math.MaxInt64-int64(c.shardCutoffLingerDuration) { 335 latestNanos = cutoffNanos + int64(c.shardCutoffLingerDuration) 336 } 337 return earliestNanos, latestNanos 338 } 339 340 type tcpClientMetrics struct { 341 writeUntimedCounter tally.Counter 342 writeUntimedBatchTimer tally.Counter 343 writeUntimedGauge tally.Counter 344 writePassthrough tally.Counter 345 writeForwarded tally.Counter 346 flush tally.Counter 347 shardNotOwned tally.Counter 348 shardNotWriteable tally.Counter 349 dropped tally.Counter 350 } 351 352 func newTCPClientMetrics( 353 scope tally.Scope, 354 ) tcpClientMetrics { 355 return 
tcpClientMetrics{ 356 writeUntimedCounter: scope.Counter("writeUntimedCounter"), 357 writeUntimedBatchTimer: scope.Counter("writeUntimedBatchTimer"), 358 writeUntimedGauge: scope.Counter("writeUntimedGauge"), 359 writePassthrough: scope.Counter("writePassthrough"), 360 writeForwarded: scope.Counter("writeForwarded"), 361 flush: scope.Counter("flush"), 362 shardNotOwned: scope.Counter("shard-not-owned"), 363 shardNotWriteable: scope.Counter("shard-not-writeable"), 364 dropped: scope.Counter("dropped"), 365 } 366 }