github.com/m3db/m3@v1.5.0/src/dbnode/client/session.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package client

import (
	"bytes"
	gocontext "context"
	"errors"
	"fmt"
	"math"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/m3db/m3/src/cluster/shard"
	"github.com/m3db/m3/src/dbnode/digest"
	"github.com/m3db/m3/src/dbnode/encoding"
	"github.com/m3db/m3/src/dbnode/generated/thrift/rpc"
	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/network/server/tchannelthrift/convert"
	"github.com/m3db/m3/src/dbnode/runtime"
	"github.com/m3db/m3/src/dbnode/storage/block"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
	"github.com/m3db/m3/src/dbnode/storage/index"
	idxconvert "github.com/m3db/m3/src/dbnode/storage/index/convert"
	"github.com/m3db/m3/src/dbnode/topology"
	"github.com/m3db/m3/src/dbnode/ts"
	"github.com/m3db/m3/src/dbnode/x/xio"
	"github.com/m3db/m3/src/dbnode/x/xpool"
	"github.com/m3db/m3/src/x/checked"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/context"
	xerrors "github.com/m3db/m3/src/x/errors"
	"github.com/m3db/m3/src/x/ident"
	"github.com/m3db/m3/src/x/instrument"
	"github.com/m3db/m3/src/x/pool"
	xresource "github.com/m3db/m3/src/x/resource"
	xretry "github.com/m3db/m3/src/x/retry"
	"github.com/m3db/m3/src/x/sampler"
	"github.com/m3db/m3/src/x/serialize"
	xsync "github.com/m3db/m3/src/x/sync"
	xtime "github.com/m3db/m3/src/x/time"

	apachethrift "github.com/apache/thrift/lib/go/thrift"
	"github.com/uber-go/tally"
	"github.com/uber/tchannel-go/thrift"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

const (
	clusterConnectWaitInterval       = 10 * time.Millisecond
	gaugeReportInterval              = 500 * time.Millisecond
	blockMetadataChBufSize           = 65536
	hostNotAvailableMinSleepInterval = 1 * time.Millisecond
	hostNotAvailableMaxSleepInterval = 100 * time.Millisecond
)

type resultTypeEnum string

const (
	resultTypeMetadata  resultTypeEnum = "metadata"
	resultTypeBootstrap                = "bootstrap"
	resultTypeRaw                      = "raw"
)

var (
	errUnknownWriteAttemptType = errors.New(
		"unknown write attempt type specified, internal error")
)

var (
	// ErrClusterConnectTimeout is raised when connecting to the cluster and
	// ensuring at least each partition has an up node with a connection to it.
	ErrClusterConnectTimeout = errors.New("timed out establishing min connections to cluster")
	// errSessionStatusNotInitial is raised when trying to open a session and
	// it's not in the initial clean state.
	errSessionStatusNotInitial = errors.New("session not in initial state")
	// ErrSessionStatusNotOpen is raised when operations are requested when the
	// session is not in the open state.
	ErrSessionStatusNotOpen = errors.New("session not in open state")
	// errSessionBadBlockResultFromPeer is raised when there is a bad block
	// return from a peer when fetching blocks from peers.
	errSessionBadBlockResultFromPeer = errors.New("session fetched bad block result from peer")
	// errSessionInvalidConnectClusterConnectConsistencyLevel is raised when
	// the connect consistency level specified is not recognized.
	errSessionInvalidConnectClusterConnectConsistencyLevel = errors.New(
		"session has invalid connect consistency level specified",
	)
	// errSessionHasNoHostQueueForHost is raised when a host queue is requested for a missing host.
	errSessionHasNoHostQueueForHost = newHostNotAvailableError(errors.New("session has no host queue for host"))
	// errUnableToEncodeTags is raised when the server is unable to encode provided tags
	// to be sent over the wire.
	errUnableToEncodeTags = errors.New("unable to include tags")
	// errEnqueueChIsClosed is returned when attempting to use a closed enqueueCh.
	errEnqueueChIsClosed = errors.New("error enqueueCh is closed")
)

// sessionState is volatile state that is protected by a
// read/write mutex.
type sessionState struct {
	sync.RWMutex

	status status

	writeLevel     topology.ConsistencyLevel
	readLevel      topology.ReadConsistencyLevel
	bootstrapLevel topology.ReadConsistencyLevel

	queues         []hostQueue
	queuesByHostID map[string]hostQueue
	topo           topology.Topology
	topoMap        topology.Map
	topoWatch      topology.MapWatch
	replicas       int
	majority       int
}
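
// An illustrative resolution of the effective read consistency level
// (hypothetical call site, not part of the original source): a non-nil
// per-request override takes precedence over the session-wide default,
// assuming the caller already holds the state read lock:
//
//	level := s.state.readConsistencyLevelWithRLock(opts.readConsistencyLevel)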

func (s *sessionState) readConsistencyLevelWithRLock(
	override *topology.ReadConsistencyLevel,
) topology.ReadConsistencyLevel {
	if override == nil {
		return s.readLevel
	}
	return *override
}

type session struct {
	state                                sessionState
	opts                                 Options
	runtimeOptsListenerCloser            xresource.SimpleCloser
	scope                                tally.Scope
	nowFn                                clock.NowFn
	log                                  *zap.Logger
	logWriteErrorSampler                 *sampler.Sampler
	logFetchErrorSampler                 *sampler.Sampler
	newHostQueueFn                       newHostQueueFn
	writeRetrier                         xretry.Retrier
	fetchRetrier                         xretry.Retrier
	streamBlocksRetrier                  xretry.Retrier
	pools                                sessionPools
	fetchBatchSize                       int
	newPeerBlocksQueueFn                 newPeerBlocksQueueFn
	reattemptStreamBlocksFromPeersFn     reattemptStreamBlocksFromPeersFn
	pickBestPeerFn                       pickBestPeerFn
	healthCheckNewConnFn                 healthCheckFn
	origin                               topology.Host
	streamBlocksMaxBlockRetries          int
	streamBlocksWorkers                  xsync.WorkerPool
	streamBlocksBatchSize                int
	streamBlocksMetadataBatchTimeout     time.Duration
	streamBlocksBatchTimeout             time.Duration
	writeShardsInitializing              bool
	shardsLeavingCountTowardsConsistency bool
	metrics                              sessionMetrics
}

type shardMetricsKey struct {
	shardID    uint32
	resultType resultTypeEnum
}

type sessionMetrics struct {
	sync.RWMutex
	writeSuccess                         tally.Counter
	writeErrorsBadRequest                tally.Counter
	writeErrorsInternalError             tally.Counter
	writeLatencyHistogram                tally.Histogram
	writeNodesRespondingErrors           []tally.Counter
	writeNodesRespondingBadRequestErrors []tally.Counter
	fetchSuccess                         tally.Counter
	fetchErrorsBadRequest                tally.Counter
	fetchErrorsInternalError             tally.Counter
	fetchLatencyHistogram                tally.Histogram
	fetchNodesRespondingErrors           []tally.Counter
	fetchNodesRespondingBadRequestErrors []tally.Counter
	topologyUpdatedSuccess               tally.Counter
	topologyUpdatedError                 tally.Counter
	streamFromPeersMetrics               map[shardMetricsKey]streamFromPeersMetrics
}

func newSessionMetrics(scope tally.Scope) sessionMetrics {
	return sessionMetrics{
		writeSuccess: scope.Counter("write.success"),
		writeErrorsBadRequest: scope.Tagged(map[string]string{
			"error_type": "bad_request",
		}).Counter("write.errors"),
		writeErrorsInternalError: scope.Tagged(map[string]string{
			"error_type": "internal_error",
		}).Counter("write.errors"),
		writeLatencyHistogram: histogramWithDurationBuckets(scope, "write.latency"),
		fetchSuccess: scope.Counter("fetch.success"),
		fetchErrorsBadRequest: scope.Tagged(map[string]string{
			"error_type": "bad_request",
		}).Counter("fetch.errors"),
		fetchErrorsInternalError: scope.Tagged(map[string]string{
			"error_type": "internal_error",
		}).Counter("fetch.errors"),
		fetchLatencyHistogram:  histogramWithDurationBuckets(scope, "fetch.latency"),
		topologyUpdatedSuccess: scope.Counter("topology.updated-success"),
		topologyUpdatedError:   scope.Counter("topology.updated-error"),
		streamFromPeersMetrics: make(map[shardMetricsKey]streamFromPeersMetrics),
	}
}
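
// Note that the write and fetch error counters above share a single metric
// name per operation ("write.errors", "fetch.errors") and are distinguished
// by the "error_type" tag, so dashboards can aggregate totals or split by
// cause without tracking extra metric names.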

type streamFromPeersMetrics struct {
	fetchBlocksFromPeers       tally.Gauge
	metadataFetches            tally.Gauge
	metadataFetchBatchCall     tally.Counter
	metadataFetchBatchSuccess  tally.Counter
	metadataFetchBatchError    tally.Counter
	metadataFetchBatchBlockErr tally.Counter
	metadataReceived           tally.Counter
	metadataPeerRetry          tally.Counter
	fetchBlockSuccess          tally.Counter
	fetchBlockError            tally.Counter
	fetchBlockFullRetry        tally.Counter
	fetchBlockFinalError       tally.Counter
	fetchBlockRetriesReqError  tally.Counter
	fetchBlockRetriesRespError tally.Counter
	fetchBlockRetriesConsistencyLevelNotAchievedError tally.Counter
	blocksEnqueueChannel                              tally.Gauge
}

type hostQueueOpts struct {
	writeBatchRawRequestPool                     writeBatchRawRequestPool
	writeBatchRawV2RequestPool                   writeBatchRawV2RequestPool
	writeBatchRawRequestElementArrayPool         writeBatchRawRequestElementArrayPool
	writeBatchRawV2RequestElementArrayPool       writeBatchRawV2RequestElementArrayPool
	writeTaggedBatchRawRequestPool               writeTaggedBatchRawRequestPool
	writeTaggedBatchRawV2RequestPool             writeTaggedBatchRawV2RequestPool
	writeTaggedBatchRawRequestElementArrayPool   writeTaggedBatchRawRequestElementArrayPool
	writeTaggedBatchRawV2RequestElementArrayPool writeTaggedBatchRawV2RequestElementArrayPool
	fetchBatchRawV2RequestPool                   fetchBatchRawV2RequestPool
	fetchBatchRawV2RequestElementArrayPool       fetchBatchRawV2RequestElementArrayPool
	opts                                         Options
}

type newHostQueueFn func(
	host topology.Host,
	hostQueueOpts hostQueueOpts,
) (hostQueue, error)

func newSession(opts Options) (clientSession, error) {
	topo, err := opts.TopologyInitializer().Init()
	if err != nil {
		return nil, err
	}

	logWriteErrorSampler, err := sampler.NewSampler(opts.LogErrorSampleRate())
	if err != nil {
		return nil, err
	}

	logFetchErrorSampler, err := sampler.NewSampler(opts.LogErrorSampleRate())
	if err != nil {
		return nil, err
	}

	scope := opts.InstrumentOptions().MetricsScope()

	s := &session{
		state: sessionState{
			writeLevel:     opts.WriteConsistencyLevel(),
			readLevel:      opts.ReadConsistencyLevel(),
			queuesByHostID: make(map[string]hostQueue),
			topo:           topo,
		},
		opts:                 opts,
		scope:                scope,
		nowFn:                opts.ClockOptions().NowFn(),
		log:                  opts.InstrumentOptions().Logger(),
		logWriteErrorSampler: logWriteErrorSampler,
		logFetchErrorSampler: logFetchErrorSampler,
		newHostQueueFn:       newHostQueue,
		fetchBatchSize:       opts.FetchBatchSize(),
		newPeerBlocksQueueFn: newPeerBlocksQueue,
		healthCheckNewConnFn: healthCheck,
		writeRetrier:         opts.WriteRetrier(),
		fetchRetrier:         opts.FetchRetrier(),
		pools: sessionPools{
			context:      opts.ContextPool(),
			checkedBytes: opts.CheckedBytesPool(),
			id:           opts.IdentifierPool(),
		},
		writeShardsInitializing:              opts.WriteShardsInitializing(),
		shardsLeavingCountTowardsConsistency: opts.ShardsLeavingCountTowardsConsistency(),
		metrics:                              newSessionMetrics(scope),
	}
	s.reattemptStreamBlocksFromPeersFn = s.streamBlocksReattemptFromPeers
	s.pickBestPeerFn = s.streamBlocksPickBestPeer

	writeAttemptPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.WriteOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("write-attempt-pool"),
		))
	s.pools.writeAttempt = newWriteAttemptPool(s, writeAttemptPoolOpts)
	s.pools.writeAttempt.Init()

	fetchAttemptPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("fetch-attempt-pool"),
		))
	s.pools.fetchAttempt = newFetchAttemptPool(s, fetchAttemptPoolOpts)
	s.pools.fetchAttempt.Init()

	fetchTaggedAttemptPoolImplOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("fetch-tagged-attempt-pool"),
		))
	s.pools.fetchTaggedAttempt = newFetchTaggedAttemptPool(s, fetchTaggedAttemptPoolImplOpts)
	s.pools.fetchTaggedAttempt.Init()

	aggregateAttemptPoolImplOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("aggregate-attempt-pool"),
		))
	s.pools.aggregateAttempt = newAggregateAttemptPool(s, aggregateAttemptPoolImplOpts)
	s.pools.aggregateAttempt.Init()

	tagEncoderPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.TagEncoderPoolSize().IsDynamic()).
		SetSize(int(s.opts.TagEncoderPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("tag-encoder-pool"),
		))
	s.pools.tagEncoder = serialize.NewTagEncoderPool(opts.TagEncoderOptions(), tagEncoderPoolOpts)
	s.pools.tagEncoder.Init()

	tagDecoderPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.TagDecoderPoolSize().IsDynamic()).
		SetSize(int(s.opts.TagDecoderPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("tag-decoder-pool"),
		))
	s.pools.tagDecoder = serialize.NewTagDecoderPool(opts.TagDecoderOptions(), tagDecoderPoolOpts)
	s.pools.tagDecoder.Init()

	wrapperPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.CheckedBytesWrapperPoolSize().IsDynamic()).
		SetSize(int(s.opts.CheckedBytesWrapperPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("client-checked-bytes-wrapper-pool")))
	s.pools.checkedBytesWrapper = xpool.NewCheckedBytesWrapperPool(wrapperPoolOpts)
	s.pools.checkedBytesWrapper.Init()

	if opts, ok := opts.(AdminOptions); ok {
		s.state.bootstrapLevel = opts.BootstrapConsistencyLevel()
		s.origin = opts.Origin()
		s.streamBlocksMaxBlockRetries = opts.FetchSeriesBlocksMaxBlockRetries()
		s.streamBlocksWorkers = xsync.NewWorkerPool(opts.FetchSeriesBlocksBatchConcurrency())
		s.streamBlocksWorkers.Init()
		s.streamBlocksBatchSize = opts.FetchSeriesBlocksBatchSize()
		s.streamBlocksMetadataBatchTimeout = opts.FetchSeriesBlocksMetadataBatchTimeout()
		s.streamBlocksBatchTimeout = opts.FetchSeriesBlocksBatchTimeout()
		s.streamBlocksRetrier = opts.StreamBlocksRetrier()
	}

	if runtimeOptsMgr := opts.RuntimeOptionsManager(); runtimeOptsMgr != nil {
		runtimeOptsMgr.RegisterListener(s)
	}

	return s, nil
}
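
// SetRuntimeOptions below is the callback invoked by the runtime options
// manager that newSession registers the session with (via RegisterListener
// above); it lets the consistency levels be tuned at runtime without
// reopening the session.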

func (s *session) SetRuntimeOptions(value runtime.Options) {
	s.state.Lock()
	s.state.bootstrapLevel = value.ClientBootstrapConsistencyLevel()
	s.state.readLevel = value.ClientReadConsistencyLevel()
	s.state.writeLevel = value.ClientWriteConsistencyLevel()
	s.state.Unlock()
}

func (s *session) ShardID(id ident.ID) (uint32, error) {
	s.state.RLock()
	if s.state.status != statusOpen {
		s.state.RUnlock()
		return 0, ErrSessionStatusNotOpen
	}
	value := s.state.topoMap.ShardSet().Lookup(id)
	s.state.RUnlock()
	return value, nil
}
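
// The method below grows the per-shard metrics map lazily with
// double-checked locking: a read-locked fast path for the common hit, then
// a write lock that re-checks the key before inserting, so concurrent
// callers racing on the same shard create the sub-scoped metrics only once.
// A minimal sketch of the pattern (illustrative, not part of the original
// source):
//
//	mu.RLock()
//	v, ok := m[k]
//	mu.RUnlock()
//	if !ok {
//		mu.Lock()
//		if v, ok = m[k]; !ok {
//			v = newValue()
//			m[k] = v
//		}
//		mu.Unlock()
//	}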
scope.Counter("fetch-metadata-peers-peer-retry"), 449 fetchBlockSuccess: scope.Counter("fetch-block-success"), 450 fetchBlockError: scope.Counter("fetch-block-error"), 451 fetchBlockFinalError: scope.Counter("fetch-block-final-error"), 452 fetchBlockFullRetry: scope.Counter("fetch-block-full-retry"), 453 fetchBlockRetriesReqError: scope.Tagged(map[string]string{ 454 "reason": "request-error", 455 }).Counter("fetch-block-retries"), 456 fetchBlockRetriesRespError: scope.Tagged(map[string]string{ 457 "reason": "response-error", 458 }).Counter("fetch-block-retries"), 459 fetchBlockRetriesConsistencyLevelNotAchievedError: scope.Tagged(map[string]string{ 460 "reason": "consistency-level-not-achieved-error", 461 }).Counter("fetch-block-retries"), 462 blocksEnqueueChannel: scope.Gauge("fetch-blocks-enqueue-channel-length"), 463 } 464 s.metrics.streamFromPeersMetrics[mKey] = m 465 s.metrics.Unlock() 466 return &m 467 } 468 469 func (s *session) recordWriteMetrics(consistencyResultErr error, respErrs int32, start time.Time) { 470 if idx := s.nodesRespondingErrorsMetricIndex(respErrs); idx >= 0 { 471 if IsBadRequestError(consistencyResultErr) { 472 s.metrics.writeNodesRespondingBadRequestErrors[idx].Inc(1) 473 } else { 474 s.metrics.writeNodesRespondingErrors[idx].Inc(1) 475 } 476 } 477 if consistencyResultErr == nil { 478 s.metrics.writeSuccess.Inc(1) 479 } else if IsBadRequestError(consistencyResultErr) { 480 s.metrics.writeErrorsBadRequest.Inc(1) 481 } else { 482 s.metrics.writeErrorsInternalError.Inc(1) 483 } 484 s.metrics.writeLatencyHistogram.RecordDuration(s.nowFn().Sub(start)) 485 486 if consistencyResultErr != nil && s.logWriteErrorSampler.Sample() { 487 s.log.Error("m3db client write error occurred", 488 zap.Float64("sampleRateLog", s.logWriteErrorSampler.SampleRate().Value()), 489 zap.Error(consistencyResultErr)) 490 } 491 } 492 493 func (s *session) recordFetchMetrics(consistencyResultErr error, respErrs int32, start time.Time) { 494 if idx := s.nodesRespondingErrorsMetricIndex(respErrs); idx >= 0 { 495 if IsBadRequestError(consistencyResultErr) { 496 s.metrics.fetchNodesRespondingBadRequestErrors[idx].Inc(1) 497 } else { 498 s.metrics.fetchNodesRespondingErrors[idx].Inc(1) 499 } 500 } 501 if consistencyResultErr == nil { 502 s.metrics.fetchSuccess.Inc(1) 503 } else if IsBadRequestError(consistencyResultErr) { 504 s.metrics.fetchErrorsBadRequest.Inc(1) 505 } else { 506 s.metrics.fetchErrorsInternalError.Inc(1) 507 } 508 s.metrics.fetchLatencyHistogram.RecordDuration(s.nowFn().Sub(start)) 509 510 if consistencyResultErr != nil && s.logFetchErrorSampler.Sample() { 511 s.log.Error("m3db client fetch error occurred", 512 zap.Float64("sampleRateLog", s.logFetchErrorSampler.SampleRate().Value()), 513 zap.Error(consistencyResultErr)) 514 } 515 } 516 517 func (s *session) nodesRespondingErrorsMetricIndex(respErrs int32) int32 { 518 idx := respErrs - 1 519 replicas := int32(s.Replicas()) 520 if respErrs > replicas { 521 // Cap to the max replicas, we might get more errors 522 // when a node is initializing a shard causing replicas + 1 523 // nodes to respond to operations 524 idx = replicas - 1 525 } 526 return idx 527 } 528 529 func (s *session) Open() error { 530 s.state.Lock() 531 if s.state.status != statusNotOpen { 532 s.state.Unlock() 533 return errSessionStatusNotInitial 534 } 535 536 watch, err := s.state.topo.Watch() 537 if err != nil { 538 s.state.Unlock() 539 return err 540 } 541 542 // Wait for the topology to be available 543 <-watch.C() 544 545 topoMap := watch.Get() 546 547 queues, 

func (s *session) nodesRespondingErrorsMetricIndex(respErrs int32) int32 {
	idx := respErrs - 1
	replicas := int32(s.Replicas())
	if respErrs > replicas {
		// Cap to the max replicas; we might get more errors
		// when a node is initializing a shard, causing replicas + 1
		// nodes to respond to operations.
		idx = replicas - 1
	}
	return idx
}

func (s *session) Open() error {
	s.state.Lock()
	if s.state.status != statusNotOpen {
		s.state.Unlock()
		return errSessionStatusNotInitial
	}

	watch, err := s.state.topo.Watch()
	if err != nil {
		s.state.Unlock()
		return err
	}

	// Wait for the topology to be available
	<-watch.C()

	topoMap := watch.Get()

	queues, replicas, majority, err := s.hostQueues(topoMap, nil)
	if err != nil {
		s.state.Unlock()
		return err
	}
	s.setTopologyWithLock(topoMap, queues, replicas, majority)
	s.state.topoWatch = watch

	// NB(r): Allocate pools that can take some time here; the expectation
	// is already that Open will take some time.
	writeOperationPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.WriteOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-op-pool"),
		))
	s.pools.writeOperation = newWriteOperationPool(writeOperationPoolOpts)
	s.pools.writeOperation.Init()

	writeTaggedOperationPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteTaggedOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.WriteTaggedOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-op-tagged-pool"),
		))
	s.pools.writeTaggedOperation = newWriteTaggedOpPool(writeTaggedOperationPoolOpts)
	s.pools.writeTaggedOperation.Init()

	writeStatePoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteOpPoolSize().IsDynamic()).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-state-pool"),
		))

	if !s.opts.WriteOpPoolSize().IsDynamic() {
		writeStatePoolSize := s.opts.WriteOpPoolSize()
		if !s.opts.WriteTaggedOpPoolSize().IsDynamic() && s.opts.WriteTaggedOpPoolSize() > writeStatePoolSize {
			writeStatePoolSize = s.opts.WriteTaggedOpPoolSize()
		}
		writeStatePoolOpts = writeStatePoolOpts.SetSize(int(writeStatePoolSize))
	}
	s.pools.writeState = newWriteStatePool(s.pools.tagEncoder, writeStatePoolOpts)
	s.pools.writeState.Init()

	fetchBatchOpPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-batch-op-pool"),
		))
	s.pools.fetchBatchOp = newFetchBatchOpPool(fetchBatchOpPoolOpts, s.fetchBatchSize)
	s.pools.fetchBatchOp.Init()

	fetchTaggedOpPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-tagged-op-pool"),
		))
	s.pools.fetchTaggedOp = newFetchTaggedOpPool(fetchTaggedOpPoolOpts)
	s.pools.fetchTaggedOp.Init()

	aggregateOpPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("aggregate-op-pool"),
		))
	s.pools.aggregateOp = newAggregateOpPool(aggregateOpPoolOpts)
	s.pools.aggregateOp.Init()

	fetchStatePoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-tagged-state-pool"),
		))
	s.pools.fetchState = newFetchStatePool(fetchStatePoolOpts)
	s.pools.fetchState.Init()

	seriesIteratorPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.SeriesIteratorPoolSize().IsDynamic()).
		SetSize(int(s.opts.SeriesIteratorPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("series-iterator-pool"),
		))
	s.pools.seriesIterator = encoding.NewSeriesIteratorPool(seriesIteratorPoolOpts)
	s.pools.seriesIterator.Init()
	s.state.status = statusOpen
	s.state.Unlock()

	go func() {
		for range watch.C() {
			s.log.Info("received update for topology")
			topoMap := watch.Get()

			s.state.RLock()
			existingQueues := s.state.queues
			s.state.RUnlock()

			queues, replicas, majority, err := s.hostQueues(topoMap, existingQueues)
			if err != nil {
				s.log.Error("could not update topology map", zap.Error(err))
				s.metrics.topologyUpdatedError.Inc(1)
				continue
			}
			s.state.Lock()
			s.setTopologyWithLock(topoMap, queues, replicas, majority)
			s.state.Unlock()
			s.metrics.topologyUpdatedSuccess.Inc(1)
		}
	}()

	return nil
}
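
// An illustrative use of BorrowConnections (hypothetical call site, not
// part of the original source): visit every replica of a shard while
// tolerating hosts that are currently down:
//
//	result, err := s.BorrowConnections(shardID, func(
//		sh shard.Shard,
//		host topology.Host,
//		client rpc.TChanNode,
//		ch Channel,
//	) (WithBorrowConnectionResult, error) {
//		// Issue RPCs with client here; set Break in the result to stop early.
//		return WithBorrowConnectionResult{}, nil
//	}, BorrowConnectionOptions{ContinueOnBorrowError: true})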

func (s *session) BorrowConnections(
	shardID uint32,
	fn WithBorrowConnectionFn,
	opts BorrowConnectionOptions,
) (BorrowConnectionsResult, error) {
	var result BorrowConnectionsResult
	s.state.RLock()
	topoMap, err := s.topologyMapWithStateRLock()
	s.state.RUnlock()
	if err != nil {
		return result, err
	}

	var (
		multiErr  = xerrors.NewMultiError()
		breakLoop bool
	)
	err = topoMap.RouteShardForEach(shardID, func(
		_ int,
		shard shard.Shard,
		host topology.Host,
	) {
		if multiErr.NumErrors() > 0 || breakLoop {
			// An error occurred or the loop was broken out of.
			return
		}
		if opts.ExcludeOrigin && s.origin != nil && s.origin.ID() == host.ID() {
			// Skip origin host.
			return
		}

		var (
			userResult WithBorrowConnectionResult
			userErr    error
		)
		borrowErr := s.BorrowConnection(host.ID(), func(
			client rpc.TChanNode,
			channel Channel,
		) {
			userResult, userErr = fn(shard, host, client, channel)
		})
		if borrowErr != nil {
			// Couldn't borrow at all: record the borrow error unless the
			// caller opted to continue past down hosts.
			if !opts.ContinueOnBorrowError {
				multiErr = multiErr.Add(borrowErr)
			}
			return
		}

		// Track successful borrow.
		result.Borrowed++

		// Track whether the loop has been broken out of.
		breakLoop = userResult.Break

		// Record any user error so it is surfaced in the final result.
		if userErr != nil {
			multiErr = multiErr.Add(userErr)
		}
	})
	if err != nil {
		// Route error.
		return result, err
	}
	// Potentially a user error or borrow error; otherwise
	// FinalError() will return nil.
	return result, multiErr.FinalError()
}
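
// An illustrative single-host borrow (hypothetical call site), assuming
// hostID names a host present in the current topology:
//
//	err := s.BorrowConnection(hostID, func(client rpc.TChanNode, ch Channel) {
//		// Use client only while the connection remains borrowed.
//	})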

func (s *session) BorrowConnection(hostID string, fn WithConnectionFn) error {
	s.state.RLock()
	unlocked := false
	queue, ok := s.state.queuesByHostID[hostID]
	if !ok {
		s.state.RUnlock()
		return errSessionHasNoHostQueueForHost
	}
	err := queue.BorrowConnection(func(client rpc.TChanNode, ch Channel) {
		// Unlock early on success
		s.state.RUnlock()
		unlocked = true

		// Execute function with borrowed connection
		fn(client, ch)
	})
	if !unlocked {
		s.state.RUnlock()
	}
	return err
}

func (s *session) DedicatedConnection(
	shardID uint32,
	opts DedicatedConnectionOptions,
) (rpc.TChanNode, Channel, error) {
	s.state.RLock()
	topoMap, err := s.topologyMapWithStateRLock()
	s.state.RUnlock()
	if err != nil {
		return nil, nil, err
	}

	var (
		client    rpc.TChanNode
		channel   Channel
		succeeded bool
		multiErr  = xerrors.NewMultiError()
	)
	err = topoMap.RouteShardForEach(shardID, func(
		_ int,
		targetShard shard.Shard,
		host topology.Host,
	) {
		stateFilter := opts.ShardStateFilter
		if succeeded || !(stateFilter == shard.Unknown || targetShard.State() == stateFilter) {
			return
		}

		if s.origin != nil && s.origin.ID() == host.ID() {
			// Skip origin host.
			return
		}

		newConnFn := s.opts.NewConnectionFn()
		channel, client, err = newConnFn(channelName, host.Address(), s.opts)
		if err != nil {
			multiErr = multiErr.Add(err)
			return
		}

		if err := s.healthCheckNewConnFn(client, s.opts, opts.BootstrappedNodesOnly); err != nil {
			channel.Close()
			multiErr = multiErr.Add(err)
			return
		}

		succeeded = true
	})
	if err != nil {
		return nil, nil, err
	}

	if !succeeded {
		multiErr = multiErr.Add(
			fmt.Errorf("failed to create a dedicated connection for shard %d", shardID))
		return nil, nil, multiErr.FinalError()
	}

	return client, channel, nil
}
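
// The connect loop in hostQueues below enforces the configured cluster
// connect consistency level. When the level is ConnectConsistencyLevelAny
// it starts by demanding ConnectConsistencyLevelAll and steps the
// requirement down one level each time the cluster connect timeout elapses,
// returning success once it would drop below ConnectConsistencyLevelOne;
// any other level is enforced strictly and fails with
// ErrClusterConnectTimeout on timeout.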

func (s *session) hostQueues(
	topoMap topology.Map,
	existing []hostQueue,
) ([]hostQueue, int, int, error) {
	// NB(r): We leave existing writes in the host queues to finish
	// as they are already en route to their destination. This is an edge case
	// that might result in leaving nodes counting towards quorum, but fixing it
	// would result in additional chatter.

	start := s.nowFn()

	existingByHostID := make(map[string]hostQueue, len(existing))
	for _, queue := range existing {
		existingByHostID[queue.Host().ID()] = queue
	}

	hosts := topoMap.Hosts()
	queues := make([]hostQueue, 0, len(hosts))
	newQueues := make([]hostQueue, 0, len(hosts))
	for _, host := range hosts {
		if existingQueue, ok := existingByHostID[host.ID()]; ok {
			queues = append(queues, existingQueue)
			continue
		}
		newQueue, err := s.newHostQueue(host, topoMap)
		if err != nil {
			return nil, 0, 0, err
		}
		queues = append(queues, newQueue)
		newQueues = append(newQueues, newQueue)
	}

	replicas := topoMap.Replicas()
	majority := topoMap.MajorityReplicas()

	firstConnectConsistencyLevel := s.opts.ClusterConnectConsistencyLevel()
	if firstConnectConsistencyLevel == topology.ConnectConsistencyLevelNone {
		// Return immediately if no connect consistency is required
		return queues, replicas, majority, nil
	}

	connectConsistencyLevel := firstConnectConsistencyLevel
	if connectConsistencyLevel == topology.ConnectConsistencyLevelAny {
		// If level "any" is specified, first attempt all, then progressively
		// lower the requirement
		connectConsistencyLevel = topology.ConnectConsistencyLevelAll
	}

	// Abort if we do not connect
	connected := false
	defer func() {
		if !connected {
			for _, queue := range newQueues {
				queue.Close()
			}
		}
	}()

	for {
		if now := s.nowFn(); now.Sub(start) >= s.opts.ClusterConnectTimeout() {
			switch firstConnectConsistencyLevel {
			case topology.ConnectConsistencyLevelAny:
				// If connecting with the connect "any" strategy then keep
				// trying but lower the consistency requirement
				start = now
				connectConsistencyLevel--
				if connectConsistencyLevel == topology.ConnectConsistencyLevelNone {
					// Already tried to resolve all consistency requirements, just
					// return successfully at this point
					err := fmt.Errorf("timed out connecting, returning success")
					s.log.Warn("cluster connect with consistency any", zap.Error(err))
					connected = true
					return queues, replicas, majority, nil
				}
			default:
				// Timed out connecting to a specific consistency requirement
				return nil, 0, 0, ErrClusterConnectTimeout
			}
		}

		var level topology.ConsistencyLevel
		switch connectConsistencyLevel {
		case topology.ConnectConsistencyLevelAll:
			level = topology.ConsistencyLevelAll
		case topology.ConnectConsistencyLevelMajority:
			level = topology.ConsistencyLevelMajority
		case topology.ConnectConsistencyLevelOne:
			level = topology.ConsistencyLevelOne
		default:
			return nil, 0, 0, errSessionInvalidConnectClusterConnectConsistencyLevel
		}
		clusterAvailable, err := s.clusterAvailabilityWithQueuesAndMap(level,
			queues, topoMap)
		if err != nil {
			return nil, 0, 0, err
		}
		if clusterAvailable {
			// All done
			break
		}
		time.Sleep(clusterConnectWaitInterval)
	}

	connected = true
	return queues, replicas, majority, nil
}

func (s *session) WriteClusterAvailability() (bool, error) {
	level := s.opts.WriteConsistencyLevel()
	return s.clusterAvailability(level)
}

func (s *session) ReadClusterAvailability() (bool, error) {
	var convertedConsistencyLevel topology.ConsistencyLevel
	level := s.opts.ReadConsistencyLevel()
	switch level {
	case topology.ReadConsistencyLevelNone:
		// Already ready.
		return true, nil
	case topology.ReadConsistencyLevelOne:
		convertedConsistencyLevel = topology.ConsistencyLevelOne
	case topology.ReadConsistencyLevelUnstrictMajority:
		convertedConsistencyLevel = topology.ConsistencyLevelOne
	case topology.ReadConsistencyLevelMajority:
		convertedConsistencyLevel = topology.ConsistencyLevelMajority
	case topology.ReadConsistencyLevelUnstrictAll:
		convertedConsistencyLevel = topology.ConsistencyLevelOne
	case topology.ReadConsistencyLevelAll:
		convertedConsistencyLevel = topology.ConsistencyLevelAll
	default:
		return false, fmt.Errorf("unknown consistency level: %d", level)
	}
	return s.clusterAvailability(convertedConsistencyLevel)
}

func (s *session) clusterAvailability(
	level topology.ConsistencyLevel,
) (bool, error) {
	s.state.RLock()
	queues := s.state.queues
	topoMap, err := s.topologyMapWithStateRLock()
	s.state.RUnlock()
	if err != nil {
		return false, err
	}
	return s.clusterAvailabilityWithQueuesAndMap(level, queues, topoMap)
}

func (s *session) clusterAvailabilityWithQueuesAndMap(
	level topology.ConsistencyLevel,
	queues []hostQueue,
	topoMap topology.Map,
) (bool, error) {
	shards := topoMap.ShardSet().AllIDs()
	minConnectionCount := s.opts.MinConnectionCount()
	replicas := topoMap.Replicas()
	majority := topoMap.MajorityReplicas()

	for _, shardID := range shards {
		shardReplicasAvailable := 0
		routeErr := topoMap.RouteShardForEach(shardID, func(idx int, _ shard.Shard, _ topology.Host) {
			if queues[idx].ConnectionCount() >= minConnectionCount {
				shardReplicasAvailable++
			}
		})
		if routeErr != nil {
			return false, routeErr
		}
		var clusterAvailableForShard bool
		switch level {
		case topology.ConsistencyLevelAll:
			clusterAvailableForShard = shardReplicasAvailable == replicas
		case topology.ConsistencyLevelMajority:
			clusterAvailableForShard = shardReplicasAvailable >= majority
		case topology.ConsistencyLevelOne:
			clusterAvailableForShard = shardReplicasAvailable > 0
		default:
			return false, fmt.Errorf("unknown consistency level: %d", level)
		}
		if !clusterAvailableForShard {
			return false, nil
		}
	}

	return true, nil
}
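
// setTopologyWithLock installs the new topology map and host queues,
// (re)creates the pools whose sizing depends on the queue or replica
// counts, grows the per-node error counters, and asynchronously closes any
// previous queues that are no longer in use.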

func (s *session) setTopologyWithLock(topoMap topology.Map, queues []hostQueue, replicas, majority int) {
	prevQueues := s.state.queues

	newQueuesByHostID := make(map[string]hostQueue, len(queues))
	for _, queue := range queues {
		newQueuesByHostID[queue.Host().ID()] = queue
	}

	s.state.queues = queues
	s.state.queuesByHostID = newQueuesByHostID

	s.state.topoMap = topoMap

	s.state.replicas = replicas
	s.state.majority = majority

	// If the number of hostQueues has changed then we need to recreate the fetch
	// batch op array pool, as it must be the exact length of the queues since we
	// index directly into the return array in fetch calls.
	if len(queues) != len(prevQueues) {
		poolOpts := pool.NewObjectPoolOptions().
			SetSize(int(s.opts.FetchBatchOpPoolSize())).
			SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
			SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
				s.scope.SubScope("fetch-batch-op-array-array-pool"),
			))
		s.pools.fetchBatchOpArrayArray = newFetchBatchOpArrayArrayPool(
			poolOpts,
			len(queues),
			int(s.opts.FetchBatchOpPoolSize())/len(queues))
		s.pools.fetchBatchOpArrayArray.Init()
	}

	if s.pools.multiReaderIteratorArray == nil {
		s.pools.multiReaderIteratorArray = encoding.NewMultiReaderIteratorArrayPool([]pool.Bucket{
			{
				Capacity: replicas,
				Count:    s.opts.SeriesIteratorPoolSize(),
			},
		})
		s.pools.multiReaderIteratorArray.Init()
	}
	if s.pools.readerSliceOfSlicesIterator == nil {
		size := int(s.opts.SeriesIteratorPoolSize())
		if !s.opts.SeriesIteratorPoolSize().IsDynamic() {
			size = replicas * int(s.opts.SeriesIteratorPoolSize())
		}
		poolOpts := pool.NewObjectPoolOptions().
			SetSize(size).
			SetDynamic(s.opts.SeriesIteratorPoolSize().IsDynamic()).
			SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
				s.scope.SubScope("reader-slice-of-slices-iterator-pool"),
			))
		s.pools.readerSliceOfSlicesIterator = newReaderSliceOfSlicesIteratorPool(poolOpts)
		s.pools.readerSliceOfSlicesIterator.Init()
	}
	if s.pools.multiReaderIterator == nil {
		size := int(s.opts.SeriesIteratorPoolSize())
		if !s.opts.SeriesIteratorPoolSize().IsDynamic() {
			size = replicas * int(s.opts.SeriesIteratorPoolSize())
		}
		poolOpts := pool.NewObjectPoolOptions().
			SetSize(size).
			SetDynamic(s.opts.SeriesIteratorPoolSize().IsDynamic()).
			SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
				s.scope.SubScope("multi-reader-iterator-pool"),
			))
		s.pools.multiReaderIterator = encoding.NewMultiReaderIteratorPool(poolOpts)
		s.pools.multiReaderIterator.Init(s.opts.ReaderIteratorAllocate())
	}
	if replicas > len(s.metrics.writeNodesRespondingErrors) {
		curr := len(s.metrics.writeNodesRespondingErrors)
		for i := curr; i < replicas; i++ {
			tags := map[string]string{"nodes": fmt.Sprintf("%d", i+1)}
			name := "write.nodes-responding-error"
			serverErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
				"error_type": "server_error",
			})
			badRequestErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
				"error_type": "bad_request_error",
			})
			s.metrics.writeNodesRespondingErrors =
				append(s.metrics.writeNodesRespondingErrors, serverErrsSubScope.Counter(name))
			s.metrics.writeNodesRespondingBadRequestErrors =
				append(s.metrics.writeNodesRespondingBadRequestErrors, badRequestErrsSubScope.Counter(name))
		}
	}
	if replicas > len(s.metrics.fetchNodesRespondingErrors) {
		curr := len(s.metrics.fetchNodesRespondingErrors)
		for i := curr; i < replicas; i++ {
			tags := map[string]string{"nodes": fmt.Sprintf("%d", i+1)}
			name := "fetch.nodes-responding-error"
			serverErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
				"error_type": "server_error",
			})
			badRequestErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
				"error_type": "bad_request_error",
			})
			s.metrics.fetchNodesRespondingErrors =
				append(s.metrics.fetchNodesRespondingErrors, serverErrsSubScope.Counter(name))
			s.metrics.fetchNodesRespondingBadRequestErrors =
				append(s.metrics.fetchNodesRespondingBadRequestErrors, badRequestErrsSubScope.Counter(name))
		}
	}

	// Asynchronously close the set of host queues no longer in use
	go func() {
		for _, queue := range prevQueues {
			newQueue, ok := newQueuesByHostID[queue.Host().ID()]
			if !ok || newQueue != queue {
				queue.Close()
			}
		}
	}()

	s.log.Info("successfully updated topology",
		zap.Int("numHosts", topoMap.HostsLen()),
		zap.Int("numShards", len(topoMap.ShardSet().AllIDs())))
}

func (s *session) newHostQueue(host topology.Host, topoMap topology.Map) (hostQueue, error) {
	// NB(r): Since hosts are replicas:
	//   replicas * numWrites = total writes to all hosts,
	// so we need to pool:
	//   replicas * (numWrites / writeBatchSize) = number of batch request structs.
	// To keep the pooling options simple, the write op pool size represents the
	// number of ops to pool not including replication, since the ops are shared
	// between the different host queue replicas.
	writeOpPoolSize := s.opts.WriteOpPoolSize()
	if s.opts.WriteTaggedOpPoolSize() > writeOpPoolSize {
		writeOpPoolSize = s.opts.WriteTaggedOpPoolSize()
	}
	totalBatches := topoMap.Replicas() *
		int(math.Ceil(float64(writeOpPoolSize)/float64(s.opts.WriteBatchSize())))
	hostBatches := int(math.Ceil(float64(totalBatches) / float64(topoMap.HostsLen())))
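
	// For example (illustrative numbers only): with 3 replicas, a write op
	// pool size of 1000, a write batch size of 128 and 6 hosts,
	// totalBatches = 3 * ceil(1000/128) = 24 and hostBatches =
	// ceil(24/6) = 4 pooled batch requests per host queue.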

	writeBatchRequestPoolOpts := pool.NewObjectPoolOptions().
		SetSize(hostBatches).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-batch-request-pool"),
		))
	writeBatchRequestPool := newWriteBatchRawRequestPool(writeBatchRequestPoolOpts)
	writeBatchRequestPool.Init()
	writeBatchV2RequestPool := newWriteBatchRawV2RequestPool(writeBatchRequestPoolOpts)
	writeBatchV2RequestPool.Init()

	writeTaggedBatchRequestPoolOpts := pool.NewObjectPoolOptions().
		SetSize(hostBatches).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-tagged-batch-request-pool"),
		))
	writeTaggedBatchRequestPool := newWriteTaggedBatchRawRequestPool(writeTaggedBatchRequestPoolOpts)
	writeTaggedBatchRequestPool.Init()
	writeTaggedBatchV2RequestPool := newWriteTaggedBatchRawV2RequestPool(writeBatchRequestPoolOpts)
	writeTaggedBatchV2RequestPool.Init()

	writeBatchRawRequestElementArrayPoolOpts := pool.NewObjectPoolOptions().
		SetSize(hostBatches).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("id-datapoint-array-pool"),
		))
	writeBatchRawRequestElementArrayPool := newWriteBatchRawRequestElementArrayPool(
		writeBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
	writeBatchRawRequestElementArrayPool.Init()
	writeBatchRawV2RequestElementArrayPool := newWriteBatchRawV2RequestElementArrayPool(
		writeBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
	writeBatchRawV2RequestElementArrayPool.Init()

	writeTaggedBatchRawRequestElementArrayPoolOpts := pool.NewObjectPoolOptions().
		SetSize(hostBatches).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("id-tagged-datapoint-array-pool"),
		))
	writeTaggedBatchRawRequestElementArrayPool := newWriteTaggedBatchRawRequestElementArrayPool(
		writeTaggedBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
	writeTaggedBatchRawRequestElementArrayPool.Init()
	writeTaggedBatchRawV2RequestElementArrayPool := newWriteTaggedBatchRawV2RequestElementArrayPool(
		writeTaggedBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
	writeTaggedBatchRawV2RequestElementArrayPool.Init()

	fetchBatchRawV2RequestPoolOpts := pool.NewObjectPoolOptions().
		SetSize(hostBatches).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-batch-request-pool"),
		))
	fetchBatchRawV2RequestPool := newFetchBatchRawV2RequestPool(fetchBatchRawV2RequestPoolOpts)
	fetchBatchRawV2RequestPool.Init()

	fetchBatchRawV2RequestElementArrayPoolOpts := pool.NewObjectPoolOptions().
		SetSize(hostBatches).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-batch-request-array-pool"),
		))
	fetchBatchRawV2RequestElementArrPool := newFetchBatchRawV2RequestElementArrayPool(
		fetchBatchRawV2RequestElementArrayPoolOpts, s.opts.FetchBatchSize(),
	)
	fetchBatchRawV2RequestElementArrPool.Init()

	hostQueue, err := s.newHostQueueFn(host, hostQueueOpts{
		writeBatchRawRequestPool:                     writeBatchRequestPool,
		writeBatchRawV2RequestPool:                   writeBatchV2RequestPool,
		writeBatchRawRequestElementArrayPool:         writeBatchRawRequestElementArrayPool,
		writeBatchRawV2RequestElementArrayPool:       writeBatchRawV2RequestElementArrayPool,
		writeTaggedBatchRawRequestPool:               writeTaggedBatchRequestPool,
		writeTaggedBatchRawV2RequestPool:             writeTaggedBatchV2RequestPool,
		writeTaggedBatchRawRequestElementArrayPool:   writeTaggedBatchRawRequestElementArrayPool,
		writeTaggedBatchRawV2RequestElementArrayPool: writeTaggedBatchRawV2RequestElementArrayPool,
		fetchBatchRawV2RequestPool:                   fetchBatchRawV2RequestPool,
		fetchBatchRawV2RequestElementArrayPool:       fetchBatchRawV2RequestElementArrPool,
		opts:                                         s.opts,
	})
	if err != nil {
		return nil, err
	}
	hostQueue.Open()
	return hostQueue, nil
}
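
// An illustrative untagged write (hypothetical identifiers, not part of
// the original source), assuming an open session:
//
//	err := session.Write(ident.StringID("metrics"), ident.StringID("cpu.user"),
//		xtime.ToUnixNano(time.Now()), 0.42, xtime.Second, nil)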

func (s *session) Write(
	nsID, id ident.ID,
	t xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
) error {
	w := s.pools.writeAttempt.Get()
	w.args.attemptType = untaggedWriteAttemptType
	w.args.namespace, w.args.id = nsID, id
	w.args.tags = ident.EmptyTagIterator
	w.args.t = t
	w.args.value, w.args.unit, w.args.annotation = value, unit, annotation
	err := s.writeRetrier.Attempt(w.attemptFn)
	s.pools.writeAttempt.Put(w)
	return err
}

func (s *session) WriteTagged(
	nsID, id ident.ID,
	tags ident.TagIterator,
	t xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
) error {
	w := s.pools.writeAttempt.Get()
	w.args.attemptType = taggedWriteAttemptType
	w.args.namespace, w.args.id, w.args.tags = nsID, id, tags
	w.args.t = t
	w.args.value, w.args.unit, w.args.annotation = value, unit, annotation
	err := s.writeRetrier.Attempt(w.attemptFn)
	s.pools.writeAttempt.Put(w)
	return err
}
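
// Both Write and WriteTagged above funnel into writeAttempt through a
// pooled attempt struct whose attemptFn is driven by s.writeRetrier, so
// retries re-execute the same attempt without reallocating its arguments.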

func (s *session) writeAttempt(
	wType writeAttemptType,
	nsID, id ident.ID,
	inputTags ident.TagIterator,
	t xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
) error {
	startWriteAttempt := s.nowFn()

	timeType, timeTypeErr := convert.ToTimeType(unit)
	if timeTypeErr != nil {
		return timeTypeErr
	}

	timestamp, timestampErr := convert.ToValue(t, timeType)
	if timestampErr != nil {
		return timestampErr
	}

	s.state.RLock()
	if s.state.status != statusOpen {
		s.state.RUnlock()
		return ErrSessionStatusNotOpen
	}

	state, majority, enqueued, err := s.writeAttemptWithRLock(
		wType, nsID, id, inputTags, timestamp, value, timeType, annotation)
	s.state.RUnlock()

	if err != nil {
		return err
	}

	// It's safe to Wait() here, as we still hold the lock on state after it's
	// returned from writeAttemptWithRLock.
	state.Wait()

	err = s.writeConsistencyResult(state.consistencyLevel, majority, enqueued,
		enqueued-state.pending, int32(len(state.errors)), state.errors)

	s.recordWriteMetrics(err, int32(len(state.errors)), startWriteAttempt)

	// Must Unlock before decRef'ing, as the latter releases the writeState back
	// into a pool if the ref count == 0.
	state.Unlock()
	state.decRef()

	return err
}

// NB(prateek): the returned writeState, if valid, still holds the lock. Its ownership
// is transferred to the calling function, which is expected to manage the lifecycle
// of the object (including releasing the lock/decRef'ing it).
func (s *session) writeAttemptWithRLock(
	wType writeAttemptType,
	namespace, id ident.ID,
	inputTags ident.TagIterator,
	timestamp int64,
	value float64,
	timeType rpc.TimeType,
	annotation []byte,
) (*writeState, int32, int32, error) {
	var (
		majority = int32(s.state.majority)
		enqueued int32
	)

	// NB(prateek): We retain an individual copy of the namespace and ID per
	// writeState, as each writeState tracks the lifecycle of its resources in
	// use in the various queues. Tracking per writeAttempt isn't sufficient as
	// we may enqueue multiple writeStates concurrently depending on retries
	// and consistency level checks.
	var tagEncoder serialize.TagEncoder
	if wType == taggedWriteAttemptType {
		tagEncoder = s.pools.tagEncoder.Get()
		if err := tagEncoder.Encode(inputTags); err != nil {
			tagEncoder.Finalize()
			return nil, 0, 0, err
		}
	}
	nsID := s.cloneFinalizable(namespace)
	tsID := s.cloneFinalizable(id)

	var (
		clonedAnnotation      checked.Bytes
		clonedAnnotationBytes []byte
	)
	if len(annotation) > 0 {
		clonedAnnotation = s.pools.checkedBytes.Get(len(annotation))
		clonedAnnotation.IncRef()
		clonedAnnotation.AppendAll(annotation)
		clonedAnnotationBytes = clonedAnnotation.Bytes()
	}

	var op writeOp
	switch wType {
	case untaggedWriteAttemptType:
		wop := s.pools.writeOperation.Get()
		wop.namespace = nsID
		wop.shardID = s.state.topoMap.ShardSet().Lookup(tsID)
		wop.request.ID = tsID.Bytes()
		wop.request.Datapoint.Value = value
		wop.request.Datapoint.Timestamp = timestamp
		wop.request.Datapoint.TimestampTimeType = timeType
		wop.request.Datapoint.Annotation = clonedAnnotationBytes
		wop.requestV2.ID = wop.request.ID
		wop.requestV2.Datapoint = wop.request.Datapoint
		op = wop
	case taggedWriteAttemptType:
		wop := s.pools.writeTaggedOperation.Get()
		wop.namespace = nsID
		wop.shardID = s.state.topoMap.ShardSet().Lookup(tsID)
		wop.request.ID = tsID.Bytes()
		encodedTagBytes, ok := tagEncoder.Data()
		if !ok {
			return nil, 0, 0, errUnableToEncodeTags
		}
		wop.request.EncodedTags = encodedTagBytes.Bytes()
		wop.request.Datapoint.Value = value
		wop.request.Datapoint.Timestamp = timestamp
		wop.request.Datapoint.TimestampTimeType = timeType
		wop.request.Datapoint.Annotation = clonedAnnotationBytes
		wop.requestV2.ID = wop.request.ID
		wop.requestV2.EncodedTags = wop.request.EncodedTags
		wop.requestV2.Datapoint = wop.request.Datapoint
		op = wop
	default:
		// Should never happen.
		return nil, 0, 0, errUnknownWriteAttemptType
	}

	state := s.pools.writeState.Get()
	state.consistencyLevel = s.state.writeLevel
	state.shardsLeavingCountTowardsConsistency = s.shardsLeavingCountTowardsConsistency
	state.topoMap = s.state.topoMap
	state.incRef()

	// todo@bl: Can we combine the writeOpPool and the writeStatePool?
	state.op, state.majority = op, majority
	state.nsID, state.tsID, state.tagEncoder, state.annotation = nsID, tsID, tagEncoder, clonedAnnotation
	op.SetCompletionFn(state.completionFn)

	if err := s.state.topoMap.RouteForEach(tsID, func(
		idx int,
		hostShard shard.Shard,
		host topology.Host,
	) {
		if !s.writeShardsInitializing && hostShard.State() == shard.Initializing {
			// NB(r): Do not write to this node as the shard is initializing
			// and writing to initializing shards is not enabled (also,
			// depending on your config, initializing shards won't count
			// towards quorum under the current defaults, so this is OK
			// consistency wise).
			return
		}

		// Count pending write requests before we enqueue the completion fns,
		// which rely on the count when executing
		state.pending++
		state.queues = append(state.queues, s.state.queues[idx])
	}); err != nil {
		state.decRef()
		return nil, 0, 0, err
	}

	state.Lock()
	for i := range state.queues {
		state.incRef()
		if err := state.queues[i].Enqueue(state.op); err != nil {
			state.Unlock()
			state.decRef()

			// NB(r): If this happens we have a bug: once we are in the read
			// lock the current queues should never be closed.
			s.log.Error("[invariant violated] failed to enqueue write", zap.Error(err))
			return nil, 0, 0, err
		}
		enqueued++
	}

	// NB(prateek): the current goroutine still holds a lock on the
	// returned writeState object.
	return state, majority, enqueued, nil
}

func (s *session) Fetch(
	nsID ident.ID,
	id ident.ID,
	startInclusive, endExclusive xtime.UnixNano,
) (encoding.SeriesIterator, error) {
	tsIDs := ident.NewIDsIterator(id)
	results, err := s.FetchIDs(nsID, tsIDs, startInclusive, endExclusive)
	if err != nil {
		return nil, err
	}
	mutableResults := results.(encoding.MutableSeriesIterators)
	iters := mutableResults.Iters()
	iter := iters[0]
	// Reset to zero so that when we close this results set the iter doesn't get closed
	mutableResults.Reset(0)
	mutableResults.Close()
	return iter, nil
}

func (s *session) FetchIDs(
	nsID ident.ID,
	ids ident.Iterator,
	startInclusive, endExclusive xtime.UnixNano,
) (encoding.SeriesIterators, error) {
	f := s.pools.fetchAttempt.Get()
	f.args.namespace, f.args.ids = nsID, ids
	f.args.start = startInclusive
	f.args.end = endExclusive
	err := s.fetchRetrier.Attempt(f.attemptFn)
	result := f.result
	s.pools.fetchAttempt.Put(f)
	return result, err
}

func (s *session) Aggregate(
	ctx gocontext.Context,
	ns ident.ID,
	q index.Query,
	opts index.AggregationOptions,
) (AggregatedTagsIterator, FetchResponseMetadata, error) {
	f := s.pools.aggregateAttempt.Get()
	f.args.ctx = ctx
	f.args.ns = ns
	f.args.query = q
	f.args.opts = opts
	err := s.fetchRetrier.Attempt(f.attemptFn)
	iter, metadata := f.resultIter, f.resultMetadata
	s.pools.aggregateAttempt.Put(f)
	return iter, metadata, err
}
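
// aggregateAttempt (and the fetchTagged attempts further below) scale a
// requested series limit down to a per-instance limit when InstanceMultiple
// is set. For example (illustrative): with 6 hosts and 3 replicas there are
// 2 instances per replica, so a series limit of 1000 with an
// InstanceMultiple of 1.5 becomes int64(1000*1.5)/2 = 750 per instance; the
// scaled limit is only used if it is lower than the original.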

func (s *session) aggregateAttempt(
	ctx gocontext.Context,
	ns ident.ID,
	q index.Query,
	opts index.AggregationOptions,
) (AggregatedTagsIterator, FetchResponseMetadata, error) {
	s.state.RLock()
	if s.state.status != statusOpen {
		s.state.RUnlock()
		return nil, FetchResponseMetadata{}, ErrSessionStatusNotOpen
	}

	// NB(prateek): we have to clone the namespace, as we cannot guarantee the lifecycle
	// of the hostQueues responding is less than the lifecycle of the current method.
	nsClone := s.pools.id.Clone(ns)

	req, err := convert.ToRPCAggregateQueryRawRequest(nsClone, q, opts)
	if err != nil {
		s.state.RUnlock()
		nsClone.Finalize()
		return nil, FetchResponseMetadata{}, xerrors.NewNonRetryableError(err)
	}
	if req.SeriesLimit != nil && opts.InstanceMultiple > 0 {
		topo := s.state.topoMap
		iPerReplica := int64(len(topo.Hosts()) / topo.Replicas())
		iSeriesLimit := int64(float32(opts.SeriesLimit)*opts.InstanceMultiple) / iPerReplica
		if iSeriesLimit < *req.SeriesLimit {
			req.SeriesLimit = &iSeriesLimit
		}
	}

	fetchState, err := s.newFetchStateWithRLock(ctx, nsClone, newFetchStateOpts{
		stateType:            aggregateFetchState,
		aggregateRequest:     req,
		startInclusive:       opts.StartInclusive,
		endExclusive:         opts.EndExclusive,
		readConsistencyLevel: opts.ReadConsistencyLevel,
	})
	s.state.RUnlock()

	if err != nil {
		return nil, FetchResponseMetadata{}, err
	}

	// It's safe to Wait() here, as we still hold the lock on fetchState after it's
	// returned from newFetchStateWithRLock.
	fetchState.Wait()

	// Must Unlock before calling `asAggregatedTagsIterator` as the latter needs
	// to acquire the fetchState Lock.
	fetchState.Unlock()
	iters, meta, err := fetchState.asAggregatedTagsIterator(s.pools, opts.SeriesLimit)

	// Must Unlock() before decRef'ing, as the latter releases the fetchState back
	// into a pool if the ref count == 0.
	fetchState.decRef()

	return iters, meta, err
}
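
// An illustrative tagged fetch (hypothetical query, not part of the
// original source), assuming an open session and an index query built
// elsewhere:
//
//	iters, meta, err := session.FetchTagged(ctx, nsID, query, index.QueryOptions{
//		StartInclusive: start,
//		EndExclusive:   end,
//	})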
We will start doing so 1610 // once https://github.com/m3db/m3ninx/issues/42 lands. Including transferring ownership 1611 // of the Clone()'d value to the `fetchState`. 1612 const fetchData = true 1613 req, err := convert.ToRPCFetchTaggedRequest(nsClone, q, opts, fetchData) 1614 if err != nil { 1615 s.state.RUnlock() 1616 nsClone.Finalize() 1617 return nil, FetchResponseMetadata{}, xerrors.NewNonRetryableError(err) 1618 } 1619 if req.SeriesLimit != nil && opts.InstanceMultiple > 0 { 1620 topo := s.state.topoMap 1621 iPerReplica := int64(len(topo.Hosts()) / topo.Replicas()) 1622 iSeriesLimit := int64(float32(opts.SeriesLimit)*opts.InstanceMultiple) / iPerReplica 1623 if iSeriesLimit < *req.SeriesLimit { 1624 req.SeriesLimit = &iSeriesLimit 1625 } 1626 } 1627 1628 fetchState, err := s.newFetchStateWithRLock(ctx, nsClone, newFetchStateOpts{ 1629 stateType: fetchTaggedFetchState, 1630 fetchTaggedRequest: req, 1631 startInclusive: opts.StartInclusive, 1632 endExclusive: opts.EndExclusive, 1633 readConsistencyLevel: opts.ReadConsistencyLevel, 1634 }) 1635 s.state.RUnlock() 1636 1637 if err != nil { 1638 return nil, FetchResponseMetadata{}, err 1639 } 1640 1641 // it's safe to Wait() here, as we still hold the lock on fetchState, after it's 1642 // returned from newFetchStateWithRLock. 1643 fetchState.Wait() 1644 1645 // must Unlock before calling `asEncodingSeriesIterators` as the latter needs to acquire 1646 // the fetchState Lock 1647 fetchState.Unlock() 1648 1649 iterOpts := s.opts.IterationOptions() 1650 if opts.IterateEqualTimestampStrategy != nil { 1651 iterOpts.IterateEqualTimestampStrategy = *opts.IterateEqualTimestampStrategy 1652 } 1653 1654 iters, metadata, err := fetchState.asEncodingSeriesIterators( 1655 s.pools, nsCtx.Schema, iterOpts, opts.SeriesLimit) 1656 1657 // must Unlock() before decRef'ing, as the latter releases the fetchState back into a 1658 // pool if ref count == 0. 1659 fetchState.decRef() 1660 1661 return iters, metadata, err 1662 } 1663 1664 func (s *session) fetchTaggedIDsAttempt( 1665 ctx gocontext.Context, 1666 ns ident.ID, 1667 q index.Query, 1668 opts index.QueryOptions, 1669 ) (TaggedIDsIterator, FetchResponseMetadata, error) { 1670 s.state.RLock() 1671 if s.state.status != statusOpen { 1672 s.state.RUnlock() 1673 return nil, FetchResponseMetadata{}, ErrSessionStatusNotOpen 1674 } 1675 1676 // NB(prateek): we have to clone the namespace, as we cannot guarantee the lifecycle 1677 // of the hostQueues responding is less than the lifecycle of the current method. 1678 nsClone := s.pools.id.Clone(ns) 1679 1680 // FOLLOWUP(prateek): currently both `index.Query` and the returned request depend on 1681 // native, un-pooled types; so we do not Clone() either. We will start doing so 1682 // once https://github.com/m3db/m3ninx/issues/42 lands. Including transferring ownership 1683 // of the Clone()'d value to the `fetchState`. 
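// For intuition on the SeriesLimit scaling above, a small worked example with
// illustrative values: with 6 hosts at replication factor 3 there are 2 instances
// per replica set, so a caller limit of 1000 with InstanceMultiple 1.5 becomes a
// per-instance limit of 750, and the request limit is only ever lowered, never raised:
//
//	hosts, replicas := 6, 3
//	seriesLimit, instanceMultiple := 1000, float32(1.5)
//	iPerReplica := int64(hosts / replicas)                                     // 2
//	iSeriesLimit := int64(float32(seriesLimit)*instanceMultiple) / iPerReplica // 750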
1684 const fetchData = false 1685 req, err := convert.ToRPCFetchTaggedRequest(nsClone, q, opts, fetchData) 1686 if err != nil { 1687 s.state.RUnlock() 1688 nsClone.Finalize() 1689 return nil, FetchResponseMetadata{}, xerrors.NewNonRetryableError(err) 1690 } 1691 if req.SeriesLimit != nil && opts.InstanceMultiple > 0 { 1692 topo := s.state.topoMap 1693 iPerReplica := int64(len(topo.Hosts()) / topo.Replicas()) 1694 iSeriesLimit := int64(float32(opts.SeriesLimit)*opts.InstanceMultiple) / iPerReplica 1695 if iSeriesLimit < *req.SeriesLimit { 1696 req.SeriesLimit = &iSeriesLimit 1697 } 1698 } 1699 1700 fetchState, err := s.newFetchStateWithRLock(ctx, nsClone, newFetchStateOpts{ 1701 stateType: fetchTaggedFetchState, 1702 fetchTaggedRequest: req, 1703 startInclusive: opts.StartInclusive, 1704 endExclusive: opts.EndExclusive, 1705 readConsistencyLevel: opts.ReadConsistencyLevel, 1706 }) 1707 s.state.RUnlock() 1708 1709 if err != nil { 1710 return nil, FetchResponseMetadata{}, err 1711 } 1712 1713 // it's safe to Wait() here, as we still hold the lock on fetchState, after it's 1714 // returned from newFetchStateWithRLock. 1715 fetchState.Wait() 1716 1717 // must Unlock before calling `asTaggedIDsIterator` as the latter needs to acquire 1718 // the fetchState Lock 1719 fetchState.Unlock() 1720 iter, metadata, err := fetchState.asTaggedIDsIterator(s.pools, opts.SeriesLimit) 1721 1722 // must Unlock() before decRef'ing, as the latter releases the fetchState back into a 1723 // pool if ref count == 0. 1724 fetchState.decRef() 1725 1726 return iter, metadata, err 1727 } 1728 1729 type newFetchStateOpts struct { 1730 stateType fetchStateType 1731 startInclusive xtime.UnixNano 1732 endExclusive xtime.UnixNano 1733 readConsistencyLevel *topology.ReadConsistencyLevel 1734 1735 // only valid if stateType == fetchTaggedFetchState 1736 fetchTaggedRequest rpc.FetchTaggedRequest 1737 1738 // only valid if stateType == aggregateFetchState 1739 aggregateRequest rpc.AggregateQueryRawRequest 1740 } 1741 1742 // NB(prateek): the returned fetchState, if valid, still holds the lock. Its ownership 1743 // is transferred to the calling function, which is expected to manage the lifecycle 1744 // of the object (including releasing the lock/decRef'ing it). 1745 // NB: ownership of ns is transferred to the returned fetchState object.
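// To make the ref-counting contract concrete, a minimal sketch of the discipline
// the pooled fetchState and ops follow (illustrative only, not the actual pooled
// implementation):
//
//	type refCounted struct {
//		refs   int32
//		onZero func() // e.g. return the object to its pool
//	}
//
//	func (r *refCounted) incRef() { atomic.AddInt32(&r.refs, 1) }
//	func (r *refCounted) decRef() {
//		if atomic.AddInt32(&r.refs, -1) == 0 {
//			r.onZero()
//		}
//	}
//
// Each host queue holds a ref while an op is in flight and the calling go-routine
// holds its own ref, so the object is returned to the pool only after the caller
// and every queue have released it.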
1746 func (s *session) newFetchStateWithRLock( 1747 ctx gocontext.Context, 1748 ns ident.ID, 1749 opts newFetchStateOpts, 1750 ) (*fetchState, error) { 1751 var ( 1752 topoMap = s.state.topoMap 1753 fetchState = s.pools.fetchState.Get() 1754 ) 1755 fetchState.nsID = ns // transfer ownership to `fetchState` 1756 fetchState.incRef() // indicate current go-routine has a reference to the fetchState 1757 1758 readLevel := s.state.readConsistencyLevelWithRLock(opts.readConsistencyLevel) 1759 1760 // wire up the operation based on the opts specified 1761 var ( 1762 op op 1763 closer func() 1764 ) 1765 switch opts.stateType { 1766 case fetchTaggedFetchState: 1767 fetchOp := s.pools.fetchTaggedOp.Get() 1768 fetchOp.incRef() // indicate current go-routine has a reference to the op 1769 closer = fetchOp.decRef // release the ref for the current go-routine 1770 fetchOp.update(ctx, opts.fetchTaggedRequest, fetchState.completionFn) 1771 fetchState.ResetFetchTagged(opts.startInclusive, opts.endExclusive, 1772 fetchOp, topoMap, s.state.majority, readLevel) 1773 op = fetchOp 1774 1775 case aggregateFetchState: 1776 aggOp := s.pools.aggregateOp.Get() 1777 aggOp.incRef() // indicate current go-routine has a reference to the op 1778 closer = aggOp.decRef // release the ref for the current go-routine 1779 aggOp.update(ctx, opts.aggregateRequest, fetchState.completionFn) 1780 fetchState.ResetAggregate(opts.startInclusive, opts.endExclusive, 1781 aggOp, topoMap, s.state.majority, readLevel) 1782 op = aggOp 1783 1784 default: 1785 fetchState.decRef() // release fetchState 1786 instrument.EmitInvariantViolation(s.opts.InstrumentOptions()) 1787 return nil, xerrors.NewNonRetryableError(instrument.InvariantErrorf( 1788 "unknown fetchState type: %v", opts.stateType)) 1789 } 1790 1791 fetchState.Lock() 1792 for _, hq := range s.state.queues { 1793 // inc to indicate the hostQueue has a reference to `op` which has a ref to the fetchState 1794 fetchState.incRef() 1795 if err := hq.Enqueue(op); err != nil { 1796 fetchState.Unlock() 1797 closer() // release the ref for the current go-routine 1798 fetchState.decRef() // release the ref for the hostQueue 1799 fetchState.decRef() // release the ref for the current go-routine 1800 1801 // NB: if this happens we have a bug, once we are in the read 1802 // lock the current queues should never be closed 1803 wrappedErr := xerrors.NewNonRetryableError(fmt.Errorf("failed to enqueue in fetchState: %v", err)) 1804 instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), func(l *zap.Logger) { 1805 l.Error(wrappedErr.Error()) 1806 }) 1807 return nil, wrappedErr 1808 } 1809 } 1810 1811 closer() // release the ref for the current go-routine 1812 1813 // NB(prateek): the calling go-routine still holds the lock and a ref 1814 // on the returned fetchState object. 
1815 return fetchState, nil 1816 } 1817 1818 func (s *session) fetchIDsAttempt( 1819 inputNamespace ident.ID, 1820 inputIDs ident.Iterator, 1821 startInclusive, endExclusive xtime.UnixNano, 1822 ) (encoding.SeriesIterators, error) { 1823 nsCtx, err := s.nsCtxFor(inputNamespace) 1824 if err != nil { 1825 return nil, err 1826 } 1827 1828 var ( 1829 wg sync.WaitGroup 1830 allPending int32 1831 routeErr error 1832 enqueueErr error 1833 resultErrLock sync.RWMutex 1834 resultErr error 1835 resultErrs int32 1836 majority int32 1837 numReplicas int32 1838 readLevel topology.ReadConsistencyLevel 1839 fetchBatchOpsByHostIdx [][]*fetchBatchOp 1840 success = false 1841 startFetchAttempt = s.nowFn() 1842 ) 1843 1844 // NB(prateek): need to make a copy of inputNamespace and inputIDs to control 1845 // their life-cycle within this function. 1846 namespace := s.pools.id.Clone(inputNamespace) 1847 // First, we duplicate the iterator (only the struct referencing the underlying slice, 1848 // not the slice itself). Need this to be able to iterate the original iterator 1849 // multiple times in case of retries. 1850 ids := inputIDs.Duplicate() 1851 1852 rangeStart, tsErr := convert.ToValue(startInclusive, rpc.TimeType_UNIX_NANOSECONDS) 1853 if tsErr != nil { 1854 return nil, tsErr 1855 } 1856 1857 rangeEnd, tsErr := convert.ToValue(endExclusive, rpc.TimeType_UNIX_NANOSECONDS) 1858 if tsErr != nil { 1859 return nil, tsErr 1860 } 1861 1862 s.state.RLock() 1863 if s.state.status != statusOpen { 1864 s.state.RUnlock() 1865 return nil, ErrSessionStatusNotOpen 1866 } 1867 1868 iters := encoding.NewSizedSeriesIterators(ids.Remaining()) 1869 1870 defer func() { 1871 // NB(r): Ensure we cover all edge cases and close the iters in any case 1872 // of an error being returned 1873 if !success { 1874 iters.Close() 1875 } 1876 }() 1877 1878 // NB(r): We must take and return pooled items in the session read lock for the 1879 // pools that change during a topology update. 1880 // This is because when a queue is re-initialized it enqueues a fixed number 1881 // of entries into the backing channel for the pool and will forever stall 1882 // on the last few puts if any unexpected entries find their way there 1883 // while it is filling. 1884 fetchBatchOpsByHostIdx = s.pools.fetchBatchOpArrayArray.Get() 1885 1886 readLevel = s.state.readLevel 1887 majority = int32(s.state.majority) 1888 numReplicas = int32(s.state.replicas) 1889 1890 // NB(prateek): namespaceAccessors tracks the number of pending accessors for nsID. 1891 // It is incremented by the number of replicas for each requested ID during fetch 1892 // enqueuing, and once by the initial request; it is decremented for each replica 1893 // retrieved inside completionFn, and once by the allCompletionFn. So we know we can 1894 // Finalize `namespace` once its value reaches 0. 1895 namespaceAccessors := int32(0) 1896 1897 for idx := 0; ids.Next(); idx++ { 1898 var ( 1899 idx = idx // capture loop variable 1900 tsID = s.pools.id.Clone(ids.Current()) 1901 1902 wgIsDone int32 1903 // NB(xichen): resultsAccessors and idAccessors get initialized to number of replicas + 1 1904 // before enqueuing (incremented when iterating over the replicas for this ID), and get 1905 // decremented for each replica as well as inside the allCompletionFn, so we know that when 1906 // resultsAccessors is 0, results are no longer accessed and it's safe to return results 1907 // to the pool.
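// Worked example of the accounting with a replication factor of 3: resultsAccessors
// starts at 1 (the allCompletionFn's reference), is incremented once per replica at
// enqueue time (to 4), then decremented once by each of the 3 completionFn
// invocations and once by the allCompletionFn; whichever decrement reaches 0
// returns `results` to the pool:
//
//	resultsAccessors := int32(1) // allCompletionFn's reference
//	resultsAccessors += 3        // one per replica while routing the ID
//	// 3 completionFn calls + 1 allCompletionFn call each do
//	// atomic.AddInt32(&resultsAccessors, -1); the one that hits 0 frees results.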
1908 resultsAccessors int32 = 1 1909 idAccessors int32 = 1 1910 resultsLock sync.RWMutex 1911 results []encoding.MultiReaderIterator 1912 enqueued int32 1913 pending int32 1914 success int32 1915 errors []error 1916 errs int32 1917 ) 1918 1919 // increment namespaceAccessors by 1 to indicate it still needs to be handled by the 1920 // allCompletionFn for tsID. 1921 atomic.AddInt32(&namespaceAccessors, 1) 1922 1923 wg.Add(1) 1924 allCompletionFn := func() { 1925 var reportErrors []error 1926 errsLen := atomic.LoadInt32(&errs) 1927 if errsLen > 0 { 1928 resultErrLock.RLock() 1929 reportErrors = errors[:] 1930 resultErrLock.RUnlock() 1931 } 1932 responded := enqueued - atomic.LoadInt32(&pending) 1933 err := s.readConsistencyResult(readLevel, majority, enqueued, 1934 responded, errsLen, reportErrors) 1935 s.recordFetchMetrics(err, errsLen, startFetchAttempt) 1936 if err != nil { 1937 resultErrLock.Lock() 1938 if resultErr == nil { 1939 resultErr = err 1940 } 1941 resultErrs++ 1942 resultErrLock.Unlock() 1943 } else { 1944 resultsLock.RLock() 1945 numItersToInclude := int(success) 1946 numDesired := topology.NumDesiredForReadConsistency(readLevel, int(numReplicas), int(majority)) 1947 if numDesired < numItersToInclude { 1948 // Avoid decoding more data than is required to satisfy the consistency guarantees. 1949 numItersToInclude = numDesired 1950 } 1951 1952 itersToInclude := results[:numItersToInclude] 1953 resultsLock.RUnlock() 1954 1955 iter := s.pools.seriesIterator.Get() 1956 // NB(prateek): we need to allocate a copy of ident.ID to allow the seriesIterator 1957 // to have control over the lifecycle of ID. We cannot allow seriesIterator 1958 // to control the lifecycle of the original ident.ID, as it might still be in use 1959 // due to a pending request in queue. 1960 seriesID := s.pools.id.Clone(tsID) 1961 namespaceID := s.pools.id.Clone(namespace) 1962 consolidator := s.opts.IterationOptions().SeriesIteratorConsolidator 1963 iter.Reset(encoding.SeriesIteratorOptions{ 1964 ID: seriesID, 1965 Namespace: namespaceID, 1966 StartInclusive: startInclusive, 1967 EndExclusive: endExclusive, 1968 Replicas: itersToInclude, 1969 SeriesIteratorConsolidator: consolidator, 1970 }) 1971 iters.SetAt(idx, iter) 1972 } 1973 if atomic.AddInt32(&resultsAccessors, -1) == 0 { 1974 s.pools.multiReaderIteratorArray.Put(results) 1975 } 1976 if atomic.AddInt32(&idAccessors, -1) == 0 { 1977 tsID.Finalize() 1978 } 1979 if atomic.AddInt32(&namespaceAccessors, -1) == 0 { 1980 namespace.Finalize() 1981 } 1982 wg.Done() 1983 } 1984 completionFn := func(result interface{}, err error) { 1985 var snapshotSuccess int32 1986 if err != nil { 1987 if IsBadRequestError(err) { 1988 // Wrap with invalid params and non-retryable so it is 1989 // not retried. 1990 err = xerrors.NewInvalidParamsError(err) 1991 err = xerrors.NewNonRetryableError(err) 1992 } 1993 atomic.AddInt32(&errs, 1) 1994 // NB(r): reuse the error lock here as we do not want to create 1995 // a whole lot of locks for every single ID fetched; the mutex is 1996 // non-trivially sized and would likely cause more stack growth, or 1997 // GC pressure if it ends up on the heap, which is likely given naive 1998 // escape analysis.
1999 resultErrLock.Lock() 2000 errors = append(errors, err) 2001 resultErrLock.Unlock() 2002 } else { 2003 slicesIter := s.pools.readerSliceOfSlicesIterator.Get() 2004 slicesIter.Reset(result.([]*rpc.Segments)) 2005 multiIter := s.pools.multiReaderIterator.Get() 2006 multiIter.ResetSliceOfSlices(slicesIter, nsCtx.Schema) 2007 // Results is pre-allocated after creating fetch ops for this ID below 2008 resultsLock.Lock() 2009 results[success] = multiIter 2010 success++ 2011 snapshotSuccess = success 2012 resultsLock.Unlock() 2013 } 2014 // NB(xichen): decrementing pending and checking remaining against zero must 2015 // come after incrementing success, otherwise we might end up passing results[:success] 2016 // to iter.Reset down below before setting the iterator in the results array, 2017 // which would cause a nil pointer exception. 2018 remaining := atomic.AddInt32(&pending, -1) 2019 shouldTerminate := topology.ReadConsistencyTermination( 2020 readLevel, majority, remaining, snapshotSuccess, 2021 ) 2022 if shouldTerminate && atomic.CompareAndSwapInt32(&wgIsDone, 0, 1) { 2023 allCompletionFn() 2024 } 2025 2026 if atomic.AddInt32(&resultsAccessors, -1) == 0 { 2027 s.pools.multiReaderIteratorArray.Put(results) 2028 } 2029 if atomic.AddInt32(&idAccessors, -1) == 0 { 2030 tsID.Finalize() 2031 } 2032 if atomic.AddInt32(&namespaceAccessors, -1) == 0 { 2033 namespace.Finalize() 2034 } 2035 } 2036 2037 if err := s.state.topoMap.RouteForEach(tsID, func( 2038 hostIdx int, 2039 hostShard shard.Shard, 2040 host topology.Host, 2041 ) { 2042 // Inc safely as this for each is sequential 2043 enqueued++ 2044 pending++ 2045 allPending++ 2046 resultsAccessors++ 2047 namespaceAccessors++ 2048 idAccessors++ 2049 2050 ops := fetchBatchOpsByHostIdx[hostIdx] 2051 2052 var f *fetchBatchOp 2053 if len(ops) > 0 { 2054 // Find the last and potentially current fetch op for this host 2055 f = ops[len(ops)-1] 2056 } 2057 if f == nil || f.Size() >= s.fetchBatchSize { 2058 // If no current fetch op or existing one is at batch capacity add one 2059 // NB(r): Note that we defer to the host queue to take ownership 2060 // of these ops and for returning the ops to the pool when done as 2061 // they know when their use is complete. 
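// For instance, with a fetch batch size of 128 (illustrative), 300 IDs routed to
// the same host are packed into three ops of 128, 128 and 44 entries: a fresh op
// is taken from the pool whenever there is no current op for the host or the
// current op is already at capacity.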
2062 f = s.pools.fetchBatchOp.Get() 2063 f.IncRef() 2064 fetchBatchOpsByHostIdx[hostIdx] = append(fetchBatchOpsByHostIdx[hostIdx], f) 2065 f.request.RangeStart = rangeStart 2066 f.request.RangeEnd = rangeEnd 2067 f.request.RangeTimeType = rpc.TimeType_UNIX_NANOSECONDS 2068 } 2069 2070 // Append IDWithNamespace to this request 2071 f.append(namespace.Bytes(), tsID.Bytes(), completionFn) 2072 }); err != nil { 2073 routeErr = err 2074 break 2075 } 2076 2077 // Once we've enqueued we know how many to expect so retrieve and set length 2078 results = s.pools.multiReaderIteratorArray.Get(int(enqueued)) 2079 results = results[:enqueued] 2080 } 2081 2082 if routeErr != nil { 2083 s.state.RUnlock() 2084 return nil, routeErr 2085 } 2086 2087 // Enqueue fetch ops 2088 for idx := range fetchBatchOpsByHostIdx { 2089 for _, f := range fetchBatchOpsByHostIdx[idx] { 2090 // Passing ownership of the op itself to the host queue 2091 f.DecRef() 2092 if err := s.state.queues[idx].Enqueue(f); err != nil && enqueueErr == nil { 2093 enqueueErr = err 2094 break 2095 } 2096 } 2097 if enqueueErr != nil { 2098 break 2099 } 2100 } 2101 s.pools.fetchBatchOpArrayArray.Put(fetchBatchOpsByHostIdx) 2102 s.state.RUnlock() 2103 2104 if enqueueErr != nil { 2105 s.log.Error("failed to enqueue fetch", zap.Error(enqueueErr)) 2106 return nil, enqueueErr 2107 } 2108 2109 wg.Wait() 2110 2111 resultErrLock.RLock() 2112 retErr := resultErr 2113 resultErrLock.RUnlock() 2114 if retErr != nil { 2115 return nil, retErr 2116 } 2117 success = true 2118 return iters, nil 2119 } 2120 2121 func (s *session) writeConsistencyResult( 2122 level topology.ConsistencyLevel, 2123 majority, enqueued, responded, resultErrs int32, 2124 errs []error, 2125 ) error { 2126 // Check consistency level satisfied 2127 success := enqueued - resultErrs 2128 if !topology.WriteConsistencyAchieved(level, int(majority), int(enqueued), int(success)) { 2129 return newConsistencyResultError(level, int(enqueued), int(responded), errs) 2130 } 2131 return nil 2132 } 2133 2134 func (s *session) readConsistencyResult( 2135 level topology.ReadConsistencyLevel, 2136 majority, enqueued, responded, resultErrs int32, 2137 errs []error, 2138 ) error { 2139 // Check consistency level satisfied 2140 success := enqueued - resultErrs 2141 if !topology.ReadConsistencyAchieved(level, int(majority), int(enqueued), int(success)) { 2142 return newConsistencyResultError(level, int(enqueued), int(responded), errs) 2143 } 2144 return nil 2145 } 2146 2147 func (s *session) IteratorPools() (encoding.IteratorPools, error) { 2148 s.state.RLock() 2149 defer s.state.RUnlock() 2150 if s.state.status != statusOpen { 2151 return nil, ErrSessionStatusNotOpen 2152 } 2153 return s.pools, nil 2154 } 2155 2156 func (s *session) Close() error { 2157 s.state.Lock() 2158 if s.state.status != statusOpen { 2159 s.state.Unlock() 2160 return ErrSessionStatusNotOpen 2161 } 2162 s.state.status = statusClosed 2163 queues := s.state.queues 2164 topoWatch := s.state.topoWatch 2165 topo := s.state.topo 2166 s.state.Unlock() 2167 2168 for _, q := range queues { 2169 q.Close() 2170 } 2171 2172 topoWatch.Close() 2173 topo.Close() 2174 2175 if closer := s.runtimeOptsListenerCloser; closer != nil { 2176 closer.Close() 2177 } 2178 2179 return nil 2180 } 2181 2182 func (s *session) Origin() topology.Host { 2183 return s.origin 2184 } 2185 2186 func (s *session) Replicas() int { 2187 s.state.RLock() 2188 v := s.state.replicas 2189 s.state.RUnlock() 2190 return v 2191 } 2192 2193 func (s *session) TopologyMap() (topology.Map, 
error) { 2194 s.state.RLock() 2195 topoMap, err := s.topologyMapWithStateRLock() 2196 s.state.RUnlock() 2197 return topoMap, err 2198 } 2199 2200 func (s *session) topologyMapWithStateRLock() (topology.Map, error) { 2201 status := s.state.status 2202 topoMap := s.state.topoMap 2203 2204 // Make sure the session is open, as thats what sets the initial topology. 2205 if status != statusOpen { 2206 return nil, ErrSessionStatusNotOpen 2207 } 2208 if topoMap == nil { 2209 // Should never happen. 2210 return nil, instrument.InvariantErrorf("session does not have a topology map") 2211 } 2212 2213 return topoMap, nil 2214 } 2215 2216 func (s *session) Truncate(namespace ident.ID) (int64, error) { 2217 var ( 2218 wg sync.WaitGroup 2219 enqueueErr xerrors.MultiError 2220 resultErrLock sync.Mutex 2221 resultErr xerrors.MultiError 2222 truncated int64 2223 ) 2224 2225 t := &truncateOp{} 2226 t.request.NameSpace = namespace.Bytes() 2227 t.completionFn = func(result interface{}, err error) { 2228 if err != nil { 2229 resultErrLock.Lock() 2230 resultErr = resultErr.Add(err) 2231 resultErrLock.Unlock() 2232 } else { 2233 res := result.(*rpc.TruncateResult_) 2234 atomic.AddInt64(&truncated, res.NumSeries) 2235 } 2236 wg.Done() 2237 } 2238 2239 s.state.RLock() 2240 for idx := range s.state.queues { 2241 wg.Add(1) 2242 if err := s.state.queues[idx].Enqueue(t); err != nil { 2243 wg.Done() 2244 enqueueErr = enqueueErr.Add(err) 2245 } 2246 } 2247 s.state.RUnlock() 2248 2249 if err := enqueueErr.FinalError(); err != nil { 2250 s.log.Error("failed to enqueue request", zap.Error(err)) 2251 return 0, err 2252 } 2253 2254 // Wait for namespace to be truncated on all replicas 2255 wg.Wait() 2256 2257 return truncated, resultErr.FinalError() 2258 } 2259 2260 // NB(r): Excluding maligned struct check here as we can 2261 // live with a few extra bytes since this struct is only 2262 // ever passed by stack, its much more readable not optimized 2263 // nolint: maligned 2264 type peers struct { 2265 peers []peer 2266 shard uint32 2267 majorityReplicas int 2268 selfExcluded bool 2269 selfHostShardSet topology.HostShardSet 2270 } 2271 2272 func (p peers) selfExcludedAndSelfHasShardAvailable() bool { 2273 if !p.selfExcluded { 2274 return false 2275 } 2276 state, err := p.selfHostShardSet.ShardSet().LookupStateByID(p.shard) 2277 if err != nil { 2278 return false 2279 } 2280 return state == shard.Available 2281 } 2282 2283 func (s *session) peersForShard(shardID uint32) (peers, error) { 2284 s.state.RLock() 2285 var ( 2286 lookupErr error 2287 result = peers{ 2288 peers: make([]peer, 0, s.state.topoMap.Replicas()), 2289 shard: shardID, 2290 majorityReplicas: s.state.topoMap.MajorityReplicas(), 2291 } 2292 ) 2293 err := s.state.topoMap.RouteShardForEach(shardID, func( 2294 idx int, 2295 _ shard.Shard, 2296 host topology.Host, 2297 ) { 2298 if s.origin != nil && s.origin.ID() == host.ID() { 2299 // Don't include the origin host 2300 result.selfExcluded = true 2301 // Include the origin host shard set for help determining quorum 2302 hostShardSet, ok := s.state.topoMap.LookupHostShardSet(host.ID()) 2303 if !ok { 2304 lookupErr = fmt.Errorf("could not find shard set for host ID: %s", host.ID()) 2305 } 2306 result.selfHostShardSet = hostShardSet 2307 return 2308 } 2309 result.peers = append(result.peers, newPeer(s, host)) 2310 }) 2311 s.state.RUnlock() 2312 if resultErr := xerrors.FirstError(err, lookupErr); resultErr != nil { 2313 return peers{}, resultErr 2314 } 2315 return result, nil 2316 } 2317 2318 func (s *session) 
FetchBootstrapBlocksMetadataFromPeers( 2319 namespace ident.ID, 2320 shard uint32, 2321 start, end xtime.UnixNano, 2322 resultOpts result.Options, 2323 ) (PeerBlockMetadataIter, error) { 2324 level := newSessionBootstrapRuntimeReadConsistencyLevel(s) 2325 return s.fetchBlocksMetadataFromPeers(namespace, 2326 shard, start, end, level, resultOpts) 2327 } 2328 2329 func (s *session) FetchBlocksMetadataFromPeers( 2330 namespace ident.ID, 2331 shard uint32, 2332 start, end xtime.UnixNano, 2333 consistencyLevel topology.ReadConsistencyLevel, 2334 resultOpts result.Options, 2335 ) (PeerBlockMetadataIter, error) { 2336 level := newStaticRuntimeReadConsistencyLevel(consistencyLevel) 2337 return s.fetchBlocksMetadataFromPeers(namespace, 2338 shard, start, end, level, resultOpts) 2339 } 2340 2341 func (s *session) fetchBlocksMetadataFromPeers( 2342 namespace ident.ID, 2343 shard uint32, 2344 start, end xtime.UnixNano, 2345 level runtimeReadConsistencyLevel, 2346 resultOpts result.Options, 2347 ) (PeerBlockMetadataIter, error) { 2348 peers, err := s.peersForShard(shard) 2349 if err != nil { 2350 return nil, err 2351 } 2352 2353 var ( 2354 metadataCh = make(chan receivedBlockMetadata, 2355 blockMetadataChBufSize) 2356 errCh = make(chan error, 1) 2357 meta = resultTypeMetadata 2358 m = s.newPeerMetadataStreamingProgressMetrics(shard, meta) 2359 ) 2360 go func() { 2361 errCh <- s.streamBlocksMetadataFromPeers(namespace, shard, 2362 peers, start, end, level, metadataCh, resultOpts, m) 2363 close(metadataCh) 2364 close(errCh) 2365 }() 2366 2367 iter := newMetadataIter(metadataCh, errCh, 2368 s.pools.tagDecoder, s.pools.id) 2369 return iter, nil 2370 } 2371 2372 // FetchBootstrapBlocksFromPeers will fetch the specified blocks from peers for 2373 // bootstrapping purposes. Refer to peer_bootstrapping.md for more details. 
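// A minimal usage sketch (hypothetical caller such as a peers bootstrapper;
// nsMeta, shardID, start, end and resultOpts are assumed to be in scope, and the
// iteration below assumes the result.ShardResult AllSeries map accessor):
//
//	shardResult, err := session.FetchBootstrapBlocksFromPeers(
//		nsMeta, shardID, start, end, resultOpts)
//	if err != nil {
//		return err // could not stream data at the required consistency
//	}
//	for _, entry := range shardResult.AllSeries().Iter() {
//		series := entry.Value()
//		_ = series.Blocks // per-series blocks fetched from peers
//	}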
2374 func (s *session) FetchBootstrapBlocksFromPeers( 2375 nsMetadata namespace.Metadata, 2376 shard uint32, 2377 start, end xtime.UnixNano, 2378 opts result.Options, 2379 ) (result.ShardResult, error) { 2380 nsCtx, err := s.nsCtxFromMetadata(nsMetadata) 2381 if err != nil { 2382 return nil, err 2383 } 2384 var ( 2385 result = newBulkBlocksResult(nsCtx, s.opts, opts, 2386 s.pools.tagDecoder, s.pools.id) 2387 doneCh = make(chan struct{}) 2388 progress = s.newPeerMetadataStreamingProgressMetrics(shard, 2389 resultTypeBootstrap) 2390 level = newSessionBootstrapRuntimeReadConsistencyLevel(s) 2391 ) 2392 2393 // Determine which peers own the specified shard 2394 peers, err := s.peersForShard(shard) 2395 if err != nil { 2396 return nil, err 2397 } 2398 2399 // Emit a gauge indicating whether we're done or not 2400 go func() { 2401 for { 2402 select { 2403 case <-doneCh: 2404 progress.fetchBlocksFromPeers.Update(0) 2405 return 2406 default: 2407 progress.fetchBlocksFromPeers.Update(1) 2408 time.Sleep(gaugeReportInterval) 2409 } 2410 } 2411 }() 2412 defer close(doneCh) 2413 2414 // Begin pulling metadata; if one or more peers fail, no error will be 2415 // returned from this routine as long as one peer succeeds completely. 2416 metadataCh := make(chan receivedBlockMetadata, blockMetadataChBufSize) 2417 // Spin up a background goroutine which will begin streaming metadata from 2418 // all the peers and pushing them into the metadataCh 2419 errCh := make(chan error, 1) 2420 go func() { 2421 errCh <- s.streamBlocksMetadataFromPeers(nsMetadata.ID(), shard, 2422 peers, start, end, level, metadataCh, opts, progress) 2423 close(metadataCh) 2424 }() 2425 2426 // Begin consuming metadata and making requests. This will block until all 2427 // data has been streamed (or failed to stream). Note that while this function 2428 // does return an error, an error will only be returned in a select few cases. 2429 // In some scenarios where something goes wrong here we won't report it to 2430 // the caller, but metrics and logs are emitted internally. Also note that the 2431 // streamAndGroupCollectedBlocksMetadata function is injected.
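// The overall pipeline, shown schematically (each stage below is a goroutine or
// channel set up in this function):
//
//	peers --(streamBlocksMetadataFromPeers)--> metadataCh
//	metadataCh --(streamAndGroupCollectedBlocksMetadata)--> enqueueCh
//	enqueueCh --(selectPeersFromPerPeerBlockMetadatas)--> per-peer queues
//	per-peer queues --(streamBlocksBatchFromPeer)--> result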
2432 err = s.streamBlocksFromPeers(nsMetadata, shard, peers, metadataCh, opts, 2433 level, result, progress, s.streamAndGroupCollectedBlocksMetadata) 2434 if err != nil { 2435 return nil, err 2436 } 2437 2438 // Check if an error occurred during the metadata streaming 2439 if err = <-errCh; err != nil { 2440 return nil, err 2441 } 2442 2443 return result.result, nil 2444 } 2445 2446 func (s *session) FetchBlocksFromPeers( 2447 nsMetadata namespace.Metadata, 2448 shard uint32, 2449 consistencyLevel topology.ReadConsistencyLevel, 2450 metadatas []block.ReplicaMetadata, 2451 opts result.Options, 2452 ) (PeerBlocksIter, error) { 2453 nsCtx, err := s.nsCtxFromMetadata(nsMetadata) 2454 if err != nil { 2455 return nil, err 2456 } 2457 var ( 2458 logger = opts.InstrumentOptions().Logger() 2459 level = newStaticRuntimeReadConsistencyLevel(consistencyLevel) 2460 complete = int64(0) 2461 doneCh = make(chan error, 1) 2462 outputCh = make(chan peerBlocksDatapoint, 4096) 2463 result = newStreamBlocksResult(nsCtx, s.opts, opts, outputCh, 2464 s.pools.tagDecoder, s.pools.id) 2465 onDone = func(err error) { 2466 atomic.StoreInt64(&complete, 1) 2467 select { 2468 case doneCh <- err: 2469 default: 2470 } 2471 } 2472 progress = s.newPeerMetadataStreamingProgressMetrics(shard, resultTypeRaw) 2473 ) 2474 2475 peers, err := s.peersForShard(shard) 2476 if err != nil { 2477 return nil, err 2478 } 2479 2480 peersByHost := make(map[string]peer, len(peers.peers)) 2481 for _, peer := range peers.peers { 2482 peersByHost[peer.Host().ID()] = peer 2483 } 2484 2485 // If any metadata has tags then encode them up front so we can return an 2486 // error on tag encoding failure rather than logging an error later that 2487 // could easily be missed. 2488 var ( 2489 metadatasEncodedTags []checked.Bytes 2490 anyTags bool 2491 ) 2492 for _, meta := range metadatas { 2493 if len(meta.Tags.Values()) > 0 { 2494 anyTags = true 2495 break 2496 } 2497 } 2498 if anyTags { 2499 // NB(r): Allocate the exact length so nil entries are preserved and each 2500 // index references the same index as the incoming metadatas being fetched. 2501 metadatasEncodedTags = make([]checked.Bytes, len(metadatas)) 2502 tagsIter := ident.NewTagsIterator(ident.Tags{}) 2503 for idx, meta := range metadatas { 2504 if len(meta.Tags.Values()) == 0 { 2505 continue 2506 } 2507 2508 tagsIter.Reset(meta.Tags) 2509 tagsEncoder := s.pools.tagEncoder.Get() 2510 if err := tagsEncoder.Encode(tagsIter); err != nil { 2511 return nil, err 2512 } 2513 2514 encodedTagsCheckedBytes, ok := tagsEncoder.Data() 2515 if !ok { 2516 return nil, fmt.Errorf("could not encode tags: id=%s", meta.ID.String()) 2517 } 2518 2519 metadatasEncodedTags[idx] = encodedTagsCheckedBytes 2520 } 2521 } 2522 2523 go func() { 2524 for atomic.LoadInt64(&complete) == 0 { 2525 progress.fetchBlocksFromPeers.Update(1) 2526 time.Sleep(gaugeReportInterval) 2527 } 2528 progress.fetchBlocksFromPeers.Update(0) 2529 }() 2530 2531 metadataCh := make(chan receivedBlockMetadata, blockMetadataChBufSize) 2532 go func() { 2533 for idx, rb := range metadatas { 2534 peer, ok := peersByHost[rb.Host.ID()] 2535 if !ok { 2536 logger.Warn("replica requested from unknown peer, skipping", 2537 zap.Stringer("peer", rb.Host), 2538 zap.Stringer("id", rb.ID), 2539 zap.Time("start", rb.Start.ToTime()), 2540 ) 2541 continue 2542 } 2543 2544 // Attach encoded tags if present.
2545 var encodedTags checked.Bytes 2546 if idx < len(metadatasEncodedTags) { 2547 // Note: this could still be nil if the metadata had no tags, but since 2548 // the slice was built we need to take a ref to the encoded tags when 2549 // they were encoded. 2550 encodedTags = metadatasEncodedTags[idx] 2551 } 2552 2553 metadataCh <- receivedBlockMetadata{ 2554 id: rb.Metadata.ID, 2555 encodedTags: encodedTags, 2556 peer: peer, 2557 block: blockMetadata{ 2558 start: rb.Start, 2559 size: rb.Size, 2560 checksum: rb.Checksum, 2561 lastRead: rb.LastRead, 2562 }, 2563 } 2564 } 2565 close(metadataCh) 2566 }() 2567 2568 // Begin consuming metadata and making requests. 2569 go func() { 2570 err := s.streamBlocksFromPeers(nsMetadata, shard, peers, metadataCh, 2571 opts, level, result, progress, s.passThroughBlocksMetadata) 2572 close(outputCh) 2573 onDone(err) 2574 }() 2575 2576 pbi := newPeerBlocksIter(outputCh, doneCh) 2577 return pbi, nil 2578 } 2579 2580 func (s *session) streamBlocksMetadataFromPeers( 2581 namespace ident.ID, 2582 shardID uint32, 2583 peers peers, 2584 start, end xtime.UnixNano, 2585 level runtimeReadConsistencyLevel, 2586 metadataCh chan<- receivedBlockMetadata, 2587 resultOpts result.Options, 2588 progress *streamFromPeersMetrics, 2589 ) error { 2590 var ( 2591 wg sync.WaitGroup 2592 errs = newSyncAbortableErrorsMap() 2593 pending = int64(len(peers.peers)) 2594 majority = int32(peers.majorityReplicas) 2595 enqueued = int32(len(peers.peers)) 2596 responded int32 2597 success int32 2598 ) 2599 if peers.selfExcludedAndSelfHasShardAvailable() { 2600 // If we excluded ourselves from fetching, we basically treat ourselves 2601 // as a successful peer response since we can bootstrap from ourselves 2602 // just fine 2603 enqueued++ 2604 success++ 2605 } 2606 2607 progress.metadataFetches.Update(float64(pending)) 2608 for idx, peer := range peers.peers { 2609 idx := idx 2610 peer := peer 2611 2612 wg.Add(1) 2613 go func() { 2614 defer func() { 2615 // Success or error counts towards a response 2616 atomic.AddInt32(&responded, 1) 2617 2618 // Decrement pending 2619 progress.metadataFetches.Update(float64(atomic.AddInt64(&pending, -1))) 2620 2621 // Mark done 2622 wg.Done() 2623 }() 2624 2625 var ( 2626 firstAttempt = true 2627 // NB(r): currPageToken keeps the position in the pagination of the 2628 // metadata from this peer. It begins as nil, but if an error is 2629 // returned it will likely not be nil; this lets us restart fetching, 2630 // if we need to (when consistency has not been achieved yet), without 2631 // losing our place in the pagination.
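// Schematically, the resume-on-retry behaviour looks like the following
// (illustrative pseudocode of the loop below):
//
//	var tok pageToken // nil on the first attempt
//	for condition() {
//		tok, err = s.streamBlocksMetadataFromPeer(..., tok, ...)
//		// on full success the peer is marked successful and the loop exits;
//		// on failure tok points at the last acknowledged page, so the next
//		// attempt resumes where the previous one left off
//	}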
2632 currPageToken pageToken 2633 currHostNotAvailableSleepInterval = hostNotAvailableMinSleepInterval 2634 ) 2635 condition := func() bool { 2636 if firstAttempt { 2637 // Always attempt at least once 2638 firstAttempt = false 2639 return true 2640 } 2641 2642 var ( 2643 currLevel = level.value() 2644 majority = int(majority) 2645 enqueued = int(enqueued) 2646 success = int(atomic.LoadInt32(&success)) 2647 ) 2648 metReadConsistency := topology.ReadConsistencyAchieved( 2649 currLevel, majority, enqueued, success) 2650 doRetry := !metReadConsistency && errs.getAbortError() == nil 2651 2652 if doRetry { 2653 // Track that we are reattempting the fetch metadata 2654 // pagination from a peer 2655 progress.metadataPeerRetry.Inc(1) 2656 } 2657 return doRetry 2658 } 2659 for condition() { 2660 var err error 2661 currPageToken, err = s.streamBlocksMetadataFromPeer(namespace, shardID, 2662 peer, start, end, currPageToken, metadataCh, resultOpts, progress) 2663 // Set error or success if err is nil 2664 errs.setError(idx, err) 2665 2666 // hostNotAvailable is a NonRetryableError for the purposes of short-circuiting 2667 // the automatic retry functionality, but in this case the client should avoid 2668 // aborting and continue retrying at this level until consistency can be reached. 2669 if isHostNotAvailableError(err) { 2670 // Prevent the loop from spinning too aggressively in the short-circuiting case. 2671 time.Sleep(currHostNotAvailableSleepInterval) 2672 currHostNotAvailableSleepInterval = minDuration( 2673 currHostNotAvailableSleepInterval*2, 2674 hostNotAvailableMaxSleepInterval, 2675 ) 2676 continue 2677 } 2678 2679 if err != nil && xerrors.IsNonRetryableError(err) { 2680 errs.setAbortError(err) 2681 return // Cannot recover from this error, so we stop fetching from this peer 2682 } 2683 2684 if err == nil { 2685 atomic.AddInt32(&success, 1) 2686 return 2687 } 2688 2689 // There was a retryable error, continue looping.
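// The host-not-available backoff above doubles from 1ms up to the 100ms cap
// (hostNotAvailableMinSleepInterval / hostNotAvailableMaxSleepInterval), i.e.
// sleeps of 1, 2, 4, ..., 64ms and then 100ms on every subsequent short-circuited
// attempt:
//
//	cur := hostNotAvailableMinSleepInterval
//	time.Sleep(cur)                                            // 1ms
//	cur = minDuration(cur*2, hostNotAvailableMaxSleepInterval) // 2ms, ..., capped at 100ms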
2690 } 2691 }() 2692 } 2693 2694 wg.Wait() 2695 2696 if err := errs.getAbortError(); err != nil { 2697 return err 2698 } 2699 2700 errors := errs.getErrors() 2701 return s.readConsistencyResult(level.value(), majority, enqueued, 2702 atomic.LoadInt32(&responded), int32(len(errors)), errors) 2703 } 2704 2705 type pageToken []byte 2706 2707 // streamBlocksMetadataFromPeer has several heap allocated anonymous 2708 // function, however, they're only allocated once per peer/shard combination 2709 // for the entire peer bootstrapping process so performance is acceptable 2710 func (s *session) streamBlocksMetadataFromPeer( 2711 namespace ident.ID, 2712 shard uint32, 2713 peer peer, 2714 start, end xtime.UnixNano, 2715 startPageToken pageToken, 2716 metadataCh chan<- receivedBlockMetadata, 2717 resultOpts result.Options, 2718 progress *streamFromPeersMetrics, 2719 ) (pageToken, error) { 2720 var ( 2721 optionIncludeSizes = true 2722 optionIncludeChecksums = true 2723 optionIncludeLastRead = true 2724 moreResults = true 2725 idPool = s.pools.id 2726 bytesPool = resultOpts.DatabaseBlockOptions().BytesPool() 2727 2728 // Only used for logs 2729 peerStr = peer.Host().ID() 2730 metadataCountByBlock = map[xtime.UnixNano]int64{} 2731 ) 2732 defer func() { 2733 for block, numMetadata := range metadataCountByBlock { 2734 s.log.Debug("finished streaming blocks metadata from peer", 2735 zap.Uint32("shard", shard), 2736 zap.String("peer", peerStr), 2737 zap.Int64("numMetadata", numMetadata), 2738 zap.Time("block", block.ToTime()), 2739 ) 2740 } 2741 }() 2742 2743 // Declare before loop to avoid redeclaring each iteration 2744 attemptFn := func(client rpc.TChanNode) error { 2745 tctx, _ := thrift.NewContext(s.streamBlocksMetadataBatchTimeout) 2746 req := rpc.NewFetchBlocksMetadataRawV2Request() 2747 req.NameSpace = namespace.Bytes() 2748 req.Shard = int32(shard) 2749 req.RangeStart = int64(start) 2750 req.RangeEnd = int64(end) 2751 req.Limit = int64(s.streamBlocksBatchSize) 2752 req.PageToken = startPageToken 2753 req.IncludeSizes = &optionIncludeSizes 2754 req.IncludeChecksums = &optionIncludeChecksums 2755 req.IncludeLastRead = &optionIncludeLastRead 2756 2757 progress.metadataFetchBatchCall.Inc(1) 2758 result, err := client.FetchBlocksMetadataRawV2(tctx, req) 2759 if err != nil { 2760 progress.metadataFetchBatchError.Inc(1) 2761 return err 2762 } 2763 2764 progress.metadataFetchBatchSuccess.Inc(1) 2765 progress.metadataReceived.Inc(int64(len(result.Elements))) 2766 2767 if result.NextPageToken != nil { 2768 // Reset pageToken + copy new pageToken into previously allocated memory, 2769 // extending as necessary 2770 startPageToken = append(startPageToken[:0], result.NextPageToken...) 2771 } else { 2772 // No further results 2773 moreResults = false 2774 } 2775 2776 for _, elem := range result.Elements { 2777 blockStart := xtime.UnixNano(elem.Start) 2778 2779 data := bytesPool.Get(len(elem.ID)) 2780 data.IncRef() 2781 data.AppendAll(elem.ID) 2782 data.DecRef() 2783 clonedID := idPool.BinaryID(data) 2784 // Return thrift bytes to pool once the ID has been copied. 2785 apachethrift.BytesPoolPut(elem.ID) 2786 2787 var encodedTags checked.Bytes 2788 if tagBytes := elem.EncodedTags; len(tagBytes) != 0 { 2789 encodedTags = bytesPool.Get(len(tagBytes)) 2790 encodedTags.IncRef() 2791 encodedTags.AppendAll(tagBytes) 2792 encodedTags.DecRef() 2793 // Return thrift bytes to pool once the tags have been copied. 
2794 apachethrift.BytesPoolPut(tagBytes) 2795 } 2796 2797 // Error occurred retrieving block metadata, use default values 2798 if err := elem.Err; err != nil { 2799 progress.metadataFetchBatchBlockErr.Inc(1) 2800 s.log.Error("error occurred retrieving block metadata", 2801 zap.Uint32("shard", shard), 2802 zap.String("peer", peerStr), 2803 zap.Time("block", blockStart.ToTime()), 2804 zap.Error(err), 2805 ) 2806 // Enqueue with a zeroed checksum which triggers a fanout fetch 2807 metadataCh <- receivedBlockMetadata{ 2808 peer: peer, 2809 id: clonedID, 2810 encodedTags: encodedTags, 2811 block: blockMetadata{ 2812 start: blockStart, 2813 }, 2814 } 2815 continue 2816 } 2817 2818 var size int64 2819 if elem.Size != nil { 2820 size = *elem.Size 2821 } 2822 2823 var pChecksum *uint32 2824 if elem.Checksum != nil { 2825 value := uint32(*elem.Checksum) 2826 pChecksum = &value 2827 } 2828 2829 var lastRead xtime.UnixNano 2830 if elem.LastRead != nil { 2831 value, err := convert.ToTime(*elem.LastRead, elem.LastReadTimeType) 2832 if err == nil { 2833 lastRead = value 2834 } 2835 } 2836 2837 metadataCh <- receivedBlockMetadata{ 2838 peer: peer, 2839 id: clonedID, 2840 encodedTags: encodedTags, 2841 block: blockMetadata{ 2842 start: blockStart, 2843 size: size, 2844 checksum: pChecksum, 2845 lastRead: lastRead, 2846 }, 2847 } 2848 // Only used for logs 2849 metadataCountByBlock[blockStart]++ 2850 } 2851 return nil 2852 } 2853 2854 var attemptErr error 2855 checkedAttemptFn := func(client rpc.TChanNode, _ Channel) { 2856 attemptErr = attemptFn(client) 2857 } 2858 2859 fetchFn := func() error { 2860 borrowErr := peer.BorrowConnection(checkedAttemptFn) 2861 return xerrors.FirstError(borrowErr, attemptErr) 2862 } 2863 2864 for moreResults { 2865 if err := s.streamBlocksRetrier.Attempt(fetchFn); err != nil { 2866 return startPageToken, err 2867 } 2868 } 2869 return nil, nil 2870 } 2871 2872 func (s *session) streamBlocksFromPeers( 2873 nsMetadata namespace.Metadata, 2874 shard uint32, 2875 peers peers, 2876 metadataCh <-chan receivedBlockMetadata, 2877 opts result.Options, 2878 consistencyLevel runtimeReadConsistencyLevel, 2879 result blocksResult, 2880 progress *streamFromPeersMetrics, 2881 streamMetadataFn streamBlocksMetadataFn, 2882 ) error { 2883 var ( 2884 enqueueCh = newEnqueueChannel(progress) 2885 peerBlocksBatchSize = s.streamBlocksBatchSize 2886 numPeers = len(peers.peers) 2887 uncheckedBytesPool = opts.DatabaseBlockOptions().BytesPool().BytesPool() 2888 ) 2889 2890 // Consume the incoming metadata and enqueue to the ready channel 2891 // Spin up background goroutine to consume 2892 go func() { 2893 streamMetadataFn(numPeers, metadataCh, enqueueCh, uncheckedBytesPool) 2894 // Begin assessing the queue and how much is processed, once queue 2895 // is entirely processed then we can close the enqueue channel 2896 enqueueCh.closeOnAllProcessed() 2897 }() 2898 2899 // Fetch blocks from peers as results become ready 2900 peerQueues := make(peerBlocksQueues, 0, numPeers) 2901 for _, peer := range peers.peers { 2902 peer := peer 2903 size := peerBlocksBatchSize 2904 workers := s.streamBlocksWorkers 2905 drainEvery := 100 * time.Millisecond 2906 queue := s.newPeerBlocksQueueFn(peer, size, drainEvery, workers, 2907 func(batch []receivedBlockMetadata) { 2908 s.streamBlocksBatchFromPeer(nsMetadata, shard, peer, batch, opts, 2909 result, enqueueCh, s.streamBlocksRetrier, progress) 2910 }) 2911 peerQueues = append(peerQueues, queue) 2912 } 2913 2914 var ( 2915 selected []receivedBlockMetadata 2916 pooled 
selectPeersFromPerPeerBlockMetadatasPooledResources 2917 onQueueItemProcessed = func() { 2918 enqueueCh.trackProcessed(1) 2919 } 2920 ) 2921 for perPeerBlocksMetadata := range enqueueCh.read() { 2922 // Filter and select which blocks to retrieve from which peers 2923 selected, pooled = s.selectPeersFromPerPeerBlockMetadatas( 2924 perPeerBlocksMetadata, peerQueues, enqueueCh, consistencyLevel, peers, 2925 pooled, progress) 2926 2927 if len(selected) == 0 { 2928 onQueueItemProcessed() 2929 continue 2930 } 2931 2932 if len(selected) == 1 { 2933 queue := peerQueues.findQueue(selected[0].peer) 2934 queue.enqueue(selected[0], onQueueItemProcessed) 2935 continue 2936 } 2937 2938 // Need to fan out, only track this as processed once all peer 2939 // queues have completed their fetches, so account for the extra 2940 // items assigned to be fetched 2941 enqueueCh.trackPending(len(selected) - 1) 2942 for _, receivedBlockMetadata := range selected { 2943 queue := peerQueues.findQueue(receivedBlockMetadata.peer) 2944 queue.enqueue(receivedBlockMetadata, onQueueItemProcessed) 2945 } 2946 } 2947 2948 // Close all queues 2949 peerQueues.closeAll() 2950 2951 return nil 2952 } 2953 2954 type streamBlocksMetadataFn func( 2955 peersLen int, 2956 ch <-chan receivedBlockMetadata, 2957 enqueueCh enqueueChannel, 2958 pool pool.BytesPool, 2959 ) 2960 2961 func (s *session) passThroughBlocksMetadata( 2962 peersLen int, 2963 ch <-chan receivedBlockMetadata, 2964 enqueueCh enqueueChannel, 2965 _ pool.BytesPool, 2966 ) { 2967 // Receive off of metadata channel 2968 for { 2969 m, ok := <-ch 2970 if !ok { 2971 break 2972 } 2973 res := []receivedBlockMetadata{m} 2974 enqueueCh.enqueue(res) 2975 } 2976 } 2977 2978 func (s *session) streamAndGroupCollectedBlocksMetadata( 2979 peersLen int, 2980 metadataCh <-chan receivedBlockMetadata, 2981 enqueueCh enqueueChannel, 2982 pool pool.BytesPool, 2983 ) { 2984 metadata := newReceivedBlocksMap(pool) 2985 defer metadata.Reset() // Delete all the keys and return slices to pools 2986 2987 for { 2988 m, ok := <-metadataCh 2989 if !ok { 2990 break 2991 } 2992 2993 key := idAndBlockStart{ 2994 id: m.id, 2995 blockStart: int64(m.block.start), 2996 } 2997 received, ok := metadata.Get(key) 2998 if !ok { 2999 received = receivedBlocks{ 3000 results: make([]receivedBlockMetadata, 0, peersLen), 3001 } 3002 } 3003 3004 // The entry has already been enqueued which means the metadata we just 3005 // received is a duplicate. Discard it and move on. 3006 if received.enqueued { 3007 s.emitDuplicateMetadataLog(received, m) 3008 continue 3009 } 3010 3011 // Determine if the incoming metadata is a duplicate by checking if we've 3012 // already received metadata from this peer. 3013 existingIndex := -1 3014 for i, existingMetadata := range received.results { 3015 if existingMetadata.peer.Host().ID() == m.peer.Host().ID() { 3016 existingIndex = i 3017 break 3018 } 3019 } 3020 3021 if existingIndex != -1 { 3022 // If it is a duplicate, then overwrite it (always keep the most recent 3023 // duplicate) 3024 received.results[existingIndex] = m 3025 } else { 3026 // Otherwise it's not a duplicate, so it's safe to append. 3027 received.results = append(received.results, m) 3028 } 3029 3030 // Since we always perform an overwrite instead of an append for duplicates 3031 // from the same peer, once len(received.results) == peersLen we know that 3032 // we've received at least one metadata from every peer and it's safe to 3033 // enqueue the entry.
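// For example with peersLen == 3: metadata for an (id, blockStart) pair arrives
// from peer A, then peer B, and the entry waits; when peer C's copy arrives,
// len(received.results) == 3 and the entry is enqueued immediately. If C never
// responds, the entry is instead flushed by the post-loop sweep below with
// metadata from only A and B.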
3034 if len(received.results) == peersLen { 3035 enqueueCh.enqueue(received.results) 3036 received.enqueued = true 3037 } 3038 3039 // Ensure tracking enqueued by setting modified result back to map 3040 metadata.Set(key, received) 3041 } 3042 3043 // Enqueue all unenqueued received metadata. Note that these entries will have 3044 // metadata from only a subset of their peers. 3045 for _, entry := range metadata.Iter() { 3046 received := entry.Value() 3047 if received.enqueued { 3048 continue 3049 } 3050 enqueueCh.enqueue(received.results) 3051 } 3052 } 3053 3054 // emitDuplicateMetadataLog emits a log with the details of the duplicate metadata 3055 // event. Note: We're able to log the blocks themselves because the slice is no longer 3056 // mutated downstream after enqueuing into the enqueue channel, it's copied before 3057 // mutated or operated on. 3058 func (s *session) emitDuplicateMetadataLog( 3059 received receivedBlocks, 3060 metadata receivedBlockMetadata, 3061 ) { 3062 // Debug-level because this is a common enough occurrence that logging it by 3063 // default would be noisy. 3064 // This is due to peers sending the most recent data 3065 // to the oldest data in that order, hence sometimes its possible to resend 3066 // data for a block already sent over the wire if it just moved from being 3067 // mutable in memory to immutable on disk. 3068 if !s.log.Core().Enabled(zapcore.DebugLevel) { 3069 return 3070 } 3071 3072 var checksum uint32 3073 if v := metadata.block.checksum; v != nil { 3074 checksum = *v 3075 } 3076 3077 fields := make([]zapcore.Field, 0, len(received.results)+1) 3078 fields = append(fields, zap.String("incoming-metadata", fmt.Sprintf( 3079 "id=%s, peer=%s, start=%s, size=%v, checksum=%v", 3080 metadata.id.String(), 3081 metadata.peer.Host().String(), 3082 metadata.block.start.String(), 3083 metadata.block.size, 3084 checksum))) 3085 3086 for i, existing := range received.results { 3087 checksum = 0 3088 if v := existing.block.checksum; v != nil { 3089 checksum = *v 3090 } 3091 3092 fields = append(fields, zap.String( 3093 fmt.Sprintf("existing-metadata-%d", i), 3094 fmt.Sprintf( 3095 "id=%s, peer=%s, start=%s, size=%v, checksum=%v", 3096 existing.id.String(), 3097 existing.peer.Host().String(), 3098 existing.block.start.String(), 3099 existing.block.size, 3100 checksum))) 3101 } 3102 3103 s.log.Debug("received metadata, but peer metadata has already been submitted", fields...) 
3104 } 3105 3106 type pickBestPeerFn func( 3107 perPeerBlockMetadata []receivedBlockMetadata, 3108 peerQueues peerBlocksQueues, 3109 resources pickBestPeerPooledResources, 3110 ) (index int, pooled pickBestPeerPooledResources) 3111 3112 type pickBestPeerPooledResources struct { 3113 ranking []receivedBlockMetadataQueue 3114 } 3115 3116 func (s *session) streamBlocksPickBestPeer( 3117 perPeerBlockMetadata []receivedBlockMetadata, 3118 peerQueues peerBlocksQueues, 3119 pooled pickBestPeerPooledResources, 3120 ) (int, pickBestPeerPooledResources) { 3121 // Order by least attempts then by least outstanding blocks being fetched 3122 pooled.ranking = pooled.ranking[:0] 3123 for i := range perPeerBlockMetadata { 3124 elem := receivedBlockMetadataQueue{ 3125 blockMetadata: perPeerBlockMetadata[i], 3126 queue: peerQueues.findQueue(perPeerBlockMetadata[i].peer), 3127 } 3128 pooled.ranking = append(pooled.ranking, elem) 3129 } 3130 elems := receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc(pooled.ranking) 3131 sort.Stable(elems) 3132 3133 // Return index of the best peer 3134 var ( 3135 bestPeer = pooled.ranking[0].queue.peer 3136 idx int 3137 ) 3138 for i := range perPeerBlockMetadata { 3139 if bestPeer == perPeerBlockMetadata[i].peer { 3140 idx = i 3141 break 3142 } 3143 } 3144 return idx, pooled 3145 } 3146 3147 type selectPeersFromPerPeerBlockMetadatasPooledResources struct { 3148 currEligible []receivedBlockMetadata 3149 pickBestPeerPooledResources pickBestPeerPooledResources 3150 } 3151 3152 func (s *session) selectPeersFromPerPeerBlockMetadatas( 3153 perPeerBlocksMetadata []receivedBlockMetadata, 3154 peerQueues peerBlocksQueues, 3155 reEnqueueCh enqueueChannel, 3156 consistencyLevel runtimeReadConsistencyLevel, 3157 peers peers, 3158 pooled selectPeersFromPerPeerBlockMetadatasPooledResources, 3159 m *streamFromPeersMetrics, 3160 ) ([]receivedBlockMetadata, selectPeersFromPerPeerBlockMetadatasPooledResources) { 3161 // Copy into pooled array so we don't mutate existing slice passed 3162 pooled.currEligible = pooled.currEligible[:0] 3163 pooled.currEligible = append(pooled.currEligible, perPeerBlocksMetadata...) 
3164 3165 currEligible := pooled.currEligible[:] 3166 3167 // Sort the per peer metadatas by peer ID for consistent results 3168 sort.Sort(peerBlockMetadataByID(currEligible)) 3169 3170 // Only select from peers not already attempted 3171 curr := currEligible[0] 3172 currID := curr.id 3173 currBlock := curr.block 3174 for i := len(currEligible) - 1; i >= 0; i-- { 3175 if currEligible[i].block.reattempt.attempt == 0 { 3176 // Not attempted yet 3177 continue 3178 } 3179 3180 // Check if eligible 3181 n := s.streamBlocksMaxBlockRetries 3182 if currEligible[i].block.reattempt.peerAttempts(currEligible[i].peer) >= n { 3183 // Swap current entry to tail 3184 receivedBlockMetadatas(currEligible).swap(i, len(currEligible)-1) 3185 // Trim newly last entry 3186 currEligible = currEligible[:len(currEligible)-1] 3187 continue 3188 } 3189 } 3190 3191 if len(currEligible) == 0 { 3192 // No current eligible peers to select from 3193 majority := peers.majorityReplicas 3194 enqueued := len(peers.peers) 3195 success := 0 3196 if peers.selfExcludedAndSelfHasShardAvailable() { 3197 // If we excluded ourselves from fetching, we basically treat ourselves 3198 // as a successful peer response since our copy counts towards quorum 3199 enqueued++ 3200 success++ 3201 } 3202 3203 errMsg := "all retries failed for streaming blocks from peers" 3204 fanoutFetchState := currBlock.reattempt.fanoutFetchState 3205 if fanoutFetchState != nil { 3206 if fanoutFetchState.decrementAndReturnPending() > 0 { 3207 // This block was fanned out to fetch from all peers and we haven't 3208 // received all the results yet, so don't retry it just yet 3209 return nil, pooled 3210 } 3211 3212 // NB(r): This was enqueued after a failed fetch and all other fanout 3213 // fetches have completed. Check if the consistency level was achieved; 3214 // if not, re-enqueue to continue to retry, otherwise do not re-enqueue 3215 // and see if we need to mark this as an error. 3216 success = fanoutFetchState.success() 3217 } 3218 3219 level := consistencyLevel.value() 3220 achievedConsistencyLevel := topology.ReadConsistencyAchieved(level, majority, enqueued, success) 3221 if achievedConsistencyLevel { 3222 if success > 0 { 3223 // Some level of success met, no need to log an error 3224 return nil, pooled 3225 } 3226 3227 // No success, inform operator that although the consistency level was 3228 // achieved there were no successful fetches. This can happen if the 3229 // consistency level is set to None.
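// e.g. with level None, majority 2, enqueued 3 and success 0, the level is
// trivially "achieved" with zero successful fetches, hence the explicit error
// log below rather than a re-enqueue.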
3230 m.fetchBlockFinalError.Inc(1) 3231 s.log.Error(errMsg, 3232 zap.Stringer("id", currID), 3233 zap.Time("start", currBlock.start.ToTime()), 3234 zap.Int("attempted", currBlock.reattempt.attempt), 3235 zap.String("attemptErrs", xerrors.Errors(currBlock.reattempt.errs).Error()), 3236 zap.Stringer("consistencyLevel", level), 3237 ) 3238 3239 return nil, pooled 3240 } 3241 3242 // Retry again by re-enqueuing, have not met consistency level yet 3243 m.fetchBlockFullRetry.Inc(1) 3244 3245 err := fmt.Errorf(errMsg+": attempts=%d", curr.block.reattempt.attempt) 3246 reattemptReason := consistencyLevelNotAchievedErrReason 3247 reattemptType := fullRetryReattemptType 3248 reattemptBlocks := []receivedBlockMetadata{curr} 3249 s.reattemptStreamBlocksFromPeersFn(reattemptBlocks, reEnqueueCh, 3250 err, reattemptReason, reattemptType, m) 3251 3252 return nil, pooled 3253 } 3254 3255 var ( 3256 singlePeer = len(currEligible) == 1 3257 sameNonNilChecksum = true 3258 curChecksum *uint32 3259 ) 3260 for i := range currEligible { 3261 // If any peer has a nil checksum, this might be the most recent block 3262 // and therefore not sealed so we want to merge from all peers 3263 if currEligible[i].block.checksum == nil { 3264 sameNonNilChecksum = false 3265 break 3266 } 3267 if curChecksum == nil { 3268 curChecksum = currEligible[i].block.checksum 3269 } else if *curChecksum != *currEligible[i].block.checksum { 3270 sameNonNilChecksum = false 3271 break 3272 } 3273 } 3274 3275 // If all the peers have the same non-nil checksum, we pick the peer with the 3276 // fewest attempts and fewest outstanding requests 3277 if singlePeer || sameNonNilChecksum { 3278 var idx int 3279 if singlePeer { 3280 idx = 0 3281 } else { 3282 pooledResources := pooled.pickBestPeerPooledResources 3283 idx, pooledResources = s.pickBestPeerFn(currEligible, peerQueues, 3284 pooledResources) 3285 pooled.pickBestPeerPooledResources = pooledResources 3286 } 3287 3288 // Set the reattempt metadata 3289 selected := currEligible[idx] 3290 selected.block.reattempt.attempt++ 3291 selected.block.reattempt.attempted = 3292 append(selected.block.reattempt.attempted, selected.peer) 3293 selected.block.reattempt.fanoutFetchState = nil 3294 selected.block.reattempt.retryPeersMetadata = perPeerBlocksMetadata 3295 selected.block.reattempt.fetchedPeersMetadata = perPeerBlocksMetadata 3296 3297 // Return just the single peer we selected 3298 currEligible = currEligible[:1] 3299 currEligible[0] = selected 3300 } else { 3301 fanoutFetchState := newBlockFanoutFetchState(len(currEligible)) 3302 for i := range currEligible { 3303 // Set the reattempt metadata 3304 // NB(xichen): each block will only be retried on the same peer because we 3305 // already fan out the request to all peers. This means we merge data on 3306 // a best-effort basis and only fail if we failed to reach the desired 3307 // consistency level when reading data from all peers. 
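// Concretely: if peers A, B and C report checksums 0x1, 0x2 and 0x2 (or any of
// them reports a nil checksum), the block is fanned out to all three queues and,
// should A's fetch fail, it is retried against A only; B's and C's copies are
// merged on a best-effort basis.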
3308 var retryFrom []receivedBlockMetadata 3309 for j := range perPeerBlocksMetadata { 3310 if currEligible[i].peer == perPeerBlocksMetadata[j].peer { 3311 // NB(r): Take a ref to a subslice from the originally passed 3312 // slice as that is not mutated, whereas currEligible is reused 3313 retryFrom = perPeerBlocksMetadata[j : j+1] 3314 } 3315 } 3316 currEligible[i].block.reattempt.attempt++ 3317 currEligible[i].block.reattempt.attempted = 3318 append(currEligible[i].block.reattempt.attempted, currEligible[i].peer) 3319 currEligible[i].block.reattempt.fanoutFetchState = fanoutFetchState 3320 currEligible[i].block.reattempt.retryPeersMetadata = retryFrom 3321 currEligible[i].block.reattempt.fetchedPeersMetadata = perPeerBlocksMetadata 3322 } 3323 } 3324 3325 return currEligible, pooled 3326 } 3327 3328 func (s *session) streamBlocksBatchFromPeer( 3329 namespaceMetadata namespace.Metadata, 3330 shard uint32, 3331 peer peer, 3332 batch []receivedBlockMetadata, 3333 opts result.Options, 3334 blocksResult blocksResult, 3335 enqueueCh enqueueChannel, 3336 retrier xretry.Retrier, 3337 m *streamFromPeersMetrics, 3338 ) { 3339 // Prepare request 3340 var ( 3341 req = rpc.NewFetchBlocksRawRequest() 3342 result *rpc.FetchBlocksRawResult_ 3343 reqBlocksLen uint 3344 3345 nowFn = opts.ClockOptions().NowFn() 3346 ropts = namespaceMetadata.Options().RetentionOptions() 3347 retention = ropts.RetentionPeriod() 3348 earliestBlockStart = xtime.ToUnixNano(nowFn()). 3349 Add(-retention). 3350 Truncate(ropts.BlockSize()) 3351 ) 3352 req.NameSpace = namespaceMetadata.ID().Bytes() 3353 req.Shard = int32(shard) 3354 req.Elements = make([]*rpc.FetchBlocksRawRequestElement, 0, len(batch)) 3355 for i := range batch { 3356 blockStart := batch[i].block.start 3357 if blockStart.Before(earliestBlockStart) { 3358 continue // Fell out of retention while we were streaming blocks 3359 } 3360 req.Elements = append(req.Elements, &rpc.FetchBlocksRawRequestElement{ 3361 ID: batch[i].id.Bytes(), 3362 Starts: []int64{int64(blockStart)}, 3363 }) 3364 reqBlocksLen++ 3365 } 3366 if reqBlocksLen == 0 { 3367 // All blocks fell out of retention while streaming 3368 return 3369 } 3370 3371 // Attempt request 3372 if err := retrier.Attempt(func() error { 3373 var attemptErr error 3374 borrowErr := peer.BorrowConnection(func(client rpc.TChanNode, _ Channel) { 3375 tctx, _ := thrift.NewContext(s.streamBlocksBatchTimeout) 3376 result, attemptErr = client.FetchBlocksRaw(tctx, req) 3377 }) 3378 err := xerrors.FirstError(borrowErr, attemptErr) 3379 return err 3380 }); err != nil { 3381 blocksErr := fmt.Errorf( 3382 "stream blocks request error: error=%s, peer=%s", 3383 err.Error(), peer.Host().String(), 3384 ) 3385 s.reattemptStreamBlocksFromPeersFn(batch, enqueueCh, blocksErr, 3386 reqErrReason, nextRetryReattemptType, m) 3387 m.fetchBlockError.Inc(int64(reqBlocksLen)) 3388 s.log.Debug(blocksErr.Error()) 3389 return 3390 } 3391 3392 // Parse and act on result 3393 tooManyIDsLogged := false 3394 for i := range result.Elements { 3395 if i >= len(batch) { 3396 m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts))) 3397 m.fetchBlockFinalError.Inc(int64(len(req.Elements[i].Starts))) 3398 if !tooManyIDsLogged { 3399 tooManyIDsLogged = true 3400 s.log.Error("stream blocks more IDs than expected", 3401 zap.Stringer("peer", peer.Host()), 3402 ) 3403 } 3404 continue 3405 } 3406 3407 id := batch[i].id 3408 if !bytes.Equal(id.Bytes(), result.Elements[i].ID) { 3409 blocksErr := fmt.Errorf( 3410 "stream blocks mismatched ID: expectedID=%s, actualID=%s, 
indexID=%d, peer=%s", 3411 batch[i].id.String(), id.String(), i, peer.Host().String(), 3412 ) 3413 failed := []receivedBlockMetadata{batch[i]} 3414 s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr, 3415 respErrReason, nextRetryReattemptType, m) 3416 m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts))) 3417 s.log.Debug(blocksErr.Error()) 3418 continue 3419 } 3420 3421 if len(result.Elements[i].Blocks) == 0 { 3422 // If fell out of retention during request this is healthy, otherwise 3423 // missing blocks will be repaired during an active repair 3424 continue 3425 } 3426 3427 // We only ever fetch a single block for a series 3428 if len(result.Elements[i].Blocks) != 1 { 3429 errMsg := "stream blocks returned more blocks than expected" 3430 blocksErr := fmt.Errorf(errMsg+": expected=%d, actual=%d", 3431 1, len(result.Elements[i].Blocks)) 3432 failed := []receivedBlockMetadata{batch[i]} 3433 s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr, 3434 respErrReason, nextRetryReattemptType, m) 3435 m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts))) 3436 s.log.Error(errMsg, 3437 zap.Stringer("id", id), 3438 zap.Times("expectedStarts", newTimesByUnixNanos(req.Elements[i].Starts)), 3439 zap.Times("actualStarts", newTimesByRPCBlocks(result.Elements[i].Blocks)), 3440 zap.Stringer("peer", peer.Host()), 3441 ) 3442 continue 3443 } 3444 3445 for j, block := range result.Elements[i].Blocks { 3446 if block.Start != int64(batch[i].block.start) { 3447 errMsg := "stream blocks returned different blocks than expected" 3448 blocksErr := fmt.Errorf(errMsg+": expected=%s, actual=%d", 3449 batch[i].block.start.String(), time.Unix(0, block.Start).String()) 3450 failed := []receivedBlockMetadata{batch[i]} 3451 s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr, 3452 respErrReason, nextRetryReattemptType, m) 3453 m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts))) 3454 s.log.Error(errMsg, 3455 zap.Stringer("id", id), 3456 zap.Times("expectedStarts", newTimesByUnixNanos(req.Elements[i].Starts)), 3457 zap.Times("actualStarts", newTimesByRPCBlocks(result.Elements[i].Blocks)), 3458 zap.Stringer("peer", peer.Host()), 3459 ) 3460 continue 3461 } 3462 3463 // Verify and if verify succeeds add the block from the peer 3464 err := s.verifyFetchedBlock(block) 3465 if err == nil { 3466 err = blocksResult.addBlockFromPeer(id, batch[i].encodedTags, 3467 peer.Host(), block) 3468 } 3469 if err != nil { 3470 failed := []receivedBlockMetadata{batch[i]} 3471 blocksErr := fmt.Errorf( 3472 "stream blocks bad block: id=%s, start=%d, error=%s, indexID=%d, indexBlock=%d, peer=%s", 3473 id.String(), block.Start, err.Error(), i, j, peer.Host().String()) 3474 s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr, 3475 respErrReason, nextRetryReattemptType, m) 3476 m.fetchBlockError.Inc(1) 3477 s.log.Debug(blocksErr.Error()) 3478 continue 3479 } 3480 3481 // NB(r): Track a fanned out block fetch success if added block 3482 fanout := batch[i].block.reattempt.fanoutFetchState 3483 if fanout != nil { 3484 fanout.incrementSuccess() 3485 } 3486 3487 m.fetchBlockSuccess.Inc(1) 3488 } 3489 } 3490 } 3491 3492 func (s *session) verifyFetchedBlock(block *rpc.Block) error { 3493 if block.Err != nil { 3494 return fmt.Errorf("block error from peer: %s %s", block.Err.Type.String(), block.Err.Message) 3495 } 3496 if block.Segments == nil { 3497 return fmt.Errorf("block segments is bad: segments is nil") 3498 } 3499 if block.Segments.Merged == nil && len(block.Segments.Unmerged) == 0 { 3500 
return fmt.Errorf("block segments is bad: merged and unmerged not set") 3501 } 3502 3503 if checksum := block.Checksum; checksum != nil { 3504 var ( 3505 d = digest.NewDigest() 3506 expected = uint32(*checksum) 3507 ) 3508 if merged := block.Segments.Merged; merged != nil { 3509 d = d.Update(merged.Head).Update(merged.Tail) 3510 } else { 3511 for _, s := range block.Segments.Unmerged { 3512 d = d.Update(s.Head).Update(s.Tail) 3513 } 3514 } 3515 if actual := d.Sum32(); actual != expected { 3516 return fmt.Errorf("block checksum is bad: expected=%d, actual=%d", expected, actual) 3517 } 3518 } 3519 3520 return nil 3521 } 3522 3523 func (s *session) cloneFinalizable(id ident.ID) ident.ID { 3524 if id.IsNoFinalize() { 3525 return id 3526 } 3527 return s.pools.id.Clone(id) 3528 } 3529 3530 func (s *session) nsCtxFromMetadata(nsMeta namespace.Metadata) (namespace.Context, error) { 3531 nsCtx := namespace.NewContextFrom(nsMeta) 3532 if s.opts.IsSetEncodingProto() && nsCtx.Schema == nil { 3533 return nsCtx, fmt.Errorf("no protobuf schema found for namespace: %s", nsMeta.ID().String()) 3534 } 3535 return nsCtx, nil 3536 } 3537 3538 func (s *session) nsCtxFor(ns ident.ID) (namespace.Context, error) { 3539 nsCtx := namespace.NewContextFor(ns, s.opts.SchemaRegistry()) 3540 if s.opts.IsSetEncodingProto() && nsCtx.Schema == nil { 3541 return nsCtx, fmt.Errorf("no protobuf schema found for namespace: %s", ns.String()) 3542 } 3543 return nsCtx, nil 3544 } 3545 3546 type reason int 3547 3548 const ( 3549 reqErrReason reason = iota 3550 respErrReason 3551 consistencyLevelNotAchievedErrReason 3552 ) 3553 3554 type reattemptType int 3555 3556 const ( 3557 nextRetryReattemptType reattemptType = iota 3558 fullRetryReattemptType 3559 ) 3560 3561 type reattemptStreamBlocksFromPeersFn func( 3562 []receivedBlockMetadata, 3563 enqueueChannel, 3564 error, 3565 reason, 3566 reattemptType, 3567 *streamFromPeersMetrics, 3568 ) error 3569 3570 func (s *session) streamBlocksReattemptFromPeers( 3571 blocks []receivedBlockMetadata, 3572 enqueueCh enqueueChannel, 3573 attemptErr error, 3574 reason reason, 3575 reattemptType reattemptType, 3576 m *streamFromPeersMetrics, 3577 ) error { 3578 switch reason { 3579 case reqErrReason: 3580 m.fetchBlockRetriesReqError.Inc(int64(len(blocks))) 3581 case respErrReason: 3582 m.fetchBlockRetriesRespError.Inc(int64(len(blocks))) 3583 case consistencyLevelNotAchievedErrReason: 3584 m.fetchBlockRetriesConsistencyLevelNotAchievedError.Inc(int64(len(blocks))) 3585 } 3586 3587 // Must do this asynchronously or else could get into a deadlock scenario 3588 // where cannot enqueue into the reattempt channel because no more work is 3589 // getting done because new attempts are blocked on existing attempts completing 3590 // and existing attempts are trying to enqueue into a full reattempt channel 3591 enqueue, done, err := enqueueCh.enqueueDelayed(len(blocks)) 3592 if err != nil { 3593 return err 3594 } 3595 go s.streamBlocksReattemptFromPeersEnqueue(blocks, attemptErr, reattemptType, 3596 enqueue, done) 3597 return nil 3598 } 3599 3600 func (s *session) streamBlocksReattemptFromPeersEnqueue( 3601 blocks []receivedBlockMetadata, 3602 attemptErr error, 3603 reattemptType reattemptType, 3604 enqueueFn enqueueDelayedFn, 3605 enqueueDoneFn enqueueDelayedDoneFn, 3606 ) { 3607 // NB(r): Notify the delayed enqueue is done. 

func (s *session) streamBlocksReattemptFromPeersEnqueue(
	blocks []receivedBlockMetadata,
	attemptErr error,
	reattemptType reattemptType,
	enqueueFn enqueueDelayedFn,
	enqueueDoneFn enqueueDelayedDoneFn,
) {
	// NB(r): Notify the delayed enqueue is done.
	defer enqueueDoneFn()

	for i := range blocks {
		var reattemptPeersMetadata []receivedBlockMetadata
		switch reattemptType {
		case nextRetryReattemptType:
			reattemptPeersMetadata = blocks[i].block.reattempt.retryPeersMetadata
		case fullRetryReattemptType:
			reattemptPeersMetadata = blocks[i].block.reattempt.fetchedPeersMetadata
		}
		if len(reattemptPeersMetadata) == 0 {
			continue
		}

		// Reconstruct peers metadata for reattempt
		reattemptBlocksMetadata := make([]receivedBlockMetadata, len(reattemptPeersMetadata))
		for j := range reattemptPeersMetadata {
			var reattempt blockMetadataReattempt
			if reattemptType == nextRetryReattemptType {
				// Only for the default retry type do we actually want to carry
				// over all the retry metadata, otherwise this re-enqueued
				// metadata should start fresh
				reattempt = blocks[i].block.reattempt

				// Copy the errors for every peer so they don't share the same
				// error slice and therefore are not subject to race conditions
				// when the error slice is modified
				reattemptErrs := make([]error, len(reattempt.errs)+1)
				n := copy(reattemptErrs, reattempt.errs)
				reattemptErrs[n] = attemptErr
				reattempt.errs = reattemptErrs
			}

			reattemptBlocksMetadata[j] = receivedBlockMetadata{
				peer: reattemptPeersMetadata[j].peer,
				id:   blocks[i].id,
				block: blockMetadata{
					start:     reattemptPeersMetadata[j].block.start,
					size:      reattemptPeersMetadata[j].block.size,
					checksum:  reattemptPeersMetadata[j].block.checksum,
					reattempt: reattempt,
				},
			}
		}

		// Re-enqueue the block to be fetched from all peers requested
		// to reattempt from
		enqueueFn(reattemptBlocksMetadata)
	}
}

type blocksResult interface {
	addBlockFromPeer(
		id ident.ID,
		encodedTags checked.Bytes,
		peer topology.Host,
		block *rpc.Block,
	) error
}

type baseBlocksResult struct {
	nsCtx                   namespace.Context
	blockOpts               block.Options
	blockAllocSize          int
	contextPool             context.Pool
	encoderPool             encoding.EncoderPool
	multiReaderIteratorPool encoding.MultiReaderIteratorPool
}

func newBaseBlocksResult(
	nsCtx namespace.Context,
	opts Options,
	resultOpts result.Options,
) baseBlocksResult {
	blockOpts := resultOpts.DatabaseBlockOptions()
	return baseBlocksResult{
		nsCtx:                   nsCtx,
		blockOpts:               blockOpts,
		blockAllocSize:          blockOpts.DatabaseBlockAllocSize(),
		contextPool:             opts.ContextPool(),
		encoderPool:             blockOpts.EncoderPool(),
		multiReaderIteratorPool: blockOpts.MultiReaderIteratorPool(),
	}
}

func (b *baseBlocksResult) segmentForBlock(seg *rpc.Segment) ts.Segment {
	var (
		bytesPool  = b.blockOpts.BytesPool()
		head, tail checked.Bytes
	)
	if len(seg.Head) > 0 {
		head = bytesPool.Get(len(seg.Head))
		head.IncRef()
		head.AppendAll(seg.Head)
		head.DecRef()
	}
	if len(seg.Tail) > 0 {
		tail = bytesPool.Get(len(seg.Tail))
		tail.IncRef()
		tail.AppendAll(seg.Tail)
		tail.DecRef()
	}
	var checksum uint32
	if seg.Checksum != nil {
		checksum = uint32(*seg.Checksum)
	}

	// NB: Both head and tail come from the bytes pool, so both must be
	// finalized when the segment is finalized; the flags are combined with
	// bitwise OR (AND-ing the two distinct flag bits would yield no
	// finalization at all).
	return ts.NewSegment(head, tail, checksum, ts.FinalizeHead|ts.FinalizeTail)
}
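
// NB(editor): a minimal sketch, not part of the original file, of the pooled
// checked.Bytes copy pattern that segmentForBlock relies on above: a ref must
// be held while pooled bytes are mutated, so the write is bracketed by
// IncRef/DecRef before ownership passes to the returned bytes. The helper
// name is illustrative only.
func exampleCopyToPooledBytes(
	bytesPool pool.CheckedBytesPool,
	src []byte,
) checked.Bytes {
	dst := bytesPool.Get(len(src))
	dst.IncRef()       // take a ref before mutating pooled bytes
	dst.AppendAll(src) // copy the payload into pooled storage
	dst.DecRef()       // drop the mutation ref; a later owner finalizes it
	return dst
}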

func (b *baseBlocksResult) mergeReaders(
	start xtime.UnixNano, blockSize time.Duration, readers []xio.SegmentReader,
) (encoding.Encoder, error) {
	iter := b.multiReaderIteratorPool.Get()
	iter.Reset(readers, start, blockSize, b.nsCtx.Schema)
	defer iter.Close()

	encoder := b.encoderPool.Get()
	encoder.Reset(start, b.blockAllocSize, b.nsCtx.Schema)

	for iter.Next() {
		dp, unit, annotation := iter.Current()
		if err := encoder.Encode(dp, unit, annotation); err != nil {
			encoder.Close()
			return nil, err
		}
	}
	if err := iter.Err(); err != nil {
		encoder.Close()
		return nil, err
	}

	return encoder, nil
}
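
// NB(editor): a minimal sketch, not part of the original file, showing how
// mergeReaders above is intended to be consumed (newDatabaseBlock below does
// exactly this): on success the caller takes ownership of the encoder and
// Discards it into a pooled block; on error mergeReaders has already closed
// the encoder, so only the block needs returning to its pool. The method name
// is illustrative only.
func (b *baseBlocksResult) exampleMergeIntoBlock(
	start xtime.UnixNano,
	blockSize time.Duration,
	readers []xio.SegmentReader,
) (block.DatabaseBlock, error) {
	encoder, err := b.mergeReaders(start, blockSize, readers)
	if err != nil {
		return nil, err // encoder already closed by mergeReaders
	}
	dbBlock := b.blockOpts.DatabaseBlockPool().Get()
	// Discard hands the encoder's merged segment over without copying.
	dbBlock.Reset(start, blockSize, encoder.Discard(), b.nsCtx)
	return dbBlock, nil
}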

func (b *baseBlocksResult) newDatabaseBlock(block *rpc.Block) (block.DatabaseBlock, error) {
	var (
		start    = xtime.UnixNano(block.Start)
		segments = block.Segments
		result   = b.blockOpts.DatabaseBlockPool().Get()
	)

	if segments == nil {
		result.Close() // return block to pool
		return nil, errSessionBadBlockResultFromPeer
	}

	switch {
	case segments.Merged != nil:
		// Already merged, can insert directly into a single block
		mergedBlock := segments.Merged
		result.Reset(
			start,
			durationConvert(mergedBlock.BlockSize),
			b.segmentForBlock(mergedBlock),
			b.nsCtx,
		)

	case segments.Unmerged != nil:
		// Must merge to provide a single block
		segmentReaderPool := b.blockOpts.SegmentReaderPool()
		readers := make([]xio.SegmentReader, len(segments.Unmerged))

		blockSize := time.Duration(0)
		for i, seg := range segments.Unmerged {
			segmentReader := segmentReaderPool.Get()
			segmentReader.Reset(b.segmentForBlock(seg))
			readers[i] = segmentReader

			bs := durationConvert(seg.BlockSize)
			if bs > blockSize {
				blockSize = bs
			}
		}
		encoder, err := b.mergeReaders(start, blockSize, readers)
		for _, reader := range readers {
			// Close each reader
			reader.Finalize()
		}

		if err != nil {
			// mergeReaders(...) already calls encoder.Close() upon error
			result.Close() // return block to pool
			return nil, err
		}

		// Set the block data
		result.Reset(start, blockSize, encoder.Discard(), b.nsCtx)

	default:
		result.Close() // return block to pool
		return nil, errSessionBadBlockResultFromPeer
	}

	return result, nil
}

// Ensure streamBlocksResult implements blocksResult
var _ blocksResult = (*streamBlocksResult)(nil)

type streamBlocksResult struct {
	baseBlocksResult
	outputCh       chan<- peerBlocksDatapoint
	tagDecoderPool serialize.TagDecoderPool
	idPool         ident.Pool
	nsCtx          namespace.Context
}

func newStreamBlocksResult(
	nsCtx namespace.Context,
	opts Options,
	resultOpts result.Options,
	outputCh chan<- peerBlocksDatapoint,
	tagDecoderPool serialize.TagDecoderPool,
	idPool ident.Pool,
) *streamBlocksResult {
	return &streamBlocksResult{
		nsCtx:            nsCtx,
		baseBlocksResult: newBaseBlocksResult(nsCtx, opts, resultOpts),
		outputCh:         outputCh,
		tagDecoderPool:   tagDecoderPool,
		idPool:           idPool,
	}
}

type peerBlocksDatapoint struct {
	id    ident.ID
	tags  ident.Tags
	peer  topology.Host
	block block.DatabaseBlock
}

func (s *streamBlocksResult) addBlockFromPeer(
	id ident.ID,
	encodedTags checked.Bytes,
	peer topology.Host,
	block *rpc.Block,
) error {
	result, err := s.newDatabaseBlock(block)
	if err != nil {
		return err
	}
	tags, err := newTagsFromEncodedTags(id, encodedTags,
		s.tagDecoderPool, s.idPool)
	if err != nil {
		return err
	}
	s.outputCh <- peerBlocksDatapoint{
		id:    id,
		tags:  tags,
		peer:  peer,
		block: result,
	}
	return nil
}

type peerBlocksIter struct {
	inputCh <-chan peerBlocksDatapoint
	errCh   <-chan error
	current peerBlocksDatapoint
	err     error
	done    bool
}

func newPeerBlocksIter(
	inputC <-chan peerBlocksDatapoint,
	errC <-chan error,
) *peerBlocksIter {
	return &peerBlocksIter{
		inputCh: inputC,
		errCh:   errC,
	}
}

func (it *peerBlocksIter) Current() (topology.Host, ident.ID, ident.Tags, block.DatabaseBlock) {
	return it.current.peer, it.current.id, it.current.tags, it.current.block
}

func (it *peerBlocksIter) Err() error {
	return it.err
}

func (it *peerBlocksIter) Next() bool {
	if it.done || it.err != nil {
		return false
	}
	m, more := <-it.inputCh

	if !more {
		it.err = <-it.errCh
		it.done = true
		return false
	}

	it.current = m
	return true
}

// Ensure bulkBlocksResult implements blocksResult
var _ blocksResult = (*bulkBlocksResult)(nil)

type bulkBlocksResult struct {
	sync.RWMutex
	baseBlocksResult
	result         result.ShardResult
	tagDecoderPool serialize.TagDecoderPool
	idPool         ident.Pool
	nsCtx          namespace.Context
}

func newBulkBlocksResult(
	nsCtx namespace.Context,
	opts Options,
	resultOpts result.Options,
	tagDecoderPool serialize.TagDecoderPool,
	idPool ident.Pool,
) *bulkBlocksResult {
	return &bulkBlocksResult{
		nsCtx:            nsCtx,
		baseBlocksResult: newBaseBlocksResult(nsCtx, opts, resultOpts),
		result:           result.NewShardResult(resultOpts),
		tagDecoderPool:   tagDecoderPool,
		idPool:           idPool,
	}
}
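
// NB(editor): a minimal sketch, not part of the original file, of consuming a
// peerBlocksIter defined above: Next blocks until a datapoint arrives on the
// input channel, and once that channel closes the iterator reads the terminal
// error from errCh, which Err then reports. The function name is illustrative
// only.
func exampleDrainPeerBlocks(it *peerBlocksIter) error {
	for it.Next() {
		host, id, _, dbBlock := it.Current()
		// In the real flow the block feeds bootstrap/repair consumers;
		// here it is simply discarded.
		_, _, _ = host, id, dbBlock
	}
	return it.Err()
}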

func (r *bulkBlocksResult) addBlockFromPeer(
	id ident.ID,
	encodedTags checked.Bytes,
	peer topology.Host,
	block *rpc.Block,
) error {
	start := xtime.UnixNano(block.Start)
	result, err := r.newDatabaseBlock(block)
	if err != nil {
		return err
	}

	var (
		tags                ident.Tags
		attemptedDecodeTags bool
	)
	for {
		r.Lock()
		currBlock, exists := r.result.BlockAt(id, start)
		if !exists {
			if encodedTags == nil || attemptedDecodeTags {
				r.result.AddBlock(id, tags, result)
				r.Unlock()
				break
			}
			r.Unlock()

			// Tags not decoded yet, attempt decode and then reinsert
			attemptedDecodeTags = true
			tags, err = newTagsFromEncodedTags(id, encodedTags,
				r.tagDecoderPool, r.idPool)
			if err != nil {
				return err
			}
			continue
		}

		// Remove the existing block from the result so it doesn't get
		// merged again
		r.result.RemoveBlockAt(id, start)
		r.Unlock()

		// If we've already received data for this block, merge them
		// with the new block if possible
		tmpCtx := r.contextPool.Get()
		currReader, err := currBlock.Stream(tmpCtx)
		if err != nil {
			tmpCtx.Close()
			return err
		}

		// If there is no data in the current block, there is no
		// need to merge
		if currReader.IsEmpty() {
			tmpCtx.Close()
			continue
		}

		resultReader, err := result.Stream(tmpCtx)
		if err != nil {
			tmpCtx.Close()
			return err
		}
		if resultReader.IsEmpty() {
			tmpCtx.Close()
			return nil
		}

		readers := []xio.SegmentReader{currReader.SegmentReader, resultReader.SegmentReader}
		blockSize := currReader.BlockSize

		encoder, err := r.mergeReaders(start, blockSize, readers)
		if err != nil {
			tmpCtx.Close()
			return err
		}

		result.Close()

		result = r.blockOpts.DatabaseBlockPool().Get()
		result.Reset(start, blockSize, encoder.Discard(), r.nsCtx)

		tmpCtx.Close()
	}

	return nil
}

type enqueueCh struct {
	sync.Mutex
	sending              int
	enqueued             int
	processed            int
	peersMetadataCh      chan []receivedBlockMetadata
	closed               bool
	enqueueDelayedFn     enqueueDelayedFn
	enqueueDelayedDoneFn enqueueDelayedDoneFn
	metrics              *streamFromPeersMetrics
}
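
// NB(editor): a minimal sketch, not part of the original file, of the
// enqueueCh accounting protocol implemented below: producers enqueue batches
// (one "enqueued" unit per batch), consumers range over read() and report
// each batch via trackProcessed(1), and closeOnAllProcessed only closes the
// channel once everything enqueued has been processed and no sends are in
// flight. The function name is illustrative only.
func exampleEnqueueChLifecycle(
	c *enqueueCh,
	batches [][]receivedBlockMetadata,
) {
	go func() {
		for batch := range c.read() {
			// ... process the batch ...
			_ = batch
			c.trackProcessed(1)
		}
	}()
	for _, batch := range batches {
		_ = c.enqueue(batch) // returns errEnqueueChIsClosed once closed
	}
	c.closeOnAllProcessed() // blocks until enqueued == processed
}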

// enqueueChannelDefaultLen is the queue length for processing series ready to
// be fetched from other peers.
// It was reduced from 32k to 512 since each struct in the queue is quite large
// and with 32k capacity was using significant memory with high shard
// concurrency.
const enqueueChannelDefaultLen = 512

func newEnqueueChannel(m *streamFromPeersMetrics) enqueueChannel {
	c := &enqueueCh{
		peersMetadataCh: make(chan []receivedBlockMetadata, enqueueChannelDefaultLen),
		metrics:         m,
	}

	// Allocate the enqueue delayed fn just once
	c.enqueueDelayedFn = func(peersMetadata []receivedBlockMetadata) {
		c.peersMetadataCh <- peersMetadata
	}
	c.enqueueDelayedDoneFn = func() {
		c.Lock()
		c.sending--
		c.Unlock()
	}

	go func() {
		for {
			c.Lock()
			closed := c.closed
			numEnqueued := float64(len(c.peersMetadataCh))
			c.Unlock()
			if closed {
				return
			}
			m.blocksEnqueueChannel.Update(numEnqueued)
			time.Sleep(gaugeReportInterval)
		}
	}()
	return c
}

func (c *enqueueCh) enqueue(peersMetadata []receivedBlockMetadata) error {
	c.Lock()
	if c.closed {
		c.Unlock()
		return errEnqueueChIsClosed
	}
	c.enqueued++
	c.sending++
	c.Unlock()
	c.peersMetadataCh <- peersMetadata
	c.Lock()
	c.sending--
	c.Unlock()
	return nil
}

func (c *enqueueCh) enqueueDelayed(numToEnqueue int) (enqueueDelayedFn, enqueueDelayedDoneFn, error) {
	c.Lock()
	if c.closed {
		c.Unlock()
		return nil, nil, errEnqueueChIsClosed
	}
	c.sending++ // NB(r): This is decremented by calling the returned enqueue done function
	c.enqueued += numToEnqueue
	c.Unlock()
	return c.enqueueDelayedFn, c.enqueueDelayedDoneFn, nil
}

// read is always safe to call: you can range over a closed channel and/or do
// a checked read from it, unlike publishing to a channel that may be closed.
func (c *enqueueCh) read() <-chan []receivedBlockMetadata {
	return c.peersMetadataCh
}

func (c *enqueueCh) trackPending(amount int) {
	c.Lock()
	c.enqueued += amount
	c.Unlock()
}

func (c *enqueueCh) trackProcessed(amount int) {
	c.Lock()
	c.processed += amount
	c.Unlock()
}

func (c *enqueueCh) unprocessedLen() int {
	c.Lock()
	unprocessed := c.unprocessedLenWithLock()
	c.Unlock()
	return unprocessed
}

func (c *enqueueCh) unprocessedLenWithLock() int {
	return c.enqueued - c.processed
}

func (c *enqueueCh) closeOnAllProcessed() {
	for {
		c.Lock()
		if c.unprocessedLenWithLock() == 0 && c.sending == 0 {
			close(c.peersMetadataCh)
			c.closed = true
			c.Unlock()
			return
		}
		c.Unlock()
		time.Sleep(100 * time.Millisecond)
	}
}

type receivedBlocks struct {
	enqueued bool
	results  []receivedBlockMetadata
}

type processFn func(batch []receivedBlockMetadata)

// peerBlocksQueue is a per peer queue of blocks to be retrieved from a peer
type peerBlocksQueue struct {
	sync.RWMutex
	closed       bool
	peer         peer
	queue        []receivedBlockMetadata
	doneFns      []func()
	assigned     uint64
	completed    uint64
	maxQueueSize int
	workers      xsync.WorkerPool
	processFn    processFn
}

type newPeerBlocksQueueFn func(
	peer peer,
	maxQueueSize int,
	interval time.Duration,
	workers xsync.WorkerPool,
	processFn processFn,
) *peerBlocksQueue
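
// NB(editor): a minimal sketch, not part of the original file, wiring up a
// peerBlocksQueue via the constructor below: enqueued blocks accumulate until
// maxQueueSize and are also flushed by the background drain ticker, with
// processing fanned out on a shared worker pool. The pool size, queue size,
// and interval here are illustrative only, and the sketch assumes the
// x/sync worker pool must be initialized before use.
func exampleNewPeerBlocksQueue(p peer) *peerBlocksQueue {
	workers := xsync.NewWorkerPool(4)
	workers.Init() // workers must be initialized before Go is called
	return newPeerBlocksQueue(p, 128, time.Second, workers,
		func(batch []receivedBlockMetadata) {
			// ... fetch this batch of blocks from the peer ...
		})
}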

func newPeerBlocksQueue(
	peer peer,
	maxQueueSize int,
	interval time.Duration,
	workers xsync.WorkerPool,
	processFn processFn,
) *peerBlocksQueue {
	q := &peerBlocksQueue{
		peer:         peer,
		maxQueueSize: maxQueueSize,
		workers:      workers,
		processFn:    processFn,
	}
	if interval > 0 {
		go q.drainEvery(interval)
	}
	return q
}

func (q *peerBlocksQueue) drainEvery(interval time.Duration) {
	for {
		q.Lock()
		if q.closed {
			q.Unlock()
			return
		}
		q.drainWithLock()
		q.Unlock()
		time.Sleep(interval)
	}
}

func (q *peerBlocksQueue) close() {
	q.Lock()
	defer q.Unlock()
	q.closed = true
}

func (q *peerBlocksQueue) trackAssigned(amount int) {
	atomic.AddUint64(&q.assigned, uint64(amount))
}

func (q *peerBlocksQueue) trackCompleted(amount int) {
	atomic.AddUint64(&q.completed, uint64(amount))
}

func (q *peerBlocksQueue) enqueue(bl receivedBlockMetadata, doneFn func()) {
	q.Lock()

	if len(q.queue) == 0 && cap(q.queue) < q.maxQueueSize {
		// Lazy initialize queue
		q.queue = make([]receivedBlockMetadata, 0, q.maxQueueSize)
	}
	if len(q.doneFns) == 0 && cap(q.doneFns) < q.maxQueueSize {
		// Lazy initialize doneFns
		q.doneFns = make([]func(), 0, q.maxQueueSize)
	}
	q.queue = append(q.queue, bl)
	if doneFn != nil {
		q.doneFns = append(q.doneFns, doneFn)
	}
	q.trackAssigned(1)

	// Determine if we should drain immediately
	if len(q.queue) < q.maxQueueSize {
		// Need more blocks to fill up the batch
		q.Unlock()
		return
	}
	q.drainWithLock()

	q.Unlock()
}

func (q *peerBlocksQueue) drain() {
	q.Lock()
	q.drainWithLock()
	q.Unlock()
}

func (q *peerBlocksQueue) drainWithLock() {
	if len(q.queue) == 0 {
		// None to drain
		return
	}
	enqueued := q.queue
	doneFns := q.doneFns
	q.queue = nil
	q.doneFns = nil
	q.workers.Go(func() {
		q.processFn(enqueued)
		// Call done callbacks
		for i := range doneFns {
			doneFns[i]()
		}
		// Track completed blocks
		q.trackCompleted(len(enqueued))
	})
}

type peerBlocksQueues []*peerBlocksQueue

func (qs peerBlocksQueues) findQueue(peer peer) *peerBlocksQueue {
	for _, q := range qs {
		if q.peer == peer {
			return q
		}
	}
	return nil
}

func (qs peerBlocksQueues) closeAll() {
	for _, q := range qs {
		q.close()
	}
}
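
// NB(editor): a minimal sketch, not part of the original file, of the
// enqueue/drain contract above: enqueue buffers a block and invokes doneFn
// once the block's batch has been processed, while drain force-flushes
// anything buffered rather than waiting for a full batch or the ticker.
// The function name is illustrative only.
func exampleEnqueueAndFlush(
	q *peerBlocksQueue,
	blocks []receivedBlockMetadata,
) *sync.WaitGroup {
	var wg sync.WaitGroup
	for i := range blocks {
		wg.Add(1)
		q.enqueue(blocks[i], wg.Done) // wg.Done fires after the batch is processed
	}
	// Flush any partially filled batch immediately.
	q.drain()
	return &wg
}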

type receivedBlockMetadata struct {
	peer        peer
	id          ident.ID
	encodedTags checked.Bytes
	block       blockMetadata
}

type receivedBlockMetadatas []receivedBlockMetadata

func (arr receivedBlockMetadatas) swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }

type peerBlockMetadataByID []receivedBlockMetadata

func (arr peerBlockMetadataByID) Len() int      { return len(arr) }
func (arr peerBlockMetadataByID) Swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
func (arr peerBlockMetadataByID) Less(i, j int) bool {
	return strings.Compare(arr[i].peer.Host().ID(), arr[j].peer.Host().ID()) < 0
}

type receivedBlockMetadataQueue struct {
	blockMetadata receivedBlockMetadata
	queue         *peerBlocksQueue
}

type receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc []receivedBlockMetadataQueue

func (arr receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc) Len() int {
	return len(arr)
}
func (arr receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc) Swap(i, j int) {
	arr[i], arr[j] = arr[j], arr[i]
}
func (arr receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc) Less(i, j int) bool {
	peerI := arr[i].queue.peer
	peerJ := arr[j].queue.peer
	attemptsI := arr[i].blockMetadata.block.reattempt.peerAttempts(peerI)
	attemptsJ := arr[j].blockMetadata.block.reattempt.peerAttempts(peerJ)
	if attemptsI != attemptsJ {
		return attemptsI < attemptsJ
	}

	outstandingI :=
		atomic.LoadUint64(&arr[i].queue.assigned) -
			atomic.LoadUint64(&arr[i].queue.completed)
	outstandingJ :=
		atomic.LoadUint64(&arr[j].queue.assigned) -
			atomic.LoadUint64(&arr[j].queue.completed)
	return outstandingI < outstandingJ
}

type blockMetadata struct {
	start     xtime.UnixNano
	size      int64
	checksum  *uint32
	lastRead  xtime.UnixNano
	reattempt blockMetadataReattempt
}

type blockMetadataReattempt struct {
	attempt              int
	fanoutFetchState     *blockFanoutFetchState
	attempted            []peer
	errs                 []error
	retryPeersMetadata   []receivedBlockMetadata
	fetchedPeersMetadata []receivedBlockMetadata
}

type blockFanoutFetchState struct {
	numPending int32
	numSuccess int32
}

func newBlockFanoutFetchState(
	pending int,
) *blockFanoutFetchState {
	return &blockFanoutFetchState{
		numPending: int32(pending),
	}
}

func (s *blockFanoutFetchState) success() int {
	return int(atomic.LoadInt32(&s.numSuccess))
}

func (s *blockFanoutFetchState) incrementSuccess() {
	atomic.AddInt32(&s.numSuccess, 1)
}

func (s *blockFanoutFetchState) decrementAndReturnPending() int {
	return int(atomic.AddInt32(&s.numPending, -1))
}

func (b blockMetadataReattempt) peerAttempts(p peer) int {
	r := 0
	for i := range b.attempted {
		if b.attempted[i] == p {
			r++
		}
	}
	return r
}

func newTimesByUnixNanos(values []int64) []time.Time {
	result := make([]time.Time, len(values))
	for i := range values {
		result[i] = time.Unix(0, values[i])
	}
	return result
}

func newTimesByRPCBlocks(values []*rpc.Block) []time.Time {
	result := make([]time.Time, len(values))
	for i := range values {
		result[i] = time.Unix(0, values[i].Start)
	}
	return result
}
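
// NB(editor): a minimal sketch, not part of the original file, showing the
// comparator above in use: candidate (block, queue) pairs are sorted so that
// a retry picks the peer with the fewest prior attempts for that block,
// breaking ties by the smallest outstanding work on the peer's queue. The
// function name is illustrative only.
func exampleOrderCandidates(candidates []receivedBlockMetadataQueue) {
	sort.Stable(receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc(candidates))
}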

type metadataIter struct {
	inputCh        <-chan receivedBlockMetadata
	errCh          <-chan error
	host           topology.Host
	metadata       block.Metadata
	tagDecoderPool serialize.TagDecoderPool
	idPool         ident.Pool
	done           bool
	err            error
}

func newMetadataIter(
	inputCh <-chan receivedBlockMetadata,
	errCh <-chan error,
	tagDecoderPool serialize.TagDecoderPool,
	idPool ident.Pool,
) PeerBlockMetadataIter {
	return &metadataIter{
		inputCh:        inputCh,
		errCh:          errCh,
		tagDecoderPool: tagDecoderPool,
		idPool:         idPool,
	}
}

func (it *metadataIter) Next() bool {
	if it.done || it.err != nil {
		return false
	}
	m, more := <-it.inputCh
	if !more {
		it.err = <-it.errCh
		it.done = true
		return false
	}
	var tags ident.Tags
	tags, it.err = newTagsFromEncodedTags(m.id, m.encodedTags,
		it.tagDecoderPool, it.idPool)
	if it.err != nil {
		return false
	}
	it.host = m.peer.Host()
	it.metadata = block.NewMetadata(m.id, tags, m.block.start,
		m.block.size, m.block.checksum, m.block.lastRead)
	return true
}

func (it *metadataIter) Current() (topology.Host, block.Metadata) {
	return it.host, it.metadata
}

func (it *metadataIter) Err() error {
	return it.err
}

type idAndBlockStart struct {
	id         ident.ID
	blockStart int64
}

func newTagsFromEncodedTags(
	seriesID ident.ID,
	encodedTags checked.Bytes,
	tagDecoderPool serialize.TagDecoderPool,
	idPool ident.Pool,
) (ident.Tags, error) {
	if encodedTags == nil {
		return ident.Tags{}, nil
	}

	encodedTags.IncRef()

	tagDecoder := tagDecoderPool.Get()
	tagDecoder.Reset(encodedTags)
	defer tagDecoder.Close()

	tags, err := idxconvert.TagsFromTagsIter(seriesID, tagDecoder, idPool)

	encodedTags.DecRef()

	return tags, err
}

const (
	// histogramDurationBucketsVersion must be bumped whenever
	// histogramDurationBuckets changes, so that histograms with different
	// bucket layouts are namespaced apart from each other; otherwise a single
	// query could mix overlapping buckets and cause the histogram function to
	// error out.
	histogramDurationBucketsVersion = "v1"
	// histogramDurationBucketsVersionTag is the tag for the version of the buckets in use.
	histogramDurationBucketsVersionTag = "schema"
)

// histogramDurationBuckets is a high resolution set of duration buckets.
func histogramDurationBuckets() tally.DurationBuckets {
	return append(tally.DurationBuckets{0},
		tally.MustMakeExponentialDurationBuckets(time.Millisecond, 1.25, 60)...)
}

// histogramWithDurationBuckets returns a histogram with the standard duration buckets.
func histogramWithDurationBuckets(scope tally.Scope, name string) tally.Histogram {
	sub := scope.Tagged(map[string]string{
		histogramDurationBucketsVersionTag: histogramDurationBucketsVersion,
	})
	return sub.Histogram(name, histogramDurationBuckets())
}

func minDuration(x, y time.Duration) time.Duration {
	if x < y {
		return x
	}
	return y
}
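
// NB(editor): a minimal usage sketch, not part of the original file, for the
// histogram helpers above: the schema tag namespaces bucket layouts, so
// bumping histogramDurationBucketsVersion whenever the buckets change keeps
// old and new series from overlapping within one query. The metric name is
// illustrative only.
func exampleRecordLatency(scope tally.Scope, elapsed time.Duration) {
	h := histogramWithDurationBuckets(scope, "example.latency")
	h.RecordDuration(elapsed)
}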