
     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    19  // THE SOFTWARE.
    21  package client
    23  import (
    24  	"bytes"
    25  	gocontext "context"
    26  	"errors"
    27  	"fmt"
    28  	"math"
    29  	"sort"
    30  	"strings"
    31  	"sync"
    32  	"sync/atomic"
    33  	"time"
    35  	""
    36  	""
    37  	""
    38  	""
    39  	""
    40  	""
    41  	""
    42  	""
    43  	""
    44  	""
    45  	idxconvert ""
    46  	""
    47  	""
    48  	""
    49  	""
    50  	""
    51  	""
    52  	""
    53  	xerrors ""
    54  	""
    55  	""
    56  	""
    57  	xresource ""
    58  	xretry ""
    59  	""
    60  	""
    61  	xsync ""
    62  	xtime ""
    64  	apachethrift ""
    65  	""
    66  	""
    67  	""
    68  	""
    69  )
// Timing and sizing constants used by the session.
//
// NOTE(review): none of these constants is referenced in the visible part of
// this file, so the roles below are inferred from the names only - confirm
// against the call sites:
//   - clusterConnectWaitInterval: presumably the wait between checks while
//     connecting to the cluster.
//   - gaugeReportInterval: presumably how often gauges are reported.
//   - blockMetadataChBufSize: presumably the buffer size of a block metadata
//     channel.
//   - hostNotAvailableMinSleepInterval / hostNotAvailableMaxSleepInterval:
//     presumably the bounds of the sleep used when a host is not available.
const (
	clusterConnectWaitInterval       = 10 * time.Millisecond
	gaugeReportInterval              = 500 * time.Millisecond
	blockMetadataChBufSize           = 65536
	hostNotAvailableMinSleepInterval = 1 * time.Millisecond
	hostNotAvailableMaxSleepInterval = 100 * time.Millisecond
)
    79  type resultTypeEnum string
    81  const (
    82  	resultTypeMetadata  resultTypeEnum = "metadata"
    83  	resultTypeBootstrap                = "bootstrap"
    84  	resultTypeRaw                      = "raw"
    85  )
var (
	// errUnknownWriteAttemptType is the error for a write attempt whose type is
	// not recognized; the message flags it as an internal error.
	// NOTE(review): it is not referenced in the visible part of this file.
	errUnknownWriteAttemptType = errors.New(
		"unknown write attempt type specified, internal error")
)
    92  var (
    93  	// ErrClusterConnectTimeout is raised when connecting to the cluster and
    94  	// ensuring at least each partition has an up node with a connection to it
    95  	ErrClusterConnectTimeout = errors.New("timed out establishing min connections to cluster")
    96  	// errSessionStatusNotInitial is raised when trying to open a session and
    97  	// its not in the initial clean state
    98  	errSessionStatusNotInitial = errors.New("session not in initial state")
    99  	// ErrSessionStatusNotOpen is raised when operations are requested when the
   100  	// session is not in the open state
   101  	ErrSessionStatusNotOpen = errors.New("session not in open state")
   102  	// errSessionBadBlockResultFromPeer is raised when there is a bad block
   103  	// return from a peer when fetching blocks from peers
   104  	errSessionBadBlockResultFromPeer = errors.New("session fetched bad block result from peer")
   105  	// errSessionInvalidConnectClusterConnectConsistencyLevel is raised when
   106  	// the connect consistency level specified is not recognized
   107  	errSessionInvalidConnectClusterConnectConsistencyLevel = errors.New(
   108  		"session has invalid connect consistency level specified",
   109  	)
   110  	// errSessionHasNoHostQueueForHost is raised when host queue requested for a missing host
   111  	errSessionHasNoHostQueueForHost = newHostNotAvailableError(errors.New("session has no host queue for host"))
   112  	// errUnableToEncodeTags is raised when the server is unable to encode provided tags
   113  	// to be sent over the wire.
   114  	errUnableToEncodeTags = errors.New("unable to include tags")
   115  	// errEnqueueChIsClosed is returned when attempting to use a closed enqueuCh.
   116  	errEnqueueChIsClosed = errors.New("error enqueueCh is cosed")
   117  )
// sessionState is volatile state that is protected by a
// read/write mutex
type sessionState struct {
	sync.RWMutex

	// status is the lifecycle status; Open requires statusNotOpen and moves it
	// to statusOpen, and ShardID requires statusOpen.
	status status

	// Consistency levels. writeLevel and readLevel are seeded from Options in
	// newSession, bootstrapLevel from AdminOptions when available, and all
	// three are replaced by SetRuntimeOptions.
	writeLevel     topology.ConsistencyLevel
	readLevel      topology.ReadConsistencyLevel
	bootstrapLevel topology.ReadConsistencyLevel

	// Host queues and topology. topo is set in newSession and topoWatch in
	// Open; queues is read by the topology update goroutine started in Open
	// and queuesByHostID by BorrowConnection.
	// NOTE(review): the remaining fields are presumably written by
	// setTopologyWithLock, which is not visible here - confirm.
	queues         []hostQueue
	queuesByHostID map[string]hostQueue
	topo           topology.Topology
	topoMap        topology.Map
	topoWatch      topology.MapWatch
	replicas       int
	majority       int
}
   139  func (s *sessionState) readConsistencyLevelWithRLock(
   140  	override *topology.ReadConsistencyLevel,
   141  ) topology.ReadConsistencyLevel {
   142  	if override == nil {
   143  		return s.readLevel
   144  	}
   145  	return *override
   146  }
// session is the client session value constructed by newSession.
//
// state holds the mutable, lock-protected part of the session. The remaining
// fields are assigned in newSession: the samplers, retriers, function hooks
// and metrics always, and origin plus the streamBlocks* fields only when the
// supplied Options also implement AdminOptions. pools is filled partly in
// newSession (attempt, tag and bytes pools) and partly in Open (operation,
// state and iterator pools).
//
// NOTE(review): runtimeOptsListenerCloser is not assigned anywhere in the
// visible part of this file - confirm where it is set.
type session struct {
	state                                sessionState
	opts                                 Options
	runtimeOptsListenerCloser            xresource.SimpleCloser
	scope                                tally.Scope
	nowFn                                clock.NowFn
	log                                  *zap.Logger
	logWriteErrorSampler                 *sampler.Sampler
	logFetchErrorSampler                 *sampler.Sampler
	newHostQueueFn                       newHostQueueFn
	writeRetrier                         xretry.Retrier
	fetchRetrier                         xretry.Retrier
	streamBlocksRetrier                  xretry.Retrier
	pools                                sessionPools
	fetchBatchSize                       int
	newPeerBlocksQueueFn                 newPeerBlocksQueueFn
	reattemptStreamBlocksFromPeersFn     reattemptStreamBlocksFromPeersFn
	pickBestPeerFn                       pickBestPeerFn
	healthCheckNewConnFn                 healthCheckFn
	origin                               topology.Host
	streamBlocksMaxBlockRetries          int
	streamBlocksWorkers                  xsync.WorkerPool
	streamBlocksBatchSize                int
	streamBlocksMetadataBatchTimeout     time.Duration
	streamBlocksBatchTimeout             time.Duration
	writeShardsInitializing              bool
	shardsLeavingCountTowardsConsistency bool
	metrics                              sessionMetrics
}
// shardMetricsKey is the key of sessionMetrics.streamFromPeersMetrics: one
// entry per shard and result type.
type shardMetricsKey struct {
	shardID    uint32
	resultType resultTypeEnum
}
// sessionMetrics groups the metrics emitted by a session.
//
// The embedded RWMutex is taken by newPeerMetadataStreamingProgressMetrics
// around accesses to streamFromPeersMetrics. The write*/fetch* counters and
// histograms are updated by recordWriteMetrics and recordFetchMetrics, and
// the topologyUpdated* counters by the topology update goroutine in Open.
//
// The four []tally.Counter slices are indexed with the value returned by
// nodesRespondingErrorsMetricIndex. They are not populated by
// newSessionMetrics.
// NOTE(review): confirm where the slices are sized and filled.
type sessionMetrics struct {
	sync.RWMutex
	writeSuccess                         tally.Counter
	writeErrorsBadRequest                tally.Counter
	writeErrorsInternalError             tally.Counter
	writeLatencyHistogram                tally.Histogram
	writeNodesRespondingErrors           []tally.Counter
	writeNodesRespondingBadRequestErrors []tally.Counter
	fetchSuccess                         tally.Counter
	fetchErrorsBadRequest                tally.Counter
	fetchErrorsInternalError             tally.Counter
	fetchLatencyHistogram                tally.Histogram
	fetchNodesRespondingErrors           []tally.Counter
	fetchNodesRespondingBadRequestErrors []tally.Counter
	topologyUpdatedSuccess               tally.Counter
	topologyUpdatedError                 tally.Counter
	streamFromPeersMetrics               map[shardMetricsKey]streamFromPeersMetrics
}
   202  func newSessionMetrics(scope tally.Scope) sessionMetrics {
   203  	return sessionMetrics{
   204  		writeSuccess: scope.Counter("write.success"),
   205  		writeErrorsBadRequest: scope.Tagged(map[string]string{
   206  			"error_type": "bad_request",
   207  		}).Counter("write.errors"),
   208  		writeErrorsInternalError: scope.Tagged(map[string]string{
   209  			"error_type": "internal_error",
   210  		}).Counter("write.errors"),
   211  		writeLatencyHistogram: histogramWithDurationBuckets(scope, "write.latency"),
   212  		fetchSuccess:          scope.Counter("fetch.success"),
   213  		fetchErrorsBadRequest: scope.Tagged(map[string]string{
   214  			"error_type": "bad_request",
   215  		}).Counter("fetch.errors"),
   216  		fetchErrorsInternalError: scope.Tagged(map[string]string{
   217  			"error_type": "internal_error",
   218  		}).Counter("fetch.errors"),
   219  		fetchLatencyHistogram:  histogramWithDurationBuckets(scope, "fetch.latency"),
   220  		topologyUpdatedSuccess: scope.Counter("topology.updated-success"),
   221  		topologyUpdatedError:   scope.Counter("topology.updated-error"),
   222  		streamFromPeersMetrics: make(map[shardMetricsKey]streamFromPeersMetrics),
   223  	}
   224  }
// streamFromPeersMetrics is the set of gauges and counters created by
// newPeerMetadataStreamingProgressMetrics for one shard and result type. They
// live under the "stream-from-peers" sub-scope tagged with "shard" and
// "resultType"; the three fetchBlockRetries* counters share the metric name
// "fetch-block-retries" and differ only by their "reason" tag.
type streamFromPeersMetrics struct {
	fetchBlocksFromPeers                              tally.Gauge
	metadataFetches                                   tally.Gauge
	metadataFetchBatchCall                            tally.Counter
	metadataFetchBatchSuccess                         tally.Counter
	metadataFetchBatchError                           tally.Counter
	metadataFetchBatchBlockErr                        tally.Counter
	metadataReceived                                  tally.Counter
	metadataPeerRetry                                 tally.Counter
	fetchBlockSuccess                                 tally.Counter
	fetchBlockError                                   tally.Counter
	fetchBlockFullRetry                               tally.Counter
	fetchBlockFinalError                              tally.Counter
	fetchBlockRetriesReqError                         tally.Counter
	fetchBlockRetriesRespError                        tally.Counter
	fetchBlockRetriesConsistencyLevelNotAchievedError tally.Counter
	blocksEnqueueChannel                              tally.Gauge
}
// hostQueueOpts bundles what a newHostQueueFn receives besides the host: the
// request pools and request element array pools for the raw and tagged write
// batches (plain and V2 variants) and for V2 fetch batches, plus the client
// Options.
type hostQueueOpts struct {
	writeBatchRawRequestPool                     writeBatchRawRequestPool
	writeBatchRawV2RequestPool                   writeBatchRawV2RequestPool
	writeBatchRawRequestElementArrayPool         writeBatchRawRequestElementArrayPool
	writeBatchRawV2RequestElementArrayPool       writeBatchRawV2RequestElementArrayPool
	writeTaggedBatchRawRequestPool               writeTaggedBatchRawRequestPool
	writeTaggedBatchRawV2RequestPool             writeTaggedBatchRawV2RequestPool
	writeTaggedBatchRawRequestElementArrayPool   writeTaggedBatchRawRequestElementArrayPool
	writeTaggedBatchRawV2RequestElementArrayPool writeTaggedBatchRawV2RequestElementArrayPool
	fetchBatchRawV2RequestPool                   fetchBatchRawV2RequestPool
	fetchBatchRawV2RequestElementArrayPool       fetchBatchRawV2RequestElementArrayPool
	opts                                         Options
}
// newHostQueueFn is the constructor signature for a hostQueue serving host,
// configured from hostQueueOpts. The session stores one in its newHostQueueFn
// field; newSession sets it to newHostQueue.
type newHostQueueFn func(
	host topology.Host,
	hostQueueOpts hostQueueOpts,
) (hostQueue, error)
// newSession constructs a session from opts without opening it.
//
// It initializes the topology through opts.TopologyInitializer(), creates the
// write and fetch error log samplers, builds the session value and then
// allocates and initializes the attempt, tag encoder/decoder and checked
// bytes wrapper pools. When opts also implements AdminOptions the bootstrap
// consistency level, origin and stream-blocks settings are copied over as
// well. Finally the session is registered as a listener with the runtime
// options manager, when one is configured.
//
// An error is returned if the topology cannot be initialized or either
// sampler cannot be created.
func newSession(opts Options) (clientSession, error) {
	topo, err := opts.TopologyInitializer().Init()
	if err != nil {
		return nil, err
	}

	// Writes and fetches get separate samplers built from the same rate.
	logWriteErrorSampler, err := sampler.NewSampler(opts.LogErrorSampleRate())
	if err != nil {
		return nil, err
	}

	logFetchErrorSampler, err := sampler.NewSampler(opts.LogErrorSampleRate())
	if err != nil {
		return nil, err
	}

	scope := opts.InstrumentOptions().MetricsScope()

	s := &session{
		state: sessionState{
			writeLevel:     opts.WriteConsistencyLevel(),
			readLevel:      opts.ReadConsistencyLevel(),
			queuesByHostID: make(map[string]hostQueue),
			topo:           topo,
		},
		opts:                 opts,
		scope:                scope,
		nowFn:                opts.ClockOptions().NowFn(),
		log:                  opts.InstrumentOptions().Logger(),
		logWriteErrorSampler: logWriteErrorSampler,
		logFetchErrorSampler: logFetchErrorSampler,
		newHostQueueFn:       newHostQueue,
		fetchBatchSize:       opts.FetchBatchSize(),
		newPeerBlocksQueueFn: newPeerBlocksQueue,
		healthCheckNewConnFn: healthCheck,
		writeRetrier:         opts.WriteRetrier(),
		fetchRetrier:         opts.FetchRetrier(),
		pools: sessionPools{
			context:      opts.ContextPool(),
			checkedBytes: opts.CheckedBytesPool(),
			id:           opts.IdentifierPool(),
		},
		writeShardsInitializing:              opts.WriteShardsInitializing(),
		shardsLeavingCountTowardsConsistency: opts.ShardsLeavingCountTowardsConsistency(),
		metrics:                              newSessionMetrics(scope),
	}
	// These hooks are bound to methods of the session itself, so they can only
	// be assigned once s exists.
	s.reattemptStreamBlocksFromPeersFn = s.streamBlocksReattemptFromPeers
	s.pickBestPeerFn = s.streamBlocksPickBestPeer

	// Each pool below is sized from its option and reports its metrics under
	// its own sub-scope.
	writeAttemptPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.WriteOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("write-attempt-pool"),
		))
	s.pools.writeAttempt = newWriteAttemptPool(s, writeAttemptPoolOpts)
	s.pools.writeAttempt.Init()

	// The fetch, fetch tagged and aggregate attempt pools all use the fetch
	// batch op pool size.
	fetchAttemptPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("fetch-attempt-pool"),
		))
	s.pools.fetchAttempt = newFetchAttemptPool(s, fetchAttemptPoolOpts)
	s.pools.fetchAttempt.Init()

	fetchTaggedAttemptPoolImplOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("fetch-tagged-attempt-pool"),
		))
	s.pools.fetchTaggedAttempt = newFetchTaggedAttemptPool(s, fetchTaggedAttemptPoolImplOpts)
	s.pools.fetchTaggedAttempt.Init()

	aggregateAttemptPoolImplOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("aggregate-attempt-pool"),
		))
	s.pools.aggregateAttempt = newAggregateAttemptPool(s, aggregateAttemptPoolImplOpts)
	s.pools.aggregateAttempt.Init()

	tagEncoderPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.TagEncoderPoolSize().IsDynamic()).
		SetSize(int(s.opts.TagEncoderPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("tag-encoder-pool"),
		))
	s.pools.tagEncoder = serialize.NewTagEncoderPool(opts.TagEncoderOptions(), tagEncoderPoolOpts)
	s.pools.tagEncoder.Init()

	tagDecoderPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.TagDecoderPoolSize().IsDynamic()).
		SetSize(int(s.opts.TagDecoderPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("tag-decoder-pool"),
		))
	s.pools.tagDecoder = serialize.NewTagDecoderPool(opts.TagDecoderOptions(), tagDecoderPoolOpts)
	s.pools.tagDecoder.Init()

	wrapperPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.CheckedBytesWrapperPoolSize().IsDynamic()).
		SetSize(int(s.opts.CheckedBytesWrapperPoolSize())).
		SetInstrumentOptions(opts.InstrumentOptions().SetMetricsScope(
			scope.SubScope("client-checked-bytes-wrapper-pool")))
	s.pools.checkedBytesWrapper = xpool.NewCheckedBytesWrapperPool(wrapperPoolOpts)
	s.pools.checkedBytesWrapper.Init()

	// Admin-only settings; opts is shadowed by its AdminOptions view inside
	// this block.
	if opts, ok := opts.(AdminOptions); ok {
		s.state.bootstrapLevel = opts.BootstrapConsistencyLevel()
		s.origin = opts.Origin()
		s.streamBlocksMaxBlockRetries = opts.FetchSeriesBlocksMaxBlockRetries()
		s.streamBlocksWorkers = xsync.NewWorkerPool(opts.FetchSeriesBlocksBatchConcurrency())
		s.streamBlocksWorkers.Init()
		s.streamBlocksBatchSize = opts.FetchSeriesBlocksBatchSize()
		s.streamBlocksMetadataBatchTimeout = opts.FetchSeriesBlocksMetadataBatchTimeout()
		s.streamBlocksBatchTimeout = opts.FetchSeriesBlocksBatchTimeout()
		s.streamBlocksRetrier = opts.StreamBlocksRetrier()
	}

	// NOTE(review): any value returned by RegisterListener is discarded here
	// and s.runtimeOptsListenerCloser is not assigned in this function -
	// confirm whether the registration should be retained so it can be closed.
	if runtimeOptsMgr := opts.RuntimeOptionsManager(); runtimeOptsMgr != nil {
		runtimeOptsMgr.RegisterListener(s)
	}

	return s, nil
}
   393  func (s *session) SetRuntimeOptions(value runtime.Options) {
   394  	s.state.Lock()
   395  	s.state.bootstrapLevel = value.ClientBootstrapConsistencyLevel()
   396  	s.state.readLevel = value.ClientReadConsistencyLevel()
   397  	s.state.writeLevel = value.ClientWriteConsistencyLevel()
   398  	s.state.Unlock()
   399  }
   401  func (s *session) ShardID(id ident.ID) (uint32, error) {
   402  	s.state.RLock()
   403  	if s.state.status != statusOpen {
   404  		s.state.RUnlock()
   405  		return 0, ErrSessionStatusNotOpen
   406  	}
   407  	value := s.state.topoMap.ShardSet().Lookup(id)
   408  	s.state.RUnlock()
   409  	return value, nil
   410  }
   412  // newPeerMetadataStreamingProgressMetrics returns a struct with an embedded
   413  // list of fields that can be used to emit metrics about the current state of
   414  // the peer metadata streaming process
   415  func (s *session) newPeerMetadataStreamingProgressMetrics(
   416  	shard uint32,
   417  	resultType resultTypeEnum,
   418  ) *streamFromPeersMetrics {
   419  	mKey := shardMetricsKey{shardID: shard, resultType: resultType}
   420  	s.metrics.RLock()
   421  	m, ok := s.metrics.streamFromPeersMetrics[mKey]
   422  	s.metrics.RUnlock()
   424  	if ok {
   425  		return &m
   426  	}
   428  	scope := s.opts.InstrumentOptions().MetricsScope()
   430  	s.metrics.Lock()
   431  	m, ok = s.metrics.streamFromPeersMetrics[mKey]
   432  	if ok {
   433  		s.metrics.Unlock()
   434  		return &m
   435  	}
   436  	scope = scope.SubScope("stream-from-peers").Tagged(map[string]string{
   437  		"shard":      fmt.Sprintf("%d", shard),
   438  		"resultType": string(resultType),
   439  	})
   440  	m = streamFromPeersMetrics{
   441  		fetchBlocksFromPeers:       scope.Gauge("fetch-blocks-inprogress"),
   442  		metadataFetches:            scope.Gauge("fetch-metadata-peers-inprogress"),
   443  		metadataFetchBatchCall:     scope.Counter("fetch-metadata-peers-batch-call"),
   444  		metadataFetchBatchSuccess:  scope.Counter("fetch-metadata-peers-batch-success"),
   445  		metadataFetchBatchError:    scope.Counter("fetch-metadata-peers-batch-error"),
   446  		metadataFetchBatchBlockErr: scope.Counter("fetch-metadata-peers-batch-block-err"),
   447  		metadataReceived:           scope.Counter("fetch-metadata-peers-received"),
   448  		metadataPeerRetry:          scope.Counter("fetch-metadata-peers-peer-retry"),
   449  		fetchBlockSuccess:          scope.Counter("fetch-block-success"),
   450  		fetchBlockError:            scope.Counter("fetch-block-error"),
   451  		fetchBlockFinalError:       scope.Counter("fetch-block-final-error"),
   452  		fetchBlockFullRetry:        scope.Counter("fetch-block-full-retry"),
   453  		fetchBlockRetriesReqError: scope.Tagged(map[string]string{
   454  			"reason": "request-error",
   455  		}).Counter("fetch-block-retries"),
   456  		fetchBlockRetriesRespError: scope.Tagged(map[string]string{
   457  			"reason": "response-error",
   458  		}).Counter("fetch-block-retries"),
   459  		fetchBlockRetriesConsistencyLevelNotAchievedError: scope.Tagged(map[string]string{
   460  			"reason": "consistency-level-not-achieved-error",
   461  		}).Counter("fetch-block-retries"),
   462  		blocksEnqueueChannel: scope.Gauge("fetch-blocks-enqueue-channel-length"),
   463  	}
   464  	s.metrics.streamFromPeersMetrics[mKey] = m
   465  	s.metrics.Unlock()
   466  	return &m
   467  }
   469  func (s *session) recordWriteMetrics(consistencyResultErr error, respErrs int32, start time.Time) {
   470  	if idx := s.nodesRespondingErrorsMetricIndex(respErrs); idx >= 0 {
   471  		if IsBadRequestError(consistencyResultErr) {
   472  			s.metrics.writeNodesRespondingBadRequestErrors[idx].Inc(1)
   473  		} else {
   474  			s.metrics.writeNodesRespondingErrors[idx].Inc(1)
   475  		}
   476  	}
   477  	if consistencyResultErr == nil {
   478  		s.metrics.writeSuccess.Inc(1)
   479  	} else if IsBadRequestError(consistencyResultErr) {
   480  		s.metrics.writeErrorsBadRequest.Inc(1)
   481  	} else {
   482  		s.metrics.writeErrorsInternalError.Inc(1)
   483  	}
   484  	s.metrics.writeLatencyHistogram.RecordDuration(s.nowFn().Sub(start))
   486  	if consistencyResultErr != nil && s.logWriteErrorSampler.Sample() {
   487  		s.log.Error("m3db client write error occurred",
   488  			zap.Float64("sampleRateLog", s.logWriteErrorSampler.SampleRate().Value()),
   489  			zap.Error(consistencyResultErr))
   490  	}
   491  }
   493  func (s *session) recordFetchMetrics(consistencyResultErr error, respErrs int32, start time.Time) {
   494  	if idx := s.nodesRespondingErrorsMetricIndex(respErrs); idx >= 0 {
   495  		if IsBadRequestError(consistencyResultErr) {
   496  			s.metrics.fetchNodesRespondingBadRequestErrors[idx].Inc(1)
   497  		} else {
   498  			s.metrics.fetchNodesRespondingErrors[idx].Inc(1)
   499  		}
   500  	}
   501  	if consistencyResultErr == nil {
   502  		s.metrics.fetchSuccess.Inc(1)
   503  	} else if IsBadRequestError(consistencyResultErr) {
   504  		s.metrics.fetchErrorsBadRequest.Inc(1)
   505  	} else {
   506  		s.metrics.fetchErrorsInternalError.Inc(1)
   507  	}
   508  	s.metrics.fetchLatencyHistogram.RecordDuration(s.nowFn().Sub(start))
   510  	if consistencyResultErr != nil && s.logFetchErrorSampler.Sample() {
   511  		s.log.Error("m3db client fetch error occurred",
   512  			zap.Float64("sampleRateLog", s.logFetchErrorSampler.SampleRate().Value()),
   513  			zap.Error(consistencyResultErr))
   514  	}
   515  }
   517  func (s *session) nodesRespondingErrorsMetricIndex(respErrs int32) int32 {
   518  	idx := respErrs - 1
   519  	replicas := int32(s.Replicas())
   520  	if respErrs > replicas {
   521  		// Cap to the max replicas, we might get more errors
   522  		// when a node is initializing a shard causing replicas + 1
   523  		// nodes to respond to operations
   524  		idx = replicas - 1
   525  	}
   526  	return idx
   527  }
// Open moves the session from its initial state to the open state.
//
// While holding the state write lock it starts a topology watch, blocks until
// the first topology map is available, builds the host queues for that map,
// stores the topology, allocates the operation, state and iterator pools and
// marks the session as open. After releasing the lock it starts a goroutine
// that applies every later topology update delivered by the watch.
//
// It returns errSessionStatusNotInitial if the session is not in the initial
// state, or the error from creating the watch or the host queues.
func (s *session) Open() error {
	s.state.Lock()
	if s.state.status != statusNotOpen {
		s.state.Unlock()
		return errSessionStatusNotInitial
	}

	watch, err := s.state.topo.Watch()
	if err != nil {
		s.state.Unlock()
		return err
	}

	// Wait for the topology to be available
	// NB: this receive blocks while the state write lock is held.
	<-watch.C()

	topoMap := watch.Get()

	// No existing queues to reuse on the first topology map.
	queues, replicas, majority, err := s.hostQueues(topoMap, nil)
	if err != nil {
		// NOTE(review): watch is not released on this error path - confirm
		// whether topology.MapWatch needs to be closed here.
		s.state.Unlock()
		return err
	}
	s.setTopologyWithLock(topoMap, queues, replicas, majority)
	s.state.topoWatch = watch

	// NB(r): Alloc pools that can take some time in Open, expectation
	// is already that Open will take some time
	writeOperationPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.WriteOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-op-pool"),
		))
	s.pools.writeOperation = newWriteOperationPool(writeOperationPoolOpts)
	s.pools.writeOperation.Init()

	writeTaggedOperationPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteTaggedOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.WriteTaggedOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-op-tagged-pool"),
		))
	s.pools.writeTaggedOperation = newWriteTaggedOpPool(writeTaggedOperationPoolOpts)
	s.pools.writeTaggedOperation.Init()

	// The write state pool follows the dynamic setting of the write op pool;
	// its size is only set explicitly below when that pool is not dynamic.
	writeStatePoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.WriteOpPoolSize().IsDynamic()).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("write-state-pool"),
		))

	if !s.opts.WriteOpPoolSize().IsDynamic() {
		// Use the larger of the write op and (non-dynamic) write tagged op
		// pool sizes.
		writeStatePoolSize := s.opts.WriteOpPoolSize()
		if !s.opts.WriteTaggedOpPoolSize().IsDynamic() && s.opts.WriteTaggedOpPoolSize() > writeStatePoolSize {
			writeStatePoolSize = s.opts.WriteTaggedOpPoolSize()
		}
		writeStatePoolOpts = writeStatePoolOpts.SetSize(int(writeStatePoolSize))
	}
	s.pools.writeState = newWriteStatePool(s.pools.tagEncoder, writeStatePoolOpts)
	s.pools.writeState.Init()

	// The fetch batch op, fetch tagged op, aggregate op and fetch state pools
	// are all sized from the fetch batch op pool size.
	fetchBatchOpPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-batch-op-pool"),
		))
	s.pools.fetchBatchOp = newFetchBatchOpPool(fetchBatchOpPoolOpts, s.fetchBatchSize)
	s.pools.fetchBatchOp.Init()

	fetchTaggedOpPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-tagged-op-pool"),
		))
	s.pools.fetchTaggedOp = newFetchTaggedOpPool(fetchTaggedOpPoolOpts)
	s.pools.fetchTaggedOp.Init()

	aggregateOpPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("aggregate-op-pool"),
		))
	s.pools.aggregateOp = newAggregateOpPool(aggregateOpPoolOpts)
	s.pools.aggregateOp.Init()

	fetchStatePoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
		SetSize(int(s.opts.FetchBatchOpPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("fetch-tagged-state-pool"),
		))
	s.pools.fetchState = newFetchStatePool(fetchStatePoolOpts)
	s.pools.fetchState.Init()

	seriesIteratorPoolOpts := pool.NewObjectPoolOptions().
		SetDynamic(s.opts.SeriesIteratorPoolSize().IsDynamic()).
		SetSize(int(s.opts.SeriesIteratorPoolSize())).
		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
			s.scope.SubScope("series-iterator-pool"),
		))
	s.pools.seriesIterator = encoding.NewSeriesIteratorPool(seriesIteratorPoolOpts)
	s.pools.seriesIterator.Init()
	s.state.status = statusOpen
	s.state.Unlock()

	// Apply subsequent topology updates in the background. The loop ends when
	// the channel returned by watch.C() is closed.
	// NOTE(review): presumably the watch is closed when the session is closed,
	// that code is not visible here - confirm.
	go func() {
		for range watch.C() {
			s.log.Info("received update for topology")
			topoMap := watch.Get()

			// Snapshot the current queues under the read lock so that the
			// new host queues are built without holding the state lock.
			s.state.RLock()
			existingQueues := s.state.queues
			s.state.RUnlock()

			queues, replicas, majority, err := s.hostQueues(topoMap, existingQueues)
			if err != nil {
				// Keep the current topology and wait for the next update.
				s.log.Error("could not update topology map", zap.Error(err))
				s.metrics.topologyUpdatedError.Inc(1)
				continue
			}
			s.state.Lock()
			s.setTopologyWithLock(topoMap, queues, replicas, majority)
			s.state.Unlock()
			s.metrics.topologyUpdatedSuccess.Inc(1)
		}
	}()

	return nil
}
   663  func (s *session) BorrowConnections(
   664  	shardID uint32,
   665  	fn WithBorrowConnectionFn,
   666  	opts BorrowConnectionOptions,
   667  ) (BorrowConnectionsResult, error) {
   668  	var result BorrowConnectionsResult
   669  	s.state.RLock()
   670  	topoMap, err := s.topologyMapWithStateRLock()
   671  	s.state.RUnlock()
   672  	if err != nil {
   673  		return result, err
   674  	}
   676  	var (
   677  		multiErr  = xerrors.NewMultiError()
   678  		breakLoop bool
   679  	)
   680  	err = topoMap.RouteShardForEach(shardID, func(
   681  		_ int,
   682  		shard shard.Shard,
   683  		host topology.Host,
   684  	) {
   685  		if multiErr.NumErrors() > 0 || breakLoop {
   686  			// Error or has broken
   687  			return
   688  		}
   689  		if opts.ExcludeOrigin && s.origin != nil && s.origin.ID() == host.ID() {
   690  			// Skip origin host.
   691  			return
   692  		}
   694  		var (
   695  			userResult WithBorrowConnectionResult
   696  			userErr    error
   697  		)
   698  		borrowErr := s.BorrowConnection(host.ID(), func(
   699  			client rpc.TChanNode,
   700  			channel Channel,
   701  		) {
   702  			userResult, userErr = fn(shard, host, client, channel)
   703  		})
   704  		if borrowErr != nil {
   705  			// Wasn't able to even borrow, skip if don't want to error
   706  			// on down hosts or return the borrow error.
   707  			if !opts.ContinueOnBorrowError {
   708  				multiErr = multiErr.Add(borrowErr)
   709  			}
   710  			return
   711  		}
   713  		// Track successful borrow.
   714  		result.Borrowed++
   716  		// Track whether has broken loop.
   717  		breakLoop = userResult.Break
   719  		// Return whether user error occurred to break or not.
   720  		if userErr != nil {
   721  			multiErr = multiErr.Add(userErr)
   722  		}
   723  	})
   724  	if err != nil {
   725  		// Route error.
   726  		return result, err
   727  	}
   728  	// Potentially a user error or borrow error, otherwise
   729  	// FinalError() will return nil.
   730  	return result, multiErr.FinalError()
   731  }
   733  func (s *session) BorrowConnection(hostID string, fn WithConnectionFn) error {
   734  	s.state.RLock()
   735  	unlocked := false
   736  	queue, ok := s.state.queuesByHostID[hostID]
   737  	if !ok {
   738  		s.state.RUnlock()
   739  		return errSessionHasNoHostQueueForHost
   740  	}
   741  	err := queue.BorrowConnection(func(client rpc.TChanNode, ch Channel) {
   742  		// Unlock early on success
   743  		s.state.RUnlock()
   744  		unlocked = true
   746  		// Execute function with borrowed connection
   747  		fn(client, ch)
   748  	})
   749  	if !unlocked {
   750  		s.state.RUnlock()
   751  	}
   752  	return err
   753  }
   755  func (s *session) DedicatedConnection(
   756  	shardID uint32,
   757  	opts DedicatedConnectionOptions,
   758  ) (rpc.TChanNode, Channel, error) {
   759  	s.state.RLock()
   760  	topoMap, err := s.topologyMapWithStateRLock()
   761  	s.state.RUnlock()
   762  	if err != nil {
   763  		return nil, nil, err
   764  	}
   766  	var (
   767  		client    rpc.TChanNode
   768  		channel   Channel
   769  		succeeded bool
   770  		multiErr  = xerrors.NewMultiError()
   771  	)
   772  	err = topoMap.RouteShardForEach(shardID, func(
   773  		_ int,
   774  		targetShard shard.Shard,
   775  		host topology.Host,
   776  	) {
   777  		stateFilter := opts.ShardStateFilter
   778  		if succeeded || !(stateFilter == shard.Unknown || targetShard.State() == stateFilter) {
   779  			return
   780  		}
   782  		if s.origin != nil && s.origin.ID() == host.ID() {
   783  			// Skip origin host.
   784  			return
   785  		}
   787  		newConnFn := s.opts.NewConnectionFn()
   788  		channel, client, err = newConnFn(channelName, host.Address(), s.opts)
   789  		if err != nil {
   790  			multiErr = multiErr.Add(err)
   791  			return
   792  		}
   794  		if err := s.healthCheckNewConnFn(client, s.opts, opts.BootstrappedNodesOnly); err != nil {
   795  			channel.Close()
   796  			multiErr = multiErr.Add(err)
   797  			return
   798  		}
   800  		succeeded = true
   801  	})
   802  	if err != nil {
   803  		return nil, nil, err
   804  	}
   806  	if !succeeded {
   807  		multiErr = multiErr.Add(
   808  			fmt.Errorf("failed to create a dedicated connection for shard %d", shardID))
   809  		return nil, nil, multiErr.FinalError()
   810  	}
   812  	return client, channel, nil
   813  }
   815  func (s *session) hostQueues(
   816  	topoMap topology.Map,
   817  	existing []hostQueue,
   818  ) ([]hostQueue, int, int, error) {
   819  	// NB(r): we leave existing writes in the host queues to finish
   820  	// as they are already enroute to their destination. This is an edge case
   821  	// that might result in leaving nodes counting towards quorum, but fixing it
   822  	// would result in additional chatter.
   824  	start := s.nowFn()
   826  	existingByHostID := make(map[string]hostQueue, len(existing))
   827  	for _, queue := range existing {
   828  		existingByHostID[queue.Host().ID()] = queue
   829  	}
   831  	hosts := topoMap.Hosts()
   832  	queues := make([]hostQueue, 0, len(hosts))
   833  	newQueues := make([]hostQueue, 0, len(hosts))
   834  	for _, host := range hosts {
   835  		if existingQueue, ok := existingByHostID[host.ID()]; ok {
   836  			queues = append(queues, existingQueue)
   837  			continue
   838  		}
   839  		newQueue, err := s.newHostQueue(host, topoMap)
   840  		if err != nil {
   841  			return nil, 0, 0, err
   842  		}
   843  		queues = append(queues, newQueue)
   844  		newQueues = append(newQueues, newQueue)
   845  	}
   847  	replicas := topoMap.Replicas()
   848  	majority := topoMap.MajorityReplicas()
   850  	firstConnectConsistencyLevel := s.opts.ClusterConnectConsistencyLevel()
   851  	if firstConnectConsistencyLevel == topology.ConnectConsistencyLevelNone {
   852  		// Return immediately if no connect consistency required
   853  		return queues, replicas, majority, nil
   854  	}
   856  	connectConsistencyLevel := firstConnectConsistencyLevel
   857  	if connectConsistencyLevel == topology.ConnectConsistencyLevelAny {
   858  		// If level any specified, first attempt all then proceed lowering requirement
   859  		connectConsistencyLevel = topology.ConnectConsistencyLevelAll
   860  	}
   862  	// Abort if we do not connect
   863  	connected := false
   864  	defer func() {
   865  		if !connected {
   866  			for _, queue := range newQueues {
   867  				queue.Close()
   868  			}
   869  		}
   870  	}()
   872  	for {
   873  		if now := s.nowFn(); now.Sub(start) >= s.opts.ClusterConnectTimeout() {
   874  			switch firstConnectConsistencyLevel {
   875  			case topology.ConnectConsistencyLevelAny:
   876  				// If connecting with connect any strategy then keep
   877  				// trying but lower consistency requirement
   878  				start = now
   879  				connectConsistencyLevel--
   880  				if connectConsistencyLevel == topology.ConnectConsistencyLevelNone {
   881  					// Already tried to resolve all consistency requirements, just
   882  					// return successfully at this point
   883  					err := fmt.Errorf("timed out connecting, returning success")
   884  					s.log.Warn("cluster connect with consistency any", zap.Error(err))
   885  					connected = true
   886  					return queues, replicas, majority, nil
   887  				}
   888  			default:
   889  				// Timed out connecting to a specific consistency requirement
   890  				return nil, 0, 0, ErrClusterConnectTimeout
   891  			}
   892  		}
   894  		var level topology.ConsistencyLevel
   895  		switch connectConsistencyLevel {
   896  		case topology.ConnectConsistencyLevelAll:
   897  			level = topology.ConsistencyLevelAll
   898  		case topology.ConnectConsistencyLevelMajority:
   899  			level = topology.ConsistencyLevelMajority
   900  		case topology.ConnectConsistencyLevelOne:
   901  			level = topology.ConsistencyLevelOne
   902  		default:
   903  			return nil, 0, 0, errSessionInvalidConnectClusterConnectConsistencyLevel
   904  		}
   905  		clusterAvailable, err := s.clusterAvailabilityWithQueuesAndMap(level,
   906  			queues, topoMap)
   907  		if err != nil {
   908  			return nil, 0, 0, err
   909  		}
   910  		if clusterAvailable {
   911  			// All done
   912  			break
   913  		}
   914  		time.Sleep(clusterConnectWaitInterval)
   915  	}
   917  	connected = true
   918  	return queues, replicas, majority, nil
   919  }
   921  func (s *session) WriteClusterAvailability() (bool, error) {
   922  	level := s.opts.WriteConsistencyLevel()
   923  	return s.clusterAvailability(level)
   924  }
   926  func (s *session) ReadClusterAvailability() (bool, error) {
   927  	var convertedConsistencyLevel topology.ConsistencyLevel
   928  	level := s.opts.ReadConsistencyLevel()
   929  	switch level {
   930  	case topology.ReadConsistencyLevelNone:
   931  		// Already ready.
   932  		return true, nil
   933  	case topology.ReadConsistencyLevelOne:
   934  		convertedConsistencyLevel = topology.ConsistencyLevelOne
   935  	case topology.ReadConsistencyLevelUnstrictMajority:
   936  		convertedConsistencyLevel = topology.ConsistencyLevelOne
   937  	case topology.ReadConsistencyLevelMajority:
   938  		convertedConsistencyLevel = topology.ConsistencyLevelMajority
   939  	case topology.ReadConsistencyLevelUnstrictAll:
   940  		convertedConsistencyLevel = topology.ConsistencyLevelOne
   941  	case topology.ReadConsistencyLevelAll:
   942  		convertedConsistencyLevel = topology.ConsistencyLevelAll
   943  	default:
   944  		return false, fmt.Errorf("unknown consistency level: %d", level)
   945  	}
   946  	return s.clusterAvailability(convertedConsistencyLevel)
   947  }
   949  func (s *session) clusterAvailability(
   950  	level topology.ConsistencyLevel,
   951  ) (bool, error) {
   952  	s.state.RLock()
   953  	queues := s.state.queues
   954  	topoMap, err := s.topologyMapWithStateRLock()
   955  	s.state.RUnlock()
   956  	if err != nil {
   957  		return false, err
   958  	}
   959  	return s.clusterAvailabilityWithQueuesAndMap(level, queues, topoMap)
   960  }
   962  func (s *session) clusterAvailabilityWithQueuesAndMap(
   963  	level topology.ConsistencyLevel,
   964  	queues []hostQueue,
   965  	topoMap topology.Map,
   966  ) (bool, error) {
   967  	shards := topoMap.ShardSet().AllIDs()
   968  	minConnectionCount := s.opts.MinConnectionCount()
   969  	replicas := topoMap.Replicas()
   970  	majority := topoMap.MajorityReplicas()
   972  	for _, shardID := range shards {
   973  		shardReplicasAvailable := 0
   974  		routeErr := topoMap.RouteShardForEach(shardID, func(idx int, _ shard.Shard, _ topology.Host) {
   975  			if queues[idx].ConnectionCount() >= minConnectionCount {
   976  				shardReplicasAvailable++
   977  			}
   978  		})
   979  		if routeErr != nil {
   980  			return false, routeErr
   981  		}
   982  		var clusterAvailableForShard bool
   983  		switch level {
   984  		case topology.ConsistencyLevelAll:
   985  			clusterAvailableForShard = shardReplicasAvailable == replicas
   986  		case topology.ConsistencyLevelMajority:
   987  			clusterAvailableForShard = shardReplicasAvailable >= majority
   988  		case topology.ConsistencyLevelOne:
   989  			clusterAvailableForShard = shardReplicasAvailable > 0
   990  		default:
   991  			return false, fmt.Errorf("unknown consistency level: %d", level)
   992  		}
   993  		if !clusterAvailableForShard {
   994  			return false, nil
   995  		}
   996  	}
   998  	return true, nil
   999  }
  1001  func (s *session) setTopologyWithLock(topoMap topology.Map, queues []hostQueue, replicas, majority int) {
  1002  	prevQueues := s.state.queues
  1004  	newQueuesByHostID := make(map[string]hostQueue, len(queues))
  1005  	for _, queue := range queues {
  1006  		newQueuesByHostID[queue.Host().ID()] = queue
  1007  	}
  1009  	s.state.queues = queues
  1010  	s.state.queuesByHostID = newQueuesByHostID
  1012  	s.state.topoMap = topoMap
  1014  	s.state.replicas = replicas
  1015  	s.state.majority = majority
  1017  	// If the number of hostQueues has changed then we need to recreate the fetch
  1018  	// batch op array pool as it must be the exact length of the queues as we index
  1019  	// directly into the return array in fetch calls.
  1020  	if len(queues) != len(prevQueues) {
  1021  		poolOpts := pool.NewObjectPoolOptions().
  1022  			SetSize(int(s.opts.FetchBatchOpPoolSize())).
  1023  			SetDynamic(s.opts.FetchBatchOpPoolSize().IsDynamic()).
  1024  			SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1025  				s.scope.SubScope("fetch-batch-op-array-array-pool"),
  1026  			))
  1027  		s.pools.fetchBatchOpArrayArray = newFetchBatchOpArrayArrayPool(
  1028  			poolOpts,
  1029  			len(queues),
  1030  			int(s.opts.FetchBatchOpPoolSize())/len(queues))
  1031  		s.pools.fetchBatchOpArrayArray.Init()
  1032  	}
  1034  	if s.pools.multiReaderIteratorArray == nil {
  1035  		s.pools.multiReaderIteratorArray = encoding.NewMultiReaderIteratorArrayPool([]pool.Bucket{
  1036  			{
  1037  				Capacity: replicas,
  1038  				Count:    s.opts.SeriesIteratorPoolSize(),
  1039  			},
  1040  		})
  1041  		s.pools.multiReaderIteratorArray.Init()
  1042  	}
  1043  	if s.pools.readerSliceOfSlicesIterator == nil {
  1044  		size := int(s.opts.SeriesIteratorPoolSize())
  1045  		if !s.opts.SeriesIteratorPoolSize().IsDynamic() {
  1046  			size = replicas * int(s.opts.SeriesIteratorPoolSize())
  1047  		}
  1048  		poolOpts := pool.NewObjectPoolOptions().
  1049  			SetSize(size).
  1050  			SetDynamic(s.opts.SeriesIteratorPoolSize().IsDynamic()).
  1051  			SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1052  				s.scope.SubScope("reader-slice-of-slices-iterator-pool"),
  1053  			))
  1054  		s.pools.readerSliceOfSlicesIterator = newReaderSliceOfSlicesIteratorPool(poolOpts)
  1055  		s.pools.readerSliceOfSlicesIterator.Init()
  1056  	}
  1057  	if s.pools.multiReaderIterator == nil {
  1058  		size := int(s.opts.SeriesIteratorPoolSize())
  1059  		if !s.opts.SeriesIteratorPoolSize().IsDynamic() {
  1060  			size = replicas * int(s.opts.SeriesIteratorPoolSize())
  1061  		}
  1062  		poolOpts := pool.NewObjectPoolOptions().
  1063  			SetSize(size).
  1064  			SetDynamic(s.opts.SeriesIteratorPoolSize().IsDynamic()).
  1065  			SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1066  				s.scope.SubScope("multi-reader-iterator-pool"),
  1067  			))
  1068  		s.pools.multiReaderIterator = encoding.NewMultiReaderIteratorPool(poolOpts)
  1069  		s.pools.multiReaderIterator.Init(s.opts.ReaderIteratorAllocate())
  1070  	}
  1071  	if replicas > len(s.metrics.writeNodesRespondingErrors) {
  1072  		curr := len(s.metrics.writeNodesRespondingErrors)
  1073  		for i := curr; i < replicas; i++ {
  1074  			tags := map[string]string{"nodes": fmt.Sprintf("%d", i+1)}
  1075  			name := "write.nodes-responding-error"
  1076  			serverErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
  1077  				"error_type": "server_error",
  1078  			})
  1079  			badRequestErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
  1080  				"error_type": "bad_request_error",
  1081  			})
  1082  			s.metrics.writeNodesRespondingErrors =
  1083  				append(s.metrics.writeNodesRespondingErrors, serverErrsSubScope.Counter(name))
  1084  			s.metrics.writeNodesRespondingBadRequestErrors =
  1085  				append(s.metrics.writeNodesRespondingBadRequestErrors, badRequestErrsSubScope.Counter(name))
  1086  		}
  1087  	}
  1088  	if replicas > len(s.metrics.fetchNodesRespondingErrors) {
  1089  		curr := len(s.metrics.fetchNodesRespondingErrors)
  1090  		for i := curr; i < replicas; i++ {
  1091  			tags := map[string]string{"nodes": fmt.Sprintf("%d", i+1)}
  1092  			name := "fetch.nodes-responding-error"
  1093  			serverErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
  1094  				"error_type": "server_error",
  1095  			})
  1096  			badRequestErrsSubScope := s.scope.Tagged(tags).Tagged(map[string]string{
  1097  				"error_type": "bad_request_error",
  1098  			})
  1099  			s.metrics.fetchNodesRespondingErrors =
  1100  				append(s.metrics.fetchNodesRespondingErrors, serverErrsSubScope.Counter(name))
  1101  			s.metrics.fetchNodesRespondingBadRequestErrors =
  1102  				append(s.metrics.fetchNodesRespondingBadRequestErrors, badRequestErrsSubScope.Counter(name))
  1103  		}
  1104  	}
  1106  	// Asynchronously close the set of host queues no longer in use
  1107  	go func() {
  1108  		for _, queue := range prevQueues {
  1109  			newQueue, ok := newQueuesByHostID[queue.Host().ID()]
  1110  			if !ok || newQueue != queue {
  1111  				queue.Close()
  1112  			}
  1113  		}
  1114  	}()
  1116  	s.log.Info("successfully updated topology",
  1117  		zap.Int("numHosts", topoMap.HostsLen()),
  1118  		zap.Int("numShards", len(topoMap.ShardSet().AllIDs())))
  1119  }
  1121  func (s *session) newHostQueue(host topology.Host, topoMap topology.Map) (hostQueue, error) {
  1122  	// NB(r): Due to hosts being replicas we have:
  1123  	// = replica * numWrites
  1124  	// = total writes to all hosts
  1125  	// We need to pool:
  1126  	// = replica * (numWrites / writeBatchSize)
  1127  	// = number of batch request structs to pool
  1128  	// For purposes of simplifying the options for pooling the write op pool size
  1129  	// represents the number of ops to pool not including replication, this is due
  1130  	// to the fact that the ops are shared between the different host queue replicas.
  1131  	writeOpPoolSize := s.opts.WriteOpPoolSize()
  1132  	if s.opts.WriteTaggedOpPoolSize() > writeOpPoolSize {
  1133  		writeOpPoolSize = s.opts.WriteTaggedOpPoolSize()
  1134  	}
  1135  	totalBatches := topoMap.Replicas() *
  1136  		int(math.Ceil(float64(writeOpPoolSize)/float64(s.opts.WriteBatchSize())))
  1137  	hostBatches := int(math.Ceil(float64(totalBatches) / float64(topoMap.HostsLen())))
  1139  	writeBatchRequestPoolOpts := pool.NewObjectPoolOptions().
  1140  		SetSize(hostBatches).
  1141  		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1142  			s.scope.SubScope("write-batch-request-pool"),
  1143  		))
  1144  	writeBatchRequestPool := newWriteBatchRawRequestPool(writeBatchRequestPoolOpts)
  1145  	writeBatchRequestPool.Init()
  1146  	writeBatchV2RequestPool := newWriteBatchRawV2RequestPool(writeBatchRequestPoolOpts)
  1147  	writeBatchV2RequestPool.Init()
  1149  	writeTaggedBatchRequestPoolOpts := pool.NewObjectPoolOptions().
  1150  		SetSize(hostBatches).
  1151  		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1152  			s.scope.SubScope("write-tagged-batch-request-pool"),
  1153  		))
  1154  	writeTaggedBatchRequestPool := newWriteTaggedBatchRawRequestPool(writeTaggedBatchRequestPoolOpts)
  1155  	writeTaggedBatchRequestPool.Init()
  1156  	writeTaggedBatchV2RequestPool := newWriteTaggedBatchRawV2RequestPool(writeBatchRequestPoolOpts)
  1157  	writeTaggedBatchV2RequestPool.Init()
  1159  	writeBatchRawRequestElementArrayPoolOpts := pool.NewObjectPoolOptions().
  1160  		SetSize(hostBatches).
  1161  		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1162  			s.scope.SubScope("id-datapoint-array-pool"),
  1163  		))
  1164  	writeBatchRawRequestElementArrayPool := newWriteBatchRawRequestElementArrayPool(
  1165  		writeBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
  1166  	writeBatchRawRequestElementArrayPool.Init()
  1167  	writeBatchRawV2RequestElementArrayPool := newWriteBatchRawV2RequestElementArrayPool(
  1168  		writeBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
  1169  	writeBatchRawV2RequestElementArrayPool.Init()
  1171  	writeTaggedBatchRawRequestElementArrayPoolOpts := pool.NewObjectPoolOptions().
  1172  		SetSize(hostBatches).
  1173  		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1174  			s.scope.SubScope("id-tagged-datapoint-array-pool"),
  1175  		))
  1176  	writeTaggedBatchRawRequestElementArrayPool := newWriteTaggedBatchRawRequestElementArrayPool(
  1177  		writeTaggedBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
  1178  	writeTaggedBatchRawRequestElementArrayPool.Init()
  1179  	writeTaggedBatchRawV2RequestElementArrayPool := newWriteTaggedBatchRawV2RequestElementArrayPool(
  1180  		writeTaggedBatchRawRequestElementArrayPoolOpts, s.opts.WriteBatchSize())
  1181  	writeTaggedBatchRawV2RequestElementArrayPool.Init()
  1183  	fetchBatchRawV2RequestPoolOpts := pool.NewObjectPoolOptions().
  1184  		SetSize(hostBatches).
  1185  		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1186  			s.scope.SubScope("fetch-batch-request-pool"),
  1187  		))
  1188  	fetchBatchRawV2RequestPool := newFetchBatchRawV2RequestPool(fetchBatchRawV2RequestPoolOpts)
  1189  	fetchBatchRawV2RequestPool.Init()
  1191  	fetchBatchRawV2RequestElementArrayPoolOpts := pool.NewObjectPoolOptions().
  1192  		SetSize(hostBatches).
  1193  		SetInstrumentOptions(s.opts.InstrumentOptions().SetMetricsScope(
  1194  			s.scope.SubScope("fetch-batch-request-array-pool"),
  1195  		))
  1196  	fetchBatchRawV2RequestElementArrPool := newFetchBatchRawV2RequestElementArrayPool(
  1197  		fetchBatchRawV2RequestElementArrayPoolOpts, s.opts.FetchBatchSize(),
  1198  	)
  1199  	fetchBatchRawV2RequestElementArrPool.Init()
  1201  	hostQueue, err := s.newHostQueueFn(host, hostQueueOpts{
  1202  		writeBatchRawRequestPool:                     writeBatchRequestPool,
  1203  		writeBatchRawV2RequestPool:                   writeBatchV2RequestPool,
  1204  		writeBatchRawRequestElementArrayPool:         writeBatchRawRequestElementArrayPool,
  1205  		writeBatchRawV2RequestElementArrayPool:       writeBatchRawV2RequestElementArrayPool,
  1206  		writeTaggedBatchRawRequestPool:               writeTaggedBatchRequestPool,
  1207  		writeTaggedBatchRawV2RequestPool:             writeTaggedBatchV2RequestPool,
  1208  		writeTaggedBatchRawRequestElementArrayPool:   writeTaggedBatchRawRequestElementArrayPool,
  1209  		writeTaggedBatchRawV2RequestElementArrayPool: writeTaggedBatchRawV2RequestElementArrayPool,
  1210  		fetchBatchRawV2RequestPool:                   fetchBatchRawV2RequestPool,
  1211  		fetchBatchRawV2RequestElementArrayPool:       fetchBatchRawV2RequestElementArrPool,
  1212  		opts:                                         s.opts,
  1213  	})
  1214  	if err != nil {
  1215  		return nil, err
  1216  	}
  1217  	hostQueue.Open()
  1218  	return hostQueue, nil
  1219  }
  1221  func (s *session) Write(
  1222  	nsID, id ident.ID,
  1223  	t xtime.UnixNano,
  1224  	value float64,
  1225  	unit xtime.Unit,
  1226  	annotation []byte,
  1227  ) error {
  1228  	w := s.pools.writeAttempt.Get()
  1229  	w.args.attemptType = untaggedWriteAttemptType
  1230  	w.args.namespace, = nsID, id
  1231  	w.args.tags = ident.EmptyTagIterator
  1232  	w.args.t = t
  1233  	w.args.value, w.args.unit, w.args.annotation = value, unit, annotation
  1234  	err := s.writeRetrier.Attempt(w.attemptFn)
  1235  	s.pools.writeAttempt.Put(w)
  1236  	return err
  1237  }
  1239  func (s *session) WriteTagged(
  1240  	nsID, id ident.ID,
  1241  	tags ident.TagIterator,
  1242  	t xtime.UnixNano,
  1243  	value float64,
  1244  	unit xtime.Unit,
  1245  	annotation []byte,
  1246  ) error {
  1247  	w := s.pools.writeAttempt.Get()
  1248  	w.args.attemptType = taggedWriteAttemptType
  1249  	w.args.namespace,, w.args.tags = nsID, id, tags
  1250  	w.args.t = t
  1251  	w.args.value, w.args.unit, w.args.annotation = value, unit, annotation
  1252  	err := s.writeRetrier.Attempt(w.attemptFn)
  1253  	s.pools.writeAttempt.Put(w)
  1254  	return err
  1255  }
  1257  func (s *session) writeAttempt(
  1258  	wType writeAttemptType,
  1259  	nsID, id ident.ID,
  1260  	inputTags ident.TagIterator,
  1261  	t xtime.UnixNano,
  1262  	value float64,
  1263  	unit xtime.Unit,
  1264  	annotation []byte,
  1265  ) error {
  1266  	startWriteAttempt := s.nowFn()
  1268  	timeType, timeTypeErr := convert.ToTimeType(unit)
  1269  	if timeTypeErr != nil {
  1270  		return timeTypeErr
  1271  	}
  1273  	timestamp, timestampErr := convert.ToValue(t, timeType)
  1274  	if timestampErr != nil {
  1275  		return timestampErr
  1276  	}
  1278  	s.state.RLock()
  1279  	if s.state.status != statusOpen {
  1280  		s.state.RUnlock()
  1281  		return ErrSessionStatusNotOpen
  1282  	}
  1284  	state, majority, enqueued, err := s.writeAttemptWithRLock(
  1285  		wType, nsID, id, inputTags, timestamp, value, timeType, annotation)
  1286  	s.state.RUnlock()
  1288  	if err != nil {
  1289  		return err
  1290  	}
  1292  	// it's safe to Wait() here, as we still hold the lock on state, after it's
  1293  	// returned from writeAttemptWithRLock.
  1294  	state.Wait()
  1296  	err = s.writeConsistencyResult(state.consistencyLevel, majority, enqueued,
  1297  		enqueued-state.pending, int32(len(state.errors)), state.errors)
  1299  	s.recordWriteMetrics(err, int32(len(state.errors)), startWriteAttempt)
  1301  	// must Unlock before decRef'ing, as the latter releases the writeState back into a
  1302  	// pool if ref count == 0.
  1303  	state.Unlock()
  1304  	state.decRef()
  1306  	return err
  1307  }
  1309  // NB(prateek): the returned writeState, if valid, still holds the lock. Its ownership
  1310  // is transferred to the calling function, and is expected to manage the lifecycle of
  1311  // of the object (including releasing the lock/decRef'ing it).
  1312  func (s *session) writeAttemptWithRLock(
  1313  	wType writeAttemptType,
  1314  	namespace, id ident.ID,
  1315  	inputTags ident.TagIterator,
  1316  	timestamp int64,
  1317  	value float64,
  1318  	timeType rpc.TimeType,
  1319  	annotation []byte,
  1320  ) (*writeState, int32, int32, error) {
  1321  	var (
  1322  		majority = int32(s.state.majority)
  1323  		enqueued int32
  1324  	)
  1326  	// NB(prateek): We retain an individual copy of the namespace, ID per
  1327  	// writeState, as each writeState tracks the lifecycle of it's resources in
  1328  	// use in the various queues. Tracking per writeAttempt isn't sufficient as
  1329  	// we may enqueue multiple writeStates concurrently depending on retries
  1330  	// and consistency level checks.
  1331  	var tagEncoder serialize.TagEncoder
  1332  	if wType == taggedWriteAttemptType {
  1333  		tagEncoder = s.pools.tagEncoder.Get()
  1334  		if err := tagEncoder.Encode(inputTags); err != nil {
  1335  			tagEncoder.Finalize()
  1336  			return nil, 0, 0, err
  1337  		}
  1338  	}
  1339  	nsID := s.cloneFinalizable(namespace)
  1340  	tsID := s.cloneFinalizable(id)
  1342  	var (
  1343  		clonedAnnotation      checked.Bytes
  1344  		clonedAnnotationBytes []byte
  1345  	)
  1346  	if len(annotation) > 0 {
  1347  		clonedAnnotation = s.pools.checkedBytes.Get(len(annotation))
  1348  		clonedAnnotation.IncRef()
  1349  		clonedAnnotation.AppendAll(annotation)
  1350  		clonedAnnotationBytes = clonedAnnotation.Bytes()
  1351  	}
  1353  	var op writeOp
  1354  	switch wType {
  1355  	case untaggedWriteAttemptType:
  1356  		wop := s.pools.writeOperation.Get()
  1357  		wop.namespace = nsID
  1358  		wop.shardID = s.state.topoMap.ShardSet().Lookup(tsID)
  1359  		wop.request.ID = tsID.Bytes()
  1360  		wop.request.Datapoint.Value = value
  1361  		wop.request.Datapoint.Timestamp = timestamp
  1362  		wop.request.Datapoint.TimestampTimeType = timeType
  1363  		wop.request.Datapoint.Annotation = clonedAnnotationBytes
  1364  		wop.requestV2.ID = wop.request.ID
  1365  		wop.requestV2.Datapoint = wop.request.Datapoint
  1366  		op = wop
  1367  	case taggedWriteAttemptType:
  1368  		wop := s.pools.writeTaggedOperation.Get()
  1369  		wop.namespace = nsID
  1370  		wop.shardID = s.state.topoMap.ShardSet().Lookup(tsID)
  1371  		wop.request.ID = tsID.Bytes()
  1372  		encodedTagBytes, ok := tagEncoder.Data()
  1373  		if !ok {
  1374  			return nil, 0, 0, errUnableToEncodeTags
  1375  		}
  1376  		wop.request.EncodedTags = encodedTagBytes.Bytes()
  1377  		wop.request.Datapoint.Value = value
  1378  		wop.request.Datapoint.Timestamp = timestamp
  1379  		wop.request.Datapoint.TimestampTimeType = timeType
  1380  		wop.request.Datapoint.Annotation = clonedAnnotationBytes
  1381  		wop.requestV2.ID = wop.request.ID
  1382  		wop.requestV2.EncodedTags = wop.request.EncodedTags
  1383  		wop.requestV2.Datapoint = wop.request.Datapoint
  1384  		op = wop
  1385  	default:
  1386  		// should never happen
  1387  		return nil, 0, 0, errUnknownWriteAttemptType
  1388  	}
  1390  	state := s.pools.writeState.Get()
  1391  	state.consistencyLevel = s.state.writeLevel
  1392  	state.shardsLeavingCountTowardsConsistency = s.shardsLeavingCountTowardsConsistency
  1393  	state.topoMap = s.state.topoMap
  1394  	state.incRef()
  1396  	// todo@bl: Can we combine the writeOpPool and the writeStatePool?
  1397  	state.op, state.majority = op, majority
  1398  	state.nsID, state.tsID, state.tagEncoder, state.annotation = nsID, tsID, tagEncoder, clonedAnnotation
  1399  	op.SetCompletionFn(state.completionFn)
  1401  	if err := s.state.topoMap.RouteForEach(tsID, func(
  1402  		idx int,
  1403  		hostShard shard.Shard,
  1404  		host topology.Host,
  1405  	) {
  1406  		if !s.writeShardsInitializing && hostShard.State() == shard.Initializing {
  1407  			// NB(r): Do not write to this node as the shard is initializing
  1408  			// and writing to intialized shards is not enabled (also
  1409  			// depending on your config initializing shards won't count
  1410  			// towards quorum, current defaults, so this is ok consistency wise).
  1411  			return
  1412  		}
  1414  		// Count pending write requests before we enqueue the completion fns,
  1415  		// which rely on the count when executing
  1416  		state.pending++
  1417  		state.queues = append(state.queues, s.state.queues[idx])
  1418  	}); err != nil {
  1419  		state.decRef()
  1420  		return nil, 0, 0, err
  1421  	}
  1423  	state.Lock()
  1424  	for i := range state.queues {
  1425  		state.incRef()
  1426  		if err := state.queues[i].Enqueue(state.op); err != nil {
  1427  			state.Unlock()
  1428  			state.decRef()
  1430  			// NB(r): if this happens we have a bug, once we are in the read
  1431  			// lock the current queues should never be closed
  1432  			s.log.Error("[invariant violated] failed to enqueue write", zap.Error(err))
  1433  			return nil, 0, 0, err
  1434  		}
  1435  		enqueued++
  1436  	}
  1438  	// NB(prateek): the current go-routine still holds a lock on the
  1439  	// returned writeState object.
  1440  	return state, majority, enqueued, nil
  1441  }
  1443  func (s *session) Fetch(
  1444  	nsID ident.ID,
  1445  	id ident.ID,
  1446  	startInclusive, endExclusive xtime.UnixNano,
  1447  ) (encoding.SeriesIterator, error) {
  1448  	tsIDs := ident.NewIDsIterator(id)
  1449  	results, err := s.FetchIDs(nsID, tsIDs, startInclusive, endExclusive)
  1450  	if err != nil {
  1451  		return nil, err
  1452  	}
  1453  	mutableResults := results.(encoding.MutableSeriesIterators)
  1454  	iters := mutableResults.Iters()
  1455  	iter := iters[0]
  1456  	// Reset to zero so that when we close this results set the iter doesn't get closed
  1457  	mutableResults.Reset(0)
  1458  	mutableResults.Close()
  1459  	return iter, nil
  1460  }
  1462  func (s *session) FetchIDs(
  1463  	nsID ident.ID,
  1464  	ids ident.Iterator,
  1465  	startInclusive, endExclusive xtime.UnixNano,
  1466  ) (encoding.SeriesIterators, error) {
  1467  	f := s.pools.fetchAttempt.Get()
  1468  	f.args.namespace, f.args.ids = nsID, ids
  1469  	f.args.start = startInclusive
  1470  	f.args.end = endExclusive
  1471  	err := s.fetchRetrier.Attempt(f.attemptFn)
  1472  	result := f.result
  1473  	s.pools.fetchAttempt.Put(f)
  1474  	return result, err
  1475  }
  1477  func (s *session) Aggregate(
  1478  	ctx gocontext.Context,
  1479  	ns ident.ID,
  1480  	q index.Query,
  1481  	opts index.AggregationOptions,
  1482  ) (AggregatedTagsIterator, FetchResponseMetadata, error) {
  1483  	f := s.pools.aggregateAttempt.Get()
  1484  	f.args.ctx = ctx
  1485  	f.args.ns = ns
  1486  	f.args.query = q
  1487  	f.args.opts = opts
  1488  	err := s.fetchRetrier.Attempt(f.attemptFn)
  1489  	iter, metadata := f.resultIter, f.resultMetadata
  1490  	s.pools.aggregateAttempt.Put(f)
  1491  	return iter, metadata, err
  1492  }
  1494  func (s *session) aggregateAttempt(
  1495  	ctx gocontext.Context,
  1496  	ns ident.ID,
  1497  	q index.Query,
  1498  	opts index.AggregationOptions,
  1499  ) (AggregatedTagsIterator, FetchResponseMetadata, error) {
  1500  	s.state.RLock()
  1501  	if s.state.status != statusOpen {
  1502  		s.state.RUnlock()
  1503  		return nil, FetchResponseMetadata{}, ErrSessionStatusNotOpen
  1504  	}
  1506  	// NB(prateek): we have to clone the namespace, as we cannot guarantee the lifecycle
  1507  	// of the hostQueues responding is less than the lifecycle of the current method.
  1508  	nsClone :=
  1510  	req, err := convert.ToRPCAggregateQueryRawRequest(nsClone, q, opts)
  1511  	if err != nil {
  1512  		s.state.RUnlock()
  1513  		nsClone.Finalize()
  1514  		return nil, FetchResponseMetadata{}, xerrors.NewNonRetryableError(err)
  1515  	}
  1516  	if req.SeriesLimit != nil && opts.InstanceMultiple > 0 {
  1517  		topo := s.state.topoMap
  1518  		iPerReplica := int64(len(topo.Hosts()) / topo.Replicas())
  1519  		iSeriesLimit := int64(float32(opts.SeriesLimit)*opts.InstanceMultiple) / iPerReplica
  1520  		if iSeriesLimit < *req.SeriesLimit {
  1521  			req.SeriesLimit = &iSeriesLimit
  1522  		}
  1523  	}
  1525  	fetchState, err := s.newFetchStateWithRLock(ctx, nsClone, newFetchStateOpts{
  1526  		stateType:            aggregateFetchState,
  1527  		aggregateRequest:     req,
  1528  		startInclusive:       opts.StartInclusive,
  1529  		endExclusive:         opts.EndExclusive,
  1530  		readConsistencyLevel: opts.ReadConsistencyLevel,
  1531  	})
  1532  	s.state.RUnlock()
  1534  	if err != nil {
  1535  		return nil, FetchResponseMetadata{}, err
  1536  	}
  1538  	// it's safe to Wait() here, as we still hold the lock on fetchState, after it's
  1539  	// returned from newFetchStateWithRLock.
  1540  	fetchState.Wait()
  1542  	// must Unlock before calling `asEncodingSeriesIterators` as the latter needs to acquire
  1543  	// the fetchState Lock
  1544  	fetchState.Unlock()
  1545  	iters, meta, err := fetchState.asAggregatedTagsIterator(s.pools, opts.SeriesLimit)
  1547  	// must Unlock() before decRef'ing, as the latter releases the fetchState back into a
  1548  	// pool if ref count == 0.
  1549  	fetchState.decRef()
  1551  	return iters, meta, err
  1552  }
  1554  func (s *session) FetchTagged(
  1555  	ctx gocontext.Context,
  1556  	ns ident.ID,
  1557  	q index.Query,
  1558  	opts index.QueryOptions,
  1559  ) (encoding.SeriesIterators, FetchResponseMetadata, error) {
  1560  	f := s.pools.fetchTaggedAttempt.Get()
  1561  	f.args.ctx = ctx
  1562  	f.args.ns = ns
  1563  	f.args.query = q
  1564  	f.args.opts = opts
  1565  	err := s.fetchRetrier.Attempt(f.dataAttemptFn)
  1566  	iters, metadata := f.dataResultIters, f.dataResultMetadata
  1567  	s.pools.fetchTaggedAttempt.Put(f)
  1568  	return iters, metadata, err
  1569  }
  1571  func (s *session) FetchTaggedIDs(
  1572  	ctx gocontext.Context,
  1573  	ns ident.ID,
  1574  	q index.Query,
  1575  	opts index.QueryOptions,
  1576  ) (TaggedIDsIterator, FetchResponseMetadata, error) {
  1577  	f := s.pools.fetchTaggedAttempt.Get()
  1578  	f.args.ctx = ctx
  1579  	f.args.ns = ns
  1580  	f.args.query = q
  1581  	f.args.opts = opts
  1582  	err := s.fetchRetrier.Attempt(f.idsAttemptFn)
  1583  	iter, metadata := f.idsResultIter, f.idsResultMetadata
  1584  	s.pools.fetchTaggedAttempt.Put(f)
  1585  	return iter, metadata, err
  1586  }
// fetchTaggedAttempt performs a single attempt of a tagged fetch that returns
// series data (the RPC request is built with fetchData = true).
//
// It resolves the namespace context, verifies under the session read lock that
// the session is open, converts the query into an RPC request, hands the
// request to newFetchStateWithRLock, waits for the fetch state to complete and
// converts the collected responses into series iterators.
//
// It returns ErrSessionStatusNotOpen when the session is not open, and wraps
// request conversion failures as non-retryable errors.
func (s *session) fetchTaggedAttempt(
	ctx gocontext.Context,
	ns ident.ID,
	q index.Query,
	opts index.QueryOptions,
) (encoding.SeriesIterators, FetchResponseMetadata, error) {
	nsCtx, err := s.nsCtxFor(ns)
	if err != nil {
		return nil, FetchResponseMetadata{}, err
	}
	s.state.RLock()
	if s.state.status != statusOpen {
		s.state.RUnlock()
		return nil, FetchResponseMetadata{}, ErrSessionStatusNotOpen
	}

	// NB(prateek): we have to clone the namespace, as we cannot guarantee the lifecycle
	// of the hostQueues responding is less than the lifecycle of the current method.
	nsClone :=

	// FOLLOWUP(prateek): currently both `index.Query` and the returned request depend on
	// native, un-pooled types; so we do not Clone() either. We will start doing so
	// in a follow-up, including transferring ownership of the Clone()'d value to
	// the `fetchState`.
	const fetchData = true
	req, err := convert.ToRPCFetchTaggedRequest(nsClone, q, opts, fetchData)
	if err != nil {
		// The clone was never handed to a fetchState, so it is finalized here.
		s.state.RUnlock()
		nsClone.Finalize()
		return nil, FetchResponseMetadata{}, xerrors.NewNonRetryableError(err)
	}
	// When an instance multiple is set, lower the request's series limit to
	// SeriesLimit*InstanceMultiple divided by the number of hosts per replica.
	// The limit already on the request is never raised.
	if req.SeriesLimit != nil && opts.InstanceMultiple > 0 {
		topo := s.state.topoMap
		// NOTE(review): this integer division yields 0 when the topology has fewer
		// hosts than replicas, which would make the division on the next line
		// panic — confirm the topology guarantees hosts >= replicas > 0.
		iPerReplica := int64(len(topo.Hosts()) / topo.Replicas())
		iSeriesLimit := int64(float32(opts.SeriesLimit)*opts.InstanceMultiple) / iPerReplica
		if iSeriesLimit < *req.SeriesLimit {
			req.SeriesLimit = &iSeriesLimit
		}
	}

	fetchState, err := s.newFetchStateWithRLock(ctx, nsClone, newFetchStateOpts{
		stateType:            fetchTaggedFetchState,
		fetchTaggedRequest:   req,
		startInclusive:       opts.StartInclusive,
		endExclusive:         opts.EndExclusive,
		readConsistencyLevel: opts.ReadConsistencyLevel,
	})
	s.state.RUnlock()

	if err != nil {
		return nil, FetchResponseMetadata{}, err
	}

	// it's safe to Wait() here, as we still hold the lock on fetchState, after it's
	// returned from newFetchStateWithRLock.
	fetchState.Wait()

	// must Unlock before calling `asEncodingSeriesIterators` as the latter needs to acquire
	// the fetchState Lock
	fetchState.Unlock()

	// A per-query equal-timestamp strategy, when provided, overrides the one
	// from the session's iteration options.
	iterOpts := s.opts.IterationOptions()
	if opts.IterateEqualTimestampStrategy != nil {
		iterOpts.IterateEqualTimestampStrategy = *opts.IterateEqualTimestampStrategy
	}

	iters, metadata, err := fetchState.asEncodingSeriesIterators(
		s.pools, nsCtx.Schema, iterOpts, opts.SeriesLimit)

	// must Unlock() before decRef'ing, as the latter releases the fetchState back into a
	// pool if ref count == 0.
	fetchState.decRef()

	return iters, metadata, err
}
// fetchTaggedIDsAttempt performs a single attempt of a tagged fetch that
// returns only series IDs and tags (the RPC request is built with
// fetchData = false).
//
// It verifies under the session read lock that the session is open, converts
// the query into an RPC request, hands the request to newFetchStateWithRLock,
// waits for the fetch state to complete and converts the collected responses
// into a TaggedIDsIterator.
//
// It returns ErrSessionStatusNotOpen when the session is not open, and wraps
// request conversion failures as non-retryable errors.
func (s *session) fetchTaggedIDsAttempt(
	ctx gocontext.Context,
	ns ident.ID,
	q index.Query,
	opts index.QueryOptions,
) (TaggedIDsIterator, FetchResponseMetadata, error) {
	s.state.RLock()
	if s.state.status != statusOpen {
		s.state.RUnlock()
		return nil, FetchResponseMetadata{}, ErrSessionStatusNotOpen
	}

	// NB(prateek): we have to clone the namespace, as we cannot guarantee the lifecycle
	// of the hostQueues responding is less than the lifecycle of the current method.
	nsClone :=

	// FOLLOWUP(prateek): currently both `index.Query` and the returned request depend on
	// native, un-pooled types; so we do not Clone() either. We will start doing so
	// in a follow-up, including transferring ownership of the Clone()'d value to
	// the `fetchState`.
	const fetchData = false
	req, err := convert.ToRPCFetchTaggedRequest(nsClone, q, opts, fetchData)
	if err != nil {
		// The clone was never handed to a fetchState, so it is finalized here.
		s.state.RUnlock()
		nsClone.Finalize()
		return nil, FetchResponseMetadata{}, xerrors.NewNonRetryableError(err)
	}
	// When an instance multiple is set, lower the request's series limit to
	// SeriesLimit*InstanceMultiple divided by the number of hosts per replica.
	// The limit already on the request is never raised.
	if req.SeriesLimit != nil && opts.InstanceMultiple > 0 {
		topo := s.state.topoMap
		// NOTE(review): this integer division yields 0 when the topology has fewer
		// hosts than replicas, which would make the division on the next line
		// panic — confirm the topology guarantees hosts >= replicas > 0.
		iPerReplica := int64(len(topo.Hosts()) / topo.Replicas())
		iSeriesLimit := int64(float32(opts.SeriesLimit)*opts.InstanceMultiple) / iPerReplica
		if iSeriesLimit < *req.SeriesLimit {
			req.SeriesLimit = &iSeriesLimit
		}
	}

	fetchState, err := s.newFetchStateWithRLock(ctx, nsClone, newFetchStateOpts{
		stateType:            fetchTaggedFetchState,
		fetchTaggedRequest:   req,
		startInclusive:       opts.StartInclusive,
		endExclusive:         opts.EndExclusive,
		readConsistencyLevel: opts.ReadConsistencyLevel,
	})
	s.state.RUnlock()

	if err != nil {
		return nil, FetchResponseMetadata{}, err
	}

	// it's safe to Wait() here, as we still hold the lock on fetchState, after it's
	// returned from newFetchStateWithRLock.
	fetchState.Wait()

	// must Unlock before calling `asTaggedIDsIterator` as the latter needs to acquire
	// the fetchState Lock
	fetchState.Unlock()
	iter, metadata, err := fetchState.asTaggedIDsIterator(s.pools, opts.SeriesLimit)

	// must Unlock() before decRef'ing, as the latter releases the fetchState back into a
	// pool if ref count == 0.
	fetchState.decRef()

	return iter, metadata, err
}
// newFetchStateOpts carries the parameters newFetchStateWithRLock needs to
// build a fetchState and the op that is enqueued on the host queues.
type newFetchStateOpts struct {
	// stateType selects which kind of op is created (fetch tagged or aggregate).
	stateType fetchStateType
	// startInclusive and endExclusive are passed through to the fetchState
	// when it is reset for the request.
	startInclusive xtime.UnixNano
	endExclusive   xtime.UnixNano
	// readConsistencyLevel is resolved against the session state to pick the
	// read consistency level for the request.
	// NOTE(review): presumably nil falls back to the session default — confirm
	// against readConsistencyLevelWithRLock.
	readConsistencyLevel *topology.ReadConsistencyLevel

	// only valid if stateType == fetchTaggedFetchState
	fetchTaggedRequest rpc.FetchTaggedRequest

	// only valid if stateType == aggregateFetchState
	aggregateRequest rpc.AggregateQueryRawRequest
}
  1742  // NB(prateek): the returned fetchState, if valid, still holds the lock. Its ownership
  1743  // is transferred to the calling function, and is expected to manage the lifecycle of
  1744  // of the object (including releasing the lock/decRef'ing it).
  1745  // NB: ownership of ns is transferred to the returned fetchState object.
  1746  func (s *session) newFetchStateWithRLock(
  1747  	ctx gocontext.Context,
  1748  	ns ident.ID,
  1749  	opts newFetchStateOpts,
  1750  ) (*fetchState, error) {
  1751  	var (
  1752  		topoMap    = s.state.topoMap
  1753  		fetchState = s.pools.fetchState.Get()
  1754  	)
  1755  	fetchState.nsID = ns // transfer ownership to `fetchState`
  1756  	fetchState.incRef()  // indicate current go-routine has a reference to the fetchState
  1758  	readLevel := s.state.readConsistencyLevelWithRLock(opts.readConsistencyLevel)
  1760  	// wire up the operation based on the opts specified
  1761  	var (
  1762  		op     op
  1763  		closer func()
  1764  	)
  1765  	switch opts.stateType {
  1766  	case fetchTaggedFetchState:
  1767  		fetchOp := s.pools.fetchTaggedOp.Get()
  1768  		fetchOp.incRef()        // indicate current go-routine has a reference to the op
  1769  		closer = fetchOp.decRef // release the ref for the current go-routine
  1770  		fetchOp.update(ctx, opts.fetchTaggedRequest, fetchState.completionFn)
  1771  		fetchState.ResetFetchTagged(opts.startInclusive, opts.endExclusive,
  1772  			fetchOp, topoMap, s.state.majority, readLevel)
  1773  		op = fetchOp
  1775  	case aggregateFetchState:
  1776  		aggOp := s.pools.aggregateOp.Get()
  1777  		aggOp.incRef()        // indicate current go-routine has a reference to the op
  1778  		closer = aggOp.decRef // release the ref for the current go-routine
  1779  		aggOp.update(ctx, opts.aggregateRequest, fetchState.completionFn)
  1780  		fetchState.ResetAggregate(opts.startInclusive, opts.endExclusive,
  1781  			aggOp, topoMap, s.state.majority, readLevel)
  1782  		op = aggOp
  1784  	default:
  1785  		fetchState.decRef() // release fetchState
  1786  		instrument.EmitInvariantViolation(s.opts.InstrumentOptions())
  1787  		return nil, xerrors.NewNonRetryableError(instrument.InvariantErrorf(
  1788  			"unknown fetchState type: %v", opts.stateType))
  1789  	}
  1791  	fetchState.Lock()
  1792  	for _, hq := range s.state.queues {
  1793  		// inc to indicate the hostQueue has a reference to `op` which has a ref to the fetchState
  1794  		fetchState.incRef()
  1795  		if err := hq.Enqueue(op); err != nil {
  1796  			fetchState.Unlock()
  1797  			closer()            // release the ref for the current go-routine
  1798  			fetchState.decRef() // release the ref for the hostQueue
  1799  			fetchState.decRef() // release the ref for the current go-routine
  1801  			// NB: if this happens we have a bug, once we are in the read
  1802  			// lock the current queues should never be closed
  1803  			wrappedErr := xerrors.NewNonRetryableError(fmt.Errorf("failed to enqueue in fetchState: %v", err))
  1804  			instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), func(l *zap.Logger) {
  1805  				l.Error(wrappedErr.Error())
  1806  			})
  1807  			return nil, wrappedErr
  1808  		}
  1809  	}
  1811  	closer() // release the ref for the current go-routine
  1813  	// NB(prateek): the calling go-routine still holds the lock and a ref
  1814  	// on the returned fetchState object.
  1815  	return fetchState, nil
  1816  }
// fetchIDsAttempt performs a single attempt at fetching the series named by
// inputIDs in inputNamespace for the range [startInclusive, endExclusive).
//
// For every ID it asks the topology map for the hosts to route to, appends the
// ID to a per-host batched fetch op, and then enqueues all batches on the host
// queues. It blocks until every ID has been resolved, i.e. until the read
// consistency termination condition has been reached for each of them, and
// returns one series iterator per input ID at the ID's input position.
//
// If any ID fails the read consistency check, the first such error is returned.
// On every error return after the iterators have been allocated they are
// closed by the deferred cleanup.
func (s *session) fetchIDsAttempt(
	inputNamespace ident.ID,
	inputIDs ident.Iterator,
	startInclusive, endExclusive xtime.UnixNano,
) (encoding.SeriesIterators, error) {
	nsCtx, err := s.nsCtxFor(inputNamespace)
	if err != nil {
		return nil, err
	}

	var (
		wg                     sync.WaitGroup
		allPending             int32
		routeErr               error
		enqueueErr             error
		resultErrLock          sync.RWMutex
		resultErr              error
		resultErrs             int32
		majority               int32
		numReplicas            int32
		readLevel              topology.ReadConsistencyLevel
		fetchBatchOpsByHostIdx [][]*fetchBatchOp
		success                = false
		startFetchAttempt      = s.nowFn()
	)

	// NB(prateek): need to make a copy of inputNamespace and inputIDs to control
	// their life-cycle within this function.
	namespace :=
	// First, we duplicate the iterator (only the struct referencing the underlying slice,
	// not the slice itself). Need this to be able to iterate the original iterator
	// multiple times in case of retries.
	ids := inputIDs.Duplicate()

	rangeStart, tsErr := convert.ToValue(startInclusive, rpc.TimeType_UNIX_NANOSECONDS)
	if tsErr != nil {
		return nil, tsErr
	}

	rangeEnd, tsErr := convert.ToValue(endExclusive, rpc.TimeType_UNIX_NANOSECONDS)
	if tsErr != nil {
		return nil, tsErr
	}

	s.state.RLock()
	if s.state.status != statusOpen {
		s.state.RUnlock()
		return nil, ErrSessionStatusNotOpen
	}

	iters := encoding.NewSizedSeriesIterators(ids.Remaining())

	defer func() {
		// NB(r): Ensure we cover all edge cases and close the iters in any case
		// of an error being returned
		if !success {
			iters.Close()
		}
	}()

	// NB(r): We must take and return pooled items in the session read lock for the
	// pools that change during a topology update.
	// This is due to when a queue is re-initialized it enqueues a fixed number
	// of entries into the backing channel for the pool and will forever stall
	// on the last few puts if any unexpected entries find their way there
	// while it is filling.
	fetchBatchOpsByHostIdx = s.pools.fetchBatchOpArrayArray.Get()

	// Snapshot the consistency settings while the read lock is held; the
	// completion callbacks below use these copies.
	readLevel = s.state.readLevel
	majority = int32(s.state.majority)
	numReplicas = int32(s.state.replicas)

	// NB(prateek): namespaceAccessors tracks the number of pending accessors for nsID.
	// It is incremented once per routed replica for each requested ID during fetch
	// enqueuing, and once per ID for the allCompletionFn, and is decremented for each
	// replica retrieved, inside completionFn, and once by the allCompletionFn. So we
	// know we can Finalize `namespace` once its value reaches 0.
	namespaceAccessors := int32(0)

	for idx := 0; ids.Next(); idx++ {
		// NB: `success` and `errors` declared below shadow the outer `success`
		// flag and the `errors` package for the remainder of the loop body.
		var (
			idx  = idx // capture loop variable
			tsID =

			// wgIsDone is flipped from 0 to 1 by whichever completion first
			// triggers allCompletionFn for this ID, so it runs only once.
			wgIsDone int32
			// NB(xichen): resultsAccessors and idAccessors get initialized to number of replicas + 1
			// before enqueuing (incremented when iterating over the replicas for this ID), and gets
			// decremented for each replica as well as inside the allCompletionFn so we know when
			// resultsAccessors is 0, results are no longer accessed and it's safe to return results
			// to the pool.
			resultsAccessors int32 = 1
			idAccessors      int32 = 1
			resultsLock      sync.RWMutex
			results          []encoding.MultiReaderIterator
			enqueued         int32
			pending          int32
			success          int32
			errors           []error
			errs             int32
		)

		// increment namespaceAccessors by 1 to indicate it still needs to be handled by the
		// allCompletionFn for tsID.
		atomic.AddInt32(&namespaceAccessors, 1)

		wg.Add(1)
		// allCompletionFn finalizes the outcome for this ID: it evaluates the read
		// consistency result, records either the error or the series iterator,
		// releases its accessor references and signals the wait group.
		allCompletionFn := func() {
			var reportErrors []error
			errsLen := atomic.LoadInt32(&errs)
			if errsLen > 0 {
				resultErrLock.RLock()
				reportErrors = errors[:]
				resultErrLock.RUnlock()
			}
			responded := enqueued - atomic.LoadInt32(&pending)
			err := s.readConsistencyResult(readLevel, majority, enqueued,
				responded, errsLen, reportErrors)
			s.recordFetchMetrics(err, errsLen, startFetchAttempt)
			if err != nil {
				// Only the first consistency error across all IDs is kept as
				// the error returned to the caller.
				resultErrLock.Lock()
				if resultErr == nil {
					resultErr = err
				}
				resultErrs++
				resultErrLock.Unlock()
			} else {
				resultsLock.RLock()
				numItersToInclude := int(success)
				numDesired := topology.NumDesiredForReadConsistency(readLevel, int(numReplicas), int(majority))
				if numDesired < numItersToInclude {
					// Avoid decoding more data than is required to satisfy the consistency guarantees.
					numItersToInclude = numDesired
				}

				itersToInclude := results[:numItersToInclude]
				resultsLock.RUnlock()

				iter := s.pools.seriesIterator.Get()
				// NB(prateek): we need to allocate a copy of ident.ID to allow the seriesIterator
				// to have control over the lifecycle of ID. We cannot allow seriesIterator
				// to control the lifecycle of the original ident.ID, as it might still be in use
				// due to a pending request in queue.
				seriesID :=
				namespaceID :=
				consolidator := s.opts.IterationOptions().SeriesIteratorConsolidator
				iter.Reset(encoding.SeriesIteratorOptions{
					ID:                         seriesID,
					Namespace:                  namespaceID,
					StartInclusive:             startInclusive,
					EndExclusive:               endExclusive,
					Replicas:                   itersToInclude,
					SeriesIteratorConsolidator: consolidator,
				})
				iters.SetAt(idx, iter)
			}
			// Drop the references this function holds; whoever brings a counter
			// to zero releases the corresponding resource.
			if atomic.AddInt32(&resultsAccessors, -1) == 0 {
				s.pools.multiReaderIteratorArray.Put(results)
			}
			if atomic.AddInt32(&idAccessors, -1) == 0 {
				tsID.Finalize()
			}
			if atomic.AddInt32(&namespaceAccessors, -1) == 0 {
				namespace.Finalize()
			}
			wg.Done()
		}
		// completionFn is invoked once per replica response for this ID. It
		// records the error or the decoded result, and triggers allCompletionFn
		// once the read consistency termination condition is met.
		completionFn := func(result interface{}, err error) {
			var snapshotSuccess int32
			if err != nil {
				if IsBadRequestError(err) {
					// Wrap with invalid params and non-retryable so it is
					// not retried.
					err = xerrors.NewInvalidParamsError(err)
					err = xerrors.NewNonRetryableError(err)
				}
				atomic.AddInt32(&errs, 1)
				// NB(r): reuse the error lock here as we do not want to create
				// a whole lot of locks for every single ID fetched due to size
				// of mutex being non-trivial and likely to cause more stack growth
				// or GC pressure if ends up on heap which is likely due to naive
				// escape analysis.
				resultErrLock.Lock()
				errors = append(errors, err)
				resultErrLock.Unlock()
			} else {
				slicesIter := s.pools.readerSliceOfSlicesIterator.Get()
				slicesIter.Reset(result.([]*rpc.Segments))
				multiIter := s.pools.multiReaderIterator.Get()
				multiIter.ResetSliceOfSlices(slicesIter, nsCtx.Schema)
				// Results is pre-allocated after creating fetch ops for this ID below
				resultsLock.Lock()
				results[success] = multiIter
				success++
				snapshotSuccess = success
				resultsLock.Unlock()
			}
			// NB(xichen): decrementing pending and checking remaining against zero must
			// come after incrementing success, otherwise we might end up passing results[:success]
			// to iter.Reset down below before setting the iterator in the results array,
			// which would cause a nil pointer exception.
			remaining := atomic.AddInt32(&pending, -1)
			shouldTerminate := topology.ReadConsistencyTermination(
				readLevel, majority, remaining, snapshotSuccess,
			)
			if shouldTerminate && atomic.CompareAndSwapInt32(&wgIsDone, 0, 1) {
				allCompletionFn()
			}

			// Drop the per-replica references; whoever brings a counter to zero
			// releases the corresponding resource.
			if atomic.AddInt32(&resultsAccessors, -1) == 0 {
				s.pools.multiReaderIteratorArray.Put(results)
			}
			if atomic.AddInt32(&idAccessors, -1) == 0 {
				tsID.Finalize()
			}
			if atomic.AddInt32(&namespaceAccessors, -1) == 0 {
				namespace.Finalize()
			}
		}

		if err := s.state.topoMap.RouteForEach(tsID, func(
			hostIdx int,
			hostShard shard.Shard,
			host topology.Host,
		) {
			// Inc safely as this for each is sequential
			enqueued++
			pending++
			allPending++
			resultsAccessors++
			namespaceAccessors++
			idAccessors++

			ops := fetchBatchOpsByHostIdx[hostIdx]

			var f *fetchBatchOp
			if len(ops) > 0 {
				// Find the last and potentially current fetch op for this host
				f = ops[len(ops)-1]
			}
			if f == nil || f.Size() >= s.fetchBatchSize {
				// If no current fetch op or existing one is at batch capacity add one
				// NB(r): Note that we defer to the host queue to take ownership
				// of these ops and for returning the ops to the pool when done as
				// they know when their use is complete.
				f = s.pools.fetchBatchOp.Get()
				f.IncRef()
				fetchBatchOpsByHostIdx[hostIdx] = append(fetchBatchOpsByHostIdx[hostIdx], f)
				f.request.RangeStart = rangeStart
				f.request.RangeEnd = rangeEnd
				f.request.RangeTimeType = rpc.TimeType_UNIX_NANOSECONDS
			}

			// Append IDWithNamespace to this request
			f.append(namespace.Bytes(), tsID.Bytes(), completionFn)
		}); err != nil {
			routeErr = err
			break
		}

		// Once we've enqueued we know how many to expect so retrieve and set length
		results = s.pools.multiReaderIteratorArray.Get(int(enqueued))
		results = results[:enqueued]
	}

	if routeErr != nil {
		// NOTE(review): this path returns without putting fetchBatchOpsByHostIdx
		// back into its pool and without enqueuing the ops built so far — confirm
		// that is intended.
		s.state.RUnlock()
		return nil, routeErr
	}

	// Enqueue fetch ops
	for idx := range fetchBatchOpsByHostIdx {
		for _, f := range fetchBatchOpsByHostIdx[idx] {
			// Passing ownership of the op itself to the host queue
			f.DecRef()
			if err := s.state.queues[idx].Enqueue(f); err != nil && enqueueErr == nil {
				enqueueErr = err
				break
			}
		}
		// Stop at the first enqueue failure; remaining hosts are not attempted.
		if enqueueErr != nil {
			break
		}
	}
	s.pools.fetchBatchOpArrayArray.Put(fetchBatchOpsByHostIdx)
	s.state.RUnlock()

	if enqueueErr != nil {
		s.log.Error("failed to enqueue fetch", zap.Error(enqueueErr))
		return nil, enqueueErr
	}

	// Block until allCompletionFn has run for every ID.
	wg.Wait()

	resultErrLock.RLock()
	retErr := resultErr
	resultErrLock.RUnlock()
	if retErr != nil {
		return nil, retErr
	}
	// Mark success so the deferred cleanup leaves the iterators open.
	success = true
	return iters, nil
}
  2121  func (s *session) writeConsistencyResult(
  2122  	level topology.ConsistencyLevel,
  2123  	majority, enqueued, responded, resultErrs int32,
  2124  	errs []error,
  2125  ) error {
  2126  	// Check consistency level satisfied
  2127  	success := enqueued - resultErrs
  2128  	if !topology.WriteConsistencyAchieved(level, int(majority), int(enqueued), int(success)) {
  2129  		return newConsistencyResultError(level, int(enqueued), int(responded), errs)
  2130  	}
  2131  	return nil
  2132  }
  2134  func (s *session) readConsistencyResult(
  2135  	level topology.ReadConsistencyLevel,
  2136  	majority, enqueued, responded, resultErrs int32,
  2137  	errs []error,
  2138  ) error {
  2139  	// Check consistency level satisfied
  2140  	success := enqueued - resultErrs
  2141  	if !topology.ReadConsistencyAchieved(level, int(majority), int(enqueued), int(success)) {
  2142  		return newConsistencyResultError(level, int(enqueued), int(responded), errs)
  2143  	}
  2144  	return nil
  2145  }
  2147  func (s *session) IteratorPools() (encoding.IteratorPools, error) {
  2148  	s.state.RLock()
  2149  	defer s.state.RUnlock()
  2150  	if s.state.status != statusOpen {
  2151  		return nil, ErrSessionStatusNotOpen
  2152  	}
  2153  	return s.pools, nil
  2154  }
  2156  func (s *session) Close() error {
  2157  	s.state.Lock()
  2158  	if s.state.status != statusOpen {
  2159  		s.state.Unlock()
  2160  		return ErrSessionStatusNotOpen
  2161  	}
  2162  	s.state.status = statusClosed
  2163  	queues := s.state.queues
  2164  	topoWatch := s.state.topoWatch
  2165  	topo := s.state.topo
  2166  	s.state.Unlock()
  2168  	for _, q := range queues {
  2169  		q.Close()
  2170  	}
  2172  	topoWatch.Close()
  2173  	topo.Close()
  2175  	if closer := s.runtimeOptsListenerCloser; closer != nil {
  2176  		closer.Close()
  2177  	}
  2179  	return nil
  2180  }
  2182  func (s *session) Origin() topology.Host {
  2183  	return s.origin
  2184  }
  2186  func (s *session) Replicas() int {
  2187  	s.state.RLock()
  2188  	v := s.state.replicas
  2189  	s.state.RUnlock()
  2190  	return v
  2191  }
  2193  func (s *session) TopologyMap() (topology.Map, error) {
  2194  	s.state.RLock()
  2195  	topoMap, err := s.topologyMapWithStateRLock()
  2196  	s.state.RUnlock()
  2197  	return topoMap, err
  2198  }
  2200  func (s *session) topologyMapWithStateRLock() (topology.Map, error) {
  2201  	status := s.state.status
  2202  	topoMap := s.state.topoMap
  2204  	// Make sure the session is open, as thats what sets the initial topology.
  2205  	if status != statusOpen {
  2206  		return nil, ErrSessionStatusNotOpen
  2207  	}
  2208  	if topoMap == nil {
  2209  		// Should never happen.
  2210  		return nil, instrument.InvariantErrorf("session does not have a topology map")
  2211  	}
  2213  	return topoMap, nil
  2214  }
  2216  func (s *session) Truncate(namespace ident.ID) (int64, error) {
  2217  	var (
  2218  		wg            sync.WaitGroup
  2219  		enqueueErr    xerrors.MultiError
  2220  		resultErrLock sync.Mutex
  2221  		resultErr     xerrors.MultiError
  2222  		truncated     int64
  2223  	)
  2225  	t := &truncateOp{}
  2226  	t.request.NameSpace = namespace.Bytes()
  2227  	t.completionFn = func(result interface{}, err error) {
  2228  		if err != nil {
  2229  			resultErrLock.Lock()
  2230  			resultErr = resultErr.Add(err)
  2231  			resultErrLock.Unlock()
  2232  		} else {
  2233  			res := result.(*rpc.TruncateResult_)
  2234  			atomic.AddInt64(&truncated, res.NumSeries)
  2235  		}
  2236  		wg.Done()
  2237  	}
  2239  	s.state.RLock()
  2240  	for idx := range s.state.queues {
  2241  		wg.Add(1)
  2242  		if err := s.state.queues[idx].Enqueue(t); err != nil {
  2243  			wg.Done()
  2244  			enqueueErr = enqueueErr.Add(err)
  2245  		}
  2246  	}
  2247  	s.state.RUnlock()
  2249  	if err := enqueueErr.FinalError(); err != nil {
  2250  		s.log.Error("failed to enqueue request", zap.Error(err))
  2251  		return 0, err
  2252  	}
  2254  	// Wait for namespace to be truncated on all replicas
  2255  	wg.Wait()
  2257  	return truncated, resultErr.FinalError()
  2258  }
// peers describes the hosts that own a shard, as assembled by peersForShard.
//
// NB(r): Excluding maligned struct check here as we can
// live with a few extra bytes since this struct is only
// ever passed by stack, its much more readable not optimized
// nolint: maligned
type peers struct {
	// peers holds one peer per host routed for the shard, excluding the
	// session's origin host.
	peers []peer
	// shard is the ID of the shard these peers were resolved for.
	shard uint32
	// majorityReplicas is the majority replica count taken from the topology map.
	majorityReplicas int
	// selfExcluded is true when the origin host owns the shard and was left
	// out of peers.
	selfExcluded bool
	// selfHostShardSet is the origin host's shard set; it is populated only
	// when selfExcluded is true.
	selfHostShardSet topology.HostShardSet
}
  2272  func (p peers) selfExcludedAndSelfHasShardAvailable() bool {
  2273  	if !p.selfExcluded {
  2274  		return false
  2275  	}
  2276  	state, err := p.selfHostShardSet.ShardSet().LookupStateByID(p.shard)
  2277  	if err != nil {
  2278  		return false
  2279  	}
  2280  	return state == shard.Available
  2281  }
  2283  func (s *session) peersForShard(shardID uint32) (peers, error) {
  2284  	s.state.RLock()
  2285  	var (
  2286  		lookupErr error
  2287  		result    = peers{
  2288  			peers:            make([]peer, 0, s.state.topoMap.Replicas()),
  2289  			shard:            shardID,
  2290  			majorityReplicas: s.state.topoMap.MajorityReplicas(),
  2291  		}
  2292  	)
  2293  	err := s.state.topoMap.RouteShardForEach(shardID, func(
  2294  		idx int,
  2295  		_ shard.Shard,
  2296  		host topology.Host,
  2297  	) {
  2298  		if s.origin != nil && s.origin.ID() == host.ID() {
  2299  			// Don't include the origin host
  2300  			result.selfExcluded = true
  2301  			// Include the origin host shard set for help determining quorum
  2302  			hostShardSet, ok := s.state.topoMap.LookupHostShardSet(host.ID())
  2303  			if !ok {
  2304  				lookupErr = fmt.Errorf("could not find shard set for host ID: %s", host.ID())
  2305  			}
  2306  			result.selfHostShardSet = hostShardSet
  2307  			return
  2308  		}
  2309  		result.peers = append(result.peers, newPeer(s, host))
  2310  	})
  2311  	s.state.RUnlock()
  2312  	if resultErr := xerrors.FirstError(err, lookupErr); resultErr != nil {
  2313  		return peers{}, resultErr
  2314  	}
  2315  	return result, nil
  2316  }
  2318  func (s *session) FetchBootstrapBlocksMetadataFromPeers(
  2319  	namespace ident.ID,
  2320  	shard uint32,
  2321  	start, end xtime.UnixNano,
  2322  	resultOpts result.Options,
  2323  ) (PeerBlockMetadataIter, error) {
  2324  	level := newSessionBootstrapRuntimeReadConsistencyLevel(s)
  2325  	return s.fetchBlocksMetadataFromPeers(namespace,
  2326  		shard, start, end, level, resultOpts)
  2327  }
  2329  func (s *session) FetchBlocksMetadataFromPeers(
  2330  	namespace ident.ID,
  2331  	shard uint32,
  2332  	start, end xtime.UnixNano,
  2333  	consistencyLevel topology.ReadConsistencyLevel,
  2334  	resultOpts result.Options,
  2335  ) (PeerBlockMetadataIter, error) {
  2336  	level := newStaticRuntimeReadConsistencyLevel(consistencyLevel)
  2337  	return s.fetchBlocksMetadataFromPeers(namespace,
  2338  		shard, start, end, level, resultOpts)
  2339  }
// fetchBlocksMetadataFromPeers resolves the peers that own the shard and
// starts a background goroutine that streams their block metadata into a
// buffered channel. The returned iterator reads from that channel and from an
// error channel that receives the final result of the streaming call.
//
// An error is returned directly only when the peers for the shard cannot be
// resolved; streaming errors are delivered through the error channel.
func (s *session) fetchBlocksMetadataFromPeers(
	namespace ident.ID,
	shard uint32,
	start, end xtime.UnixNano,
	level runtimeReadConsistencyLevel,
	resultOpts result.Options,
) (PeerBlockMetadataIter, error) {
	peers, err := s.peersForShard(shard)
	if err != nil {
		return nil, err
	}

	var (
		metadataCh = make(chan receivedBlockMetadata,
			blockMetadataChBufSize)
		// errCh has capacity 1 so the goroutine below can deliver its result
		// without waiting for a reader.
		errCh = make(chan error, 1)
		meta  = resultTypeMetadata
		m     = s.newPeerMetadataStreamingProgressMetrics(shard, meta)
	)
	go func() {
		errCh <- s.streamBlocksMetadataFromPeers(namespace, shard,
			peers, start, end, level, metadataCh, resultOpts, m)
		// Both channels are closed once streaming has finished.
		close(metadataCh)
		close(errCh)
	}()

	iter := newMetadataIter(metadataCh, errCh,
		s.pools.tagDecoder,
	return iter, nil
}
// FetchBootstrapBlocksFromPeers will fetch the specified blocks from peers for
// bootstrapping purposes.
//
// It resolves the peers that own the shard, streams block metadata from them
// in a background goroutine, and concurrently consumes that metadata to fetch
// the blocks themselves into a bulk result. While the call is in progress a
// gauge is updated every gaugeReportInterval to signal that a fetch is running.
//
// An error is returned when the namespace context or the peers cannot be
// resolved, when streaming the blocks returns an error, or when the metadata
// streaming goroutine reports an error.
func (s *session) FetchBootstrapBlocksFromPeers(
	nsMetadata namespace.Metadata,
	shard uint32,
	start, end xtime.UnixNano,
	opts result.Options,
) (result.ShardResult, error) {
	nsCtx, err := s.nsCtxFromMetadata(nsMetadata)
	if err != nil {
		return nil, err
	}
	var (
		result = newBulkBlocksResult(nsCtx, s.opts, opts,
			s.pools.tagDecoder,
		doneCh   = make(chan struct{})
		progress = s.newPeerMetadataStreamingProgressMetrics(shard,
			resultTypeBootstrap)
		level = newSessionBootstrapRuntimeReadConsistencyLevel(s)
	)

	// Determine which peers own the specified shard
	peers, err := s.peersForShard(shard)
	if err != nil {
		return nil, err
	}

	// Emit a gauge indicating whether we're done or not
	go func() {
		for {
			select {
			case <-doneCh:
				progress.fetchBlocksFromPeers.Update(0)
				return
			default:
				progress.fetchBlocksFromPeers.Update(1)
				time.Sleep(gaugeReportInterval)
			}
		}
	}()
	// Closing doneCh on return stops the gauge goroutine above.
	defer close(doneCh)

	// Begin pulling metadata, if one or multiple peers fail no error will
	// be returned from this routine as long as one peer succeeds completely
	metadataCh := make(chan receivedBlockMetadata, blockMetadataChBufSize)
	// Spin up a background goroutine which will begin streaming metadata from
	// all the peers and pushing them into the metadataCh. errCh has capacity 1
	// so the goroutine can deliver its result even if nobody reads it.
	errCh := make(chan error, 1)
	go func() {
		errCh <- s.streamBlocksMetadataFromPeers(nsMetadata.ID(), shard,
			peers, start, end, level, metadataCh, opts, progress)
		close(metadataCh)
	}()

	// Begin consuming metadata and making requests. This will block until all
	// data has been streamed (or failed to stream). Note that while this function
	// does return an error, an error will only be returned in a select few cases.
	// There are some scenarios in which if something goes wrong here we won't report it to
	// the caller, but metrics and logs are emitted internally. Also note that the
	// streamAndGroupCollectedBlocksMetadata function is injected.
	err = s.streamBlocksFromPeers(nsMetadata, shard, peers, metadataCh, opts,
		level, result, progress, s.streamAndGroupCollectedBlocksMetadata)
	if err != nil {
		return nil, err
	}

	// Check if an error occurred during the metadata streaming
	if err = <-errCh; err != nil {
		return nil, err
	}

	return result.result, nil
}
// FetchBlocksFromPeers starts fetching the blocks described by metadatas from
// the peers that own shard and returns an iterator that is fed from the
// streamed results.
//
// All validation that can fail synchronously (building the namespace context,
// resolving the shard's peers, and encoding any tags carried by metadatas)
// happens before any goroutine is started, and a failure there is returned as
// an error with a nil iterator. After that the work proceeds in background
// goroutines: one reports the fetchBlocksFromPeers gauge, one converts
// metadatas into receivedBlockMetadata entries, and one consumes those entries
// and streams blocks. When streaming ends the output channel is closed and the
// final error (possibly nil) is offered on the done channel.
//
// Replicas whose host is not one of the shard's peers are skipped with a
// warning log rather than failing the call.
func (s *session) FetchBlocksFromPeers(
	nsMetadata namespace.Metadata,
	shard uint32,
	consistencyLevel topology.ReadConsistencyLevel,
	metadatas []block.ReplicaMetadata,
	opts result.Options,
) (PeerBlocksIter, error) {
	nsCtx, err := s.nsCtxFromMetadata(nsMetadata)
	if err != nil {
		return nil, err
	}
	var (
		logger   = opts.InstrumentOptions().Logger()
		level    = newStaticRuntimeReadConsistencyLevel(consistencyLevel)
		// complete is a flag (0 or 1) accessed atomically; it is set by onDone
		// and polled by the gauge reporting goroutine below.
		complete = int64(0)
		// doneCh has capacity one so onDone never blocks.
		doneCh   = make(chan error, 1)
		outputCh = make(chan peerBlocksDatapoint, 4096)
		// NOTE(review): the argument list of this call appears truncated (no
		// closing parenthesis before the next declaration) — confirm against
		// the upstream file.
		result   = newStreamBlocksResult(nsCtx, s.opts, opts, outputCh,
			s.pools.tagDecoder,
		// onDone marks the fetch as complete and offers err on doneCh without
		// blocking; if doneCh is already full the error is dropped.
		onDone = func(err error) {
			atomic.StoreInt64(&complete, 1)
			select {
			case doneCh <- err:
			default:
			}
		}
		progress = s.newPeerMetadataStreamingProgressMetrics(shard, resultTypeRaw)
	)
	peers, err := s.peersForShard(shard)
	if err != nil {
		return nil, err
	}
	// Index the shard's peers by host ID so each replica metadata can be
	// matched to the peer it should be fetched from.
	peersByHost := make(map[string]peer, len(peers.peers))
	for _, peer := range peers.peers {
		peersByHost[peer.Host().ID()] = peer
	}
	// If any metadata has tags then encode them up front so can
	// return an error on tag encoding rather than logging error that would
	// possibly get missed.
	var (
		metadatasEncodedTags []checked.Bytes
		anyTags              bool
	)
	for _, meta := range metadatas {
		if len(meta.Tags.Values()) > 0 {
			anyTags = true
			break
		}
	}
	if anyTags {
		// NB(r): Allocate exact length so nil is used and each index
		// references same index as the incoming metadatas being fetched.
		metadatasEncodedTags = make([]checked.Bytes, len(metadatas))
		tagsIter := ident.NewTagsIterator(ident.Tags{})
		for idx, meta := range metadatas {
			if len(meta.Tags.Values()) == 0 {
				continue
			}
			tagsIter.Reset(meta.Tags)
			// NOTE(review): the encoder taken from the pool is not returned to
			// it anywhere in this function — presumably because the bytes from
			// Data() stay referenced; confirm.
			tagsEncoder := s.pools.tagEncoder.Get()
			if err := tagsEncoder.Encode(tagsIter); err != nil {
				return nil, err
			}
			encodedTagsCheckedBytes, ok := tagsEncoder.Data()
			if !ok {
				return nil, fmt.Errorf("could not encode tags: id=%s", meta.ID.String())
			}
			metadatasEncodedTags[idx] = encodedTagsCheckedBytes
		}
	}
	// Report the in-progress gauge as 1 every gaugeReportInterval until onDone
	// flips the complete flag, then report 0 and exit.
	go func() {
		for atomic.LoadInt64(&complete) == 0 {
			progress.fetchBlocksFromPeers.Update(1)
			time.Sleep(gaugeReportInterval)
		}
		progress.fetchBlocksFromPeers.Update(0)
	}()
	// Convert the requested replica metadatas into receivedBlockMetadata
	// entries and push them onto metadataCh, closing it when all have been
	// sent.
	metadataCh := make(chan receivedBlockMetadata, blockMetadataChBufSize)
	go func() {
		for idx, rb := range metadatas {
			peer, ok := peersByHost[rb.Host.ID()]
			if !ok {
				logger.Warn("replica requested from unknown peer, skipping",
					zap.Stringer("peer", rb.Host),
					zap.Stringer("id", rb.ID),
					zap.Time("start", rb.Start.ToTime()),
				)
				continue
			}
			// Attach encoded tags if present.
			var encodedTags checked.Bytes
			if idx < len(metadatasEncodedTags) {
				// Note: could still be nil if had no tags, but the slice
				// was built so need to take ref to encoded tags if
				// was encoded.
				encodedTags = metadatasEncodedTags[idx]
			}
			metadataCh <- receivedBlockMetadata{
				id:          rb.Metadata.ID,
				encodedTags: encodedTags,
				peer:        peer,
				block: blockMetadata{
					start:    rb.Start,
					size:     rb.Size,
					checksum: rb.Checksum,
					lastRead: rb.LastRead,
				},
			}
		}
		close(metadataCh)
	}()
	// Begin consuming metadata and making requests. The pass-through metadata
	// function is injected here. Once streaming returns, outputCh is closed
	// and the result is published through onDone.
	go func() {
		err := s.streamBlocksFromPeers(nsMetadata, shard, peers, metadataCh,
			opts, level, result, progress, s.passThroughBlocksMetadata)
		close(outputCh)
		onDone(err)
	}()
	pbi := newPeerBlocksIter(outputCh, doneCh)
	return pbi, nil
}
  2580  func (s *session) streamBlocksMetadataFromPeers(
  2581  	namespace ident.ID,
  2582  	shardID uint32,
  2583  	peers peers,
  2584  	start, end xtime.UnixNano,
  2585  	level runtimeReadConsistencyLevel,
  2586  	metadataCh chan<- receivedBlockMetadata,
  2587  	resultOpts result.Options,
  2588  	progress *streamFromPeersMetrics,
  2589  ) error {
  2590  	var (
  2591  		wg        sync.WaitGroup
  2592  		errs      = newSyncAbortableErrorsMap()
  2593  		pending   = int64(len(peers.peers))
  2594  		majority  = int32(peers.majorityReplicas)
  2595  		enqueued  = int32(len(peers.peers))
  2596  		responded int32
  2597  		success   int32
  2598  	)
  2599  	if peers.selfExcludedAndSelfHasShardAvailable() {
  2600  		// If we excluded ourselves from fetching, we basically treat ourselves
  2601  		// as a successful peer response since we can bootstrap from ourselves
  2602  		// just fine
  2603  		enqueued++
  2604  		success++
  2605  	}
  2607  	progress.metadataFetches.Update(float64(pending))
  2608  	for idx, peer := range peers.peers {
  2609  		idx := idx
  2610  		peer := peer
  2612  		wg.Add(1)
  2613  		go func() {
  2614  			defer func() {
  2615  				// Success or error counts towards a response
  2616  				atomic.AddInt32(&responded, 1)
  2618  				// Decrement pending
  2619  				progress.metadataFetches.Update(float64(atomic.AddInt64(&pending, -1)))
  2621  				// Mark done
  2622  				wg.Done()
  2623  			}()
  2625  			var (
  2626  				firstAttempt = true
  2627  				// NB(r): currPageToken keeps the position into the pagination of the
  2628  				// metadata from this peer, it begins as nil but if an error is
  2629  				// returned it will likely not be nil, this lets us restart fetching
  2630  				// if we need to (if consistency has not been achieved yet) without
  2631  				// losing place in the pagination.
  2632  				currPageToken                     pageToken
  2633  				currHostNotAvailableSleepInterval = hostNotAvailableMinSleepInterval
  2634  			)
  2635  			condition := func() bool {
  2636  				if firstAttempt {
  2637  					// Always attempt at least once
  2638  					firstAttempt = false
  2639  					return true
  2640  				}
  2642  				var (
  2643  					currLevel = level.value()
  2644  					majority  = int(majority)
  2645  					enqueued  = int(enqueued)
  2646  					success   = int(atomic.LoadInt32(&success))
  2647  				)
  2648  				metReadConsistency := topology.ReadConsistencyAchieved(
  2649  					currLevel, majority, enqueued, success)
  2650  				doRetry := !metReadConsistency && errs.getAbortError() == nil
  2652  				if doRetry {
  2653  					// Track that we are reattempting the fetch metadata
  2654  					// pagination from a peer
  2655  					progress.metadataPeerRetry.Inc(1)
  2656  				}
  2657  				return doRetry
  2658  			}
  2659  			for condition() {
  2660  				var err error
  2661  				currPageToken, err = s.streamBlocksMetadataFromPeer(namespace, shardID,
  2662  					peer, start, end, currPageToken, metadataCh, resultOpts, progress)
  2663  				// Set error or success if err is nil
  2664  				errs.setError(idx, err)
  2666  				// hostNotAvailable is a NonRetryableError for the purposes of short-circuiting
  2667  				// the automatic retry functionality, but in this case the client should avoid
  2668  				// aborting and continue retrying at this level until consistency can be reached.
  2669  				if isHostNotAvailableError(err) {
  2670  					// Prevent the loop from spinning too aggressively in the short-circuiting case.
  2671  					time.Sleep(currHostNotAvailableSleepInterval)
  2672  					currHostNotAvailableSleepInterval = minDuration(
  2673  						currHostNotAvailableSleepInterval*2,
  2674  						hostNotAvailableMaxSleepInterval,
  2675  					)
  2676  					continue
  2677  				}
  2679  				if err != nil && xerrors.IsNonRetryableError(err) {
  2680  					errs.setAbortError(err)
  2681  					return // Cannot recover from this error, so we break from the loop
  2682  				}
  2684  				if err == nil {
  2685  					atomic.AddInt32(&success, 1)
  2686  					return
  2687  				}
  2689  				// There was a retryable error, continue looping.
  2690  			}
  2691  		}()
  2692  	}
  2694  	wg.Wait()
  2696  	if err := errs.getAbortError(); err != nil {
  2697  		return err
  2698  	}
  2700  	errors := errs.getErrors()
  2701  	return s.readConsistencyResult(level.value(), majority, enqueued,
  2702  		atomic.LoadInt32(&responded), int32(len(errors)), errors)
  2703  }
// pageToken is the pagination position for a peer's block metadata stream. It
// is sent as the PageToken of a fetch blocks metadata request and replaced by
// the NextPageToken of each response. A nil token starts from the beginning.
type pageToken []byte
// streamBlocksMetadataFromPeer pages through the block metadata that a single
// peer holds for the shard between start and end, beginning at startPageToken,
// and pushes one receivedBlockMetadata per returned element onto metadataCh.
//
// Each page is fetched through s.streamBlocksRetrier. If a page ultimately
// fails, the page token reached so far is returned together with the error so
// the caller can resume from that position. When the peer reports no further
// pages, (nil, nil) is returned.
//
// This function has several heap allocated anonymous functions, however,
// they're only allocated once per peer/shard combination for the entire peer
// bootstrapping process so performance is acceptable.
func (s *session) streamBlocksMetadataFromPeer(
	namespace ident.ID,
	shard uint32,
	peer peer,
	start, end xtime.UnixNano,
	startPageToken pageToken,
	metadataCh chan<- receivedBlockMetadata,
	resultOpts result.Options,
	progress *streamFromPeersMetrics,
) (pageToken, error) {
	var (
		// Addressable flags; the request takes pointers to them.
		optionIncludeSizes     = true
		optionIncludeChecksums = true
		optionIncludeLastRead  = true
		// moreResults is cleared by attemptFn when a response carries no next
		// page token, which ends the paging loop at the bottom.
		moreResults            = true
		// NOTE(review): the initializer for idPool appears to be missing on
		// the next line — confirm against the upstream file.
		idPool                 =
		bytesPool              = resultOpts.DatabaseBlockOptions().BytesPool()
		// Only used for logs
		peerStr              = peer.Host().ID()
		metadataCountByBlock = map[xtime.UnixNano]int64{}
	)
	// On exit, emit one debug log per block start with the number of metadata
	// entries received for it.
	defer func() {
		for block, numMetadata := range metadataCountByBlock {
			s.log.Debug("finished streaming blocks metadata from peer",
				zap.Uint32("shard", shard),
				zap.String("peer", peerStr),
				zap.Int64("numMetadata", numMetadata),
				zap.Time("block", block.ToTime()),
			)
		}
	}()
	// Declare before loop to avoid redeclaring each iteration.
	// attemptFn fetches a single page using the captured startPageToken and,
	// on success, advances startPageToken (or clears moreResults) before
	// forwarding the page's elements.
	attemptFn := func(client rpc.TChanNode) error {
		// NOTE(review): the second value returned by thrift.NewContext is
		// discarded; if it is a cancel function, not calling it may hold the
		// context's resources until the timeout fires — confirm.
		tctx, _ := thrift.NewContext(s.streamBlocksMetadataBatchTimeout)
		req := rpc.NewFetchBlocksMetadataRawV2Request()
		req.NameSpace = namespace.Bytes()
		req.Shard = int32(shard)
		req.RangeStart = int64(start)
		req.RangeEnd = int64(end)
		req.Limit = int64(s.streamBlocksBatchSize)
		req.PageToken = startPageToken
		req.IncludeSizes = &optionIncludeSizes
		req.IncludeChecksums = &optionIncludeChecksums
		req.IncludeLastRead = &optionIncludeLastRead
		progress.metadataFetchBatchCall.Inc(1)
		result, err := client.FetchBlocksMetadataRawV2(tctx, req)
		if err != nil {
			progress.metadataFetchBatchError.Inc(1)
			return err
		}
		progress.metadataFetchBatchSuccess.Inc(1)
		progress.metadataReceived.Inc(int64(len(result.Elements)))
		if result.NextPageToken != nil {
			// Reset pageToken + copy new pageToken into previously allocated memory,
			// extending as necessary
			startPageToken = append(startPageToken[:0], result.NextPageToken...)
		} else {
			// No further results
			moreResults = false
		}
		for _, elem := range result.Elements {
			blockStart := xtime.UnixNano(elem.Start)
			// Copy the ID into bytes obtained from bytesPool and wrap the copy
			// as an ID from idPool.
			data := bytesPool.Get(len(elem.ID))
			data.IncRef()
			data.AppendAll(elem.ID)
			data.DecRef()
			clonedID := idPool.BinaryID(data)
			// Return thrift bytes to pool once the ID has been copied.
			apachethrift.BytesPoolPut(elem.ID)
			// Copy the encoded tags, if any; encodedTags stays nil when the
			// element carries no tag bytes.
			var encodedTags checked.Bytes
			if tagBytes := elem.EncodedTags; len(tagBytes) != 0 {
				encodedTags = bytesPool.Get(len(tagBytes))
				encodedTags.IncRef()
				encodedTags.AppendAll(tagBytes)
				encodedTags.DecRef()
				// Return thrift bytes to pool once the tags have been copied.
				apachethrift.BytesPoolPut(tagBytes)
			}
			// Error occurred retrieving block metadata, use default values
			if err := elem.Err; err != nil {
				progress.metadataFetchBatchBlockErr.Inc(1)
				s.log.Error("error occurred retrieving block metadata",
					zap.Uint32("shard", shard),
					zap.String("peer", peerStr),
					zap.Time("block", blockStart.ToTime()),
					zap.Error(err),
				)
				// Enqueue with a zeroed checksum which triggers a fanout fetch
				metadataCh <- receivedBlockMetadata{
					peer:        peer,
					id:          clonedID,
					encodedTags: encodedTags,
					block: blockMetadata{
						start: blockStart,
					},
				}
				continue
			}
			// Size, checksum and last read are optional in the response; each
			// is left at its zero value when absent.
			var size int64
			if elem.Size != nil {
				size = *elem.Size
			}
			var pChecksum *uint32
			if elem.Checksum != nil {
				value := uint32(*elem.Checksum)
				pChecksum = &value
			}
			var lastRead xtime.UnixNano
			if elem.LastRead != nil {
				// A conversion failure is ignored and lastRead stays zero.
				value, err := convert.ToTime(*elem.LastRead, elem.LastReadTimeType)
				if err == nil {
					lastRead = value
				}
			}
			metadataCh <- receivedBlockMetadata{
				peer:        peer,
				id:          clonedID,
				encodedTags: encodedTags,
				block: blockMetadata{
					start:    blockStart,
					size:     size,
					checksum: pChecksum,
					lastRead: lastRead,
				},
			}
			// Only used for logs. Elements that carried an error are not
			// counted because they continue before reaching this line.
			metadataCountByBlock[blockStart]++
		}
		return nil
	}
	// BorrowConnection takes a callback that does not return an error, so the
	// attempt's error is captured in attemptErr and combined with the borrow
	// error afterwards.
	var attemptErr error
	checkedAttemptFn := func(client rpc.TChanNode, _ Channel) {
		attemptErr = attemptFn(client)
	}
	fetchFn := func() error {
		borrowErr := peer.BorrowConnection(checkedAttemptFn)
		return xerrors.FirstError(borrowErr, attemptErr)
	}
	// Fetch page after page until the peer reports no next page token.
	for moreResults {
		if err := s.streamBlocksRetrier.Attempt(fetchFn); err != nil {
			return startPageToken, err
		}
	}
	return nil, nil
}
// streamBlocksFromPeers consumes block metadata from metadataCh, decides for
// each group of per-peer metadata which peer or peers to fetch the block from,
// and hands the fetches to per-peer queues that stream the blocks into result.
//
// streamMetadataFn runs in a background goroutine and is responsible for
// moving entries from metadataCh onto the internal enqueue channel; once it
// returns, the enqueue channel is asked to close when everything enqueued has
// been processed. The call blocks until the selection loop ends, then closes
// all peer queues.
//
// The only return statement in this function returns nil; fetch failures are
// not reported through the returned error.
func (s *session) streamBlocksFromPeers(
	nsMetadata namespace.Metadata,
	shard uint32,
	peers peers,
	metadataCh <-chan receivedBlockMetadata,
	opts result.Options,
	consistencyLevel runtimeReadConsistencyLevel,
	result blocksResult,
	progress *streamFromPeersMetrics,
	streamMetadataFn streamBlocksMetadataFn,
) error {
	var (
		enqueueCh           = newEnqueueChannel(progress)
		peerBlocksBatchSize = s.streamBlocksBatchSize
		numPeers            = len(peers.peers)
		uncheckedBytesPool  = opts.DatabaseBlockOptions().BytesPool().BytesPool()
	)
	// Consume the incoming metadata and enqueue to the ready channel
	// Spin up background goroutine to consume
	go func() {
		streamMetadataFn(numPeers, metadataCh, enqueueCh, uncheckedBytesPool)
		// Begin assessing the queue and how much is processed, once queue
		// is entirely processed then we can close the enqueue channel
		enqueueCh.closeOnAllProcessed()
	}()
	// Fetch blocks from peers as results become ready. One queue is created
	// per peer; each queue's batch callback streams that batch from the peer.
	peerQueues := make(peerBlocksQueues, 0, numPeers)
	for _, peer := range peers.peers {
		peer := peer
		size := peerBlocksBatchSize
		workers := s.streamBlocksWorkers
		drainEvery := 100 * time.Millisecond
		queue := s.newPeerBlocksQueueFn(peer, size, drainEvery, workers,
			func(batch []receivedBlockMetadata) {
				s.streamBlocksBatchFromPeer(nsMetadata, shard, peer, batch, opts,
					result, enqueueCh, s.streamBlocksRetrier, progress)
			})
		peerQueues = append(peerQueues, queue)
	}
	var (
		selected             []receivedBlockMetadata
		// pooled is threaded through every selection call so its scratch
		// slices are reused.
		pooled               selectPeersFromPerPeerBlockMetadatasPooledResources
		onQueueItemProcessed = func() {
			enqueueCh.trackProcessed(1)
		}
	)
	// NOTE(review): the range expression on the next line appears to be
	// missing — confirm against the upstream file.
	for perPeerBlocksMetadata := range {
		// Filter and select which blocks to retrieve from which peers
		selected, pooled = s.selectPeersFromPerPeerBlockMetadatas(
			perPeerBlocksMetadata, peerQueues, enqueueCh, consistencyLevel, peers,
			pooled, progress)
		// Nothing to fetch for this item, so it counts as processed right away.
		if len(selected) == 0 {
			onQueueItemProcessed()
			continue
		}
		// A single selected peer: the item is processed when that one fetch
		// completes.
		if len(selected) == 1 {
			queue := peerQueues.findQueue(selected[0].peer)
			queue.enqueue(selected[0], onQueueItemProcessed)
			continue
		}
		// Need to fan out, only track this as processed once all peer
		// queues have completed their fetches, so account for the extra
		// items assigned to be fetched
		enqueueCh.trackPending(len(selected) - 1)
		for _, receivedBlockMetadata := range selected {
			queue := peerQueues.findQueue(receivedBlockMetadata.peer)
			queue.enqueue(receivedBlockMetadata, onQueueItemProcessed)
		}
	}
	// Close all queues
	peerQueues.closeAll()
	return nil
}
// streamBlocksMetadataFn moves received block metadata from ch onto enqueueCh.
// streamBlocksFromPeers invokes it in a background goroutine with the number
// of peers, the metadata channel, the enqueue channel and a bytes pool, and
// treats its return as the signal that no more metadata will be enqueued.
type streamBlocksMetadataFn func(
	peersLen int,
	ch <-chan receivedBlockMetadata,
	enqueueCh enqueueChannel,
	pool pool.BytesPool,
)
  2961  func (s *session) passThroughBlocksMetadata(
  2962  	peersLen int,
  2963  	ch <-chan receivedBlockMetadata,
  2964  	enqueueCh enqueueChannel,
  2965  	_ pool.BytesPool,
  2966  ) {
  2967  	// Receive off of metadata channel
  2968  	for {
  2969  		m, ok := <-ch
  2970  		if !ok {
  2971  			break
  2972  		}
  2973  		res := []receivedBlockMetadata{m}
  2974  		enqueueCh.enqueue(res)
  2975  	}
  2976  }
  2978  func (s *session) streamAndGroupCollectedBlocksMetadata(
  2979  	peersLen int,
  2980  	metadataCh <-chan receivedBlockMetadata,
  2981  	enqueueCh enqueueChannel,
  2982  	pool pool.BytesPool,
  2983  ) {
  2984  	metadata := newReceivedBlocksMap(pool)
  2985  	defer metadata.Reset() // Delete all the keys and return slices to pools
  2987  	for {
  2988  		m, ok := <-metadataCh
  2989  		if !ok {
  2990  			break
  2991  		}
  2993  		key := idAndBlockStart{
  2994  			id:,
  2995  			blockStart: int64(m.block.start),
  2996  		}
  2997  		received, ok := metadata.Get(key)
  2998  		if !ok {
  2999  			received = receivedBlocks{
  3000  				results: make([]receivedBlockMetadata, 0, peersLen),
  3001  			}
  3002  		}
  3004  		// The entry has already been enqueued which means the metadata we just
  3005  		// received is a duplicate. Discard it and move on.
  3006  		if received.enqueued {
  3007  			s.emitDuplicateMetadataLog(received, m)
  3008  			continue
  3009  		}
  3011  		// Determine if the incoming metadata is a duplicate by checking if we've
  3012  		// already received metadata from this peer.
  3013  		existingIndex := -1
  3014  		for i, existingMetadata := range received.results {
  3015  			if existingMetadata.peer.Host().ID() == m.peer.Host().ID() {
  3016  				existingIndex = i
  3017  				break
  3018  			}
  3019  		}
  3021  		if existingIndex != -1 {
  3022  			// If it is a duplicate, then overwrite it (always keep the most recent
  3023  			// duplicate)
  3024  			received.results[existingIndex] = m
  3025  		} else {
  3026  			// Otherwise it's not a duplicate, so its safe to append.
  3027  			received.results = append(received.results, m)
  3028  		}
  3030  		// Since we always perform an overwrite instead of an append for duplicates
  3031  		// from the same peer, once len(received.results == peersLen) then we know
  3032  		// that we've received at least one metadata from every peer and its safe
  3033  		// to enqueue the entry.
  3034  		if len(received.results) == peersLen {
  3035  			enqueueCh.enqueue(received.results)
  3036  			received.enqueued = true
  3037  		}
  3039  		// Ensure tracking enqueued by setting modified result back to map
  3040  		metadata.Set(key, received)
  3041  	}
  3043  	// Enqueue all unenqueued received metadata. Note that these entries will have
  3044  	// metadata from only a subset of their peers.
  3045  	for _, entry := range metadata.Iter() {
  3046  		received := entry.Value()
  3047  		if received.enqueued {
  3048  			continue
  3049  		}
  3050  		enqueueCh.enqueue(received.results)
  3051  	}
  3052  }
  3054  // emitDuplicateMetadataLog emits a log with the details of the duplicate metadata
  3055  // event. Note: We're able to log the blocks themselves because the slice is no longer
  3056  // mutated downstream after enqueuing into the enqueue channel, it's copied before
  3057  // mutated or operated on.
  3058  func (s *session) emitDuplicateMetadataLog(
  3059  	received receivedBlocks,
  3060  	metadata receivedBlockMetadata,
  3061  ) {
  3062  	// Debug-level because this is a common enough occurrence that logging it by
  3063  	// default would be noisy.
  3064  	// This is due to peers sending the most recent data
  3065  	// to the oldest data in that order, hence sometimes its possible to resend
  3066  	// data for a block already sent over the wire if it just moved from being
  3067  	// mutable in memory to immutable on disk.
  3068  	if !s.log.Core().Enabled(zapcore.DebugLevel) {
  3069  		return
  3070  	}
  3072  	var checksum uint32
  3073  	if v := metadata.block.checksum; v != nil {
  3074  		checksum = *v
  3075  	}
  3077  	fields := make([]zapcore.Field, 0, len(received.results)+1)
  3078  	fields = append(fields, zap.String("incoming-metadata", fmt.Sprintf(
  3079  		"id=%s, peer=%s, start=%s, size=%v, checksum=%v",
  3081  		metadata.peer.Host().String(),
  3082  		metadata.block.start.String(),
  3083  		metadata.block.size,
  3084  		checksum)))
  3086  	for i, existing := range received.results {
  3087  		checksum = 0
  3088  		if v := existing.block.checksum; v != nil {
  3089  			checksum = *v
  3090  		}
  3092  		fields = append(fields, zap.String(
  3093  			fmt.Sprintf("existing-metadata-%d", i),
  3094  			fmt.Sprintf(
  3095  				"id=%s, peer=%s, start=%s, size=%v, checksum=%v",
  3097  				existing.peer.Host().String(),
  3098  				existing.block.start.String(),
  3099  				existing.block.size,
  3100  				checksum)))
  3101  	}
  3103  	s.log.Debug("received metadata, but peer metadata has already been submitted", fields...)
  3104  }
// pickBestPeerFn chooses which entry of perPeerBlockMetadata a block should be
// fetched from. It returns the index of the chosen entry within
// perPeerBlockMetadata, together with the pooled resources so the caller can
// pass them back in on the next call.
type pickBestPeerFn func(
	perPeerBlockMetadata []receivedBlockMetadata,
	peerQueues peerBlocksQueues,
	resources pickBestPeerPooledResources,
) (index int, pooled pickBestPeerPooledResources)
// pickBestPeerPooledResources holds scratch storage that is handed to and
// returned from a pickBestPeerFn so it can be reused between calls.
type pickBestPeerPooledResources struct {
	// ranking is truncated and refilled on every call to
	// streamBlocksPickBestPeer, then sorted in place.
	ranking []receivedBlockMetadataQueue
}
  3116  func (s *session) streamBlocksPickBestPeer(
  3117  	perPeerBlockMetadata []receivedBlockMetadata,
  3118  	peerQueues peerBlocksQueues,
  3119  	pooled pickBestPeerPooledResources,
  3120  ) (int, pickBestPeerPooledResources) {
  3121  	// Order by least attempts then by least outstanding blocks being fetched
  3122  	pooled.ranking = pooled.ranking[:0]
  3123  	for i := range perPeerBlockMetadata {
  3124  		elem := receivedBlockMetadataQueue{
  3125  			blockMetadata: perPeerBlockMetadata[i],
  3126  			queue:         peerQueues.findQueue(perPeerBlockMetadata[i].peer),
  3127  		}
  3128  		pooled.ranking = append(pooled.ranking, elem)
  3129  	}
  3130  	elems := receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc(pooled.ranking)
  3131  	sort.Stable(elems)
  3133  	// Return index of the best peer
  3134  	var (
  3135  		bestPeer = pooled.ranking[0].queue.peer
  3136  		idx      int
  3137  	)
  3138  	for i := range perPeerBlockMetadata {
  3139  		if bestPeer == perPeerBlockMetadata[i].peer {
  3140  			idx = i
  3141  			break
  3142  		}
  3143  	}
  3144  	return idx, pooled
  3145  }
// selectPeersFromPerPeerBlockMetadatasPooledResources holds scratch storage
// that is passed into and returned from selectPeersFromPerPeerBlockMetadatas
// so it can be reused between calls.
type selectPeersFromPerPeerBlockMetadatasPooledResources struct {
	// currEligible is truncated on every call and refilled with a copy of the
	// incoming per-peer metadata, so the caller's slice is not mutated.
	currEligible                []receivedBlockMetadata
	// pickBestPeerPooledResources is passed through to the session's
	// pickBestPeerFn and replaced with the value it returns.
	pickBestPeerPooledResources pickBestPeerPooledResources
}
  3152  func (s *session) selectPeersFromPerPeerBlockMetadatas(
  3153  	perPeerBlocksMetadata []receivedBlockMetadata,
  3154  	peerQueues peerBlocksQueues,
  3155  	reEnqueueCh enqueueChannel,
  3156  	consistencyLevel runtimeReadConsistencyLevel,
  3157  	peers peers,
  3158  	pooled selectPeersFromPerPeerBlockMetadatasPooledResources,
  3159  	m *streamFromPeersMetrics,
  3160  ) ([]receivedBlockMetadata, selectPeersFromPerPeerBlockMetadatasPooledResources) {
  3161  	// Copy into pooled array so we don't mutate existing slice passed
  3162  	pooled.currEligible = pooled.currEligible[:0]
  3163  	pooled.currEligible = append(pooled.currEligible, perPeerBlocksMetadata...)
  3165  	currEligible := pooled.currEligible[:]
  3167  	// Sort the per peer metadatas by peer ID for consistent results
  3168  	sort.Sort(peerBlockMetadataByID(currEligible))
  3170  	// Only select from peers not already attempted
  3171  	curr := currEligible[0]
  3172  	currID :=
  3173  	currBlock := curr.block
  3174  	for i := len(currEligible) - 1; i >= 0; i-- {
  3175  		if currEligible[i].block.reattempt.attempt == 0 {
  3176  			// Not attempted yet
  3177  			continue
  3178  		}
  3180  		// Check if eligible
  3181  		n := s.streamBlocksMaxBlockRetries
  3182  		if currEligible[i].block.reattempt.peerAttempts(currEligible[i].peer) >= n {
  3183  			// Swap current entry to tail
  3184  			receivedBlockMetadatas(currEligible).swap(i, len(currEligible)-1)
  3185  			// Trim newly last entry
  3186  			currEligible = currEligible[:len(currEligible)-1]
  3187  			continue
  3188  		}
  3189  	}
  3191  	if len(currEligible) == 0 {
  3192  		// No current eligible peers to select from
  3193  		majority := peers.majorityReplicas
  3194  		enqueued := len(peers.peers)
  3195  		success := 0
  3196  		if peers.selfExcludedAndSelfHasShardAvailable() {
  3197  			// If we excluded ourselves from fetching, we basically treat ourselves
  3198  			// as a successful peer response since our copy counts towards quorum
  3199  			enqueued++
  3200  			success++
  3201  		}
  3203  		errMsg := "all retries failed for streaming blocks from peers"
  3204  		fanoutFetchState := currBlock.reattempt.fanoutFetchState
  3205  		if fanoutFetchState != nil {
  3206  			if fanoutFetchState.decrementAndReturnPending() > 0 {
  3207  				// This block was fanned out to fetch from all peers and we haven't
  3208  				// received all the results yet, so don't retry it just yet
  3209  				return nil, pooled
  3210  			}
  3212  			// NB(r): This was enqueued after a failed fetch and all other fanout
  3213  			// fetches have completed, check if the consistency level was achieved,
  3214  			// if not then re-enqueue to continue to retry otherwise do not
  3215  			// re-enqueue and see if we need mark this as an error.
  3216  			success = fanoutFetchState.success()
  3217  		}
  3219  		level := consistencyLevel.value()
  3220  		achievedConsistencyLevel := topology.ReadConsistencyAchieved(level, majority, enqueued, success)
  3221  		if achievedConsistencyLevel {
  3222  			if success > 0 {
  3223  				// Some level of success met, no need to log an error
  3224  				return nil, pooled
  3225  			}
  3227  			// No success, inform operator that although consistency level achieved
  3228  			// there were no successful fetches. This can happen if consistency
  3229  			// level is set to None.
  3230  			m.fetchBlockFinalError.Inc(1)
  3231  			s.log.Error(errMsg,
  3232  				zap.Stringer("id", currID),
  3233  				zap.Time("start", currBlock.start.ToTime()),
  3234  				zap.Int("attempted", currBlock.reattempt.attempt),
  3235  				zap.String("attemptErrs", xerrors.Errors(currBlock.reattempt.errs).Error()),
  3236  				zap.Stringer("consistencyLevel", level),
  3237  			)
  3239  			return nil, pooled
  3240  		}
  3242  		// Retry again by re-enqueuing, have not met consistency level yet
  3243  		m.fetchBlockFullRetry.Inc(1)
  3245  		err := fmt.Errorf(errMsg+": attempts=%d", curr.block.reattempt.attempt)
  3246  		reattemptReason := consistencyLevelNotAchievedErrReason
  3247  		reattemptType := fullRetryReattemptType
  3248  		reattemptBlocks := []receivedBlockMetadata{curr}
  3249  		s.reattemptStreamBlocksFromPeersFn(reattemptBlocks, reEnqueueCh,
  3250  			err, reattemptReason, reattemptType, m)
  3252  		return nil, pooled
  3253  	}
  3255  	var (
  3256  		singlePeer         = len(currEligible) == 1
  3257  		sameNonNilChecksum = true
  3258  		curChecksum        *uint32
  3259  	)
  3260  	for i := range currEligible {
  3261  		// If any peer has a nil checksum, this might be the most recent block
  3262  		// and therefore not sealed so we want to merge from all peers
  3263  		if currEligible[i].block.checksum == nil {
  3264  			sameNonNilChecksum = false
  3265  			break
  3266  		}
  3267  		if curChecksum == nil {
  3268  			curChecksum = currEligible[i].block.checksum
  3269  		} else if *curChecksum != *currEligible[i].block.checksum {
  3270  			sameNonNilChecksum = false
  3271  			break
  3272  		}
  3273  	}
  3275  	// If all the peers have the same non-nil checksum, we pick the peer with the
  3276  	// fewest attempts and fewest outstanding requests
  3277  	if singlePeer || sameNonNilChecksum {
  3278  		var idx int
  3279  		if singlePeer {
  3280  			idx = 0
  3281  		} else {
  3282  			pooledResources := pooled.pickBestPeerPooledResources
  3283  			idx, pooledResources = s.pickBestPeerFn(currEligible, peerQueues,
  3284  				pooledResources)
  3285  			pooled.pickBestPeerPooledResources = pooledResources
  3286  		}
  3288  		// Set the reattempt metadata
  3289  		selected := currEligible[idx]
  3290  		selected.block.reattempt.attempt++
  3291  		selected.block.reattempt.attempted =
  3292  			append(selected.block.reattempt.attempted, selected.peer)
  3293  		selected.block.reattempt.fanoutFetchState = nil
  3294  		selected.block.reattempt.retryPeersMetadata = perPeerBlocksMetadata
  3295  		selected.block.reattempt.fetchedPeersMetadata = perPeerBlocksMetadata
  3297  		// Return just the single peer we selected
  3298  		currEligible = currEligible[:1]
  3299  		currEligible[0] = selected
  3300  	} else {
  3301  		fanoutFetchState := newBlockFanoutFetchState(len(currEligible))
  3302  		for i := range currEligible {
  3303  			// Set the reattempt metadata
  3304  			// NB(xichen): each block will only be retried on the same peer because we
  3305  			// already fan out the request to all peers. This means we merge data on
  3306  			// a best-effort basis and only fail if we failed to reach the desired
  3307  			// consistency level when reading data from all peers.
  3308  			var retryFrom []receivedBlockMetadata
  3309  			for j := range perPeerBlocksMetadata {
  3310  				if currEligible[i].peer == perPeerBlocksMetadata[j].peer {
  3311  					// NB(r): Take a ref to a subslice from the originally passed
  3312  					// slice as that is not mutated, whereas currEligible is reused
  3313  					retryFrom = perPeerBlocksMetadata[j : j+1]
  3314  				}
  3315  			}
  3316  			currEligible[i].block.reattempt.attempt++
  3317  			currEligible[i].block.reattempt.attempted =
  3318  				append(currEligible[i].block.reattempt.attempted, currEligible[i].peer)
  3319  			currEligible[i].block.reattempt.fanoutFetchState = fanoutFetchState
  3320  			currEligible[i].block.reattempt.retryPeersMetadata = retryFrom
  3321  			currEligible[i].block.reattempt.fetchedPeersMetadata = perPeerBlocksMetadata
  3322  		}
  3323  	}
  3325  	return currEligible, pooled
  3326  }
  3328  func (s *session) streamBlocksBatchFromPeer(
  3329  	namespaceMetadata namespace.Metadata,
  3330  	shard uint32,
  3331  	peer peer,
  3332  	batch []receivedBlockMetadata,
  3333  	opts result.Options,
  3334  	blocksResult blocksResult,
  3335  	enqueueCh enqueueChannel,
  3336  	retrier xretry.Retrier,
  3337  	m *streamFromPeersMetrics,
  3338  ) {
  3339  	// Prepare request
  3340  	var (
  3341  		req          = rpc.NewFetchBlocksRawRequest()
  3342  		result       *rpc.FetchBlocksRawResult_
  3343  		reqBlocksLen uint
  3345  		nowFn              = opts.ClockOptions().NowFn()
  3346  		ropts              = namespaceMetadata.Options().RetentionOptions()
  3347  		retention          = ropts.RetentionPeriod()
  3348  		earliestBlockStart = xtime.ToUnixNano(nowFn()).
  3349  					Add(-retention).
  3350  					Truncate(ropts.BlockSize())
  3351  	)
  3352  	req.NameSpace = namespaceMetadata.ID().Bytes()
  3353  	req.Shard = int32(shard)
  3354  	req.Elements = make([]*rpc.FetchBlocksRawRequestElement, 0, len(batch))
  3355  	for i := range batch {
  3356  		blockStart := batch[i].block.start
  3357  		if blockStart.Before(earliestBlockStart) {
  3358  			continue // Fell out of retention while we were streaming blocks
  3359  		}
  3360  		req.Elements = append(req.Elements, &rpc.FetchBlocksRawRequestElement{
  3361  			ID:     batch[i].id.Bytes(),
  3362  			Starts: []int64{int64(blockStart)},
  3363  		})
  3364  		reqBlocksLen++
  3365  	}
  3366  	if reqBlocksLen == 0 {
  3367  		// All blocks fell out of retention while streaming
  3368  		return
  3369  	}
  3371  	// Attempt request
  3372  	if err := retrier.Attempt(func() error {
  3373  		var attemptErr error
  3374  		borrowErr := peer.BorrowConnection(func(client rpc.TChanNode, _ Channel) {
  3375  			tctx, _ := thrift.NewContext(s.streamBlocksBatchTimeout)
  3376  			result, attemptErr = client.FetchBlocksRaw(tctx, req)
  3377  		})
  3378  		err := xerrors.FirstError(borrowErr, attemptErr)
  3379  		return err
  3380  	}); err != nil {
  3381  		blocksErr := fmt.Errorf(
  3382  			"stream blocks request error: error=%s, peer=%s",
  3383  			err.Error(), peer.Host().String(),
  3384  		)
  3385  		s.reattemptStreamBlocksFromPeersFn(batch, enqueueCh, blocksErr,
  3386  			reqErrReason, nextRetryReattemptType, m)
  3387  		m.fetchBlockError.Inc(int64(reqBlocksLen))
  3388  		s.log.Debug(blocksErr.Error())
  3389  		return
  3390  	}
  3392  	// Parse and act on result
  3393  	tooManyIDsLogged := false
  3394  	for i := range result.Elements {
  3395  		if i >= len(batch) {
  3396  			m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts)))
  3397  			m.fetchBlockFinalError.Inc(int64(len(req.Elements[i].Starts)))
  3398  			if !tooManyIDsLogged {
  3399  				tooManyIDsLogged = true
  3400  				s.log.Error("stream blocks more IDs than expected",
  3401  					zap.Stringer("peer", peer.Host()),
  3402  				)
  3403  			}
  3404  			continue
  3405  		}
  3407  		id := batch[i].id
  3408  		if !bytes.Equal(id.Bytes(), result.Elements[i].ID) {
  3409  			blocksErr := fmt.Errorf(
  3410  				"stream blocks mismatched ID: expectedID=%s, actualID=%s, indexID=%d, peer=%s",
  3411  				batch[i].id.String(), id.String(), i, peer.Host().String(),
  3412  			)
  3413  			failed := []receivedBlockMetadata{batch[i]}
  3414  			s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr,
  3415  				respErrReason, nextRetryReattemptType, m)
  3416  			m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts)))
  3417  			s.log.Debug(blocksErr.Error())
  3418  			continue
  3419  		}
  3421  		if len(result.Elements[i].Blocks) == 0 {
  3422  			// If fell out of retention during request this is healthy, otherwise
  3423  			// missing blocks will be repaired during an active repair
  3424  			continue
  3425  		}
  3427  		// We only ever fetch a single block for a series
  3428  		if len(result.Elements[i].Blocks) != 1 {
  3429  			errMsg := "stream blocks returned more blocks than expected"
  3430  			blocksErr := fmt.Errorf(errMsg+": expected=%d, actual=%d",
  3431  				1, len(result.Elements[i].Blocks))
  3432  			failed := []receivedBlockMetadata{batch[i]}
  3433  			s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr,
  3434  				respErrReason, nextRetryReattemptType, m)
  3435  			m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts)))
  3436  			s.log.Error(errMsg,
  3437  				zap.Stringer("id", id),
  3438  				zap.Times("expectedStarts", newTimesByUnixNanos(req.Elements[i].Starts)),
  3439  				zap.Times("actualStarts", newTimesByRPCBlocks(result.Elements[i].Blocks)),
  3440  				zap.Stringer("peer", peer.Host()),
  3441  			)
  3442  			continue
  3443  		}
  3445  		for j, block := range result.Elements[i].Blocks {
  3446  			if block.Start != int64(batch[i].block.start) {
  3447  				errMsg := "stream blocks returned different blocks than expected"
  3448  				blocksErr := fmt.Errorf(errMsg+": expected=%s, actual=%d",
  3449  					batch[i].block.start.String(), time.Unix(0, block.Start).String())
  3450  				failed := []receivedBlockMetadata{batch[i]}
  3451  				s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr,
  3452  					respErrReason, nextRetryReattemptType, m)
  3453  				m.fetchBlockError.Inc(int64(len(req.Elements[i].Starts)))
  3454  				s.log.Error(errMsg,
  3455  					zap.Stringer("id", id),
  3456  					zap.Times("expectedStarts", newTimesByUnixNanos(req.Elements[i].Starts)),
  3457  					zap.Times("actualStarts", newTimesByRPCBlocks(result.Elements[i].Blocks)),
  3458  					zap.Stringer("peer", peer.Host()),
  3459  				)
  3460  				continue
  3461  			}
  3463  			// Verify and if verify succeeds add the block from the peer
  3464  			err := s.verifyFetchedBlock(block)
  3465  			if err == nil {
  3466  				err = blocksResult.addBlockFromPeer(id, batch[i].encodedTags,
  3467  					peer.Host(), block)
  3468  			}
  3469  			if err != nil {
  3470  				failed := []receivedBlockMetadata{batch[i]}
  3471  				blocksErr := fmt.Errorf(
  3472  					"stream blocks bad block: id=%s, start=%d, error=%s, indexID=%d, indexBlock=%d, peer=%s",
  3473  					id.String(), block.Start, err.Error(), i, j, peer.Host().String())
  3474  				s.reattemptStreamBlocksFromPeersFn(failed, enqueueCh, blocksErr,
  3475  					respErrReason, nextRetryReattemptType, m)
  3476  				m.fetchBlockError.Inc(1)
  3477  				s.log.Debug(blocksErr.Error())
  3478  				continue
  3479  			}
  3481  			// NB(r): Track a fanned out block fetch success if added block
  3482  			fanout := batch[i].block.reattempt.fanoutFetchState
  3483  			if fanout != nil {
  3484  				fanout.incrementSuccess()
  3485  			}
  3487  			m.fetchBlockSuccess.Inc(1)
  3488  		}
  3489  	}
  3490  }
  3492  func (s *session) verifyFetchedBlock(block *rpc.Block) error {
  3493  	if block.Err != nil {
  3494  		return fmt.Errorf("block error from peer: %s %s", block.Err.Type.String(), block.Err.Message)
  3495  	}
  3496  	if block.Segments == nil {
  3497  		return fmt.Errorf("block segments is bad: segments is nil")
  3498  	}
  3499  	if block.Segments.Merged == nil && len(block.Segments.Unmerged) == 0 {
  3500  		return fmt.Errorf("block segments is bad: merged and unmerged not set")
  3501  	}
  3503  	if checksum := block.Checksum; checksum != nil {
  3504  		var (
  3505  			d        = digest.NewDigest()
  3506  			expected = uint32(*checksum)
  3507  		)
  3508  		if merged := block.Segments.Merged; merged != nil {
  3509  			d = d.Update(merged.Head).Update(merged.Tail)
  3510  		} else {
  3511  			for _, s := range block.Segments.Unmerged {
  3512  				d = d.Update(s.Head).Update(s.Tail)
  3513  			}
  3514  		}
  3515  		if actual := d.Sum32(); actual != expected {
  3516  			return fmt.Errorf("block checksum is bad: expected=%d, actual=%d", expected, actual)
  3517  		}
  3518  	}
  3520  	return nil
  3521  }
// cloneFinalizable returns id unchanged when id.IsNoFinalize() reports true.
//
// NOTE(review): the final return statement has no operand even though the
// function declares an ident.ID result, so this does not compile as written.
// The expression appears to have been lost; presumably it should return a copy
// of id that is independent of the original — restore it from the upstream
// source before relying on this function.
func (s *session) cloneFinalizable(id ident.ID) ident.ID {
	if id.IsNoFinalize() {
		return id
	}
	return
}
  3530  func (s *session) nsCtxFromMetadata(nsMeta namespace.Metadata) (namespace.Context, error) {
  3531  	nsCtx := namespace.NewContextFrom(nsMeta)
  3532  	if s.opts.IsSetEncodingProto() && nsCtx.Schema == nil {
  3533  		return nsCtx, fmt.Errorf("no protobuf schema found for namespace: %s", nsMeta.ID().String())
  3534  	}
  3535  	return nsCtx, nil
  3536  }
  3538  func (s *session) nsCtxFor(ns ident.ID) (namespace.Context, error) {
  3539  	nsCtx := namespace.NewContextFor(ns, s.opts.SchemaRegistry())
  3540  	if s.opts.IsSetEncodingProto() && nsCtx.Schema == nil {
  3541  		return nsCtx, fmt.Errorf("no protobuf schema found for namespace: %s", ns.String())
  3542  	}
  3543  	return nsCtx, nil
  3544  }
// reason identifies why a block fetch is being reattempted. It selects which
// retry counter is incremented in streamBlocksReattemptFromPeers.
type reason int

const (
	// reqErrReason is used when the fetch request to the peer failed.
	reqErrReason reason = iota
	// respErrReason is used when the peer responded but an element of the
	// response was rejected.
	respErrReason
	// consistencyLevelNotAchievedErrReason is used when a block is re-enqueued
	// because the required consistency level has not been met.
	consistencyLevelNotAchievedErrReason
)
// reattemptType selects how a failed block is re-enqueued by
// streamBlocksReattemptFromPeersEnqueue.
type reattemptType int

const (
	// nextRetryReattemptType re-enqueues using the block's retryPeersMetadata
	// and carries the existing reattempt state forward with the new error
	// appended.
	nextRetryReattemptType reattemptType = iota
	// fullRetryReattemptType re-enqueues using the block's fetchedPeersMetadata
	// with a zero-valued reattempt state.
	fullRetryReattemptType
)
// reattemptStreamBlocksFromPeersFn is the function type used to schedule a
// reattempt for a set of blocks. Its parameters are, in order: the blocks to
// reattempt, the channel to re-enqueue them on, the error that caused the
// reattempt, the reason, the reattempt type and the metrics to record against.
// session.streamBlocksReattemptFromPeers has this signature.
type reattemptStreamBlocksFromPeersFn func(
	[]receivedBlockMetadata,
	enqueueChannel,
	error,
	reason,
	reattemptType,
	*streamFromPeersMetrics,
) error
  3570  func (s *session) streamBlocksReattemptFromPeers(
  3571  	blocks []receivedBlockMetadata,
  3572  	enqueueCh enqueueChannel,
  3573  	attemptErr error,
  3574  	reason reason,
  3575  	reattemptType reattemptType,
  3576  	m *streamFromPeersMetrics,
  3577  ) error {
  3578  	switch reason {
  3579  	case reqErrReason:
  3580  		m.fetchBlockRetriesReqError.Inc(int64(len(blocks)))
  3581  	case respErrReason:
  3582  		m.fetchBlockRetriesRespError.Inc(int64(len(blocks)))
  3583  	case consistencyLevelNotAchievedErrReason:
  3584  		m.fetchBlockRetriesConsistencyLevelNotAchievedError.Inc(int64(len(blocks)))
  3585  	}
  3587  	// Must do this asynchronously or else could get into a deadlock scenario
  3588  	// where cannot enqueue into the reattempt channel because no more work is
  3589  	// getting done because new attempts are blocked on existing attempts completing
  3590  	// and existing attempts are trying to enqueue into a full reattempt channel
  3591  	enqueue, done, err := enqueueCh.enqueueDelayed(len(blocks))
  3592  	if err != nil {
  3593  		return err
  3594  	}
  3595  	go s.streamBlocksReattemptFromPeersEnqueue(blocks, attemptErr, reattemptType,
  3596  		enqueue, done)
  3597  	return nil
  3598  }
  3600  func (s *session) streamBlocksReattemptFromPeersEnqueue(
  3601  	blocks []receivedBlockMetadata,
  3602  	attemptErr error,
  3603  	reattemptType reattemptType,
  3604  	enqueueFn enqueueDelayedFn,
  3605  	enqueueDoneFn enqueueDelayedDoneFn,
  3606  ) {
  3607  	// NB(r): Notify the delayed enqueue is done.
  3608  	defer enqueueDoneFn()
  3610  	for i := range blocks {
  3611  		var reattemptPeersMetadata []receivedBlockMetadata
  3612  		switch reattemptType {
  3613  		case nextRetryReattemptType:
  3614  			reattemptPeersMetadata = blocks[i].block.reattempt.retryPeersMetadata
  3615  		case fullRetryReattemptType:
  3616  			reattemptPeersMetadata = blocks[i].block.reattempt.fetchedPeersMetadata
  3617  		}
  3618  		if len(reattemptPeersMetadata) == 0 {
  3619  			continue
  3620  		}
  3622  		// Reconstruct peers metadata for reattempt
  3623  		reattemptBlocksMetadata := make([]receivedBlockMetadata, len(reattemptPeersMetadata))
  3624  		for j := range reattemptPeersMetadata {
  3625  			var reattempt blockMetadataReattempt
  3626  			if reattemptType == nextRetryReattemptType {
  3627  				// Only if a default type of retry do we want to actually want
  3628  				// to set all the retry metadata, otherwise this re-enqueued metadata
  3629  				// should start fresh
  3630  				reattempt = blocks[i].block.reattempt
  3632  				// Copy the errors for every peer so they don't shard the same error
  3633  				// slice and therefore are not subject to race conditions when the
  3634  				// error slice is modified
  3635  				reattemptErrs := make([]error, len(reattempt.errs)+1)
  3636  				n := copy(reattemptErrs, reattempt.errs)
  3637  				reattemptErrs[n] = attemptErr
  3638  				reattempt.errs = reattemptErrs
  3639  			}
  3641  			reattemptBlocksMetadata[j] = receivedBlockMetadata{
  3642  				peer: reattemptPeersMetadata[j].peer,
  3643  				id:   blocks[i].id,
  3644  				block: blockMetadata{
  3645  					start:     reattemptPeersMetadata[j].block.start,
  3646  					size:      reattemptPeersMetadata[j].block.size,
  3647  					checksum:  reattemptPeersMetadata[j].block.checksum,
  3648  					reattempt: reattempt,
  3649  				},
  3650  			}
  3651  		}
  3653  		// Re-enqueue the block to be fetched from all peers requested
  3654  		// to reattempt from
  3655  		enqueueFn(reattemptBlocksMetadata)
  3656  	}
  3657  }
// blocksResult receives blocks fetched from peers. In this file it is
// implemented by streamBlocksResult and bulkBlocksResult.
type blocksResult interface {
	// addBlockFromPeer accepts a block for series id that was fetched from
	// peer, together with the series' encoded tags. A non-nil error means the
	// block was not accepted.
	addBlockFromPeer(
		id ident.ID,
		encodedTags checked.Bytes,
		peer topology.Host,
		block *rpc.Block,
	) error
}
// baseBlocksResult holds the options and pools shared by the blocksResult
// implementations for turning rpc blocks into database blocks.
type baseBlocksResult struct {
	// nsCtx is passed to blocks on Reset and supplies the schema used when
	// merging readers.
	nsCtx namespace.Context
	// blockOpts provides the bytes, database block and segment reader pools.
	blockOpts block.Options
	// blockAllocSize is passed to the encoder on Reset when merging readers.
	blockAllocSize int
	// contextPool provides the temporary contexts used while streaming blocks
	// for a merge.
	contextPool context.Pool
	// encoderPool provides the encoder that merged data is written into.
	encoderPool encoding.EncoderPool
	// multiReaderIteratorPool provides the iterator used to read across
	// several segment readers when merging.
	multiReaderIteratorPool encoding.MultiReaderIteratorPool
}
  3677  func newBaseBlocksResult(
  3678  	nsCtx namespace.Context,
  3679  	opts Options,
  3680  	resultOpts result.Options,
  3681  ) baseBlocksResult {
  3682  	blockOpts := resultOpts.DatabaseBlockOptions()
  3683  	return baseBlocksResult{
  3684  		nsCtx:                   nsCtx,
  3685  		blockOpts:               blockOpts,
  3686  		blockAllocSize:          blockOpts.DatabaseBlockAllocSize(),
  3687  		contextPool:             opts.ContextPool(),
  3688  		encoderPool:             blockOpts.EncoderPool(),
  3689  		multiReaderIteratorPool: blockOpts.MultiReaderIteratorPool(),
  3690  	}
  3691  }
  3693  func (b *baseBlocksResult) segmentForBlock(seg *rpc.Segment) ts.Segment {
  3694  	var (
  3695  		bytesPool  = b.blockOpts.BytesPool()
  3696  		head, tail checked.Bytes
  3697  	)
  3698  	if len(seg.Head) > 0 {
  3699  		head = bytesPool.Get(len(seg.Head))
  3700  		head.IncRef()
  3701  		head.AppendAll(seg.Head)
  3702  		head.DecRef()
  3703  	}
  3704  	if len(seg.Tail) > 0 {
  3705  		tail = bytesPool.Get(len(seg.Tail))
  3706  		tail.IncRef()
  3707  		tail.AppendAll(seg.Tail)
  3708  		tail.DecRef()
  3709  	}
  3710  	var checksum uint32
  3711  	if seg.Checksum != nil {
  3712  		checksum = uint32(*seg.Checksum)
  3713  	}
  3715  	return ts.NewSegment(head, tail, checksum, ts.FinalizeHead&ts.FinalizeTail)
  3716  }
  3718  func (b *baseBlocksResult) mergeReaders(
  3719  	start xtime.UnixNano, blockSize time.Duration, readers []xio.SegmentReader,
  3720  ) (encoding.Encoder, error) {
  3721  	iter := b.multiReaderIteratorPool.Get()
  3722  	iter.Reset(readers, start, blockSize, b.nsCtx.Schema)
  3723  	defer iter.Close()
  3725  	encoder := b.encoderPool.Get()
  3726  	encoder.Reset(start, b.blockAllocSize, b.nsCtx.Schema)
  3728  	for iter.Next() {
  3729  		dp, unit, annotation := iter.Current()
  3730  		if err := encoder.Encode(dp, unit, annotation); err != nil {
  3731  			encoder.Close()
  3732  			return nil, err
  3733  		}
  3734  	}
  3735  	if err := iter.Err(); err != nil {
  3736  		encoder.Close()
  3737  		return nil, err
  3738  	}
  3740  	return encoder, nil
  3741  }
  3743  func (b *baseBlocksResult) newDatabaseBlock(block *rpc.Block) (block.DatabaseBlock, error) {
  3744  	var (
  3745  		start    = xtime.UnixNano(block.Start)
  3746  		segments = block.Segments
  3747  		result   = b.blockOpts.DatabaseBlockPool().Get()
  3748  	)
  3750  	if segments == nil {
  3751  		result.Close() // return block to pool
  3752  		return nil, errSessionBadBlockResultFromPeer
  3753  	}
  3755  	switch {
  3756  	case segments.Merged != nil:
  3757  		// Unmerged, can insert directly into a single block
  3758  		mergedBlock := segments.Merged
  3759  		result.Reset(
  3760  			start,
  3761  			durationConvert(mergedBlock.BlockSize),
  3762  			b.segmentForBlock(mergedBlock),
  3763  			b.nsCtx,
  3764  		)
  3766  	case segments.Unmerged != nil:
  3767  		// Must merge to provide a single block
  3768  		segmentReaderPool := b.blockOpts.SegmentReaderPool()
  3769  		readers := make([]xio.SegmentReader, len(segments.Unmerged))
  3771  		blockSize := time.Duration(0)
  3772  		for i, seg := range segments.Unmerged {
  3773  			segmentReader := segmentReaderPool.Get()
  3774  			segmentReader.Reset(b.segmentForBlock(seg))
  3775  			readers[i] = segmentReader
  3777  			bs := durationConvert(seg.BlockSize)
  3778  			if bs > blockSize {
  3779  				blockSize = bs
  3780  			}
  3781  		}
  3782  		encoder, err := b.mergeReaders(start, blockSize, readers)
  3783  		for _, reader := range readers {
  3784  			// Close each reader
  3785  			reader.Finalize()
  3786  		}
  3788  		if err != nil {
  3789  			// mergeReaders(...) already calls encoder.Close() upon error
  3790  			result.Close() // return block to pool
  3791  			return nil, err
  3792  		}
  3794  		// Set the block data
  3795  		result.Reset(start, blockSize, encoder.Discard(), b.nsCtx)
  3797  	default:
  3798  		result.Close() // return block to pool
  3799  		return nil, errSessionBadBlockResultFromPeer
  3800  	}
  3802  	return result, nil
  3803  }
// Ensure streamBlocksResult implements blocksResult
var _ blocksResult = (*streamBlocksResult)(nil)

// streamBlocksResult is a blocksResult that forwards every accepted block, with
// its decoded tags, onto an output channel.
type streamBlocksResult struct {
	baseBlocksResult
	// outputCh receives one peerBlocksDatapoint per accepted block.
	outputCh chan<- peerBlocksDatapoint
	// tagDecoderPool and idPool are passed to newTagsFromEncodedTags when
	// decoding a series' tags.
	tagDecoderPool serialize.TagDecoderPool
	idPool         ident.Pool
	// nsCtx shadows the nsCtx field of the embedded baseBlocksResult; the
	// constructor sets both from the same value.
	nsCtx namespace.Context
}
  3816  func newStreamBlocksResult(
  3817  	nsCtx namespace.Context,
  3818  	opts Options,
  3819  	resultOpts result.Options,
  3820  	outputCh chan<- peerBlocksDatapoint,
  3821  	tagDecoderPool serialize.TagDecoderPool,
  3822  	idPool ident.Pool,
  3823  ) *streamBlocksResult {
  3824  	return &streamBlocksResult{
  3825  		nsCtx:            nsCtx,
  3826  		baseBlocksResult: newBaseBlocksResult(nsCtx, opts, resultOpts),
  3827  		outputCh:         outputCh,
  3828  		tagDecoderPool:   tagDecoderPool,
  3829  		idPool:           idPool,
  3830  	}
  3831  }
// peerBlocksDatapoint is a single block fetched from a peer, as sent on the
// streamBlocksResult output channel and yielded by peerBlocksIter.
type peerBlocksDatapoint struct {
	// id is the series the block belongs to.
	id ident.ID
	// tags are the series' tags, decoded from the encoded tags.
	tags ident.Tags
	// peer is the host the block was fetched from.
	peer topology.Host
	// block is the database block built from the peer's response.
	block block.DatabaseBlock
}
  3840  func (s *streamBlocksResult) addBlockFromPeer(
  3841  	id ident.ID,
  3842  	encodedTags checked.Bytes,
  3843  	peer topology.Host,
  3844  	block *rpc.Block,
  3845  ) error {
  3846  	result, err := s.newDatabaseBlock(block)
  3847  	if err != nil {
  3848  		return err
  3849  	}
  3850  	tags, err := newTagsFromEncodedTags(id, encodedTags,
  3851  		s.tagDecoderPool, s.idPool)
  3852  	if err != nil {
  3853  		return err
  3854  	}
  3855  	s.outputCh <- peerBlocksDatapoint{
  3856  		id:    id,
  3857  		tags:  tags,
  3858  		peer:  peer,
  3859  		block: result,
  3860  	}
  3861  	return nil
  3862  }
// peerBlocksIter iterates over the blocks received on a channel of
// peerBlocksDatapoint values.
type peerBlocksIter struct {
	// inputCh delivers the blocks; iteration ends when it is closed.
	inputCh <-chan peerBlocksDatapoint
	// errCh is read once, after inputCh has been closed, to obtain the final
	// error.
	errCh <-chan error
	// current is the value most recently received from inputCh.
	current peerBlocksDatapoint
	// err is the value read from errCh; it is returned by Err.
	err error
	// done is set once inputCh has been observed closed.
	done bool
}
  3872  func newPeerBlocksIter(
  3873  	inputC <-chan peerBlocksDatapoint,
  3874  	errC <-chan error,
  3875  ) *peerBlocksIter {
  3876  	return &peerBlocksIter{
  3877  		inputCh: inputC,
  3878  		errCh:   errC,
  3879  	}
  3880  }
  3882  func (it *peerBlocksIter) Current() (topology.Host, ident.ID, ident.Tags, block.DatabaseBlock) {
  3883  	return it.current.peer,, it.current.tags, it.current.block
  3884  }
  3886  func (it *peerBlocksIter) Err() error {
  3887  	return it.err
  3888  }
  3890  func (it *peerBlocksIter) Next() bool {
  3891  	if it.done || it.err != nil {
  3892  		return false
  3893  	}
  3894  	m, more := <-it.inputCh
  3896  	if !more {
  3897  		it.err = <-it.errCh
  3898  		it.done = true
  3899  		return false
  3900  	}
  3902  	it.current = m
  3903  	return true
  3904  }
// Ensure bulkBlocksResult implements blocksResult
var _ blocksResult = (*bulkBlocksResult)(nil)

// bulkBlocksResult is a blocksResult that accumulates accepted blocks into a
// shard result, merging blocks that arrive for the same series and block start.
type bulkBlocksResult struct {
	// The embedded mutex guards access to result.
	sync.RWMutex
	baseBlocksResult
	// result is the shard result that blocks are added to.
	result result.ShardResult
	// tagDecoderPool and idPool are passed to newTagsFromEncodedTags when
	// decoding a series' tags.
	tagDecoderPool serialize.TagDecoderPool
	idPool         ident.Pool
	// nsCtx shadows the nsCtx field of the embedded baseBlocksResult; the
	// constructor sets both from the same value.
	nsCtx namespace.Context
}
  3918  func newBulkBlocksResult(
  3919  	nsCtx namespace.Context,
  3920  	opts Options,
  3921  	resultOpts result.Options,
  3922  	tagDecoderPool serialize.TagDecoderPool,
  3923  	idPool ident.Pool,
  3924  ) *bulkBlocksResult {
  3925  	return &bulkBlocksResult{
  3926  		nsCtx:            nsCtx,
  3927  		baseBlocksResult: newBaseBlocksResult(nsCtx, opts, resultOpts),
  3928  		result:           result.NewShardResult(resultOpts),
  3929  		tagDecoderPool:   tagDecoderPool,
  3930  		idPool:           idPool,
  3931  	}
  3932  }
  3934  func (r *bulkBlocksResult) addBlockFromPeer(
  3935  	id ident.ID,
  3936  	encodedTags checked.Bytes,
  3937  	peer topology.Host,
  3938  	block *rpc.Block,
  3939  ) error {
  3940  	start := xtime.UnixNano(block.Start)
  3941  	result, err := r.newDatabaseBlock(block)
  3942  	if err != nil {
  3943  		return err
  3944  	}
  3946  	var (
  3947  		tags                ident.Tags
  3948  		attemptedDecodeTags bool
  3949  	)
  3950  	for {
  3951  		r.Lock()
  3952  		currBlock, exists := r.result.BlockAt(id, start)
  3953  		if !exists {
  3954  			if encodedTags == nil || attemptedDecodeTags {
  3955  				r.result.AddBlock(id, tags, result)
  3956  				r.Unlock()
  3957  				break
  3958  			}
  3959  			r.Unlock()
  3961  			// Tags not decoded yet, attempt decoded and then reinsert
  3962  			attemptedDecodeTags = true
  3963  			tags, err = newTagsFromEncodedTags(id, encodedTags,
  3964  				r.tagDecoderPool, r.idPool)
  3965  			if err != nil {
  3966  				return err
  3967  			}
  3968  			continue
  3969  		}
  3971  		// Remove the existing block from the result so it doesn't get
  3972  		// merged again
  3973  		r.result.RemoveBlockAt(id, start)
  3974  		r.Unlock()
  3976  		// If we've already received data for this block, merge them
  3977  		// with the new block if possible
  3978  		tmpCtx := r.contextPool.Get()
  3979  		currReader, err := currBlock.Stream(tmpCtx)
  3980  		if err != nil {
  3981  			return err
  3982  		}
  3984  		// If there are no data in the current block, there is no
  3985  		// need to merge
  3986  		if currReader.IsEmpty() {
  3987  			continue
  3988  		}
  3990  		resultReader, err := result.Stream(tmpCtx)
  3991  		if err != nil {
  3992  			return err
  3993  		}
  3994  		if resultReader.IsEmpty() {
  3995  			return nil
  3996  		}
  3998  		readers := []xio.SegmentReader{currReader.SegmentReader, resultReader.SegmentReader}
  3999  		blockSize := currReader.BlockSize
  4001  		encoder, err := r.mergeReaders(start, blockSize, readers)
  4003  		if err != nil {
  4004  			return err
  4005  		}
  4007  		result.Close()
  4009  		result = r.blockOpts.DatabaseBlockPool().Get()
  4010  		result.Reset(start, blockSize, encoder.Discard(), r.nsCtx)
  4012  		tmpCtx.Close()
  4013  	}
  4015  	return nil
  4016  }
// enqueueCh is a channel of per-peer block metadata with bookkeeping that lets
// it be closed once everything enqueued has been processed and no send is in
// flight. The embedded mutex guards the counters and the closed flag.
type enqueueCh struct {
	sync.Mutex
	// sending counts enqueue operations that have been admitted but have not
	// yet finished sending.
	sending int
	// enqueued is the running total of items announced for processing.
	enqueued int
	// processed is the running total of items reported via trackProcessed.
	processed int
	// peersMetadataCh carries the enqueued metadata.
	peersMetadataCh chan []receivedBlockMetadata
	// closed is set once peersMetadataCh has been closed.
	closed bool
	// enqueueDelayedFn and enqueueDelayedDoneFn are allocated once by the
	// constructor and handed out by enqueueDelayed.
	enqueueDelayedFn     enqueueDelayedFn
	enqueueDelayedDoneFn enqueueDelayedDoneFn
	// metrics is the metrics value supplied at construction.
	metrics *streamFromPeersMetrics
}
// enqueueChannelDefaultLen is the buffer size of the channel that holds series
// ready to be fetched from other peers.
// It was reduced from 32k to 512 because each queued element is quite large,
// and a 32k capacity used significant memory under high shard concurrency.
const enqueueChannelDefaultLen = 512
  4037  func newEnqueueChannel(m *streamFromPeersMetrics) enqueueChannel {
  4038  	c := &enqueueCh{
  4039  		peersMetadataCh: make(chan []receivedBlockMetadata, enqueueChannelDefaultLen),
  4040  		metrics:         m,
  4041  	}
  4043  	// Allocate the enqueue delayed fn just once
  4044  	c.enqueueDelayedFn = func(peersMetadata []receivedBlockMetadata) {
  4045  		c.peersMetadataCh <- peersMetadata
  4046  	}
  4047  	c.enqueueDelayedDoneFn = func() {
  4048  		c.Lock()
  4049  		c.sending--
  4050  		c.Unlock()
  4051  	}
  4053  	go func() {
  4054  		for {
  4055  			c.Lock()
  4056  			closed := c.closed
  4057  			numEnqueued := float64(len(c.peersMetadataCh))
  4058  			c.Unlock()
  4059  			if closed {
  4060  				return
  4061  			}
  4062  			m.blocksEnqueueChannel.Update(numEnqueued)
  4063  			time.Sleep(gaugeReportInterval)
  4064  		}
  4065  	}()
  4066  	return c
  4067  }
  4069  func (c *enqueueCh) enqueue(peersMetadata []receivedBlockMetadata) error {
  4070  	c.Lock()
  4071  	if c.closed {
  4072  		c.Unlock()
  4073  		return errEnqueueChIsClosed
  4074  	}
  4075  	c.enqueued++
  4076  	c.sending++
  4077  	c.Unlock()
  4078  	c.peersMetadataCh <- peersMetadata
  4079  	c.Lock()
  4080  	c.sending--
  4081  	c.Unlock()
  4082  	return nil
  4083  }
  4085  func (c *enqueueCh) enqueueDelayed(numToEnqueue int) (enqueueDelayedFn, enqueueDelayedDoneFn, error) {
  4086  	c.Lock()
  4087  	if c.closed {
  4088  		c.Unlock()
  4089  		return nil, nil, errEnqueueChIsClosed
  4090  	}
  4091  	c.sending++ // NB(r): This is decremented by calling the returned enqueue done function
  4092  	c.enqueued += numToEnqueue
  4093  	c.Unlock()
  4094  	return c.enqueueDelayedFn, c.enqueueDelayedDoneFn, nil
  4095  }
  4097  // read is always safe to call since you can safely range
  4098  // over a closed channel, and/or do a checked read in case
  4099  // it is closed (unlike when publishing to a channel).
  4100  func (c *enqueueCh) read() <-chan []receivedBlockMetadata {
  4101  	return c.peersMetadataCh
  4102  }
  4104  func (c *enqueueCh) trackPending(amount int) {
  4105  	c.Lock()
  4106  	c.enqueued += amount
  4107  	c.Unlock()
  4108  }
  4110  func (c *enqueueCh) trackProcessed(amount int) {
  4111  	c.Lock()
  4112  	c.processed += amount
  4113  	c.Unlock()
  4114  }
  4116  func (c *enqueueCh) unprocessedLen() int {
  4117  	c.Lock()
  4118  	unprocessed := c.unprocessedLenWithLock()
  4119  	c.Unlock()
  4120  	return unprocessed
  4121  }
  4123  func (c *enqueueCh) unprocessedLenWithLock() int {
  4124  	return c.enqueued - c.processed
  4125  }
  4127  func (c *enqueueCh) closeOnAllProcessed() {
  4128  	for {
  4129  		c.Lock()
  4130  		if c.unprocessedLenWithLock() == 0 && c.sending == 0 {
  4131  			close(c.peersMetadataCh)
  4132  			c.closed = true
  4133  			c.Unlock()
  4134  			return
  4135  		}
  4136  		c.Unlock()
  4137  		time.Sleep(100 * time.Millisecond)
  4138  	}
  4139  }
// receivedBlocks groups block metadata results with a flag recording whether
// they have been enqueued.
// NOTE(review): the users of this type are outside this part of the file —
// confirm the exact meaning of enqueued against them.
type receivedBlocks struct {
	enqueued bool
	results  []receivedBlockMetadata
}
// processFn handles one drained batch of block metadata; a peerBlocksQueue
// invokes it on its worker pool with everything queued since the last drain.
type processFn func(batch []receivedBlockMetadata)
// peerBlocksQueue is a per peer queue of blocks to be retrieved from a peer.
// Queued blocks are handed to processFn in batches, either when the queue
// reaches maxQueueSize or when it is drained. The embedded mutex guards closed,
// queue and doneFns.
type peerBlocksQueue struct {
	sync.RWMutex
	// closed stops the periodic drain loop once set.
	closed bool
	// peer is the peer this queue belongs to.
	peer peer
	// queue holds the blocks waiting for the next drain.
	queue []receivedBlockMetadata
	// doneFns are the callbacks to invoke after the next drained batch has
	// been processed.
	doneFns []func()
	// assigned and completed count blocks enqueued and blocks processed; both
	// are updated atomically.
	assigned  uint64
	completed uint64
	// maxQueueSize is the queue length at which enqueue drains immediately.
	maxQueueSize int
	// workers runs the processing of each drained batch.
	workers xsync.WorkerPool
	// processFn is called with each drained batch.
	processFn processFn
}
// newPeerBlocksQueueFn is the constructor signature for a peerBlocksQueue;
// newPeerBlocksQueue has this signature.
type newPeerBlocksQueueFn func(
	peer peer,
	maxQueueSize int,
	interval time.Duration,
	workers xsync.WorkerPool,
	processFn processFn,
) *peerBlocksQueue
  4170  func newPeerBlocksQueue(
  4171  	peer peer,
  4172  	maxQueueSize int,
  4173  	interval time.Duration,
  4174  	workers xsync.WorkerPool,
  4175  	processFn processFn,
  4176  ) *peerBlocksQueue {
  4177  	q := &peerBlocksQueue{
  4178  		peer:         peer,
  4179  		maxQueueSize: maxQueueSize,
  4180  		workers:      workers,
  4181  		processFn:    processFn,
  4182  	}
  4183  	if interval > 0 {
  4184  		go q.drainEvery(interval)
  4185  	}
  4186  	return q
  4187  }
  4189  func (q *peerBlocksQueue) drainEvery(interval time.Duration) {
  4190  	for {
  4191  		q.Lock()
  4192  		if q.closed {
  4193  			q.Unlock()
  4194  			return
  4195  		}
  4196  		q.drainWithLock()
  4197  		q.Unlock()
  4198  		time.Sleep(interval)
  4199  	}
  4200  }
  4202  func (q *peerBlocksQueue) close() {
  4203  	q.Lock()
  4204  	defer q.Unlock()
  4205  	q.closed = true
  4206  }
  4208  func (q *peerBlocksQueue) trackAssigned(amount int) {
  4209  	atomic.AddUint64(&q.assigned, uint64(amount))
  4210  }
  4212  func (q *peerBlocksQueue) trackCompleted(amount int) {
  4213  	atomic.AddUint64(&q.completed, uint64(amount))
  4214  }
  4216  func (q *peerBlocksQueue) enqueue(bl receivedBlockMetadata, doneFn func()) {
  4217  	q.Lock()
  4219  	if len(q.queue) == 0 && cap(q.queue) < q.maxQueueSize {
  4220  		// Lazy initialize queue
  4221  		q.queue = make([]receivedBlockMetadata, 0, q.maxQueueSize)
  4222  	}
  4223  	if len(q.doneFns) == 0 && cap(q.doneFns) < q.maxQueueSize {
  4224  		// Lazy initialize doneFns
  4225  		q.doneFns = make([]func(), 0, q.maxQueueSize)
  4226  	}
  4227  	q.queue = append(q.queue, bl)
  4228  	if doneFn != nil {
  4229  		q.doneFns = append(q.doneFns, doneFn)
  4230  	}
  4231  	q.trackAssigned(1)
  4233  	// Determine if should drain immediately
  4234  	if len(q.queue) < q.maxQueueSize {
  4235  		// Require more to fill up block
  4236  		q.Unlock()
  4237  		return
  4238  	}
  4239  	q.drainWithLock()
  4241  	q.Unlock()
  4242  }
  4244  func (q *peerBlocksQueue) drain() {
  4245  	q.Lock()
  4246  	q.drainWithLock()
  4247  	q.Unlock()
  4248  }
  4250  func (q *peerBlocksQueue) drainWithLock() {
  4251  	if len(q.queue) == 0 {
  4252  		// None to drain
  4253  		return
  4254  	}
  4255  	enqueued := q.queue
  4256  	doneFns := q.doneFns
  4257  	q.queue = nil
  4258  	q.doneFns = nil
  4259  	q.workers.Go(func() {
  4260  		q.processFn(enqueued)
  4261  		// Call done callbacks
  4262  		for i := range doneFns {
  4263  			doneFns[i]()
  4264  		}
  4265  		// Track completed blocks
  4266  		q.trackCompleted(len(enqueued))
  4267  	})
  4268  }
// peerBlocksQueues is a list of per-peer block queues.
type peerBlocksQueues []*peerBlocksQueue
  4272  func (qs peerBlocksQueues) findQueue(peer peer) *peerBlocksQueue {
  4273  	for _, q := range qs {
  4274  		if q.peer == peer {
  4275  			return q
  4276  		}
  4277  	}
  4278  	return nil
  4279  }
  4281  func (qs peerBlocksQueues) closeAll() {
  4282  	for _, q := range qs {
  4283  		q.close()
  4284  	}
  4285  }
// receivedBlockMetadata is a single block metadata entry together with the
// peer it is associated with and the series it belongs to.
type receivedBlockMetadata struct {
	// peer is the peer associated with this entry.
	peer peer
	// id is the series ID.
	id ident.ID
	// encodedTags holds the series tags in encoded form; it may be nil, in
	// which case newTagsFromEncodedTags yields empty tags.
	encodedTags checked.Bytes
	// block carries the per-block details and reattempt bookkeeping.
	block blockMetadata
}
  4294  type receivedBlockMetadatas []receivedBlockMetadata
  4296  func (arr receivedBlockMetadatas) swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
  4298  type peerBlockMetadataByID []receivedBlockMetadata
  4300  func (arr peerBlockMetadataByID) Len() int      { return len(arr) }
  4301  func (arr peerBlockMetadataByID) Swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
  4302  func (arr peerBlockMetadataByID) Less(i, j int) bool {
  4303  	return strings.Compare(arr[i].peer.Host().ID(), arr[j].peer.Host().ID()) < 0
  4304  }
// receivedBlockMetadataQueue pairs a received block metadata entry with a
// peer blocks queue.
type receivedBlockMetadataQueue struct {
	// blockMetadata is the metadata entry of the pair.
	blockMetadata receivedBlockMetadata
	// queue is the peer blocks queue of the pair.
	queue *peerBlocksQueue
}
  4311  type receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc []receivedBlockMetadataQueue
  4313  func (arr receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc) Len() int {
  4314  	return len(arr)
  4315  }
  4316  func (arr receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc) Swap(i, j int) {
  4317  	arr[i], arr[j] = arr[j], arr[i]
  4318  }
  4319  func (arr receivedBlockMetadataQueuesByAttemptsAscOutstandingAsc) Less(i, j int) bool {
  4320  	peerI := arr[i].queue.peer
  4321  	peerJ := arr[j].queue.peer
  4322  	attemptsI := arr[i].blockMetadata.block.reattempt.peerAttempts(peerI)
  4323  	attemptsJ := arr[j].blockMetadata.block.reattempt.peerAttempts(peerJ)
  4324  	if attemptsI != attemptsJ {
  4325  		return attemptsI < attemptsJ
  4326  	}
  4328  	outstandingI :=
  4329  		atomic.LoadUint64(&arr[i].queue.assigned) -
  4330  			atomic.LoadUint64(&arr[i].queue.completed)
  4331  	outstandingJ :=
  4332  		atomic.LoadUint64(&arr[j].queue.assigned) -
  4333  			atomic.LoadUint64(&arr[j].queue.completed)
  4334  	return outstandingI < outstandingJ
  4335  }
// blockMetadata describes a single block plus the bookkeeping used when
// fetching it is reattempted. The start, size, checksum and lastRead fields
// are the values handed to block.NewMetadata by metadataIter.
type blockMetadata struct {
	// start is the block start time.
	start xtime.UnixNano
	// size is the block size (presumably in bytes — confirm against callers).
	size int64
	// checksum is an optional checksum; a nil pointer presumably means none
	// is available — confirm against callers.
	checksum *uint32
	// lastRead is the last read time of the block.
	lastRead xtime.UnixNano
	// reattempt tracks fetch attempts made for this block.
	reattempt blockMetadataReattempt
}
// blockMetadataReattempt holds the state kept across attempts to fetch a
// block.
type blockMetadataReattempt struct {
	// attempt is an attempt counter (maintained outside this section).
	attempt int
	// fanoutFetchState is shared pending/success counters for a fanout fetch.
	fanoutFetchState *blockFanoutFetchState
	// attempted lists peers already tried; a peer may appear more than once
	// and peerAttempts counts its occurrences.
	attempted []peer
	// errs collects errors (presumably one per failed attempt — confirm).
	errs []error
	// retryPeersMetadata and fetchedPeersMetadata are per-peer metadata lists
	// maintained outside this section.
	retryPeersMetadata   []receivedBlockMetadata
	fetchedPeersMetadata []receivedBlockMetadata
}
  4354  type blockFanoutFetchState struct {
  4355  	numPending int32
  4356  	numSuccess int32
  4357  }
  4359  func newBlockFanoutFetchState(
  4360  	pending int,
  4361  ) *blockFanoutFetchState {
  4362  	return &blockFanoutFetchState{
  4363  		numPending: int32(pending),
  4364  	}
  4365  }
  4367  func (s *blockFanoutFetchState) success() int {
  4368  	return int(atomic.LoadInt32(&s.numSuccess))
  4369  }
  4371  func (s *blockFanoutFetchState) incrementSuccess() {
  4372  	atomic.AddInt32(&s.numSuccess, 1)
  4373  }
  4375  func (s *blockFanoutFetchState) decrementAndReturnPending() int {
  4376  	return int(atomic.AddInt32(&s.numPending, -1))
  4377  }
  4379  func (b blockMetadataReattempt) peerAttempts(p peer) int {
  4380  	r := 0
  4381  	for i := range b.attempted {
  4382  		if b.attempted[i] == p {
  4383  			r++
  4384  		}
  4385  	}
  4386  	return r
  4387  }
  4389  func newTimesByUnixNanos(values []int64) []time.Time {
  4390  	result := make([]time.Time, len(values))
  4391  	for i := range values {
  4392  		result[i] = time.Unix(0, values[i])
  4393  	}
  4394  	return result
  4395  }
  4397  func newTimesByRPCBlocks(values []*rpc.Block) []time.Time {
  4398  	result := make([]time.Time, len(values))
  4399  	for i := range values {
  4400  		result[i] = time.Unix(0, values[i].Start)
  4401  	}
  4402  	return result
  4403  }
// metadataIter iterates over received block metadata delivered on a channel.
// It is constructed by newMetadataIter, which returns it as a
// PeerBlockMetadataIter.
type metadataIter struct {
	// inputCh supplies the entries consumed by Next.
	inputCh <-chan receivedBlockMetadata
	// errCh is read once by Next after inputCh has been closed.
	errCh <-chan error
	// host and metadata hold the current entry.
	host     topology.Host
	metadata block.Metadata
	// tagDecoderPool and idPool are used to decode each entry's encoded tags.
	tagDecoderPool serialize.TagDecoderPool
	idPool         ident.Pool
	// done is set once inputCh has been exhausted.
	done bool
	// err is the error reported by Err.
	err error
}
  4416  func newMetadataIter(
  4417  	inputCh <-chan receivedBlockMetadata,
  4418  	errCh <-chan error,
  4419  	tagDecoderPool serialize.TagDecoderPool,
  4420  	idPool ident.Pool,
  4421  ) PeerBlockMetadataIter {
  4422  	return &metadataIter{
  4423  		inputCh:        inputCh,
  4424  		errCh:          errCh,
  4425  		tagDecoderPool: tagDecoderPool,
  4426  		idPool:         idPool,
  4427  	}
  4428  }
  4430  func (it *metadataIter) Next() bool {
  4431  	if it.done || it.err != nil {
  4432  		return false
  4433  	}
  4434  	m, more := <-it.inputCh
  4435  	if !more {
  4436  		it.err = <-it.errCh
  4437  		it.done = true
  4438  		return false
  4439  	}
  4440  	var tags ident.Tags
  4441  	tags, it.err = newTagsFromEncodedTags(, m.encodedTags,
  4442  		it.tagDecoderPool, it.idPool)
  4443  	if it.err != nil {
  4444  		return false
  4445  	}
  4446 = m.peer.Host()
  4447  	it.metadata = block.NewMetadata(, tags, m.block.start,
  4448  		m.block.size, m.block.checksum, m.block.lastRead)
  4449  	return true
  4450  }
  4452  func (it *metadataIter) Current() (topology.Host, block.Metadata) {
  4453  	return, it.metadata
  4454  }
// Err returns the error recorded by the iterator, or nil if none occurred.
func (it *metadataIter) Err() error {
	return it.err
}
// idAndBlockStart pairs a series ID with a block start.
type idAndBlockStart struct {
	// id is the series ID.
	id ident.ID
	// blockStart is the block start as an integer timestamp (presumably Unix
	// nanoseconds — confirm against callers).
	blockStart int64
}
  4465  func newTagsFromEncodedTags(
  4466  	seriesID ident.ID,
  4467  	encodedTags checked.Bytes,
  4468  	tagDecoderPool serialize.TagDecoderPool,
  4469  	idPool ident.Pool,
  4470  ) (ident.Tags, error) {
  4471  	if encodedTags == nil {
  4472  		return ident.Tags{}, nil
  4473  	}
  4475  	encodedTags.IncRef()
  4477  	tagDecoder := tagDecoderPool.Get()
  4478  	tagDecoder.Reset(encodedTags)
  4479  	defer tagDecoder.Close()
  4481  	tags, err := idxconvert.TagsFromTagsIter(seriesID, tagDecoder, idPool)
  4483  	encodedTags.DecRef()
  4485  	return tags, err
  4486  }
const (
	// histogramDurationBucketsVersion must be bumped whenever
	// histogramDurationBuckets changes. The version namespaces each bucket
	// layout so that different layouts never overlap in the same query, which
	// would cause the histogram function to error out.
	histogramDurationBucketsVersion = "v1"
	// histogramDurationBucketsVersionTag is the name of the tag that carries
	// the version of the buckets in use.
	histogramDurationBucketsVersionTag = "schema"
)
  4497  // histogramDurationBuckets is a high resolution set of duration buckets.
  4498  func histogramDurationBuckets() tally.DurationBuckets {
  4499  	return append(tally.DurationBuckets{0},
  4500  		tally.MustMakeExponentialDurationBuckets(time.Millisecond, 1.25, 60)...)
  4501  }
  4503  // histogramWithDurationBuckets returns a histogram with the standard duration buckets.
  4504  func histogramWithDurationBuckets(scope tally.Scope, name string) tally.Histogram {
  4505  	sub := scope.Tagged(map[string]string{
  4506  		histogramDurationBucketsVersionTag: histogramDurationBucketsVersion,
  4507  	})
  4508  	return sub.Histogram(name, histogramDurationBuckets())
  4509  }
  4511  func minDuration(x, y time.Duration) time.Duration {
  4512  	if x < y {
  4513  		return x
  4514  	}
  4515  	return y
  4516  }