github.com/m3db/m3@v1.5.0/src/dbnode/persist/fs/retriever.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  // The block retriever is used to stream blocks of data from disk. It controls
// the fetch concurrency on a per-namespace basis, i.e. if the server is using
// spinning disks the concurrency can be set to 1 to serialize all disk fetches
// for a given namespace, and the concurrency can be set higher in the case of SSDs.
    25  // This fetch concurrency is primarily implemented via the number of concurrent
    26  // fetchLoops that the retriever creates.
    27  //
    28  // The block retriever also handles batching of requests for data, as well as
    29  // re-arranging the order of requests to increase data locality when seeking
    30  // through and across files.
    31  
    32  package fs
    33  
    34  import (
    35  	stdctx "context"
    36  	"errors"
    37  	"sort"
    38  	"sync"
    39  	"sync/atomic"
    40  	"time"
    41  
    42  	"github.com/m3db/m3/src/dbnode/namespace"
    43  	"github.com/m3db/m3/src/dbnode/sharding"
    44  	"github.com/m3db/m3/src/dbnode/storage/block"
    45  	"github.com/m3db/m3/src/dbnode/storage/limits"
    46  	"github.com/m3db/m3/src/dbnode/ts"
    47  	"github.com/m3db/m3/src/dbnode/x/xio"
    48  	"github.com/m3db/m3/src/x/context"
    49  	"github.com/m3db/m3/src/x/ident"
    50  	"github.com/m3db/m3/src/x/pool"
    51  	xtime "github.com/m3db/m3/src/x/time"
    52  
    53  	"github.com/uber-go/tally"
    54  	"go.uber.org/zap"
    55  )
    56  
// Sentinel errors returned by the block retriever.
var (
	// errBlockRetrieverNotOpen is returned when an operation requires the
	// retriever to be in the open state and it is not.
	errBlockRetrieverNotOpen             = errors.New("block retriever is not open")
	// errBlockRetrieverAlreadyOpenOrClosed is returned by Open when the
	// retriever is no longer in the not-open state.
	errBlockRetrieverAlreadyOpenOrClosed = errors.New("block retriever already open or is closed")
	// errBlockRetrieverAlreadyClosed is returned by Close when the retriever
	// has already been closed.
	errBlockRetrieverAlreadyClosed       = errors.New("block retriever already closed")
	// errNoSeekerMgr is returned when the bloom filter is consulted before a
	// seeker manager has been assigned by Open.
	errNoSeekerMgr                       = errors.New("there is no open seeker manager")
)
    63  
const (
	// defaultRetrieveRequestQueueCapacity is the initial capacity allocated
	// for each per-shard queue of pending retrieve requests.
	defaultRetrieveRequestQueueCapacity = 4096
)
    67  
// blockRetrieverStatus enumerates the lifecycle states of a blockRetriever.
type blockRetrieverStatus int
    69  
// newSeekerMgrFn constructs the seeker manager used by a blockRetriever. The
// retriever stores one of these (NewSeekerManager by default) and invokes it
// from Open.
type newSeekerMgrFn func(
	bytesPool pool.CheckedBytesPool,
	opts Options,
	blockRetrieverOpts BlockRetrieverOptions,
) DataFileSetSeekerManager
    75  
const (
	// blockRetrieverNotOpen is the initial state set by NewBlockRetriever.
	blockRetrieverNotOpen blockRetrieverStatus = iota
	// blockRetrieverOpen is the state after a successful call to Open.
	blockRetrieverOpen
	// blockRetrieverClosed is the terminal state set by Close.
	blockRetrieverClosed
)
    81  
// blockRetriever queues retrieve requests per shard and services them from a
// set of fetchLoop goroutines that read data through a seeker manager. The
// embedded RWMutex guards the mutable state (status, reqsByShardIdx,
// seekerMgr and the values cached by Open).
type blockRetriever struct {
	sync.RWMutex

	// opts and fsOpts are the options supplied to NewBlockRetriever.
	opts                    BlockRetrieverOptions
	fsOpts                  Options
	// logger is taken from fsOpts' instrument options.
	logger                  *zap.Logger
	// queryLimits is consulted before each batch to see if any fetch limit
	// has been exceeded.
	queryLimits             limits.QueryLimits
	// bytesReadLimit is incremented by the size of each index entry read.
	bytesReadLimit          limits.LookbackLimit
	// seriesBloomFilterMisses counts lookups for which the seeker manager
	// reported the ID as absent.
	seriesBloomFilterMisses tally.Counter

	// newSeekerMgrFn creates the seeker manager during Open.
	newSeekerMgrFn newSeekerMgrFn

	// reqPool supplies the retrieveRequest instances used by Stream.
	reqPool    RetrieveRequestPool
	// bytesPool is handed to the seeker manager constructor and used to copy
	// retrieved data for the onRetrieve callback.
	bytesPool  pool.CheckedBytesPool
	// idPool clones series IDs that must outlive the caller's copy.
	idPool     ident.Pool
	// nsMetadata is set by Open and cleared by Close.
	nsMetadata namespace.Metadata

	// blockSize and nsCacheBlocksOnRetrieve are cached from the namespace
	// options in Open.
	blockSize               time.Duration
	nsCacheBlocksOnRetrieve bool

	// status is the current lifecycle state.
	status                     blockRetrieverStatus
	// reqsByShardIdx holds one request queue per shard, indexed by shard ID;
	// it is grown on demand by shardRequests.
	reqsByShardIdx             []*shardRetrieveRequests
	// seekerMgr is assigned by Open.
	seekerMgr                  DataFileSetSeekerManager
	// notifyFetch wakes an idle fetchLoop when a request has been queued.
	notifyFetch                chan struct{}
	// fetchLoopsShouldShutdownCh is closed by Close to wake idle fetchLoops.
	fetchLoopsShouldShutdownCh chan struct{}
	// fetchLoopsHaveShutdownCh receives one value from each fetchLoop as it
	// exits.
	fetchLoopsHaveShutdownCh   chan struct{}
}
   109  
   110  // NewBlockRetriever returns a new block retriever for TSDB file sets.
   111  func NewBlockRetriever(
   112  	opts BlockRetrieverOptions,
   113  	fsOpts Options,
   114  ) (DataBlockRetriever, error) {
   115  	if err := opts.Validate(); err != nil {
   116  		return nil, err
   117  	}
   118  
   119  	scope := fsOpts.InstrumentOptions().MetricsScope().SubScope("retriever")
   120  
   121  	return &blockRetriever{
   122  		opts:                    opts,
   123  		fsOpts:                  fsOpts,
   124  		logger:                  fsOpts.InstrumentOptions().Logger(),
   125  		queryLimits:             opts.QueryLimits(),
   126  		bytesReadLimit:          opts.QueryLimits().BytesReadLimit(),
   127  		seriesBloomFilterMisses: scope.Counter("series-bloom-filter-misses"),
   128  		newSeekerMgrFn:          NewSeekerManager,
   129  		reqPool:                 opts.RetrieveRequestPool(),
   130  		bytesPool:               opts.BytesPool(),
   131  		idPool:                  opts.IdentifierPool(),
   132  		status:                  blockRetrieverNotOpen,
   133  		notifyFetch:             make(chan struct{}, 1),
   134  		// We just close this channel when the fetchLoops should shutdown, so no
   135  		// buffering is required
   136  		fetchLoopsShouldShutdownCh: make(chan struct{}),
   137  		fetchLoopsHaveShutdownCh:   make(chan struct{}, opts.FetchConcurrency()),
   138  	}, nil
   139  }
   140  
   141  func (r *blockRetriever) Open(
   142  	ns namespace.Metadata,
   143  	shardSet sharding.ShardSet,
   144  ) error {
   145  	r.Lock()
   146  	defer r.Unlock()
   147  
   148  	if r.status != blockRetrieverNotOpen {
   149  		return errBlockRetrieverAlreadyOpenOrClosed
   150  	}
   151  
   152  	seekerMgr := r.newSeekerMgrFn(r.bytesPool, r.fsOpts, r.opts)
   153  	if err := seekerMgr.Open(ns, shardSet); err != nil {
   154  		return err
   155  	}
   156  
   157  	r.nsMetadata = ns
   158  	r.status = blockRetrieverOpen
   159  	r.seekerMgr = seekerMgr
   160  
   161  	// Cache blockSize result and namespace specific block caching option
   162  	r.blockSize = ns.Options().RetentionOptions().BlockSize()
   163  	r.nsCacheBlocksOnRetrieve = ns.Options().CacheBlocksOnRetrieve()
   164  
   165  	for i := 0; i < r.opts.FetchConcurrency(); i++ {
   166  		go r.fetchLoop(seekerMgr)
   167  	}
   168  	return nil
   169  }
   170  
   171  func (r *blockRetriever) CacheShardIndices(shards []uint32) error {
   172  	r.RLock()
   173  	if r.status != blockRetrieverOpen {
   174  		r.RUnlock()
   175  		return errBlockRetrieverNotOpen
   176  	}
   177  	seekerMgr := r.seekerMgr
   178  	r.RUnlock()
   179  
   180  	// Don't hold the RLock() for duration of CacheShardIndices because
   181  	// it can take a very long time and it could block the regular read
   182  	// path (which sometimes needs to acquire an exclusive lock). In practice
   183  	// this is fine, it just means that the Retriever could be closed while a
   184  	// call to CacheShardIndices is still outstanding.
   185  	return seekerMgr.CacheShardIndices(shards)
   186  }
   187  
   188  func (r *blockRetriever) AssignShardSet(shardSet sharding.ShardSet) {
   189  	// NB(bodu): Block retriever will always be open before calling this method.
   190  	// But have this check anyways to be safe.
   191  	r.RLock()
   192  	defer r.RUnlock()
   193  	if r.status != blockRetrieverOpen {
   194  		return
   195  	}
   196  	r.seekerMgr.AssignShardSet(shardSet)
   197  }
   198  
   199  func (r *blockRetriever) fetchLoop(seekerMgr DataFileSetSeekerManager) {
   200  	var (
   201  		seekerResources    = NewReusableSeekerResources(r.fsOpts)
   202  		retrieverResources = newReusableRetrieverResources()
   203  		inFlight           []*retrieveRequest
   204  		currBatchReqs      []*retrieveRequest
   205  	)
   206  	for {
   207  		// Free references to the inflight requests
   208  		for i := range inFlight {
   209  			inFlight[i] = nil
   210  		}
   211  		inFlight = inFlight[:0]
   212  
   213  		// Select in flight requests
   214  		r.RLock()
   215  		// Move requests from shard retriever reqs into in flight slice
   216  		for _, reqs := range r.reqsByShardIdx {
   217  			reqs.Lock()
   218  			if len(reqs.queued) > 0 {
   219  				inFlight = append(inFlight, reqs.queued...)
   220  				reqs.resetQueued()
   221  			}
   222  			reqs.Unlock()
   223  		}
   224  
   225  		status := r.status
   226  		n := len(inFlight)
   227  		r.RUnlock()
   228  
   229  		// Exit if not open and fulfilled all open requests
   230  		if n == 0 && status != blockRetrieverOpen {
   231  			break
   232  		}
   233  
   234  		// If no fetches then no work to do, yield
   235  		if n == 0 {
   236  			select {
   237  			case <-r.notifyFetch:
   238  				continue
   239  			case <-r.fetchLoopsShouldShutdownCh:
   240  				break
   241  			}
   242  		}
   243  
   244  		// Files are all by shard and block time, the locality of
   245  		// files is therefore foremost by block time as that is when they are
   246  		// all written. Note that this sort does NOT mean that we're going to stripe
   247  		// through different files at once as you might expect at first, but simply
   248  		// that since all the fileset files are written at the end of a block period
   249  		// those files are more likely to be physically located close to each other
   250  		// on disk. In other words, instead of accessing files like this:
   251  		// 		shard1T1 --> shard1T2 --> shard1T3 --> shard2T1 --> shard2T2 --> shard2T3
   252  		// its probably faster to access them like this:
   253  		// 		shard1T1 --> shard2T1 --> shard1T2 --> shard2T2 --> shard1T3 --> shard2T3
   254  		// so we re-arrange the order of the requests to achieve that
   255  		sort.Sort(retrieveRequestByStartAscShardAsc(inFlight))
   256  
   257  		// Iterate through all in flight requests and send them to the seeker in
   258  		// batches of block time + shard.
   259  		currBatchShard := uint32(0)
   260  		currBatchStart := xtime.UnixNano(0)
   261  		currBatchReqs = currBatchReqs[:0]
   262  		for _, req := range inFlight {
   263  			if !req.start.Equal(currBatchStart) ||
   264  				req.shard != currBatchShard {
   265  				// Fetch any outstanding in the current batch
   266  				if len(currBatchReqs) > 0 {
   267  					r.fetchBatch(seekerMgr, currBatchShard, currBatchStart,
   268  						currBatchReqs, seekerResources, retrieverResources)
   269  					for i := range currBatchReqs {
   270  						currBatchReqs[i] = nil
   271  					}
   272  					currBatchReqs = currBatchReqs[:0]
   273  				}
   274  
   275  				// Set the new batch attributes
   276  				currBatchShard = req.shard
   277  				currBatchStart = req.start
   278  			}
   279  
   280  			// Enqueue into the current batch
   281  			currBatchReqs = append(currBatchReqs, req)
   282  		}
   283  
   284  		// Fetch any finally outstanding in the current batch
   285  		if len(currBatchReqs) > 0 {
   286  			r.fetchBatch(seekerMgr, currBatchShard, currBatchStart,
   287  				currBatchReqs, seekerResources, retrieverResources)
   288  			for i := range currBatchReqs {
   289  				currBatchReqs[i] = nil
   290  			}
   291  			currBatchReqs = currBatchReqs[:0]
   292  		}
   293  	}
   294  
   295  	r.fetchLoopsHaveShutdownCh <- struct{}{}
   296  }
   297  
// fetchBatch services a batch of requests that all target the same shard and
// block start.
//
// It borrows a seeker for (shard, blockStart) from seekerMgr, then works in
// two passes over a private copy of the requests:
//  1. Look up each request's index entry, skipping requests whose context is
//     already done and charging each entry's size to the bytes-read limit.
//  2. After sorting the requests by index entry offset, read each entry's
//     data, hand the resulting segment to the request and, when block caching
//     is enabled and the request has an onRetrieve callback, invoke that
//     callback on a separate goroutine with a copy of the data.
//
// Failures are recorded on the individual requests (req.err) rather than
// returned. A deferred function always completes every request via onDone
// (waiting first for any outstanding callback goroutines), resets
// retrieverResources and returns the borrowed seeker.
func (r *blockRetriever) fetchBatch(
	seekerMgr DataFileSetSeekerManager,
	shard uint32,
	blockStart xtime.UnixNano,
	allReqs []*retrieveRequest,
	seekerResources ReusableSeekerResources,
	retrieverResources *reusableRetrieverResources,
) {
	var (
		seeker     ConcurrentDataFileSetSeeker
		callbackWg sync.WaitGroup
	)

	defer func() {
		// Filter in place: requests not waiting for a callback are completed
		// immediately, the rest are completed once all callbacks have returned.
		filteredReqs := allReqs[:0]
		// Make sure requests are always fulfilled so if there's a code bug
		// then errSeekNotCompleted is returned because req.success is not set
		// rather than we have dangling goroutines stacking up.
		for _, req := range allReqs {
			if !req.waitingForCallback {
				req.onDone()
				continue
			}

			filteredReqs = append(filteredReqs, req)
		}

		callbackWg.Wait()
		for _, req := range filteredReqs {
			req.onDone()
		}

		// Reset resources to free any pointers in the slices still pointing
		// to requests that are now completed and returned to pools.
		retrieverResources.resetAll()

		if seeker == nil {
			// No borrowed seeker to return.
			return
		}

		// Return borrowed seeker.
		err := seekerMgr.Return(shard, blockStart, seeker)
		if err != nil {
			r.logger.Error("err returning seeker for shard",
				zap.Uint32("shard", shard),
				zap.Int64("blockStart", blockStart.Seconds()),
				zap.Error(err),
			)
		}
	}()

	var err error
	seeker, err = seekerMgr.Borrow(shard, blockStart)
	if err != nil {
		// Without a seeker nothing can be read: fail the whole batch.
		for _, req := range allReqs {
			req.err = err
		}
		return
	}

	// Work on a copy of the request slice so that the sort below does not
	// reorder the caller's slice.
	retrieverResources.resetDataReqs()
	retrieverResources.dataReqs = append(retrieverResources.dataReqs, allReqs...)
	reqs := retrieverResources.dataReqs

	var limitErr error
	if err := r.queryLimits.AnyFetchExceeded(); err != nil {
		// A fetch limit has already been exceeded: fail the whole batch.
		for _, req := range reqs {
			req.err = err
		}
		return
	}

	// First pass: resolve the index entry for every request.
	for _, req := range reqs {
		if limitErr != nil {
			// Once the bytes-read limit has been hit, every remaining request
			// in the batch fails with the same error.
			req.err = limitErr
			continue
		}

		select {
		case <-req.stdCtx.Done():
			req.err = req.stdCtx.Err()
			continue
		default:
		}

		entry, err := seeker.SeekIndexEntry(req.id, seekerResources)
		if err != nil && !errors.Is(err, errSeekIDNotFound) {
			req.err = err
			continue
		}

		if err := r.bytesReadLimit.Inc(int(entry.Size), req.source); err != nil {
			req.err = err
			limitErr = err
			continue
		}

		if errors.Is(err, errSeekIDNotFound) {
			req.notFound = true
		}

		req.indexEntry = entry
	}

	// Order the data reads by their offset in the data file.
	sort.Sort(retrieveRequestByIndexEntryOffsetAsc(reqs))
	tagDecoderPool := r.fsOpts.TagDecoderPool()

	// Caching must be enabled both on the retriever options and on the
	// namespace.
	blockCachingEnabled := r.opts.CacheBlocksOnRetrieve() && r.nsCacheBlocksOnRetrieve

	// Seek and execute all requests
	for _, req := range reqs {
		if req.err != nil {
			// Skip requests with error, will already get appropriate callback.
			continue
		}

		if req.notFound {
			// Only try to seek the ID if it exists and there haven't been any errors so
			// far, otherwise we'll get a checksum mismatch error because the default
			// offset value for indexEntry is zero.
			req.success = true
			req.onCallerOrRetrieverDone()
			continue
		}

		select {
		case <-req.stdCtx.Done():
			// NOTE(review): on this path, and when SeekByIndexEntry fails below,
			// the index entry's EncodedTags are not released here — confirm
			// whether that is intended.
			req.err = req.stdCtx.Err()
			continue
		default:
		}

		data, err := seeker.SeekByIndexEntry(req.indexEntry, seekerResources)
		if err != nil {
			// If not found error is returned here, that's still an error since
			// it's expected to be found if it was found in the index file.
			req.err = err
			continue
		}

		var (
			seg, onRetrieveSeg ts.Segment
			checksum           = req.indexEntry.DataChecksum
		)
		seg = ts.NewSegment(data, nil, checksum, ts.FinalizeHead)

		// The onRetrieve callback is only invoked when block caching is enabled
		// and the request actually carries a callback.
		callOnRetrieve := blockCachingEnabled && req.onRetrieve != nil
		if callOnRetrieve {
			// NB(r): Need to also trigger callback with a copy of the data.
			// This is used by the database to cache the in memory data for
			// consequent fetches.
			dataCopy := r.bytesPool.Get(data.Len())
			onRetrieveSeg = ts.NewSegment(dataCopy, nil, checksum, ts.FinalizeHead)
			dataCopy.AppendAll(data.Bytes())

			if tags := req.indexEntry.EncodedTags; tags != nil && tags.Len() > 0 {
				decoder := tagDecoderPool.Get()
				// DecRef because we're transferring ownership from the index entry to
				// the tagDecoder which will IncRef().
				tags.DecRef()
				decoder.Reset(tags)
				req.tags = decoder
			}
		} else {
			// If we didn't transfer ownership of the tags to the decoder above, then we
			// no longer need them and we can finalize them.
			if tags := req.indexEntry.EncodedTags; tags != nil {
				tags.DecRef()
				tags.Finalize()
			}
		}

		// Complete request.
		req.onRetrieved(seg, req.nsCtx)
		req.success = true

		if !callOnRetrieve {
			// No need to call the onRetrieve callback, but do need to call
			// onCallerOrRetrieverDone since data requests do not get finalized
			// when req.onDone is called since sometimes they need deferred
			// finalization (when callOnRetrieve is true).
			req.onCallerOrRetrieverDone()
			continue
		}

		// Run the callback asynchronously; the deferred function above waits on
		// callbackWg before completing requests flagged as waitingForCallback.
		callbackWg.Add(1)
		req.waitingForCallback = true
		go func(r *retrieveRequest) {
			// Call the onRetrieve callback and finalize.
			r.onRetrieve.OnRetrieveBlock(r.id, r.tags, r.start, onRetrieveSeg, r.nsCtx)
			r.onCallerOrRetrieverDone()
			callbackWg.Done()
		}(req)
	}
}
   495  
   496  func (r *blockRetriever) seriesPresentInBloomFilter(
   497  	id ident.ID,
   498  	shard uint32,
   499  	startTime xtime.UnixNano,
   500  ) (bool, error) {
   501  	// Capture variable and RLock() because this slice can be modified in the
   502  	// Open() method
   503  	r.RLock()
   504  	seekerMgr := r.seekerMgr
   505  	r.RUnlock()
   506  
   507  	// This should never happen unless caller tries to use Stream() before Open()
   508  	if seekerMgr == nil {
   509  		return false, errNoSeekerMgr
   510  	}
   511  
   512  	idExists, err := seekerMgr.Test(id, shard, startTime)
   513  	if err != nil {
   514  		return false, err
   515  	}
   516  
   517  	if !idExists {
   518  		r.seriesBloomFilterMisses.Inc(1)
   519  	}
   520  
   521  	return idExists, nil
   522  }
   523  
   524  // streamRequest returns a bool indicating if the ID was found, and any errors.
   525  func (r *blockRetriever) streamRequest(
   526  	ctx context.Context,
   527  	req *retrieveRequest,
   528  	shard uint32,
   529  	id ident.ID,
   530  	startTime xtime.UnixNano,
   531  ) error {
   532  	req.resultWg.Add(1)
   533  	req.shard = shard
   534  
   535  	// NB(r): If the ID is a ident.BytesID then we can just hold
   536  	// onto this ID.
   537  	seriesID := id
   538  	if !seriesID.IsNoFinalize() {
   539  		// NB(r): Clone the ID as we're not positive it will stay valid throughout
   540  		// the lifecycle of the async request.
   541  		seriesID = r.idPool.Clone(id)
   542  	}
   543  
   544  	req.id = seriesID
   545  	req.start = startTime
   546  	req.blockSize = r.blockSize
   547  
   548  	// Ensure to finalize at the end of request
   549  	ctx.RegisterFinalizer(req)
   550  
   551  	reqs, err := r.shardRequests(shard)
   552  	if err != nil {
   553  		return err
   554  	}
   555  
   556  	reqs.Lock()
   557  	reqs.queued = append(reqs.queued, req)
   558  	reqs.Unlock()
   559  
   560  	// Notify fetch loop
   561  	select {
   562  	case r.notifyFetch <- struct{}{}:
   563  	default:
   564  		// Loop busy, already ready to consume notification
   565  	}
   566  
   567  	// The request may not have completed yet, but it has an internal
   568  	// waitgroup which the caller will have to wait for before retrieving
   569  	// the data. This means that even though we're returning nil for error
   570  	// here, the caller may still encounter an error when they attempt to
   571  	// read the data.
   572  	return nil
   573  }
   574  
   575  func (r *blockRetriever) Stream(
   576  	ctx context.Context,
   577  	shard uint32,
   578  	id ident.ID,
   579  	startTime xtime.UnixNano,
   580  	onRetrieve block.OnRetrieveBlock,
   581  	nsCtx namespace.Context,
   582  ) (xio.BlockReader, error) {
   583  	found, err := r.seriesPresentInBloomFilter(id, shard, startTime)
   584  	if err != nil {
   585  		return xio.EmptyBlockReader, err
   586  	}
   587  	// If the ID is not in the seeker's bloom filter, then it's definitely not on
   588  	// disk and we can return immediately.
   589  	if !found {
   590  		return xio.EmptyBlockReader, nil
   591  	}
   592  
   593  	req := r.reqPool.Get()
   594  	// only save the go ctx to ensure we don't accidentally use the m3 ctx after it's been closed by the caller.
   595  	req.stdCtx = ctx.GoContext()
   596  	req.onRetrieve = onRetrieve
   597  
   598  	if source, ok := req.stdCtx.Value(limits.SourceContextKey).([]byte); ok {
   599  		req.source = source
   600  	}
   601  
   602  	err = r.streamRequest(ctx, req, shard, id, startTime)
   603  	if err != nil {
   604  		req.resultWg.Done()
   605  		return xio.EmptyBlockReader, err
   606  	}
   607  
   608  	// The request may not have completed yet, but it has an internal
   609  	// waitgroup which the caller will have to wait for before retrieving
   610  	// the data. This means that even though we're returning nil for error
   611  	// here, the caller may still encounter an error when they attempt to
   612  	// read the data.
   613  	return req.toBlock(), nil
   614  }
   615  
   616  func (r *blockRetriever) shardRequests(
   617  	shard uint32,
   618  ) (*shardRetrieveRequests, error) {
   619  	r.RLock()
   620  	if r.status != blockRetrieverOpen {
   621  		r.RUnlock()
   622  		return nil, errBlockRetrieverNotOpen
   623  	}
   624  	if int(shard) < len(r.reqsByShardIdx) {
   625  		reqs := r.reqsByShardIdx[shard]
   626  		r.RUnlock()
   627  		return reqs, nil
   628  	}
   629  	r.RUnlock()
   630  
   631  	r.Lock()
   632  	defer r.Unlock()
   633  
   634  	// Check if raced with another call to this method
   635  	if r.status != blockRetrieverOpen {
   636  		return nil, errBlockRetrieverNotOpen
   637  	}
   638  	if int(shard) < len(r.reqsByShardIdx) {
   639  		reqs := r.reqsByShardIdx[shard]
   640  		return reqs, nil
   641  	}
   642  
   643  	reqsByShardIdx := make([]*shardRetrieveRequests, shard+1)
   644  
   645  	for i := range reqsByShardIdx {
   646  		if i < len(r.reqsByShardIdx) {
   647  			reqsByShardIdx[i] = r.reqsByShardIdx[i]
   648  			continue
   649  		}
   650  		capacity := defaultRetrieveRequestQueueCapacity
   651  		reqsByShardIdx[i] = &shardRetrieveRequests{
   652  			shard:  uint32(i),
   653  			queued: make([]*retrieveRequest, 0, capacity),
   654  		}
   655  	}
   656  
   657  	r.reqsByShardIdx = reqsByShardIdx
   658  	reqs := r.reqsByShardIdx[shard]
   659  
   660  	return reqs, nil
   661  }
   662  
   663  func (r *blockRetriever) Close() error {
   664  	r.Lock()
   665  	if r.status == blockRetrieverClosed {
   666  		r.Unlock()
   667  		return errBlockRetrieverAlreadyClosed
   668  	}
   669  	r.nsMetadata = nil
   670  	r.status = blockRetrieverClosed
   671  
   672  	r.blockSize = 0
   673  	r.Unlock()
   674  
   675  	close(r.fetchLoopsShouldShutdownCh)
   676  	for i := 0; i < r.opts.FetchConcurrency(); i++ {
   677  		<-r.fetchLoopsHaveShutdownCh
   678  	}
   679  
   680  	return r.seekerMgr.Close()
   681  }
   682  
// shardRetrieveRequests is the queue of pending retrieve requests for a
// single shard. The embedded Mutex guards queued.
type shardRetrieveRequests struct {
	sync.Mutex
	// shard is the ID of the shard this queue belongs to.
	shard  uint32
	// queued holds requests waiting to be picked up by a fetch loop.
	queued []*retrieveRequest
}
   688  
   689  func (reqs *shardRetrieveRequests) resetQueued() {
   690  	// Free references to the queued requests
   691  	for i := range reqs.queued {
   692  		reqs.queued[i] = nil
   693  	}
   694  	reqs.queued = reqs.queued[:0]
   695  }
   696  
// retrieveRequest is a single pending or completed block read. It is handed
// back to callers as the segment reader of a block reader, and its read
// methods wait on resultWg until the retriever has completed the request.
//
// Don't forget to update the resetForReuse method when adding a new field
type retrieveRequest struct {
	// finalized is set by Finalize so that repeated calls are no-ops.
	finalized          bool
	// waitingForCallback is set by fetchBatch when an onRetrieve callback
	// goroutine has been started for this request.
	waitingForCallback bool
	// resultWg is incremented when the request is queued and released by
	// onDone once the request has been completed.
	resultWg           sync.WaitGroup

	// pool is the pool this request is returned to once fully finalized.
	pool *reqPool

	// id is the series ID to read and start the block start to read it from.
	id         ident.ID
	// tags is a decoder over the index entry's encoded tags, set only when the
	// onRetrieve callback is going to be invoked.
	tags       ident.TagIterator
	start      xtime.UnixNano
	blockSize  time.Duration
	// onRetrieve is the optional callback supplied to Stream.
	onRetrieve block.OnRetrieveBlock
	nsCtx      namespace.Context
	// source is taken from the request context and passed to the bytes-read
	// limit.
	source     []byte
	// stdCtx is the Go context of the caller, checked for cancellation before
	// each seek.
	stdCtx     stdctx.Context

	// indexEntry is the index entry found for id during fetchBatch.
	indexEntry IndexEntry
	// reader is the underlying segment reader that serves the retrieved data.
	reader     xio.SegmentReader

	// err is the error, if any, that the request completed with.
	err error

	// Finalize requires two calls to finalize (once both the user of the
	// request and the retriever fetch loop is done, and only then, can
	// we free this request) so we track this with an atomic here.
	finalizes uint32
	// shard is the ID of the shard the request was queued for.
	shard     uint32

	// notFound is set when the ID was not found in the index.
	notFound bool
	// success is set once the request has been explicitly completed without
	// error.
	success  bool
}
   728  
   729  func (req *retrieveRequest) toBlock() xio.BlockReader {
   730  	return xio.BlockReader{
   731  		SegmentReader: req,
   732  		Start:         req.start,
   733  		BlockSize:     req.blockSize,
   734  	}
   735  }
   736  
   737  func (req *retrieveRequest) onRetrieved(segment ts.Segment, nsCtx namespace.Context) {
   738  	req.nsCtx = nsCtx
   739  	req.Reset(segment)
   740  }
   741  
   742  func (req *retrieveRequest) onDone() {
   743  	var (
   744  		err     = req.err
   745  		success = req.success
   746  	)
   747  
   748  	if err == nil && !success {
   749  		// Require explicit success, otherwise this request
   750  		// was never completed.
   751  		// This helps catch code bugs where this element wasn't explicitly
   752  		// handled as completed during a fetch batch call instead of
   753  		// returning but with no actual result set properly.
   754  		req.err = errSeekNotCompleted
   755  	}
   756  
   757  	req.resultWg.Done()
   758  
   759  	// Do not call onCallerOrRetrieverDone since the OnRetrieveCallback
   760  	// code path will call req.onCallerOrRetrieverDone() when it's done.
   761  	// If encountered an error though, should call it since not waiting for
   762  	// callback to finish or even if not waiting for callback to finish
   763  	// the happy path that calls this pre-emptively has not executed either.
   764  	// That is if-and-only-if request is data request and is successful and
   765  	// will req.onCallerOrRetrieverDone() be called in a deferred manner.
   766  	if !success {
   767  		req.onCallerOrRetrieverDone()
   768  	}
   769  }
   770  
// Reset resets the request's underlying segment reader to read from segment.
func (req *retrieveRequest) Reset(segment ts.Segment) {
	req.reader.Reset(segment)
}
   774  
   775  func (req *retrieveRequest) ResetWindowed(
   776  	segment ts.Segment,
   777  	start xtime.UnixNano,
   778  	blockSize time.Duration,
   779  ) {
   780  	req.start = start
   781  	req.blockSize = blockSize
   782  	req.Reset(segment)
   783  }
   784  
   785  func (req *retrieveRequest) onCallerOrRetrieverDone() {
   786  	if atomic.AddUint32(&req.finalizes, 1) != 2 {
   787  		return
   788  	}
   789  
   790  	if req.id != nil {
   791  		req.id.Finalize()
   792  		req.id = nil
   793  	}
   794  	if req.tags != nil {
   795  		req.tags.Close()
   796  		req.tags = ident.EmptyTagIterator
   797  	}
   798  	if req.reader != nil {
   799  		req.reader.Finalize()
   800  		req.reader = nil
   801  	}
   802  
   803  	req.pool.Put(req)
   804  }
   805  
// SegmentReader returns the request's underlying segment reader. Unlike the
// read methods it does not wait for the request to complete, and it always
// returns a nil error.
func (req *retrieveRequest) SegmentReader() (xio.SegmentReader, error) {
	return req.reader, nil
}
   809  
   810  // NB: be aware to avoid calling Clone() in a hot path, since it copies the
   811  // underlying bytes.
   812  func (req *retrieveRequest) Clone(
   813  	pool pool.CheckedBytesPool,
   814  ) (xio.SegmentReader, error) {
   815  	req.resultWg.Wait() // wait until result is ready
   816  	if req.err != nil {
   817  		return nil, req.err
   818  	}
   819  	return req.reader.Clone(pool)
   820  }
   821  
// BlockSize returns the block size recorded on the request.
func (req *retrieveRequest) BlockSize() time.Duration {
	return req.blockSize
}
   825  
   826  func (req *retrieveRequest) Read64() (word uint64, n byte, err error) {
   827  	req.resultWg.Wait()
   828  	if req.err != nil {
   829  		return 0, 0, req.err
   830  	}
   831  	return req.reader.Read64()
   832  }
   833  
   834  func (req *retrieveRequest) Peek64() (word uint64, n byte, err error) {
   835  	req.resultWg.Wait()
   836  	if req.err != nil {
   837  		return 0, 0, req.err
   838  	}
   839  	return req.reader.Peek64()
   840  }
   841  
   842  func (req *retrieveRequest) Segment() (ts.Segment, error) {
   843  	req.resultWg.Wait()
   844  	if req.err != nil {
   845  		return ts.Segment{}, req.err
   846  	}
   847  	return req.reader.Segment()
   848  }
   849  
   850  func (req *retrieveRequest) Finalize() {
   851  	// May not actually finalize the request, depending on if
   852  	// retriever is done too
   853  	if req.finalized {
   854  		return
   855  	}
   856  
   857  	req.resultWg.Wait()
   858  	req.finalized = true
   859  	req.onCallerOrRetrieverDone()
   860  }
   861  
   862  func (req *retrieveRequest) resetForReuse() {
   863  	req.resultWg = sync.WaitGroup{}
   864  	req.finalized = false
   865  	req.finalizes = 0
   866  	req.source = nil
   867  	req.shard = 0
   868  	req.id = nil
   869  	req.tags = ident.EmptyTagIterator
   870  	req.start = 0
   871  	req.blockSize = 0
   872  	req.onRetrieve = nil
   873  	req.indexEntry = IndexEntry{}
   874  	req.reader = nil
   875  	req.err = nil
   876  	req.notFound = false
   877  	req.success = false
   878  	req.stdCtx = nil
   879  }
   880  
   881  type retrieveRequestByStartAscShardAsc []*retrieveRequest
   882  
   883  func (r retrieveRequestByStartAscShardAsc) Len() int      { return len(r) }
   884  func (r retrieveRequestByStartAscShardAsc) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
   885  func (r retrieveRequestByStartAscShardAsc) Less(i, j int) bool {
   886  	if !r[i].start.Equal(r[j].start) {
   887  		return r[i].start.Before(r[j].start)
   888  	}
   889  	return r[i].shard < r[j].shard
   890  }
   891  
   892  type retrieveRequestByIndexEntryOffsetAsc []*retrieveRequest
   893  
   894  func (r retrieveRequestByIndexEntryOffsetAsc) Len() int      { return len(r) }
   895  func (r retrieveRequestByIndexEntryOffsetAsc) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
   896  func (r retrieveRequestByIndexEntryOffsetAsc) Less(i, j int) bool {
   897  	return r[i].indexEntry.Offset < r[j].indexEntry.Offset
   898  }
   899  
// RetrieveRequestPool is a pool of reusable retrieve requests.
type RetrieveRequestPool interface {
	// Init initializes the request pool; it must be called before the pool is
	// used.
	Init()
	// Get returns a retrieve request that has been reset for reuse.
	Get() *retrieveRequest
	// Put resets the retrieve request and returns it to the pool.
	Put(req *retrieveRequest)
}
   909  
// reqPool is the RetrieveRequestPool implementation returned by
// NewRetrieveRequestPool.
type reqPool struct {
	// segmentReaderPool supplies the reader attached to each request on Get.
	segmentReaderPool xio.SegmentReaderPool
	// pool is the underlying object pool holding the requests.
	pool              pool.ObjectPool
}
   914  
   915  // NewRetrieveRequestPool returns a new retrieve request pool.
   916  func NewRetrieveRequestPool(
   917  	segmentReaderPool xio.SegmentReaderPool,
   918  	opts pool.ObjectPoolOptions,
   919  ) RetrieveRequestPool {
   920  	return &reqPool{
   921  		segmentReaderPool: segmentReaderPool,
   922  		pool:              pool.NewObjectPool(opts),
   923  	}
   924  }
   925  
   926  func (p *reqPool) Init() {
   927  	p.pool.Init(func() interface{} {
   928  		return &retrieveRequest{pool: p}
   929  	})
   930  }
   931  
   932  func (p *reqPool) Get() *retrieveRequest {
   933  	req := p.pool.Get().(*retrieveRequest)
   934  	req.resetForReuse()
   935  	req.reader = p.segmentReaderPool.Get()
   936  	return req
   937  }
   938  
// Put resets the request and returns it to the underlying pool.
func (p *reqPool) Put(req *retrieveRequest) {
	// Also call reset for reuse to nil any references before
	// putting back in pool to avoid holding strong refs to any
	// shortly lived objects while still in the pool
	req.resetForReuse()
	p.pool.Put(req)
}
   946  
// reusableRetrieverResources holds scratch storage that a fetch loop reuses
// across calls to fetchBatch.
type reusableRetrieverResources struct {
	// dataReqs is the working copy of a batch's requests.
	dataReqs []*retrieveRequest
}
   950  
   951  func newReusableRetrieverResources() *reusableRetrieverResources {
   952  	return &reusableRetrieverResources{}
   953  }
   954  
// resetAll resets every reusable resource; currently that is only the data
// request slice.
func (r *reusableRetrieverResources) resetAll() {
	r.resetDataReqs()
}
   958  
   959  func (r *reusableRetrieverResources) resetDataReqs() {
   960  	for i := range r.dataReqs {
   961  		r.dataReqs[i] = nil
   962  	}
   963  	r.dataReqs = r.dataReqs[:0]
   964  }