github.com/m3db/m3@v1.5.0/src/query/storage/m3/storage.go

// Copyright (c) 2018 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package m3

import (
	"bytes"
	"context"
	goerrors "errors"
	"fmt"
	"sync"
	"time"

	"github.com/opentracing/opentracing-go/log"
	"github.com/prometheus/common/model"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	coordmodel "github.com/m3db/m3/src/cmd/services/m3coordinator/model"
	"github.com/m3db/m3/src/dbnode/client"
	"github.com/m3db/m3/src/dbnode/storage/index"
	"github.com/m3db/m3/src/query/block"
	"github.com/m3db/m3/src/query/errors"
	"github.com/m3db/m3/src/query/generated/proto/prompb"
	"github.com/m3db/m3/src/query/models"
	"github.com/m3db/m3/src/query/storage"
	"github.com/m3db/m3/src/query/storage/m3/consolidators"
	"github.com/m3db/m3/src/query/storage/m3/storagemetadata"
	"github.com/m3db/m3/src/query/tracepoint"
	"github.com/m3db/m3/src/query/ts"
	xcontext "github.com/m3db/m3/src/x/context"
	xerrors "github.com/m3db/m3/src/x/errors"
	"github.com/m3db/m3/src/x/ident"
	"github.com/m3db/m3/src/x/instrument"
	xtime "github.com/m3db/m3/src/x/time"
)

const (
	minWriteWaitTimeout = time.Second
)

var (
	// The default name for the name tag in Prometheus metrics.
	promDefaultName = []byte(model.MetricNameLabel)
	// The prefix for reserved labels, e.g. __name__
	reservedLabelPrefix = []byte(model.ReservedLabelPrefix)
	// The name for the rollup tag defined by the coordinator model.
	rollupTagName = []byte(coordmodel.RollupTagName)
	// The value for the rollup tag defined by the coordinator model.
	rollupTagValue = []byte(coordmodel.RollupTagValue)

	errUnaggregatedAndAggregatedDisabled = goerrors.New("fetch options has both" +
		" aggregated and unaggregated namespace lookup disabled")
	errNoNamespacesConfigured             = goerrors.New("no namespaces configured")
	errUnaggregatedNamespaceUninitialized = goerrors.New(
		"unaggregated namespace is not yet initialized")
)

type m3storage struct {
	clusters Clusters
	opts     Options
	nowFn    func() time.Time
	logger   *zap.Logger
}

// NewStorage creates a new local m3storage instance.
func NewStorage(
	clusters Clusters,
	opts Options,
	instrumentOpts instrument.Options,
) (Storage, error) {
	if err := opts.Validate(); err != nil {
		return nil, err
	}

	return &m3storage{
		clusters: clusters,
		opts:     opts,
		nowFn:    time.Now,
		logger:   instrumentOpts.Logger(),
	}, nil
}
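
// Usage sketch (illustrative only, not part of the original file): wiring a
// local storage from an already-constructed Clusters and Options. The variable
// names and the instrument.NewOptions() default are assumptions.
//
//	store, err := NewStorage(clusters, opts, instrument.NewOptions())
//	if err != nil {
//		// Validate() rejected the options.
//	}
//	defer store.Close()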

func (s *m3storage) QueryStorageMetadataAttributes(
	_ context.Context,
	queryStart, queryEnd time.Time,
	opts *storage.FetchOptions,
) ([]storagemetadata.Attributes, error) {
	now := xtime.ToUnixNano(s.nowFn())
	_, namespaces, err := resolveClusterNamespacesForQuery(now,
		xtime.ToUnixNano(queryStart),
		xtime.ToUnixNano(queryEnd),
		s.clusters,
		opts.FanoutOptions,
		opts.RestrictQueryOptions,
		opts.RelatedQueryOptions)
	if err != nil {
		return nil, err
	}

	results := make([]storagemetadata.Attributes, 0, len(namespaces))
	for _, ns := range namespaces {
		results = append(results, ns.Options().Attributes())
	}
	return results, nil
}

func (s *m3storage) ErrorBehavior() storage.ErrorBehavior {
	return storage.BehaviorFail
}

func (s *m3storage) Name() string {
	return "local_store"
}

// findReservedLabel finds a reserved label target (one that begins with the
// reservedLabelPrefix) in a slice of sorted labels.
func findReservedLabel(labels []prompb.Label, target []byte) []byte {
	// The target should always contain the reservedLabelPrefix.
	// If it doesn't, then we won't be able to find it within
	// the reserved labels by definition.
	if !bytes.HasPrefix(target, reservedLabelPrefix) {
		return nil
	}

	foundReservedLabels := false
	for idx := 0; idx < len(labels); idx++ {
		label := labels[idx]
		if !bytes.HasPrefix(label.Name, reservedLabelPrefix) {
			if foundReservedLabels {
				// We previously found reserved labels, and now that we've iterated
				// past the end of the section that contains them, we know the target
				// doesn't exist.
				return nil
			}
			// We haven't found reserved labels yet, so keep going.
			continue
		}

		// At this point we know that the current label contains the reservedLabelPrefix.
		foundReservedLabels = true
		if bytes.Equal(label.Name, target) {
			return label.Value
		}
	}

	return nil
}
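
// Illustrative sketch (not part of the original file): findReservedLabel
// relies on the labels being sorted, so all reserved ("__"-prefixed) labels
// form one contiguous run that the scan can bail out of early. The label
// values below are hypothetical.
//
//	labels := []prompb.Label{
//		{Name: []byte("__name__"), Value: []byte("http_requests_total")},
//		{Name: []byte("__rollup__"), Value: []byte("true")},
//		{Name: []byte("job"), Value: []byte("api")},
//	}
//	findReservedLabel(labels, promDefaultName) // []byte("http_requests_total")
//	findReservedLabel(labels, []byte("job"))   // nil: target lacks the prefix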

// calculateMetadataByName tallies aggregated and unaggregated series counts
// per metric name on the result metadata.
func calculateMetadataByName(result *prompb.QueryResult, metadata *block.ResultMetadata) {
	for _, series := range result.Timeseries {
		if series == nil {
			continue
		}

		name := findReservedLabel(series.Labels, promDefaultName)
		rollup := findReservedLabel(series.Labels, rollupTagName)
		if bytes.Equal(rollup, rollupTagValue) {
			metadata.ByName(name).Aggregated++
		} else {
			metadata.ByName(name).Unaggregated++
		}
	}
}
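
// Sketch of the tally above (hypothetical series; the rollup tag name and
// value come from the coordinator model): a series carrying the rollup tag
// counts as aggregated for its metric name, anything else as unaggregated.
//
//	// {__name__="cpu", __rollup__="true"} -> metadata.ByName(name).Aggregated++
//	// {__name__="cpu", job="api"}         -> metadata.ByName(name).Unaggregated++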

func (s *m3storage) FetchProm(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
) (storage.PromResult, error) {
	queryOptions, err := storage.FetchOptionsToM3Options(options, query)
	if err != nil {
		return storage.PromResult{}, err
	}

	accumulator, _, err := s.fetchCompressed(ctx, query, options, queryOptions)
	if err != nil {
		return storage.PromResult{}, err
	}

	defer accumulator.Close()
	result, attrs, err := accumulator.FinalResultWithAttrs()
	if err != nil {
		return storage.PromResult{}, err
	}

	resolutions := make([]time.Duration, 0, len(attrs))
	for _, attr := range attrs {
		resolutions = append(resolutions, attr.Resolution)
	}

	result.Metadata.Resolutions = resolutions
	fetchResult, err := storage.SeriesIteratorsToPromResult(
		ctx,
		result,
		s.opts.ReadWorkerPool(),
		s.opts.TagOptions(),
		s.opts.PromConvertOptions(),
		options,
	)
	if err != nil {
		return storage.PromResult{}, err
	}

	if options != nil && options.MaxMetricMetadataStats > 0 {
		calculateMetadataByName(fetchResult.PromResult, &fetchResult.Metadata)
	}

	return fetchResult, nil
}

// FetchResultToBlockResult converts an encoded SeriesIterator fetch result
// into blocks.
func FetchResultToBlockResult(
	result consolidators.SeriesFetchResult,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
	opts Options,
) (block.Result, error) {
	// If using multiblock, update options to reflect this.
	if options.BlockType == models.TypeMultiBlock {
		opts = opts.SetSplitSeriesByBlock(true)
	}

	start := query.Start
	bounds := models.Bounds{
		Start:    xtime.ToUnixNano(start),
		Duration: query.End.Sub(start),
		StepSize: query.Interval,
	}

	blocks, err := ConvertM3DBSeriesIterators(
		result,
		bounds,
		opts,
	)
	if err != nil {
		return block.Result{
			Metadata: block.NewResultMetadata(),
		}, err
	}

	return block.Result{
		Blocks:   blocks,
		Metadata: result.Metadata,
	}, nil
}
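
// Worked sketch of the bounds computation above (assumed values, not from the
// original file): a query from 12:00 to 12:10 with a 30s interval yields
//
//	models.Bounds{
//		Start:    xtime.ToUnixNano(start), // 12:00
//		Duration: 10 * time.Minute,        // query.End.Sub(start)
//		StepSize: 30 * time.Second,        // query.Interval
//	}
//
// i.e. 20 steps across the block.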

func (s *m3storage) FetchBlocks(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
) (block.Result, error) {
	// Override the lookback duration with the value specified on the query
	// options, if any.
	opts := s.opts.SetLookbackDuration(
		options.LookbackDurationOrDefault(s.opts.LookbackDuration()))

	result, _, err := s.FetchCompressedResult(ctx, query, options)
	if err != nil {
		return block.Result{
			Metadata: block.NewResultMetadata(),
		}, err
	}

	return FetchResultToBlockResult(result, query, options, opts)
}

func (s *m3storage) FetchCompressed(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
) (consolidators.MultiFetchResult, error) {
	queryOptions, err := storage.FetchOptionsToM3Options(options, query)
	if err != nil {
		return nil, err
	}

	accumulator, _, err := s.fetchCompressed(ctx, query, options, queryOptions)
	return accumulator, err
}

func (s *m3storage) FetchCompressedResult(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
) (consolidators.SeriesFetchResult, Cleanup, error) {
	queryOptions, err := storage.FetchOptionsToM3Options(options, query)
	if err != nil {
		return consolidators.SeriesFetchResult{
			Metadata: block.NewResultMetadata(),
		}, noop, err
	}

	accumulator, m3query, err := s.fetchCompressed(ctx, query, options, queryOptions)
	if err != nil {
		return consolidators.SeriesFetchResult{
			Metadata: block.NewResultMetadata(),
		}, noop, err
	}

	result, attrs, err := accumulator.FinalResultWithAttrs()
	if err != nil {
		accumulator.Close()
		return result, noop, err
	}

	if processor := s.opts.SeriesIteratorProcessor(); processor != nil {
		_, span, sampled := xcontext.StartSampledTraceSpan(ctx,
			tracepoint.FetchCompressedInspectSeries)
		iters := result.SeriesIterators()
		if err := processor.InspectSeries(ctx, m3query, queryOptions, iters); err != nil {
			s.logger.Error("error inspecting series", zap.Error(err))
		}
		if sampled {
			span.LogFields(
				log.String("query", query.Raw),
				log.String("start", query.Start.String()),
				log.String("end", query.End.String()),
				log.String("interval", query.Interval.String()),
			)
		}
		span.Finish()
	}

	resolutions := make([]time.Duration, 0, len(attrs))
	for _, attr := range attrs {
		resolutions = append(resolutions, attr.Resolution)
	}

	result.Metadata.Resolutions = resolutions
	return result, accumulator.Close, nil
}
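
// Caller-side sketch (illustrative only): the returned Cleanup closes the
// accumulator and must run only after the result has been consumed. The
// variable names here are assumptions.
//
//	res, cleanup, err := store.FetchCompressedResult(ctx, query, options)
//	if err != nil {
//		return err
//	}
//	defer cleanup()
//	iters := res.SeriesIterators()
//	// ... consume iters before cleanup runs ...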

// fetchCompressed fetches compressed series, returning a MultiFetchResult
// accumulator.
func (s *m3storage) fetchCompressed(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
	queryOptions index.QueryOptions,
) (consolidators.MultiFetchResult, index.Query, error) {
	if err := options.BlockType.Validate(); err != nil {
		// This is an invariant error; we should not be able to get here.
		return nil, index.Query{}, instrument.InvariantErrorf("invalid block type on "+
			"fetch, got: %v with error %v", options.BlockType, err)
	}

	// Check if the query was interrupted.
	select {
	case <-ctx.Done():
		return nil, index.Query{}, ctx.Err()
	default:
	}

	m3query, err := storage.FetchQueryToM3Query(query, options)
	if err != nil {
		return nil, index.Query{}, err
	}

	var (
		queryStart = queryOptions.StartInclusive
		queryEnd   = queryOptions.EndExclusive
	)

	// NB(r): Since we don't use a single index we fan out to each
	// cluster that can completely fulfill this range and then prefer the
	// highest resolution (most fine-grained) results.
	// This needs to be optimized; however, this is a start.
	fanout, namespaces, err := resolveClusterNamespacesForQuery(
		xtime.ToUnixNano(s.nowFn()),
		queryStart,
		queryEnd,
		s.clusters,
		options.FanoutOptions,
		options.RestrictQueryOptions,
		options.RelatedQueryOptions,
	)
	if err != nil {
		return nil, index.Query{}, err
	}

	if s.logger.Core().Enabled(zapcore.DebugLevel) {
		for _, n := range namespaces {
			// NB(r): Need to perform log on inner loop, cannot reuse a
			// checked entry returned from logger.Check(...).
			// Will see: "Unsafe CheckedEntry re-use near Entry ..." otherwise.
			debugLog := s.logger.Check(zapcore.DebugLevel,
				"query resolved cluster namespace, will use most granular per result")
			if debugLog == nil {
				continue
			}

			debugLog.Write(
				zap.String("query", query.Raw),
				zap.String("m3query", m3query.String()),
				zap.Time("start", queryStart.ToTime()),
				zap.Time("narrowing.start", n.narrowing.start.ToTime()),
				zap.Time("end", queryEnd.ToTime()),
				zap.Time("narrowing.end", n.narrowing.end.ToTime()),
				zap.String("fanoutType", fanout.String()),
				zap.String("namespace", n.NamespaceID().String()),
				zap.String("type", n.Options().Attributes().MetricsType.String()),
				zap.String("retention", n.Options().Attributes().Retention.String()),
				zap.String("resolution", n.Options().Attributes().Resolution.String()),
				zap.Bool("remote", options.Remote))
		}
	}

	var wg sync.WaitGroup
	if len(namespaces) == 0 {
		return nil, index.Query{}, errNoNamespacesConfigured
	}

	matchOpts := s.opts.SeriesConsolidationMatchOptions()
	tagOpts := s.opts.TagOptions()
	limitOpts := consolidators.LimitOptions{
		Limit: options.SeriesLimit,
		// Piggyback on the new InstanceMultiple option to enable checking require
		// exhaustive. This preserves the existing buggy behavior of the coordinators
		// not requiring exhaustive. Once InstanceMultiple is enabled by default,
		// this can be removed.
		RequireExhaustive: queryOptions.InstanceMultiple > 0 && options.RequireExhaustive,
	}
	result := consolidators.NewMultiFetchResult(fanout, matchOpts, tagOpts, limitOpts)
	for _, namespace := range namespaces {
		namespace := namespace // Capture var

		wg.Add(1)
		go func() {
			defer wg.Done()
			_, span, sampled := xcontext.StartSampledTraceSpan(ctx,
				tracepoint.FetchCompressedFetchTagged)
			defer span.Finish()

			session := namespace.Session()
			namespaceID := namespace.NamespaceID()
			narrowedQueryOpts := narrowQueryOpts(queryOptions, namespace)
			iters, metadata, err := session.FetchTagged(ctx, namespaceID, m3query, narrowedQueryOpts)
			if err == nil && sampled {
				span.LogFields(
					log.String("namespace", namespaceID.String()),
					log.Int("series", iters.Len()),
					log.Bool("exhaustive", metadata.Exhaustive),
					log.Int("responses", metadata.Responses),
					log.Int("estimateTotalBytes", metadata.EstimateTotalBytes),
				)
			}

			blockMeta := block.NewResultMetadata()
			blockMeta.AddNamespace(namespaceID.String())
			blockMeta.FetchedResponses = metadata.Responses
			blockMeta.FetchedBytesEstimate = metadata.EstimateTotalBytes
			blockMeta.Exhaustive = metadata.Exhaustive
			blockMeta.WaitedIndex = metadata.WaitedIndex
			blockMeta.WaitedSeriesRead = metadata.WaitedSeriesRead
			// Hand the per-namespace result (including any fetch error) to the
			// accumulator, which resolves errors and partial results across
			// namespaces.
			result.Add(consolidators.MultiFetchResults{
				SeriesIterators: iters,
				Metadata:        blockMeta,
				Attrs:           namespace.Options().Attributes(),
				Err:             err,
			})
		}()
	}

	wg.Wait()

	// Check if the query was interrupted.
	select {
	case <-ctx.Done():
		return nil, index.Query{}, ctx.Err()
	default:
	}

	return result, m3query, err
}

func (s *m3storage) SearchSeries(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
) (*storage.SearchResults, error) {
	tagResult, cleanup, err := s.SearchCompressed(ctx, query, options)
	defer cleanup()
	if err != nil {
		return nil, err
	}

	metrics := make(models.Metrics, 0, len(tagResult.Tags))
	for _, result := range tagResult.Tags {
		m, err := storage.FromM3IdentToMetric(result.ID,
			result.Iter, s.opts.TagOptions())
		if err != nil {
			return nil, err
		}

		metrics = append(metrics, m)
	}

	return &storage.SearchResults{
		Metrics:  metrics,
		Metadata: tagResult.Metadata,
	}, nil
}

// CompleteTagsCompressed has the same behavior as CompleteTags.
func (s *m3storage) CompleteTagsCompressed(
	ctx context.Context,
	query *storage.CompleteTagsQuery,
	options *storage.FetchOptions,
) (*consolidators.CompleteTagsResult, error) {
	return s.CompleteTags(ctx, query, options)
}

func (s *m3storage) CompleteTags(
	ctx context.Context,
	query *storage.CompleteTagsQuery,
	options *storage.FetchOptions,
) (*consolidators.CompleteTagsResult, error) {
	// Check if the query was interrupted.
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	default:
	}

	fetchQuery := &storage.FetchQuery{
		TagMatchers: query.TagMatchers,
	}

	m3query, err := storage.FetchQueryToM3Query(fetchQuery, options)
	if err != nil {
		return nil, err
	}

	aggOpts, err := storage.FetchOptionsToAggregateOptions(options, query)
	if err != nil {
		return nil, err
	}

	var (
		queryStart      = aggOpts.StartInclusive
		queryEnd        = aggOpts.EndExclusive
		nameOnly        = query.CompleteNameOnly
		tagOpts         = s.opts.TagOptions()
		accumulatedTags = consolidators.NewCompleteTagsResultBuilder(nameOnly, tagOpts)
		multiErr        syncMultiErrs
		wg              sync.WaitGroup
	)

	debugLog := s.logger.Check(zapcore.DebugLevel,
		"completing tags")
	if debugLog != nil {
		filters := make([]string, len(query.FilterNameTags))
		for i, t := range query.FilterNameTags {
			filters[i] = string(t)
		}

		debugLog.Write(zap.Bool("nameOnly", nameOnly),
			zap.Strings("filterNames", filters),
			zap.String("matchers", query.TagMatchers.String()),
			zap.String("m3query", m3query.String()),
			zap.Time("start", queryStart.ToTime()),
			zap.Time("end", queryEnd.ToTime()),
			zap.Bool("remote", options.Remote),
		)
	}

	// NB(r): Since we don't use a single index we fan out to each
	// cluster that can completely fulfill this range and then prefer the
	// highest resolution (most fine-grained) results.
	// This needs to be optimized; however, this is a start.
	_, namespaces, err := resolveClusterNamespacesForQuery(xtime.ToUnixNano(s.nowFn()),
		queryStart,
		queryEnd,
		s.clusters,
		options.FanoutOptions,
		options.RestrictQueryOptions,
		nil)
	if err != nil {
		return nil, err
	}

	var mu sync.Mutex
	aggIterators := make([]client.AggregatedTagsIterator, 0, len(namespaces))
	defer func() {
		mu.Lock()
		for _, it := range aggIterators {
			it.Finalize()
		}

		mu.Unlock()
	}()

	wg.Add(len(namespaces))
	for _, namespace := range namespaces {
		namespace := namespace // Capture var
		go func() {
			_, span, sampled := xcontext.StartSampledTraceSpan(ctx, tracepoint.CompleteTagsAggregate)
			defer func() {
				span.Finish()
				wg.Done()
			}()

			session := namespace.Session()
			namespaceID := namespace.NamespaceID()
			narrowedAggOpts := narrowAggOpts(aggOpts, namespace)
			aggTagIter, metadata, err := session.Aggregate(ctx, namespaceID, m3query, narrowedAggOpts)
			if err != nil {
				multiErr.add(err)
				return
			}

			if sampled {
				span.LogFields(
					log.String("namespace", namespaceID.String()),
					log.Int("results", aggTagIter.Remaining()),
					log.Bool("exhaustive", metadata.Exhaustive),
					log.Int("responses", metadata.Responses),
					log.Int("estimateTotalBytes", metadata.EstimateTotalBytes),
				)
			}

			mu.Lock()
			aggIterators = append(aggIterators, aggTagIter)
			mu.Unlock()

			completedTags := make([]consolidators.CompletedTag, 0, aggTagIter.Remaining())
			for aggTagIter.Next() {
				name, values := aggTagIter.Current()
				tagValues := make([][]byte, 0, values.Remaining())
				for values.Next() {
					tagValues = append(tagValues, values.Current().Bytes())
				}

				if err := values.Err(); err != nil {
					multiErr.add(err)
					return
				}

				completedTags = append(completedTags, consolidators.CompletedTag{
					Name:   name.Bytes(),
					Values: tagValues,
				})
			}

			if err := aggTagIter.Err(); err != nil {
				multiErr.add(err)
				return
			}

			blockMeta := block.NewResultMetadata()
			blockMeta.AddNamespace(namespaceID.String())
			blockMeta.FetchedResponses = metadata.Responses
			blockMeta.FetchedBytesEstimate = metadata.EstimateTotalBytes
			blockMeta.Exhaustive = metadata.Exhaustive
			blockMeta.WaitedIndex = metadata.WaitedIndex
			blockMeta.WaitedSeriesRead = metadata.WaitedSeriesRead
			result := &consolidators.CompleteTagsResult{
				CompleteNameOnly: query.CompleteNameOnly,
				CompletedTags:    completedTags,
				Metadata:         blockMeta,
			}

			if err := accumulatedTags.Add(result); err != nil {
				multiErr.add(err)
			}
		}()
	}

	wg.Wait()
	if err := multiErr.lastError(); err != nil {
		return nil, err
	}

	built := accumulatedTags.Build()
	return &built, nil
}

func (s *m3storage) SearchCompressed(
	ctx context.Context,
	query *storage.FetchQuery,
	options *storage.FetchOptions,
) (consolidators.TagResult, Cleanup, error) {
	// Check if the query was interrupted.
	tagResult := consolidators.TagResult{
		Metadata: block.NewResultMetadata(),
	}

	select {
	case <-ctx.Done():
		return tagResult, noop, ctx.Err()
	default:
	}

	m3query, err := storage.FetchQueryToM3Query(query, options)
	if err != nil {
		return tagResult, noop, err
	}

	m3opts, err := storage.FetchOptionsToM3Options(options, query)
	if err != nil {
		return tagResult, noop, err
	}

	var (
		queryStart = m3opts.StartInclusive
		queryEnd   = m3opts.EndExclusive
		result     = consolidators.NewMultiFetchTagsResult(s.opts.TagOptions())
		wg         sync.WaitGroup
	)

	// NB(r): Since we don't use a single index we fan out to each
	// cluster that can completely fulfill this range and then prefer the
	// highest resolution (most fine-grained) results.
	// This needs to be optimized; however, this is a start.
	_, namespaces, err := resolveClusterNamespacesForQuery(xtime.ToUnixNano(s.nowFn()),
		queryStart,
		queryEnd,
		s.clusters,
		options.FanoutOptions,
		options.RestrictQueryOptions,
		nil)
	if err != nil {
		return tagResult, noop, err
	}

	debugLog := s.logger.Check(zapcore.DebugLevel,
		"searching")
	if debugLog != nil {
		debugLog.Write(zap.String("query", query.Raw),
			zap.String("m3_query", m3query.String()),
			zap.Time("start", queryStart.ToTime()),
			zap.Time("end", queryEnd.ToTime()),
			zap.Bool("remote", options.Remote),
		)
	}

	wg.Add(len(namespaces))
	for _, namespace := range namespaces {
		namespace := namespace // Capture var
		go func() {
			_, span, sampled := xcontext.StartSampledTraceSpan(ctx,
				tracepoint.SearchCompressedFetchTaggedIDs)
			defer span.Finish()

			session := namespace.Session()
			namespaceID := namespace.NamespaceID()
			narrowedM3Opts := narrowQueryOpts(m3opts, namespace)
			iter, metadata, err := session.FetchTaggedIDs(ctx, namespaceID, m3query, narrowedM3Opts)
			if err == nil && sampled {
				span.LogFields(
					log.String("namespace", namespaceID.String()),
					log.Int("series", iter.Remaining()),
					log.Bool("exhaustive", metadata.Exhaustive),
					log.Int("responses", metadata.Responses),
					log.Int("estimateTotalBytes", metadata.EstimateTotalBytes),
				)
			}

			blockMeta := block.NewResultMetadata()
			blockMeta.AddNamespace(namespaceID.String())
			blockMeta.FetchedResponses = metadata.Responses
			blockMeta.FetchedBytesEstimate = metadata.EstimateTotalBytes
			blockMeta.Exhaustive = metadata.Exhaustive
			blockMeta.WaitedIndex = metadata.WaitedIndex
			blockMeta.WaitedSeriesRead = metadata.WaitedSeriesRead
			result.Add(iter, blockMeta, err)
			wg.Done()
		}()
	}

	wg.Wait()

	tagResult, err = result.FinalResult()
	return tagResult, result.Close, err
}

func (s *m3storage) Write(
	ctx context.Context,
	query *storage.WriteQuery,
) error {
	if query == nil {
		return errors.ErrNilWriteQuery
	}

	var (
		// TODO: Pool this once an ident pool is set up. We will have
		// to stop calling NoFinalize() below if we do that.
		tags       = query.Tags()
		datapoints = query.Datapoints()
		idBuf      = tags.ID()
		id         = ident.BytesID(idBuf)
		err        error
		namespace  ClusterNamespace
		exists     bool
	)

	attributes := query.Attributes()
	switch attributes.MetricsType {
	case storagemetadata.UnaggregatedMetricsType:
		namespace, exists = s.clusters.UnaggregatedClusterNamespace()
		if !exists {
			err = errUnaggregatedNamespaceUninitialized
		}
	case storagemetadata.AggregatedMetricsType:
		attrs := RetentionResolution{
			Retention:  attributes.Retention,
			Resolution: attributes.Resolution,
		}
		namespace, exists = s.clusters.AggregatedClusterNamespace(attrs)
		if !exists {
			err = fmt.Errorf("no configured cluster namespace for: retention=%s,"+
				" resolution=%s", attrs.Retention.String(), attrs.Resolution.String())
			break
		}
		if namespace.Options().ReadOnly() {
			err = fmt.Errorf(
				"cannot write to read only namespace %s (%s:%s)",
				namespace.NamespaceID(), attrs.Resolution.String(), attrs.Retention.String())
		}
	default:
		metricsType := attributes.MetricsType
		err = fmt.Errorf("invalid write request metrics type: %s (%d)",
			metricsType.String(), uint(metricsType))
	}
	if err != nil {
		return err
	}

	// Set id to NoFinalize to avoid cloning it in write operations.
	id.NoFinalize()

	if s.opts.RateLimiter().Limit(ctx, namespace, datapoints, tags.Tags) {
		return xerrors.NewResourceExhaustedError(goerrors.New("rate limit exceeded"))
	}

	tags.Tags, err = s.opts.TagsTransform()(ctx, namespace, tags.Tags)
	if err != nil {
		return err
	}
	tagIterator := storage.TagsToIdentTagIterator(tags)

	if len(datapoints) == 1 {
		// Special case a single datapoint because it is common, and we can
		// avoid the overhead of a waitgroup, goroutine, multierr, iterator
		// duplication, etc.
		return s.writeSingle(query, datapoints[0], id, tagIterator, namespace)
	}

	var (
		wg       sync.WaitGroup
		multiErr syncMultiErrs
	)

	for _, datapoint := range datapoints {
		tagIter := tagIterator.Duplicate()
		// Capture var.
		datapoint := datapoint
		wg.Add(1)

		var (
			now                      = time.Now()
			deadline, deadlineExists = ctx.Deadline()
			timeout                  = minWriteWaitTimeout
		)
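		// Wait for a free worker up to the context deadline when it allows
		// more time than minWriteWaitTimeout; otherwise wait the one-second
		// floor so a brief pool backlog does not immediately fail the write.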
		if deadlineExists {
			if remain := deadline.Sub(now); remain >= timeout {
				timeout = remain
			}
		}
		spawned := s.opts.WriteWorkerPool().GoWithTimeout(func() {
			if err := s.writeSingle(query, datapoint, id, tagIter, namespace); err != nil {
				multiErr.add(err)
			}

			tagIter.Close()
			wg.Done()
		}, timeout)
		if !spawned {
			// Balance the waitgroup and release the duplicated iterator for
			// the write that never ran, otherwise Wait() below deadlocks.
			wg.Done()
			tagIter.Close()
			multiErr.add(fmt.Errorf("timeout exceeded waiting: %v", timeout))
		}
	}

	wg.Wait()
	return multiErr.lastError()
}

func (s *m3storage) Type() storage.Type {
	return storage.TypeLocalDC
}

func (s *m3storage) Close() error {
	return nil
}

func (s *m3storage) writeSingle(
	query *storage.WriteQuery,
	datapoint ts.Datapoint,
	identID ident.ID,
	iterator ident.TagIterator,
	namespace ClusterNamespace,
) error {
	namespaceID := namespace.NamespaceID()
	session := namespace.Session()
	return session.WriteTagged(namespaceID, identID, iterator,
		datapoint.Timestamp, datapoint.Value, query.Unit(), query.Annotation())
}

// narrowQueryOpts intersects the query options' time range with the
// namespace's narrowing window, if one is set.
func narrowQueryOpts(o index.QueryOptions, namespace resolvedNamespace) index.QueryOptions {
	narrowed := o
	if !namespace.narrowing.start.IsZero() && namespace.narrowing.start.After(o.StartInclusive) {
		narrowed.StartInclusive = namespace.narrowing.start
	}
	if !namespace.narrowing.end.IsZero() && namespace.narrowing.end.Before(o.EndExclusive) {
		narrowed.EndExclusive = namespace.narrowing.end
	}

	return narrowed
}
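
// Illustrative sketch (hypothetical times, not from the original file):
// narrowing shrinks the query range to the window the namespace can actually
// serve, and zero-valued narrowing bounds leave the range untouched.
//
//	// query [10:00, 11:00), namespace resolvable for [10:15, 10:45)
//	narrowed := narrowQueryOpts(opts, ns)
//	// narrowed.StartInclusive == 10:15, narrowed.EndExclusive == 10:45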

// narrowAggOpts narrows the embedded query options of the aggregation
// options in the same way.
func narrowAggOpts(o index.AggregationOptions, namespace resolvedNamespace) index.AggregationOptions {
	narrowed := o
	narrowed.QueryOptions = narrowQueryOpts(o.QueryOptions, namespace)

	return narrowed
}