github.com/grafana/pyroscope@v1.18.0/pkg/querier/replication.go (about)

     1  package querier
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"fmt"
     7  	"sort"
     8  
     9  	"github.com/cespare/xxhash/v2"
    10  	"github.com/go-kit/log"
    11  	"github.com/go-kit/log/level"
    12  	"github.com/grafana/dskit/ring"
    13  	"github.com/opentracing/opentracing-go"
    14  	otlog "github.com/opentracing/opentracing-go/log"
    15  	"github.com/samber/lo"
    16  	"golang.org/x/sync/errgroup"
    17  
    18  	ingestv1 "github.com/grafana/pyroscope/api/gen/proto/go/ingester/v1"
    19  	typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1"
    20  	"github.com/grafana/pyroscope/pkg/phlaredb/sharding"
    21  	"github.com/grafana/pyroscope/pkg/util"
    22  	"github.com/grafana/pyroscope/pkg/util/spanlogger"
    23  )
    24  
// ResponseFromReplica pairs a replica's response with the address of the
// instance that produced it.
type ResponseFromReplica[T any] struct {
	addr     string // instance address the response came from
	response T
}

// QueryReplicaFn queries a single replica through its client and returns the result.
type QueryReplicaFn[T any, Querier any] func(ctx context.Context, q Querier) (T, error)

// QueryReplicaWithHintsFn queries a single replica through its client, scoped
// to the blocks listed in the given hints.
type QueryReplicaWithHintsFn[T any, Querier any] func(ctx context.Context, q Querier, hint *ingestv1.Hints) (T, error)

// Closer is implemented by streamed responses whose request and response
// sides both need to be closed after use.
type Closer interface {
	CloseRequest() error
	CloseResponse() error
}

// ClientFactory returns a querier client for the given instance address.
type ClientFactory[T any] func(addr string) (T, error)
    40  
    41  // cleanupResult, will make sure if the result was streamed, that we close the request and response
    42  func cleanupStreams[Result any](result ResponseFromReplica[Result]) {
    43  	if stream, ok := any(result.response).(interface {
    44  		CloseRequest() error
    45  	}); ok {
    46  		if err := stream.CloseRequest(); err != nil {
    47  			level.Warn(util.Logger).Log("msg", "failed to close request", "err", err)
    48  		}
    49  	}
    50  	if stream, ok := any(result.response).(interface {
    51  		CloseResponse() error
    52  	}); ok {
    53  		if err := stream.CloseResponse(); err != nil {
    54  			level.Warn(util.Logger).Log("msg", "failed to close response", "err", err)
    55  		}
    56  	}
    57  }
    58  
    59  // forGivenReplicationSet runs f, in parallel, for given replica set.
    60  // Under the hood it returns only enough responses to satisfy the quorum.
    61  func forGivenReplicationSet[Result any, Querier any](ctx context.Context, clientFactory func(string) (Querier, error), replicationSet ring.ReplicationSet, f QueryReplicaFn[Result, Querier]) ([]ResponseFromReplica[Result], error) {
    62  	results, err := ring.DoUntilQuorumWithoutSuccessfulContextCancellation(
    63  		ctx,
    64  		replicationSet,
    65  		ring.DoUntilQuorumConfig{
    66  			MinimizeRequests: true,
    67  		},
    68  		func(ctx context.Context, ingester *ring.InstanceDesc, _ context.CancelCauseFunc) (ResponseFromReplica[Result], error) {
    69  			var res ResponseFromReplica[Result]
    70  			client, err := clientFactory(ingester.Addr)
    71  			if err != nil {
    72  				return res, err
    73  			}
    74  
    75  			resp, err := f(ctx, client)
    76  			if err != nil {
    77  				return res, err
    78  			}
    79  
    80  			return ResponseFromReplica[Result]{ingester.Addr, resp}, nil
    81  		},
    82  		cleanupStreams[Result],
    83  	)
    84  	if err != nil {
    85  		return nil, err
    86  	}
    87  
    88  	return results, err
    89  }
    90  
    91  // forGivenPlan runs f, in parallel, for given plan.
    92  func forGivenPlan[Result any, Querier any](
    93  	ctx context.Context,
    94  	plan map[string]*blockPlanEntry,
    95  	clientFactory func(string) (Querier, error),
    96  	replicationSet ring.ReplicationSet, f QueryReplicaWithHintsFn[Result, Querier],
    97  ) ([]ResponseFromReplica[Result], error) {
    98  	g, _ := errgroup.WithContext(ctx)
    99  
   100  	var (
   101  		idx    = 0
   102  		result = make([]ResponseFromReplica[Result], len(plan))
   103  	)
   104  
   105  	for replica, planEntry := range plan {
   106  		if !replicationSet.Includes(replica) {
   107  			continue
   108  		}
   109  		var (
   110  			i = idx
   111  			r = replica
   112  			h = planEntry.BlockHints
   113  		)
   114  		idx++
   115  		g.Go(func() error {
   116  			client, err := clientFactory(r)
   117  			if err != nil {
   118  				return err
   119  			}
   120  
   121  			resp, err := f(ctx, client, &ingestv1.Hints{Block: h})
   122  			if err != nil {
   123  				return err
   124  			}
   125  
   126  			result[i] = ResponseFromReplica[Result]{r, resp}
   127  
   128  			return nil
   129  		})
   130  	}
   131  
   132  	if err := g.Wait(); err != nil {
   133  		return nil, err
   134  	}
   135  
   136  	result = result[:idx]
   137  
   138  	return result, nil
   139  }
   140  
// instanceType identifies which component role a replica serves.
type instanceType uint8

const (
	unknownInstanceType instanceType = iota
	ingesterInstance
	storeGatewayInstance
)
   148  
// map of block ID to replicas containing the block, when empty replicas, the
// block is already contained by a higher compaction level block in full.
type replicasPerBlockID struct {
	m             map[string][]string          // block ULID -> replica addresses holding it
	meta          map[string]*typesv1.BlockInfo // block ULID -> block meta (immutable across replicas)
	instanceTypes map[string][]instanceType    // replica address -> roles it serves
	logger        log.Logger
}
   157  
   158  func newReplicasPerBlockID(logger log.Logger) *replicasPerBlockID {
   159  	return &replicasPerBlockID{
   160  		m:             make(map[string][]string),
   161  		meta:          make(map[string]*typesv1.BlockInfo),
   162  		instanceTypes: make(map[string][]instanceType),
   163  		logger:        logger,
   164  	}
   165  }
   166  
   167  func (r *replicasPerBlockID) add(result []ResponseFromReplica[[]*typesv1.BlockInfo], t instanceType) {
   168  	for _, replica := range result {
   169  		// mark the replica's instance types (in single binary we can have the same replica have multiple types)
   170  		r.instanceTypes[replica.addr] = append(r.instanceTypes[replica.addr], t)
   171  
   172  		for _, block := range replica.response {
   173  			// add block to map
   174  			v, exists := r.m[block.Ulid]
   175  			if exists && len(v) > 0 || !exists {
   176  				r.m[block.Ulid] = append(r.m[block.Ulid], replica.addr)
   177  			}
   178  
   179  			// add block meta to map
   180  			// note: we do override existing meta, as meta is immutable for all replicas
   181  			r.meta[block.Ulid] = block
   182  		}
   183  	}
   184  }
   185  
   186  func shardFromBlock(m *typesv1.BlockInfo) (shard uint64, shardCount uint64, ok bool) {
   187  	for _, lp := range m.Labels {
   188  		if lp.Name != sharding.CompactorShardIDLabel {
   189  			continue
   190  		}
   191  
   192  		shardID, shardCount, err := sharding.ParseShardIDLabelValue(lp.Value)
   193  		if err == nil {
   194  			return shardID, shardCount, true
   195  		}
   196  	}
   197  
   198  	return 0, 0, false
   199  }
   200  
   201  func (r *replicasPerBlockID) removeBlock(ulid string) {
   202  	delete(r.m, ulid)
   203  	delete(r.meta, ulid)
   204  }
   205  
// this step removes sharded blocks that don't have all the shards present for a time window
//
// The returned bool reports whether any sharded block was seen at all (even
// ones that were pruned), which tells the caller that split sharding is in
// use for this data set.
func (r *replicasPerBlockID) pruneIncompleteShardedBlocks() (bool, error) {
	// shards of one split compaction share both the compaction level and the
	// window start time, so that pair is the grouping key
	type compactionKey struct {
		level   int32
		minTime int64
	}
	compactions := make(map[compactionKey][]string)

	// group blocks by compaction level
	for blockID := range r.m {
		meta, ok := r.meta[blockID]
		if !ok {
			return false, fmt.Errorf("meta missing for block id %s", blockID)
		}

		key := compactionKey{
			level:   0, // blocks without a compaction section count as level 0
			minTime: meta.MinTime,
		}

		if meta.Compaction != nil {
			key.level = meta.Compaction.Level
		}
		compactions[key] = append(compactions[key], blockID)
	}

	// now we go through every group and check if we see at least a block for each shard
	var (
		shardsSeen       []bool   // shardsSeen[i] is set once shard i is observed in the current group
		shardedBlocks    []string // sharded block IDs of the current group, kept in case they need pruning
		hasShardedBlocks bool
	)
	for _, blocks := range compactions {
		// reset the scratch slices, keeping their capacity across groups
		shardsSeen = shardsSeen[:0]
		shardedBlocks = shardedBlocks[:0]
		for _, block := range blocks {
			meta, ok := r.meta[block]
			if !ok {
				return false, fmt.Errorf("meta missing for block id %s", block)
			}

			shardIdx, shards, ok := shardFromBlock(meta)
			if !ok {
				// not a sharded block continue
				continue
			}
			hasShardedBlocks = true
			shardedBlocks = append(shardedBlocks, block)

			// size shardsSeen from the first sharded block of the group
			if len(shardsSeen) == 0 {
				if cap(shardsSeen) < int(shards) {
					shardsSeen = make([]bool, shards)
				} else {
					// capacity suffices: reslice and clear the stale values
					shardsSeen = shardsSeen[:shards]
					for idx := range shardsSeen {
						shardsSeen[idx] = false
					}
				}
			}

			// every block within a group must agree on the shard count
			if len(shardsSeen) != int(shards) {
				return false, fmt.Errorf("shard length mismatch, shards seen: %d, shards as per label: %d", len(shardsSeen), shards)
			}

			shardsSeen[shardIdx] = true
		}
		// check if all shards are present
		allShardsPresent := true
		for _, shardSeen := range shardsSeen {
			if !shardSeen {
				allShardsPresent = false
				break
			}
		}

		if allShardsPresent {
			continue
		}

		// now remove all blocks that are sharded but not complete
		for _, block := range shardedBlocks {
			r.removeBlock(block)
		}
	}

	return hasShardedBlocks, nil
}
   293  
// prunes blocks that are contained by a higher compaction level block
//
// For every compacted block that is kept, its parent and source blocks are
// removed from the index, since the compacted block contains their data.
// Note: deleting map entries while ranging over r.m is allowed by the Go
// spec; removed keys are simply not visited.
func (r *replicasPerBlockID) pruneSupersededBlocks(sharded bool) error {
	for blockID := range r.m {
		meta, ok := r.meta[blockID]
		if !ok {
			return fmt.Errorf("meta missing for block id %s", blockID)
		}
		if meta.Compaction == nil {
			continue
		}
		if meta.Compaction.Level < 2 {
			// level 0/1 blocks supersede nothing
			continue
		}
		// At split phase of compaction, L2 is an intermediate step where we
		// split each group into split_shards parts, thus there will be up to
		// groups_num * split_shards blocks, which is typically _significantly_
		// greater that the number of source blocks. Moreover, these blocks are
		// not yet deduplicated, therefore we should prefer L1 blocks over them.
		// As an optimisation, we drop all L2 blocks.
		if sharded && meta.Compaction.Level == 2 {
			r.removeBlock(blockID)
			continue
		}
		for _, blockID := range meta.Compaction.Parents {
			r.removeBlock(blockID)
		}
		for _, blockID := range meta.Compaction.Sources {
			r.removeBlock(blockID)
		}
	}
	return nil
}
   326  
// blockPlanEntry describes the work assigned to a single replica: the block
// hints (ULIDs plus deduplication flag) and the roles the replica serves.
type blockPlanEntry struct {
	*ingestv1.BlockHints
	InstanceTypes []instanceType
}

// blockPlan maps a replica address to the blocks it has been assigned.
type blockPlan map[string]*blockPlanEntry
   333  
   334  func BlockHints(p blockPlan, replica string) (*ingestv1.BlockHints, error) {
   335  	entry, ok := p[replica]
   336  	if !ok && p != nil {
   337  		return nil, fmt.Errorf("no hints found for replica %s", replica)
   338  	}
   339  	if entry == nil {
   340  		return nil, nil
   341  	}
   342  	return entry.BlockHints, nil
   343  }
   344  
   345  func (p blockPlan) String() string {
   346  	data, _ := json.Marshal(p)
   347  	return string(data)
   348  }
   349  
   350  func (r *replicasPerBlockID) blockPlan(ctx context.Context) map[string]*blockPlanEntry {
   351  	sp, _ := opentracing.StartSpanFromContext(ctx, "blockPlan")
   352  	defer sp.Finish()
   353  
   354  	var (
   355  		deduplicate             = false
   356  		hash                    = xxhash.New()
   357  		plan                    = make(map[string]*blockPlanEntry)
   358  		smallestCompactionLevel = int32(0)
   359  	)
   360  
   361  	sharded, err := r.pruneIncompleteShardedBlocks()
   362  	if err != nil {
   363  		level.Warn(r.logger).Log("msg", "block planning failed to prune incomplete sharded blocks", "err", err)
   364  		return nil
   365  	}
   366  
   367  	// Depending on whether split sharding is used, the compaction level at
   368  	// which the data gets deduplicated differs: if split sharding is enabled,
   369  	// we deduplicate at level 3, and at level 2 otherwise.
   370  	var deduplicationLevel int32 = 2
   371  	if sharded {
   372  		deduplicationLevel = 3
   373  	}
   374  
   375  	if err := r.pruneSupersededBlocks(sharded); err != nil {
   376  		level.Warn(r.logger).Log("msg", "block planning failed to prune superseded blocks", "err", err)
   377  		return nil
   378  	}
   379  
   380  	// now we go through all blocks and choose the replicas that we want to query
   381  	for blockID, replicas := range r.m {
   382  		// skip if we have no replicas, then block is already contained i an higher compaction level one
   383  		if len(replicas) == 0 {
   384  			continue
   385  		}
   386  
   387  		meta, ok := r.meta[blockID]
   388  		if !ok {
   389  			continue
   390  		}
   391  		// when we see a block with CompactionLevel less than the level at which data is deduplicated,
   392  		// or a block without compaction section, we want the queriers to deduplicate
   393  		if meta.Compaction == nil || meta.Compaction.Level < deduplicationLevel {
   394  			deduplicate = true
   395  		}
   396  
   397  		// record the lowest compaction level
   398  		if meta.Compaction != nil && (smallestCompactionLevel == 0 || meta.Compaction.Level < smallestCompactionLevel) {
   399  			smallestCompactionLevel = meta.Compaction.Level
   400  		}
   401  
   402  		// only get store gateways replicas
   403  		sgReplicas := lo.Filter(replicas, func(replica string, _ int) bool {
   404  			instanceTypes, ok := r.instanceTypes[replica]
   405  			if !ok {
   406  				return false
   407  			}
   408  			for _, t := range instanceTypes {
   409  				if t == storeGatewayInstance {
   410  					return true
   411  				}
   412  			}
   413  			return false
   414  		})
   415  
   416  		if len(sgReplicas) > 0 {
   417  			// if we have store gateway replicas, we want to query them
   418  			replicas = sgReplicas
   419  		}
   420  
   421  		// now select one replica, based on block id
   422  		sort.Strings(replicas)
   423  		hash.Reset()
   424  		_, _ = hash.WriteString(blockID)
   425  		hashIdx := int(hash.Sum64())
   426  		if hashIdx < 0 {
   427  			hashIdx = -hashIdx
   428  		}
   429  		selectedReplica := replicas[hashIdx%len(replicas)]
   430  
   431  		// add block to plan
   432  		p, exists := plan[selectedReplica]
   433  		if !exists {
   434  			p = &blockPlanEntry{
   435  				BlockHints:    &ingestv1.BlockHints{},
   436  				InstanceTypes: r.instanceTypes[selectedReplica],
   437  			}
   438  			plan[selectedReplica] = p
   439  		}
   440  		p.Ulids = append(p.Ulids, blockID)
   441  
   442  		// set the selected replica
   443  		r.m[blockID] = []string{selectedReplica}
   444  	}
   445  
   446  	// adapt the plan to make sure all replicas will deduplicate
   447  	if deduplicate {
   448  		for _, hints := range plan {
   449  			hints.Deduplication = deduplicate
   450  		}
   451  	}
   452  
   453  	var plannedIngesterBlocks, plannedStoreGatewayBlocks int
   454  	for replica, blocks := range plan {
   455  		instanceTypes, ok := r.instanceTypes[replica]
   456  		if !ok {
   457  			continue
   458  		}
   459  		for _, t := range instanceTypes {
   460  			if t == storeGatewayInstance {
   461  				plannedStoreGatewayBlocks += len(blocks.Ulids)
   462  			}
   463  			if t == ingesterInstance {
   464  				plannedIngesterBlocks += len(blocks.Ulids)
   465  			}
   466  		}
   467  	}
   468  
   469  	sp.LogFields(
   470  		otlog.Bool("deduplicate", deduplicate),
   471  		otlog.Int32("smallest_compaction_level", smallestCompactionLevel),
   472  		otlog.Int("planned_blocks_ingesters", plannedIngesterBlocks),
   473  		otlog.Int("planned_blocks_store_gateways", plannedStoreGatewayBlocks),
   474  	)
   475  
   476  	level.Debug(spanlogger.FromContext(ctx, r.logger)).Log(
   477  		"msg", "block plan created",
   478  		"deduplicate", deduplicate,
   479  		"smallest_compaction_level", smallestCompactionLevel,
   480  		"planned_blocks_ingesters", plannedIngesterBlocks,
   481  		"planned_blocks_store_gateways", plannedStoreGatewayBlocks,
   482  		"plan", blockPlan(plan),
   483  	)
   484  
   485  	return plan
   486  }