github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/client/distributor/distributor.go (about)

     1  package distributor
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"slices"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/grafana/dskit/ring"
    12  
    13  	"github.com/grafana/pyroscope/pkg/iter"
    14  	"github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement"
    15  )
    16  
// NOTE(kolesnikovae): Essentially, we do not depend on the dskit/ring and
// only use it as a discovery mechanism built on top of the memberlist.
// It would be better to access the memberlist/serf directly.
//
// op matches instances in any non-left state: the distributor only needs
// to discover which instances exist, so even JOINING/LEAVING ones count.
var op = ring.NewOp([]ring.InstanceState{ring.ACTIVE, ring.LEAVING, ring.PENDING, ring.JOINING}, nil)

// defaultRingUpdateInterval is the maximum age of the cached ring
// snapshot before it is rebuilt from the ring.
const defaultRingUpdateInterval = 5 * time.Second
    23  
// Distributor maps placement keys to shards and to the instances that may
// host them, based on a periodically refreshed snapshot of the ring.
type Distributor struct {
	mu        sync.RWMutex
	ring      ring.ReadRing
	placement placement.Placement
	// distribution is an immutable snapshot of the ring state.
	// Guarded by mu; replaced wholesale when it expires.
	distribution *distribution

	// RingUpdateInterval is how long a distribution snapshot stays
	// valid before Distribute rebuilds it from the ring.
	RingUpdateInterval time.Duration
}
    32  
    33  func NewDistributor(placement placement.Placement, r ring.ReadRing) *Distributor {
    34  	return &Distributor{
    35  		ring:               r,
    36  		placement:          placement,
    37  		RingUpdateInterval: defaultRingUpdateInterval,
    38  	}
    39  }
    40  
    41  func (d *Distributor) Distribute(k placement.Key) (*placement.ShardMapping, error) {
    42  	if err := d.updateDistribution(d.ring, d.RingUpdateInterval); err != nil {
    43  		return nil, err
    44  	}
    45  	return d.distribute(k), nil
    46  }
    47  
    48  func (d *Distributor) updateDistribution(r ring.ReadRing, maxAge time.Duration) error {
    49  	d.mu.RLock()
    50  	x := d.distribution
    51  	if x != nil && !x.isExpired(maxAge) {
    52  		d.mu.RUnlock()
    53  		return nil
    54  	}
    55  	d.mu.RUnlock()
    56  	d.mu.Lock()
    57  	defer d.mu.Unlock()
    58  	x = d.distribution
    59  	if x != nil && !x.isExpired(maxAge) {
    60  		return nil
    61  	}
    62  	if x == nil {
    63  		x = newDistribution()
    64  	}
    65  	if err := x.readRing(r); err != nil {
    66  		return fmt.Errorf("failed to read ring: %w", err)
    67  	}
    68  	d.distribution = x
    69  	return nil
    70  }
    71  
// emptyMapping is returned by distributor if the ring is empty.
// This helps to handle a case when requests arrive before the
// ring is populated (no instances registered).
var emptyMapping = &placement.ShardMapping{
	Instances: iter.NewEmptyIterator[ring.InstanceDesc](),
	Shard:     0, // 0 is the sentinel "no shard" ID.
}
    79  
    80  func (d *Distributor) distribute(k placement.Key) *placement.ShardMapping {
    81  	d.mu.RLock()
    82  	defer d.mu.RUnlock()
    83  	// Determine the number of shards for the tenant within the available
    84  	// space, and the dataset shards within the tenant subring.
    85  	s := len(d.distribution.shards)
    86  	if s == 0 {
    87  		return emptyMapping
    88  	}
    89  	p := d.placement.Policy(k)
    90  	tenantSize := p.TenantShards
    91  	if tenantSize == 0 || tenantSize > s {
    92  		tenantSize = s
    93  	}
    94  	datasetSize := min(tenantSize, max(1, p.DatasetShards))
    95  	// When we create subrings, we need to ensure that each of them has at
    96  	// least p shards. However, the data distribution must be restricted
    97  	// according to the limits.
    98  	all := newSubring(s)
    99  	tenant := all.subring(k.Tenant, tenantSize)
   100  	dataset := tenant.subring(k.Dataset, datasetSize)
   101  	// We pick a shard from the dataset subring: its index is relative
   102  	// to the dataset subring.
   103  	offset := p.PickShard(datasetSize)
   104  	// Next we want to find p instances eligible to host the key.
   105  	// The choice must be limited to the dataset / tenant subring,
   106  	// but extended if needed.
   107  	return &placement.ShardMapping{
   108  		Shard:     uint32(dataset.at(offset)) + 1, // 0 shard ID is a sentinel
   109  		Instances: d.distribution.instances(dataset, offset),
   110  	}
   111  }
   112  
// distribution is a snapshot of the ring state. Once built, shards must be
// treated as immutable, as iterators returned to callers may reference it.
type distribution struct {
	timestamp time.Time           // When the snapshot was taken.
	shards    []uint32            // Shard ID -> Instance ID.
	desc      []ring.InstanceDesc // Instances, sorted by ID.
	perm      *perm               // Deterministic shard shuffle; reused across refreshes.
}
   119  
   120  func newDistribution() *distribution {
   121  	return &distribution{
   122  		timestamp: time.Now(),
   123  		perm:      new(perm),
   124  	}
   125  }
   126  
   127  func (d *distribution) isExpired(maxAge time.Duration) bool {
   128  	return time.Now().Add(-maxAge).After(d.timestamp)
   129  }
   130  
// readRing rebuilds the snapshot from the ring: it collects healthy
// instances, orders them deterministically, and derives the shard ->
// instance mapping (one shard per instance token), shuffled with a
// deterministic permutation. Returns ring.ErrEmptyRing when no instances
// are registered.
func (d *distribution) readRing(r ring.ReadRing) error {
	all, err := r.GetAllHealthy(op)
	if err != nil {
		return err
	}
	if len(all.Instances) == 0 {
		return ring.ErrEmptyRing
	}
	d.timestamp = time.Now()
	d.desc = all.Instances
	// Jump consistent hashing requires a deterministic order of instances.
	// Moreover, instances can be only added to the end, otherwise this may
	// cause massive relocations.
	slices.SortFunc(d.desc, func(a, b ring.InstanceDesc) int {
		return strings.Compare(a.Id, b.Id)
	})
	// Now we create a mapping of shards to instances: each instance
	// contributes one shard per token it owns. Note that d.desc aliases
	// all.Instances (same backing array), so the sort above ordered both.
	var tmp [256]uint32 // Try to allocate on stack.
	instances := tmp[:0]
	for j := range d.desc {
		for range all.Instances[j].Tokens {
			instances = append(instances, uint32(j))
		}
	}
	// We use shuffling to avoid hotspots: a contiguous range of shards
	// is distributed over instances in a pseudo-random fashion.
	// Given that the number of shards and instances is known in advance,
	// we maintain a deterministic permutation that perturbs as little as
	// possible, when the number of shards or instances changes: only the
	// delta moves.
	size := len(instances)
	d.perm.resize(size)
	// Note that we can't reuse d.shards because it may be used by iterators.
	// In fact, this is a snapshot that must not be modified.
	d.shards = make([]uint32, size)
	for j := range d.shards {
		d.shards[j] = instances[d.perm.v[j]]
	}
	return nil
}
   171  
   172  // instances returns an iterator that iterates over instances
   173  // that may host the shard at the offset in the order of preference:
   174  // dataset -> tenant -> all shards.
   175  func (d *distribution) instances(r subring, off int) *iterator {
   176  	return &iterator{
   177  		off:    off,
   178  		lim:    r.size(),
   179  		ring:   r,
   180  		shards: d.shards,
   181  		desc:   d.desc,
   182  	}
   183  }
   184  
   185  // The inputs are a key and the number of buckets.
   186  // It outputs a bucket number in the range [0, buckets).
   187  //
   188  // Refer to https://arxiv.org/pdf/1406.2294:
   189  // The function satisfies the two properties:
   190  //  1. About the same number of keys map to each bucket.
   191  //  2. The mapping from key to bucket is perturbed as little as possible when
   192  //     the number of buckets is changed. Thus, the only data that needs to move
   193  //     when the number of buckets changes is the data for the relatively small
   194  //     number of keys whose bucket assignment changed.
   195  func jump(key uint64, buckets int) int {
   196  	var b, j = -1, 0
   197  	for j < buckets {
   198  		b = j
   199  		key = key*2862933555777941757 + 1
   200  		j = int(float64(b+1) * (float64(int64(1)<<31) / float64((key>>33)+1)))
   201  	}
   202  	return b
   203  }
   204  
   205  // Subring is a utility to calculate the subring
   206  // for a given key within the available space:
   207  //
   208  // Note that this is not a recursive implementation,
   209  // but a more straightforward one, optimized for the
   210  // case where there can be up to two nested rings.
   211  type subring struct {
   212  	// |<---------n----------->| Available space.
   213  	// | . a---|---------b . . | Ring.
   214  	// | . . . c-----d . . . . | Subring.
   215  	n, a, b, c, d int
   216  }
   217  
   218  func newSubring(n int) subring { return subring{n: n, b: n, d: n} }
   219  
   220  // The function creates a subring of the specified size for the given key.
   221  // The subring offset is calculated with the jump function.
   222  func (s subring) subring(k uint64, size int) subring {
   223  	n := s
   224  	n.a, n.b = n.c, n.d
   225  	n.c = n.a + jump(k, n.b-n.a)
   226  	n.d = n.c + size
   227  	return n
   228  }
   229  
   230  func (s subring) pop() subring {
   231  	n := s
   232  	n.c, n.d = n.a, n.b
   233  	n.a, n.b = 0, n.n
   234  	return n
   235  }
   236  
   237  // The function returns the absolute offset of the relative n.
   238  func (s subring) at(n int) int {
   239  	// [ . a-------|-----b . . ]
   240  	// [ . . . . . c-----|-x-d ]
   241  	//
   242  	// [ . a-------|-----b . . ]
   243  	// [ . |-x-d . c-----| . . ]
   244  	n %= s.d - s.c
   245  	x := s.c + n
   246  	x = (x - s.a) % (s.b - s.a)
   247  	p := (x + s.a) % s.n
   248  	return p
   249  }
   250  
   251  // offset reports offset in the parent ring.
   252  func (s subring) offset() int { return s.c - s.a }
   253  
   254  // size reports the size of the ring.
   255  func (s subring) size() int { return s.d - s.c }
   256  
// iterator iterates instances that host the shards of the subring.
// The iterator is not limited to the subring, and will continue with
// the parent subring when the current one is exhausted.
type iterator struct {
	n   int // Number of instances collected.
	off int // Current offset in the ring (relative).
	lim int // Remaining instances in the subring.

	ring   subring
	shards []uint32            // Shard ID -> Instance ID (snapshot; must not be modified).
	desc   []ring.InstanceDesc // Instance descriptors, indexed by Instance ID.
}
   269  
// Err always returns nil: iteration over an in-memory snapshot cannot fail.
func (i *iterator) Err() error { return nil }

// Close is a no-op: the iterator holds no resources.
func (i *iterator) Close() error { return nil }
   273  
// Next advances the iterator to the next candidate instance. It yields at
// most ring.n shards in total: first the shards of the innermost subring,
// then — once exhausted — the remaining shards of each enclosing ring.
func (i *iterator) Next() bool {
	if i.n >= i.ring.n {
		// Every shard of the outermost ring has been visited.
		return false
	}
	if i.lim > 0 {
		i.lim--
	} else {
		for i.lim <= 0 {
			// We have exhausted the subring.
			// Navigate to the parent ring.
			if i.ring.n == i.ring.size() {
				// No parent rings left.
				return false
			}
			// Start with the offset right after the subring.
			size := i.ring.size()
			i.off = i.ring.offset() + size
			p := i.ring.pop() // Load parent.
			// How many items remain in the ring.
			// The extra -1 accounts for the shard consumed by this call
			// (the i.lim-- branch above is skipped on this path).
			i.lim = p.size() - size - 1
			i.ring = p
		}
	}
	i.off++
	i.n++
	return true
}
   301  
   302  func (i *iterator) At() ring.InstanceDesc {
   303  	a := i.ring.at(i.off - 1) // Translate the relative offset to absolute.
   304  	x := i.shards[a]          // Map the shard to the instance.
   305  	return i.desc[x]
   306  }
   307  
   308  // Fisher–Yates shuffle with predefined steps.
   309  // Rand source with a seed is not enough as we
   310  // can't guarantee the same sequence of calls
   311  // with identical arguments, which would make
   312  // the state of two instances incoherent.
   313  type perm struct{ v []uint32 }
   314  
   315  func (p *perm) resize(n int) {
   316  	d := max(0, n-len(p.v))
   317  	p.v = slices.Grow(p.v, d)[:n]
   318  	// We do want to start with 0 (in contrast to the standard
   319  	// implementation) as this is required for the n == 1 case:
   320  	// we need to zero v[0].
   321  	// Although, it's possible to make the change incrementally,
   322  	// for simplicity, we just rebuild the permutation.
   323  	for i := 0; i < n; i++ {
   324  		j := steps[i]
   325  		p.v[i], p.v[j] = p.v[j], uint32(i)
   326  	}
   327  }
   328  
   329  var steps [4 << 10]uint32
   330  
   331  func init() {
   332  	// The seed impacts mapping of shards to nodes.
   333  	// TODO(kolesnikovae):
   334  	//  Stochastic approach does not work well
   335  	//  in all the cases; it should be replaced
   336  	//  with a deterministic one.
   337  	const randSeed = -3035313949336265834
   338  	setSeed(randSeed)
   339  }
   340  
   341  func setSeed(n int64) {
   342  	r := rand.New(rand.NewSource(n))
   343  	for i := range steps {
   344  		steps[i] = uint32(r.Intn(i + 1))
   345  	}
   346  }