package distributor

import (
	"fmt"
	"math/rand"
	"slices"
	"strings"
	"sync"
	"time"

	"github.com/grafana/dskit/ring"

	"github.com/grafana/pyroscope/pkg/iter"
	"github.com/grafana/pyroscope/pkg/segmentwriter/client/distributor/placement"
)

// NOTE(kolesnikovae): Essentially, we do not depend on the dskit/ring and
// only use it as a discovery mechanism built on top of the memberlist.
// It would be better to access the memberlist/serf directly.
//
// op accepts instances in any non-terminal state: distribution only needs
// discovery, not the ring's read/write availability semantics.
var op = ring.NewOp([]ring.InstanceState{ring.ACTIVE, ring.LEAVING, ring.PENDING, ring.JOINING}, nil)

// defaultRingUpdateInterval is how long a cached ring snapshot
// (distribution) is considered fresh before it is re-read.
const defaultRingUpdateInterval = 5 * time.Second

// Distributor maps placement keys to shards and instances using a
// periodically refreshed, immutable snapshot of the ring state.
type Distributor struct {
	mu           sync.RWMutex
	ring         ring.ReadRing
	placement    placement.Placement
	distribution *distribution // Cached snapshot; replaced atomically under mu.

	// RingUpdateInterval is the maximum age of the cached snapshot
	// before Distribute re-reads the ring.
	RingUpdateInterval time.Duration
}

// NewDistributor creates a Distributor that reads instance topology from r
// and consults placement for per-tenant/dataset shard limits.
func NewDistributor(placement placement.Placement, r ring.ReadRing) *Distributor {
	return &Distributor{
		ring:               r,
		placement:          placement,
		RingUpdateInterval: defaultRingUpdateInterval,
	}
}

// Distribute returns the shard mapping for the given key, refreshing the
// cached ring snapshot first if it is older than RingUpdateInterval.
func (d *Distributor) Distribute(k placement.Key) (*placement.ShardMapping, error) {
	if err := d.updateDistribution(d.ring, d.RingUpdateInterval); err != nil {
		return nil, err
	}
	return d.distribute(k), nil
}

// updateDistribution re-reads the ring if the cached snapshot is missing or
// older than maxAge. It uses double-checked locking: the fast path takes
// only the read lock; the expiry check is repeated under the write lock in
// case another goroutine refreshed the snapshot in between.
func (d *Distributor) updateDistribution(r ring.ReadRing, maxAge time.Duration) error {
	d.mu.RLock()
	x := d.distribution
	if x != nil && !x.isExpired(maxAge) {
		d.mu.RUnlock()
		return nil
	}
	d.mu.RUnlock()
	d.mu.Lock()
	defer d.mu.Unlock()
	x = d.distribution
	if x != nil && !x.isExpired(maxAge) {
		return nil
	}
	if x == nil {
		x = newDistribution()
	}
	if err := x.readRing(r); err != nil {
		return fmt.Errorf("failed to read ring: %w", err)
	}
	d.distribution = x
	return nil
}

// emptyMapping is returned by distributor if the ring is empty.
// This helps to handle a case when requests arrive before the
// ring is populated (no instances registered).
var emptyMapping = &placement.ShardMapping{
	Instances: iter.NewEmptyIterator[ring.InstanceDesc](),
	Shard:     0,
}

// distribute maps the key to a shard and to the instances eligible to host
// it, using the cached snapshot. Callers must have refreshed the snapshot
// via updateDistribution; the read lock is held for the duration.
func (d *Distributor) distribute(k placement.Key) *placement.ShardMapping {
	d.mu.RLock()
	defer d.mu.RUnlock()
	// Determine the number of shards for the tenant within the available
	// space, and the dataset shards within the tenant subring.
	s := len(d.distribution.shards)
	if s == 0 {
		return emptyMapping
	}
	p := d.placement.Policy(k)
	tenantSize := p.TenantShards
	// TenantShards == 0 means "no limit"; also clamp to the total
	// number of shards available.
	if tenantSize == 0 || tenantSize > s {
		tenantSize = s
	}
	// The dataset gets at least one shard, and never more than the tenant.
	datasetSize := min(tenantSize, max(1, p.DatasetShards))
	// When we create subrings, we need to ensure that each of them has at
	// least p shards. However, the data distribution must be restricted
	// according to the limits.
	all := newSubring(s)
	tenant := all.subring(k.Tenant, tenantSize)
	dataset := tenant.subring(k.Dataset, datasetSize)
	// We pick a shard from the dataset subring: its index is relative
	// to the dataset subring.
	offset := p.PickShard(datasetSize)
	// Next we want to find p instances eligible to host the key.
	// The choice must be limited to the dataset / tenant subring,
	// but extended if needed.
	return &placement.ShardMapping{
		Shard:     uint32(dataset.at(offset)) + 1, // 0 shard ID is a sentinel
		Instances: d.distribution.instances(dataset, offset),
	}
}

// distribution is an immutable snapshot of the ring state: the
// shard-to-instance mapping and the instance descriptors. Once published,
// it must not be modified — iterators may still reference it.
type distribution struct {
	timestamp time.Time // When the snapshot was taken; used for expiry.
	shards    []uint32  // Shard ID -> Instance ID.
	desc []ring.InstanceDesc // Instances, sorted by ID for determinism.
	perm *perm               // Deterministic shard permutation.
}

func newDistribution() *distribution {
	return &distribution{
		timestamp: time.Now(),
		perm:      new(perm),
	}
}

// isExpired reports whether the snapshot is older than maxAge.
func (d *distribution) isExpired(maxAge time.Duration) bool {
	return time.Now().Add(-maxAge).After(d.timestamp)
}

// readRing rebuilds the snapshot from the current ring state: it collects
// the healthy instances, orders them deterministically, and derives the
// shard-to-instance mapping (one shard per token, shuffled).
func (d *distribution) readRing(r ring.ReadRing) error {
	all, err := r.GetAllHealthy(op)
	if err != nil {
		return err
	}
	if len(all.Instances) == 0 {
		return ring.ErrEmptyRing
	}
	d.timestamp = time.Now()
	d.desc = all.Instances
	// Jump consistent hashing requires a deterministic order of instances.
	// Moreover, instances can be only added to the end, otherwise this may
	// cause massive relocations.
	slices.SortFunc(d.desc, func(a, b ring.InstanceDesc) int {
		return strings.Compare(a.Id, b.Id)
	})
	// Now we create a mapping of shards to instances:
	// each instance owns one shard per token it holds.
	var tmp [256]uint32 // Try to allocate on stack.
	instances := tmp[:0]
	for j := range d.desc {
		for range all.Instances[j].Tokens {
			instances = append(instances, uint32(j))
		}
	}
	// We use shuffling to avoid hotspots: a contiguous range of shards
	// is distributed over instances in a pseudo-random fashion.
	// Given that the number of shards and instances is known in advance,
	// we maintain a deterministic permutation that perturbs as little as
	// possible, when the number of shards or instances changes: only the
	// delta moves.
	size := len(instances)
	d.perm.resize(size)
	// Note that we can't reuse d.shards because it may be used by iterators.
	// In fact, this is a snapshot that must not be modified.
	d.shards = make([]uint32, size)
	for j := range d.shards {
		d.shards[j] = instances[d.perm.v[j]]
	}
	return nil
}

// instances returns an iterator that iterates over instances
// that may host the shard at the offset in the order of preference:
// dataset -> tenant -> all shards.
func (d *distribution) instances(r subring, off int) *iterator {
	return &iterator{
		off:    off,
		lim:    r.size(),
		ring:   r,
		shards: d.shards,
		desc:   d.desc,
	}
}

// jump implements jump consistent hashing.
// The inputs are a key and the number of buckets.
// It outputs a bucket number in the range [0, buckets).
//
// Refer to https://arxiv.org/pdf/1406.2294:
// The function satisfies the two properties:
//  1. About the same number of keys map to each bucket.
//  2. The mapping from key to bucket is perturbed as little as possible when
//     the number of buckets is changed. Thus, the only data that needs to move
//     when the number of buckets changes is the data for the relatively small
//     number of keys whose bucket assignment changed.
func jump(key uint64, buckets int) int {
	var b, j = -1, 0
	for j < buckets {
		b = j
		// LCG step from the reference implementation.
		key = key*2862933555777941757 + 1
		j = int(float64(b+1) * (float64(int64(1)<<31) / float64((key>>33)+1)))
	}
	return b
}

// Subring is a utility to calculate the subring
// for a given key within the available space:
//
// Note that this is not a recursive implementation,
// but a more straightforward one, optimized for the
// case where there can be up to two nested rings.
type subring struct {
	// |<---------n----------->| Available space.
	// | . a---|---------b . . | Ring.
	// | . . . c-----d . . . . | Subring.
	n, a, b, c, d int
}

// newSubring returns the top-level ring spanning the whole space [0, n).
func newSubring(n int) subring { return subring{n: n, b: n, d: n} }

// The function creates a subring of the specified size for the given key.
// The subring offset is calculated with the jump function.
func (s subring) subring(k uint64, size int) subring {
	n := s
	// The current subring [c, d) becomes the parent ring [a, b).
	n.a, n.b = n.c, n.d
	// The new subring starts at a key-determined offset within the parent.
	n.c = n.a + jump(k, n.b-n.a)
	n.d = n.c + size
	return n
}

// pop returns the parent of the subring: the former ring [a, b)
// becomes the subring, and the full space [0, n) becomes the ring.
func (s subring) pop() subring {
	n := s
	n.c, n.d = n.a, n.b
	n.a, n.b = 0, n.n
	return n
}

// The function returns the absolute offset of the relative n.
func (s subring) at(n int) int {
	// [ . a-------|-----b . . ]
	// [ . . . . . c-----|-x-d ]
	//
	// [ . a-------|-----b . . ]
	// [ . |-x-d . c-----| . . ]
	n %= s.d - s.c          // Wrap within the subring.
	x := s.c + n            // Absolute position, possibly past b.
	x = (x - s.a) % (s.b - s.a) // Wrap within the parent ring.
	p := (x + s.a) % s.n    // Wrap within the whole space.
	return p
}

// offset reports offset in the parent ring.
func (s subring) offset() int { return s.c - s.a }

// size reports the size of the ring.
func (s subring) size() int { return s.d - s.c }

// iterator iterates instances that host the shards of the subring.
// The iterator is not limited to the subring, and will continue with
// the parent subring when the current one is exhausted.
type iterator struct {
	n   int // Number of instances collected.
	off int // Current offset in the ring (relative).
	lim int // Remaining instances in the subring.

	ring   subring
	shards []uint32
	desc   []ring.InstanceDesc
}

// Err always returns nil: iteration over a snapshot cannot fail.
func (i *iterator) Err() error { return nil }

// Close is a no-op: the iterator holds no resources.
func (i *iterator) Close() error { return nil }

// Next advances to the next candidate instance, widening to the parent
// ring when the current subring is exhausted. It returns false once all
// shards in the whole space have been visited.
func (i *iterator) Next() bool {
	if i.n >= i.ring.n {
		return false
	}
	if i.lim > 0 {
		i.lim--
	} else {
		for i.lim <= 0 {
			// We have exhausted the subring.
			// Navigate to the parent ring.
			if i.ring.n == i.ring.size() {
				// No parent rings left.
				return false
			}
			// Start with the offset right after the subring.
			size := i.ring.size()
			i.off = i.ring.offset() + size
			p := i.ring.pop() // Load parent.
			// How many items remain in the ring.
			i.lim = p.size() - size - 1
			i.ring = p
		}
	}
	i.off++
	i.n++
	return true
}

// At returns the instance hosting the shard at the iterator's
// current position.
func (i *iterator) At() ring.InstanceDesc {
	a := i.ring.at(i.off - 1) // Translate the relative offset to absolute.
	x := i.shards[a]          // Map the shard to the instance.
	return i.desc[x]
}

// Fisher–Yates shuffle with predefined steps.
// Rand source with a seed is not enough as we
// can't guarantee the same sequence of calls
// with identical arguments, which would make
// the state of two instances incoherent.
type perm struct{ v []uint32 }

// resize rebuilds the permutation for n elements using the precomputed
// swap steps, so every caller derives the identical permutation for a
// given n.
func (p *perm) resize(n int) {
	d := max(0, n-len(p.v))
	p.v = slices.Grow(p.v, d)[:n]
	// We do want to start with 0 (in contrast to the standard
	// implementation) as this is required for the n == 1 case:
	// we need to zero v[0].
	// Although, it's possible to make the change incrementally,
	// for simplicity, we just rebuild the permutation.
	for i := 0; i < n; i++ {
		j := steps[i]
		p.v[i], p.v[j] = p.v[j], uint32(i)
	}
}

// steps holds the precomputed Fisher–Yates swap indices: steps[i] is a
// deterministic value in [0, i], shared by all processes via the fixed seed.
var steps [4 << 10]uint32

func init() {
	// The seed impacts mapping of shards to nodes.
	// TODO(kolesnikovae):
	// Stochastic approach does not work well
	// in all the cases; it should be replaced
	// with a deterministic one.
	const randSeed = -3035313949336265834
	setSeed(randSeed)
}

// setSeed fills steps from a seeded source; steps[i] is drawn from [0, i].
func setSeed(n int64) {
	r := rand.New(rand.NewSource(n))
	for i := range steps {
		steps[i] = uint32(r.Intn(i + 1))
	}
}