github.com/grafana/pyroscope@v1.18.0/pkg/querier/replication.go (about) 1 package querier 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "sort" 8 9 "github.com/cespare/xxhash/v2" 10 "github.com/go-kit/log" 11 "github.com/go-kit/log/level" 12 "github.com/grafana/dskit/ring" 13 "github.com/opentracing/opentracing-go" 14 otlog "github.com/opentracing/opentracing-go/log" 15 "github.com/samber/lo" 16 "golang.org/x/sync/errgroup" 17 18 ingestv1 "github.com/grafana/pyroscope/api/gen/proto/go/ingester/v1" 19 typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1" 20 "github.com/grafana/pyroscope/pkg/phlaredb/sharding" 21 "github.com/grafana/pyroscope/pkg/util" 22 "github.com/grafana/pyroscope/pkg/util/spanlogger" 23 ) 24 25 type ResponseFromReplica[T any] struct { 26 addr string 27 response T 28 } 29 30 type QueryReplicaFn[T any, Querier any] func(ctx context.Context, q Querier) (T, error) 31 32 type QueryReplicaWithHintsFn[T any, Querier any] func(ctx context.Context, q Querier, hint *ingestv1.Hints) (T, error) 33 34 type Closer interface { 35 CloseRequest() error 36 CloseResponse() error 37 } 38 39 type ClientFactory[T any] func(addr string) (T, error) 40 41 // cleanupResult, will make sure if the result was streamed, that we close the request and response 42 func cleanupStreams[Result any](result ResponseFromReplica[Result]) { 43 if stream, ok := any(result.response).(interface { 44 CloseRequest() error 45 }); ok { 46 if err := stream.CloseRequest(); err != nil { 47 level.Warn(util.Logger).Log("msg", "failed to close request", "err", err) 48 } 49 } 50 if stream, ok := any(result.response).(interface { 51 CloseResponse() error 52 }); ok { 53 if err := stream.CloseResponse(); err != nil { 54 level.Warn(util.Logger).Log("msg", "failed to close response", "err", err) 55 } 56 } 57 } 58 59 // forGivenReplicationSet runs f, in parallel, for given replica set. 60 // Under the hood it returns only enough responses to satisfy the quorum. 
func forGivenReplicationSet[Result any, Querier any](ctx context.Context, clientFactory func(string) (Querier, error), replicationSet ring.ReplicationSet, f QueryReplicaFn[Result, Querier]) ([]ResponseFromReplica[Result], error) {
	results, err := ring.DoUntilQuorumWithoutSuccessfulContextCancellation(
		ctx,
		replicationSet,
		ring.DoUntilQuorumConfig{
			// Contact only as many replicas as are needed for quorum; extra
			// replicas are tried only if some initial requests fail.
			MinimizeRequests: true,
		},
		func(ctx context.Context, ingester *ring.InstanceDesc, _ context.CancelCauseFunc) (ResponseFromReplica[Result], error) {
			// res stays zero-valued on any error path below.
			var res ResponseFromReplica[Result]
			client, err := clientFactory(ingester.Addr)
			if err != nil {
				return res, err
			}

			resp, err := f(ctx, client)
			if err != nil {
				return res, err
			}

			return ResponseFromReplica[Result]{ingester.Addr, resp}, nil
		},
		// Responses that end up not being needed for quorum are cleaned up
		// here so streamed request/response bodies do not leak.
		cleanupStreams[Result],
	)
	if err != nil {
		return nil, err
	}

	// err is nil at this point; returned as-is.
	return results, err
}

// forGivenPlan runs f, in parallel, for given plan.
// Each replica in the plan that is still part of the replication set is
// queried once, with hints restricting it to the blocks assigned to it.
func forGivenPlan[Result any, Querier any](
	ctx context.Context,
	plan map[string]*blockPlanEntry,
	clientFactory func(string) (Querier, error),
	replicationSet ring.ReplicationSet, f QueryReplicaWithHintsFn[Result, Querier],
) ([]ResponseFromReplica[Result], error) {
	// NOTE(review): the errgroup-derived context is discarded and the outer
	// ctx is passed to f instead — presumably so that successful streaming
	// responses are not cancelled once Wait returns; confirm before changing
	// this to use the derived context.
	g, _ := errgroup.WithContext(ctx)

	var (
		idx    = 0
		result = make([]ResponseFromReplica[Result], len(plan))
	)

	for replica, planEntry := range plan {
		// Skip replicas that have left the replication set since planning.
		if !replicationSet.Includes(replica) {
			continue
		}
		// Copy loop-scoped values so the goroutine below does not capture the
		// shared iteration variables (pre-Go 1.22 semantics).
		var (
			i = idx
			r = replica
			h = planEntry.BlockHints
		)
		idx++
		g.Go(func() error {
			client, err := clientFactory(r)
			if err != nil {
				return err
			}

			resp, err := f(ctx, client, &ingestv1.Hints{Block: h})
			if err != nil {
				return err
			}

			// Each goroutine writes a distinct index; no locking needed.
			result[i] = ResponseFromReplica[Result]{r, resp}

			return nil
		})
	}

	if err := g.Wait(); err != nil {
		return nil, err
	}

	// Trim slots reserved for replicas that were skipped above.
	result = result[:idx]

	return result, nil
}
141 type instanceType uint8 142 143 const ( 144 unknownInstanceType instanceType = iota 145 ingesterInstance 146 storeGatewayInstance 147 ) 148 149 // map of block ID to replicas containing the block, when empty replicas, the 150 // block is already contained by a higher compaction level block in full. 151 type replicasPerBlockID struct { 152 m map[string][]string 153 meta map[string]*typesv1.BlockInfo 154 instanceTypes map[string][]instanceType 155 logger log.Logger 156 } 157 158 func newReplicasPerBlockID(logger log.Logger) *replicasPerBlockID { 159 return &replicasPerBlockID{ 160 m: make(map[string][]string), 161 meta: make(map[string]*typesv1.BlockInfo), 162 instanceTypes: make(map[string][]instanceType), 163 logger: logger, 164 } 165 } 166 167 func (r *replicasPerBlockID) add(result []ResponseFromReplica[[]*typesv1.BlockInfo], t instanceType) { 168 for _, replica := range result { 169 // mark the replica's instance types (in single binary we can have the same replica have multiple types) 170 r.instanceTypes[replica.addr] = append(r.instanceTypes[replica.addr], t) 171 172 for _, block := range replica.response { 173 // add block to map 174 v, exists := r.m[block.Ulid] 175 if exists && len(v) > 0 || !exists { 176 r.m[block.Ulid] = append(r.m[block.Ulid], replica.addr) 177 } 178 179 // add block meta to map 180 // note: we do override existing meta, as meta is immutable for all replicas 181 r.meta[block.Ulid] = block 182 } 183 } 184 } 185 186 func shardFromBlock(m *typesv1.BlockInfo) (shard uint64, shardCount uint64, ok bool) { 187 for _, lp := range m.Labels { 188 if lp.Name != sharding.CompactorShardIDLabel { 189 continue 190 } 191 192 shardID, shardCount, err := sharding.ParseShardIDLabelValue(lp.Value) 193 if err == nil { 194 return shardID, shardCount, true 195 } 196 } 197 198 return 0, 0, false 199 } 200 201 func (r *replicasPerBlockID) removeBlock(ulid string) { 202 delete(r.m, ulid) 203 delete(r.meta, ulid) 204 } 205 206 // this step removes sharded 
// pruneIncompleteShardedBlocks removes sharded blocks that don't have all the
// shards present for a time window. The returned bool reports whether any
// sharded block was seen at all (i.e. whether split sharding is in use).
func (r *replicasPerBlockID) pruneIncompleteShardedBlocks() (bool, error) {
	type compactionKey struct {
		level   int32
		minTime int64
	}
	compactions := make(map[compactionKey][]string)

	// group blocks by compaction level (and window start time)
	for blockID := range r.m {
		meta, ok := r.meta[blockID]
		if !ok {
			return false, fmt.Errorf("meta missing for block id %s", blockID)
		}

		key := compactionKey{
			level:   0, // level 0 when the block has no compaction section
			minTime: meta.MinTime,
		}

		if meta.Compaction != nil {
			key.level = meta.Compaction.Level
		}
		compactions[key] = append(compactions[key], blockID)
	}

	// now we go through every group and check if we see at least a block for each shard
	var (
		shardsSeen       []bool // scratch bitmap, reused across groups
		shardedBlocks    []string
		hasShardedBlocks bool
	)
	for _, blocks := range compactions {
		// Reset scratch state, keeping the backing arrays for reuse.
		shardsSeen = shardsSeen[:0]
		shardedBlocks = shardedBlocks[:0]
		for _, block := range blocks {
			meta, ok := r.meta[block]
			if !ok {
				return false, fmt.Errorf("meta missing for block id %s", block)
			}

			shardIdx, shards, ok := shardFromBlock(meta)
			if !ok {
				// not a sharded block, continue
				continue
			}
			hasShardedBlocks = true
			shardedBlocks = append(shardedBlocks, block)

			// Size the bitmap from the first sharded block in the group.
			if len(shardsSeen) == 0 {
				if cap(shardsSeen) < int(shards) {
					shardsSeen = make([]bool, shards)
				} else {
					shardsSeen = shardsSeen[:shards]
					for idx := range shardsSeen {
						shardsSeen[idx] = false
					}
				}
			}

			// All blocks in one group must agree on the shard count.
			if len(shardsSeen) != int(shards) {
				return false, fmt.Errorf("shard length mismatch, shards seen: %d, shards as per label: %d", len(shardsSeen), shards)
			}

			shardsSeen[shardIdx] = true
		}
		// check if all shards are present
		allShardsPresent := true
		for _, shardSeen := range shardsSeen {
			if !shardSeen {
				allShardsPresent = false
				break
			}
		}

		if allShardsPresent {
			continue
		}

		// now remove all blocks that are sharded but not complete
		for _, block := range shardedBlocks {
			r.removeBlock(block)
		}
	}

	return hasShardedBlocks, nil
}

// pruneSupersededBlocks prunes blocks whose data is contained by a higher
// compaction level block. Level 0/1 blocks are never pruned here.
func (r *replicasPerBlockID) pruneSupersededBlocks(sharded bool) error {
	for blockID := range r.m {
		meta, ok := r.meta[blockID]
		if !ok {
			return fmt.Errorf("meta missing for block id %s", blockID)
		}
		if meta.Compaction == nil {
			continue
		}
		if meta.Compaction.Level < 2 {
			continue
		}
		// At split phase of compaction, L2 is an intermediate step where we
		// split each group into split_shards parts, thus there will be up to
		// groups_num * split_shards blocks, which is typically _significantly_
		// greater that the number of source blocks. Moreover, these blocks are
		// not yet deduplicated, therefore we should prefer L1 blocks over them.
		// As an optimisation, we drop all L2 blocks.
313 if sharded && meta.Compaction.Level == 2 { 314 r.removeBlock(blockID) 315 continue 316 } 317 for _, blockID := range meta.Compaction.Parents { 318 r.removeBlock(blockID) 319 } 320 for _, blockID := range meta.Compaction.Sources { 321 r.removeBlock(blockID) 322 } 323 } 324 return nil 325 } 326 327 type blockPlanEntry struct { 328 *ingestv1.BlockHints 329 InstanceTypes []instanceType 330 } 331 332 type blockPlan map[string]*blockPlanEntry 333 334 func BlockHints(p blockPlan, replica string) (*ingestv1.BlockHints, error) { 335 entry, ok := p[replica] 336 if !ok && p != nil { 337 return nil, fmt.Errorf("no hints found for replica %s", replica) 338 } 339 if entry == nil { 340 return nil, nil 341 } 342 return entry.BlockHints, nil 343 } 344 345 func (p blockPlan) String() string { 346 data, _ := json.Marshal(p) 347 return string(data) 348 } 349 350 func (r *replicasPerBlockID) blockPlan(ctx context.Context) map[string]*blockPlanEntry { 351 sp, _ := opentracing.StartSpanFromContext(ctx, "blockPlan") 352 defer sp.Finish() 353 354 var ( 355 deduplicate = false 356 hash = xxhash.New() 357 plan = make(map[string]*blockPlanEntry) 358 smallestCompactionLevel = int32(0) 359 ) 360 361 sharded, err := r.pruneIncompleteShardedBlocks() 362 if err != nil { 363 level.Warn(r.logger).Log("msg", "block planning failed to prune incomplete sharded blocks", "err", err) 364 return nil 365 } 366 367 // Depending on whether split sharding is used, the compaction level at 368 // which the data gets deduplicated differs: if split sharding is enabled, 369 // we deduplicate at level 3, and at level 2 otherwise. 
370 var deduplicationLevel int32 = 2 371 if sharded { 372 deduplicationLevel = 3 373 } 374 375 if err := r.pruneSupersededBlocks(sharded); err != nil { 376 level.Warn(r.logger).Log("msg", "block planning failed to prune superseded blocks", "err", err) 377 return nil 378 } 379 380 // now we go through all blocks and choose the replicas that we want to query 381 for blockID, replicas := range r.m { 382 // skip if we have no replicas, then block is already contained i an higher compaction level one 383 if len(replicas) == 0 { 384 continue 385 } 386 387 meta, ok := r.meta[blockID] 388 if !ok { 389 continue 390 } 391 // when we see a block with CompactionLevel less than the level at which data is deduplicated, 392 // or a block without compaction section, we want the queriers to deduplicate 393 if meta.Compaction == nil || meta.Compaction.Level < deduplicationLevel { 394 deduplicate = true 395 } 396 397 // record the lowest compaction level 398 if meta.Compaction != nil && (smallestCompactionLevel == 0 || meta.Compaction.Level < smallestCompactionLevel) { 399 smallestCompactionLevel = meta.Compaction.Level 400 } 401 402 // only get store gateways replicas 403 sgReplicas := lo.Filter(replicas, func(replica string, _ int) bool { 404 instanceTypes, ok := r.instanceTypes[replica] 405 if !ok { 406 return false 407 } 408 for _, t := range instanceTypes { 409 if t == storeGatewayInstance { 410 return true 411 } 412 } 413 return false 414 }) 415 416 if len(sgReplicas) > 0 { 417 // if we have store gateway replicas, we want to query them 418 replicas = sgReplicas 419 } 420 421 // now select one replica, based on block id 422 sort.Strings(replicas) 423 hash.Reset() 424 _, _ = hash.WriteString(blockID) 425 hashIdx := int(hash.Sum64()) 426 if hashIdx < 0 { 427 hashIdx = -hashIdx 428 } 429 selectedReplica := replicas[hashIdx%len(replicas)] 430 431 // add block to plan 432 p, exists := plan[selectedReplica] 433 if !exists { 434 p = &blockPlanEntry{ 435 BlockHints: 
&ingestv1.BlockHints{}, 436 InstanceTypes: r.instanceTypes[selectedReplica], 437 } 438 plan[selectedReplica] = p 439 } 440 p.Ulids = append(p.Ulids, blockID) 441 442 // set the selected replica 443 r.m[blockID] = []string{selectedReplica} 444 } 445 446 // adapt the plan to make sure all replicas will deduplicate 447 if deduplicate { 448 for _, hints := range plan { 449 hints.Deduplication = deduplicate 450 } 451 } 452 453 var plannedIngesterBlocks, plannedStoreGatewayBlocks int 454 for replica, blocks := range plan { 455 instanceTypes, ok := r.instanceTypes[replica] 456 if !ok { 457 continue 458 } 459 for _, t := range instanceTypes { 460 if t == storeGatewayInstance { 461 plannedStoreGatewayBlocks += len(blocks.Ulids) 462 } 463 if t == ingesterInstance { 464 plannedIngesterBlocks += len(blocks.Ulids) 465 } 466 } 467 } 468 469 sp.LogFields( 470 otlog.Bool("deduplicate", deduplicate), 471 otlog.Int32("smallest_compaction_level", smallestCompactionLevel), 472 otlog.Int("planned_blocks_ingesters", plannedIngesterBlocks), 473 otlog.Int("planned_blocks_store_gateways", plannedStoreGatewayBlocks), 474 ) 475 476 level.Debug(spanlogger.FromContext(ctx, r.logger)).Log( 477 "msg", "block plan created", 478 "deduplicate", deduplicate, 479 "smallest_compaction_level", smallestCompactionLevel, 480 "planned_blocks_ingesters", plannedIngesterBlocks, 481 "planned_blocks_store_gateways", plannedStoreGatewayBlocks, 482 "plan", blockPlan(plan), 483 ) 484 485 return plan 486 }