github.com/grafana/pyroscope@v1.18.0/pkg/block/compaction.go

package block

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"slices"
	"sort"
	"strings"
	"sync"

	"github.com/grafana/dskit/multierror"
	"github.com/parquet-go/parquet-go"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/storage"
	"golang.org/x/sync/errgroup"

	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
	"github.com/grafana/pyroscope/pkg/block/metadata"
	phlaremodel "github.com/grafana/pyroscope/pkg/model"
	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/phlaredb/symdb"
	"github.com/grafana/pyroscope/pkg/phlaredb/tsdb/index"
	memindex "github.com/grafana/pyroscope/pkg/segmentwriter/memdb/index"
	"github.com/grafana/pyroscope/pkg/util"
)

var (
	ErrNoBlocksToMerge    = fmt.Errorf("no blocks to merge")
	ErrShardMergeMismatch = fmt.Errorf("only blocks from the same shard can be merged")
)

type CompactionOption func(*compactionConfig)

func WithCompactionObjectOptions(options ...ObjectOption) CompactionOption {
	return func(p *compactionConfig) {
		p.objectOptions = append(p.objectOptions, options...)
	}
}

func WithCompactionTempDir(tempdir string) CompactionOption {
	return func(p *compactionConfig) {
		p.tempdir = tempdir
	}
}

func WithCompactionDestination(storage objstore.Bucket) CompactionOption {
	return func(p *compactionConfig) {
		p.destination = storage
	}
}

func WithSampleObserver(observer SampleObserver) CompactionOption {
	return func(p *compactionConfig) {
		p.sampleObserver = observer
	}
}

type compactionConfig struct {
	objectOptions  []ObjectOption
	source         objstore.BucketReader
	destination    objstore.Bucket
	tempdir        string
	sampleObserver SampleObserver
}

type SampleObserver interface {
	symdb.SymbolsObserver

	// Evaluate is called before the compactor rewrites any symbols.
	// An "observe" callback function is returned to be called after writing the resulting blocks.
	// This method must not modify the entry.
	Evaluate(ProfileEntry) (observe func())
}

func Compact(
	ctx context.Context,
	blocks []*metastorev1.BlockMeta,
	storage objstore.Bucket,
	options ...CompactionOption,
) (m []*metastorev1.BlockMeta, err error) {
	c := &compactionConfig{
		source:      storage,
		destination: storage,
		tempdir:     os.TempDir(),
	}
	for _, option := range options {
		option(c)
	}

	objects := ObjectsFromMetas(storage, blocks, c.objectOptions...)
	plan, err := PlanCompaction(objects)
	if err != nil {
		return nil, err
	}

	if err = objects.Open(ctx); err != nil {
		return nil, fmt.Errorf("objects.Open: %w", err)
	}
	defer func() {
		_ = objects.Close()
	}()

	compacted := make([]*metastorev1.BlockMeta, 0, len(plan))
	for _, p := range plan {
		md, compactionErr := p.Compact(ctx, c.destination, c.tempdir, c.sampleObserver)
		if compactionErr != nil {
			return nil, compactionErr
		}
		compacted = append(compacted, md)
	}

	return compacted, nil
}
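// A minimal usage sketch (not part of the original file; ctx, blocks, bucket,
// dataDir, and observer are placeholders): callers pass the source bucket and
// may override the destination, temp dir, and observer via options.
//
//	compacted, err := block.Compact(ctx, blocks, bucket,
//		block.WithCompactionTempDir(filepath.Join(dataDir, "compact")),
//		block.WithCompactionDestination(bucket),
//		block.WithSampleObserver(observer),
//	)
//	if err != nil {
//		return err
//	}
//	// compacted holds one *metastorev1.BlockMeta per tenant-level output block.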
func PlanCompaction(objects Objects) ([]*CompactionPlan, error) {
	if len(objects) == 0 {
		// Even if there's just a single object, we still need to rewrite it.
		return nil, ErrNoBlocksToMerge
	}

	r := objects[0]
	var level uint32
	for _, obj := range objects {
		if r.meta.Shard != obj.meta.Shard {
			return nil, ErrShardMergeMismatch
		}
		level = max(level, obj.meta.CompactionLevel)
	}
	level++

	g := NewULIDGenerator(objects)
	m := make(map[string]*CompactionPlan)
	for _, obj := range objects {
		for _, ds := range obj.meta.Datasets {
			if ds.Name == 0 {
				// Anonymous dataset is never compacted:
				// it is rebuilt based on the actual block contents.
				continue
			}
			tm, ok := m[obj.meta.StringTable[ds.Tenant]]
			if !ok {
				tm = newBlockCompaction(
					g.ULID().String(),
					obj.meta.StringTable[ds.Tenant],
					r.meta.Shard,
					level,
				)
				m[obj.meta.StringTable[ds.Tenant]] = tm
			}
			// Bind objects to datasets.
			sm := tm.addDataset(obj.meta, ds)
			sm.append(NewDataset(ds, obj))
		}
	}

	ordered := make([]*CompactionPlan, 0, len(m))
	for _, tm := range m {
		ordered = append(ordered, tm)
		slices.SortFunc(tm.datasets, func(a, b *datasetCompaction) int {
			return strings.Compare(a.name, b.name)
		})
	}
	slices.SortFunc(ordered, func(a, b *CompactionPlan) int {
		return strings.Compare(a.tenant, b.tenant)
	})

	return ordered, nil
}

type CompactionPlan struct {
	tenant       string
	path         string
	datasetMap   map[int32]*datasetCompaction
	datasets     []*datasetCompaction
	meta         *metastorev1.BlockMeta
	strings      *metadata.StringTable
	datasetIndex *datasetIndexWriter
}

func newBlockCompaction(
	id string,
	tenant string,
	shard uint32,
	compactionLevel uint32,
) *CompactionPlan {
	p := &CompactionPlan{
		tenant:       tenant,
		datasetMap:   make(map[int32]*datasetCompaction),
		strings:      metadata.NewStringTable(),
		datasetIndex: newDatasetIndexWriter(),
	}
	p.path = BuildObjectPath(tenant, shard, compactionLevel, id)
	p.meta = &metastorev1.BlockMeta{
		FormatVersion:   1,
		Id:              id,
		Tenant:          p.strings.Put(tenant),
		Shard:           shard,
		CompactionLevel: compactionLevel,
	}
	return p
}

func (b *CompactionPlan) Compact(
	ctx context.Context,
	dst objstore.Bucket,
	tempdir string,
	observer SampleObserver,
) (m *metastorev1.BlockMeta, err error) {
	w, err := NewBlockWriter(tempdir)
	if err != nil {
		return nil, fmt.Errorf("creating block writer: %w", err)
	}
	defer func() {
		_ = w.Close()
	}()

	// Datasets are compacted in a strict order.
	for i, s := range b.datasets {
		b.datasetIndex.setIndex(uint32(i))
		s.registerSampleObserver(observer)
		if err = s.compact(ctx, w); err != nil {
			return nil, fmt.Errorf("compacting block: %w", err)
		}
		b.meta.Datasets = append(b.meta.Datasets, s.meta)
	}
	if err = b.writeDatasetIndex(w); err != nil {
		return nil, fmt.Errorf("writing tenant index: %w", err)
	}
	b.meta.StringTable = b.strings.Strings
	b.meta.MetadataOffset = w.Offset()
	if err = metadata.Encode(w, b.meta); err != nil {
		return nil, fmt.Errorf("writing metadata: %w", err)
	}
	b.meta.Size = w.Offset()
	if err = w.Upload(ctx, dst, b.path); err != nil {
		return nil, fmt.Errorf("uploading block: %w", err)
	}
	return b.meta, nil
}
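// Resulting block layout, as implied by Compact above and writeDatasetIndex
// below (an informal sketch, not part of the original file; section offsets
// are recorded in each dataset's TableOfContents and the metadata offset in
// meta.MetadataOffset):
//
//	dataset 0: profiles | tsdb index | symbols
//	dataset 1: profiles | tsdb index | symbols
//	...
//	anonymous dataset: dataset tsdb index
//	block metadata (metastorev1.BlockMeta)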
func (b *CompactionPlan) writeDatasetIndex(w *Writer) error {
	if err := b.datasetIndex.Flush(); err != nil {
		return err
	}
	off := w.Offset()
	n, err := io.Copy(w, bytes.NewReader(b.datasetIndex.buf))
	if err != nil {
		return err
	}
	// We annotate the dataset with the
	// __tenant_dataset__ = "dataset_tsdb_index" label,
	// so the dataset index metadata can be queried.
	labels := metadata.NewLabelBuilder(b.strings).
		WithLabelSet(metadata.LabelNameTenantDataset, metadata.LabelValueDatasetTSDBIndex).
		Build()
	b.meta.Datasets = append(b.meta.Datasets, &metastorev1.Dataset{
		Format:          1,
		Tenant:          b.meta.Tenant,
		Name:            0, // Anonymous.
		MinTime:         b.meta.MinTime,
		MaxTime:         b.meta.MaxTime,
		TableOfContents: []uint64{off},
		Size:            uint64(n),
		Labels:          labels,
	})
	return nil
}

func (b *CompactionPlan) addDataset(md *metastorev1.BlockMeta, s *metastorev1.Dataset) *datasetCompaction {
	name := b.strings.Put(md.StringTable[s.Name])
	tenant := b.strings.Put(md.StringTable[s.Tenant])
	sm, ok := b.datasetMap[name]
	if !ok {
		sm = b.newDatasetCompaction(tenant, name)
		b.datasetMap[name] = sm
		b.datasets = append(b.datasets, sm)
	}
	if b.meta.MinTime == 0 || s.MinTime < b.meta.MinTime {
		b.meta.MinTime = s.MinTime
	}
	if s.MaxTime > b.meta.MaxTime {
		b.meta.MaxTime = s.MaxTime
	}
	return sm
}

type datasetCompaction struct {
	// Dataset name.
	name   string
	parent *CompactionPlan
	meta   *metastorev1.Dataset
	labels *metadata.LabelBuilder

	datasets []*Dataset

	indexRewriter   *indexRewriter
	symbolsRewriter *symbolsRewriter
	profilesWriter  *profilesWriter

	samples  uint64
	series   uint64
	profiles uint64

	flushOnce sync.Once

	observer SampleObserver
}

func (b *CompactionPlan) newDatasetCompaction(tenant, name int32) *datasetCompaction {
	return &datasetCompaction{
		parent: b,
		name:   b.strings.Strings[name],
		labels: metadata.NewLabelBuilder(b.strings),
		meta: &metastorev1.Dataset{
			Tenant: tenant,
			Name:   name,
			// Updated at append.
			MinTime: 0,
			MaxTime: 0,
			// Updated at writeTo.
			TableOfContents: nil,
			Size:            0,
			Labels:          nil,
		},
	}
}

func (m *datasetCompaction) append(s *Dataset) {
	m.datasets = append(m.datasets, s)
	if m.meta.MinTime == 0 || s.meta.MinTime < m.meta.MinTime {
		m.meta.MinTime = s.meta.MinTime
	}
	if s.meta.MaxTime > m.meta.MaxTime {
		m.meta.MaxTime = s.meta.MaxTime
	}
	m.labels.Put(s.meta.Labels, s.obj.meta.StringTable)
}
func (m *datasetCompaction) compact(ctx context.Context, w *Writer) (err error) {
	off := w.Offset()
	m.meta.TableOfContents = make([]uint64, 0, 3)
	m.meta.TableOfContents = append(m.meta.TableOfContents, w.Offset())

	if err = m.open(ctx, w); err != nil {
		return fmt.Errorf("failed to open sections for compaction: %w", err)
	}
	defer func() {
		_ = m.close()
	}()

	if err = m.merge(ctx); err != nil {
		return fmt.Errorf("failed to merge datasets: %w", err)
	}
	if err = m.flush(); err != nil {
		return fmt.Errorf("failed to flush compacted dataset: %w", err)
	}

	m.meta.TableOfContents = append(m.meta.TableOfContents, w.Offset())
	if _, err = io.Copy(w, bytes.NewReader(m.indexRewriter.buf)); err != nil {
		return fmt.Errorf("failed to read index: %w", err)
	}
	m.meta.TableOfContents = append(m.meta.TableOfContents, w.Offset())
	if _, err = io.Copy(w, bytes.NewReader(m.symbolsRewriter.buf.Bytes())); err != nil {
		return fmt.Errorf("failed to read symbols: %w", err)
	}

	m.meta.Size = w.Offset() - off
	m.meta.Labels = m.labels.Build()
	return nil
}

func (m *datasetCompaction) registerSampleObserver(observer SampleObserver) {
	m.observer = observer
}

func (m *datasetCompaction) open(ctx context.Context, w io.Writer) (err error) {
	var estimatedProfileTableSize int64
	for _, ds := range m.datasets {
		estimatedProfileTableSize += ds.sectionSize(SectionProfiles)
	}
	pageBufferSize := estimatePageBufferSize(estimatedProfileTableSize)
	m.profilesWriter = newProfileWriter(pageBufferSize, w)

	m.indexRewriter = newIndexRewriter()
	m.symbolsRewriter = newSymbolsRewriter(m.observer)

	g, ctx := errgroup.WithContext(ctx)
	for _, s := range m.datasets {
		s := s
		g.Go(util.RecoverPanic(func() error {
			if openErr := s.Open(ctx,
				SectionProfiles,
				SectionTSDB,
				SectionSymbols,
			); openErr != nil {
				return fmt.Errorf("opening tenant dataset (block %s): %w", s.obj.path, openErr)
			}
			return nil
		}))
	}
	if err = g.Wait(); err != nil {
		merr := multierror.New(err)
		for _, s := range m.datasets {
			merr.Add(s.Close())
		}
		return merr.Err()
	}
	return nil
}

func (m *datasetCompaction) merge(ctx context.Context) (err error) {
	rows, err := NewMergeRowProfileIterator(m.datasets)
	if err != nil {
		return err
	}
	defer func() {
		err = multierror.New(err, rows.Close()).Err()
	}()
	var i int
	for rows.Next() {
		if i++; i%1000 == 0 {
			if err = ctx.Err(); err != nil {
				return err
			}
		}
		if err = m.writeRow(rows.At()); err != nil {
			return err
		}
	}
	return rows.Err()
}

func (m *datasetCompaction) writeRow(r ProfileEntry) (err error) {
	if m.observer != nil {
		observe := m.observer.Evaluate(r)
		defer observe()
	}
	m.parent.datasetIndex.writeRow(r)
	m.indexRewriter.rewriteRow(r)
	if err = m.symbolsRewriter.rewriteRow(r); err != nil {
		return err
	}
	return m.profilesWriter.writeRow(r)
}

func (m *datasetCompaction) flush() (err error) {
	m.flushOnce.Do(func() {
		merr := multierror.New()
		merr.Add(m.symbolsRewriter.Flush())
		merr.Add(m.indexRewriter.Flush())
		merr.Add(m.profilesWriter.Close())
		m.samples = m.symbolsRewriter.samples
		m.series = m.indexRewriter.NumSeries()
		m.profiles = m.profilesWriter.profiles
		err = merr.Err()
	})
	return err
}

func (m *datasetCompaction) close() error {
	err := m.flush()
	m.symbolsRewriter = nil
	m.indexRewriter = nil
	m.profilesWriter = nil
	m.datasets = nil
	return err
}
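// A minimal SampleObserver sketch (an assumption, not part of the original
// file): per writeRow above, Evaluate runs before a row's symbols are
// rewritten and the returned callback is deferred until writeRow returns.
// The embedded symdb.SymbolsObserver methods are assumed implemented
// elsewhere; countingObserver is a hypothetical name.
//
//	type countingObserver struct {
//		symdb.SymbolsObserver // assumed implemented elsewhere
//		rows atomic.Uint64
//	}
//
//	func (o *countingObserver) Evaluate(e ProfileEntry) func() {
//		return func() { o.rows.Add(1) }
//	}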
func newIndexRewriter() *indexRewriter {
	return &indexRewriter{
		symbols: make(map[string]struct{}),
	}
}

type indexRewriter struct {
	series     []seriesLabels
	symbols    map[string]struct{}
	chunks     []index.ChunkMeta // one chunk per series
	previousFp model.Fingerprint
	buf        []byte
}

type seriesLabels struct {
	labels      phlaremodel.Labels
	fingerprint model.Fingerprint
}

func (rw *indexRewriter) rewriteRow(e ProfileEntry) {
	if rw.previousFp != e.Fingerprint || len(rw.series) == 0 {
		series := e.Labels.Clone()
		for _, l := range series {
			rw.symbols[l.Name] = struct{}{}
			rw.symbols[l.Value] = struct{}{}
		}
		rw.series = append(rw.series, seriesLabels{
			labels:      series,
			fingerprint: e.Fingerprint,
		})
		rw.chunks = append(rw.chunks, index.ChunkMeta{
			MinTime:     e.Timestamp,
			MaxTime:     e.Timestamp,
			SeriesIndex: uint32(len(rw.series) - 1),
		})
		rw.previousFp = e.Fingerprint
	}
	rw.chunks[len(rw.chunks)-1].MaxTime = e.Timestamp
	e.Row.SetSeriesIndex(rw.chunks[len(rw.chunks)-1].SeriesIndex)
}

func (rw *indexRewriter) NumSeries() uint64 { return uint64(len(rw.series)) }

func (rw *indexRewriter) Flush() error {
	// TODO(kolesnikovae):
	//  * Estimate size.
	//  * Use buffer pool.
	w, err := memindex.NewWriter(context.Background(), 256<<10)
	if err != nil {
		return err
	}

	// Sort symbols
	symbols := make([]string, 0, len(rw.symbols))
	for s := range rw.symbols {
		symbols = append(symbols, s)
	}
	sort.Strings(symbols)

	// Add symbols
	for _, symbol := range symbols {
		if err = w.AddSymbol(symbol); err != nil {
			return err
		}
	}

	// Add Series
	for i, series := range rw.series {
		if err = w.AddSeries(storage.SeriesRef(i), series.labels, series.fingerprint, rw.chunks[i]); err != nil {
			return err
		}
	}

	err = w.Close()
	rw.buf = w.ReleaseIndex()
	return err
}

type symbolsRewriter struct {
	buf      *bytes.Buffer
	w        *symdb.SymDB
	rw       map[*Dataset]*symdb.Rewriter
	samples  uint64
	observer SampleObserver

	stacktraces []uint32
}

func newSymbolsRewriter(observer SampleObserver) *symbolsRewriter {
	// TODO(kolesnikovae):
	//  * Estimate size.
	//  * Use buffer pool.
	buf := bytes.NewBuffer(make([]byte, 0, 1<<20))
	return &symbolsRewriter{
		buf: buf,
		rw:  make(map[*Dataset]*symdb.Rewriter),
		w: symdb.NewSymDB(&symdb.Config{
			Version: symdb.FormatV3,
			Writer:  &nopWriteCloser{buf},
		}),
		observer: observer,
	}
}

type nopWriteCloser struct{ io.Writer }

func (*nopWriteCloser) Close() error { return nil }

func (s *symbolsRewriter) rewriteRow(e ProfileEntry) (err error) {
	rw := s.rewriterFor(e.Dataset)
	e.Row.ForStacktraceIDsValues(func(values []parquet.Value) {
		s.loadStacktraceIDs(values)
		if err = rw.Rewrite(e.Row.StacktracePartitionID(), s.stacktraces); err != nil {
			return
		}
		s.samples += uint64(len(values))
		for i, v := range values {
			values[i] = parquet.Int64Value(int64(s.stacktraces[i])).Level(v.RepetitionLevel(), v.DefinitionLevel(), v.Column())
		}
	})
	return err
}

func (s *symbolsRewriter) rewriterFor(x *Dataset) *symdb.Rewriter {
	rw, ok := s.rw[x]
	if !ok {
		rw = symdb.NewRewriter(s.w, x.Symbols(), s.observer)
		s.rw[x] = rw
	}
	return rw
}

func (s *symbolsRewriter) loadStacktraceIDs(values []parquet.Value) {
	s.stacktraces = slices.Grow(s.stacktraces[0:], len(values))[:len(values)]
	for i := range values {
		s.stacktraces[i] = values[i].Uint32()
	}
}

func (s *symbolsRewriter) Flush() error { return s.w.Flush() }

// datasetIndexWriter is identical to indexRewriter,
// except it writes the dataset ID instead of the series ID.
type datasetIndexWriter struct {
	series   []seriesLabels
	chunks   []index.ChunkMeta
	previous model.Fingerprint
	symbols  map[string]struct{}
	idx      uint32
	buf      []byte
}

func newDatasetIndexWriter() *datasetIndexWriter {
	return &datasetIndexWriter{
		symbols: make(map[string]struct{}),
	}
}

func (rw *datasetIndexWriter) setIndex(i uint32) { rw.idx = i }

func (rw *datasetIndexWriter) writeRow(e ProfileEntry) {
	if rw.previous != e.Fingerprint || len(rw.series) == 0 {
		series := e.Labels.Clone()
		for _, l := range series {
			rw.symbols[l.Name] = struct{}{}
			rw.symbols[l.Value] = struct{}{}
		}
		rw.series = append(rw.series, seriesLabels{
			labels:      series,
			fingerprint: e.Fingerprint,
		})
		rw.chunks = append(rw.chunks, index.ChunkMeta{
			SeriesIndex: rw.idx,
		})
		rw.previous = e.Fingerprint
	}
}

func (rw *datasetIndexWriter) Flush() error {
	// TODO(kolesnikovae):
	//  * Estimate size.
	//  * Use buffer pool.
	w, err := memindex.NewWriter(context.Background(), 1<<20)
	if err != nil {
		return err
	}

	// Sort symbols
	symbols := make([]string, 0, len(rw.symbols))
	for s := range rw.symbols {
		symbols = append(symbols, s)
	}
	sort.Strings(symbols)

	// Add symbols
	for _, symbol := range symbols {
		if err = w.AddSymbol(symbol); err != nil {
			return err
		}
	}

	// Add Series
	for i, series := range rw.series {
		if err = w.AddSeries(storage.SeriesRef(i), series.labels, series.fingerprint, rw.chunks[i]); err != nil {
			return err
		}
	}

	err = w.Close()
	rw.buf = w.ReleaseIndex()
	return err
}
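// Illustrative sketch of PlanCompaction's error contract (not part of the
// original file; mixedShardObjects is a placeholder Objects value built
// elsewhere, e.g. via ObjectsFromMetas):
//
//	_, err := PlanCompaction(Objects{})
//	// err == ErrNoBlocksToMerge
//
//	_, err = PlanCompaction(mixedShardObjects)
//	// err == ErrShardMergeMismatch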