github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/symdb/block_reader.go (about) 1 //nolint:unused 2 package symdb 3 4 import ( 5 "bufio" 6 "context" 7 "fmt" 8 "hash/crc32" 9 "io" 10 "os" 11 "path/filepath" 12 "sync" 13 14 "github.com/grafana/dskit/multierror" 15 "github.com/opentracing/opentracing-go" 16 otlog "github.com/opentracing/opentracing-go/log" 17 "golang.org/x/sync/errgroup" 18 19 "github.com/grafana/pyroscope/pkg/iter" 20 "github.com/grafana/pyroscope/pkg/objstore" 21 "github.com/grafana/pyroscope/pkg/phlaredb/block" 22 schemav1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1" 23 "github.com/grafana/pyroscope/pkg/util/bufferpool" 24 "github.com/grafana/pyroscope/pkg/util/refctr" 25 ) 26 27 type Reader struct { 28 bucket objstore.BucketReader 29 file block.File 30 index IndexFile 31 footer Footer 32 33 partitions []*partition 34 partitionsMap map[uint64]*partition 35 36 // Not used in v3; left for compatibility. 37 meta *block.Meta 38 files map[string]block.File 39 parquetFiles *parquetFiles 40 41 prefetchSize uint64 42 } 43 44 type Option func(*Reader) 45 46 func WithPrefetchSize(size uint64) Option { 47 return func(r *Reader) { 48 r.prefetchSize = size 49 } 50 } 51 52 func OpenObject(ctx context.Context, b objstore.BucketReader, name string, offset, size int64, options ...Option) (*Reader, error) { 53 f := block.File{ 54 RelPath: name, 55 SizeBytes: uint64(size), 56 } 57 r := &Reader{ 58 bucket: objstore.NewBucketReaderWithOffset(b, offset), 59 file: f, 60 } 61 for _, opt := range options { 62 opt(r) 63 } 64 65 var err error 66 if r.prefetchSize > 0 { 67 err = r.openIndexWithPrefetch(ctx) 68 } else { 69 err = r.openIndex(ctx) 70 } 71 if err != nil { 72 return nil, fmt.Errorf("opening index section: %w", err) 73 } 74 75 if err = r.buildPartitions(); err != nil { 76 return nil, err 77 } 78 79 return r, nil 80 } 81 82 func (r *Reader) openIndexWithPrefetch(ctx context.Context) (err error) { 83 prefetchSize := r.prefetchSize 84 if prefetchSize > r.file.SizeBytes { 85 prefetchSize = r.file.SizeBytes 86 } 87 n, err := r.prefetchIndex(ctx, prefetchSize) 88 if err == nil && n != 0 { 89 _, err = r.prefetchIndex(ctx, prefetchSize) 90 } 91 return err 92 } 93 94 func (r *Reader) prefetchIndex(ctx context.Context, size uint64) (n uint64, err error) { 95 if size < uint64(FooterSize) { 96 size = uint64(FooterSize) 97 } 98 prefetchOffset := r.file.SizeBytes - size 99 buf := bufferpool.GetBuffer(int(size)) 100 defer bufferpool.Put(buf) 101 if err = objstore.ReadRange(ctx, buf, r.file.RelPath, r.bucket, int64(prefetchOffset), int64(size)); err != nil { 102 return 0, fmt.Errorf("fetching index: %w", err) 103 } 104 footerOffset := size - uint64(FooterSize) 105 if err = r.footer.UnmarshalBinary(buf.B[footerOffset:]); err != nil { 106 return 0, fmt.Errorf("unmarshaling footer: %w", err) 107 } 108 if prefetchOffset > (r.footer.IndexOffset) { 109 return r.file.SizeBytes - r.footer.IndexOffset, nil 110 } 111 // prefetch offset is less that or equal to the index offset. 112 indexOffset := r.footer.IndexOffset - prefetchOffset 113 if r.index, err = OpenIndex(buf.B[indexOffset:footerOffset]); err != nil { 114 return 0, fmt.Errorf("opening index: %w", err) 115 } 116 return 0, nil 117 } 118 119 func Open(ctx context.Context, b objstore.BucketReader, m *block.Meta) (*Reader, error) { 120 r := &Reader{ 121 bucket: b, 122 meta: m, 123 files: make(map[string]block.File), 124 file: block.File{RelPath: DefaultFileName}, 125 } 126 for _, f := range r.meta.Files { 127 r.files[filepath.Base(f.RelPath)] = f 128 } 129 if err := r.open(ctx); err != nil { 130 return nil, err 131 } 132 if err := r.buildPartitions(); err != nil { 133 return nil, err 134 } 135 return r, nil 136 } 137 138 func (r *Reader) open(ctx context.Context) (err error) { 139 if r.file, err = r.lookupFile(r.file.RelPath); err == nil { 140 if err = r.openIndex(ctx); err != nil { 141 return fmt.Errorf("opening index section: %w", err) 142 } 143 return nil 144 } 145 if err = r.openIndexV12(ctx); err != nil { 146 return fmt.Errorf("opening index file: %w", err) 147 } 148 if r.index.Header.Version == FormatV2 { 149 if err = openParquetFiles(ctx, r); err != nil { 150 return fmt.Errorf("opening parquet files: %w", err) 151 } 152 } 153 return nil 154 } 155 156 func (r *Reader) buildPartitions() (err error) { 157 r.partitionsMap = make(map[uint64]*partition, len(r.index.PartitionHeaders)) 158 r.partitions = make([]*partition, len(r.index.PartitionHeaders)) 159 for i, h := range r.index.PartitionHeaders { 160 var p *partition 161 if p, err = r.partitionReader(h); err != nil { 162 return err 163 } 164 r.partitionsMap[h.Partition] = p 165 r.partitions[i] = p 166 } 167 // Cleanup the index to not retain unused objects. 168 r.index = IndexFile{ 169 Header: IndexHeader{ 170 Version: r.index.Header.Version, 171 }, 172 } 173 return nil 174 } 175 176 func (r *Reader) partitionReader(h *PartitionHeader) (*partition, error) { 177 p := &partition{reader: r} 178 switch r.index.Header.Version { 179 case FormatV1: 180 p.initEmptyTables(h) 181 case FormatV2: 182 p.initParquetTables(h) 183 case FormatV3: 184 if err := p.initTables(h); err != nil { 185 return nil, err 186 } 187 } 188 p.initStacktraces(h.Stacktraces) 189 return p, nil 190 } 191 192 // openIndex locates footer and loads the index section from 193 // the file into the memory. 194 func (r *Reader) openIndex(ctx context.Context) error { 195 if r.file.SizeBytes == 0 { 196 attrs, err := r.bucket.Attributes(ctx, r.file.RelPath) 197 if err != nil { 198 return fmt.Errorf("fetching file attributes: %w", err) 199 } 200 r.file.SizeBytes = uint64(attrs.Size) 201 } 202 // Read footer. 203 offset := int64(r.file.SizeBytes) - int64(FooterSize) 204 if offset < int64(IndexHeaderSize) { 205 return fmt.Errorf("%w: footer offset: %d", ErrInvalidSize, offset) 206 } 207 if err := r.readFooter(ctx, offset, int64(FooterSize)); err != nil { 208 return err 209 } 210 indexSize := offset - int64(r.footer.IndexOffset) 211 if indexSize < int64(IndexHeaderSize) { 212 return fmt.Errorf("%w: index section size: %d", ErrInvalidSize, indexSize) 213 } 214 return r.readIndexSection(ctx, int64(r.footer.IndexOffset), indexSize) 215 } 216 217 func (r *Reader) readFooter(ctx context.Context, offset, size int64) error { 218 o, err := r.bucket.GetRange(ctx, r.file.RelPath, offset, size) 219 if err != nil { 220 return fmt.Errorf("fetching footer: %w", err) 221 } 222 defer func() { 223 _ = o.Close() 224 }() 225 buf := make([]byte, size) 226 if _, err = io.ReadFull(o, buf); err != nil { 227 return fmt.Errorf("reading footer: %w", err) 228 } 229 if err = r.footer.UnmarshalBinary(buf); err != nil { 230 return fmt.Errorf("unmarshaling footer: %w", err) 231 } 232 return nil 233 } 234 235 func (r *Reader) readIndexSection(ctx context.Context, offset, size int64) error { 236 o, err := r.bucket.GetRange(ctx, r.file.RelPath, offset, size) 237 if err != nil { 238 return fmt.Errorf("fetching index: %w", err) 239 } 240 defer func() { 241 _ = o.Close() 242 }() 243 buf := make([]byte, int(size)) 244 if _, err = io.ReadFull(o, buf); err != nil { 245 return fmt.Errorf("reading index: %w", err) 246 } 247 r.index, err = OpenIndex(buf) 248 if err != nil { 249 return fmt.Errorf("opening index: %w", err) 250 } 251 return nil 252 } 253 254 func (r *Reader) openIndexV12(ctx context.Context) error { 255 f, err := r.lookupFile(IndexFileName) 256 if err != nil { 257 return err 258 } 259 o, err := r.bucket.Get(ctx, f.RelPath) 260 if err != nil { 261 return err 262 } 263 defer func() { 264 _ = o.Close() 265 }() 266 b, err := io.ReadAll(o) 267 if err != nil { 268 return err 269 } 270 r.index, err = OpenIndex(b) 271 return err 272 } 273 274 func (r *Reader) lookupFile(name string) (block.File, error) { 275 f, ok := r.files[name] 276 if !ok { 277 return block.File{}, fmt.Errorf("%q: %w", name, os.ErrNotExist) 278 } 279 return f, nil 280 } 281 282 func (r *Reader) Close() error { 283 if r == nil { 284 return nil 285 } 286 if r.parquetFiles != nil { 287 return r.parquetFiles.Close() 288 } 289 return nil 290 } 291 292 var ErrPartitionNotFound = fmt.Errorf("partition not found") 293 294 func (r *Reader) Partition(ctx context.Context, partition uint64) (PartitionReader, error) { 295 p, err := r.partition(ctx, partition) 296 if err != nil { 297 return nil, err 298 } 299 return p, nil 300 } 301 302 func (r *Reader) partition(ctx context.Context, partition uint64) (*partition, error) { 303 p, ok := r.partitionsMap[partition] 304 if !ok { 305 return nil, ErrPartitionNotFound 306 } 307 if err := p.fetch(ctx); err != nil { 308 return nil, err 309 } 310 return p, nil 311 } 312 313 type partition struct { 314 reader *Reader 315 316 stacktraces []*stacktraceBlock 317 locations table[schemav1.InMemoryLocation] 318 mappings table[schemav1.InMemoryMapping] 319 functions table[schemav1.InMemoryFunction] 320 strings table[string] 321 } 322 323 type table[T any] interface { 324 fetchable 325 slice() []T 326 } 327 328 func (p *partition) fetch(ctx context.Context) (err error) { 329 return p.tx().fetch(ctx) 330 } 331 332 func (p *partition) Release() { 333 p.tx().release() 334 } 335 336 func (p *partition) tx() *fetchTx { 337 tx := make(fetchTx, 0, len(p.stacktraces)+4) 338 for _, c := range p.stacktraces { 339 tx.append(c) 340 } 341 if p.reader.index.Header.Version > FormatV1 { 342 tx.append(p.locations) 343 tx.append(p.mappings) 344 tx.append(p.functions) 345 tx.append(p.strings) 346 } 347 return &tx 348 } 349 350 // Format V1. 351 func (p *partition) initEmptyTables(*PartitionHeader) { 352 p.locations = emptyTable[schemav1.InMemoryLocation]{} 353 p.mappings = emptyTable[schemav1.InMemoryMapping]{} 354 p.functions = emptyTable[schemav1.InMemoryFunction]{} 355 p.strings = emptyTable[string]{} 356 } 357 358 // Format V2. 359 func (p *partition) initParquetTables(h *PartitionHeader) { 360 p.locations = &parquetTable[schemav1.InMemoryLocation, schemav1.LocationPersister]{ 361 bucket: p.reader.bucket, 362 headers: h.V2.Locations, 363 file: &p.reader.parquetFiles.locations, 364 } 365 p.mappings = &parquetTable[schemav1.InMemoryMapping, schemav1.MappingPersister]{ 366 bucket: p.reader.bucket, 367 headers: h.V2.Mappings, 368 file: &p.reader.parquetFiles.mappings, 369 } 370 p.functions = &parquetTable[schemav1.InMemoryFunction, schemav1.FunctionPersister]{ 371 bucket: p.reader.bucket, 372 headers: h.V2.Functions, 373 file: &p.reader.parquetFiles.functions, 374 } 375 p.strings = &parquetTable[string, schemav1.StringPersister]{ 376 bucket: p.reader.bucket, 377 headers: h.V2.Strings, 378 file: &p.reader.parquetFiles.strings, 379 } 380 } 381 382 // Format V3. 383 func (p *partition) initTables(h *PartitionHeader) (err error) { 384 locations := &rawTable[schemav1.InMemoryLocation]{ 385 reader: p.reader, 386 header: h.V3.Locations, 387 } 388 if locations.dec, err = newLocationsDecoder(h.V3.Locations); err != nil { 389 return err 390 } 391 p.locations = locations 392 393 mappings := &rawTable[schemav1.InMemoryMapping]{ 394 reader: p.reader, 395 header: h.V3.Mappings, 396 } 397 if mappings.dec, err = newMappingsDecoder(h.V3.Mappings); err != nil { 398 return err 399 } 400 p.mappings = mappings 401 402 functions := &rawTable[schemav1.InMemoryFunction]{ 403 reader: p.reader, 404 header: h.V3.Functions, 405 } 406 if functions.dec, err = newFunctionsDecoder(h.V3.Functions); err != nil { 407 return err 408 } 409 p.functions = functions 410 411 strings := &rawTable[string]{ 412 reader: p.reader, 413 header: h.V3.Strings, 414 } 415 if strings.dec, err = newStringsDecoder(h.V3.Strings); err != nil { 416 return err 417 } 418 p.strings = strings 419 return nil 420 } 421 422 func (p *partition) Symbols() *Symbols { 423 return &Symbols{ 424 Stacktraces: p, 425 Locations: p.locations.slice(), 426 Mappings: p.mappings.slice(), 427 Functions: p.functions.slice(), 428 Strings: p.strings.slice(), 429 } 430 } 431 432 func (p *partition) WriteStats(s *PartitionStats) { 433 var nodes uint32 434 for _, c := range p.stacktraces { 435 s.StacktracesTotal += int(c.header.Stacktraces) 436 nodes += c.header.StacktraceNodes 437 } 438 s.MaxStacktraceID = int(nodes) 439 s.LocationsTotal = len(p.locations.slice()) 440 s.MappingsTotal = len(p.mappings.slice()) 441 s.FunctionsTotal = len(p.functions.slice()) 442 s.StringsTotal = len(p.strings.slice()) 443 } 444 445 var ErrInvalidStacktraceRange = fmt.Errorf("invalid range: stack traces can't be resolved") 446 447 func (p *partition) LookupLocations(dst []uint64, stacktraceID uint32) []uint64 { 448 dst = dst[:0] 449 if len(p.stacktraces) == 0 { 450 return dst 451 } 452 nodesPerChunk := p.stacktraces[0].header.StacktraceMaxNodes 453 chunkID := stacktraceID / nodesPerChunk 454 localSID := stacktraceID % nodesPerChunk 455 if localSID == 0 || int(chunkID) > len(p.stacktraces) { 456 return dst 457 } 458 return p.stacktraces[chunkID].t.resolveUint64(dst, localSID) 459 } 460 461 func (p *partition) ResolveStacktraceLocations(ctx context.Context, dst StacktraceInserter, s []uint32) (err error) { 462 if len(s) == 0 { 463 return nil 464 } 465 if len(p.stacktraces) == 0 { 466 return ErrInvalidStacktraceRange 467 } 468 // First, we determine the chunks needed for the range. 469 // All chunks in a block must have the same StacktraceMaxNodes. 470 sr := SplitStacktraces(s, p.stacktraces[0].header.StacktraceMaxNodes) 471 for _, c := range sr { 472 if err = p.lookupStacktraces(ctx, dst, c).do(); err != nil { 473 return err 474 } 475 } 476 return nil 477 } 478 479 func (p *partition) SplitStacktraceIDRanges(appender *SampleAppender) iter.Iterator[*StacktraceIDRange] { 480 if len(p.stacktraces) == 0 { 481 return iter.NewEmptyIterator[*StacktraceIDRange]() 482 } 483 var n int 484 samples := appender.Samples() 485 ranges := SplitStacktraces(samples.StacktraceIDs, p.stacktraces[0].header.StacktraceMaxNodes) 486 for _, sr := range ranges { 487 c := p.stacktraces[sr.chunk] 488 sr.ParentPointerTree = c.t 489 sr.Samples = samples.Range(n, n+len(sr.IDs)) 490 n += len(sr.IDs) 491 } 492 return iter.NewSliceIterator(ranges) 493 } 494 495 func (p *partition) initStacktraces(chunks []StacktraceBlockHeader) { 496 p.stacktraces = make([]*stacktraceBlock, len(chunks)) 497 for i, c := range chunks { 498 p.stacktraces[i] = &stacktraceBlock{ 499 reader: p.reader, 500 header: c, 501 } 502 } 503 } 504 505 func (p *partition) stacktraceChunkReader(i uint32) *stacktraceBlock { 506 if int(i) < len(p.stacktraces) { 507 return p.stacktraces[i] 508 } 509 return nil 510 } 511 512 func (p *partition) lookupStacktraces(ctx context.Context, dst StacktraceInserter, c *StacktraceIDRange) *stacktracesLookup { 513 return &stacktracesLookup{ 514 ctx: ctx, 515 dst: dst, 516 c: c, 517 r: p, 518 } 519 } 520 521 // stacktracesLookup represents a stacktrace resolution operation. 522 type stacktracesLookup struct { 523 ctx context.Context 524 dst StacktraceInserter 525 c *StacktraceIDRange 526 r *partition 527 } 528 529 func (r *stacktracesLookup) do() error { 530 cr := r.r.stacktraceChunkReader(r.c.chunk) 531 if cr == nil { 532 return ErrInvalidStacktraceRange 533 } 534 s := stacktraceLocations.get() 535 // Restore the original stacktrace ID. 536 off := r.c.Offset() 537 for _, sid := range r.c.IDs { 538 s = cr.t.resolve(s, sid) 539 r.dst.InsertStacktrace(off+sid, s) 540 } 541 stacktraceLocations.put(s) 542 return nil 543 } 544 545 type stacktraceBlock struct { 546 reader *Reader 547 header StacktraceBlockHeader 548 549 r refctr.Counter 550 t *parentPointerTree 551 } 552 553 func (c *stacktraceBlock) fetch(ctx context.Context) error { 554 span, ctx := opentracing.StartSpanFromContext(ctx, "stacktraceBlock.fetch") 555 span.LogFields( 556 otlog.Int64("size", c.header.Size), 557 otlog.Uint32("nodes", c.header.StacktraceNodes), 558 otlog.Uint32("stacks", c.header.Stacktraces), 559 ) 560 defer span.Finish() 561 return c.r.Inc(func() error { 562 path, err := c.stacktracesFile() 563 if err != nil { 564 return err 565 } 566 rc, err := c.reader.bucket.GetRange(ctx, path, c.header.Offset, c.header.Size) 567 if err != nil { 568 return err 569 } 570 r := getFetchBufReader(rc) 571 defer func() { 572 putFetchBufReader(r) 573 err = multierror.New(err, rc.Close()).Err() 574 }() 575 return c.readFrom(r) 576 }) 577 } 578 579 func (c *stacktraceBlock) stacktracesFile() (string, error) { 580 f := c.reader.file 581 if c.reader.index.Header.Version < 3 { 582 var err error 583 if f, err = c.reader.lookupFile(StacktracesFileName); err != nil { 584 return "", err 585 } 586 } 587 return f.RelPath, nil 588 } 589 590 func (c *stacktraceBlock) readFrom(r *bufio.Reader) error { 591 // NOTE(kolesnikovae): Pool of node chunks could reduce 592 // the alloc size, but it may affect memory locality. 593 // Although, properly aligned chunks of, say, 1-4K nodes 594 // which is 8-32KiB respectively, should not make things 595 // much worse than they are. Worth experimenting. 596 t := newParentPointerTree(c.header.StacktraceNodes) 597 // We unmarshal the tree speculatively, before validating 598 // the checksum. Even random bytes can be unmarshalled to 599 // a tree not causing any errors, therefore it is vital 600 // to verify the correctness of the data. 601 crc := crc32.New(castagnoli) 602 tee := io.TeeReader(r, crc) 603 if _, err := t.ReadFrom(tee); err != nil { 604 return fmt.Errorf("failed to unmarshal stack traces: %w", err) 605 } 606 if c.header.CRC != crc.Sum32() { 607 return ErrInvalidCRC 608 } 609 c.t = t 610 return nil 611 } 612 613 func (c *stacktraceBlock) release() { 614 c.r.Dec(func() { 615 c.t = nil 616 }) 617 } 618 619 type rawTable[T any] struct { 620 reader *Reader 621 header SymbolsBlockHeader 622 dec *symbolsDecoder[T] 623 r refctr.Counter 624 s []T 625 } 626 627 func (t *rawTable[T]) fetch(ctx context.Context) error { 628 span, ctx := opentracing.StartSpanFromContext(ctx, "symbolsTable.fetch") 629 span.LogFields( 630 otlog.Uint32("size", t.header.Size), 631 otlog.Uint32("length", t.header.Length), 632 ) 633 defer span.Finish() 634 return t.r.Inc(func() error { 635 rc, err := t.reader.bucket.GetRange(ctx, 636 t.reader.file.RelPath, 637 int64(t.header.Offset), 638 int64(t.header.Size)) 639 if err != nil { 640 return err 641 } 642 r := getFetchBufReader(rc) 643 defer func() { 644 putFetchBufReader(r) 645 err = multierror.New(err, rc.Close()).Err() 646 }() 647 return t.readFrom(r) 648 }) 649 } 650 651 func (t *rawTable[T]) readFrom(r *bufio.Reader) error { 652 crc := crc32.New(castagnoli) 653 tee := io.TeeReader(r, crc) 654 t.s = make([]T, t.header.Length) 655 if err := t.dec.decode(t.s, tee); err != nil { 656 return fmt.Errorf("failed to decode symbols: %w", err) 657 } 658 if t.header.CRC != crc.Sum32() { 659 return ErrInvalidCRC 660 } 661 return nil 662 } 663 664 func (t *rawTable[T]) slice() []T { return t.s } 665 666 func (t *rawTable[T]) release() { 667 t.r.Dec(func() { 668 t.s = nil 669 }) 670 } 671 672 // This is a stub for versions without tables in the block (format v1). 673 type emptyTable[T any] struct{} 674 675 func (emptyTable[T]) fetch(context.Context) error { return nil } 676 677 func (emptyTable[T]) release() {} 678 679 func (emptyTable[T]) slice() []T { return nil } 680 681 // fetchTx facilitates fetching multiple objects in a transactional manner: 682 // if one of the objects has failed, all the remaining ones are released. 683 type fetchTx []fetchable 684 685 type fetchable interface { 686 fetch(context.Context) error 687 release() 688 } 689 690 func (tx *fetchTx) append(x fetchable) { *tx = append(*tx, x) } 691 692 func (tx *fetchTx) fetch(ctx context.Context) (err error) { 693 defer func() { 694 if err != nil { 695 tx.release() 696 } 697 }() 698 g, ctx := errgroup.WithContext(ctx) 699 for i, x := range *tx { 700 i := i 701 x := x 702 g.Go(func() error { 703 fErr := x.fetch(ctx) 704 if fErr != nil { 705 (*tx)[i] = nil 706 } 707 return fErr 708 }) 709 } 710 return g.Wait() 711 } 712 713 func (tx *fetchTx) release() { 714 var wg sync.WaitGroup 715 wg.Add(len(*tx)) 716 for _, x := range *tx { 717 x := x 718 go func() { 719 defer wg.Done() 720 if x != nil { 721 x.release() 722 } 723 }() 724 } 725 wg.Wait() 726 } 727 728 const defaultFetchBufferSize = 64 << 10 729 730 var fetchBufReaderPool = sync.Pool{ 731 New: func() any { 732 return bufio.NewReaderSize(nil, defaultFetchBufferSize) 733 }, 734 } 735 736 func getFetchBufReader(r io.Reader) *bufio.Reader { 737 b := fetchBufReaderPool.Get().(*bufio.Reader) 738 b.Reset(r) 739 return b 740 } 741 742 func putFetchBufReader(b *bufio.Reader) { 743 b.Reset(nil) 744 fetchBufReaderPool.Put(b) 745 }