github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/symdb/locations.go (about)

     1  //nolint:unused
     2  package symdb
     3  
     4  import (
     5  	"bytes"
     6  	"encoding/binary"
     7  	"fmt"
     8  	"hash/crc32"
     9  	"io"
    10  	"unsafe"
    11  
    12  	"github.com/parquet-go/parquet-go/encoding/delta"
    13  
    14  	v1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1"
    15  	"github.com/grafana/pyroscope/pkg/slices"
    16  )
    17  
// maxLocationLines is the upper bound applied by the encoder's line loop:
// at most this many lines of a single location are written to a block.
// The per-location line count is stored in a single byte.
const maxLocationLines = 255
    19  
// Compile-time assertions that the locations block encoder and decoder
// satisfy the generic symbols block encoder/decoder interfaces.
var (
	_ symbolsBlockEncoder[v1.InMemoryLocation] = (*locationsBlockEncoder)(nil)
	_ symbolsBlockDecoder[v1.InMemoryLocation] = (*locationsBlockDecoder)(nil)
)
    24  
// locationsBlockHeader describes the layout of an encoded locations block.
// marshal writes the fields in declaration order as big-endian uint32 values,
// followed by the CRC of the preceding header bytes. The encoder reports the
// header size via unsafe.Sizeof of this struct, so the set of fields and the
// serialized layout have to stay in sync.
type locationsBlockHeader struct {
	LocationsLen uint32 // Number of locations
	MappingSize  uint32 // Size of the encoded slice of mapping_ids
	LinesLen     uint32 // Total number of lines across all locations
	LinesSize    uint32 // Size of the encoded lines
	// Optional, might be empty.
	AddrSize     uint32 // Size of the encoded slice of addresses
	IsFoldedSize uint32 // Size of the encoded slice of is_folded
	CRC          uint32 // Header CRC.
}
    35  
    36  func (h *locationsBlockHeader) marshal(b []byte) {
    37  	binary.BigEndian.PutUint32(b[0:4], h.LocationsLen)
    38  	binary.BigEndian.PutUint32(b[4:8], h.MappingSize)
    39  	binary.BigEndian.PutUint32(b[8:12], h.LinesLen)
    40  	binary.BigEndian.PutUint32(b[12:16], h.LinesSize)
    41  	binary.BigEndian.PutUint32(b[16:20], h.AddrSize)
    42  	binary.BigEndian.PutUint32(b[20:24], h.IsFoldedSize)
    43  	// Fields can be added here in the future.
    44  	// CRC must be the last four bytes.
    45  	h.CRC = crc32.Checksum(b[0:24], castagnoli)
    46  	binary.BigEndian.PutUint32(b[24:28], h.CRC)
    47  }
    48  
    49  func (h *locationsBlockHeader) unmarshal(b []byte) {
    50  	h.LocationsLen = binary.BigEndian.Uint32(b[0:4])
    51  	h.MappingSize = binary.BigEndian.Uint32(b[4:8])
    52  	h.LinesLen = binary.BigEndian.Uint32(b[8:12])
    53  	h.LinesSize = binary.BigEndian.Uint32(b[12:16])
    54  	h.AddrSize = binary.BigEndian.Uint32(b[16:20])
    55  	h.IsFoldedSize = binary.BigEndian.Uint32(b[20:24])
    56  	// In future versions, new fields are decoded here;
    57  	// if pos < len(b)-checksumSize, then there are more fields.
    58  	h.CRC = binary.BigEndian.Uint32(b[24:28])
    59  }
    60  
    61  func (h *locationsBlockHeader) checksum() uint32 { return h.CRC }
    62  
// locationsBlockEncoder encodes a slice of locations into a single block.
// Its slices are scratch storage that is re-sized by initWrite on every
// encode call.
type locationsBlockEncoder struct {
	header locationsBlockHeader

	// Mapping ID per location.
	mapping []int32
	// Assuming there are no locations with more than 255 lines.
	// We could even use a nibble (4 bits), but there are locations
	// with 10 and more functions, therefore there is a chance that
	// capacity of 2^4 is not enough in all cases.
	lineCount []byte
	// Flattened (function_id, line) pairs of all locations.
	lines []int32
	// Optional.
	addr   []int64
	folded []bool

	// tmp holds the output of the most recent column encoding;
	// buf accumulates the block payload that follows the header.
	tmp []byte
	buf bytes.Buffer
}
    80  
    81  func newLocationsEncoder() *symbolsEncoder[v1.InMemoryLocation] {
    82  	return newSymbolsEncoder[v1.InMemoryLocation](new(locationsBlockEncoder))
    83  }
    84  
    85  func (e *locationsBlockEncoder) format() SymbolsBlockFormat { return BlockLocationsV1 }
    86  
    87  func (e *locationsBlockEncoder) headerSize() uintptr { return unsafe.Sizeof(locationsBlockHeader{}) }
    88  
    89  func (e *locationsBlockEncoder) encode(w io.Writer, locations []v1.InMemoryLocation) error {
    90  	e.initWrite(len(locations))
    91  	var addr uint64
    92  	var folded bool
    93  	for i, loc := range locations {
    94  		e.mapping[i] = int32(loc.MappingId)
    95  		e.lineCount[i] = byte(len(loc.Line))
    96  		for j := 0; j < len(loc.Line) && j < maxLocationLines; j++ {
    97  			e.lines = append(e.lines,
    98  				int32(loc.Line[j].FunctionId),
    99  				loc.Line[j].Line)
   100  		}
   101  		addr |= loc.Address
   102  		e.addr[i] = int64(loc.Address)
   103  		folded = folded || loc.IsFolded
   104  		e.folded[i] = loc.IsFolded
   105  	}
   106  
   107  	// Mapping and line count per location.
   108  	var enc delta.BinaryPackedEncoding
   109  	e.tmp, _ = enc.EncodeInt32(e.tmp, e.mapping)
   110  	e.header.MappingSize = uint32(len(e.tmp))
   111  	e.buf.Write(e.tmp)
   112  	// Line count size and length is deterministic.
   113  	e.buf.Write(e.lineCount) // Without any encoding.
   114  
   115  	// Lines slice size and length (in lines, not int32s).
   116  	e.tmp, _ = enc.EncodeInt32(e.tmp, e.lines)
   117  	e.header.LinesLen = uint32(len(e.lines) / 2)
   118  	e.header.LinesSize = uint32(len(e.tmp))
   119  	e.buf.Write(e.tmp)
   120  
   121  	if addr > 0 {
   122  		e.tmp, _ = enc.EncodeInt64(e.tmp, e.addr)
   123  		e.header.AddrSize = uint32(len(e.tmp))
   124  		e.buf.Write(e.tmp)
   125  	}
   126  
   127  	if folded {
   128  		e.tmp = slices.GrowLen(e.tmp, len(e.folded)/8+1)
   129  		encodeBoolean(e.tmp, e.folded)
   130  		e.header.IsFoldedSize = uint32(len(e.tmp))
   131  		e.buf.Write(e.tmp)
   132  	}
   133  
   134  	e.tmp = slices.GrowLen(e.tmp, int(e.headerSize()))
   135  	e.header.marshal(e.tmp)
   136  	if _, err := w.Write(e.tmp); err != nil {
   137  		return err
   138  	}
   139  	_, err := e.buf.WriteTo(w)
   140  	return err
   141  }
   142  
   143  func (e *locationsBlockEncoder) initWrite(locations int) {
   144  	// Actual estimate is ~6 bytes per location.
   145  	// In a large data set, the most expensive member
   146  	// is FunctionID, and it's about 2 bytes per location.
   147  	e.buf.Reset()
   148  	e.buf.Grow(locations * 8)
   149  	*e = locationsBlockEncoder{
   150  		header: locationsBlockHeader{LocationsLen: uint32(locations)},
   151  
   152  		mapping:   slices.GrowLen(e.mapping, locations),
   153  		lineCount: slices.GrowLen(e.lineCount, locations),
   154  		lines:     e.lines[:0], // Appendable.
   155  		addr:      slices.GrowLen(e.addr, locations),
   156  		folded:    slices.GrowLen(e.folded, locations),
   157  
   158  		buf: e.buf,
   159  		tmp: slices.GrowLen(e.tmp, 2*locations),
   160  	}
   161  }
   162  
// locationsBlockDecoder decodes a single locations block. All the slices
// are scratch storage that is re-sized on every decode call.
type locationsBlockDecoder struct {
	// Number of bytes to read for the block header;
	// set by newLocationsDecoder.
	headerSize uint16
	header     locationsBlockHeader

	mappings  []int32 // Decoded mapping IDs, one per location.
	lineCount []byte  // Number of lines per location, one byte each.
	lines     []int32 // Decoded (function_id, line) pairs, flattened.

	// Optional sections.
	address []int64
	folded  []bool

	// Read buffer for the header and the encoded sections.
	buf []byte
}
   176  
   177  func newLocationsDecoder(h SymbolsBlockHeader) (*symbolsDecoder[v1.InMemoryLocation], error) {
   178  	if h.Format == BlockLocationsV1 {
   179  		headerSize := max(locationsBlockHeaderMinSize, h.BlockHeaderSize)
   180  		return newSymbolsDecoder[v1.InMemoryLocation](h, &locationsBlockDecoder{headerSize: headerSize}), nil
   181  	}
   182  	return nil, fmt.Errorf("%w: unknown locations format: %d", ErrUnknownVersion, h.Format)
   183  }
   184  
// locationsBlockHeaderMinSize is the lower bound for the locations block
// header size used by newLocationsDecoder. In early versions, block header
// size is not specified, so this value is used instead. Must not change.
const locationsBlockHeaderMinSize = 28
   187  
   188  func (d *locationsBlockDecoder) decode(r io.Reader, locations []v1.InMemoryLocation) (err error) {
   189  	d.buf = slices.GrowLen(d.buf, int(d.headerSize))
   190  	if err = readSymbolsBlockHeader(d.buf, r, &d.header); err != nil {
   191  		return err
   192  	}
   193  	if d.header.LocationsLen != uint32(len(locations)) {
   194  		return fmt.Errorf("locations buffer: %w", ErrInvalidSize)
   195  	}
   196  
   197  	// First we decode mapping_id and assign them to locations.
   198  	d.buf = slices.GrowLen(d.buf, int(d.header.MappingSize))
   199  	if _, err = io.ReadFull(r, d.buf); err != nil {
   200  		return err
   201  	}
   202  	d.mappings, err = decodeBinaryPackedInt32(d.mappings, d.buf, int(d.header.LocationsLen))
   203  	if err != nil {
   204  		return err
   205  	}
   206  
   207  	// Line count per location.
   208  	// One byte per location.
   209  	d.lineCount = slices.GrowLen(d.lineCount, int(d.header.LocationsLen))
   210  	if _, err = io.ReadFull(r, d.lineCount); err != nil {
   211  		return err
   212  	}
   213  
   214  	// Lines. A single slice backs all the location line
   215  	// sub-slices. But it has to be allocated as we can't
   216  	// reference d.lines, which is reusable.
   217  	lines := make([]v1.InMemoryLine, d.header.LinesLen)
   218  	d.buf = slices.GrowLen(d.buf, int(d.header.LinesSize))
   219  	if _, err = io.ReadFull(r, d.buf); err != nil {
   220  		return err
   221  	}
   222  	// Lines are encoded as pairs of uint32 (function_id and line number).
   223  	d.lines, err = decodeBinaryPackedInt32(d.lines, d.buf, int(d.header.LinesLen)*2)
   224  	if err != nil {
   225  		return err
   226  	}
   227  	copy(lines, *(*[]v1.InMemoryLine)(unsafe.Pointer(&d.lines)))
   228  
   229  	// In most cases we end up here.
   230  	if d.header.AddrSize == 0 && d.header.IsFoldedSize == 0 {
   231  		var o int // Offset within the lines slice.
   232  		// In case if the block is malformed, an invalid
   233  		// line count may cause an out-of-bounds panic.
   234  		maxLines := len(lines)
   235  		for i := 0; i < len(locations); i++ {
   236  			locations[i].MappingId = uint32(d.mappings[i])
   237  			n := o + int(d.lineCount[i])
   238  			if n > maxLines {
   239  				return fmt.Errorf("%w: location lines out of bounds", ErrInvalidSize)
   240  			}
   241  			locations[i].Line = lines[o:n]
   242  			o = n
   243  		}
   244  		return nil
   245  	}
   246  
   247  	// Otherwise, inspect all the optional fields.
   248  	d.address = slices.GrowLen(d.address, int(d.header.LocationsLen))
   249  	d.folded = slices.GrowLen(d.folded, int(d.header.LocationsLen))
   250  	if int(d.header.AddrSize) > 0 {
   251  		d.buf = slices.GrowLen(d.buf, int(d.header.AddrSize))
   252  		if _, err = io.ReadFull(r, d.buf); err != nil {
   253  			return err
   254  		}
   255  		d.address, err = decodeBinaryPackedInt64(d.address, d.buf, int(d.header.LocationsLen))
   256  		if err != nil {
   257  			return err
   258  		}
   259  	}
   260  	if int(d.header.IsFoldedSize) > 0 {
   261  		d.buf = slices.GrowLen(d.buf, int(d.header.IsFoldedSize))
   262  		if _, err = io.ReadFull(r, d.buf); err != nil {
   263  			return err
   264  		}
   265  		decodeBoolean(d.folded, d.buf)
   266  	}
   267  
   268  	var o int // Offset within the lines slice.
   269  	for i := uint32(0); i < d.header.LocationsLen; i++ {
   270  		locations[i].MappingId = uint32(d.mappings[i])
   271  		n := o + int(d.lineCount[i])
   272  		locations[i].Line = lines[o:n]
   273  		o = n
   274  		locations[i].Address = uint64(d.address[i])
   275  		locations[i].IsFolded = d.folded[i]
   276  	}
   277  
   278  	return nil
   279  }
   280  
   281  func encodeBoolean(dst []byte, src []bool) {
   282  	for i := range dst {
   283  		dst[i] = 0
   284  	}
   285  	for i, b := range src {
   286  		if b {
   287  			dst[i>>3] |= 1 << i & 7
   288  		}
   289  	}
   290  }
   291  
   292  func decodeBoolean(dst []bool, src []byte) {
   293  	for i := range dst {
   294  		dst[i] = false
   295  	}
   296  	for i := range dst {
   297  		dst[i] = src[i>>3]&(1<<i&7) != 0
   298  	}
   299  }