github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/profile_store.go (about)

     1  package phlaredb
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"path/filepath"
    10  	"sort"
    11  	"sync"
    12  
    13  	"github.com/go-kit/log"
    14  	"github.com/go-kit/log/level"
    15  	"github.com/grafana/dskit/runutil"
    16  	"github.com/parquet-go/parquet-go"
    17  	"github.com/pkg/errors"
    18  	"go.uber.org/atomic"
    19  
    20  	phlaremodel "github.com/grafana/pyroscope/pkg/model"
    21  	phlareparquet "github.com/grafana/pyroscope/pkg/parquet"
    22  	"github.com/grafana/pyroscope/pkg/phlaredb/block"
    23  	"github.com/grafana/pyroscope/pkg/phlaredb/query"
    24  	schemav1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1"
    25  	phlarecontext "github.com/grafana/pyroscope/pkg/pyroscope/context"
    26  	"github.com/grafana/pyroscope/pkg/util/build"
    27  )
    28  
    29  const (
    30  	parquetWriteBufferSize = 3 << 20 // 3MB
    31  )
    32  
// profileStore buffers ingested profiles in memory, cuts them into parquet
// row group segments on disk via a background flush loop, and merges those
// segments into the final block parquet file on Flush.
type profileStore struct {
	size      atomic.Uint64 // bytes currently buffered in memory (decreases on cut)
	totalSize atomic.Uint64 // bytes ingested in total (never decreases)

	logger  log.Logger
	cfg     *ParquetConfig
	metrics *headMetrics

	path      string
	persister schemav1.Persister[*schemav1.Profile]
	writer    *parquet.GenericWriter[*schemav1.Profile]

	// lock serializes appends to the slice. Every new profile is appended
	// to the slice and to the index (has its own lock). In practice, it's
	// only purpose is to accommodate the parquet writer: slice is never
	// accessed for reads.
	profilesLock sync.Mutex
	slice        []schemav1.InMemoryProfile

	// Rows lock synchronises access to the on-disk row groups.
	// When the in-memory index (profiles) is being flushed on disk,
	// it should be modified simultaneously with rowGroups.
	// Store readers only access rowGroups and index.
	rowsLock    sync.RWMutex
	rowsFlushed uint64
	rowGroups   []*rowGroupOnDisk
	index       *profilesIndex

	flushing       *atomic.Bool // true while a row group cut is in flight
	flushQueue     chan int     // channel to signal that a flush is needed for slice[:n]
	closeOnce      sync.Once    // guards closing flushQueue exactly once
	flushWg        sync.WaitGroup
	flushBuffer    []schemav1.InMemoryProfile // reusable scratch for profiles being flushed
	flushBufferLbs []phlaremodel.Labels       // label sets parallel to flushBuffer
	onFlush        func()                     // optional hook invoked after every flush attempt
}
    69  
    70  func newParquetProfileWriter(writer io.Writer, options ...parquet.WriterOption) *parquet.GenericWriter[*schemav1.Profile] {
    71  	options = append(options, parquet.PageBufferSize(parquetWriteBufferSize))
    72  	options = append(options, parquet.CreatedBy("github.com/grafana/pyroscope/", build.Version, build.Revision))
    73  	options = append(options, schemav1.ProfilesSchema)
    74  	return parquet.NewGenericWriter[*schemav1.Profile](
    75  		writer, options...,
    76  	)
    77  }
    78  
    79  func newProfileStore(phlarectx context.Context) *profileStore {
    80  	s := &profileStore{
    81  		logger:     phlarecontext.Logger(phlarectx),
    82  		metrics:    contextHeadMetrics(phlarectx),
    83  		persister:  &schemav1.ProfilePersister{},
    84  		flushing:   atomic.NewBool(false),
    85  		flushQueue: make(chan int),
    86  	}
    87  	s.flushWg.Add(1)
    88  	go s.cutRowGroupLoop()
    89  	// Initialize writer on /dev/null
    90  	// TODO: Reuse parquet.Writer beyond life time of the head.
    91  	s.writer = newParquetProfileWriter(io.Discard)
    92  
    93  	return s
    94  }
    95  
    96  func (s *profileStore) Name() string {
    97  	return s.persister.Name()
    98  }
    99  
   100  func (s *profileStore) Size() uint64 {
   101  	return s.totalSize.Load()
   102  }
   103  
   104  func (s *profileStore) MemorySize() uint64 {
   105  	return s.size.Load()
   106  }
   107  
   108  // resets the store
   109  func (s *profileStore) Init(path string, cfg *ParquetConfig, metrics *headMetrics) (err error) {
   110  	// close previous iteration
   111  	if err := s.Close(); err != nil {
   112  		return err
   113  	}
   114  	s.flushQueue = make(chan int)
   115  	s.closeOnce = sync.Once{}
   116  	s.flushWg.Add(1)
   117  	go s.cutRowGroupLoop()
   118  
   119  	// create index
   120  	s.index, err = newProfileIndex(32, s.metrics)
   121  	if err != nil {
   122  		return err
   123  	}
   124  
   125  	s.path = path
   126  	s.cfg = cfg
   127  	s.metrics = metrics
   128  
   129  	s.slice = s.slice[:0]
   130  
   131  	s.rowsFlushed = 0
   132  
   133  	return nil
   134  }
   135  
   136  func (s *profileStore) Close() error {
   137  	if s.flushQueue != nil {
   138  		s.closeOnce.Do(func() {
   139  			close(s.flushQueue)
   140  		})
   141  
   142  		s.flushWg.Wait()
   143  	}
   144  	return nil
   145  }
   146  
   147  func (s *profileStore) RowGroups() (rowGroups []parquet.RowGroup) {
   148  	rowGroups = make([]parquet.RowGroup, len(s.rowGroups))
   149  	for pos := range rowGroups {
   150  		rowGroups[pos] = s.rowGroups[pos]
   151  	}
   152  	return rowGroups
   153  }
   154  
// Flush writes row groups and the index to files on disk.
// The call is thread-safe for reading but adding new profiles
// should not be allowed during and after the call.
func (s *profileStore) Flush(ctx context.Context) (numRows uint64, numRowGroups uint64, err error) {
	// Stop the background flush loop and wait for any in-flight cut.
	if err := s.Close(); err != nil {
		return 0, 0, err
	}
	// Cut whatever remains in memory into a final row group segment.
	if err = s.cutRowGroup(len(s.slice)); err != nil {
		return 0, 0, err
	}

	indexPath := filepath.Join(
		s.path,
		block.IndexFilename,
	)

	// Writing the index also yields, per row group, the row ranges of each
	// series, which are needed to rewrite SeriesIndex values on read.
	rowRangerPerRG, err := s.index.writeTo(ctx, indexPath)
	if err != nil {
		return 0, 0, err
	}

	parquetPath := filepath.Join(
		s.path,
		s.persister.Name()+block.ParquetSuffix,
	)

	// Attach series row ranges under the write lock, since readers access
	// rowGroups concurrently.
	s.rowsLock.Lock()
	for idx, ranges := range rowRangerPerRG {
		s.rowGroups[idx].seriesIndexes = ranges
	}
	s.rowsLock.Unlock()
	// Merge all segments into the final block parquet file.
	numRows, numRowGroups, err = s.writeRowGroups(parquetPath, s.RowGroups())
	if err != nil {
		return 0, 0, err
	}
	// Row groups are closed and removed on an explicit DeleteRowGroups call.
	return numRows, numRowGroups, nil
}
   193  
   194  func (s *profileStore) DeleteRowGroups() error {
   195  	s.rowsLock.Lock()
   196  	defer s.rowsLock.Unlock()
   197  	for _, rg := range s.rowGroups {
   198  		if err := rg.Close(); err != nil {
   199  			return err
   200  		}
   201  	}
   202  	s.rowGroups = s.rowGroups[:0]
   203  	return nil
   204  }
   205  
   206  func (s *profileStore) prepareFile(path string) (f *os.File, err error) {
   207  	file, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o644)
   208  	if err != nil {
   209  		return nil, err
   210  	}
   211  	s.writer.Reset(file)
   212  
   213  	return file, err
   214  }
   215  
   216  // cutRowGroups gets called, when a patrticular row group has been finished
   217  // and it will flush it to disk. The caller of cutRowGroups should be holding
   218  // the write lock.
   219  //
   220  // Writes are not allowed during cutting the rows, but readers are not blocked
   221  // during the most of the time: only after the rows are written to disk do we
   222  // block them for a short time (via rowsLock).
   223  //
   224  // TODO(kolesnikovae): Make the lock more selective. The call takes long time,
   225  // if disk I/O is slow, which causes ingestion timeouts and impacts distributor
   226  // push latency, and memory consumption, transitively.
   227  // See index.cutRowGroup: we could find a way to not flush all the in-memory
   228  // profiles, including ones added since the start of the call, but only those
   229  // that were added before certain point (this call). The same for s.slice.
   230  func (s *profileStore) cutRowGroup(count int) (err error) {
   231  	// if cutRowGroup fails record it as failed segment
   232  	defer func() {
   233  		if err != nil {
   234  			s.metrics.writtenProfileSegments.WithLabelValues("failed").Inc()
   235  		}
   236  	}()
   237  
   238  	size := s.loadProfilesToFlush(count)
   239  	if len(s.flushBuffer) == 0 {
   240  		return nil
   241  	}
   242  
   243  	path := filepath.Join(
   244  		s.path,
   245  		fmt.Sprintf("%s.%d%s", s.persister.Name(), s.rowsFlushed, block.ParquetSuffix),
   246  	)
   247  	// Removes the file if it exists. This can happen if the previous
   248  	// cut attempt failed.
   249  	if err := os.Remove(path); err == nil {
   250  		level.Warn(s.logger).Log("msg", "deleting row group segment of a failed previous attempt", "path", path)
   251  	}
   252  	f, err := s.prepareFile(path)
   253  	if err != nil {
   254  		return err
   255  	}
   256  
   257  	n, err := parquet.CopyRows(s.writer, schemav1.NewInMemoryProfilesRowReader(s.flushBuffer))
   258  	if err != nil {
   259  		return errors.Wrap(err, "write row group segments to disk")
   260  	}
   261  
   262  	if err := s.writer.Close(); err != nil {
   263  		return errors.Wrap(err, "close row group segment writer")
   264  	}
   265  
   266  	if err := f.Close(); err != nil {
   267  		return errors.Wrap(err, "closing row group segment file")
   268  	}
   269  	s.metrics.writtenProfileSegments.WithLabelValues("success").Inc()
   270  
   271  	// get row group segment size on disk
   272  	if stat, err := f.Stat(); err == nil {
   273  		s.metrics.writtenProfileSegmentsBytes.Observe(float64(stat.Size()))
   274  	}
   275  
   276  	rowGroup, err := newRowGroupOnDisk(path)
   277  	if err != nil {
   278  		return err
   279  	}
   280  
   281  	// We need to make the new on-disk row group available to readers
   282  	// simultaneously with cutting the series from the index. Until that,
   283  	// profiles can be read from s.slice/s.index. This lock should not be
   284  	// held for long as it only performs in-memory operations,
   285  	// although blocking readers.
   286  	s.rowsLock.Lock()
   287  	// After the lock is released, rows/profiles should be read from the disk.
   288  	defer s.rowsLock.Unlock()
   289  	s.rowsFlushed += uint64(n)
   290  	s.rowGroups = append(s.rowGroups, rowGroup)
   291  	// Cutting the index is relatively quick op (no I/O).
   292  	err = s.index.cutRowGroup(s.flushBuffer)
   293  
   294  	s.profilesLock.Lock()
   295  	defer s.profilesLock.Unlock()
   296  	for i := range s.slice[:count] {
   297  		s.metrics.samples.Sub(float64(len(s.slice[i].Samples.StacktraceIDs)))
   298  	}
   299  	// reset slice and metrics
   300  	s.slice = copySlice(s.slice[count:])
   301  	currentSize := s.size.Sub(size)
   302  	if err != nil {
   303  		return err
   304  	}
   305  
   306  	level.Debug(s.logger).Log("msg", "cut row group segment", "path", path, "numProfiles", n)
   307  	s.metrics.sizeBytes.WithLabelValues(s.Name()).Set(float64(currentSize))
   308  	return nil
   309  }
   310  
   311  type byLabels struct {
   312  	p   []schemav1.InMemoryProfile
   313  	lbs []phlaremodel.Labels
   314  }
   315  
   316  func (b byLabels) Len() int { return len(b.p) }
   317  func (b byLabels) Swap(i, j int) {
   318  	b.p[i], b.p[j] = b.p[j], b.p[i]
   319  	b.lbs[i], b.lbs[j] = b.lbs[j], b.lbs[i]
   320  }
   321  
   322  func (by byLabels) Less(i, j int) bool {
   323  	// first compare the labels, if they don't match return
   324  	var (
   325  		pI   = by.p[i]
   326  		pJ   = by.p[j]
   327  		lbsI = by.lbs[i]
   328  		lbsJ = by.lbs[j]
   329  	)
   330  	if cmp := phlaremodel.CompareLabelPairs(lbsI, lbsJ); cmp != 0 {
   331  		return cmp < 0
   332  	}
   333  
   334  	// then compare timenanos, if they don't match return
   335  	if pI.TimeNanos < pJ.TimeNanos {
   336  		return true
   337  	} else if pI.TimeNanos > pJ.TimeNanos {
   338  		return false
   339  	}
   340  
   341  	// finally use ID as tie breaker
   342  	return bytes.Compare(pI.ID[:], pJ.ID[:]) < 0
   343  }
   344  
// loadProfilesToFlush loads and sort profiles to flush into flushBuffer and returns the size of the profiles.
// It copies the first count entries of s.slice, together with their label
// sets (looked up by series fingerprint), into the reusable flush buffers,
// then sorts both buffers in lockstep by (labels, timestamp, ID).
func (s *profileStore) loadProfilesToFlush(count int) uint64 {
	// Grow the reusable buffers only when needed; otherwise keep capacity.
	if cap(s.flushBuffer) < count {
		s.flushBuffer = make([]schemav1.InMemoryProfile, 0, count)
	}
	if cap(s.flushBufferLbs) < count {
		s.flushBufferLbs = make([]phlaremodel.Labels, 0, count)
	}
	s.flushBufferLbs = s.flushBufferLbs[:0]
	s.flushBuffer = s.flushBuffer[:0]
	// Hold both locks while copying: profilesLock guards s.slice, and the
	// index mutex guards profilesPerFP.
	s.profilesLock.Lock()
	s.index.mutex.RLock()
	for i := 0; i < count; i++ {
		profile := s.slice[i]
		s.flushBuffer = append(s.flushBuffer, profile)
		s.flushBufferLbs = append(s.flushBufferLbs, s.index.profilesPerFP[profile.SeriesFingerprint].lbs)
	}
	s.profilesLock.Unlock()
	s.index.mutex.RUnlock()
	// order profiles properly (see byLabels.Less for the ordering)
	sort.Sort(byLabels{p: s.flushBuffer, lbs: s.flushBufferLbs})
	var size uint64
	for _, p := range s.flushBuffer {
		size += p.Size()
	}
	return size
}
   372  
   373  func (s *profileStore) writeRowGroups(path string, rowGroups []parquet.RowGroup) (n uint64, numRowGroups uint64, err error) {
   374  	fileCloser, err := s.prepareFile(path)
   375  	if err != nil {
   376  		return 0, 0, err
   377  	}
   378  	defer runutil.CloseWithErrCapture(&err, fileCloser, "closing parquet file")
   379  	readers := make([]parquet.RowReader, len(rowGroups))
   380  	for i, rg := range rowGroups {
   381  		readers[i] = rg.Rows()
   382  	}
   383  	n, numRowGroups, err = phlareparquet.CopyAsRowGroups(s.writer, schemav1.NewMergeProfilesRowReader(readers), s.cfg.MaxBufferRowCount)
   384  	if err != nil {
   385  		return 0, 0, err
   386  	}
   387  
   388  	if err := s.writer.Close(); err != nil {
   389  		return 0, 0, err
   390  	}
   391  
   392  	s.rowsFlushed += n
   393  
   394  	return n, numRowGroups, nil
   395  }
   396  
// ingest appends the given profiles — all sharing the label set lbs and
// profile name — to the in-memory slice and the index, updating size and
// sample metrics. When the configured row-count or byte limits are reached,
// it signals the background loop to cut the current contents to disk.
func (s *profileStore) ingest(_ context.Context, profiles []schemav1.InMemoryProfile, lbs phlaremodel.Labels, profileName string) error {
	s.profilesLock.Lock()
	defer s.profilesLock.Unlock()

	for pos, p := range profiles {
		if !s.flushing.Load() {
			// check if row group is full. The send only blocks until the
			// flush loop receives; the actual cut then waits for
			// profilesLock, which this function still holds.
			if s.cfg.MaxBufferRowCount > 0 && len(s.slice) >= s.cfg.MaxBufferRowCount ||
				s.cfg.MaxRowGroupBytes > 0 && s.size.Load() >= s.cfg.MaxRowGroupBytes {
				s.flushing.Store(true)
				s.flushQueue <- len(s.slice)
			}
		}

		// add profile to the index
		// NOTE(review): &p takes the address of the loop variable; this
		// assumes index.Add does not retain the pointer across iterations —
		// confirm against profilesIndex.Add.
		s.index.Add(&p, lbs, profileName)

		// increase size of stored data
		addedBytes := profiles[pos].Size()
		s.metrics.sizeBytes.WithLabelValues(s.Name()).Set(float64(s.size.Add(addedBytes)))
		s.totalSize.Add(addedBytes)

		// add to slice
		s.slice = append(s.slice, p)
		s.metrics.samples.Add(float64(len(p.Samples.StacktraceIDs)))

	}

	return nil
}
   427  
   428  func (s *profileStore) cutRowGroupLoop() {
   429  	defer s.flushWg.Done()
   430  	for n := range s.flushQueue {
   431  		if err := s.cutRowGroup(n); err != nil {
   432  			level.Error(s.logger).Log("msg", "cutting row group", "err", err)
   433  		}
   434  		s.flushing.Store(false)
   435  		if s.onFlush != nil {
   436  			s.onFlush()
   437  		}
   438  	}
   439  }
   440  
// rowGroupOnDisk is a single parquet row group backed by a segment file on
// disk. When seriesIndexes is set, Rows rewrites the SeriesIndex column of
// every row read using that row-range-to-series mapping.
type rowGroupOnDisk struct {
	parquet.RowGroup
	file          *os.File
	seriesIndexes rowRangesWithSeriesIndex
}
   446  
   447  func newRowGroupOnDisk(path string) (*rowGroupOnDisk, error) {
   448  	var (
   449  		r   = &rowGroupOnDisk{}
   450  		err error
   451  	)
   452  
   453  	// now open the row group file, so we are able to read the row group back in
   454  	r.file, err = os.Open(path)
   455  	if err != nil {
   456  		return nil, errors.Wrapf(err, "opening row groups segment file %s", path)
   457  	}
   458  
   459  	stats, err := r.file.Stat()
   460  	if err != nil {
   461  		return nil, errors.Wrapf(err, "getting stat of row groups segment file %s", path)
   462  	}
   463  
   464  	segmentParquet, err := parquet.OpenFile(r.file, stats.Size())
   465  	if err != nil {
   466  		return nil, errors.Wrapf(err, "reading parquet of row groups segment file %s", path)
   467  	}
   468  
   469  	rowGroups := segmentParquet.RowGroups()
   470  	if len(rowGroups) != 1 {
   471  		return nil, errors.Wrapf(err, "segement file expected to have exactly one row group (actual %d)", len(rowGroups))
   472  	}
   473  
   474  	r.RowGroup = rowGroups[0]
   475  
   476  	return r, nil
   477  }
   478  
   479  func (r *rowGroupOnDisk) RowGroups() []parquet.RowGroup {
   480  	return []parquet.RowGroup{r.RowGroup}
   481  }
   482  
   483  func (r *rowGroupOnDisk) Rows() parquet.Rows {
   484  	rows := r.RowGroup.Rows()
   485  	if len(r.seriesIndexes) == 0 {
   486  		return rows
   487  	}
   488  
   489  	return &seriesIDRowsRewriter{
   490  		Rows:          rows,
   491  		seriesIndexes: r.seriesIndexes,
   492  	}
   493  }
   494  
   495  func (r *rowGroupOnDisk) Close() error {
   496  	if err := r.file.Close(); err != nil {
   497  		return err
   498  	}
   499  
   500  	if err := os.Remove(r.file.Name()); err != nil {
   501  		return errors.Wrap(err, "deleting row group segment file")
   502  	}
   503  
   504  	return nil
   505  }
   506  
   507  func (r *rowGroupOnDisk) columnIter(ctx context.Context, columnName string, predicate query.Predicate, alias string) query.Iterator {
   508  	column, found := r.RowGroup.Schema().Lookup(columnName)
   509  	if !found {
   510  		return query.NewErrIterator(fmt.Errorf("column '%s' not found in head row group segment '%s'", columnName, r.file.Name()))
   511  	}
   512  	return query.NewSyncIterator(ctx, []parquet.RowGroup{r.RowGroup}, column.ColumnIndex, columnName, 1000, predicate, alias)
   513  }
   514  
// seriesIDRowsRewriter wraps parquet.Rows and rewrites the SeriesIndex
// column of every row it reads, mapping each row's global position to its
// series index via the row-range mapping produced when the index was written.
type seriesIDRowsRewriter struct {
	parquet.Rows
	pos           int64 // position of the next row to be read
	seriesIndexes rowRangesWithSeriesIndex
	searchHint    int // speed up getSeriesIndex()
}
   521  
   522  func (r *seriesIDRowsRewriter) SeekToRow(pos int64) error {
   523  	if err := r.Rows.SeekToRow(pos); err != nil {
   524  		return err
   525  	}
   526  	r.pos += pos
   527  	return nil
   528  }
   529  
// colIdxSeriesIndex is the column index of the SeriesIndex column in the
// profiles schema, resolved once at package initialization. The panic fires
// only if the schema lacks the column, which would be a programming error.
var colIdxSeriesIndex = func() int {
	p := &schemav1.ProfilePersister{}
	colIdx, found := p.Schema().Lookup("SeriesIndex")
	if !found {
		panic("column SeriesIndex not found")
	}
	return colIdx.ColumnIndex
}()
   538  
// ReadRows reads the next batch of rows from the wrapped reader and rewrites
// each row's SeriesIndex value according to the row's global row number.
func (r *seriesIDRowsRewriter) ReadRows(rows []parquet.Row) (int, error) {
	n, err := r.Rows.ReadRows(rows)
	if err != nil {
		// NOTE(review): when err != nil with n > 0 (e.g. io.EOF on the last
		// batch), rows[:n] are returned without the SeriesIndex rewrite —
		// confirm callers never consume rows in that case.
		return n, err
	}
	// sh for next call of getSeriesIndex; carried across rows so lookups
	// resume where the previous one left off.
	sh := r.searchHint
	for pos, row := range rows[:n] {
		// actual row num, i.e. global position within the row group
		rowNum := r.pos + int64(pos)
		row[colIdxSeriesIndex] = parquet.ValueOf(r.seriesIndexes.getSeriesIndex(rowNum, &sh)).Level(0, 0, colIdxSeriesIndex)
	}
	r.searchHint = sh
	r.pos += int64(n)

	return n, nil
}
   556  
   557  func copySlice[T any](in []T) []T {
   558  	out := make([]T, len(in))
   559  	copy(out, in)
   560  	return out
   561  }