package phlaredb

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"sync"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/runutil"
	"github.com/parquet-go/parquet-go"
	"github.com/pkg/errors"
	"go.uber.org/atomic"

	phlaremodel "github.com/grafana/pyroscope/pkg/model"
	phlareparquet "github.com/grafana/pyroscope/pkg/parquet"
	"github.com/grafana/pyroscope/pkg/phlaredb/block"
	"github.com/grafana/pyroscope/pkg/phlaredb/query"
	schemav1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1"
	phlarecontext "github.com/grafana/pyroscope/pkg/pyroscope/context"
	"github.com/grafana/pyroscope/pkg/util/build"
)

const (
	// parquetWriteBufferSize is the parquet page buffer size used for
	// every writer created by newParquetProfileWriter.
	parquetWriteBufferSize = 3 << 20 // 3MB
)

// profileStore accumulates profiles in memory and periodically cuts them
// into on-disk parquet row group segments. At block flush time the
// segments are merged into a single parquet file and the in-memory index
// is written alongside it.
type profileStore struct {
	// size is the in-memory size of profiles currently held in slice;
	// it is decremented when a row group is cut to disk.
	size atomic.Uint64
	// totalSize is the cumulative size of all ingested profiles; it is
	// never decremented (see Size vs MemorySize).
	totalSize atomic.Uint64

	logger  log.Logger
	cfg     *ParquetConfig
	metrics *headMetrics

	// path is the directory where segment files, the final parquet file
	// and the index file are written.
	path      string
	persister schemav1.Persister[*schemav1.Profile]
	writer    *parquet.GenericWriter[*schemav1.Profile]

	// lock serializes appends to the slice. Every new profile is appended
	// to the slice and to the index (has its own lock). In practice, it's
	// only purpose is to accommodate the parquet writer: slice is never
	// accessed for reads.
	profilesLock sync.Mutex
	slice        []schemav1.InMemoryProfile

	// Rows lock synchronises access to the on-disk row groups.
	// When the in-memory index (profiles) is being flushed on disk,
	// it should be modified simultaneously with rowGroups.
	// Store readers only access rowGroups and index.
	rowsLock    sync.RWMutex
	rowsFlushed uint64
	rowGroups   []*rowGroupOnDisk
	index       *profilesIndex

	// flushing is set while a cut request is queued/in progress so that
	// ingest does not enqueue a second concurrent cut.
	flushing   *atomic.Bool
	flushQueue chan int // channel to signal that a flush is needed for slice[:n]
	closeOnce  sync.Once
	flushWg    sync.WaitGroup
	// flushBuffer and flushBufferLbs are reusable scratch buffers for
	// loadProfilesToFlush, avoiding per-cut allocations.
	flushBuffer    []schemav1.InMemoryProfile
	flushBufferLbs []phlaremodel.Labels
	// onFlush, when set, is invoked after every cut attempt in
	// cutRowGroupLoop (used as a test/notification hook).
	onFlush func()
}

// newParquetProfileWriter returns a generic parquet writer for profiles,
// configured with the store's page buffer size, build metadata and the
// profiles schema. Caller-supplied options are applied first.
func newParquetProfileWriter(writer io.Writer, options ...parquet.WriterOption) *parquet.GenericWriter[*schemav1.Profile] {
	options = append(options, parquet.PageBufferSize(parquetWriteBufferSize))
	options = append(options, parquet.CreatedBy("github.com/grafana/pyroscope/", build.Version, build.Revision))
	options = append(options, schemav1.ProfilesSchema)
	return parquet.NewGenericWriter[*schemav1.Profile](
		writer, options...,
	)
}

// newProfileStore creates a profile store and starts its background
// cut-row-group loop. The store is not usable for flushing to disk until
// Init is called with a path and config.
func newProfileStore(phlarectx context.Context) *profileStore {
	s := &profileStore{
		logger:     phlarecontext.Logger(phlarectx),
		metrics:    contextHeadMetrics(phlarectx),
		persister:  &schemav1.ProfilePersister{},
		flushing:   atomic.NewBool(false),
		flushQueue: make(chan int),
	}
	s.flushWg.Add(1)
	go s.cutRowGroupLoop()
	// Initialize writer on /dev/null
	// TODO: Reuse parquet.Writer beyond life time of the head.
	s.writer = newParquetProfileWriter(io.Discard)

	return s
}

// Name returns the persister's name; it is used as the base name of the
// on-disk parquet file(s).
func (s *profileStore) Name() string {
	return s.persister.Name()
}

// Size returns the cumulative size of all profiles ever ingested into
// this store (never decremented).
func (s *profileStore) Size() uint64 {
	return s.totalSize.Load()
}

// MemorySize returns the size of profiles currently held in memory
// (decremented as row groups are cut to disk).
func (s *profileStore) MemorySize() uint64 {
	return s.size.Load()
}

// Init resets the store: it stops the previous flush loop (via Close),
// restarts it, creates a fresh in-memory index and truncates the profile
// slice. It does not delete previously written row groups.
func (s *profileStore) Init(path string, cfg *ParquetConfig, metrics *headMetrics) (err error) {
	// close previous iteration
	if err := s.Close(); err != nil {
		return err
	}
	s.flushQueue = make(chan int)
	s.closeOnce = sync.Once{}
	s.flushWg.Add(1)
	go s.cutRowGroupLoop()

	// create index
	// NOTE(review): this passes the previous s.metrics, while the new
	// metrics argument is only assigned a few lines below — confirm the
	// index is meant to observe the old metrics instance.
	s.index, err = newProfileIndex(32, s.metrics)
	if err != nil {
		return err
	}

	s.path = path
	s.cfg = cfg
	s.metrics = metrics

	s.slice = s.slice[:0]

	s.rowsFlushed = 0

	return nil
}

// Close stops the background cut loop and waits for it to drain. It is
// safe to call multiple times (guarded by closeOnce); it does not flush
// pending in-memory profiles.
func (s *profileStore) Close() error {
	if s.flushQueue != nil {
		s.closeOnce.Do(func() {
			close(s.flushQueue)
		})

		s.flushWg.Wait()
	}
	return nil
}

// RowGroups returns the on-disk row groups as the generic parquet
// interface type.
// NOTE(review): s.rowGroups is read without holding rowsLock here;
// callers appear to rely on external synchronization — confirm.
func (s *profileStore) RowGroups() (rowGroups []parquet.RowGroup) {
	rowGroups = make([]parquet.RowGroup, len(s.rowGroups))
	for pos := range rowGroups {
		rowGroups[pos] = s.rowGroups[pos]
	}
	return rowGroups
}

// Flush writes row groups and the index to files on disk.
// The call is thread-safe for reading but adding new profiles
// should not be allowed during and after the call.
158 func (s *profileStore) Flush(ctx context.Context) (numRows uint64, numRowGroups uint64, err error) { 159 if err := s.Close(); err != nil { 160 return 0, 0, err 161 } 162 if err = s.cutRowGroup(len(s.slice)); err != nil { 163 return 0, 0, err 164 } 165 166 indexPath := filepath.Join( 167 s.path, 168 block.IndexFilename, 169 ) 170 171 rowRangerPerRG, err := s.index.writeTo(ctx, indexPath) 172 if err != nil { 173 return 0, 0, err 174 } 175 176 parquetPath := filepath.Join( 177 s.path, 178 s.persister.Name()+block.ParquetSuffix, 179 ) 180 181 s.rowsLock.Lock() 182 for idx, ranges := range rowRangerPerRG { 183 s.rowGroups[idx].seriesIndexes = ranges 184 } 185 s.rowsLock.Unlock() 186 numRows, numRowGroups, err = s.writeRowGroups(parquetPath, s.RowGroups()) 187 if err != nil { 188 return 0, 0, err 189 } 190 // Row groups are closed and removed on an explicit DeleteRowGroups call. 191 return numRows, numRowGroups, nil 192 } 193 194 func (s *profileStore) DeleteRowGroups() error { 195 s.rowsLock.Lock() 196 defer s.rowsLock.Unlock() 197 for _, rg := range s.rowGroups { 198 if err := rg.Close(); err != nil { 199 return err 200 } 201 } 202 s.rowGroups = s.rowGroups[:0] 203 return nil 204 } 205 206 func (s *profileStore) prepareFile(path string) (f *os.File, err error) { 207 file, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o644) 208 if err != nil { 209 return nil, err 210 } 211 s.writer.Reset(file) 212 213 return file, err 214 } 215 216 // cutRowGroups gets called, when a patrticular row group has been finished 217 // and it will flush it to disk. The caller of cutRowGroups should be holding 218 // the write lock. 219 // 220 // Writes are not allowed during cutting the rows, but readers are not blocked 221 // during the most of the time: only after the rows are written to disk do we 222 // block them for a short time (via rowsLock). 223 // 224 // TODO(kolesnikovae): Make the lock more selective. 
The call takes long time, 225 // if disk I/O is slow, which causes ingestion timeouts and impacts distributor 226 // push latency, and memory consumption, transitively. 227 // See index.cutRowGroup: we could find a way to not flush all the in-memory 228 // profiles, including ones added since the start of the call, but only those 229 // that were added before certain point (this call). The same for s.slice. 230 func (s *profileStore) cutRowGroup(count int) (err error) { 231 // if cutRowGroup fails record it as failed segment 232 defer func() { 233 if err != nil { 234 s.metrics.writtenProfileSegments.WithLabelValues("failed").Inc() 235 } 236 }() 237 238 size := s.loadProfilesToFlush(count) 239 if len(s.flushBuffer) == 0 { 240 return nil 241 } 242 243 path := filepath.Join( 244 s.path, 245 fmt.Sprintf("%s.%d%s", s.persister.Name(), s.rowsFlushed, block.ParquetSuffix), 246 ) 247 // Removes the file if it exists. This can happen if the previous 248 // cut attempt failed. 249 if err := os.Remove(path); err == nil { 250 level.Warn(s.logger).Log("msg", "deleting row group segment of a failed previous attempt", "path", path) 251 } 252 f, err := s.prepareFile(path) 253 if err != nil { 254 return err 255 } 256 257 n, err := parquet.CopyRows(s.writer, schemav1.NewInMemoryProfilesRowReader(s.flushBuffer)) 258 if err != nil { 259 return errors.Wrap(err, "write row group segments to disk") 260 } 261 262 if err := s.writer.Close(); err != nil { 263 return errors.Wrap(err, "close row group segment writer") 264 } 265 266 if err := f.Close(); err != nil { 267 return errors.Wrap(err, "closing row group segment file") 268 } 269 s.metrics.writtenProfileSegments.WithLabelValues("success").Inc() 270 271 // get row group segment size on disk 272 if stat, err := f.Stat(); err == nil { 273 s.metrics.writtenProfileSegmentsBytes.Observe(float64(stat.Size())) 274 } 275 276 rowGroup, err := newRowGroupOnDisk(path) 277 if err != nil { 278 return err 279 } 280 281 // We need to make the new 
on-disk row group available to readers 282 // simultaneously with cutting the series from the index. Until that, 283 // profiles can be read from s.slice/s.index. This lock should not be 284 // held for long as it only performs in-memory operations, 285 // although blocking readers. 286 s.rowsLock.Lock() 287 // After the lock is released, rows/profiles should be read from the disk. 288 defer s.rowsLock.Unlock() 289 s.rowsFlushed += uint64(n) 290 s.rowGroups = append(s.rowGroups, rowGroup) 291 // Cutting the index is relatively quick op (no I/O). 292 err = s.index.cutRowGroup(s.flushBuffer) 293 294 s.profilesLock.Lock() 295 defer s.profilesLock.Unlock() 296 for i := range s.slice[:count] { 297 s.metrics.samples.Sub(float64(len(s.slice[i].Samples.StacktraceIDs))) 298 } 299 // reset slice and metrics 300 s.slice = copySlice(s.slice[count:]) 301 currentSize := s.size.Sub(size) 302 if err != nil { 303 return err 304 } 305 306 level.Debug(s.logger).Log("msg", "cut row group segment", "path", path, "numProfiles", n) 307 s.metrics.sizeBytes.WithLabelValues(s.Name()).Set(float64(currentSize)) 308 return nil 309 } 310 311 type byLabels struct { 312 p []schemav1.InMemoryProfile 313 lbs []phlaremodel.Labels 314 } 315 316 func (b byLabels) Len() int { return len(b.p) } 317 func (b byLabels) Swap(i, j int) { 318 b.p[i], b.p[j] = b.p[j], b.p[i] 319 b.lbs[i], b.lbs[j] = b.lbs[j], b.lbs[i] 320 } 321 322 func (by byLabels) Less(i, j int) bool { 323 // first compare the labels, if they don't match return 324 var ( 325 pI = by.p[i] 326 pJ = by.p[j] 327 lbsI = by.lbs[i] 328 lbsJ = by.lbs[j] 329 ) 330 if cmp := phlaremodel.CompareLabelPairs(lbsI, lbsJ); cmp != 0 { 331 return cmp < 0 332 } 333 334 // then compare timenanos, if they don't match return 335 if pI.TimeNanos < pJ.TimeNanos { 336 return true 337 } else if pI.TimeNanos > pJ.TimeNanos { 338 return false 339 } 340 341 // finally use ID as tie breaker 342 return bytes.Compare(pI.ID[:], pJ.ID[:]) < 0 343 } 344 345 // 
loadProfilesToFlush loads and sort profiles to flush into flushBuffer and returns the size of the profiles. 346 func (s *profileStore) loadProfilesToFlush(count int) uint64 { 347 if cap(s.flushBuffer) < count { 348 s.flushBuffer = make([]schemav1.InMemoryProfile, 0, count) 349 } 350 if cap(s.flushBufferLbs) < count { 351 s.flushBufferLbs = make([]phlaremodel.Labels, 0, count) 352 } 353 s.flushBufferLbs = s.flushBufferLbs[:0] 354 s.flushBuffer = s.flushBuffer[:0] 355 s.profilesLock.Lock() 356 s.index.mutex.RLock() 357 for i := 0; i < count; i++ { 358 profile := s.slice[i] 359 s.flushBuffer = append(s.flushBuffer, profile) 360 s.flushBufferLbs = append(s.flushBufferLbs, s.index.profilesPerFP[profile.SeriesFingerprint].lbs) 361 } 362 s.profilesLock.Unlock() 363 s.index.mutex.RUnlock() 364 // order profiles properly 365 sort.Sort(byLabels{p: s.flushBuffer, lbs: s.flushBufferLbs}) 366 var size uint64 367 for _, p := range s.flushBuffer { 368 size += p.Size() 369 } 370 return size 371 } 372 373 func (s *profileStore) writeRowGroups(path string, rowGroups []parquet.RowGroup) (n uint64, numRowGroups uint64, err error) { 374 fileCloser, err := s.prepareFile(path) 375 if err != nil { 376 return 0, 0, err 377 } 378 defer runutil.CloseWithErrCapture(&err, fileCloser, "closing parquet file") 379 readers := make([]parquet.RowReader, len(rowGroups)) 380 for i, rg := range rowGroups { 381 readers[i] = rg.Rows() 382 } 383 n, numRowGroups, err = phlareparquet.CopyAsRowGroups(s.writer, schemav1.NewMergeProfilesRowReader(readers), s.cfg.MaxBufferRowCount) 384 if err != nil { 385 return 0, 0, err 386 } 387 388 if err := s.writer.Close(); err != nil { 389 return 0, 0, err 390 } 391 392 s.rowsFlushed += n 393 394 return n, numRowGroups, nil 395 } 396 397 func (s *profileStore) ingest(_ context.Context, profiles []schemav1.InMemoryProfile, lbs phlaremodel.Labels, profileName string) error { 398 s.profilesLock.Lock() 399 defer s.profilesLock.Unlock() 400 401 for pos, p := range profiles { 
402 if !s.flushing.Load() { 403 // check if row group is full 404 if s.cfg.MaxBufferRowCount > 0 && len(s.slice) >= s.cfg.MaxBufferRowCount || 405 s.cfg.MaxRowGroupBytes > 0 && s.size.Load() >= s.cfg.MaxRowGroupBytes { 406 s.flushing.Store(true) 407 s.flushQueue <- len(s.slice) 408 } 409 } 410 411 // add profile to the index 412 s.index.Add(&p, lbs, profileName) 413 414 // increase size of stored data 415 addedBytes := profiles[pos].Size() 416 s.metrics.sizeBytes.WithLabelValues(s.Name()).Set(float64(s.size.Add(addedBytes))) 417 s.totalSize.Add(addedBytes) 418 419 // add to slice 420 s.slice = append(s.slice, p) 421 s.metrics.samples.Add(float64(len(p.Samples.StacktraceIDs))) 422 423 } 424 425 return nil 426 } 427 428 func (s *profileStore) cutRowGroupLoop() { 429 defer s.flushWg.Done() 430 for n := range s.flushQueue { 431 if err := s.cutRowGroup(n); err != nil { 432 level.Error(s.logger).Log("msg", "cutting row group", "err", err) 433 } 434 s.flushing.Store(false) 435 if s.onFlush != nil { 436 s.onFlush() 437 } 438 } 439 } 440 441 type rowGroupOnDisk struct { 442 parquet.RowGroup 443 file *os.File 444 seriesIndexes rowRangesWithSeriesIndex 445 } 446 447 func newRowGroupOnDisk(path string) (*rowGroupOnDisk, error) { 448 var ( 449 r = &rowGroupOnDisk{} 450 err error 451 ) 452 453 // now open the row group file, so we are able to read the row group back in 454 r.file, err = os.Open(path) 455 if err != nil { 456 return nil, errors.Wrapf(err, "opening row groups segment file %s", path) 457 } 458 459 stats, err := r.file.Stat() 460 if err != nil { 461 return nil, errors.Wrapf(err, "getting stat of row groups segment file %s", path) 462 } 463 464 segmentParquet, err := parquet.OpenFile(r.file, stats.Size()) 465 if err != nil { 466 return nil, errors.Wrapf(err, "reading parquet of row groups segment file %s", path) 467 } 468 469 rowGroups := segmentParquet.RowGroups() 470 if len(rowGroups) != 1 { 471 return nil, errors.Wrapf(err, "segement file expected to have exactly 
one row group (actual %d)", len(rowGroups)) 472 } 473 474 r.RowGroup = rowGroups[0] 475 476 return r, nil 477 } 478 479 func (r *rowGroupOnDisk) RowGroups() []parquet.RowGroup { 480 return []parquet.RowGroup{r.RowGroup} 481 } 482 483 func (r *rowGroupOnDisk) Rows() parquet.Rows { 484 rows := r.RowGroup.Rows() 485 if len(r.seriesIndexes) == 0 { 486 return rows 487 } 488 489 return &seriesIDRowsRewriter{ 490 Rows: rows, 491 seriesIndexes: r.seriesIndexes, 492 } 493 } 494 495 func (r *rowGroupOnDisk) Close() error { 496 if err := r.file.Close(); err != nil { 497 return err 498 } 499 500 if err := os.Remove(r.file.Name()); err != nil { 501 return errors.Wrap(err, "deleting row group segment file") 502 } 503 504 return nil 505 } 506 507 func (r *rowGroupOnDisk) columnIter(ctx context.Context, columnName string, predicate query.Predicate, alias string) query.Iterator { 508 column, found := r.RowGroup.Schema().Lookup(columnName) 509 if !found { 510 return query.NewErrIterator(fmt.Errorf("column '%s' not found in head row group segment '%s'", columnName, r.file.Name())) 511 } 512 return query.NewSyncIterator(ctx, []parquet.RowGroup{r.RowGroup}, column.ColumnIndex, columnName, 1000, predicate, alias) 513 } 514 515 type seriesIDRowsRewriter struct { 516 parquet.Rows 517 pos int64 518 seriesIndexes rowRangesWithSeriesIndex 519 searchHint int // speed up getSeriesIndex() 520 } 521 522 func (r *seriesIDRowsRewriter) SeekToRow(pos int64) error { 523 if err := r.Rows.SeekToRow(pos); err != nil { 524 return err 525 } 526 r.pos += pos 527 return nil 528 } 529 530 var colIdxSeriesIndex = func() int { 531 p := &schemav1.ProfilePersister{} 532 colIdx, found := p.Schema().Lookup("SeriesIndex") 533 if !found { 534 panic("column SeriesIndex not found") 535 } 536 return colIdx.ColumnIndex 537 }() 538 539 func (r *seriesIDRowsRewriter) ReadRows(rows []parquet.Row) (int, error) { 540 n, err := r.Rows.ReadRows(rows) 541 if err != nil { 542 return n, err 543 } 544 // sh for next call of 
getSeriesIndex 545 sh := r.searchHint 546 for pos, row := range rows[:n] { 547 // actual row num 548 rowNum := r.pos + int64(pos) 549 row[colIdxSeriesIndex] = parquet.ValueOf(r.seriesIndexes.getSeriesIndex(rowNum, &sh)).Level(0, 0, colIdxSeriesIndex) 550 } 551 r.searchHint = sh 552 r.pos += int64(n) 553 554 return n, nil 555 } 556 557 func copySlice[T any](in []T) []T { 558 out := make([]T, len(in)) 559 copy(out, in) 560 return out 561 }