github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/storage/storage_exemplars.go (about) 1 package storage 2 3 import ( 4 "bytes" 5 "context" 6 "errors" 7 "fmt" 8 "strconv" 9 "sync" 10 "time" 11 12 "github.com/dgraph-io/badger/v2" 13 "github.com/prometheus/client_golang/prometheus" 14 "github.com/sirupsen/logrus" 15 16 "github.com/pyroscope-io/pyroscope/pkg/storage/dict" 17 "github.com/pyroscope-io/pyroscope/pkg/storage/metadata" 18 "github.com/pyroscope-io/pyroscope/pkg/storage/segment" 19 "github.com/pyroscope-io/pyroscope/pkg/storage/tree" 20 "github.com/pyroscope-io/pyroscope/pkg/util/varint" 21 ) 22 23 // TODO(kolesnikovae): decouple from Storage. 24 25 const ( 26 exemplarDataPrefix Prefix = "v:" 27 exemplarTimestampPrefix Prefix = "t:" 28 exemplarsCurrentFormat = 2 29 30 defaultExemplarsBatchQueueSize = 5 31 defaultExemplarsBatchSize = 10 << 10 // 10K 32 defaultExemplarsBatchDuration = time.Second * 5 33 ) 34 35 type exemplars struct { 36 logger *logrus.Logger 37 config *Config 38 metrics *metrics 39 db BadgerDBWithCache 40 dicts BadgerDBWithCache 41 42 once sync.Once 43 mu sync.Mutex 44 currentBatch *exemplarsBatch 45 batches chan *exemplarsBatch 46 } 47 48 var ( 49 errBatchIsFull = errors.New("exemplars batch is full") 50 errProfileIDRequired = errors.New("profile id label required") 51 ) 52 53 type exemplarsBatch struct { 54 batchSize int 55 entries map[string]*exemplarEntry 56 config *Config 57 metrics *metrics 58 dicts BadgerDBWithCache 59 } 60 61 type exemplarEntry struct { 62 // DB exemplar key and its parts. 63 Key []byte 64 AppName string 65 ProfileID string 66 67 // Value. 68 StartTime int64 69 EndTime int64 70 Labels map[string]string 71 Tree *tree.Tree 72 } 73 74 func (e *exemplars) exemplarsQueueSize() int { 75 if e.config.exemplarsBatchQueueSize != 0 { 76 return e.config.exemplarsBatchQueueSize 77 } 78 return defaultExemplarsBatchQueueSize 79 } 80 81 func (e *exemplars) exemplarsBatchSize() int { 82 if e.config.exemplarsBatchSize != 0 { 83 return e.config.exemplarsBatchSize 84 } 85 return defaultExemplarsBatchSize 86 } 87 88 func (e *exemplars) exemplarsBatchDuration() time.Duration { 89 if e.config.exemplarsBatchDuration != 0 { 90 return e.config.exemplarsBatchDuration 91 } 92 return defaultExemplarsBatchDuration 93 } 94 95 func (e *exemplars) newExemplarsBatch() *exemplarsBatch { 96 batchSize := e.exemplarsBatchSize() 97 return &exemplarsBatch{ 98 batchSize: batchSize, 99 metrics: e.metrics, 100 config: e.config, 101 dicts: e.dicts, 102 entries: make(map[string]*exemplarEntry, batchSize), 103 } 104 } 105 106 func (s *Storage) initExemplarsStorage(db BadgerDBWithCache) { 107 e := exemplars{ 108 logger: s.logger, 109 config: s.config, 110 metrics: s.metrics, 111 dicts: s.dicts, 112 db: db, 113 } 114 115 e.batches = make(chan *exemplarsBatch, e.exemplarsQueueSize()) 116 e.currentBatch = e.newExemplarsBatch() 117 118 s.exemplars = &e 119 s.tasksWG.Add(1) 120 121 go func() { 122 retentionTicker := time.NewTicker(s.retentionTaskInterval) 123 batchFlushTicker := time.NewTicker(e.exemplarsBatchDuration()) 124 defer func() { 125 batchFlushTicker.Stop() 126 retentionTicker.Stop() 127 s.tasksWG.Done() 128 }() 129 for { 130 select { 131 default: 132 case batch, ok := <-e.batches: 133 if ok { 134 e.flush(batch) 135 } 136 } 137 138 select { 139 case <-s.stop: 140 e.logger.Debug("flushing batches queue") 141 e.flushBatchQueue() 142 return 143 144 case <-batchFlushTicker.C: 145 e.logger.Debug("flushing current batch") 146 e.mu.Lock() 147 e.flushCurrentBatch() 148 e.mu.Unlock() 149 150 case batch, ok := <-e.batches: 151 if ok { 152 e.flush(batch) 153 } 154 155 case <-retentionTicker.C: 156 s.exemplarsRetentionTask() 157 } 158 } 159 }() 160 } 161 162 func (e *exemplars) enforceRetentionPolicy(ctx context.Context, rp *segment.RetentionPolicy) { 163 observer := prometheus.ObserverFunc(e.metrics.exemplarsRetentionTaskDuration.Observe) 164 timer := prometheus.NewTimer(observer) 165 defer timer.ObserveDuration() 166 167 e.logger.Debug("enforcing exemplars retention policy") 168 err := e.truncateBefore(ctx, rp.ExemplarsRetentionTime) 169 switch { 170 case err == nil: 171 case errors.Is(ctx.Err(), context.Canceled): 172 e.logger.Warn("enforcing exemplars retention policy canceled") 173 default: 174 e.logger.WithError(err).Error("failed to enforce exemplars retention policy") 175 } 176 } 177 178 // exemplarKey creates a key in the v:{app_name}:{profile_id} format 179 func exemplarKey(appName, profileID string) []byte { 180 return exemplarDataPrefix.key(appName + ":" + profileID) 181 } 182 183 // parseExemplarTimestamp returns timestamp and the profile 184 // data key (in v:{app_name}:{profile_id} format), if the given timestamp key is valid. 185 func parseExemplarTimestamp(k []byte) (int64, []byte, bool) { 186 v, ok := exemplarTimestampPrefix.trim(k) 187 if !ok { 188 return 0, nil, false 189 } 190 i := bytes.IndexByte(v, ':') 191 if i < 0 { 192 return 0, nil, false 193 } 194 t, err := strconv.ParseInt(string(v[:i]), 10, 64) 195 if err != nil { 196 return 0, nil, false 197 } 198 return t, append(exemplarDataPrefix.bytes(), v[i+1:]...), true 199 } 200 201 func exemplarKeyToTimestampKey(k []byte, t int64) ([]byte, bool) { 202 if v, ok := exemplarDataPrefix.trim(k); ok { 203 return append(exemplarTimestampPrefix.key(strconv.FormatInt(t, 10)+":"), v...), true 204 } 205 return nil, false 206 } 207 208 func (e *exemplars) flushCurrentBatch() { 209 entries := len(e.currentBatch.entries) 210 if entries == 0 { 211 return 212 } 213 b := e.currentBatch 214 e.currentBatch = e.newExemplarsBatch() 215 select { 216 case e.batches <- b: 217 default: 218 e.metrics.exemplarsDiscardedTotal.Add(float64(entries)) 219 } 220 } 221 222 func (e *exemplars) Sync() { 223 e.mu.Lock() 224 defer e.mu.Unlock() 225 e.flush(e.currentBatch) 226 e.currentBatch = e.newExemplarsBatch() 227 n := len(e.batches) 228 var i int 229 for { 230 if i == n { 231 return 232 } 233 select { 234 default: 235 return 236 case b, ok := <-e.batches: 237 if !ok { 238 return 239 } 240 e.flush(b) 241 i++ 242 } 243 } 244 } 245 246 func (e *exemplars) flushBatchQueue() { 247 e.once.Do(func() { 248 e.flush(e.currentBatch) 249 close(e.batches) 250 for batch := range e.batches { 251 e.flush(batch) 252 } 253 }) 254 } 255 256 func (e *exemplars) flush(b *exemplarsBatch) { 257 if len(b.entries) == 0 { 258 return 259 } 260 e.logger.Debug("flushing completed batch") 261 err := e.db.Update(func(txn *badger.Txn) error { 262 for _, entry := range b.entries { 263 if err := b.writeExemplarToDB(txn, entry); err != nil { 264 return err 265 } 266 } 267 return nil 268 }) 269 270 if err != nil { 271 e.logger.WithError(err).Error("failed to write exemplars batch") 272 } 273 } 274 275 func (e *exemplars) insert(ctx context.Context, input *PutInput) error { 276 if input.Val == nil || input.Val.Samples() == 0 { 277 return nil 278 } 279 e.mu.Lock() 280 defer e.mu.Unlock() 281 err := e.currentBatch.insert(ctx, input) 282 if err == errBatchIsFull { 283 e.flushCurrentBatch() 284 return e.currentBatch.insert(ctx, input) 285 } 286 return err 287 } 288 289 func (e *exemplars) fetch(ctx context.Context, appName string, profileIDs []string, fn func(exemplarEntry) error) error { 290 d, ok := e.dicts.Lookup(appName) 291 if !ok { 292 return nil 293 } 294 dx := d.(*dict.Dict) 295 return e.db.View(func(txn *badger.Txn) error { 296 for _, profileID := range profileIDs { 297 if err := ctx.Err(); err != nil { 298 return err 299 } 300 k := exemplarKey(appName, profileID) 301 item, err := txn.Get(k) 302 switch { 303 default: 304 return err 305 case errors.Is(err, badger.ErrKeyNotFound): 306 case err == nil: 307 // TODO(kolesnikovae): Optimize: 308 // It makes sense to lookup the dictionary keys only after all 309 // exemplars fetched and merged. 310 err = item.Value(func(val []byte) error { 311 e.metrics.exemplarsReadBytes.Observe(float64(len(val))) 312 var x exemplarEntry 313 if err = x.Deserialize(dx, val); err != nil { 314 return err 315 } 316 x.Key = k 317 x.AppName = appName 318 x.ProfileID = profileID 319 return fn(x) 320 }) 321 if err != nil { 322 return err 323 } 324 } 325 } 326 return nil 327 }) 328 } 329 330 func (e *exemplars) truncateBefore(ctx context.Context, before time.Time) (err error) { 331 for more := true; more; { 332 select { 333 case <-ctx.Done(): 334 return ctx.Err() 335 case batch, ok := <-e.batches: 336 if ok { 337 e.flush(batch) 338 } 339 default: 340 if more, err = e.truncateN(before, defaultBatchSize); err != nil { 341 return err 342 } 343 } 344 } 345 return nil 346 } 347 348 func (e *exemplars) truncateN(before time.Time, count int) (bool, error) { 349 beforeTs := before.UnixNano() 350 keys := make([][]byte, 0, 2*count) 351 err := e.db.View(func(txn *badger.Txn) error { 352 it := txn.NewIterator(badger.IteratorOptions{ 353 Prefix: exemplarTimestampPrefix.bytes(), 354 }) 355 defer it.Close() 356 for it.Rewind(); it.Valid(); it.Next() { 357 if len(keys) == cap(keys) { 358 return nil 359 } 360 item := it.Item() 361 keyTs, exKey, ok := parseExemplarTimestamp(item.Key()) 362 if !ok { 363 continue 364 } 365 if keyTs > beforeTs { 366 break 367 } 368 keys = append(keys, item.KeyCopy(nil)) 369 keys = append(keys, exKey) 370 } 371 return nil 372 }) 373 374 if err != nil { 375 return false, err 376 } 377 if len(keys) == 0 { 378 return false, nil 379 } 380 381 batch := e.db.NewWriteBatch() 382 defer batch.Cancel() 383 for i := range keys { 384 if err = batch.Delete(keys[i]); err != nil { 385 return false, err 386 } 387 } 388 389 if err = batch.Flush(); err == nil { 390 e.metrics.exemplarsRemovedTotal.Add(float64(len(keys) / 2)) 391 } 392 393 return true, err 394 } 395 396 func (s *Storage) ensureAppSegmentExists(in *PutInput) error { 397 k := segment.AppSegmentKey(in.Key.AppName()) 398 r, err := s.segments.GetOrCreate(k) 399 if err != nil { 400 return fmt.Errorf("segments cache for %v: %w", k, err) 401 } 402 st := r.(*segment.Segment) 403 st.SetMetadata(metadata.Metadata{ 404 SpyName: in.SpyName, 405 SampleRate: in.SampleRate, 406 Units: in.Units, 407 AggregationType: in.AggregationType, 408 }) 409 s.segments.Put(k, st) 410 return err 411 } 412 413 func (b *exemplarsBatch) insert(_ context.Context, input *PutInput) error { 414 if len(b.entries) == b.batchSize { 415 return errBatchIsFull 416 } 417 profileID, ok := input.Key.ProfileID() 418 if !ok { 419 return errProfileIDRequired 420 } 421 appName := input.Key.AppName() 422 k := exemplarKey(appName, profileID) 423 key := string(k) 424 e, ok := b.entries[key] 425 if ok { 426 e.Tree.Merge(input.Val) 427 e.updateTime(input.StartTime.UnixNano(), input.EndTime.UnixNano()) 428 return nil 429 } 430 b.entries[key] = &exemplarEntry{ 431 Key: k, 432 AppName: appName, 433 ProfileID: profileID, 434 435 StartTime: input.StartTime.UnixNano(), 436 EndTime: input.EndTime.UnixNano(), 437 Labels: input.Key.Labels(), 438 Tree: input.Val, 439 } 440 return nil 441 } 442 443 func (b *exemplarsBatch) writeExemplarToDB(txn *badger.Txn, e *exemplarEntry) error { 444 k, ok := exemplarKeyToTimestampKey(e.Key, e.EndTime) 445 if !ok { 446 return fmt.Errorf("invalid exemplar key") 447 } 448 if err := txn.Set(k, nil); err != nil { 449 return err 450 } 451 d, err := b.dicts.GetOrCreate(e.AppName) 452 if err != nil { 453 return err 454 } 455 dx := d.(*dict.Dict) 456 457 item, err := txn.Get(e.Key) 458 switch { 459 default: 460 return err 461 case errors.Is(err, badger.ErrKeyNotFound): 462 // Fast path: there is no exemplar with this key in the database. 463 case err == nil: 464 // Merge with the found exemplar using the buffer provided. 465 // Ideally, we should also drop existing timestamp key and create a new one, 466 // so that the exemplar wouldn't be deleted before its actual EndTime passes 467 // the retention policy threshold. The time difference is negligible, therefore 468 // it's not happening: only the first EndTime is honored. 469 err = item.Value(func(val []byte) error { 470 b.metrics.exemplarsReadBytes.Observe(float64(len(val))) 471 var x exemplarEntry 472 if err = x.Deserialize(dx, val); err == nil { 473 e = x.Merge(e) 474 } 475 return err 476 }) 477 if err != nil { 478 return err 479 } 480 } 481 482 r, err := e.Serialize(dx, b.config.maxNodesSerialization) 483 if err != nil { 484 return err 485 } 486 if err = txn.Set(e.Key, r); err != nil { 487 return err 488 } 489 b.metrics.exemplarsWriteBytes.Observe(float64(len(r))) 490 return nil 491 } 492 493 func (e *exemplarEntry) Merge(src *exemplarEntry) *exemplarEntry { 494 e.updateTime(src.StartTime, src.EndTime) 495 e.Tree.Merge(src.Tree) 496 e.Key = src.Key 497 return e 498 } 499 500 func (e *exemplarEntry) updateTime(st, et int64) { 501 if st < e.StartTime { 502 e.StartTime = st 503 } 504 if et > e.EndTime { 505 e.EndTime = et 506 } 507 } 508 509 func (e *exemplarEntry) Serialize(d *dict.Dict, maxNodes int) ([]byte, error) { 510 b := bytes.NewBuffer(make([]byte, 0, 1<<10)) // 1 KB. 511 b.WriteByte(exemplarsCurrentFormat) // Version. 512 if err := e.Tree.SerializeTruncate(d, maxNodes, b); err != nil { 513 return nil, err 514 } 515 516 vw := varint.NewWriter() 517 _, _ = vw.Write(b, uint64(e.StartTime)) 518 _, _ = vw.Write(b, uint64(e.EndTime)) 519 520 // Strip profile_id and __name__ labels. 521 labels := make([]string, 0, len(e.Labels)*2) 522 for k, v := range e.Labels { 523 if k == segment.ProfileIDLabelName || k == "__name__" { 524 continue 525 } 526 labels = append(labels, k, v) 527 } 528 // Write labels as an array of string pairs. 529 _, _ = vw.Write(b, uint64(len(labels))) 530 for _, v := range labels { 531 bs := []byte(v) 532 _, _ = vw.Write(b, uint64(len(bs))) 533 _, _ = b.Write(bs) 534 } 535 536 return b.Bytes(), nil 537 } 538 539 func (e *exemplarEntry) Deserialize(d *dict.Dict, b []byte) error { 540 buf := bytes.NewBuffer(b) 541 v, err := buf.ReadByte() 542 if err != nil { 543 return err 544 } 545 switch v { 546 case 1: 547 return e.deserializeV1(d, buf) 548 case 2: 549 return e.deserializeV2(d, buf) 550 default: 551 return fmt.Errorf("unknown exemplar format version %d", v) 552 } 553 } 554 555 func (e *exemplarEntry) deserializeV1(d *dict.Dict, src *bytes.Buffer) error { 556 t, err := tree.Deserialize(d, src) 557 if err != nil { 558 return err 559 } 560 e.Tree = t 561 return nil 562 } 563 564 func (e *exemplarEntry) deserializeV2(d *dict.Dict, src *bytes.Buffer) error { 565 t, err := tree.Deserialize(d, src) 566 if err != nil { 567 return err 568 } 569 e.Tree = t 570 571 st, err := varint.Read(src) 572 if err != nil { 573 return err 574 } 575 e.StartTime = int64(st) 576 et, err := varint.Read(src) 577 if err != nil { 578 return err 579 } 580 e.EndTime = int64(et) 581 582 n, err := varint.Read(src) 583 if err != nil { 584 return err 585 } 586 if e.Labels == nil { 587 e.Labels = make(map[string]string, n) 588 } 589 var k string 590 for i := uint64(0); i < n; i++ { 591 m, err := varint.Read(src) 592 if err != nil { 593 return err 594 } 595 v := string(src.Next(int(m))) 596 if i%2 != 0 { 597 e.Labels[k] = v 598 } else { 599 k = v 600 } 601 } 602 603 return nil 604 }