github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/storage/wal/wal.go

// This directory was copied and adapted from https://github.com/grafana/agent/tree/main/pkg/metrics.
// We cannot vendor the agent in since the agent vendors loki in, which would cause a cyclic dependency.
// NOTE: many changes have been made to the original code for our use-case.
package wal

import (
	"context"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/model/exemplar"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/timestamp"
	"github.com/prometheus/prometheus/model/value"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb"
	"github.com/prometheus/prometheus/tsdb/chunks"
	"github.com/prometheus/prometheus/tsdb/record"
	"github.com/prometheus/prometheus/tsdb/wal"
	"go.uber.org/atomic"
)

// ErrWALClosed is an error returned when a WAL operation can't run because the
// storage has already been closed.
var ErrWALClosed = fmt.Errorf("WAL storage closed")

// Storage implements storage.Storage, and just writes to the WAL.
type Storage struct {
	// Embed Queryable/ChunkQueryable for compatibility, but don't actually implement it.
	storage.Queryable
	storage.ChunkQueryable

	// Operations against the WAL must be protected by a mutex so it doesn't get
	// closed in the middle of an operation. Other operations are concurrency-safe, so we
	// use a RWMutex to allow multiple usages of the WAL at once. If the WAL is closed, all
	// operations that change the WAL must fail.
	walMtx    sync.RWMutex
	walClosed bool

	path   string
	wal    *wal.WAL
	logger log.Logger

	appenderPool sync.Pool
	bufPool      sync.Pool

	ref    *atomic.Uint64
	series *stripeSeries

	deletedMtx sync.Mutex
	deleted    map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until.

	metrics *Metrics
}

// NewStorage makes a new Storage.
func NewStorage(logger log.Logger, metrics *Metrics, registerer prometheus.Registerer, path string) (*Storage, error) {
	w, err := wal.NewSize(logger, registerer, SubDirectory(path), wal.DefaultSegmentSize, true)
	if err != nil {
		return nil, err
	}

	storage := &Storage{
		path:    path,
		wal:     w,
		logger:  logger,
		deleted: map[chunks.HeadSeriesRef]int{},
		series:  newStripeSeries(),
		metrics: metrics,
		ref:     atomic.NewUint64(0),
	}

	storage.bufPool.New = func() interface{} {
		b := make([]byte, 0, 1024)
		return b
	}

	storage.appenderPool.New = func() interface{} {
		return &appender{
			w:         storage,
			series:    make([]record.RefSeries, 0, 100),
			samples:   make([]record.RefSample, 0, 100),
			exemplars: make([]record.RefExemplar, 0, 10),
		}
	}

	start := time.Now()
	if err := storage.replayWAL(); err != nil {
		metrics.TotalCorruptions.Inc()

		level.Warn(storage.logger).Log("msg", "encountered WAL read error, attempting repair", "err", err)
		if err := w.Repair(err); err != nil {
			metrics.TotalFailedRepairs.Inc()
			metrics.ReplayDuration.Observe(time.Since(start).Seconds())
			return nil, errors.Wrap(err, "repair corrupted WAL")
		}

		metrics.TotalSucceededRepairs.Inc()
	}

	metrics.ReplayDuration.Observe(time.Since(start).Seconds())

	go storage.recordSize()

	return storage, nil
}

func (w *Storage) replayWAL() error {
	w.walMtx.RLock()
	defer w.walMtx.RUnlock()

	if w.walClosed {
		return ErrWALClosed
	}

	level.Info(w.logger).Log("msg", "replaying WAL, this may take a while", "dir", w.wal.Dir())
	dir, startFrom, err := wal.LastCheckpoint(w.wal.Dir())
	if err != nil && err != record.ErrNotFound {
		return errors.Wrap(err, "find last checkpoint")
	}

	if err == nil {
		sr, err := wal.NewSegmentsReader(dir)
		if err != nil {
			return errors.Wrap(err, "open checkpoint")
		}
		defer func() {
			if err := sr.Close(); err != nil {
				level.Warn(w.logger).Log("msg", "error while closing the wal segments reader", "err", err)
			}
		}()

		// A corrupted checkpoint is a hard error for now and requires user
		// intervention. There's likely little data that can be recovered anyway.
		if err := w.loadWAL(wal.NewReader(sr)); err != nil {
			return errors.Wrap(err, "backfill checkpoint")
		}
		startFrom++
		level.Info(w.logger).Log("msg", "WAL checkpoint loaded")
	}

	// Find the last segment.
	_, last, err := wal.Segments(w.wal.Dir())
	if err != nil {
		return errors.Wrap(err, "finding WAL segments")
	}

	// Backfill segments from the most recent checkpoint onwards.
	for i := startFrom; i <= last; i++ {
		s, err := wal.OpenReadSegment(wal.SegmentName(w.wal.Dir(), i))
		if err != nil {
			return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
		}

		sr := wal.NewSegmentBufReader(s)
		err = w.loadWAL(wal.NewReader(sr))
		if err := sr.Close(); err != nil {
			level.Warn(w.logger).Log("msg", "error while closing the wal segments reader", "err", err)
		}
		if err != nil {
			return err
		}
		level.Info(w.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", last)
	}

	return nil
}

func (w *Storage) loadWAL(r *wal.Reader) (err error) {
	var dec record.Decoder

	var (
		decoded    = make(chan interface{}, 10)
		errCh      = make(chan error, 1)
		seriesPool = sync.Pool{
			New: func() interface{} {
				return []record.RefSeries{}
			},
		}
		samplesPool = sync.Pool{
			New: func() interface{} {
				return []record.RefSample{}
			},
		}
	)

	go func() {
		defer close(decoded)
		for r.Next() {
			rec := r.Record()
			switch dec.Type(rec) {
			case record.Series:
				series := seriesPool.Get().([]record.RefSeries)[:0]
				series, err = dec.Series(rec, series)
				if err != nil {
					errCh <- &wal.CorruptionErr{
						Err:     errors.Wrap(err, "decode series"),
						Segment: r.Segment(),
						Offset:  r.Offset(),
					}
					return
				}
				decoded <- series
			case record.Samples:
				samples := samplesPool.Get().([]record.RefSample)[:0]
				samples, err = dec.Samples(rec, samples)
				if err != nil {
					errCh <- &wal.CorruptionErr{
						Err:     errors.Wrap(err, "decode samples"),
						Segment: r.Segment(),
						Offset:  r.Offset(),
					}
				}
				decoded <- samples
			case record.Tombstones, record.Exemplars:
				// We don't care about decoding tombstones or exemplars
				continue
			default:
				errCh <- &wal.CorruptionErr{
					Err:     errors.Errorf("invalid record type %v", dec.Type(rec)),
					Segment: r.Segment(),
					Offset:  r.Offset(),
				}
				return
			}
		}
	}()

	biggestRef := chunks.HeadSeriesRef(w.ref.Load())

	for d := range decoded {
		switch v := d.(type) {
		case []record.RefSeries:
			for _, s := range v {
				// If this is a new series, create it in memory without a timestamp.
				// If we read in a sample for it, we'll use the timestamp of the latest
				// sample. Otherwise, the series is stale and will be deleted once
				// the truncation is performed.
				if w.series.getByID(s.Ref) == nil {
					series := &memSeries{ref: s.Ref, lset: s.Labels, lastTs: 0}
					w.series.set(s.Labels.Hash(), series)

					w.metrics.NumActiveSeries.Inc()
					w.metrics.TotalCreatedSeries.Inc()

					if biggestRef <= s.Ref {
						biggestRef = s.Ref
					}
				}
			}

			//nolint:staticcheck
			seriesPool.Put(v)
		case []record.RefSample:
			for _, s := range v {
				// Update the lastTs for the series based on the timestamp of this sample.
				series := w.series.getByID(s.Ref)
				if series == nil {
					level.Warn(w.logger).Log("msg", "found sample referencing non-existing series, skipping")
					continue
				}

				series.Lock()
				if s.T > series.lastTs {
					series.lastTs = s.T
				}
				series.Unlock()
			}

			//nolint:staticcheck
			samplesPool.Put(v)
		default:
			panic(fmt.Errorf("unexpected decoded type: %T", d))
		}
	}

	w.ref.Store(uint64(biggestRef))

	select {
	case err := <-errCh:
		return err
	default:
	}

	if r.Err() != nil {
		return errors.Wrap(r.Err(), "read records")
	}

	return nil
}

// Directory returns the path where the WAL storage is held.
func (w *Storage) Directory() string {
	return w.path
}

// Appender returns a new appender against the storage.
func (w *Storage) Appender(_ context.Context) storage.Appender {
	return w.appenderPool.Get().(storage.Appender)
}

// StartTime always returns 0, nil. It is implemented for compatibility with
// Prometheus, but is unused in the agent.
func (*Storage) StartTime() (int64, error) {
	return 0, nil
}

// Truncate removes all data from the WAL prior to the timestamp specified by
// mint.
func (w *Storage) Truncate(mint int64) error {
	w.walMtx.RLock()
	defer w.walMtx.RUnlock()

	if w.walClosed {
		return ErrWALClosed
	}

	start := time.Now()

	// Garbage collect series that haven't received an update since mint.
	w.gc(mint)
	level.Info(w.logger).Log("msg", "series GC completed", "duration", time.Since(start))

	first, last, err := wal.Segments(w.wal.Dir())
	if err != nil {
		return errors.Wrap(err, "get segment range")
	}

	// Start a new segment, so low ingestion volume instances don't have more WAL
	// than needed.
	err = w.wal.NextSegment()
	if err != nil {
		return errors.Wrap(err, "next segment")
	}

	last-- // Never consider last segment for checkpoint.
	if last < 0 {
		return nil // no segments yet.
	}

	// The lower two thirds of segments should contain mostly obsolete samples.
	// If we have fewer than two segments, it's not worth checkpointing yet.
	last = first + (last-first)*2/3
	if last <= first {
		return nil
	}

	keep := func(id chunks.HeadSeriesRef) bool {
		if w.series.getByID(id) != nil {
			return true
		}

		w.deletedMtx.Lock()
		_, ok := w.deleted[id]
		w.deletedMtx.Unlock()
		return ok
	}
	if _, err = wal.Checkpoint(w.logger, w.wal, first, last, keep, mint); err != nil {
		return errors.Wrap(err, "create checkpoint")
	}
	if err := w.wal.Truncate(last + 1); err != nil {
		// If truncating fails, we'll just try again at the next checkpoint.
		// Leftover segments will just be ignored in the future if there's a checkpoint
		// that supersedes them.
		level.Error(w.logger).Log("msg", "truncating segments failed", "err", err)
	}

	// The checkpoint is written and segments before it are truncated, so we no
	// longer need to track deleted series that are before it.
	w.deletedMtx.Lock()
	for ref, segment := range w.deleted {
		if segment < first {
			delete(w.deleted, ref)
			w.metrics.TotalRemovedSeries.Inc()
		}
	}
	w.metrics.NumDeletedSeries.Set(float64(len(w.deleted)))
	w.deletedMtx.Unlock()

	if err := wal.DeleteCheckpoints(w.wal.Dir(), last); err != nil {
		// Leftover old checkpoints do not cause problems down the line beyond
		// occupying disk space.
		// They will just be ignored since a higher checkpoint exists.
		level.Error(w.logger).Log("msg", "delete old checkpoints", "err", err)
	}

	level.Info(w.logger).Log("msg", "WAL checkpoint complete",
		"first", first, "last", last, "duration", time.Since(start))
	return nil
}

// gc removes data before the minimum timestamp from the head.
func (w *Storage) gc(mint int64) {
	deleted := w.series.gc(mint)
	w.metrics.NumActiveSeries.Sub(float64(len(deleted)))

	_, last, _ := wal.Segments(w.wal.Dir())
	w.deletedMtx.Lock()
	defer w.deletedMtx.Unlock()

	// We want to keep series records for any newly deleted series
	// until we've passed the last recorded segment. The WAL will
	// still contain sample records with all of the ref IDs until
	// the segment's samples have been deleted from the checkpoint.
	//
	// If the series weren't kept on startup when the WAL was replayed,
	// the samples wouldn't be able to be used since there wouldn't
	// be any labels for that ref ID.
	for ref := range deleted {
		w.deleted[ref] = last
	}

	w.metrics.NumDeletedSeries.Set(float64(len(w.deleted)))
}

// WriteStalenessMarkers appends a staleness sample for all active series.
func (w *Storage) WriteStalenessMarkers(remoteTsFunc func() int64) error {
	var lastErr error
	var lastTs int64

	app := w.Appender(context.Background())
	it := w.series.iterator()
	for series := range it.Channel() {
		var (
			ref  = series.ref
			lset = series.lset
		)

		ts := timestamp.FromTime(time.Now())
		_, err := app.Append(storage.SeriesRef(ref), lset, ts, math.Float64frombits(value.StaleNaN))
		if err != nil {
			lastErr = err
		}

		// Remove millisecond precision; the remote write timestamp we get
		// only has second precision.
		lastTs = (ts / 1000) * 1000
	}

	if lastErr == nil {
		if err := app.Commit(); err != nil {
			return fmt.Errorf("failed to commit staleness markers: %w", err)
		}

		// Wait for remote write to write the lastTs, but give up after 1m
		level.Info(w.logger).Log("msg", "waiting for remote write to write staleness markers...")

		stopCh := time.After(1 * time.Minute)
		start := time.Now()

	Outer:
		for {
			select {
			case <-stopCh:
				level.Error(w.logger).Log("msg", "timed out waiting for staleness markers to be written")
				break Outer
			default:
				writtenTs := remoteTsFunc()
				if writtenTs >= lastTs {
					duration := time.Since(start)
					level.Info(w.logger).Log("msg", "remote write wrote staleness markers", "duration", duration)
					break Outer
				}

				level.Info(w.logger).Log("msg", "remote write hasn't written staleness markers yet", "remoteTs", writtenTs, "lastTs", lastTs)

				// Wait a bit before reading again
				time.Sleep(5 * time.Second)
			}
		}
	}

	return lastErr
}

// Close closes the storage and all its underlying resources.
func (w *Storage) Close() error {
	w.walMtx.Lock()
	defer w.walMtx.Unlock()

	if w.walClosed {
		return fmt.Errorf("already closed")
	}
	w.walClosed = true

	if w.metrics != nil {
		w.metrics.Unregister()
	}
	return w.wal.Close()
}

func (w *Storage) recordSize() {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		size, err := dirSize(w.path)
		if err != nil {
			level.Debug(w.logger).Log("msg", "could not calculate WAL disk size", "path", w.path, "err", err)
			continue
		}
		w.metrics.DiskSize.Set(float64(size))
	}
}

func dirSize(path string) (int64, error) {
	var size int64
	err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if !info.IsDir() {
			size += info.Size()
		}

		return err
	})

	return size, err
}

type appender struct {
	w         *Storage
	series    []record.RefSeries
	samples   []record.RefSample
	exemplars []record.RefExemplar
}

func (a *appender) Append(ref storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
	series := a.w.series.getByID(chunks.HeadSeriesRef(ref))
	if series == nil {
		// Ensure no empty or duplicate labels have gotten through. This mirrors the
		// equivalent validation code in the TSDB's headAppender.
		l = l.WithoutEmpty()
		if len(l) == 0 {
			return 0, errors.Wrap(tsdb.ErrInvalidSample, "empty labelset")
		}

		if lbl, dup := l.HasDuplicateLabelNames(); dup {
			return 0, errors.Wrap(tsdb.ErrInvalidSample, fmt.Sprintf(`label name "%s" is not unique`, lbl))
		}

		var created bool
		series, created = a.getOrCreate(l)
		if created {
			a.series = append(a.series, record.RefSeries{
				Ref:    series.ref,
				Labels: l,
			})

			a.w.metrics.NumActiveSeries.Inc()
			a.w.metrics.TotalCreatedSeries.Inc()
		}
	}

	series.Lock()
	defer series.Unlock()

	// Update last recorded timestamp. Used by Storage.gc to determine if a
	// series is stale.
	series.updateTs(t)

	a.samples = append(a.samples, record.RefSample{
		Ref: series.ref,
		T:   t,
		V:   v,
	})

	a.w.metrics.TotalAppendedSamples.Inc()
	return storage.SeriesRef(series.ref), nil
}

func (a *appender) getOrCreate(l labels.Labels) (series *memSeries, created bool) {
	hash := l.Hash()

	series = a.w.series.getByHash(hash, l)
	if series != nil {
		return series, false
	}

	series = &memSeries{ref: chunks.HeadSeriesRef(a.w.ref.Inc()), lset: l}
	a.w.series.set(l.Hash(), series)
	return series, true
}

func (a *appender) AppendExemplar(ref storage.SeriesRef, _ labels.Labels, e exemplar.Exemplar) (storage.SeriesRef, error) {
	s := a.w.series.getByID(chunks.HeadSeriesRef(ref))
	if s == nil {
		return 0, fmt.Errorf("unknown series ref when trying to add exemplar: %d", ref)
	}

	// Ensure no empty labels have gotten through.
	e.Labels = e.Labels.WithoutEmpty()

	if lbl, dup := e.Labels.HasDuplicateLabelNames(); dup {
		return 0, errors.Wrap(tsdb.ErrInvalidExemplar, fmt.Sprintf(`label name "%s" is not unique`, lbl))
	}

	// Exemplar label length does not include chars involved in text rendering such as quotes,
	// equals signs, or commas. See the definition of the const ExemplarMaxLabelSetLength.
	labelSetLen := 0
	for _, l := range e.Labels {
		labelSetLen += utf8.RuneCountInString(l.Name)
		labelSetLen += utf8.RuneCountInString(l.Value)

		if labelSetLen > exemplar.ExemplarMaxLabelSetLength {
			return 0, storage.ErrExemplarLabelLength
		}
	}

	a.exemplars = append(a.exemplars, record.RefExemplar{
		Ref:    chunks.HeadSeriesRef(ref),
		T:      e.Ts,
		V:      e.Value,
		Labels: e.Labels,
	})

	return storage.SeriesRef(s.ref), nil
}

// Commit submits the collected samples and purges the batch.
func (a *appender) Commit() error {
	a.w.walMtx.RLock()
	defer a.w.walMtx.RUnlock()

	if a.w.walClosed {
		return ErrWALClosed
	}

	var encoder record.Encoder
	buf := a.w.bufPool.Get().([]byte)

	if len(a.series) > 0 {
		buf = encoder.Series(a.series, buf)
		if err := a.w.wal.Log(buf); err != nil {
			return err
		}
		buf = buf[:0]
	}

	if len(a.samples) > 0 {
		buf = encoder.Samples(a.samples, buf)
		if err := a.w.wal.Log(buf); err != nil {
			return err
		}
		buf = buf[:0]
	}

	if len(a.exemplars) > 0 {
		buf = encoder.Exemplars(a.exemplars, buf)
		if err := a.w.wal.Log(buf); err != nil {
			return err
		}
		buf = buf[:0]
	}

	//nolint:staticcheck
	a.w.bufPool.Put(buf)

	for _, sample := range a.samples {
		series := a.w.series.getByID(sample.Ref)
		if series != nil {
			series.Lock()
			series.pendingCommit = false
			series.Unlock()
		}
	}

	return a.Rollback()
}

func (a *appender) Rollback() error {
	a.series = a.series[:0]
	a.samples = a.samples[:0]
	a.exemplars = a.exemplars[:0]
	a.w.appenderPool.Put(a)
	return nil
}
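
// The sketch below is an editorial illustration and is not part of the code copied
// from the agent: it shows how a caller might plausibly wire the pieces above
// together - open the WAL-backed storage, append one sample through a pooled
// appender, and truncate data older than a retention window. The function name
// exampleUsage, the directory argument, the label values, and the two-hour window
// are all assumptions made for the sake of the example.
func exampleUsage(logger log.Logger, reg prometheus.Registerer, metrics *Metrics, dir string) error {
	// NewStorage replays (and, if needed, repairs) any existing WAL under dir.
	s, err := NewStorage(logger, metrics, reg, dir)
	if err != nil {
		return err
	}
	defer s.Close()

	// Appenders come from a pool; Commit (via Rollback) returns them to it.
	app := s.Appender(context.Background())
	lbls := labels.FromStrings("__name__", "example_metric", "job", "ruler")
	if _, err := app.Append(0, lbls, timestamp.FromTime(time.Now()), 1); err != nil {
		_ = app.Rollback()
		return err
	}
	if err := app.Commit(); err != nil {
		return err
	}

	// Garbage collect series and checkpoint segments whose samples are older
	// than two hours (an assumed retention for this sketch).
	mint := timestamp.FromTime(time.Now().Add(-2 * time.Hour))
	return s.Truncate(mint)
}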