// github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/storage/stores/tsdb/head_manager.go

package tsdb

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/cespare/xxhash"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/pkg/errors"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/tsdb/chunks"
	"github.com/prometheus/prometheus/tsdb/record"
	"go.uber.org/atomic"

	"github.com/grafana/loki/pkg/storage/chunk"
	"github.com/grafana/loki/pkg/storage/chunk/client/util"
	"github.com/grafana/loki/pkg/storage/stores/index/stats"
	"github.com/grafana/loki/pkg/storage/stores/tsdb/index"
	"github.com/grafana/loki/pkg/util/wal"
)

/*
period is a duration which the ingesters use to group index writes into a (WAL, TenantHeads) pair.
After each period elapses, a set of zero or more multitenant TSDB indices are built (one per
index bucket, generally 24h).

It's important to note that this cycle occurs in real time as opposed to the timestamps of
chunk entries. Index writes during the `period` may span multiple index buckets. Periods
also expose some helper functions to get the remainder-less offset integer for that period,
which we use in file creation/etc.
*/
type period time.Duration

const defaultRotationPeriod = period(15 * time.Minute)

func (p period) PeriodFor(t time.Time) int {
	return int(t.UnixNano() / int64(p))
}

func (p period) TimeForPeriod(n int) time.Time {
	return time.Unix(0, int64(p)*int64(n))
}

// Do not specify without bit shifting. This allows us to
// do shard index calculations via bitwise & rather than modulos.
const defaultHeadManagerStripeSize = 1 << 7
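
// Illustrative sketch, not part of the original file: how the period helpers
// and the power-of-two stripe size above fit together. PeriodFor truncates a
// wall-clock time to its period offset, TimeForPeriod recovers the start of
// that window, and because the stripe size is a power of two a tenant shard
// can be chosen with a bitwise AND rather than a modulo. The function name is
// hypothetical and exists only for demonstration.
func examplePeriodAndShard(userID string, t time.Time) (time.Time, uint64) {
	p := defaultRotationPeriod
	windowStart := p.TimeForPeriod(p.PeriodFor(t)) // start of t's 15m window
	shard := xxhash.Sum64String(userID) & uint64(defaultHeadManagerStripeSize-1)
	return windowStart, shard
}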
/*
HeadManager both accepts flushed chunk writes
and exposes the index interface for multiple tenants.
It also handles updating an underlying WAL and periodically
rotates both the tenant Heads and the underlying WAL, using
the old versions to build + upload TSDB files.

On disk, it looks like:

tsdb/
	v1/
		# scratch directory used for temp tsdb files during build stage
		scratch/
		# wal directory used to store WALs being written on the ingester.
		# These are eventually shipped to storage as multi-tenant TSDB files
		# and compacted into per tenant indices
		wal/
			<timestamp>
		# multitenant tsdb files which are created on the ingesters/shipped
		multitenant/
			<timestamp>-<ingester-name>.tsdb
		per_tenant/
			# post-compaction tenant tsdbs which are grouped per
			# period bucket
			<tenant>/
				<bucket>/
					index-<from>-<through>-<checksum>.tsdb
*/
type HeadManager struct {
	log     log.Logger
	dir     string
	metrics *Metrics

	// RLocked for all writes/reads,
	// Locked before rotating heads/wal
	mtx sync.RWMutex

	// how often WALs should be rotated and TSDBs cut
	period period

	tsdbManager  TSDBManager
	active, prev *headWAL

	shards                 int
	activeHeads, prevHeads *tenantHeads

	Index
}

func NewHeadManager(logger log.Logger, dir string, metrics *Metrics, tsdbManager TSDBManager) *HeadManager {
	shards := defaultHeadManagerStripeSize
	m := &HeadManager{
		log:         log.With(logger, "component", "tsdb-head-manager"),
		dir:         dir,
		metrics:     metrics,
		tsdbManager: tsdbManager,

		period: defaultRotationPeriod,
		shards: shards,
	}

	m.Index = LazyIndex(func() (Index, error) {
		m.mtx.RLock()
		defer m.mtx.RUnlock()

		var indices []Index
		if m.prevHeads != nil {
			indices = append(indices, m.prevHeads)
		}
		if m.activeHeads != nil {
			indices = append(indices, m.activeHeads)
		}

		return NewMultiIndex(indices...)
	})

	return m
}

func (m *HeadManager) Stop() error {
	m.mtx.Lock()
	defer m.mtx.Unlock()
	if err := m.active.Stop(); err != nil {
		return err
	}

	return m.buildTSDBFromWAL(m.active.initialized)
}

func (m *HeadManager) Append(userID string, ls labels.Labels, chks index.ChunkMetas) error {
	// TSDB doesn't need the __name__="log" convention the old chunk store index used.
	// We must create a copy of the labels here to avoid mutating the existing
	// labels when writing across index buckets.
	b := labels.NewBuilder(ls)
	b.Del(labels.MetricName)
	ls = b.Labels()

	m.mtx.RLock()
	now := time.Now()
	if m.period.PeriodFor(now) > m.period.PeriodFor(m.activeHeads.start) {
		m.mtx.RUnlock()
		if err := m.Rotate(now); err != nil {
			return errors.Wrap(err, "rotating TSDB Head")
		}
		m.mtx.RLock()
	}
	defer m.mtx.RUnlock()
	rec := m.activeHeads.Append(userID, ls, chks)
	return m.active.Log(rec)
}
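
// Illustrative sketch, not part of the original file: the expected write path.
// Callers hand the HeadManager flushed chunk metadata per tenant; if the wall
// clock has crossed into a new period since the active heads were created,
// Append drops its read lock, rotates the head/WAL pair, and re-acquires the
// lock before logging the record, so callers never rotate explicitly. Here
// headManager and logger are hypothetical caller-owned values.
//
//	if err := headManager.Append(userID, ls, chks); err != nil {
//		level.Error(logger).Log("msg", "failed appending chunks to tsdb head", "err", err)
//	}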
func (m *HeadManager) Start() error {
	if err := os.RemoveAll(filepath.Join(m.dir, "scratch")); err != nil {
		return errors.Wrap(err, "removing tsdb scratch dir")
	}

	for _, d := range managerRequiredDirs(m.dir) {
		if err := util.EnsureDirectory(d); err != nil {
			return errors.Wrapf(err, "ensuring required directory exists: %s", d)
		}
	}

	walsByPeriod, err := walsByPeriod(m.dir, m.period)
	if err != nil {
		return err
	}
	level.Info(m.log).Log("msg", "loaded wals by period", "groups", len(walsByPeriod))

	// Load the shipper with any previously built TSDBs
	if err := m.tsdbManager.Start(); err != nil {
		return errors.Wrap(err, "failed to start tsdb manager")
	}

	// Build any old WALs into a TSDB for the shipper
	var allWALs []WALIdentifier
	for _, group := range walsByPeriod {
		allWALs = append(allWALs, group.wals...)
	}

	now := time.Now()
	if err := m.tsdbManager.BuildFromWALs(
		now,
		allWALs,
	); err != nil {
		return errors.Wrap(err, "building tsdb")
	}

	if err := os.RemoveAll(managerWalDir(m.dir)); err != nil {
		return errors.Wrap(err, "cleaning (removing) wal dir")
	}

	return m.Rotate(now)
}

func managerRequiredDirs(parent string) []string {
	return []string{
		managerScratchDir(parent),
		managerWalDir(parent),
		managerMultitenantDir(parent),
		managerPerTenantDir(parent),
	}
}

func managerScratchDir(parent string) string {
	return filepath.Join(parent, "scratch")
}

func managerWalDir(parent string) string {
	return filepath.Join(parent, "wal")
}

func managerMultitenantDir(parent string) string {
	return filepath.Join(parent, "multitenant")
}

func managerPerTenantDir(parent string) string {
	return filepath.Join(parent, "per_tenant")
}

func (m *HeadManager) Rotate(t time.Time) error {
	m.mtx.Lock()
	defer m.mtx.Unlock()

	if m.activeHeads != nil && m.period.PeriodFor(t) == m.period.PeriodFor(m.activeHeads.start) {
		// no-op, we've already rotated to the desired period
		return nil
	}

	// create new wal
	nextWALPath := walPath(m.dir, t)
	nextWAL, err := newHeadWAL(m.log, nextWALPath, t)
	if err != nil {
		return errors.Wrapf(err, "creating tsdb wal: %s during rotation", nextWALPath)
	}

	// create new tenant heads
	nextHeads := newTenantHeads(t, m.shards, m.metrics, m.log)

	stopPrev := func(s string) {
		if m.prev != nil {
			if err := m.prev.Stop(); err != nil {
				level.Error(m.log).Log(
					"msg", "failed stopping wal",
					"period", m.period.PeriodFor(m.prev.initialized),
					"err", err,
					"wal", s,
				)
			}
		}
	}

	stopPrev("previous cycle") // stop the previous wal if it hasn't been cleaned up yet
	m.prev = m.active
	m.prevHeads = m.activeHeads
	m.active = nextWAL
	m.activeHeads = nextHeads
	stopPrev("freshly rotated") // stop the newly rotated-out wal

	// build tsdb from rotated-out period
	// TODO(owen-d): don't block Append() waiting for tsdb building. Use a work channel/etc
	if m.prev != nil {
		if err := m.buildTSDBFromWAL(m.prev.initialized); err != nil {
			return errors.Wrap(err, "building tsdb from rotated out period")
		}
	}

	// Now that the tsdbManager has the updated TSDBs, we can remove our references
	m.prevHeads = nil
	m.prev = nil
	return nil
}
func (m *HeadManager) buildTSDBFromWAL(t time.Time) error {
	level.Debug(m.log).Log("msg", "combining tsdb WALs")
	grp, _, err := walsForPeriod(m.dir, m.period, m.period.PeriodFor(t))
	if err != nil {
		return errors.Wrap(err, "listing wals")
	}
	level.Debug(m.log).Log("msg", "listed WALs", "pd", grp.period, "n", len(grp.wals))

	// TODO(owen-d): It's probably faster to build this from the *tenantHeads instead,
	// but we already need to impl BuildFromWALs to ensure we can correctly build/ship
	// TSDBs from orphaned WALs of previous periods during startup.
	// We use the same timestamp as the wal here for the filename to ensure it can't clobber
	// an existing file from a previous cycle. I don't think this is possible, but
	// perhaps in some unusual crashlooping it could be, so let's be safe and protect ourselves.
	if err := m.tsdbManager.BuildFromWALs(t, grp.wals); err != nil {
		return errors.Wrapf(err, "building TSDB from prevHeads WALs for period %d", grp.period)
	}

	// Now that a TSDB has been created from this group, it's safe to remove its WALs
	if err := m.removeWALGroup(grp); err != nil {
		return errors.Wrapf(err, "removing prev TSDB WALs for period %d", grp.period)
	}
	level.Debug(m.log).Log("msg", "removing wals", "pd", grp.period, "n", len(grp.wals))

	return nil
}

type WalGroup struct {
	period int
	wals   []WALIdentifier
}

func walsByPeriod(dir string, period period) ([]WalGroup, error) {
	groupsMap, err := walGroups(dir, period)
	if err != nil {
		return nil, err
	}

	res := make([]WalGroup, 0, len(groupsMap))
	for _, grp := range groupsMap {
		res = append(res, *grp)
	}
	// Ensure the earliest periods are seen first
	sort.Slice(res, func(i, j int) bool {
		return res[i].period < res[j].period
	})
	return res, nil
}

func walGroups(dir string, period period) (map[int]*WalGroup, error) {
	files, err := ioutil.ReadDir(managerWalDir(dir))
	if err != nil {
		return nil, err
	}

	groupsMap := map[int]*WalGroup{}

	for _, f := range files {
		if id, ok := parseWALPath(f.Name()); ok {
			pd := period.PeriodFor(id.ts)
			grp, ok := groupsMap[pd]
			if !ok {
				grp = &WalGroup{
					period: pd,
				}
				groupsMap[pd] = grp
			}
			grp.wals = append(grp.wals, id)
		}
	}

	for _, grp := range groupsMap {
		// Ensure the earliest wals are seen first
		sort.Slice(grp.wals, func(i, j int) bool {
			return grp.wals[i].ts.Before(grp.wals[j].ts)
		})
	}
	return groupsMap, nil
}

func walsForPeriod(dir string, period period, offset int) (WalGroup, bool, error) {
	groupsMap, err := walGroups(dir, period)
	if err != nil {
		return WalGroup{}, false, err
	}

	grp, ok := groupsMap[offset]
	if !ok {
		return WalGroup{}, false, nil
	}

	return *grp, true, nil
}

func (m *HeadManager) removeWALGroup(grp WalGroup) error {
	for _, wal := range grp.wals {
		if err := os.RemoveAll(walPath(m.dir, wal.ts)); err != nil {
			return errors.Wrapf(err, "removing tsdb wal: %s", walPath(m.dir, wal.ts))
		}
	}
	return nil
}

func walPath(parent string, t time.Time) string {
	return filepath.Join(
		managerWalDir(parent),
		fmt.Sprintf("%d", t.Unix()),
	)
}
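
// Illustrative sketch, not part of the original file: a WAL directory entry is
// named after the Unix-second timestamp of the rotation that created it, so
// walPath and parseWALPath (below) invert each other, and walGroups can bucket
// entries by the rotation period that produced them. The function name is
// hypothetical and exists only for demonstration.
func exampleWALNaming(parent string, t time.Time) (string, int) {
	p := walPath(parent, t) // e.g. <parent>/wal/<unix-seconds>
	id, ok := parseWALPath(filepath.Base(p))
	if !ok {
		return p, -1
	}
	return p, defaultRotationPeriod.PeriodFor(id.ts) // period bucket the WAL belongs to
}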
// recoverHead recovers from all WALs belonging to some period
// and inserts their records into the active *tenantHeads
func recoverHead(dir string, heads *tenantHeads, wals []WALIdentifier) error {
	for _, id := range wals {
		// use anonymous function for ease of cleanup
		if err := func(id WALIdentifier) error {
			reader, closer, err := wal.NewWalReader(walPath(dir, id.ts), -1)
			if err != nil {
				return err
			}
			defer closer.Close()

			// map of users -> ref -> series.
			// Keep track of which ref corresponds to which series
			// for each WAL so we replay into the correct series
			seriesMap := make(map[string]map[uint64]labels.Labels)

			for reader.Next() {
				rec := &WALRecord{}
				if err := decodeWALRecord(reader.Record(), rec); err != nil {
					return err
				}

				// labels are always written to the WAL before corresponding chunks
				if len(rec.Series.Labels) > 0 {
					tenant, ok := seriesMap[rec.UserID]
					if !ok {
						tenant = make(map[uint64]labels.Labels)
						seriesMap[rec.UserID] = tenant
					}
					tenant[uint64(rec.Series.Ref)] = rec.Series.Labels
				}

				if len(rec.Chks.Chks) > 0 {
					tenant, ok := seriesMap[rec.UserID]
					if !ok {
						return errors.New("found tsdb chunk metas without user in WAL replay")
					}
					ls, ok := tenant[rec.Chks.Ref]
					if !ok {
						return errors.New("found tsdb chunk metas without series in WAL replay")
					}
					_ = heads.Append(rec.UserID, ls, rec.Chks.Chks)
				}
			}
			return reader.Err()
		}(id); err != nil {
			return errors.Wrap(
				err,
				"error recovering from TSDB WAL",
			)
		}
	}
	return nil
}

type WALIdentifier struct {
	ts time.Time
}

func parseWALPath(p string) (id WALIdentifier, ok bool) {
	ts, err := strconv.Atoi(p)
	if err != nil {
		return
	}

	return WALIdentifier{
		ts: time.Unix(int64(ts), 0),
	}, true
}

type tenantHeads struct {
	mint, maxt atomic.Int64 // easy lookup for Bounds() impl

	start       time.Time
	shards      int
	locks       []sync.RWMutex
	tenants     []map[string]*Head
	log         log.Logger
	chunkFilter chunk.RequestChunkFilterer
	metrics     *Metrics
}

func newTenantHeads(start time.Time, shards int, metrics *Metrics, logger log.Logger) *tenantHeads {
	res := &tenantHeads{
		start:   start,
		shards:  shards,
		locks:   make([]sync.RWMutex, shards),
		tenants: make([]map[string]*Head, shards),
		log:     log.With(logger, "component", "tenant-heads"),
		metrics: metrics,
	}
	for i := range res.tenants {
		res.tenants[i] = make(map[string]*Head)
	}
	return res
}

func (t *tenantHeads) Append(userID string, ls labels.Labels, chks index.ChunkMetas) *WALRecord {
	var mint, maxt int64
	for _, chk := range chks {
		if chk.MinTime < mint || mint == 0 {
			mint = chk.MinTime
		}

		if chk.MaxTime > maxt {
			maxt = chk.MaxTime
		}
	}
	updateMintMaxt(mint, maxt, &t.mint, &t.maxt)

	head := t.getOrCreateTenantHead(userID)
	newStream, refID := head.Append(ls, chks)

	rec := &WALRecord{
		UserID: userID,
		Chks: ChunkMetasRecord{
			Ref:  refID,
			Chks: chks,
		},
	}

	if newStream {
		rec.Series = record.RefSeries{
			Ref:    chunks.HeadSeriesRef(refID),
			Labels: ls,
		}
	}

	return rec
}
func (t *tenantHeads) getOrCreateTenantHead(userID string) *Head {
	idx := t.shardForTenant(userID)
	mtx := &t.locks[idx]

	// return existing tenant head if it exists
	mtx.RLock()
	head, ok := t.tenants[idx][userID]
	mtx.RUnlock()
	if ok {
		return head
	}

	mtx.Lock()
	defer mtx.Unlock()

	// tenant head was not found before.
	// Check again if a competing request created the head already, don't create it again if so.
	head, ok = t.tenants[idx][userID]
	if !ok {
		head = NewHead(userID, t.metrics, t.log)
		t.tenants[idx][userID] = head
	}

	return head
}

func (t *tenantHeads) shardForTenant(userID string) uint64 {
	return xxhash.Sum64String(userID) & uint64(t.shards-1)
}

func (t *tenantHeads) Close() error { return nil }

func (t *tenantHeads) SetChunkFilterer(chunkFilter chunk.RequestChunkFilterer) {
	t.chunkFilter = chunkFilter
}

func (t *tenantHeads) Bounds() (model.Time, model.Time) {
	return model.Time(t.mint.Load()), model.Time(t.maxt.Load())
}

func (t *tenantHeads) tenantIndex(userID string, from, through model.Time) (idx Index, ok bool) {
	i := t.shardForTenant(userID)
	t.locks[i].RLock()
	defer t.locks[i].RUnlock()
	tenant, ok := t.tenants[i][userID]
	if !ok {
		return
	}

	idx = NewTSDBIndex(tenant.indexRange(int64(from), int64(through)))
	if t.chunkFilter != nil {
		idx.SetChunkFilterer(t.chunkFilter)
	}
	return idx, true
}

func (t *tenantHeads) GetChunkRefs(ctx context.Context, userID string, from, through model.Time, res []ChunkRef, shard *index.ShardAnnotation, matchers ...*labels.Matcher) ([]ChunkRef, error) {
	idx, ok := t.tenantIndex(userID, from, through)
	if !ok {
		return nil, nil
	}
	return idx.GetChunkRefs(ctx, userID, from, through, nil, shard, matchers...)
}

// Series follows the same semantics regarding the passed slice and shard as GetChunkRefs.
func (t *tenantHeads) Series(ctx context.Context, userID string, from, through model.Time, res []Series, shard *index.ShardAnnotation, matchers ...*labels.Matcher) ([]Series, error) {
	idx, ok := t.tenantIndex(userID, from, through)
	if !ok {
		return nil, nil
	}
	return idx.Series(ctx, userID, from, through, nil, shard, matchers...)
}

func (t *tenantHeads) LabelNames(ctx context.Context, userID string, from, through model.Time, matchers ...*labels.Matcher) ([]string, error) {
	idx, ok := t.tenantIndex(userID, from, through)
	if !ok {
		return nil, nil
	}
	return idx.LabelNames(ctx, userID, from, through, matchers...)
}

func (t *tenantHeads) LabelValues(ctx context.Context, userID string, from, through model.Time, name string, matchers ...*labels.Matcher) ([]string, error) {
	idx, ok := t.tenantIndex(userID, from, through)
	if !ok {
		return nil, nil
	}
	return idx.LabelValues(ctx, userID, from, through, name, matchers...)
}

func (t *tenantHeads) Stats(ctx context.Context, userID string, from, through model.Time, blooms *stats.Blooms, shard *index.ShardAnnotation, matchers ...*labels.Matcher) (*stats.Blooms, error) {
	idx, ok := t.tenantIndex(userID, from, through)
	if !ok {
		return blooms, nil
	}
	return idx.Stats(ctx, userID, from, through, blooms, shard, matchers...)
}
// helper only used in building TSDBs
func (t *tenantHeads) forAll(fn func(user string, ls labels.Labels, chks index.ChunkMetas) error) error {
	for i, shard := range t.tenants {
		t.locks[i].RLock()
		defer t.locks[i].RUnlock()

		for user, tenant := range shard {
			idx := tenant.Index()
			ps, err := postingsForMatcher(idx, nil, labels.MustNewMatcher(labels.MatchEqual, "", ""))
			if err != nil {
				return err
			}

			for ps.Next() {
				var (
					ls   labels.Labels
					chks []index.ChunkMeta
				)

				_, err := idx.Series(ps.At(), &ls, &chks)
				if err != nil {
					return errors.Wrapf(err, "iterating postings for tenant: %s", user)
				}

				if err := fn(user, ls, chks); err != nil {
					return err
				}
			}
		}
	}

	return nil
}
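
// Illustrative sketch, not part of the original file: how a caller such as a
// TSDB builder could drain a rotated-out tenantHeads with forAll, visiting
// every (tenant, series, chunk metas) triple exactly once. countEntries is a
// hypothetical helper used only to show the callback shape.
func countEntries(heads *tenantHeads) (int, error) {
	var n int
	err := heads.forAll(func(user string, ls labels.Labels, chks index.ChunkMetas) error {
		n += len(chks) // one entry per chunk meta recorded for this series
		return nil
	})
	return n, err
}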