github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/tsdb/index/index.go (about) 1 // Copyright 2017 The Prometheus Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package index 15 16 import ( 17 "bufio" 18 "bytes" 19 "context" 20 "encoding/binary" 21 "fmt" 22 "hash" 23 "hash/crc32" 24 "io" 25 "math" 26 "os" 27 "path/filepath" 28 "sort" 29 "unsafe" 30 31 "github.com/pkg/errors" 32 "github.com/prometheus/common/model" 33 "github.com/prometheus/prometheus/model/labels" 34 "github.com/prometheus/prometheus/storage" 35 tsdb_enc "github.com/prometheus/prometheus/tsdb/encoding" 36 "github.com/prometheus/prometheus/tsdb/fileutil" 37 38 typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1" 39 phlaremodel "github.com/grafana/pyroscope/pkg/model" 40 "github.com/grafana/pyroscope/pkg/phlaredb/block" 41 "github.com/grafana/pyroscope/pkg/phlaredb/tsdb/encoding" 42 ) 43 44 const ( 45 // MagicIndex 4 bytes at the head of an index file. 46 MagicIndex = 0xBAAAD700 47 // HeaderLen represents number of bytes reserved of index for header. 48 HeaderLen = 5 49 50 // FormatV1 represents 1 version of index. 51 FormatV1 = 1 52 // FormatV2 represents 2 version of index. 53 FormatV2 = 2 54 55 IndexFilename = "index" 56 57 // store every 1024 series' fingerprints in the fingerprint offsets table 58 fingerprintInterval = 1 << 10 59 ) 60 61 type indexWriterStage uint8 62 63 const ( 64 idxStageNone indexWriterStage = iota 65 idxStageSymbols 66 idxStageSeries 67 idxStageDone 68 ) 69 70 func (s indexWriterStage) String() string { 71 switch s { 72 case idxStageNone: 73 return "none" 74 case idxStageSymbols: 75 return "symbols" 76 case idxStageSeries: 77 return "series" 78 case idxStageDone: 79 return "done" 80 } 81 return "<unknown>" 82 } 83 84 // The table gets initialized with sync.Once but may still cause a race 85 // with any other use of the crc32 package anywhere. Thus we initialize it 86 // before. 87 var castagnoliTable *crc32.Table 88 89 func init() { 90 castagnoliTable = crc32.MakeTable(crc32.Castagnoli) 91 } 92 93 // newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the 94 // polynomial may be easily changed in one location at a later time, if necessary. 95 func newCRC32() hash.Hash32 { 96 return crc32.New(castagnoliTable) 97 } 98 99 type symbolCacheEntry struct { 100 index uint32 101 lastValue string 102 lastValueIndex uint32 103 } 104 105 // Writer implements the IndexWriter interface for the standard 106 // serialization format. 107 type Writer struct { 108 ctx context.Context 109 110 // For the main index file. 111 f *FileWriter 112 113 // Temporary file for postings. 114 fP *FileWriter 115 // Temporary file for posting offsets table. 116 fPO *FileWriter 117 cntPO uint64 118 119 toc TOC 120 stage indexWriterStage 121 postingsStart uint64 // Due to padding, can differ from TOC entry. 122 123 // Reusable memory. 124 buf1 encoding.Encbuf 125 buf2 encoding.Encbuf 126 127 numSymbols int 128 symbols *Symbols 129 symbolFile *fileutil.MmapFile 130 lastSymbol string 131 symbolCache map[string]symbolCacheEntry 132 133 labelIndexes []labelIndexHashEntry // Label index offsets. 134 labelNames map[string]uint64 // Label names, and their usage. 135 // Keeps track of the fingerprint/offset for every n series 136 fingerprintOffsets FingerprintOffsets 137 138 // Hold last series to validate that clients insert new series in order. 139 lastSeries phlaremodel.Labels 140 lastSeriesHash uint64 141 lastRef storage.SeriesRef 142 143 crc32 hash.Hash 144 145 Version int 146 } 147 148 // TOC represents index Table Of Content that states where each section of index starts. 149 type TOC struct { 150 Symbols uint64 151 Series uint64 152 LabelIndices uint64 153 LabelIndicesTable uint64 154 Postings uint64 155 PostingsTable uint64 156 FingerprintOffsets uint64 157 Metadata Metadata 158 } 159 160 // Metadata is TSDB-level metadata 161 type Metadata struct { 162 From, Through int64 163 Checksum uint32 164 } 165 166 func (m *Metadata) EnsureBounds(from, through int64) { 167 if m.From == 0 || from < m.From { 168 m.From = from 169 } 170 171 if m.Through == 0 || through > m.Through { 172 m.Through = through 173 } 174 } 175 176 // NewTOCFromByteSlice return parsed TOC from given index byte slice. 177 func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) { 178 if bs.Len() < indexTOCLen { 179 return nil, tsdb_enc.ErrInvalidSize 180 } 181 b := bs.Range(bs.Len()-indexTOCLen, bs.Len()) 182 183 expCRC := binary.BigEndian.Uint32(b[len(b)-4:]) 184 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b[:len(b)-4]}) 185 if d.Crc32(castagnoliTable) != expCRC { 186 return nil, errors.Wrap(tsdb_enc.ErrInvalidChecksum, "read TOC") 187 } 188 189 if err := d.Err(); err != nil { 190 return nil, err 191 } 192 193 return &TOC{ 194 Symbols: d.Be64(), 195 Series: d.Be64(), 196 LabelIndices: d.Be64(), 197 LabelIndicesTable: d.Be64(), 198 Postings: d.Be64(), 199 PostingsTable: d.Be64(), 200 FingerprintOffsets: d.Be64(), 201 Metadata: Metadata{ 202 From: d.Be64int64(), 203 Through: d.Be64int64(), 204 Checksum: expCRC, 205 }, 206 }, nil 207 } 208 209 // NewWriter returns a new Writer to the given filename. It serializes data in format version 2. 210 func NewWriter(ctx context.Context, fn string) (*Writer, error) { 211 return NewWriterSize(ctx, fn, 4<<20) 212 } 213 214 func NewWriterSize(ctx context.Context, fn string, bufferSize int) (*Writer, error) { 215 dir := filepath.Dir(fn) 216 217 df, err := fileutil.OpenDir(dir) 218 if err != nil { 219 return nil, err 220 } 221 defer df.Close() // Close for platform windows. 222 223 if err := os.RemoveAll(fn); err != nil { 224 return nil, errors.Wrap(err, "remove any existing index at path") 225 } 226 227 // Main index file we are building. 228 f, err := NewFileWriter(fn, bufferSize) 229 if err != nil { 230 return nil, err 231 } 232 // Temporary file for postings. 233 fP, err := NewFileWriter(fn+"_tmp_p", bufferSize) 234 if err != nil { 235 return nil, err 236 } 237 // Temporary file for posting offset table. 238 fPO, err := NewFileWriter(fn+"_tmp_po", bufferSize) 239 if err != nil { 240 return nil, err 241 } 242 if err := df.Sync(); err != nil { 243 return nil, errors.Wrap(err, "sync dir") 244 } 245 246 iw := &Writer{ 247 ctx: ctx, 248 f: f, 249 fP: fP, 250 fPO: fPO, 251 stage: idxStageNone, 252 253 // Reusable memory. 254 buf1: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, bufferSize)}), 255 buf2: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, bufferSize)}), 256 257 symbolCache: make(map[string]symbolCacheEntry, 1<<8), 258 labelNames: make(map[string]uint64, 1<<8), 259 crc32: newCRC32(), 260 } 261 if err := iw.writeMeta(); err != nil { 262 return nil, err 263 } 264 return iw, nil 265 } 266 267 func (w *Writer) write(bufs ...[]byte) error { 268 return w.f.Write(bufs...) 269 } 270 271 func (w *Writer) writeAt(buf []byte, pos uint64) error { 272 return w.f.WriteAt(buf, pos) 273 } 274 275 func (w *Writer) addPadding(size int) error { 276 return w.f.AddPadding(size) 277 } 278 279 type FileWriter struct { 280 f *os.File 281 fbuf *bufio.Writer 282 pos uint64 283 name string 284 } 285 286 func NewFileWriter(name string, bufferSize int) (*FileWriter, error) { 287 f, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o666) 288 if err != nil { 289 return nil, err 290 } 291 return &FileWriter{ 292 f: f, 293 fbuf: bufio.NewWriterSize(f, bufferSize), 294 pos: 0, 295 name: name, 296 }, nil 297 } 298 299 func (fw *FileWriter) Pos() uint64 { 300 return fw.pos 301 } 302 303 func (fw *FileWriter) Write(bufs ...[]byte) error { 304 for _, b := range bufs { 305 n, err := fw.fbuf.Write(b) 306 fw.pos += uint64(n) 307 if err != nil { 308 return err 309 } 310 // For now the index file must not grow beyond 64GiB. Some of the fixed-sized 311 // offset references in v1 are only 4 bytes large. 312 // Once we move to compressed/varint representations in those areas, this limitation 313 // can be lifted. 314 if fw.pos > 16*math.MaxUint32 { 315 return errors.Errorf("%q exceeding max size of 64GiB", fw.name) 316 } 317 } 318 return nil 319 } 320 321 func (fw *FileWriter) Flush() error { 322 return fw.fbuf.Flush() 323 } 324 325 func (fw *FileWriter) WriteAt(buf []byte, pos uint64) error { 326 if err := fw.Flush(); err != nil { 327 return err 328 } 329 _, err := fw.f.WriteAt(buf, int64(pos)) 330 return err 331 } 332 333 // AddPadding adds zero byte padding until the file size is a multiple size. 334 func (fw *FileWriter) AddPadding(size int) error { 335 p := fw.pos % uint64(size) 336 if p == 0 { 337 return nil 338 } 339 p = uint64(size) - p 340 341 if err := fw.Write(make([]byte, p)); err != nil { 342 return errors.Wrap(err, "add padding") 343 } 344 return nil 345 } 346 347 func (fw *FileWriter) Close() error { 348 if err := fw.Flush(); err != nil { 349 return err 350 } 351 if err := fw.f.Sync(); err != nil { 352 return err 353 } 354 return fw.f.Close() 355 } 356 357 func (fw *FileWriter) Remove() error { 358 return os.Remove(fw.name) 359 } 360 361 // ensureStage handles transitions between write stages and ensures that IndexWriter 362 // methods are called in an order valid for the implementation. 363 func (w *Writer) ensureStage(s indexWriterStage) error { 364 select { 365 case <-w.ctx.Done(): 366 return w.ctx.Err() 367 default: 368 } 369 370 if w.stage == s { 371 return nil 372 } 373 if w.stage < s-1 { 374 // A stage has been skipped. 375 if err := w.ensureStage(s - 1); err != nil { 376 return err 377 } 378 } 379 if w.stage > s { 380 return errors.Errorf("invalid stage %q, currently at %q", s, w.stage) 381 } 382 383 // Mark start of sections in table of contents. 384 switch s { 385 case idxStageSymbols: 386 w.toc.Symbols = w.f.pos 387 if err := w.startSymbols(); err != nil { 388 return err 389 } 390 case idxStageSeries: 391 if err := w.finishSymbols(); err != nil { 392 return err 393 } 394 w.toc.Series = w.f.pos 395 396 case idxStageDone: 397 w.toc.LabelIndices = w.f.pos 398 // LabelIndices generation depends on the posting offset 399 // table produced at this stage. 400 if err := w.writePostingsToTmpFiles(); err != nil { 401 return err 402 } 403 if err := w.writeLabelIndices(); err != nil { 404 return err 405 } 406 407 w.toc.Postings = w.f.pos 408 if err := w.writePostings(); err != nil { 409 return err 410 } 411 412 w.toc.LabelIndicesTable = w.f.pos 413 if err := w.writeLabelIndexesOffsetTable(); err != nil { 414 return err 415 } 416 417 w.toc.PostingsTable = w.f.pos 418 if err := w.writePostingsOffsetTable(); err != nil { 419 return err 420 } 421 422 w.toc.FingerprintOffsets = w.f.pos 423 if err := w.writeFingerprintOffsetsTable(); err != nil { 424 return err 425 } 426 427 if err := w.writeTOC(); err != nil { 428 return err 429 } 430 } 431 432 w.stage = s 433 return nil 434 } 435 436 func (w *Writer) writeMeta() error { 437 w.buf1.Reset() 438 w.buf1.PutBE32(MagicIndex) 439 w.buf1.PutByte(FormatV2) 440 441 return w.write(w.buf1.Get()) 442 } 443 444 // AddSeries adds the series one at a time along with its chunks. 445 // Requires a specific fingerprint to be passed in the case where the "desired" 446 // fingerprint differs from what labels.Hash() produces. For example, 447 // multitenant TSDBs embed a tenant label, but the actual series has no such 448 // label and so the derived fingerprint differs. 449 func (w *Writer) AddSeries(ref storage.SeriesRef, lset phlaremodel.Labels, fp model.Fingerprint, chunks ...ChunkMeta) error { 450 if err := w.ensureStage(idxStageSeries); err != nil { 451 return err 452 } 453 454 // Put the supplied fingerprint instead of the calculated hash. 455 // This allows us to have a synthetic label (__loki_tenant__) in 456 // the pre-compacted TSDBs which map to fingerprints (and chunks) 457 // without this label in storage. 458 labelHash := uint64(fp) 459 460 if ref < w.lastRef && len(w.lastSeries) != 0 { 461 return errors.Errorf("series with reference greater than %d already added", ref) 462 } 463 // We add padding to 16 bytes to increase the addressable space we get through 4 byte 464 // series references. 465 if err := w.addPadding(16); err != nil { 466 return errors.Errorf("failed to write padding bytes: %v", err) 467 } 468 469 if w.f.pos%16 != 0 { 470 return errors.Errorf("series write not 16-byte aligned at %d", w.f.pos) 471 } 472 473 w.buf2.Reset() 474 w.buf2.PutBE64(labelHash) 475 w.buf2.PutUvarint(len(lset)) 476 477 for _, l := range lset { 478 var err error 479 cacheEntry, ok := w.symbolCache[l.Name] 480 nameIndex := cacheEntry.index 481 if !ok { 482 nameIndex, err = w.symbols.ReverseLookup(l.Name) 483 if err != nil { 484 return errors.Errorf("symbol entry for %q does not exist, %v", l.Name, err) 485 } 486 } 487 w.labelNames[l.Name]++ 488 w.buf2.PutUvarint32(nameIndex) 489 490 valueIndex := cacheEntry.lastValueIndex 491 if !ok || cacheEntry.lastValue != l.Value { 492 valueIndex, err = w.symbols.ReverseLookup(l.Value) 493 if err != nil { 494 return errors.Errorf("symbol entry for %q does not exist, %v", l.Value, err) 495 } 496 w.symbolCache[l.Name] = symbolCacheEntry{ 497 index: nameIndex, 498 lastValue: l.Value, 499 lastValueIndex: valueIndex, 500 } 501 } 502 w.buf2.PutUvarint32(valueIndex) 503 } 504 505 w.buf2.PutUvarint(len(chunks)) 506 507 if len(chunks) > 0 { 508 c := chunks[0] 509 w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime) 510 511 w.buf2.PutVarint64(c.MinTime) 512 w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime)) 513 w.buf2.PutUvarint32(c.KB) 514 w.buf2.PutUvarint32(c.SeriesIndex) 515 w.buf2.PutBE32(c.Checksum) 516 t0 := c.MaxTime 517 518 for _, c := range chunks[1:] { 519 w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime) 520 // Encode the diff against previous chunk as varint 521 // instead of uvarint because chunks may overlap 522 w.buf2.PutVarint64(c.MinTime - t0) 523 w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime)) 524 w.buf2.PutUvarint32(c.KB) 525 w.buf2.PutUvarint32(c.SeriesIndex) 526 t0 = c.MaxTime 527 528 w.buf2.PutBE32(c.Checksum) 529 } 530 } 531 532 w.buf1.Reset() 533 w.buf1.PutUvarint(w.buf2.Len()) 534 535 w.buf2.PutHash(w.crc32) 536 537 w.lastSeries = append(w.lastSeries[:0], lset...) 538 w.lastSeriesHash = labelHash 539 w.lastRef = ref 540 541 if ref%fingerprintInterval == 0 { 542 sRef := w.f.pos / 16 543 w.fingerprintOffsets = append(w.fingerprintOffsets, [2]uint64{sRef, labelHash}) 544 } 545 546 if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil { 547 return errors.Wrap(err, "write series data") 548 } 549 550 return nil 551 } 552 553 func (w *Writer) startSymbols() error { 554 // We are at w.toc.Symbols. 555 // Leave 4 bytes of space for the length, and another 4 for the number of symbols 556 // which will both be calculated later. 557 return w.write([]byte("alenblen")) 558 } 559 560 func (w *Writer) AddSymbol(sym string) error { 561 if err := w.ensureStage(idxStageSymbols); err != nil { 562 return err 563 } 564 if w.numSymbols != 0 && sym <= w.lastSymbol { 565 return errors.Errorf("symbol %q out-of-order", sym) 566 } 567 w.lastSymbol = sym 568 w.numSymbols++ 569 w.buf1.Reset() 570 w.buf1.PutUvarintStr(sym) 571 return w.write(w.buf1.Get()) 572 } 573 574 func (w *Writer) finishSymbols() error { 575 symbolTableSize := w.f.pos - w.toc.Symbols - 4 576 // The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1 577 if symbolTableSize > math.MaxUint32 { 578 return errors.Errorf("symbol table size exceeds 4 bytes: %d", symbolTableSize) 579 } 580 581 // Write out the length and symbol count. 582 w.buf1.Reset() 583 w.buf1.PutBE32int(int(symbolTableSize)) 584 w.buf1.PutBE32int(w.numSymbols) 585 if err := w.writeAt(w.buf1.Get(), w.toc.Symbols); err != nil { 586 return err 587 } 588 589 hashPos := w.f.pos 590 // Leave space for the hash. We can only calculate it 591 // now that the number of symbols is known, so mmap and do it from there. 592 if err := w.write([]byte("hash")); err != nil { 593 return err 594 } 595 if err := w.f.Flush(); err != nil { 596 return err 597 } 598 599 sf, err := fileutil.OpenMmapFile(w.f.name) 600 if err != nil { 601 return err 602 } 603 w.symbolFile = sf 604 hash := crc32.Checksum(w.symbolFile.Bytes()[w.toc.Symbols+4:hashPos], castagnoliTable) 605 w.buf1.Reset() 606 w.buf1.PutBE32(hash) 607 if err := w.writeAt(w.buf1.Get(), hashPos); err != nil { 608 return err 609 } 610 611 // Load in the symbol table efficiently for the rest of the index writing. 612 w.symbols, err = NewSymbols(RealByteSlice(w.symbolFile.Bytes()), FormatV2, int(w.toc.Symbols)) 613 if err != nil { 614 return errors.Wrap(err, "read symbols") 615 } 616 return nil 617 } 618 619 func (w *Writer) writeLabelIndices() error { 620 if err := w.fPO.Flush(); err != nil { 621 return err 622 } 623 624 // Find all the label values in the tmp posting offset table. 625 f, err := fileutil.OpenMmapFile(w.fPO.name) 626 if err != nil { 627 return err 628 } 629 defer f.Close() 630 631 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.fPO.pos))) 632 cnt := w.cntPO 633 current := []byte{} 634 values := []uint32{} 635 for d.Err() == nil && cnt > 0 { 636 cnt-- 637 d.Uvarint() // Keycount. 638 name := d.UvarintBytes() // Label name. 639 value := yoloString(d.UvarintBytes()) // Label value. 640 d.Uvarint64() // Offset. 641 if len(name) == 0 { 642 continue // All index is ignored. 643 } 644 645 if !bytes.Equal(name, current) && len(values) > 0 { 646 // We've reached a new label name. 647 if err := w.writeLabelIndex(string(current), values); err != nil { 648 return err 649 } 650 values = values[:0] 651 } 652 current = name 653 sid, err := w.symbols.ReverseLookup(value) 654 if err != nil { 655 return err 656 } 657 values = append(values, sid) 658 } 659 if d.Err() != nil { 660 return d.Err() 661 } 662 663 // Handle the last label. 664 if len(values) > 0 { 665 if err := w.writeLabelIndex(string(current), values); err != nil { 666 return err 667 } 668 } 669 return nil 670 } 671 672 func (w *Writer) writeLabelIndex(name string, values []uint32) error { 673 // Align beginning to 4 bytes for more efficient index list scans. 674 if err := w.addPadding(4); err != nil { 675 return err 676 } 677 678 w.labelIndexes = append(w.labelIndexes, labelIndexHashEntry{ 679 keys: []string{name}, 680 offset: w.f.pos, 681 }) 682 683 startPos := w.f.pos 684 // Leave 4 bytes of space for the length, which will be calculated later. 685 if err := w.write([]byte("alen")); err != nil { 686 return err 687 } 688 w.crc32.Reset() 689 690 w.buf1.Reset() 691 w.buf1.PutBE32int(1) // Number of names. 692 w.buf1.PutBE32int(len(values)) 693 w.buf1.WriteToHash(w.crc32) 694 if err := w.write(w.buf1.Get()); err != nil { 695 return err 696 } 697 698 for _, v := range values { 699 w.buf1.Reset() 700 w.buf1.PutBE32(v) 701 w.buf1.WriteToHash(w.crc32) 702 if err := w.write(w.buf1.Get()); err != nil { 703 return err 704 } 705 } 706 707 // Write out the length. 708 w.buf1.Reset() 709 l := w.f.pos - startPos - 4 710 if l > math.MaxUint32 { 711 return errors.Errorf("label index size exceeds 4 bytes: %d", l) 712 } 713 w.buf1.PutBE32int(int(l)) 714 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 715 return err 716 } 717 718 w.buf1.Reset() 719 w.buf1.PutHashSum(w.crc32) 720 return w.write(w.buf1.Get()) 721 } 722 723 // writeLabelIndexesOffsetTable writes the label indices offset table. 724 func (w *Writer) writeLabelIndexesOffsetTable() error { 725 startPos := w.f.pos 726 // Leave 4 bytes of space for the length, which will be calculated later. 727 if err := w.write([]byte("alen")); err != nil { 728 return err 729 } 730 w.crc32.Reset() 731 732 w.buf1.Reset() 733 w.buf1.PutBE32int(len(w.labelIndexes)) 734 w.buf1.WriteToHash(w.crc32) 735 if err := w.write(w.buf1.Get()); err != nil { 736 return err 737 } 738 739 for _, e := range w.labelIndexes { 740 w.buf1.Reset() 741 w.buf1.PutUvarint(len(e.keys)) 742 for _, k := range e.keys { 743 w.buf1.PutUvarintStr(k) 744 } 745 w.buf1.PutUvarint64(e.offset) 746 w.buf1.WriteToHash(w.crc32) 747 if err := w.write(w.buf1.Get()); err != nil { 748 return err 749 } 750 } 751 // Write out the length. 752 w.buf1.Reset() 753 l := w.f.pos - startPos - 4 754 if l > math.MaxUint32 { 755 return errors.Errorf("label indexes offset table size exceeds 4 bytes: %d", l) 756 } 757 w.buf1.PutBE32int(int(l)) 758 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 759 return err 760 } 761 762 w.buf1.Reset() 763 w.buf1.PutHashSum(w.crc32) 764 return w.write(w.buf1.Get()) 765 } 766 767 // writePostingsOffsetTable writes the postings offset table. 768 func (w *Writer) writePostingsOffsetTable() error { 769 // Ensure everything is in the temporary file. 770 if err := w.fPO.Flush(); err != nil { 771 return err 772 } 773 774 startPos := w.f.pos 775 // Leave 4 bytes of space for the length, which will be calculated later. 776 if err := w.write([]byte("alen")); err != nil { 777 return err 778 } 779 780 // Copy over the tmp posting offset table, however we need to 781 // adjust the offsets. 782 adjustment := w.postingsStart 783 784 w.buf1.Reset() 785 w.crc32.Reset() 786 w.buf1.PutBE32int(int(w.cntPO)) // Count. 787 w.buf1.WriteToHash(w.crc32) 788 if err := w.write(w.buf1.Get()); err != nil { 789 return err 790 } 791 792 f, err := fileutil.OpenMmapFile(w.fPO.name) 793 if err != nil { 794 return err 795 } 796 defer func() { 797 if f != nil { 798 f.Close() 799 } 800 }() 801 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.fPO.pos))) 802 cnt := w.cntPO 803 for d.Err() == nil && cnt > 0 { 804 w.buf1.Reset() 805 w.buf1.PutUvarint(d.Uvarint()) // Keycount. 806 w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label name. 807 w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label value. 808 w.buf1.PutUvarint64(d.Uvarint64() + adjustment) // Offset. 809 w.buf1.WriteToHash(w.crc32) 810 if err := w.write(w.buf1.Get()); err != nil { 811 return err 812 } 813 cnt-- 814 } 815 if d.Err() != nil { 816 return d.Err() 817 } 818 819 // Cleanup temporary file. 820 if err := f.Close(); err != nil { 821 return err 822 } 823 f = nil 824 if err := w.fPO.Close(); err != nil { 825 return err 826 } 827 if err := w.fPO.Remove(); err != nil { 828 return err 829 } 830 w.fPO = nil 831 832 // Write out the length. 833 w.buf1.Reset() 834 l := w.f.pos - startPos - 4 835 if l > math.MaxUint32 { 836 return errors.Errorf("postings offset table size exceeds 4 bytes: %d", l) 837 } 838 w.buf1.PutBE32int(int(l)) 839 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 840 return err 841 } 842 843 // Finally write the hash. 844 w.buf1.Reset() 845 w.buf1.PutHashSum(w.crc32) 846 return w.write(w.buf1.Get()) 847 } 848 849 func (w *Writer) writeFingerprintOffsetsTable() error { 850 w.buf1.Reset() 851 w.buf2.Reset() 852 853 w.buf1.PutBE32int(len(w.fingerprintOffsets)) // Count. 854 // build offsets 855 for _, x := range w.fingerprintOffsets { 856 w.buf1.PutBE64(x[0]) // series offset 857 w.buf1.PutBE64(x[1]) // hash 858 } 859 860 // write length 861 ln := w.buf1.Len() 862 // TODO(owen-d): can remove the uint32 cast in the future 863 // Had to uint32 wrap these for arm32 builds, which we'll remove in the future. 864 if uint32(ln) > uint32(math.MaxUint32) { 865 return errors.Errorf("fingerprint offset size exceeds 4 bytes: %d", ln) 866 } 867 868 w.buf2.PutBE32int(ln) 869 if err := w.write(w.buf2.Get()); err != nil { 870 return err 871 } 872 873 // write offsets+checksum 874 w.buf1.PutHash(w.crc32) 875 if err := w.write(w.buf1.Get()); err != nil { 876 return errors.Wrap(err, "failure writing fingerprint offsets") 877 } 878 return nil 879 } 880 881 const indexTOCLen = 8*9 + crc32.Size 882 883 func (w *Writer) writeTOC() error { 884 w.buf1.Reset() 885 886 w.buf1.PutBE64(w.toc.Symbols) 887 w.buf1.PutBE64(w.toc.Series) 888 w.buf1.PutBE64(w.toc.LabelIndices) 889 w.buf1.PutBE64(w.toc.LabelIndicesTable) 890 w.buf1.PutBE64(w.toc.Postings) 891 w.buf1.PutBE64(w.toc.PostingsTable) 892 w.buf1.PutBE64(w.toc.FingerprintOffsets) 893 894 // metadata 895 w.buf1.PutBE64int64(w.toc.Metadata.From) 896 w.buf1.PutBE64int64(w.toc.Metadata.Through) 897 898 w.buf1.PutHash(w.crc32) 899 900 return w.write(w.buf1.Get()) 901 } 902 903 func (w *Writer) writePostingsToTmpFiles() error { 904 names := make([]string, 0, len(w.labelNames)) 905 for n := range w.labelNames { 906 names = append(names, n) 907 } 908 sort.Strings(names) 909 910 if err := w.f.Flush(); err != nil { 911 return err 912 } 913 f, err := fileutil.OpenMmapFile(w.f.name) 914 if err != nil { 915 return err 916 } 917 defer f.Close() 918 919 // Write out the special all posting. 920 offsets := []uint32{} 921 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.toc.LabelIndices))) 922 d.Skip(int(w.toc.Series)) 923 for d.Len() > 0 { 924 d.ConsumePadding() 925 startPos := w.toc.LabelIndices - uint64(d.Len()) 926 if startPos%16 != 0 { 927 return errors.Errorf("series not 16-byte aligned at %d", startPos) 928 } 929 offsets = append(offsets, uint32(startPos/16)) 930 // Skip to next series. 931 x := d.Uvarint() 932 d.Skip(x + crc32.Size) 933 if err := d.Err(); err != nil { 934 return err 935 } 936 } 937 if err := w.writePosting("", "", offsets); err != nil { 938 return err 939 } 940 maxPostings := uint64(len(offsets)) // No label name can have more postings than this. 941 942 for len(names) > 0 { 943 batchNames := []string{} 944 var c uint64 945 // Try to bunch up label names into one loop, but avoid 946 // using more memory than a single label name can. 947 for len(names) > 0 { 948 if w.labelNames[names[0]]+c > maxPostings { 949 if c > 0 { 950 break 951 } 952 return fmt.Errorf("corruption detected when writing postings to index: label %q has %d uses, but maxPostings is %d", names[0], w.labelNames[names[0]], maxPostings) 953 } 954 batchNames = append(batchNames, names[0]) 955 c += w.labelNames[names[0]] 956 names = names[1:] 957 } 958 959 nameSymbols := map[uint32]string{} 960 for _, name := range batchNames { 961 sid, err := w.symbols.ReverseLookup(name) 962 if err != nil { 963 return err 964 } 965 nameSymbols[sid] = name 966 } 967 // Label name -> label value -> positions. 968 postings := map[uint32]map[uint32][]uint32{} 969 970 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.toc.LabelIndices))) 971 d.Skip(int(w.toc.Series)) 972 for d.Len() > 0 { 973 d.ConsumePadding() 974 startPos := w.toc.LabelIndices - uint64(d.Len()) 975 l := d.Uvarint() // Length of this series in bytes. 976 startLen := d.Len() 977 978 _ = d.Be64() // skip fingerprint 979 // See if label names we want are in the series. 980 numLabels := d.Uvarint() 981 for i := 0; i < numLabels; i++ { 982 lno := uint32(d.Uvarint()) 983 lvo := uint32(d.Uvarint()) 984 985 if _, ok := nameSymbols[lno]; ok { 986 if _, ok := postings[lno]; !ok { 987 postings[lno] = map[uint32][]uint32{} 988 } 989 postings[lno][lvo] = append(postings[lno][lvo], uint32(startPos/16)) 990 } 991 } 992 // Skip to next series. 993 d.Skip(l - (startLen - d.Len()) + crc32.Size) 994 if err := d.Err(); err != nil { 995 return err 996 } 997 } 998 999 for _, name := range batchNames { 1000 // Write out postings for this label name. 1001 sid, err := w.symbols.ReverseLookup(name) 1002 if err != nil { 1003 return err 1004 } 1005 values := make([]uint32, 0, len(postings[sid])) 1006 for v := range postings[sid] { 1007 values = append(values, v) 1008 } 1009 // Symbol numbers are in order, so the strings will also be in order. 1010 sort.Sort(uint32slice(values)) 1011 for _, v := range values { 1012 value, err := w.symbols.Lookup(v) 1013 if err != nil { 1014 return err 1015 } 1016 if err := w.writePosting(name, value, postings[sid][v]); err != nil { 1017 return err 1018 } 1019 } 1020 } 1021 select { 1022 case <-w.ctx.Done(): 1023 return w.ctx.Err() 1024 default: 1025 } 1026 } 1027 return nil 1028 } 1029 1030 func (w *Writer) writePosting(name, value string, offs []uint32) error { 1031 // Align beginning to 4 bytes for more efficient postings list scans. 1032 if err := w.fP.AddPadding(4); err != nil { 1033 return err 1034 } 1035 1036 // Write out postings offset table to temporary file as we go. 1037 w.buf1.Reset() 1038 w.buf1.PutUvarint(2) 1039 w.buf1.PutUvarintStr(name) 1040 w.buf1.PutUvarintStr(value) 1041 w.buf1.PutUvarint64(w.fP.pos) // This is relative to the postings tmp file, not the final index file. 1042 if err := w.fPO.Write(w.buf1.Get()); err != nil { 1043 return err 1044 } 1045 w.cntPO++ 1046 1047 w.buf1.Reset() 1048 w.buf1.PutBE32int(len(offs)) 1049 1050 for _, off := range offs { 1051 if off > (1<<32)-1 { 1052 return errors.Errorf("series offset %d exceeds 4 bytes", off) 1053 } 1054 w.buf1.PutBE32(off) 1055 } 1056 1057 w.buf2.Reset() 1058 l := w.buf1.Len() 1059 // We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there. 1060 if uint(l) > math.MaxUint32 { 1061 return errors.Errorf("posting size exceeds 4 bytes: %d", l) 1062 } 1063 w.buf2.PutBE32int(l) 1064 w.buf1.PutHash(w.crc32) 1065 return w.fP.Write(w.buf2.Get(), w.buf1.Get()) 1066 } 1067 1068 func (w *Writer) writePostings() error { 1069 // There's padding in the tmp file, make sure it actually works. 1070 if err := w.f.AddPadding(4); err != nil { 1071 return err 1072 } 1073 w.postingsStart = w.f.pos 1074 1075 // Copy temporary file into main index. 1076 if err := w.fP.Flush(); err != nil { 1077 return err 1078 } 1079 if _, err := w.fP.f.Seek(0, 0); err != nil { 1080 return err 1081 } 1082 // Don't need to calculate a checksum, so can copy directly. 1083 buf := w.buf1.B[:cap(w.buf1.B)] 1084 n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, buf) 1085 if err != nil { 1086 return err 1087 } 1088 if uint64(n) != w.fP.pos { 1089 return errors.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n) 1090 } 1091 w.f.pos += uint64(n) 1092 1093 if err := w.fP.Close(); err != nil { 1094 return err 1095 } 1096 if err := w.fP.Remove(); err != nil { 1097 return err 1098 } 1099 w.fP = nil 1100 return nil 1101 } 1102 1103 type uint32slice []uint32 1104 1105 func (s uint32slice) Len() int { return len(s) } 1106 func (s uint32slice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 1107 func (s uint32slice) Less(i, j int) bool { return s[i] < s[j] } 1108 1109 type labelIndexHashEntry struct { 1110 keys []string 1111 offset uint64 1112 } 1113 1114 func (w *Writer) Close() error { 1115 // Even if this fails, we need to close all the files. 1116 ensureErr := w.ensureStage(idxStageDone) 1117 1118 if w.symbolFile != nil { 1119 if err := w.symbolFile.Close(); err != nil { 1120 return err 1121 } 1122 } 1123 if w.fP != nil { 1124 if err := w.fP.Close(); err != nil { 1125 return err 1126 } 1127 } 1128 if w.fPO != nil { 1129 if err := w.fPO.Close(); err != nil { 1130 return err 1131 } 1132 } 1133 if err := w.f.Close(); err != nil { 1134 return err 1135 } 1136 return ensureErr 1137 } 1138 1139 // StringIter iterates over a sorted list of strings. 1140 type StringIter interface { 1141 // Next advances the iterator and returns true if another value was found. 1142 Next() bool 1143 1144 // At returns the value at the current iterator position. 1145 At() string 1146 1147 // Err returns the last error of the iterator. 1148 Err() error 1149 } 1150 1151 type Reader struct { 1152 b ByteSlice 1153 toc *TOC 1154 1155 // Close that releases the underlying resources of the byte slice. 1156 c io.Closer 1157 1158 // Map of LabelName to a list of some LabelValues's position in the offset table. 1159 // The first and last values for each name are always present. 1160 postings map[string][]postingOffset 1161 // For the v1 format, labelname -> labelvalue -> offset. 1162 postingsV1 map[string]map[string]uint64 1163 1164 symbols *Symbols 1165 nameSymbols map[uint32]string // Cache of the label name symbol lookups, 1166 // as there are not many and they are half of all lookups. 1167 1168 fingerprintOffsets FingerprintOffsets 1169 1170 dec *Decoder 1171 1172 version int 1173 } 1174 1175 type postingOffset struct { 1176 value string 1177 off int 1178 } 1179 1180 // ByteSlice abstracts a byte slice. 1181 type ByteSlice interface { 1182 Len() int 1183 Range(start, end int) []byte 1184 } 1185 1186 type RealByteSlice []byte 1187 1188 func (b RealByteSlice) Len() int { 1189 return len(b) 1190 } 1191 1192 func (b RealByteSlice) Range(start, end int) []byte { 1193 return b[start:end] 1194 } 1195 1196 func (b RealByteSlice) Sub(start, end int) ByteSlice { 1197 return b[start:end] 1198 } 1199 1200 // NewReader returns a new index reader on the given byte slice. It automatically 1201 // handles different format versions. 1202 func NewReader(b ByteSlice) (*Reader, error) { 1203 return newReader(b, io.NopCloser(nil)) 1204 } 1205 1206 type nopCloser struct{} 1207 1208 func (nopCloser) Close() error { return nil } 1209 1210 // NewFileReader returns a new index reader against the given index file. 1211 func NewFileReader(path string) (*Reader, error) { 1212 b, err := os.ReadFile(path) 1213 if err != nil { 1214 return nil, err 1215 } 1216 r, err := newReader(RealByteSlice(b), nopCloser{}) 1217 if err != nil { 1218 return r, err 1219 } 1220 1221 return r, nil 1222 } 1223 1224 func newReader(b ByteSlice, c io.Closer) (*Reader, error) { 1225 r := &Reader{ 1226 b: b, 1227 c: c, 1228 postings: map[string][]postingOffset{}, 1229 } 1230 1231 // Verify header. 1232 if r.b.Len() < HeaderLen { 1233 return nil, errors.Wrap(tsdb_enc.ErrInvalidSize, "index header") 1234 } 1235 if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex { 1236 return nil, errors.Errorf("invalid magic number %x", m) 1237 } 1238 r.version = int(r.b.Range(4, 5)[0]) 1239 1240 if r.version != FormatV1 && r.version != FormatV2 { 1241 return nil, errors.Errorf("unknown index file version %d", r.version) 1242 } 1243 1244 var err error 1245 r.toc, err = NewTOCFromByteSlice(b) 1246 if err != nil { 1247 return nil, errors.Wrap(err, "read TOC") 1248 } 1249 1250 r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols)) 1251 if err != nil { 1252 return nil, errors.Wrap(err, "read symbols") 1253 } 1254 1255 if r.version == FormatV1 { 1256 // Earlier V1 formats don't have a sorted postings offset table, so 1257 // load the whole offset table into memory. 1258 r.postingsV1 = map[string]map[string]uint64{} 1259 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error { 1260 if len(key) != 2 { 1261 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1262 } 1263 if _, ok := r.postingsV1[key[0]]; !ok { 1264 r.postingsV1[key[0]] = map[string]uint64{} 1265 r.postings[key[0]] = nil // Used to get a list of labelnames in places. 1266 } 1267 r.postingsV1[key[0]][key[1]] = off 1268 return nil 1269 }); err != nil { 1270 return nil, errors.Wrap(err, "read postings table") 1271 } 1272 } else { 1273 var lastKey []string 1274 lastOff := 0 1275 valueCount := 0 1276 // For the postings offset table we keep every label name but only every nth 1277 // label value (plus the first and last one), to save memory. 1278 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, _ uint64, off int) error { 1279 if len(key) != 2 { 1280 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1281 } 1282 if _, ok := r.postings[key[0]]; !ok { 1283 // Next label name. 1284 r.postings[key[0]] = []postingOffset{} 1285 if lastKey != nil { 1286 // Always include last value for each label name. 1287 r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff}) 1288 } 1289 lastKey = nil 1290 valueCount = 0 1291 } 1292 if valueCount%symbolFactor == 0 { 1293 r.postings[key[0]] = append(r.postings[key[0]], postingOffset{value: key[1], off: off}) 1294 lastKey = nil 1295 } else { 1296 lastKey = key 1297 lastOff = off 1298 } 1299 valueCount++ 1300 return nil 1301 }); err != nil { 1302 return nil, errors.Wrap(err, "read postings table") 1303 } 1304 if lastKey != nil { 1305 r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff}) 1306 } 1307 // Trim any extra space in the slices. 1308 for k, v := range r.postings { 1309 l := make([]postingOffset, len(v)) 1310 copy(l, v) 1311 r.postings[k] = l 1312 } 1313 } 1314 1315 r.nameSymbols = make(map[uint32]string, len(r.postings)) 1316 for k := range r.postings { 1317 if k == "" { 1318 continue 1319 } 1320 off, err := r.symbols.ReverseLookup(k) 1321 if err != nil { 1322 return nil, errors.Wrap(err, "reverse symbol lookup") 1323 } 1324 r.nameSymbols[off] = k 1325 } 1326 1327 r.fingerprintOffsets, err = readFingerprintOffsetsTable(r.b, r.toc.FingerprintOffsets) 1328 if err != nil { 1329 return nil, errors.Wrap(err, "loading fingerprint offsets") 1330 } 1331 1332 r.dec = &Decoder{LookupSymbol: r.lookupSymbol} 1333 1334 return r, nil 1335 } 1336 1337 // Version returns the file format version of the underlying index. 1338 func (r *Reader) Version() int { 1339 return r.version 1340 } 1341 1342 // FileInfo returns some general stats about the underlying file 1343 func (r *Reader) FileInfo() block.File { 1344 k, v := AllPostingsKey() 1345 postings, err := r.Postings(k, nil, v) 1346 if err != nil { 1347 panic(err) 1348 } 1349 var numSeries uint64 1350 for postings.Next() { 1351 numSeries++ 1352 } 1353 return block.File{ 1354 RelPath: block.IndexFilename, 1355 SizeBytes: uint64(r.Size()), 1356 TSDB: &block.TSDBFile{ 1357 NumSeries: numSeries, 1358 }, 1359 } 1360 } 1361 1362 // Range marks a byte range. 1363 type Range struct { 1364 Start, End int64 1365 } 1366 1367 // PostingsRanges returns a new map of byte range in the underlying index file 1368 // for all postings lists. 1369 func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) { 1370 m := map[labels.Label]Range{} 1371 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error { 1372 if len(key) != 2 { 1373 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1374 } 1375 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(off), castagnoliTable)) 1376 if d.Err() != nil { 1377 return d.Err() 1378 } 1379 m[labels.Label{Name: key[0], Value: key[1]}] = Range{ 1380 Start: int64(off) + 4, 1381 End: int64(off) + 4 + int64(d.Len()), 1382 } 1383 return nil 1384 }); err != nil { 1385 return nil, errors.Wrap(err, "read postings table") 1386 } 1387 return m, nil 1388 } 1389 1390 type Symbols struct { 1391 bs ByteSlice 1392 version int 1393 off int 1394 1395 offsets []int 1396 seen int 1397 } 1398 1399 const symbolFactor = 32 1400 1401 // NewSymbols returns a Symbols object for symbol lookups. 1402 func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) { 1403 s := &Symbols{ 1404 bs: bs, 1405 version: version, 1406 off: off, 1407 } 1408 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, off, castagnoliTable)) 1409 var ( 1410 origLen = d.Len() 1411 cnt = d.Be32int() 1412 basePos = off + 4 1413 ) 1414 s.offsets = make([]int, 0, 1+cnt/symbolFactor) 1415 for d.Err() == nil && s.seen < cnt { 1416 if s.seen%symbolFactor == 0 { 1417 s.offsets = append(s.offsets, basePos+origLen-d.Len()) 1418 } 1419 d.UvarintBytes() // The symbol. 1420 s.seen++ 1421 } 1422 if d.Err() != nil { 1423 return nil, d.Err() 1424 } 1425 return s, nil 1426 } 1427 1428 func (s Symbols) Lookup(o uint32) (string, error) { 1429 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1430 B: s.bs.Range(0, s.bs.Len()), 1431 }) 1432 1433 if s.version == FormatV2 { 1434 if int(o) >= s.seen { 1435 return "", errors.Errorf("unknown symbol offset %d", o) 1436 } 1437 d.Skip(s.offsets[int(o/symbolFactor)]) 1438 // Walk until we find the one we want. 1439 for i := o - (o / symbolFactor * symbolFactor); i > 0; i-- { 1440 d.UvarintBytes() 1441 } 1442 } else { 1443 d.Skip(int(o)) 1444 } 1445 sym := d.UvarintStr() 1446 if d.Err() != nil { 1447 return "", d.Err() 1448 } 1449 return sym, nil 1450 } 1451 1452 func (s Symbols) ReverseLookup(sym string) (uint32, error) { 1453 if len(s.offsets) == 0 { 1454 return 0, errors.Errorf("unknown symbol %q - no symbols", sym) 1455 } 1456 i := sort.Search(len(s.offsets), func(i int) bool { 1457 // Any decoding errors here will be lost, however 1458 // we already read through all of this at startup. 1459 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1460 B: s.bs.Range(0, s.bs.Len()), 1461 }) 1462 d.Skip(s.offsets[i]) 1463 return yoloString(d.UvarintBytes()) > sym 1464 }) 1465 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1466 B: s.bs.Range(0, s.bs.Len()), 1467 }) 1468 if i > 0 { 1469 i-- 1470 } 1471 d.Skip(s.offsets[i]) 1472 res := i * symbolFactor 1473 var lastLen int 1474 var lastSymbol string 1475 for d.Err() == nil && res <= s.seen { 1476 lastLen = d.Len() 1477 lastSymbol = yoloString(d.UvarintBytes()) 1478 if lastSymbol >= sym { 1479 break 1480 } 1481 res++ 1482 } 1483 if d.Err() != nil { 1484 return 0, d.Err() 1485 } 1486 if lastSymbol != sym { 1487 return 0, errors.Errorf("unknown symbol %q", sym) 1488 } 1489 if s.version == FormatV2 { 1490 return uint32(res), nil 1491 } 1492 return uint32(s.bs.Len() - lastLen), nil 1493 } 1494 1495 func (s Symbols) Size() int { 1496 return len(s.offsets) * 8 1497 } 1498 1499 func (s Symbols) Iter() StringIter { 1500 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(s.bs, s.off, castagnoliTable)) 1501 cnt := d.Be32int() 1502 return &symbolsIter{ 1503 d: d, 1504 cnt: cnt, 1505 } 1506 } 1507 1508 // symbolsIter implements StringIter. 1509 type symbolsIter struct { 1510 d encoding.Decbuf 1511 cnt int 1512 cur string 1513 err error 1514 } 1515 1516 func (s *symbolsIter) Next() bool { 1517 if s.cnt == 0 || s.err != nil { 1518 return false 1519 } 1520 s.cur = yoloString(s.d.UvarintBytes()) 1521 s.cnt-- 1522 if s.d.Err() != nil { 1523 s.err = s.d.Err() 1524 return false 1525 } 1526 return true 1527 } 1528 1529 func (s symbolsIter) At() string { return s.cur } 1530 func (s symbolsIter) Err() error { return s.err } 1531 1532 // ReadOffsetTable reads an offset table and at the given position calls f for each 1533 // found entry. If f returns an error it stops decoding and returns the received error. 1534 func ReadOffsetTable(bs ByteSlice, off uint64, f func([]string, uint64, int) error) error { 1535 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable)) 1536 startLen := d.Len() 1537 cnt := d.Be32() 1538 1539 for d.Err() == nil && d.Len() > 0 && cnt > 0 { 1540 offsetPos := startLen - d.Len() 1541 keyCount := d.Uvarint() 1542 // The Postings offset table takes only 2 keys per entry (name and value of label), 1543 // and the LabelIndices offset table takes only 1 key per entry (a label name). 1544 // Hence setting the size to max of both, i.e. 2. 1545 keys := make([]string, 0, 2) 1546 1547 for i := 0; i < keyCount; i++ { 1548 keys = append(keys, d.UvarintStr()) 1549 } 1550 o := d.Uvarint64() 1551 if d.Err() != nil { 1552 break 1553 } 1554 if err := f(keys, o, offsetPos); err != nil { 1555 return err 1556 } 1557 cnt-- 1558 } 1559 return d.Err() 1560 } 1561 1562 func readFingerprintOffsetsTable(bs ByteSlice, off uint64) (FingerprintOffsets, error) { 1563 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable)) 1564 cnt := d.Be32() 1565 res := make(FingerprintOffsets, 0, int(cnt)) 1566 1567 for d.Err() == nil && d.Len() > 0 && cnt > 0 { 1568 res = append(res, [2]uint64{d.Be64(), d.Be64()}) 1569 cnt-- 1570 } 1571 1572 return res, d.Err() 1573 } 1574 1575 // Close the reader and its underlying resources. 1576 func (r *Reader) Close() error { 1577 return r.c.Close() 1578 } 1579 1580 func (r *Reader) lookupSymbol(o uint32) (string, error) { 1581 if s, ok := r.nameSymbols[o]; ok { 1582 return s, nil 1583 } 1584 return r.symbols.Lookup(o) 1585 } 1586 1587 func (r *Reader) Bounds() (int64, int64) { 1588 return r.toc.Metadata.From, r.toc.Metadata.Through 1589 } 1590 1591 func (r *Reader) Checksum() uint32 { 1592 return r.toc.Metadata.Checksum 1593 } 1594 1595 // Symbols returns an iterator over the symbols that exist within the index. 1596 func (r *Reader) Symbols() StringIter { 1597 return r.symbols.Iter() 1598 } 1599 1600 // SymbolTableSize returns the symbol table size in bytes. 1601 func (r *Reader) SymbolTableSize() uint64 { 1602 return uint64(r.symbols.Size()) 1603 } 1604 1605 // SortedLabelValues returns value tuples that exist for the given label name. 1606 // It is not safe to use the return value beyond the lifetime of the byte slice 1607 // passed into the Reader. 1608 func (r *Reader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { 1609 values, err := r.LabelValues(name, matchers...) 1610 if err == nil && r.version == FormatV1 { 1611 sort.Strings(values) 1612 } 1613 return values, err 1614 } 1615 1616 // LabelValues returns value tuples that exist for the given label name. 1617 // It is not safe to use the return value beyond the lifetime of the byte slice 1618 // passed into the Reader. 1619 // TODO(replay): Support filtering by matchers 1620 func (r *Reader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { 1621 if len(matchers) > 0 { 1622 return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers) 1623 } 1624 1625 if r.version == FormatV1 { 1626 e, ok := r.postingsV1[name] 1627 if !ok { 1628 return nil, nil 1629 } 1630 values := make([]string, 0, len(e)) 1631 for k := range e { 1632 values = append(values, k) 1633 } 1634 return values, nil 1635 1636 } 1637 e, ok := r.postings[name] 1638 if !ok { 1639 return nil, nil 1640 } 1641 if len(e) == 0 { 1642 return nil, nil 1643 } 1644 values := make([]string, 0, len(e)*symbolFactor) 1645 1646 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)) 1647 d.Skip(e[0].off) 1648 lastVal := e[len(e)-1].value 1649 1650 skip := 0 1651 for d.Err() == nil { 1652 if skip == 0 { 1653 // These are always the same number of bytes, 1654 // and it's faster to skip than parse. 1655 skip = d.Len() 1656 d.Uvarint() // Keycount. 1657 d.UvarintBytes() // Label name. 1658 skip -= d.Len() 1659 } else { 1660 d.Skip(skip) 1661 } 1662 s := yoloString(d.UvarintBytes()) // Label value. 1663 values = append(values, s) 1664 if s == lastVal { 1665 break 1666 } 1667 d.Uvarint64() // Offset. 1668 } 1669 if d.Err() != nil { 1670 return nil, errors.Wrap(d.Err(), "get postings offset entry") 1671 } 1672 return values, nil 1673 } 1674 1675 // LabelNamesFor returns all the label names for the series referred to by IDs. 1676 // The names returned are sorted. 1677 func (r *Reader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) { 1678 // Gather offsetsMap the name offsetsMap in the symbol table first 1679 offsetsMap := make(map[uint32]struct{}) 1680 for _, id := range ids { 1681 offset := id 1682 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1683 // and the ID is the multiple of 16 of the actual position. 1684 if r.version == FormatV2 { 1685 offset = id * 16 1686 } 1687 1688 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1689 buf := d.Get() 1690 if d.Err() != nil { 1691 return nil, errors.Wrap(d.Err(), "get buffer for series") 1692 } 1693 1694 offsets, err := r.dec.LabelNamesOffsetsFor(buf) 1695 if err != nil { 1696 return nil, errors.Wrap(err, "get label name offsets") 1697 } 1698 for _, off := range offsets { 1699 offsetsMap[off] = struct{}{} 1700 } 1701 } 1702 1703 // Lookup the unique symbols. 1704 names := make([]string, 0, len(offsetsMap)) 1705 for off := range offsetsMap { 1706 name, err := r.lookupSymbol(off) 1707 if err != nil { 1708 return nil, errors.Wrap(err, "lookup symbol in LabelNamesFor") 1709 } 1710 names = append(names, name) 1711 } 1712 1713 sort.Strings(names) 1714 1715 return names, nil 1716 } 1717 1718 // LabelValueFor returns label value for the given label name in the series referred to by ID. 1719 func (r *Reader) LabelValueFor(id storage.SeriesRef, label string) (string, error) { 1720 offset := id 1721 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1722 // and the ID is the multiple of 16 of the actual position. 1723 if r.version == FormatV2 { 1724 offset = id * 16 1725 } 1726 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1727 buf := d.Get() 1728 if d.Err() != nil { 1729 return "", errors.Wrap(d.Err(), "label values for") 1730 } 1731 1732 value, err := r.dec.LabelValueFor(buf, label) 1733 if err != nil { 1734 return "", storage.ErrNotFound 1735 } 1736 1737 if value == "" { 1738 return "", storage.ErrNotFound 1739 } 1740 1741 return value, nil 1742 } 1743 1744 // Series reads the series with the given ID and writes its labels and chunks into lbls and chks. 1745 func (r *Reader) Series(id storage.SeriesRef, lbls *phlaremodel.Labels, chks *[]ChunkMeta) (uint64, error) { 1746 offset := id 1747 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1748 // and the ID is the multiple of 16 of the actual position. 1749 if r.version == FormatV2 { 1750 offset = id * 16 1751 } 1752 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1753 if d.Err() != nil { 1754 return 0, d.Err() 1755 } 1756 1757 fprint, err := r.dec.Series(d.Get(), lbls, chks, false) 1758 if err != nil { 1759 return 0, errors.Wrap(err, "read series") 1760 } 1761 return fprint, nil 1762 } 1763 1764 // SeriesBy is like Series but allows to group labels by name. This avoid looking up all label symbols for requested series. 1765 func (r *Reader) SeriesBy(id storage.SeriesRef, lbls *phlaremodel.Labels, chks *[]ChunkMeta, by ...string) (uint64, error) { 1766 offset := id 1767 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1768 // and the ID is the multiple of 16 of the actual position. 1769 if r.version == FormatV2 { 1770 offset = id * 16 1771 } 1772 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1773 if d.Err() != nil { 1774 return 0, d.Err() 1775 } 1776 1777 fprint, err := r.dec.Series(d.Get(), lbls, chks, true, by...) 1778 if err != nil { 1779 return 0, errors.Wrap(err, "read series") 1780 } 1781 return fprint, nil 1782 } 1783 1784 func (r *Reader) Postings(name string, shard *ShardAnnotation, values ...string) (Postings, error) { 1785 if r.version == FormatV1 { 1786 e, ok := r.postingsV1[name] 1787 if !ok { 1788 return EmptyPostings(), nil 1789 } 1790 res := make([]Postings, 0, len(values)) 1791 for _, v := range values { 1792 postingsOff, ok := e[v] 1793 if !ok { 1794 continue 1795 } 1796 // Read from the postings table. 1797 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)) 1798 _, p, err := r.dec.Postings(d.Get()) 1799 if err != nil { 1800 return nil, errors.Wrap(err, "decode postings") 1801 } 1802 res = append(res, p) 1803 } 1804 return Merge(res...), nil 1805 } 1806 1807 e, ok := r.postings[name] 1808 if !ok { 1809 return EmptyPostings(), nil 1810 } 1811 1812 if len(values) == 0 { 1813 return EmptyPostings(), nil 1814 } 1815 1816 res := make([]Postings, 0, len(values)) 1817 skip := 0 1818 valueIndex := 0 1819 for valueIndex < len(values) && values[valueIndex] < e[0].value { 1820 // Discard values before the start. 1821 valueIndex++ 1822 } 1823 for valueIndex < len(values) { 1824 value := values[valueIndex] 1825 1826 i := sort.Search(len(e), func(i int) bool { return e[i].value >= value }) 1827 if i == len(e) { 1828 // We're past the end. 1829 break 1830 } 1831 if i > 0 && e[i].value != value { 1832 // Need to look from previous entry. 1833 i-- 1834 } 1835 // Don't Crc32 the entire postings offset table, this is very slow 1836 // so hope any issues were caught at startup. 1837 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)) 1838 d.Skip(e[i].off) 1839 1840 // Iterate on the offset table. 1841 var postingsOff uint64 // The offset into the postings table. 1842 for d.Err() == nil { 1843 if skip == 0 { 1844 // These are always the same number of bytes, 1845 // and it's faster to skip than parse. 1846 skip = d.Len() 1847 d.Uvarint() // Keycount. 1848 d.UvarintBytes() // Label name. 1849 skip -= d.Len() 1850 } else { 1851 d.Skip(skip) 1852 } 1853 v := d.UvarintBytes() // Label value. 1854 postingsOff = d.Uvarint64() // Offset. 1855 for string(v) >= value { 1856 if string(v) == value { 1857 // Read from the postings table. 1858 d2 := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)) 1859 _, p, err := r.dec.Postings(d2.Get()) 1860 if err != nil { 1861 return nil, errors.Wrap(err, "decode postings") 1862 } 1863 res = append(res, p) 1864 } 1865 valueIndex++ 1866 if valueIndex == len(values) { 1867 break 1868 } 1869 value = values[valueIndex] 1870 } 1871 if i+1 == len(e) || value >= e[i+1].value || valueIndex == len(values) { 1872 // Need to go to a later postings offset entry, if there is one. 1873 break 1874 } 1875 } 1876 if d.Err() != nil { 1877 return nil, errors.Wrap(d.Err(), "get postings offset entry") 1878 } 1879 } 1880 1881 merged := Merge(res...) 1882 if shard != nil { 1883 return NewShardedPostings(merged, *shard, r.fingerprintOffsets), nil 1884 } 1885 1886 return merged, nil 1887 } 1888 1889 // Size returns the size of an index file. 1890 func (r *Reader) Size() int64 { 1891 return int64(r.b.Len()) 1892 } 1893 1894 // LabelNames returns all the unique label names present in the index. 1895 // TODO(twilkie) implement support for matchers 1896 func (r *Reader) LabelNames(matchers ...*labels.Matcher) ([]string, error) { 1897 if len(matchers) > 0 { 1898 return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers) 1899 } 1900 1901 labelNames := make([]string, 0, len(r.postings)) 1902 for name := range r.postings { 1903 if name == allPostingsKey.Name { 1904 // This is not from any metric. 1905 continue 1906 } 1907 labelNames = append(labelNames, name) 1908 } 1909 sort.Strings(labelNames) 1910 return labelNames, nil 1911 } 1912 1913 // Decoder provides decoding methods for the v1 and v2 index file format. 1914 // 1915 // It currently does not contain decoding methods for all entry types but can be extended 1916 // by them if there's demand. 1917 type Decoder struct { 1918 LookupSymbol func(uint32) (string, error) 1919 } 1920 1921 // Postings returns a postings list for b and its number of elements. 1922 func (dec *Decoder) Postings(b []byte) (int, Postings, error) { 1923 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1924 n := d.Be32int() 1925 l := d.Get() 1926 if d.Err() != nil { 1927 return 0, nil, d.Err() 1928 } 1929 if len(l) != 4*n { 1930 return 0, nil, fmt.Errorf("unexpected postings length, should be %d bytes for %d postings, got %d bytes", 4*n, n, len(l)) 1931 } 1932 return n, newBigEndianPostings(l), nil 1933 } 1934 1935 // LabelNamesOffsetsFor decodes the offsets of the name symbols for a given series. 1936 // They are returned in the same order they're stored, which should be sorted lexicographically. 1937 func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) { 1938 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1939 _ = d.Be64() // skip fingerprint 1940 k := d.Uvarint() 1941 1942 offsets := make([]uint32, k) 1943 for i := 0; i < k; i++ { 1944 offsets[i] = uint32(d.Uvarint()) 1945 _ = d.Uvarint() // skip the label value 1946 1947 if d.Err() != nil { 1948 return nil, errors.Wrap(d.Err(), "read series label offsets") 1949 } 1950 } 1951 1952 return offsets, d.Err() 1953 } 1954 1955 // LabelValueFor decodes a label for a given series. 1956 func (dec *Decoder) LabelValueFor(b []byte, label string) (string, error) { 1957 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1958 _ = d.Be64() // skip fingerprint 1959 k := d.Uvarint() 1960 1961 for i := 0; i < k; i++ { 1962 lno := uint32(d.Uvarint()) 1963 lvo := uint32(d.Uvarint()) 1964 1965 if d.Err() != nil { 1966 return "", errors.Wrap(d.Err(), "read series label offsets") 1967 } 1968 1969 ln, err := dec.LookupSymbol(lno) 1970 if err != nil { 1971 return "", errors.Wrap(err, "lookup label name") 1972 } 1973 1974 if ln == label { 1975 lv, err := dec.LookupSymbol(lvo) 1976 if err != nil { 1977 return "", errors.Wrap(err, "lookup label value") 1978 } 1979 1980 return lv, nil 1981 } 1982 } 1983 1984 return "", d.Err() 1985 } 1986 1987 // Series decodes a series entry from the given byte slice into lset and chks. 1988 func (dec *Decoder) Series(b []byte, lbls *phlaremodel.Labels, chks *[]ChunkMeta, group bool, by ...string) (uint64, error) { 1989 if lbls != nil { 1990 *lbls = (*lbls)[:0] 1991 } 1992 *chks = (*chks)[:0] 1993 1994 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1995 1996 fprint := d.Be64() 1997 k := d.Uvarint() 1998 1999 for i := 0; i < k; i++ { 2000 lno := uint32(d.Uvarint()) 2001 lvo := uint32(d.Uvarint()) 2002 2003 if d.Err() != nil { 2004 return 0, errors.Wrap(d.Err(), "read series label offsets") 2005 } 2006 if lbls == nil { 2007 continue 2008 } 2009 if group && len(by) == 0 { 2010 // If we're grouping by all labels, we don't need to decode them. 2011 continue 2012 } 2013 ln, err := dec.LookupSymbol(lno) 2014 if err != nil { 2015 return 0, errors.Wrap(err, "lookup label name") 2016 } 2017 if group { 2018 var found bool 2019 for _, b := range by { 2020 if b == ln { 2021 found = true 2022 break 2023 } 2024 } 2025 if !found { 2026 continue 2027 } 2028 } 2029 lv, err := dec.LookupSymbol(lvo) 2030 if err != nil { 2031 return 0, errors.Wrap(err, "lookup label value") 2032 } 2033 2034 *lbls = append(*lbls, &typesv1.LabelPair{Name: ln, Value: lv}) 2035 } 2036 2037 // Read the chunks meta data. 2038 k = d.Uvarint() 2039 2040 if k == 0 { 2041 return 0, d.Err() 2042 } 2043 2044 t0 := d.Varint64() 2045 maxt := int64(d.Uvarint64()) + t0 2046 kb := uint32(d.Uvarint()) 2047 entries := uint32(d.Uvarint64()) 2048 checksum := d.Be32() 2049 2050 *chks = append(*chks, ChunkMeta{ 2051 Checksum: checksum, 2052 MinTime: t0, 2053 MaxTime: maxt, 2054 KB: kb, 2055 SeriesIndex: entries, 2056 }) 2057 t0 = maxt 2058 2059 for i := 1; i < k; i++ { 2060 // Decode the diff against previous chunk as varint 2061 // instead of uvarint because chunks may overlap 2062 mint := d.Varint64() + t0 2063 maxt := int64(d.Uvarint64()) + mint 2064 kb := uint32(d.Uvarint()) 2065 entries := uint32(d.Uvarint64()) 2066 checksum := d.Be32() 2067 t0 = maxt 2068 2069 if d.Err() != nil { 2070 return 0, errors.Wrapf(d.Err(), "read meta for chunk %d", i) 2071 } 2072 2073 *chks = append(*chks, ChunkMeta{ 2074 Checksum: checksum, 2075 MinTime: mint, 2076 MaxTime: maxt, 2077 KB: kb, 2078 SeriesIndex: entries, 2079 }) 2080 } 2081 return fprint, d.Err() 2082 } 2083 2084 func yoloString(b []byte) string { 2085 return *((*string)(unsafe.Pointer(&b))) 2086 }