github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/storage/stores/tsdb/index/index.go (about) 1 // Copyright 2017 The Prometheus Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package index 15 16 import ( 17 "bufio" 18 "bytes" 19 "context" 20 "encoding/binary" 21 "fmt" 22 "hash" 23 "hash/crc32" 24 "io" 25 "io/ioutil" 26 "math" 27 "os" 28 "path/filepath" 29 "sort" 30 "unsafe" 31 32 "github.com/pkg/errors" 33 "github.com/prometheus/common/model" 34 "github.com/prometheus/prometheus/model/labels" 35 "github.com/prometheus/prometheus/storage" 36 tsdb_enc "github.com/prometheus/prometheus/tsdb/encoding" 37 "github.com/prometheus/prometheus/tsdb/fileutil" 38 39 "github.com/grafana/loki/pkg/util/encoding" 40 ) 41 42 const ( 43 // MagicIndex 4 bytes at the head of an index file. 44 MagicIndex = 0xBAAAD700 45 // HeaderLen represents number of bytes reserved of index for header. 46 HeaderLen = 5 47 48 // FormatV1 represents 1 version of index. 49 FormatV1 = 1 50 // FormatV2 represents 2 version of index. 51 FormatV2 = 2 52 53 IndexFilename = "index" 54 55 // store every 1024 series' fingerprints in the fingerprint offsets table 56 fingerprintInterval = 1 << 10 57 ) 58 59 type indexWriterStage uint8 60 61 const ( 62 idxStageNone indexWriterStage = iota 63 idxStageSymbols 64 idxStageSeries 65 idxStageDone 66 ) 67 68 func (s indexWriterStage) String() string { 69 switch s { 70 case idxStageNone: 71 return "none" 72 case idxStageSymbols: 73 return "symbols" 74 case idxStageSeries: 75 return "series" 76 case idxStageDone: 77 return "done" 78 } 79 return "<unknown>" 80 } 81 82 // The table gets initialized with sync.Once but may still cause a race 83 // with any other use of the crc32 package anywhere. Thus we initialize it 84 // before. 85 var castagnoliTable *crc32.Table 86 87 func init() { 88 castagnoliTable = crc32.MakeTable(crc32.Castagnoli) 89 } 90 91 // newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the 92 // polynomial may be easily changed in one location at a later time, if necessary. 93 func newCRC32() hash.Hash32 { 94 return crc32.New(castagnoliTable) 95 } 96 97 type symbolCacheEntry struct { 98 index uint32 99 lastValue string 100 lastValueIndex uint32 101 } 102 103 // Writer implements the IndexWriter interface for the standard 104 // serialization format. 105 type Writer struct { 106 ctx context.Context 107 108 // For the main index file. 109 f *FileWriter 110 111 // Temporary file for postings. 112 fP *FileWriter 113 // Temporary file for posting offsets table. 114 fPO *FileWriter 115 cntPO uint64 116 117 toc TOC 118 stage indexWriterStage 119 postingsStart uint64 // Due to padding, can differ from TOC entry. 120 121 // Reusable memory. 122 buf1 encoding.Encbuf 123 buf2 encoding.Encbuf 124 125 numSymbols int 126 symbols *Symbols 127 symbolFile *fileutil.MmapFile 128 lastSymbol string 129 symbolCache map[string]symbolCacheEntry 130 131 labelIndexes []labelIndexHashEntry // Label index offsets. 132 labelNames map[string]uint64 // Label names, and their usage. 133 // Keeps track of the fingerprint/offset for every n series 134 fingerprintOffsets FingerprintOffsets 135 136 // Hold last series to validate that clients insert new series in order. 137 lastSeries labels.Labels 138 lastSeriesHash uint64 139 lastRef storage.SeriesRef 140 141 crc32 hash.Hash 142 143 Version int 144 } 145 146 // TOC represents index Table Of Content that states where each section of index starts. 147 type TOC struct { 148 Symbols uint64 149 Series uint64 150 LabelIndices uint64 151 LabelIndicesTable uint64 152 Postings uint64 153 PostingsTable uint64 154 FingerprintOffsets uint64 155 Metadata Metadata 156 } 157 158 // Metadata is TSDB-level metadata 159 type Metadata struct { 160 From, Through int64 161 Checksum uint32 162 } 163 164 func (m *Metadata) EnsureBounds(from, through int64) { 165 if m.From == 0 || from < m.From { 166 m.From = from 167 } 168 169 if m.Through == 0 || through > m.Through { 170 m.Through = through 171 } 172 173 } 174 175 // NewTOCFromByteSlice return parsed TOC from given index byte slice. 176 func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) { 177 if bs.Len() < indexTOCLen { 178 return nil, tsdb_enc.ErrInvalidSize 179 } 180 b := bs.Range(bs.Len()-indexTOCLen, bs.Len()) 181 182 expCRC := binary.BigEndian.Uint32(b[len(b)-4:]) 183 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b[:len(b)-4]}) 184 if d.Crc32(castagnoliTable) != expCRC { 185 return nil, errors.Wrap(tsdb_enc.ErrInvalidChecksum, "read TOC") 186 } 187 188 if err := d.Err(); err != nil { 189 return nil, err 190 } 191 192 return &TOC{ 193 Symbols: d.Be64(), 194 Series: d.Be64(), 195 LabelIndices: d.Be64(), 196 LabelIndicesTable: d.Be64(), 197 Postings: d.Be64(), 198 PostingsTable: d.Be64(), 199 FingerprintOffsets: d.Be64(), 200 Metadata: Metadata{ 201 From: d.Be64int64(), 202 Through: d.Be64int64(), 203 Checksum: expCRC, 204 }, 205 }, nil 206 } 207 208 // NewWriter returns a new Writer to the given filename. It serializes data in format version 2. 209 func NewWriter(ctx context.Context, fn string) (*Writer, error) { 210 dir := filepath.Dir(fn) 211 212 df, err := fileutil.OpenDir(dir) 213 if err != nil { 214 return nil, err 215 } 216 defer df.Close() // Close for platform windows. 217 218 if err := os.RemoveAll(fn); err != nil { 219 return nil, errors.Wrap(err, "remove any existing index at path") 220 } 221 222 // Main index file we are building. 223 f, err := NewFileWriter(fn) 224 if err != nil { 225 return nil, err 226 } 227 // Temporary file for postings. 228 fP, err := NewFileWriter(fn + "_tmp_p") 229 if err != nil { 230 return nil, err 231 } 232 // Temporary file for posting offset table. 233 fPO, err := NewFileWriter(fn + "_tmp_po") 234 if err != nil { 235 return nil, err 236 } 237 if err := df.Sync(); err != nil { 238 return nil, errors.Wrap(err, "sync dir") 239 } 240 241 iw := &Writer{ 242 ctx: ctx, 243 f: f, 244 fP: fP, 245 fPO: fPO, 246 stage: idxStageNone, 247 248 // Reusable memory. 249 buf1: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, 1<<22)}), 250 buf2: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, 1<<22)}), 251 252 symbolCache: make(map[string]symbolCacheEntry, 1<<8), 253 labelNames: make(map[string]uint64, 1<<8), 254 crc32: newCRC32(), 255 } 256 if err := iw.writeMeta(); err != nil { 257 return nil, err 258 } 259 return iw, nil 260 } 261 262 func (w *Writer) write(bufs ...[]byte) error { 263 return w.f.Write(bufs...) 264 } 265 266 func (w *Writer) writeAt(buf []byte, pos uint64) error { 267 return w.f.WriteAt(buf, pos) 268 } 269 270 func (w *Writer) addPadding(size int) error { 271 return w.f.AddPadding(size) 272 } 273 274 type FileWriter struct { 275 f *os.File 276 fbuf *bufio.Writer 277 pos uint64 278 name string 279 } 280 281 func NewFileWriter(name string) (*FileWriter, error) { 282 f, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o666) 283 if err != nil { 284 return nil, err 285 } 286 return &FileWriter{ 287 f: f, 288 fbuf: bufio.NewWriterSize(f, 1<<22), 289 pos: 0, 290 name: name, 291 }, nil 292 } 293 294 func (fw *FileWriter) Pos() uint64 { 295 return fw.pos 296 } 297 298 func (fw *FileWriter) Write(bufs ...[]byte) error { 299 for _, b := range bufs { 300 n, err := fw.fbuf.Write(b) 301 fw.pos += uint64(n) 302 if err != nil { 303 return err 304 } 305 // For now the index file must not grow beyond 64GiB. Some of the fixed-sized 306 // offset references in v1 are only 4 bytes large. 307 // Once we move to compressed/varint representations in those areas, this limitation 308 // can be lifted. 309 if fw.pos > 16*math.MaxUint32 { 310 return errors.Errorf("%q exceeding max size of 64GiB", fw.name) 311 } 312 } 313 return nil 314 } 315 316 func (fw *FileWriter) Flush() error { 317 return fw.fbuf.Flush() 318 } 319 320 func (fw *FileWriter) WriteAt(buf []byte, pos uint64) error { 321 if err := fw.Flush(); err != nil { 322 return err 323 } 324 _, err := fw.f.WriteAt(buf, int64(pos)) 325 return err 326 } 327 328 // AddPadding adds zero byte padding until the file size is a multiple size. 329 func (fw *FileWriter) AddPadding(size int) error { 330 p := fw.pos % uint64(size) 331 if p == 0 { 332 return nil 333 } 334 p = uint64(size) - p 335 336 if err := fw.Write(make([]byte, p)); err != nil { 337 return errors.Wrap(err, "add padding") 338 } 339 return nil 340 } 341 342 func (fw *FileWriter) Close() error { 343 if err := fw.Flush(); err != nil { 344 return err 345 } 346 if err := fw.f.Sync(); err != nil { 347 return err 348 } 349 return fw.f.Close() 350 } 351 352 func (fw *FileWriter) Remove() error { 353 return os.Remove(fw.name) 354 } 355 356 // ensureStage handles transitions between write stages and ensures that IndexWriter 357 // methods are called in an order valid for the implementation. 358 func (w *Writer) ensureStage(s indexWriterStage) error { 359 select { 360 case <-w.ctx.Done(): 361 return w.ctx.Err() 362 default: 363 } 364 365 if w.stage == s { 366 return nil 367 } 368 if w.stage < s-1 { 369 // A stage has been skipped. 370 if err := w.ensureStage(s - 1); err != nil { 371 return err 372 } 373 } 374 if w.stage > s { 375 return errors.Errorf("invalid stage %q, currently at %q", s, w.stage) 376 } 377 378 // Mark start of sections in table of contents. 379 switch s { 380 case idxStageSymbols: 381 w.toc.Symbols = w.f.pos 382 if err := w.startSymbols(); err != nil { 383 return err 384 } 385 case idxStageSeries: 386 if err := w.finishSymbols(); err != nil { 387 return err 388 } 389 w.toc.Series = w.f.pos 390 391 case idxStageDone: 392 w.toc.LabelIndices = w.f.pos 393 // LabelIndices generation depends on the posting offset 394 // table produced at this stage. 395 if err := w.writePostingsToTmpFiles(); err != nil { 396 return err 397 } 398 if err := w.writeLabelIndices(); err != nil { 399 return err 400 } 401 402 w.toc.Postings = w.f.pos 403 if err := w.writePostings(); err != nil { 404 return err 405 } 406 407 w.toc.LabelIndicesTable = w.f.pos 408 if err := w.writeLabelIndexesOffsetTable(); err != nil { 409 return err 410 } 411 412 w.toc.PostingsTable = w.f.pos 413 if err := w.writePostingsOffsetTable(); err != nil { 414 return err 415 } 416 417 w.toc.FingerprintOffsets = w.f.pos 418 if err := w.writeFingerprintOffsetsTable(); err != nil { 419 return err 420 } 421 422 if err := w.writeTOC(); err != nil { 423 return err 424 } 425 } 426 427 w.stage = s 428 return nil 429 } 430 431 func (w *Writer) writeMeta() error { 432 w.buf1.Reset() 433 w.buf1.PutBE32(MagicIndex) 434 w.buf1.PutByte(FormatV2) 435 436 return w.write(w.buf1.Get()) 437 } 438 439 // AddSeries adds the series one at a time along with its chunks. 440 // Requires a specific fingerprint to be passed in the case where the "desired" 441 // fingerprint differs from what labels.Hash() produces. For example, 442 // multitenant TSDBs embed a tenant label, but the actual series has no such 443 // label and so the derived fingerprint differs. 444 func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, fp model.Fingerprint, chunks ...ChunkMeta) error { 445 if err := w.ensureStage(idxStageSeries); err != nil { 446 return err 447 } 448 449 // Put the supplied fingerprint instead of the calculated hash. 450 // This allows us to have a synthetic label (__loki_tenant__) in 451 // the pre-compacted TSDBs which map to fingerprints (and chunks) 452 // without this label in storage. 453 labelHash := uint64(fp) 454 455 lastHash := w.lastSeriesHash 456 // Ensure series are sorted by the priorities: [`hash(labels)`, `labels`] 457 if (labelHash < lastHash && len(w.lastSeries) > 0) || labelHash == lastHash && labels.Compare(lset, w.lastSeries) < 0 { 458 return errors.Errorf("out-of-order series added with label set %q", lset) 459 } 460 461 if ref < w.lastRef && len(w.lastSeries) != 0 { 462 return errors.Errorf("series with reference greater than %d already added", ref) 463 } 464 // We add padding to 16 bytes to increase the addressable space we get through 4 byte 465 // series references. 466 if err := w.addPadding(16); err != nil { 467 return errors.Errorf("failed to write padding bytes: %v", err) 468 } 469 470 if w.f.pos%16 != 0 { 471 return errors.Errorf("series write not 16-byte aligned at %d", w.f.pos) 472 } 473 474 w.buf2.Reset() 475 w.buf2.PutBE64(labelHash) 476 w.buf2.PutUvarint(len(lset)) 477 478 for _, l := range lset { 479 var err error 480 cacheEntry, ok := w.symbolCache[l.Name] 481 nameIndex := cacheEntry.index 482 if !ok { 483 nameIndex, err = w.symbols.ReverseLookup(l.Name) 484 if err != nil { 485 return errors.Errorf("symbol entry for %q does not exist, %v", l.Name, err) 486 } 487 } 488 w.labelNames[l.Name]++ 489 w.buf2.PutUvarint32(nameIndex) 490 491 valueIndex := cacheEntry.lastValueIndex 492 if !ok || cacheEntry.lastValue != l.Value { 493 valueIndex, err = w.symbols.ReverseLookup(l.Value) 494 if err != nil { 495 return errors.Errorf("symbol entry for %q does not exist, %v", l.Value, err) 496 } 497 w.symbolCache[l.Name] = symbolCacheEntry{ 498 index: nameIndex, 499 lastValue: l.Value, 500 lastValueIndex: valueIndex, 501 } 502 } 503 w.buf2.PutUvarint32(valueIndex) 504 } 505 506 w.buf2.PutUvarint(len(chunks)) 507 508 if len(chunks) > 0 { 509 c := chunks[0] 510 w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime) 511 512 w.buf2.PutVarint64(c.MinTime) 513 w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime)) 514 w.buf2.PutUvarint32(c.KB) 515 w.buf2.PutUvarint32(c.Entries) 516 w.buf2.PutBE32(c.Checksum) 517 t0 := c.MaxTime 518 519 for _, c := range chunks[1:] { 520 w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime) 521 // Encode the diff against previous chunk as varint 522 // instead of uvarint because chunks may overlap 523 w.buf2.PutVarint64(c.MinTime - t0) 524 w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime)) 525 w.buf2.PutUvarint32(c.KB) 526 w.buf2.PutUvarint32(c.Entries) 527 t0 = c.MaxTime 528 529 w.buf2.PutBE32(c.Checksum) 530 } 531 } 532 533 w.buf1.Reset() 534 w.buf1.PutUvarint(w.buf2.Len()) 535 536 w.buf2.PutHash(w.crc32) 537 538 w.lastSeries = append(w.lastSeries[:0], lset...) 539 w.lastSeriesHash = labelHash 540 w.lastRef = ref 541 542 if ref%fingerprintInterval == 0 { 543 // series references are the 16-byte aligned offsets 544 // Do NOT ask me how long I debugged this particular bit >:O 545 sRef := w.f.pos / 16 546 w.fingerprintOffsets = append(w.fingerprintOffsets, [2]uint64{sRef, labelHash}) 547 } 548 549 if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil { 550 return errors.Wrap(err, "write series data") 551 } 552 553 return nil 554 } 555 556 func (w *Writer) startSymbols() error { 557 // We are at w.toc.Symbols. 558 // Leave 4 bytes of space for the length, and another 4 for the number of symbols 559 // which will both be calculated later. 560 return w.write([]byte("alenblen")) 561 } 562 563 func (w *Writer) AddSymbol(sym string) error { 564 if err := w.ensureStage(idxStageSymbols); err != nil { 565 return err 566 } 567 if w.numSymbols != 0 && sym <= w.lastSymbol { 568 return errors.Errorf("symbol %q out-of-order", sym) 569 } 570 w.lastSymbol = sym 571 w.numSymbols++ 572 w.buf1.Reset() 573 w.buf1.PutUvarintStr(sym) 574 return w.write(w.buf1.Get()) 575 } 576 577 func (w *Writer) finishSymbols() error { 578 symbolTableSize := w.f.pos - w.toc.Symbols - 4 579 // The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1 580 if symbolTableSize > math.MaxUint32 { 581 return errors.Errorf("symbol table size exceeds 4 bytes: %d", symbolTableSize) 582 } 583 584 // Write out the length and symbol count. 585 w.buf1.Reset() 586 w.buf1.PutBE32int(int(symbolTableSize)) 587 w.buf1.PutBE32int(w.numSymbols) 588 if err := w.writeAt(w.buf1.Get(), w.toc.Symbols); err != nil { 589 return err 590 } 591 592 hashPos := w.f.pos 593 // Leave space for the hash. We can only calculate it 594 // now that the number of symbols is known, so mmap and do it from there. 595 if err := w.write([]byte("hash")); err != nil { 596 return err 597 } 598 if err := w.f.Flush(); err != nil { 599 return err 600 } 601 602 sf, err := fileutil.OpenMmapFile(w.f.name) 603 if err != nil { 604 return err 605 } 606 w.symbolFile = sf 607 hash := crc32.Checksum(w.symbolFile.Bytes()[w.toc.Symbols+4:hashPos], castagnoliTable) 608 w.buf1.Reset() 609 w.buf1.PutBE32(hash) 610 if err := w.writeAt(w.buf1.Get(), hashPos); err != nil { 611 return err 612 } 613 614 // Load in the symbol table efficiently for the rest of the index writing. 615 w.symbols, err = NewSymbols(RealByteSlice(w.symbolFile.Bytes()), FormatV2, int(w.toc.Symbols)) 616 if err != nil { 617 return errors.Wrap(err, "read symbols") 618 } 619 return nil 620 } 621 622 func (w *Writer) writeLabelIndices() error { 623 if err := w.fPO.Flush(); err != nil { 624 return err 625 } 626 627 // Find all the label values in the tmp posting offset table. 628 f, err := fileutil.OpenMmapFile(w.fPO.name) 629 if err != nil { 630 return err 631 } 632 defer f.Close() 633 634 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.fPO.pos))) 635 cnt := w.cntPO 636 current := []byte{} 637 values := []uint32{} 638 for d.Err() == nil && cnt > 0 { 639 cnt-- 640 d.Uvarint() // Keycount. 641 name := d.UvarintBytes() // Label name. 642 value := yoloString(d.UvarintBytes()) // Label value. 643 d.Uvarint64() // Offset. 644 if len(name) == 0 { 645 continue // All index is ignored. 646 } 647 648 if !bytes.Equal(name, current) && len(values) > 0 { 649 // We've reached a new label name. 650 if err := w.writeLabelIndex(string(current), values); err != nil { 651 return err 652 } 653 values = values[:0] 654 } 655 current = name 656 sid, err := w.symbols.ReverseLookup(value) 657 if err != nil { 658 return err 659 } 660 values = append(values, sid) 661 } 662 if d.Err() != nil { 663 return d.Err() 664 } 665 666 // Handle the last label. 667 if len(values) > 0 { 668 if err := w.writeLabelIndex(string(current), values); err != nil { 669 return err 670 } 671 } 672 return nil 673 } 674 675 func (w *Writer) writeLabelIndex(name string, values []uint32) error { 676 // Align beginning to 4 bytes for more efficient index list scans. 677 if err := w.addPadding(4); err != nil { 678 return err 679 } 680 681 w.labelIndexes = append(w.labelIndexes, labelIndexHashEntry{ 682 keys: []string{name}, 683 offset: w.f.pos, 684 }) 685 686 startPos := w.f.pos 687 // Leave 4 bytes of space for the length, which will be calculated later. 688 if err := w.write([]byte("alen")); err != nil { 689 return err 690 } 691 w.crc32.Reset() 692 693 w.buf1.Reset() 694 w.buf1.PutBE32int(1) // Number of names. 695 w.buf1.PutBE32int(len(values)) 696 w.buf1.WriteToHash(w.crc32) 697 if err := w.write(w.buf1.Get()); err != nil { 698 return err 699 } 700 701 for _, v := range values { 702 w.buf1.Reset() 703 w.buf1.PutBE32(v) 704 w.buf1.WriteToHash(w.crc32) 705 if err := w.write(w.buf1.Get()); err != nil { 706 return err 707 } 708 } 709 710 // Write out the length. 711 w.buf1.Reset() 712 l := w.f.pos - startPos - 4 713 if l > math.MaxUint32 { 714 return errors.Errorf("label index size exceeds 4 bytes: %d", l) 715 } 716 w.buf1.PutBE32int(int(l)) 717 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 718 return err 719 } 720 721 w.buf1.Reset() 722 w.buf1.PutHashSum(w.crc32) 723 return w.write(w.buf1.Get()) 724 } 725 726 // writeLabelIndexesOffsetTable writes the label indices offset table. 727 func (w *Writer) writeLabelIndexesOffsetTable() error { 728 startPos := w.f.pos 729 // Leave 4 bytes of space for the length, which will be calculated later. 730 if err := w.write([]byte("alen")); err != nil { 731 return err 732 } 733 w.crc32.Reset() 734 735 w.buf1.Reset() 736 w.buf1.PutBE32int(len(w.labelIndexes)) 737 w.buf1.WriteToHash(w.crc32) 738 if err := w.write(w.buf1.Get()); err != nil { 739 return err 740 } 741 742 for _, e := range w.labelIndexes { 743 w.buf1.Reset() 744 w.buf1.PutUvarint(len(e.keys)) 745 for _, k := range e.keys { 746 w.buf1.PutUvarintStr(k) 747 } 748 w.buf1.PutUvarint64(e.offset) 749 w.buf1.WriteToHash(w.crc32) 750 if err := w.write(w.buf1.Get()); err != nil { 751 return err 752 } 753 } 754 // Write out the length. 755 w.buf1.Reset() 756 l := w.f.pos - startPos - 4 757 if l > math.MaxUint32 { 758 return errors.Errorf("label indexes offset table size exceeds 4 bytes: %d", l) 759 } 760 w.buf1.PutBE32int(int(l)) 761 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 762 return err 763 } 764 765 w.buf1.Reset() 766 w.buf1.PutHashSum(w.crc32) 767 return w.write(w.buf1.Get()) 768 } 769 770 // writePostingsOffsetTable writes the postings offset table. 771 func (w *Writer) writePostingsOffsetTable() error { 772 // Ensure everything is in the temporary file. 773 if err := w.fPO.Flush(); err != nil { 774 return err 775 } 776 777 startPos := w.f.pos 778 // Leave 4 bytes of space for the length, which will be calculated later. 779 if err := w.write([]byte("alen")); err != nil { 780 return err 781 } 782 783 // Copy over the tmp posting offset table, however we need to 784 // adjust the offsets. 785 adjustment := w.postingsStart 786 787 w.buf1.Reset() 788 w.crc32.Reset() 789 w.buf1.PutBE32int(int(w.cntPO)) // Count. 790 w.buf1.WriteToHash(w.crc32) 791 if err := w.write(w.buf1.Get()); err != nil { 792 return err 793 } 794 795 f, err := fileutil.OpenMmapFile(w.fPO.name) 796 if err != nil { 797 return err 798 } 799 defer func() { 800 if f != nil { 801 f.Close() 802 } 803 }() 804 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.fPO.pos))) 805 cnt := w.cntPO 806 for d.Err() == nil && cnt > 0 { 807 w.buf1.Reset() 808 w.buf1.PutUvarint(d.Uvarint()) // Keycount. 809 w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label name. 810 w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label value. 811 w.buf1.PutUvarint64(d.Uvarint64() + adjustment) // Offset. 812 w.buf1.WriteToHash(w.crc32) 813 if err := w.write(w.buf1.Get()); err != nil { 814 return err 815 } 816 cnt-- 817 } 818 if d.Err() != nil { 819 return d.Err() 820 } 821 822 // Cleanup temporary file. 823 if err := f.Close(); err != nil { 824 return err 825 } 826 f = nil 827 if err := w.fPO.Close(); err != nil { 828 return err 829 } 830 if err := w.fPO.Remove(); err != nil { 831 return err 832 } 833 w.fPO = nil 834 835 // Write out the length. 836 w.buf1.Reset() 837 l := w.f.pos - startPos - 4 838 if l > math.MaxUint32 { 839 return errors.Errorf("postings offset table size exceeds 4 bytes: %d", l) 840 } 841 w.buf1.PutBE32int(int(l)) 842 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 843 return err 844 } 845 846 // Finally write the hash. 847 w.buf1.Reset() 848 w.buf1.PutHashSum(w.crc32) 849 return w.write(w.buf1.Get()) 850 } 851 852 func (w *Writer) writeFingerprintOffsetsTable() error { 853 w.buf1.Reset() 854 w.buf2.Reset() 855 856 w.buf1.PutBE32int(len(w.fingerprintOffsets)) // Count. 857 // build offsets 858 for _, x := range w.fingerprintOffsets { 859 w.buf1.PutBE64(x[0]) // series offset 860 w.buf1.PutBE64(x[1]) // hash 861 } 862 863 // write length 864 ln := w.buf1.Len() 865 // TODO(owen-d): can remove the uint32 cast in the future 866 // Had to uint32 wrap these for arm32 builds, which we'll remove in the future. 867 if uint32(ln) > uint32(math.MaxUint32) { 868 return errors.Errorf("fingerprint offset size exceeds 4 bytes: %d", ln) 869 } 870 871 w.buf2.PutBE32int(ln) 872 if err := w.write(w.buf2.Get()); err != nil { 873 return err 874 } 875 876 // write offsets+checksum 877 w.buf1.PutHash(w.crc32) 878 if err := w.write(w.buf1.Get()); err != nil { 879 return errors.Wrap(err, "failure writing fingerprint offsets") 880 } 881 return nil 882 } 883 884 const indexTOCLen = 8*9 + crc32.Size 885 886 func (w *Writer) writeTOC() error { 887 w.buf1.Reset() 888 889 w.buf1.PutBE64(w.toc.Symbols) 890 w.buf1.PutBE64(w.toc.Series) 891 w.buf1.PutBE64(w.toc.LabelIndices) 892 w.buf1.PutBE64(w.toc.LabelIndicesTable) 893 w.buf1.PutBE64(w.toc.Postings) 894 w.buf1.PutBE64(w.toc.PostingsTable) 895 w.buf1.PutBE64(w.toc.FingerprintOffsets) 896 897 // metadata 898 w.buf1.PutBE64int64(w.toc.Metadata.From) 899 w.buf1.PutBE64int64(w.toc.Metadata.Through) 900 901 w.buf1.PutHash(w.crc32) 902 903 return w.write(w.buf1.Get()) 904 } 905 906 func (w *Writer) writePostingsToTmpFiles() error { 907 names := make([]string, 0, len(w.labelNames)) 908 for n := range w.labelNames { 909 names = append(names, n) 910 } 911 sort.Strings(names) 912 913 if err := w.f.Flush(); err != nil { 914 return err 915 } 916 f, err := fileutil.OpenMmapFile(w.f.name) 917 if err != nil { 918 return err 919 } 920 defer f.Close() 921 922 // Write out the special all posting. 923 offsets := []uint32{} 924 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.toc.LabelIndices))) 925 d.Skip(int(w.toc.Series)) 926 for d.Len() > 0 { 927 d.ConsumePadding() 928 startPos := w.toc.LabelIndices - uint64(d.Len()) 929 if startPos%16 != 0 { 930 return errors.Errorf("series not 16-byte aligned at %d", startPos) 931 } 932 offsets = append(offsets, uint32(startPos/16)) 933 // Skip to next series. 934 x := d.Uvarint() 935 d.Skip(x + crc32.Size) 936 if err := d.Err(); err != nil { 937 return err 938 } 939 } 940 if err := w.writePosting("", "", offsets); err != nil { 941 return err 942 } 943 maxPostings := uint64(len(offsets)) // No label name can have more postings than this. 944 945 for len(names) > 0 { 946 batchNames := []string{} 947 var c uint64 948 // Try to bunch up label names into one loop, but avoid 949 // using more memory than a single label name can. 950 for len(names) > 0 { 951 if w.labelNames[names[0]]+c > maxPostings { 952 break 953 } 954 batchNames = append(batchNames, names[0]) 955 c += w.labelNames[names[0]] 956 names = names[1:] 957 } 958 959 nameSymbols := map[uint32]string{} 960 for _, name := range batchNames { 961 sid, err := w.symbols.ReverseLookup(name) 962 if err != nil { 963 return err 964 } 965 nameSymbols[sid] = name 966 } 967 // Label name -> label value -> positions. 968 postings := map[uint32]map[uint32][]uint32{} 969 970 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.toc.LabelIndices))) 971 d.Skip(int(w.toc.Series)) 972 for d.Len() > 0 { 973 d.ConsumePadding() 974 startPos := w.toc.LabelIndices - uint64(d.Len()) 975 l := d.Uvarint() // Length of this series in bytes. 976 startLen := d.Len() 977 978 _ = d.Be64() // skip fingerprint 979 // See if label names we want are in the series. 980 numLabels := d.Uvarint() 981 for i := 0; i < numLabels; i++ { 982 lno := uint32(d.Uvarint()) 983 lvo := uint32(d.Uvarint()) 984 985 if _, ok := nameSymbols[lno]; ok { 986 if _, ok := postings[lno]; !ok { 987 postings[lno] = map[uint32][]uint32{} 988 } 989 postings[lno][lvo] = append(postings[lno][lvo], uint32(startPos/16)) 990 } 991 } 992 // Skip to next series. 993 d.Skip(l - (startLen - d.Len()) + crc32.Size) 994 if err := d.Err(); err != nil { 995 return err 996 } 997 } 998 999 for _, name := range batchNames { 1000 // Write out postings for this label name. 1001 sid, err := w.symbols.ReverseLookup(name) 1002 if err != nil { 1003 return err 1004 } 1005 values := make([]uint32, 0, len(postings[sid])) 1006 for v := range postings[sid] { 1007 values = append(values, v) 1008 } 1009 // Symbol numbers are in order, so the strings will also be in order. 1010 sort.Sort(uint32slice(values)) 1011 for _, v := range values { 1012 value, err := w.symbols.Lookup(v) 1013 if err != nil { 1014 return err 1015 } 1016 if err := w.writePosting(name, value, postings[sid][v]); err != nil { 1017 return err 1018 } 1019 } 1020 } 1021 select { 1022 case <-w.ctx.Done(): 1023 return w.ctx.Err() 1024 default: 1025 } 1026 1027 } 1028 return nil 1029 } 1030 1031 func (w *Writer) writePosting(name, value string, offs []uint32) error { 1032 // Align beginning to 4 bytes for more efficient postings list scans. 1033 if err := w.fP.AddPadding(4); err != nil { 1034 return err 1035 } 1036 1037 // Write out postings offset table to temporary file as we go. 1038 w.buf1.Reset() 1039 w.buf1.PutUvarint(2) 1040 w.buf1.PutUvarintStr(name) 1041 w.buf1.PutUvarintStr(value) 1042 w.buf1.PutUvarint64(w.fP.pos) // This is relative to the postings tmp file, not the final index file. 1043 if err := w.fPO.Write(w.buf1.Get()); err != nil { 1044 return err 1045 } 1046 w.cntPO++ 1047 1048 w.buf1.Reset() 1049 w.buf1.PutBE32int(len(offs)) 1050 1051 for _, off := range offs { 1052 if off > (1<<32)-1 { 1053 return errors.Errorf("series offset %d exceeds 4 bytes", off) 1054 } 1055 w.buf1.PutBE32(off) 1056 } 1057 1058 w.buf2.Reset() 1059 l := w.buf1.Len() 1060 // We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there. 1061 if uint(l) > math.MaxUint32 { 1062 return errors.Errorf("posting size exceeds 4 bytes: %d", l) 1063 } 1064 w.buf2.PutBE32int(l) 1065 w.buf1.PutHash(w.crc32) 1066 return w.fP.Write(w.buf2.Get(), w.buf1.Get()) 1067 } 1068 1069 func (w *Writer) writePostings() error { 1070 // There's padding in the tmp file, make sure it actually works. 1071 if err := w.f.AddPadding(4); err != nil { 1072 return err 1073 } 1074 w.postingsStart = w.f.pos 1075 1076 // Copy temporary file into main index. 1077 if err := w.fP.Flush(); err != nil { 1078 return err 1079 } 1080 if _, err := w.fP.f.Seek(0, 0); err != nil { 1081 return err 1082 } 1083 // Don't need to calculate a checksum, so can copy directly. 1084 n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, make([]byte, 1<<20)) 1085 if err != nil { 1086 return err 1087 } 1088 if uint64(n) != w.fP.pos { 1089 return errors.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n) 1090 } 1091 w.f.pos += uint64(n) 1092 1093 if err := w.fP.Close(); err != nil { 1094 return err 1095 } 1096 if err := w.fP.Remove(); err != nil { 1097 return err 1098 } 1099 w.fP = nil 1100 return nil 1101 } 1102 1103 type uint32slice []uint32 1104 1105 func (s uint32slice) Len() int { return len(s) } 1106 func (s uint32slice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 1107 func (s uint32slice) Less(i, j int) bool { return s[i] < s[j] } 1108 1109 type labelIndexHashEntry struct { 1110 keys []string 1111 offset uint64 1112 } 1113 1114 func (w *Writer) Close() error { 1115 // Even if this fails, we need to close all the files. 1116 ensureErr := w.ensureStage(idxStageDone) 1117 1118 if w.symbolFile != nil { 1119 if err := w.symbolFile.Close(); err != nil { 1120 return err 1121 } 1122 } 1123 if w.fP != nil { 1124 if err := w.fP.Close(); err != nil { 1125 return err 1126 } 1127 } 1128 if w.fPO != nil { 1129 if err := w.fPO.Close(); err != nil { 1130 return err 1131 } 1132 } 1133 if err := w.f.Close(); err != nil { 1134 return err 1135 } 1136 return ensureErr 1137 } 1138 1139 // StringIter iterates over a sorted list of strings. 1140 type StringIter interface { 1141 // Next advances the iterator and returns true if another value was found. 1142 Next() bool 1143 1144 // At returns the value at the current iterator position. 1145 At() string 1146 1147 // Err returns the last error of the iterator. 1148 Err() error 1149 } 1150 1151 type Reader struct { 1152 b ByteSlice 1153 toc *TOC 1154 1155 // Close that releases the underlying resources of the byte slice. 1156 c io.Closer 1157 1158 // Map of LabelName to a list of some LabelValues's position in the offset table. 1159 // The first and last values for each name are always present. 1160 postings map[string][]postingOffset 1161 // For the v1 format, labelname -> labelvalue -> offset. 1162 postingsV1 map[string]map[string]uint64 1163 1164 symbols *Symbols 1165 nameSymbols map[uint32]string // Cache of the label name symbol lookups, 1166 // as there are not many and they are half of all lookups. 1167 1168 fingerprintOffsets FingerprintOffsets 1169 1170 dec *Decoder 1171 1172 version int 1173 } 1174 1175 type postingOffset struct { 1176 value string 1177 off int 1178 } 1179 1180 // ByteSlice abstracts a byte slice. 1181 type ByteSlice interface { 1182 Len() int 1183 Range(start, end int) []byte 1184 } 1185 1186 type RealByteSlice []byte 1187 1188 func (b RealByteSlice) Len() int { 1189 return len(b) 1190 } 1191 1192 func (b RealByteSlice) Range(start, end int) []byte { 1193 return b[start:end] 1194 } 1195 1196 func (b RealByteSlice) Sub(start, end int) ByteSlice { 1197 return b[start:end] 1198 } 1199 1200 // NewReader returns a new index reader on the given byte slice. It automatically 1201 // handles different format versions. 1202 func NewReader(b ByteSlice) (*Reader, error) { 1203 return newReader(b, ioutil.NopCloser(nil)) 1204 } 1205 1206 type nopCloser struct{} 1207 1208 func (nopCloser) Close() error { return nil } 1209 1210 // NewFileReader returns a new index reader against the given index file. 1211 func NewFileReader(path string) (*Reader, error) { 1212 b, err := ioutil.ReadFile(path) 1213 if err != nil { 1214 return nil, err 1215 } 1216 r, err := newReader(RealByteSlice(b), nopCloser{}) 1217 if err != nil { 1218 return r, err 1219 } 1220 1221 return r, nil 1222 } 1223 1224 func newReader(b ByteSlice, c io.Closer) (*Reader, error) { 1225 r := &Reader{ 1226 b: b, 1227 c: c, 1228 postings: map[string][]postingOffset{}, 1229 } 1230 1231 // Verify header. 1232 if r.b.Len() < HeaderLen { 1233 return nil, errors.Wrap(tsdb_enc.ErrInvalidSize, "index header") 1234 } 1235 if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex { 1236 return nil, errors.Errorf("invalid magic number %x", m) 1237 } 1238 r.version = int(r.b.Range(4, 5)[0]) 1239 1240 if r.version != FormatV1 && r.version != FormatV2 { 1241 return nil, errors.Errorf("unknown index file version %d", r.version) 1242 } 1243 1244 var err error 1245 r.toc, err = NewTOCFromByteSlice(b) 1246 if err != nil { 1247 return nil, errors.Wrap(err, "read TOC") 1248 } 1249 1250 r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols)) 1251 if err != nil { 1252 return nil, errors.Wrap(err, "read symbols") 1253 } 1254 1255 if r.version == FormatV1 { 1256 // Earlier V1 formats don't have a sorted postings offset table, so 1257 // load the whole offset table into memory. 1258 r.postingsV1 = map[string]map[string]uint64{} 1259 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error { 1260 if len(key) != 2 { 1261 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1262 } 1263 if _, ok := r.postingsV1[key[0]]; !ok { 1264 r.postingsV1[key[0]] = map[string]uint64{} 1265 r.postings[key[0]] = nil // Used to get a list of labelnames in places. 1266 } 1267 r.postingsV1[key[0]][key[1]] = off 1268 return nil 1269 }); err != nil { 1270 return nil, errors.Wrap(err, "read postings table") 1271 } 1272 } else { 1273 var lastKey []string 1274 lastOff := 0 1275 valueCount := 0 1276 // For the postings offset table we keep every label name but only every nth 1277 // label value (plus the first and last one), to save memory. 1278 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, _ uint64, off int) error { 1279 if len(key) != 2 { 1280 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1281 } 1282 if _, ok := r.postings[key[0]]; !ok { 1283 // Next label name. 1284 r.postings[key[0]] = []postingOffset{} 1285 if lastKey != nil { 1286 // Always include last value for each label name. 1287 r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff}) 1288 } 1289 lastKey = nil 1290 valueCount = 0 1291 } 1292 if valueCount%symbolFactor == 0 { 1293 r.postings[key[0]] = append(r.postings[key[0]], postingOffset{value: key[1], off: off}) 1294 lastKey = nil 1295 } else { 1296 lastKey = key 1297 lastOff = off 1298 } 1299 valueCount++ 1300 return nil 1301 }); err != nil { 1302 return nil, errors.Wrap(err, "read postings table") 1303 } 1304 if lastKey != nil { 1305 r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff}) 1306 } 1307 // Trim any extra space in the slices. 1308 for k, v := range r.postings { 1309 l := make([]postingOffset, len(v)) 1310 copy(l, v) 1311 r.postings[k] = l 1312 } 1313 } 1314 1315 r.nameSymbols = make(map[uint32]string, len(r.postings)) 1316 for k := range r.postings { 1317 if k == "" { 1318 continue 1319 } 1320 off, err := r.symbols.ReverseLookup(k) 1321 if err != nil { 1322 return nil, errors.Wrap(err, "reverse symbol lookup") 1323 } 1324 r.nameSymbols[off] = k 1325 } 1326 1327 r.fingerprintOffsets, err = readFingerprintOffsetsTable(r.b, r.toc.FingerprintOffsets) 1328 if err != nil { 1329 return nil, errors.Wrap(err, "loading fingerprint offsets") 1330 } 1331 1332 r.dec = &Decoder{LookupSymbol: r.lookupSymbol} 1333 1334 return r, nil 1335 } 1336 1337 // Version returns the file format version of the underlying index. 1338 func (r *Reader) Version() int { 1339 return r.version 1340 } 1341 1342 // Range marks a byte range. 1343 type Range struct { 1344 Start, End int64 1345 } 1346 1347 // PostingsRanges returns a new map of byte range in the underlying index file 1348 // for all postings lists. 1349 func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) { 1350 m := map[labels.Label]Range{} 1351 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error { 1352 if len(key) != 2 { 1353 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1354 } 1355 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(off), castagnoliTable)) 1356 if d.Err() != nil { 1357 return d.Err() 1358 } 1359 m[labels.Label{Name: key[0], Value: key[1]}] = Range{ 1360 Start: int64(off) + 4, 1361 End: int64(off) + 4 + int64(d.Len()), 1362 } 1363 return nil 1364 }); err != nil { 1365 return nil, errors.Wrap(err, "read postings table") 1366 } 1367 return m, nil 1368 } 1369 1370 type Symbols struct { 1371 bs ByteSlice 1372 version int 1373 off int 1374 1375 offsets []int 1376 seen int 1377 } 1378 1379 const symbolFactor = 32 1380 1381 // NewSymbols returns a Symbols object for symbol lookups. 1382 func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) { 1383 s := &Symbols{ 1384 bs: bs, 1385 version: version, 1386 off: off, 1387 } 1388 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, off, castagnoliTable)) 1389 var ( 1390 origLen = d.Len() 1391 cnt = d.Be32int() 1392 basePos = off + 4 1393 ) 1394 s.offsets = make([]int, 0, 1+cnt/symbolFactor) 1395 for d.Err() == nil && s.seen < cnt { 1396 if s.seen%symbolFactor == 0 { 1397 s.offsets = append(s.offsets, basePos+origLen-d.Len()) 1398 } 1399 d.UvarintBytes() // The symbol. 1400 s.seen++ 1401 } 1402 if d.Err() != nil { 1403 return nil, d.Err() 1404 } 1405 return s, nil 1406 } 1407 1408 func (s Symbols) Lookup(o uint32) (string, error) { 1409 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1410 B: s.bs.Range(0, s.bs.Len()), 1411 }) 1412 1413 if s.version == FormatV2 { 1414 if int(o) >= s.seen { 1415 return "", errors.Errorf("unknown symbol offset %d", o) 1416 } 1417 d.Skip(s.offsets[int(o/symbolFactor)]) 1418 // Walk until we find the one we want. 1419 for i := o - (o / symbolFactor * symbolFactor); i > 0; i-- { 1420 d.UvarintBytes() 1421 } 1422 } else { 1423 d.Skip(int(o)) 1424 } 1425 sym := d.UvarintStr() 1426 if d.Err() != nil { 1427 return "", d.Err() 1428 } 1429 return sym, nil 1430 } 1431 1432 func (s Symbols) ReverseLookup(sym string) (uint32, error) { 1433 if len(s.offsets) == 0 { 1434 return 0, errors.Errorf("unknown symbol %q - no symbols", sym) 1435 } 1436 i := sort.Search(len(s.offsets), func(i int) bool { 1437 // Any decoding errors here will be lost, however 1438 // we already read through all of this at startup. 1439 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1440 B: s.bs.Range(0, s.bs.Len()), 1441 }) 1442 d.Skip(s.offsets[i]) 1443 return yoloString(d.UvarintBytes()) > sym 1444 }) 1445 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1446 B: s.bs.Range(0, s.bs.Len()), 1447 }) 1448 if i > 0 { 1449 i-- 1450 } 1451 d.Skip(s.offsets[i]) 1452 res := i * symbolFactor 1453 var lastLen int 1454 var lastSymbol string 1455 for d.Err() == nil && res <= s.seen { 1456 lastLen = d.Len() 1457 lastSymbol = yoloString(d.UvarintBytes()) 1458 if lastSymbol >= sym { 1459 break 1460 } 1461 res++ 1462 } 1463 if d.Err() != nil { 1464 return 0, d.Err() 1465 } 1466 if lastSymbol != sym { 1467 return 0, errors.Errorf("unknown symbol %q", sym) 1468 } 1469 if s.version == FormatV2 { 1470 return uint32(res), nil 1471 } 1472 return uint32(s.bs.Len() - lastLen), nil 1473 } 1474 1475 func (s Symbols) Size() int { 1476 return len(s.offsets) * 8 1477 } 1478 1479 func (s Symbols) Iter() StringIter { 1480 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(s.bs, s.off, castagnoliTable)) 1481 cnt := d.Be32int() 1482 return &symbolsIter{ 1483 d: d, 1484 cnt: cnt, 1485 } 1486 } 1487 1488 // symbolsIter implements StringIter. 1489 type symbolsIter struct { 1490 d encoding.Decbuf 1491 cnt int 1492 cur string 1493 err error 1494 } 1495 1496 func (s *symbolsIter) Next() bool { 1497 if s.cnt == 0 || s.err != nil { 1498 return false 1499 } 1500 s.cur = yoloString(s.d.UvarintBytes()) 1501 s.cnt-- 1502 if s.d.Err() != nil { 1503 s.err = s.d.Err() 1504 return false 1505 } 1506 return true 1507 } 1508 1509 func (s symbolsIter) At() string { return s.cur } 1510 func (s symbolsIter) Err() error { return s.err } 1511 1512 // ReadOffsetTable reads an offset table and at the given position calls f for each 1513 // found entry. If f returns an error it stops decoding and returns the received error. 1514 func ReadOffsetTable(bs ByteSlice, off uint64, f func([]string, uint64, int) error) error { 1515 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable)) 1516 startLen := d.Len() 1517 cnt := d.Be32() 1518 1519 for d.Err() == nil && d.Len() > 0 && cnt > 0 { 1520 offsetPos := startLen - d.Len() 1521 keyCount := d.Uvarint() 1522 // The Postings offset table takes only 2 keys per entry (name and value of label), 1523 // and the LabelIndices offset table takes only 1 key per entry (a label name). 1524 // Hence setting the size to max of both, i.e. 2. 1525 keys := make([]string, 0, 2) 1526 1527 for i := 0; i < keyCount; i++ { 1528 keys = append(keys, d.UvarintStr()) 1529 } 1530 o := d.Uvarint64() 1531 if d.Err() != nil { 1532 break 1533 } 1534 if err := f(keys, o, offsetPos); err != nil { 1535 return err 1536 } 1537 cnt-- 1538 } 1539 return d.Err() 1540 } 1541 1542 func readFingerprintOffsetsTable(bs ByteSlice, off uint64) (FingerprintOffsets, error) { 1543 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable)) 1544 cnt := d.Be32() 1545 res := make(FingerprintOffsets, 0, int(cnt)) 1546 1547 for d.Err() == nil && d.Len() > 0 && cnt > 0 { 1548 res = append(res, [2]uint64{d.Be64(), d.Be64()}) 1549 cnt-- 1550 } 1551 1552 return res, d.Err() 1553 1554 } 1555 1556 // Close the reader and its underlying resources. 1557 func (r *Reader) Close() error { 1558 return r.c.Close() 1559 } 1560 1561 func (r *Reader) lookupSymbol(o uint32) (string, error) { 1562 if s, ok := r.nameSymbols[o]; ok { 1563 return s, nil 1564 } 1565 return r.symbols.Lookup(o) 1566 } 1567 1568 func (r *Reader) Bounds() (int64, int64) { 1569 return r.toc.Metadata.From, r.toc.Metadata.Through 1570 } 1571 1572 func (r *Reader) Checksum() uint32 { 1573 return r.toc.Metadata.Checksum 1574 } 1575 1576 // Symbols returns an iterator over the symbols that exist within the index. 1577 func (r *Reader) Symbols() StringIter { 1578 return r.symbols.Iter() 1579 } 1580 1581 // SymbolTableSize returns the symbol table size in bytes. 1582 func (r *Reader) SymbolTableSize() uint64 { 1583 return uint64(r.symbols.Size()) 1584 } 1585 1586 // SortedLabelValues returns value tuples that exist for the given label name. 1587 // It is not safe to use the return value beyond the lifetime of the byte slice 1588 // passed into the Reader. 1589 func (r *Reader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { 1590 values, err := r.LabelValues(name, matchers...) 1591 if err == nil && r.version == FormatV1 { 1592 sort.Strings(values) 1593 } 1594 return values, err 1595 } 1596 1597 // LabelValues returns value tuples that exist for the given label name. 1598 // It is not safe to use the return value beyond the lifetime of the byte slice 1599 // passed into the Reader. 1600 // TODO(replay): Support filtering by matchers 1601 func (r *Reader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { 1602 if len(matchers) > 0 { 1603 return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers) 1604 } 1605 1606 if r.version == FormatV1 { 1607 e, ok := r.postingsV1[name] 1608 if !ok { 1609 return nil, nil 1610 } 1611 values := make([]string, 0, len(e)) 1612 for k := range e { 1613 values = append(values, k) 1614 } 1615 return values, nil 1616 1617 } 1618 e, ok := r.postings[name] 1619 if !ok { 1620 return nil, nil 1621 } 1622 if len(e) == 0 { 1623 return nil, nil 1624 } 1625 values := make([]string, 0, len(e)*symbolFactor) 1626 1627 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)) 1628 d.Skip(e[0].off) 1629 lastVal := e[len(e)-1].value 1630 1631 skip := 0 1632 for d.Err() == nil { 1633 if skip == 0 { 1634 // These are always the same number of bytes, 1635 // and it's faster to skip than parse. 1636 skip = d.Len() 1637 d.Uvarint() // Keycount. 1638 d.UvarintBytes() // Label name. 1639 skip -= d.Len() 1640 } else { 1641 d.Skip(skip) 1642 } 1643 s := yoloString(d.UvarintBytes()) // Label value. 1644 values = append(values, s) 1645 if s == lastVal { 1646 break 1647 } 1648 d.Uvarint64() // Offset. 1649 } 1650 if d.Err() != nil { 1651 return nil, errors.Wrap(d.Err(), "get postings offset entry") 1652 } 1653 return values, nil 1654 } 1655 1656 // LabelNamesFor returns all the label names for the series referred to by IDs. 1657 // The names returned are sorted. 1658 func (r *Reader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) { 1659 // Gather offsetsMap the name offsetsMap in the symbol table first 1660 offsetsMap := make(map[uint32]struct{}) 1661 for _, id := range ids { 1662 offset := id 1663 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1664 // and the ID is the multiple of 16 of the actual position. 1665 if r.version == FormatV2 { 1666 offset = id * 16 1667 } 1668 1669 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1670 buf := d.Get() 1671 if d.Err() != nil { 1672 return nil, errors.Wrap(d.Err(), "get buffer for series") 1673 } 1674 1675 offsets, err := r.dec.LabelNamesOffsetsFor(buf) 1676 if err != nil { 1677 return nil, errors.Wrap(err, "get label name offsets") 1678 } 1679 for _, off := range offsets { 1680 offsetsMap[off] = struct{}{} 1681 } 1682 } 1683 1684 // Lookup the unique symbols. 1685 names := make([]string, 0, len(offsetsMap)) 1686 for off := range offsetsMap { 1687 name, err := r.lookupSymbol(off) 1688 if err != nil { 1689 return nil, errors.Wrap(err, "lookup symbol in LabelNamesFor") 1690 } 1691 names = append(names, name) 1692 } 1693 1694 sort.Strings(names) 1695 1696 return names, nil 1697 } 1698 1699 // LabelValueFor returns label value for the given label name in the series referred to by ID. 1700 func (r *Reader) LabelValueFor(id storage.SeriesRef, label string) (string, error) { 1701 offset := id 1702 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1703 // and the ID is the multiple of 16 of the actual position. 1704 if r.version == FormatV2 { 1705 offset = id * 16 1706 } 1707 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1708 buf := d.Get() 1709 if d.Err() != nil { 1710 return "", errors.Wrap(d.Err(), "label values for") 1711 } 1712 1713 value, err := r.dec.LabelValueFor(buf, label) 1714 if err != nil { 1715 return "", storage.ErrNotFound 1716 } 1717 1718 if value == "" { 1719 return "", storage.ErrNotFound 1720 } 1721 1722 return value, nil 1723 } 1724 1725 // Series reads the series with the given ID and writes its labels and chunks into lbls and chks. 1726 func (r *Reader) Series(id storage.SeriesRef, lbls *labels.Labels, chks *[]ChunkMeta) (uint64, error) { 1727 offset := id 1728 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1729 // and the ID is the multiple of 16 of the actual position. 1730 if r.version == FormatV2 { 1731 offset = id * 16 1732 } 1733 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1734 if d.Err() != nil { 1735 return 0, d.Err() 1736 } 1737 1738 fprint, err := r.dec.Series(d.Get(), lbls, chks) 1739 if err != nil { 1740 return 0, errors.Wrap(err, "read series") 1741 } 1742 return fprint, nil 1743 } 1744 1745 func (r *Reader) Postings(name string, shard *ShardAnnotation, values ...string) (Postings, error) { 1746 if r.version == FormatV1 { 1747 e, ok := r.postingsV1[name] 1748 if !ok { 1749 return EmptyPostings(), nil 1750 } 1751 res := make([]Postings, 0, len(values)) 1752 for _, v := range values { 1753 postingsOff, ok := e[v] 1754 if !ok { 1755 continue 1756 } 1757 // Read from the postings table. 1758 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)) 1759 _, p, err := r.dec.Postings(d.Get()) 1760 if err != nil { 1761 return nil, errors.Wrap(err, "decode postings") 1762 } 1763 res = append(res, p) 1764 } 1765 return Merge(res...), nil 1766 } 1767 1768 e, ok := r.postings[name] 1769 if !ok { 1770 return EmptyPostings(), nil 1771 } 1772 1773 if len(values) == 0 { 1774 return EmptyPostings(), nil 1775 } 1776 1777 res := make([]Postings, 0, len(values)) 1778 skip := 0 1779 valueIndex := 0 1780 for valueIndex < len(values) && values[valueIndex] < e[0].value { 1781 // Discard values before the start. 1782 valueIndex++ 1783 } 1784 for valueIndex < len(values) { 1785 value := values[valueIndex] 1786 1787 i := sort.Search(len(e), func(i int) bool { return e[i].value >= value }) 1788 if i == len(e) { 1789 // We're past the end. 1790 break 1791 } 1792 if i > 0 && e[i].value != value { 1793 // Need to look from previous entry. 1794 i-- 1795 } 1796 // Don't Crc32 the entire postings offset table, this is very slow 1797 // so hope any issues were caught at startup. 1798 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)) 1799 d.Skip(e[i].off) 1800 1801 // Iterate on the offset table. 1802 var postingsOff uint64 // The offset into the postings table. 1803 for d.Err() == nil { 1804 if skip == 0 { 1805 // These are always the same number of bytes, 1806 // and it's faster to skip than parse. 1807 skip = d.Len() 1808 d.Uvarint() // Keycount. 1809 d.UvarintBytes() // Label name. 1810 skip -= d.Len() 1811 } else { 1812 d.Skip(skip) 1813 } 1814 v := d.UvarintBytes() // Label value. 1815 postingsOff = d.Uvarint64() // Offset. 1816 for string(v) >= value { 1817 if string(v) == value { 1818 // Read from the postings table. 1819 d2 := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)) 1820 _, p, err := r.dec.Postings(d2.Get()) 1821 if err != nil { 1822 return nil, errors.Wrap(err, "decode postings") 1823 } 1824 res = append(res, p) 1825 } 1826 valueIndex++ 1827 if valueIndex == len(values) { 1828 break 1829 } 1830 value = values[valueIndex] 1831 } 1832 if i+1 == len(e) || value >= e[i+1].value || valueIndex == len(values) { 1833 // Need to go to a later postings offset entry, if there is one. 1834 break 1835 } 1836 } 1837 if d.Err() != nil { 1838 return nil, errors.Wrap(d.Err(), "get postings offset entry") 1839 } 1840 } 1841 1842 merged := Merge(res...) 1843 if shard != nil { 1844 return NewShardedPostings(merged, *shard, r.fingerprintOffsets), nil 1845 } 1846 1847 return merged, nil 1848 } 1849 1850 // Size returns the size of an index file. 1851 func (r *Reader) Size() int64 { 1852 return int64(r.b.Len()) 1853 } 1854 1855 // LabelNames returns all the unique label names present in the index. 1856 // TODO(twilkie) implement support for matchers 1857 func (r *Reader) LabelNames(matchers ...*labels.Matcher) ([]string, error) { 1858 if len(matchers) > 0 { 1859 return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers) 1860 } 1861 1862 labelNames := make([]string, 0, len(r.postings)) 1863 for name := range r.postings { 1864 if name == allPostingsKey.Name { 1865 // This is not from any metric. 1866 continue 1867 } 1868 labelNames = append(labelNames, name) 1869 } 1870 sort.Strings(labelNames) 1871 return labelNames, nil 1872 } 1873 1874 // NewStringListIter returns a StringIter for the given sorted list of strings. 1875 func NewStringListIter(s []string) StringIter { 1876 return &stringListIter{l: s} 1877 } 1878 1879 // symbolsIter implements StringIter. 1880 type stringListIter struct { 1881 l []string 1882 cur string 1883 } 1884 1885 func (s *stringListIter) Next() bool { 1886 if len(s.l) == 0 { 1887 return false 1888 } 1889 s.cur = s.l[0] 1890 s.l = s.l[1:] 1891 return true 1892 } 1893 func (s stringListIter) At() string { return s.cur } 1894 func (s stringListIter) Err() error { return nil } 1895 1896 // Decoder provides decoding methods for the v1 and v2 index file format. 1897 // 1898 // It currently does not contain decoding methods for all entry types but can be extended 1899 // by them if there's demand. 1900 type Decoder struct { 1901 LookupSymbol func(uint32) (string, error) 1902 } 1903 1904 // Postings returns a postings list for b and its number of elements. 1905 func (dec *Decoder) Postings(b []byte) (int, Postings, error) { 1906 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1907 n := d.Be32int() 1908 l := d.Get() 1909 if d.Err() != nil { 1910 return 0, nil, d.Err() 1911 } 1912 if len(l) != 4*n { 1913 return 0, nil, fmt.Errorf("unexpected postings length, should be %d bytes for %d postings, got %d bytes", 4*n, n, len(l)) 1914 } 1915 return n, newBigEndianPostings(l), nil 1916 } 1917 1918 // LabelNamesOffsetsFor decodes the offsets of the name symbols for a given series. 1919 // They are returned in the same order they're stored, which should be sorted lexicographically. 1920 func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) { 1921 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1922 _ = d.Be64() // skip fingerprint 1923 k := d.Uvarint() 1924 1925 offsets := make([]uint32, k) 1926 for i := 0; i < k; i++ { 1927 offsets[i] = uint32(d.Uvarint()) 1928 _ = d.Uvarint() // skip the label value 1929 1930 if d.Err() != nil { 1931 return nil, errors.Wrap(d.Err(), "read series label offsets") 1932 } 1933 } 1934 1935 return offsets, d.Err() 1936 } 1937 1938 // LabelValueFor decodes a label for a given series. 1939 func (dec *Decoder) LabelValueFor(b []byte, label string) (string, error) { 1940 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1941 _ = d.Be64() // skip fingerprint 1942 k := d.Uvarint() 1943 1944 for i := 0; i < k; i++ { 1945 lno := uint32(d.Uvarint()) 1946 lvo := uint32(d.Uvarint()) 1947 1948 if d.Err() != nil { 1949 return "", errors.Wrap(d.Err(), "read series label offsets") 1950 } 1951 1952 ln, err := dec.LookupSymbol(lno) 1953 if err != nil { 1954 return "", errors.Wrap(err, "lookup label name") 1955 } 1956 1957 if ln == label { 1958 lv, err := dec.LookupSymbol(lvo) 1959 if err != nil { 1960 return "", errors.Wrap(err, "lookup label value") 1961 } 1962 1963 return lv, nil 1964 } 1965 } 1966 1967 return "", d.Err() 1968 } 1969 1970 // Series decodes a series entry from the given byte slice into lset and chks. 1971 func (dec *Decoder) Series(b []byte, lbls *labels.Labels, chks *[]ChunkMeta) (uint64, error) { 1972 *lbls = (*lbls)[:0] 1973 *chks = (*chks)[:0] 1974 1975 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1976 1977 fprint := d.Be64() 1978 k := d.Uvarint() 1979 1980 for i := 0; i < k; i++ { 1981 lno := uint32(d.Uvarint()) 1982 lvo := uint32(d.Uvarint()) 1983 1984 if d.Err() != nil { 1985 return 0, errors.Wrap(d.Err(), "read series label offsets") 1986 } 1987 1988 ln, err := dec.LookupSymbol(lno) 1989 if err != nil { 1990 return 0, errors.Wrap(err, "lookup label name") 1991 } 1992 lv, err := dec.LookupSymbol(lvo) 1993 if err != nil { 1994 return 0, errors.Wrap(err, "lookup label value") 1995 } 1996 1997 *lbls = append(*lbls, labels.Label{Name: ln, Value: lv}) 1998 } 1999 2000 // Read the chunks meta data. 2001 k = d.Uvarint() 2002 2003 if k == 0 { 2004 return 0, d.Err() 2005 } 2006 2007 t0 := d.Varint64() 2008 maxt := int64(d.Uvarint64()) + t0 2009 kb := uint32(d.Uvarint()) 2010 entries := uint32(d.Uvarint64()) 2011 checksum := d.Be32() 2012 2013 *chks = append(*chks, ChunkMeta{ 2014 Checksum: checksum, 2015 MinTime: t0, 2016 MaxTime: maxt, 2017 KB: kb, 2018 Entries: entries, 2019 }) 2020 t0 = maxt 2021 2022 for i := 1; i < k; i++ { 2023 // Decode the diff against previous chunk as varint 2024 // instead of uvarint because chunks may overlap 2025 mint := d.Varint64() + t0 2026 maxt := int64(d.Uvarint64()) + mint 2027 kb := uint32(d.Uvarint()) 2028 entries := uint32(d.Uvarint64()) 2029 checksum := d.Be32() 2030 t0 = maxt 2031 2032 if d.Err() != nil { 2033 return 0, errors.Wrapf(d.Err(), "read meta for chunk %d", i) 2034 } 2035 2036 *chks = append(*chks, ChunkMeta{ 2037 Checksum: checksum, 2038 MinTime: mint, 2039 MaxTime: maxt, 2040 KB: kb, 2041 Entries: entries, 2042 }) 2043 } 2044 return fprint, d.Err() 2045 } 2046 2047 func yoloString(b []byte) string { 2048 return *((*string)(unsafe.Pointer(&b))) 2049 }