github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/memdb/index/index.go (about) 1 // Copyright 2017 The Prometheus Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 // A tsdb index writer, that does not use files and mmap 15 // To be for tiny segments in v2 POC branch 16 // Inspired by loki https://raw.githubusercontent.com/grafana/loki/main/pkg/storage/wal/index/index.go 17 // But actually copied from pyroscope and modified accordingly 18 19 package index 20 21 import ( 22 "bytes" 23 "context" 24 "encoding/binary" 25 "fmt" 26 "hash" 27 "hash/crc32" 28 "io" 29 "math" 30 "os" 31 "sort" 32 "unsafe" 33 34 "github.com/grafana/pyroscope/pkg/phlaredb/tsdb/index" 35 36 "github.com/pkg/errors" 37 "github.com/prometheus/common/model" 38 "github.com/prometheus/prometheus/model/labels" 39 "github.com/prometheus/prometheus/storage" 40 tsdb_enc "github.com/prometheus/prometheus/tsdb/encoding" 41 42 typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1" 43 phlaremodel "github.com/grafana/pyroscope/pkg/model" 44 "github.com/grafana/pyroscope/pkg/phlaredb/block" 45 "github.com/grafana/pyroscope/pkg/phlaredb/tsdb/encoding" 46 ) 47 48 const ( 49 // MagicIndex 4 bytes at the head of an index file. 50 MagicIndex = 0xBAAAD700 51 // HeaderLen represents number of bytes reserved of index for header. 52 HeaderLen = 5 53 54 // FormatV1 represents 1 version of index. 55 FormatV1 = 1 56 // FormatV2 represents 2 version of index. 57 FormatV2 = 2 58 59 IndexFilename = "index" 60 61 // store every 1024 series' fingerprints in the fingerprint offsets table 62 fingerprintInterval = 1 << 10 63 64 SegmentsIndexWriterBufSize = 2 * 0x1000 // small for segments 65 BlocksIndexWriterBufSize = 1 << 22 // large for blocks 66 ) 67 68 type indexWriterStage uint8 69 70 const ( 71 idxStageNone indexWriterStage = iota 72 idxStageSymbols 73 idxStageSeries 74 idxStageDone 75 ) 76 77 func (s indexWriterStage) String() string { 78 switch s { 79 case idxStageNone: 80 return "none" 81 case idxStageSymbols: 82 return "symbols" 83 case idxStageSeries: 84 return "series" 85 case idxStageDone: 86 return "done" 87 } 88 return "<unknown>" 89 } 90 91 // The table gets initialized with sync.Once but may still cause a race 92 // with any other use of the crc32 package anywhere. Thus we initialize it 93 // before. 94 var castagnoliTable *crc32.Table 95 96 func init() { 97 castagnoliTable = crc32.MakeTable(crc32.Castagnoli) 98 } 99 100 // newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the 101 // polynomial may be easily changed in one location at a later time, if necessary. 102 func newCRC32() hash.Hash32 { 103 return crc32.New(castagnoliTable) 104 } 105 106 type symbolCacheEntry struct { 107 index uint32 108 lastValue string 109 lastValueIndex uint32 110 } 111 112 // Writer implements the IndexWriter interface for the standard 113 // serialization format. 114 type Writer struct { 115 ctx context.Context 116 117 f *BufferWriter 118 119 // Temporary file for postings. 120 fP *BufferWriter 121 // Temporary file for posting offsets table. 122 fPO *BufferWriter 123 cntPO uint64 124 125 toc TOC 126 stage indexWriterStage 127 postingsStart uint64 // Due to padding, can differ from TOC entry. 128 129 // Reusable memory. 130 buf1 encoding.Encbuf 131 buf2 encoding.Encbuf 132 133 numSymbols int 134 symbols *Symbols 135 symbolFile io.Closer 136 lastSymbol string 137 symbolCache map[string]symbolCacheEntry 138 139 labelIndexes []labelIndexHashEntry // Label index offsets. 140 labelNames map[string]uint64 // Label names, and their usage. 141 // Keeps track of the fingerprint/offset for every n series 142 fingerprintOffsets index.FingerprintOffsets 143 144 // Hold last series to validate that clients insert new series in order. 145 lastSeries phlaremodel.Labels 146 lastSeriesHash uint64 147 lastRef storage.SeriesRef 148 149 crc32 hash.Hash 150 151 Version int 152 } 153 154 // TOC represents index Table Of Content that states where each section of index starts. 155 type TOC struct { 156 Symbols uint64 157 Series uint64 158 LabelIndices uint64 159 LabelIndicesTable uint64 160 Postings uint64 161 PostingsTable uint64 162 FingerprintOffsets uint64 163 Metadata Metadata 164 } 165 166 // Metadata is TSDB-level metadata 167 type Metadata struct { 168 From, Through int64 169 Checksum uint32 170 } 171 172 func (m *Metadata) EnsureBounds(from, through int64) { 173 if m.From == 0 || from < m.From { 174 m.From = from 175 } 176 177 if m.Through == 0 || through > m.Through { 178 m.Through = through 179 } 180 } 181 182 // NewTOCFromByteSlice return parsed TOC from given index byte slice. 183 func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) { 184 if bs.Len() < indexTOCLen { 185 return nil, tsdb_enc.ErrInvalidSize 186 } 187 b := bs.Range(bs.Len()-indexTOCLen, bs.Len()) 188 189 expCRC := binary.BigEndian.Uint32(b[len(b)-4:]) 190 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b[:len(b)-4]}) 191 if d.Crc32(castagnoliTable) != expCRC { 192 return nil, errors.Wrap(tsdb_enc.ErrInvalidChecksum, "read TOC") 193 } 194 195 if err := d.Err(); err != nil { 196 return nil, err 197 } 198 199 return &TOC{ 200 Symbols: d.Be64(), 201 Series: d.Be64(), 202 LabelIndices: d.Be64(), 203 LabelIndicesTable: d.Be64(), 204 Postings: d.Be64(), 205 PostingsTable: d.Be64(), 206 FingerprintOffsets: d.Be64(), 207 Metadata: Metadata{ 208 From: d.Be64int64(), 209 Through: d.Be64int64(), 210 Checksum: expCRC, 211 }, 212 }, nil 213 } 214 215 // NewWriter returns a new Writer to the given filename. It serializes data in format version 2. 216 func NewWriter(ctx context.Context, bufferSize int) (*Writer, error) { 217 iw := &Writer{ 218 ctx: ctx, 219 f: GetBufferWriterFromPool(), 220 fP: GetBufferWriterFromPool(), 221 fPO: GetBufferWriterFromPool(), 222 stage: idxStageNone, 223 224 // Reusable memory. 225 buf1: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, bufferSize)}), 226 buf2: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, bufferSize)}), 227 228 symbolCache: make(map[string]symbolCacheEntry, 1<<8), 229 labelNames: make(map[string]uint64, 1<<8), 230 crc32: newCRC32(), 231 } 232 if err := iw.writeMeta(); err != nil { 233 return nil, err 234 } 235 return iw, nil 236 } 237 238 func (w *Writer) write(bufs ...[]byte) error { 239 return w.f.Write(bufs...) 240 } 241 242 func (w *Writer) writeAt(buf []byte, pos uint64) error { 243 return w.f.WriteAt(buf, pos) 244 } 245 246 func (w *Writer) addPadding(size int) error { 247 return w.f.AddPadding(size) 248 } 249 250 // ensureStage handles transitions between write stages and ensures that IndexWriter 251 // methods are called in an order valid for the implementation. 252 func (w *Writer) ensureStage(s indexWriterStage) error { 253 select { 254 case <-w.ctx.Done(): 255 return w.ctx.Err() 256 default: 257 } 258 259 if w.stage == s { 260 return nil 261 } 262 if w.stage < s-1 { 263 // A stage has been skipped. 264 if err := w.ensureStage(s - 1); err != nil { 265 return err 266 } 267 } 268 if w.stage > s { 269 return errors.Errorf("invalid stage %q, currently at %q", s, w.stage) 270 } 271 272 // Mark start of sections in table of contents. 273 switch s { 274 case idxStageSymbols: 275 w.toc.Symbols = w.f.pos 276 if err := w.startSymbols(); err != nil { 277 return err 278 } 279 case idxStageSeries: 280 if err := w.finishSymbols(); err != nil { 281 return err 282 } 283 w.toc.Series = w.f.pos 284 285 case idxStageDone: 286 w.toc.LabelIndices = w.f.pos 287 // LabelIndices generation depends on the posting offset 288 // table produced at this stage. 289 if err := w.writePostingsToTmpFiles(); err != nil { 290 return err 291 } 292 if err := w.writeLabelIndices(); err != nil { 293 return err 294 } 295 296 w.toc.Postings = w.f.pos 297 if err := w.writePostings(); err != nil { 298 return err 299 } 300 301 w.toc.LabelIndicesTable = w.f.pos 302 if err := w.writeLabelIndexesOffsetTable(); err != nil { 303 return err 304 } 305 306 w.toc.PostingsTable = w.f.pos 307 if err := w.writePostingsOffsetTable(); err != nil { 308 return err 309 } 310 311 w.toc.FingerprintOffsets = w.f.pos 312 if err := w.writeFingerprintOffsetsTable(); err != nil { 313 return err 314 } 315 316 if err := w.writeTOC(); err != nil { 317 return err 318 } 319 } 320 321 w.stage = s 322 return nil 323 } 324 325 func (w *Writer) writeMeta() error { 326 w.buf1.Reset() 327 w.buf1.PutBE32(MagicIndex) 328 w.buf1.PutByte(FormatV2) 329 330 return w.write(w.buf1.Get()) 331 } 332 333 // AddSeries adds the series one at a time along with its chunks. 334 // Requires a specific fingerprint to be passed in the case where the "desired" 335 // fingerprint differs from what labels.Hash() produces. For example, 336 // multitenant TSDBs embed a tenant label, but the actual series has no such 337 // label and so the derived fingerprint differs. 338 func (w *Writer) AddSeries(ref storage.SeriesRef, lset phlaremodel.Labels, fp model.Fingerprint, chunks ...index.ChunkMeta) error { 339 if err := w.ensureStage(idxStageSeries); err != nil { 340 return err 341 } 342 343 // Put the supplied fingerprint instead of the calculated hash. 344 // This allows us to have a synthetic label (__loki_tenant__) in 345 // the pre-compacted TSDBs which map to fingerprints (and chunks) 346 // without this label in storage. 347 labelHash := uint64(fp) 348 349 if ref < w.lastRef && len(w.lastSeries) != 0 { 350 return errors.Errorf("series with reference greater than %d already added", ref) 351 } 352 // We add padding to 16 bytes to increase the addressable space we get through 4 byte 353 // series references. 354 if err := w.addPadding(16); err != nil { 355 return errors.Errorf("failed to write padding bytes: %v", err) 356 } 357 358 if w.f.pos%16 != 0 { 359 return errors.Errorf("series write not 16-byte aligned at %d", w.f.pos) 360 } 361 362 w.buf2.Reset() 363 w.buf2.PutBE64(labelHash) 364 w.buf2.PutUvarint(len(lset)) 365 366 for _, l := range lset { 367 var err error 368 cacheEntry, ok := w.symbolCache[l.Name] 369 nameIndex := cacheEntry.index 370 if !ok { 371 nameIndex, err = w.symbols.ReverseLookup(l.Name) 372 if err != nil { 373 return errors.Errorf("symbol entry for %q does not exist, %v", l.Name, err) 374 } 375 } 376 w.labelNames[l.Name]++ 377 w.buf2.PutUvarint32(nameIndex) 378 379 valueIndex := cacheEntry.lastValueIndex 380 if !ok || cacheEntry.lastValue != l.Value { 381 valueIndex, err = w.symbols.ReverseLookup(l.Value) 382 if err != nil { 383 return errors.Errorf("symbol entry for %q does not exist, %v", l.Value, err) 384 } 385 w.symbolCache[l.Name] = symbolCacheEntry{ 386 index: nameIndex, 387 lastValue: l.Value, 388 lastValueIndex: valueIndex, 389 } 390 } 391 w.buf2.PutUvarint32(valueIndex) 392 } 393 394 w.buf2.PutUvarint(len(chunks)) 395 396 if len(chunks) > 0 { 397 c := chunks[0] 398 w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime) 399 400 w.buf2.PutVarint64(c.MinTime) 401 w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime)) 402 w.buf2.PutUvarint32(c.KB) 403 w.buf2.PutUvarint32(c.SeriesIndex) 404 w.buf2.PutBE32(c.Checksum) 405 t0 := c.MaxTime 406 407 for _, c := range chunks[1:] { 408 w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime) 409 // Encode the diff against previous chunk as varint 410 // instead of uvarint because chunks may overlap 411 w.buf2.PutVarint64(c.MinTime - t0) 412 w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime)) 413 w.buf2.PutUvarint32(c.KB) 414 w.buf2.PutUvarint32(c.SeriesIndex) 415 t0 = c.MaxTime 416 417 w.buf2.PutBE32(c.Checksum) 418 } 419 } 420 421 w.buf1.Reset() 422 w.buf1.PutUvarint(w.buf2.Len()) 423 424 w.buf2.PutHash(w.crc32) 425 426 w.lastSeries = append(w.lastSeries[:0], lset...) 427 w.lastSeriesHash = labelHash 428 w.lastRef = ref 429 430 if ref%fingerprintInterval == 0 { 431 sRef := w.f.pos / 16 432 w.fingerprintOffsets = append(w.fingerprintOffsets, [2]uint64{sRef, labelHash}) 433 } 434 435 if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil { 436 return errors.Wrap(err, "write series data") 437 } 438 439 return nil 440 } 441 442 func (w *Writer) startSymbols() error { 443 // We are at w.toc.Symbols. 444 // Leave 4 bytes of space for the length, and another 4 for the number of symbols 445 // which will both be calculated later. 446 return w.write([]byte("alenblen")) 447 } 448 449 func (w *Writer) AddSymbol(sym string) error { 450 if err := w.ensureStage(idxStageSymbols); err != nil { 451 return err 452 } 453 if w.numSymbols != 0 && sym <= w.lastSymbol { 454 return errors.Errorf("symbol %q out-of-order", sym) 455 } 456 w.lastSymbol = sym 457 w.numSymbols++ 458 w.buf1.Reset() 459 w.buf1.PutUvarintStr(sym) 460 return w.write(w.buf1.Get()) 461 } 462 463 func (w *Writer) finishSymbols() error { 464 symbolTableSize := w.f.pos - w.toc.Symbols - 4 465 // The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1 466 if symbolTableSize > math.MaxUint32 { 467 return errors.Errorf("symbol table size exceeds 4 bytes: %d", symbolTableSize) 468 } 469 470 // Write out the length and symbol count. 471 w.buf1.Reset() 472 w.buf1.PutBE32int(int(symbolTableSize)) 473 w.buf1.PutBE32int(w.numSymbols) 474 if err := w.writeAt(w.buf1.Get(), w.toc.Symbols); err != nil { 475 return err 476 } 477 478 hashPos := w.f.pos 479 // Leave space for the hash. We can only calculate it 480 // now that the number of symbols is known, so mmap and do it from there. 481 if err := w.write([]byte("hash")); err != nil { 482 return err 483 } 484 if err := w.f.Flush(); err != nil { 485 return err 486 } 487 488 //sf, err := fileutil.OpenMmapFile(w.f.name) 489 buf, sf, err := w.f.Buffer() 490 if err != nil { 491 return err 492 } 493 w.symbolFile = sf 494 hash := crc32.Checksum(buf[w.toc.Symbols+4:hashPos], castagnoliTable) 495 w.buf1.Reset() 496 w.buf1.PutBE32(hash) 497 if err := w.writeAt(w.buf1.Get(), hashPos); err != nil { 498 return err 499 } 500 501 // Load in the symbol table efficiently for the rest of the index writing. 502 w.symbols, err = NewSymbols(RealByteSlice(buf), FormatV2, int(w.toc.Symbols)) 503 if err != nil { 504 return errors.Wrap(err, "read symbols") 505 } 506 return nil 507 } 508 509 func (w *Writer) writeLabelIndices() error { 510 if err := w.fPO.Flush(); err != nil { 511 return err 512 } 513 514 // Find all the label values in the tmp posting offset table. 515 //f, err := fileutil.OpenMmapFile(w.fPO.name) 516 buf, closer, err := w.fPO.Buffer() 517 if err != nil { 518 return err 519 } 520 defer closer.Close() 521 522 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.fPO.pos))) 523 cnt := w.cntPO 524 current := []byte{} 525 values := []uint32{} 526 for d.Err() == nil && cnt > 0 { 527 cnt-- 528 d.Uvarint() // Keycount. 529 name := d.UvarintBytes() // Label name. 530 value := yoloString(d.UvarintBytes()) // Label value. 531 d.Uvarint64() // Offset. 532 if len(name) == 0 { 533 continue // All index is ignored. 534 } 535 536 if !bytes.Equal(name, current) && len(values) > 0 { 537 // We've reached a new label name. 538 if err := w.writeLabelIndex(string(current), values); err != nil { 539 return err 540 } 541 values = values[:0] 542 } 543 current = name 544 sid, err := w.symbols.ReverseLookup(value) 545 if err != nil { 546 return err 547 } 548 values = append(values, sid) 549 } 550 if d.Err() != nil { 551 return d.Err() 552 } 553 554 // Handle the last label. 555 if len(values) > 0 { 556 if err := w.writeLabelIndex(string(current), values); err != nil { 557 return err 558 } 559 } 560 return nil 561 } 562 563 func (w *Writer) writeLabelIndex(name string, values []uint32) error { 564 // Align beginning to 4 bytes for more efficient index list scans. 565 if err := w.addPadding(4); err != nil { 566 return err 567 } 568 569 w.labelIndexes = append(w.labelIndexes, labelIndexHashEntry{ 570 keys: []string{name}, 571 offset: w.f.pos, 572 }) 573 574 startPos := w.f.pos 575 // Leave 4 bytes of space for the length, which will be calculated later. 576 if err := w.write([]byte("alen")); err != nil { 577 return err 578 } 579 w.crc32.Reset() 580 581 w.buf1.Reset() 582 w.buf1.PutBE32int(1) // Number of names. 583 w.buf1.PutBE32int(len(values)) 584 w.buf1.WriteToHash(w.crc32) 585 if err := w.write(w.buf1.Get()); err != nil { 586 return err 587 } 588 589 for _, v := range values { 590 w.buf1.Reset() 591 w.buf1.PutBE32(v) 592 w.buf1.WriteToHash(w.crc32) 593 if err := w.write(w.buf1.Get()); err != nil { 594 return err 595 } 596 } 597 598 // Write out the length. 599 w.buf1.Reset() 600 l := w.f.pos - startPos - 4 601 if l > math.MaxUint32 { 602 return errors.Errorf("label index size exceeds 4 bytes: %d", l) 603 } 604 w.buf1.PutBE32int(int(l)) 605 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 606 return err 607 } 608 609 w.buf1.Reset() 610 w.buf1.PutHashSum(w.crc32) 611 return w.write(w.buf1.Get()) 612 } 613 614 // writeLabelIndexesOffsetTable writes the label indices offset table. 615 func (w *Writer) writeLabelIndexesOffsetTable() error { 616 startPos := w.f.pos 617 // Leave 4 bytes of space for the length, which will be calculated later. 618 if err := w.write([]byte("alen")); err != nil { 619 return err 620 } 621 w.crc32.Reset() 622 623 w.buf1.Reset() 624 w.buf1.PutBE32int(len(w.labelIndexes)) 625 w.buf1.WriteToHash(w.crc32) 626 if err := w.write(w.buf1.Get()); err != nil { 627 return err 628 } 629 630 for _, e := range w.labelIndexes { 631 w.buf1.Reset() 632 w.buf1.PutUvarint(len(e.keys)) 633 for _, k := range e.keys { 634 w.buf1.PutUvarintStr(k) 635 } 636 w.buf1.PutUvarint64(e.offset) 637 w.buf1.WriteToHash(w.crc32) 638 if err := w.write(w.buf1.Get()); err != nil { 639 return err 640 } 641 } 642 // Write out the length. 643 w.buf1.Reset() 644 l := w.f.pos - startPos - 4 645 if l > math.MaxUint32 { 646 return errors.Errorf("label indexes offset table size exceeds 4 bytes: %d", l) 647 } 648 w.buf1.PutBE32int(int(l)) 649 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 650 return err 651 } 652 653 w.buf1.Reset() 654 w.buf1.PutHashSum(w.crc32) 655 return w.write(w.buf1.Get()) 656 } 657 658 // writePostingsOffsetTable writes the postings offset table. 659 func (w *Writer) writePostingsOffsetTable() error { 660 // Ensure everything is in the temporary file. 661 if err := w.fPO.Flush(); err != nil { 662 return err 663 } 664 665 startPos := w.f.pos 666 // Leave 4 bytes of space for the length, which will be calculated later. 667 if err := w.write([]byte("alen")); err != nil { 668 return err 669 } 670 671 // Copy over the tmp posting offset table, however we need to 672 // adjust the offsets. 673 adjustment := w.postingsStart 674 675 w.buf1.Reset() 676 w.crc32.Reset() 677 w.buf1.PutBE32int(int(w.cntPO)) // Count. 678 w.buf1.WriteToHash(w.crc32) 679 if err := w.write(w.buf1.Get()); err != nil { 680 return err 681 } 682 683 //f, err := fileutil.OpenMmapFile(w.fPO.name) 684 buf, closer, err := w.fPO.Buffer() 685 if err != nil { 686 return err 687 } 688 defer func() { 689 if closer != nil { 690 closer.Close() 691 } 692 }() 693 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.fPO.pos))) 694 cnt := w.cntPO 695 for d.Err() == nil && cnt > 0 { 696 w.buf1.Reset() 697 w.buf1.PutUvarint(d.Uvarint()) // Keycount. 698 w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label name. 699 w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label value. 700 w.buf1.PutUvarint64(d.Uvarint64() + adjustment) // Offset. 701 w.buf1.WriteToHash(w.crc32) 702 if err := w.write(w.buf1.Get()); err != nil { 703 return err 704 } 705 cnt-- 706 } 707 if d.Err() != nil { 708 return d.Err() 709 } 710 711 // Cleanup temporary file. 712 //if err := f.Close(); err != nil { 713 // return err 714 //} 715 //f = nil 716 if err := w.fPO.Close(); err != nil { 717 return err 718 } 719 if err := w.fPO.Remove(); err != nil { 720 return err 721 } 722 //w.fPO = nil 723 724 // Write out the length. 725 w.buf1.Reset() 726 l := w.f.pos - startPos - 4 727 if l > math.MaxUint32 { 728 return errors.Errorf("postings offset table size exceeds 4 bytes: %d", l) 729 } 730 w.buf1.PutBE32int(int(l)) 731 if err := w.writeAt(w.buf1.Get(), startPos); err != nil { 732 return err 733 } 734 735 // Finally write the hash. 736 w.buf1.Reset() 737 w.buf1.PutHashSum(w.crc32) 738 return w.write(w.buf1.Get()) 739 } 740 741 func (w *Writer) writeFingerprintOffsetsTable() error { 742 w.buf1.Reset() 743 w.buf2.Reset() 744 745 w.buf1.PutBE32int(len(w.fingerprintOffsets)) // Count. 746 // build offsets 747 for _, x := range w.fingerprintOffsets { 748 w.buf1.PutBE64(x[0]) // series offset 749 w.buf1.PutBE64(x[1]) // hash 750 } 751 752 // write length 753 ln := w.buf1.Len() 754 // TODO(owen-d): can remove the uint32 cast in the future 755 // Had to uint32 wrap these for arm32 builds, which we'll remove in the future. 756 if uint32(ln) > uint32(math.MaxUint32) { 757 return errors.Errorf("fingerprint offset size exceeds 4 bytes: %d", ln) 758 } 759 760 w.buf2.PutBE32int(ln) 761 if err := w.write(w.buf2.Get()); err != nil { 762 return err 763 } 764 765 // write offsets+checksum 766 w.buf1.PutHash(w.crc32) 767 if err := w.write(w.buf1.Get()); err != nil { 768 return errors.Wrap(err, "failure writing fingerprint offsets") 769 } 770 return nil 771 } 772 773 const indexTOCLen = 8*9 + crc32.Size 774 775 func (w *Writer) writeTOC() error { 776 w.buf1.Reset() 777 778 w.buf1.PutBE64(w.toc.Symbols) 779 w.buf1.PutBE64(w.toc.Series) 780 w.buf1.PutBE64(w.toc.LabelIndices) 781 w.buf1.PutBE64(w.toc.LabelIndicesTable) 782 w.buf1.PutBE64(w.toc.Postings) 783 w.buf1.PutBE64(w.toc.PostingsTable) 784 w.buf1.PutBE64(w.toc.FingerprintOffsets) 785 786 // metadata 787 w.buf1.PutBE64int64(w.toc.Metadata.From) 788 w.buf1.PutBE64int64(w.toc.Metadata.Through) 789 790 w.buf1.PutHash(w.crc32) 791 792 return w.write(w.buf1.Get()) 793 } 794 795 func (w *Writer) writePostingsToTmpFiles() error { 796 names := make([]string, 0, len(w.labelNames)) 797 for n := range w.labelNames { 798 names = append(names, n) 799 } 800 sort.Strings(names) 801 802 if err := w.f.Flush(); err != nil { 803 return err 804 } 805 //f, err := fileutil.OpenMmapFile(w.f.name) 806 buf, closer, err := w.f.Buffer() 807 if err != nil { 808 return err 809 } 810 defer closer.Close() 811 812 // Write out the special all posting. 813 offsets := []uint32{} 814 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.toc.LabelIndices))) 815 d.Skip(int(w.toc.Series)) 816 for d.Len() > 0 { 817 d.ConsumePadding() 818 startPos := w.toc.LabelIndices - uint64(d.Len()) 819 if startPos%16 != 0 { 820 return errors.Errorf("series not 16-byte aligned at %d", startPos) 821 } 822 offsets = append(offsets, uint32(startPos/16)) 823 // Skip to next series. 824 x := d.Uvarint() 825 d.Skip(x + crc32.Size) 826 if err := d.Err(); err != nil { 827 return err 828 } 829 } 830 if err := w.writePosting("", "", offsets); err != nil { 831 return err 832 } 833 maxPostings := uint64(len(offsets)) // No label name can have more postings than this. 834 835 for len(names) > 0 { 836 batchNames := []string{} 837 var c uint64 838 // Try to bunch up label names into one loop, but avoid 839 // using more memory than a single label name can. 840 for len(names) > 0 { 841 if w.labelNames[names[0]]+c > maxPostings { 842 if c > 0 { 843 break 844 } 845 return fmt.Errorf("corruption detected when writing postings to index: label %q has %d uses, but maxPostings is %d", names[0], w.labelNames[names[0]], maxPostings) 846 } 847 batchNames = append(batchNames, names[0]) 848 c += w.labelNames[names[0]] 849 names = names[1:] 850 } 851 852 nameSymbols := map[uint32]string{} 853 for _, name := range batchNames { 854 sid, err := w.symbols.ReverseLookup(name) 855 if err != nil { 856 return err 857 } 858 nameSymbols[sid] = name 859 } 860 // Label name -> label value -> positions. 861 postings := map[uint32]map[uint32][]uint32{} 862 863 d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.toc.LabelIndices))) 864 d.Skip(int(w.toc.Series)) 865 for d.Len() > 0 { 866 d.ConsumePadding() 867 startPos := w.toc.LabelIndices - uint64(d.Len()) 868 l := d.Uvarint() // Length of this series in bytes. 869 startLen := d.Len() 870 871 _ = d.Be64() // skip fingerprint 872 // See if label names we want are in the series. 873 numLabels := d.Uvarint() 874 for i := 0; i < numLabels; i++ { 875 lno := uint32(d.Uvarint()) 876 lvo := uint32(d.Uvarint()) 877 878 if _, ok := nameSymbols[lno]; ok { 879 if _, ok := postings[lno]; !ok { 880 postings[lno] = map[uint32][]uint32{} 881 } 882 postings[lno][lvo] = append(postings[lno][lvo], uint32(startPos/16)) 883 } 884 } 885 // Skip to next series. 886 d.Skip(l - (startLen - d.Len()) + crc32.Size) 887 if err := d.Err(); err != nil { 888 return err 889 } 890 } 891 892 for _, name := range batchNames { 893 // Write out postings for this label name. 894 sid, err := w.symbols.ReverseLookup(name) 895 if err != nil { 896 return err 897 } 898 values := make([]uint32, 0, len(postings[sid])) 899 for v := range postings[sid] { 900 values = append(values, v) 901 } 902 // Symbol numbers are in order, so the strings will also be in order. 903 sort.Sort(uint32slice(values)) 904 for _, v := range values { 905 value, err := w.symbols.Lookup(v) 906 if err != nil { 907 return err 908 } 909 if err := w.writePosting(name, value, postings[sid][v]); err != nil { 910 return err 911 } 912 } 913 } 914 select { 915 case <-w.ctx.Done(): 916 return w.ctx.Err() 917 default: 918 } 919 } 920 return nil 921 } 922 923 func (w *Writer) writePosting(name, value string, offs []uint32) error { 924 // Align beginning to 4 bytes for more efficient postings list scans. 925 if err := w.fP.AddPadding(4); err != nil { 926 return err 927 } 928 929 // Write out postings offset table to temporary file as we go. 930 w.buf1.Reset() 931 w.buf1.PutUvarint(2) 932 w.buf1.PutUvarintStr(name) 933 w.buf1.PutUvarintStr(value) 934 w.buf1.PutUvarint64(w.fP.pos) // This is relative to the postings tmp file, not the final index file. 935 if err := w.fPO.Write(w.buf1.Get()); err != nil { 936 return err 937 } 938 w.cntPO++ 939 940 w.buf1.Reset() 941 w.buf1.PutBE32int(len(offs)) 942 943 for _, off := range offs { 944 if off > (1<<32)-1 { 945 return errors.Errorf("series offset %d exceeds 4 bytes", off) 946 } 947 w.buf1.PutBE32(off) 948 } 949 950 w.buf2.Reset() 951 l := w.buf1.Len() 952 // We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there. 953 if uint(l) > math.MaxUint32 { 954 return errors.Errorf("posting size exceeds 4 bytes: %d", l) 955 } 956 w.buf2.PutBE32int(l) 957 w.buf1.PutHash(w.crc32) 958 return w.fP.Write(w.buf2.Get(), w.buf1.Get()) 959 } 960 961 func (w *Writer) writePostings() error { 962 // There's padding in the tmp file, make sure it actually works. 963 if err := w.f.AddPadding(4); err != nil { 964 return err 965 } 966 w.postingsStart = w.f.pos 967 968 // Copy temporary file into main index. 969 if err := w.fP.Flush(); err != nil { 970 return err 971 } 972 //if _, err := w.fP.f.Seek(0, 0); err != nil { 973 // return err 974 //} 975 // Don't need to calculate a checksum, so can copy directly. 976 //n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, make([]byte, 1<<20)) 977 //buf := make([]byte, cap(w.buf1.B)) 978 //buf := w.buf1.B[:cap(w.buf1.B)] 979 //n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, buf) 980 //if err != nil { 981 // return err 982 //} 983 n, err := w.f.ReadFrom(w.fP) 984 if err != nil { 985 return err 986 } 987 if uint64(n) != w.fP.pos { 988 return errors.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n) 989 } 990 //w.f.pos += uint64(n) 991 992 if err := w.fP.Close(); err != nil { 993 return err 994 } 995 if err := w.fP.Remove(); err != nil { 996 return err 997 } 998 //w.fP = nil 999 return nil 1000 } 1001 1002 type uint32slice []uint32 1003 1004 func (s uint32slice) Len() int { return len(s) } 1005 func (s uint32slice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 1006 func (s uint32slice) Less(i, j int) bool { return s[i] < s[j] } 1007 1008 type labelIndexHashEntry struct { 1009 keys []string 1010 offset uint64 1011 } 1012 1013 func (w *Writer) Close() error { 1014 // Even if this fails, we need to close all the files. 1015 ensureErr := w.ensureStage(idxStageDone) 1016 1017 if w.symbolFile != nil { 1018 if err := w.symbolFile.Close(); err != nil { 1019 return err 1020 } 1021 } 1022 if w.fP != nil { 1023 if err := w.fP.Close(); err != nil { 1024 return err 1025 } 1026 } 1027 if w.fPO != nil { 1028 if err := w.fPO.Close(); err != nil { 1029 return err 1030 } 1031 } 1032 if err := w.f.Close(); err != nil { 1033 return err 1034 } 1035 // w.f is kept around a bit longer and returned to pool by users 1036 PutBufferWriterToPool(w.fP) 1037 PutBufferWriterToPool(w.fPO) 1038 w.fP = nil 1039 w.fPO = nil 1040 1041 return ensureErr 1042 } 1043 1044 // StringIter iterates over a sorted list of strings. 1045 type StringIter interface { 1046 // Next advances the iterator and returns true if another value was found. 1047 Next() bool 1048 1049 // At returns the value at the current iterator position. 1050 At() string 1051 1052 // Err returns the last error of the iterator. 1053 Err() error 1054 } 1055 1056 type Reader struct { 1057 b ByteSlice 1058 toc *TOC 1059 1060 // Close that releases the underlying resources of the byte slice. 1061 c io.Closer 1062 1063 // Map of LabelName to a list of some LabelValues's position in the offset table. 1064 // The first and last values for each name are always present. 1065 postings map[string][]postingOffset 1066 // For the v1 format, labelname -> labelvalue -> offset. 1067 postingsV1 map[string]map[string]uint64 1068 1069 symbols *Symbols 1070 nameSymbols map[uint32]string // Cache of the label name symbol lookups, 1071 // as there are not many and they are half of all lookups. 1072 1073 fingerprintOffsets index.FingerprintOffsets 1074 1075 dec *Decoder 1076 1077 version int 1078 } 1079 1080 type postingOffset struct { 1081 value string 1082 off int 1083 } 1084 1085 // ByteSlice abstracts a byte slice. 1086 type ByteSlice interface { 1087 Len() int 1088 Range(start, end int) []byte 1089 } 1090 1091 type RealByteSlice []byte 1092 1093 func (b RealByteSlice) Len() int { 1094 return len(b) 1095 } 1096 1097 func (b RealByteSlice) Range(start, end int) []byte { 1098 return b[start:end] 1099 } 1100 1101 func (b RealByteSlice) Sub(start, end int) ByteSlice { 1102 return b[start:end] 1103 } 1104 1105 // NewReader returns a new index reader on the given byte slice. It automatically 1106 // handles different format versions. 1107 func NewReader(b ByteSlice) (*Reader, error) { 1108 return newReader(b, io.NopCloser(nil)) 1109 } 1110 1111 type nopCloser struct{} 1112 1113 func (nopCloser) Close() error { return nil } 1114 1115 // NewFileReader returns a new index reader against the given index file. 1116 func NewFileReader(path string) (*Reader, error) { 1117 b, err := os.ReadFile(path) 1118 if err != nil { 1119 return nil, err 1120 } 1121 r, err := newReader(RealByteSlice(b), nopCloser{}) 1122 if err != nil { 1123 return r, err 1124 } 1125 1126 return r, nil 1127 } 1128 1129 func newReader(b ByteSlice, c io.Closer) (*Reader, error) { 1130 r := &Reader{ 1131 b: b, 1132 c: c, 1133 postings: map[string][]postingOffset{}, 1134 } 1135 1136 // Verify header. 1137 if r.b.Len() < HeaderLen { 1138 return nil, errors.Wrap(tsdb_enc.ErrInvalidSize, "index header") 1139 } 1140 if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex { 1141 return nil, errors.Errorf("invalid magic number %x", m) 1142 } 1143 r.version = int(r.b.Range(4, 5)[0]) 1144 1145 if r.version != FormatV1 && r.version != FormatV2 { 1146 return nil, errors.Errorf("unknown index file version %d", r.version) 1147 } 1148 1149 var err error 1150 r.toc, err = NewTOCFromByteSlice(b) 1151 if err != nil { 1152 return nil, errors.Wrap(err, "read TOC") 1153 } 1154 1155 r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols)) 1156 if err != nil { 1157 return nil, errors.Wrap(err, "read symbols") 1158 } 1159 1160 if r.version == FormatV1 { 1161 // Earlier V1 formats don't have a sorted postings offset table, so 1162 // load the whole offset table into memory. 1163 r.postingsV1 = map[string]map[string]uint64{} 1164 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error { 1165 if len(key) != 2 { 1166 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1167 } 1168 if _, ok := r.postingsV1[key[0]]; !ok { 1169 r.postingsV1[key[0]] = map[string]uint64{} 1170 r.postings[key[0]] = nil // Used to get a list of labelnames in places. 1171 } 1172 r.postingsV1[key[0]][key[1]] = off 1173 return nil 1174 }); err != nil { 1175 return nil, errors.Wrap(err, "read postings table") 1176 } 1177 } else { 1178 var lastKey []string 1179 lastOff := 0 1180 valueCount := 0 1181 // For the postings offset table we keep every label name but only every nth 1182 // label value (plus the first and last one), to save memory. 1183 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, _ uint64, off int) error { 1184 if len(key) != 2 { 1185 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1186 } 1187 if _, ok := r.postings[key[0]]; !ok { 1188 // Next label name. 1189 r.postings[key[0]] = []postingOffset{} 1190 if lastKey != nil { 1191 // Always include last value for each label name. 1192 r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff}) 1193 } 1194 lastKey = nil 1195 valueCount = 0 1196 } 1197 if valueCount%symbolFactor == 0 { 1198 r.postings[key[0]] = append(r.postings[key[0]], postingOffset{value: key[1], off: off}) 1199 lastKey = nil 1200 } else { 1201 lastKey = key 1202 lastOff = off 1203 } 1204 valueCount++ 1205 return nil 1206 }); err != nil { 1207 return nil, errors.Wrap(err, "read postings table") 1208 } 1209 if lastKey != nil { 1210 r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff}) 1211 } 1212 // Trim any extra space in the slices. 1213 for k, v := range r.postings { 1214 l := make([]postingOffset, len(v)) 1215 copy(l, v) 1216 r.postings[k] = l 1217 } 1218 } 1219 1220 r.nameSymbols = make(map[uint32]string, len(r.postings)) 1221 for k := range r.postings { 1222 if k == "" { 1223 continue 1224 } 1225 off, err := r.symbols.ReverseLookup(k) 1226 if err != nil { 1227 return nil, errors.Wrap(err, "reverse symbol lookup") 1228 } 1229 r.nameSymbols[off] = k 1230 } 1231 1232 r.fingerprintOffsets, err = readFingerprintOffsetsTable(r.b, r.toc.FingerprintOffsets) 1233 if err != nil { 1234 return nil, errors.Wrap(err, "loading fingerprint offsets") 1235 } 1236 1237 r.dec = &Decoder{LookupSymbol: r.lookupSymbol} 1238 1239 return r, nil 1240 } 1241 1242 // Version returns the file format version of the underlying index. 1243 func (r *Reader) Version() int { 1244 return r.version 1245 } 1246 1247 // FileInfo returns some general stats about the underlying file 1248 func (r *Reader) FileInfo() block.File { 1249 k, v := index.AllPostingsKey() 1250 postings, err := r.Postings(k, nil, v) 1251 if err != nil { 1252 panic(err) 1253 } 1254 var numSeries uint64 1255 for postings.Next() { 1256 numSeries++ 1257 } 1258 return block.File{ 1259 RelPath: block.IndexFilename, 1260 SizeBytes: uint64(r.Size()), 1261 TSDB: &block.TSDBFile{ 1262 NumSeries: numSeries, 1263 }, 1264 } 1265 } 1266 1267 // Range marks a byte range. 1268 type Range struct { 1269 Start, End int64 1270 } 1271 1272 // PostingsRanges returns a new map of byte range in the underlying index file 1273 // for all postings lists. 1274 func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) { 1275 m := map[labels.Label]Range{} 1276 if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error { 1277 if len(key) != 2 { 1278 return errors.Errorf("unexpected key length for posting table %d", len(key)) 1279 } 1280 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(off), castagnoliTable)) 1281 if d.Err() != nil { 1282 return d.Err() 1283 } 1284 m[labels.Label{Name: key[0], Value: key[1]}] = Range{ 1285 Start: int64(off) + 4, 1286 End: int64(off) + 4 + int64(d.Len()), 1287 } 1288 return nil 1289 }); err != nil { 1290 return nil, errors.Wrap(err, "read postings table") 1291 } 1292 return m, nil 1293 } 1294 1295 type Symbols struct { 1296 bs ByteSlice 1297 version int 1298 off int 1299 1300 offsets []int 1301 seen int 1302 } 1303 1304 const symbolFactor = 32 1305 1306 // NewSymbols returns a Symbols object for symbol lookups. 1307 func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) { 1308 s := &Symbols{ 1309 bs: bs, 1310 version: version, 1311 off: off, 1312 } 1313 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, off, castagnoliTable)) 1314 var ( 1315 origLen = d.Len() 1316 cnt = d.Be32int() 1317 basePos = off + 4 1318 ) 1319 s.offsets = make([]int, 0, 1+cnt/symbolFactor) 1320 for d.Err() == nil && s.seen < cnt { 1321 if s.seen%symbolFactor == 0 { 1322 s.offsets = append(s.offsets, basePos+origLen-d.Len()) 1323 } 1324 d.UvarintBytes() // The symbol. 1325 s.seen++ 1326 } 1327 if d.Err() != nil { 1328 return nil, d.Err() 1329 } 1330 return s, nil 1331 } 1332 1333 func (s Symbols) Lookup(o uint32) (string, error) { 1334 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1335 B: s.bs.Range(0, s.bs.Len()), 1336 }) 1337 1338 if s.version == FormatV2 { 1339 if int(o) >= s.seen { 1340 return "", errors.Errorf("unknown symbol offset %d", o) 1341 } 1342 d.Skip(s.offsets[int(o/symbolFactor)]) 1343 // Walk until we find the one we want. 1344 for i := o - (o / symbolFactor * symbolFactor); i > 0; i-- { 1345 d.UvarintBytes() 1346 } 1347 } else { 1348 d.Skip(int(o)) 1349 } 1350 sym := d.UvarintStr() 1351 if d.Err() != nil { 1352 return "", d.Err() 1353 } 1354 return sym, nil 1355 } 1356 1357 func (s Symbols) ReverseLookup(sym string) (uint32, error) { 1358 if len(s.offsets) == 0 { 1359 return 0, errors.Errorf("unknown symbol %q - no symbols", sym) 1360 } 1361 i := sort.Search(len(s.offsets), func(i int) bool { 1362 // Any decoding errors here will be lost, however 1363 // we already read through all of this at startup. 1364 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1365 B: s.bs.Range(0, s.bs.Len()), 1366 }) 1367 d.Skip(s.offsets[i]) 1368 return yoloString(d.UvarintBytes()) > sym 1369 }) 1370 d := encoding.DecWrap(tsdb_enc.Decbuf{ 1371 B: s.bs.Range(0, s.bs.Len()), 1372 }) 1373 if i > 0 { 1374 i-- 1375 } 1376 d.Skip(s.offsets[i]) 1377 res := i * symbolFactor 1378 var lastLen int 1379 var lastSymbol string 1380 for d.Err() == nil && res <= s.seen { 1381 lastLen = d.Len() 1382 lastSymbol = yoloString(d.UvarintBytes()) 1383 if lastSymbol >= sym { 1384 break 1385 } 1386 res++ 1387 } 1388 if d.Err() != nil { 1389 return 0, d.Err() 1390 } 1391 if lastSymbol != sym { 1392 return 0, errors.Errorf("unknown symbol %q", sym) 1393 } 1394 if s.version == FormatV2 { 1395 return uint32(res), nil 1396 } 1397 return uint32(s.bs.Len() - lastLen), nil 1398 } 1399 1400 func (s Symbols) Size() int { 1401 return len(s.offsets) * 8 1402 } 1403 1404 func (s Symbols) Iter() StringIter { 1405 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(s.bs, s.off, castagnoliTable)) 1406 cnt := d.Be32int() 1407 return &symbolsIter{ 1408 d: d, 1409 cnt: cnt, 1410 } 1411 } 1412 1413 // symbolsIter implements StringIter. 1414 type symbolsIter struct { 1415 d encoding.Decbuf 1416 cnt int 1417 cur string 1418 err error 1419 } 1420 1421 func (s *symbolsIter) Next() bool { 1422 if s.cnt == 0 || s.err != nil { 1423 return false 1424 } 1425 s.cur = yoloString(s.d.UvarintBytes()) 1426 s.cnt-- 1427 if s.d.Err() != nil { 1428 s.err = s.d.Err() 1429 return false 1430 } 1431 return true 1432 } 1433 1434 func (s symbolsIter) At() string { return s.cur } 1435 func (s symbolsIter) Err() error { return s.err } 1436 1437 // ReadOffsetTable reads an offset table and at the given position calls f for each 1438 // found entry. If f returns an error it stops decoding and returns the received error. 1439 func ReadOffsetTable(bs ByteSlice, off uint64, f func([]string, uint64, int) error) error { 1440 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable)) 1441 startLen := d.Len() 1442 cnt := d.Be32() 1443 1444 for d.Err() == nil && d.Len() > 0 && cnt > 0 { 1445 offsetPos := startLen - d.Len() 1446 keyCount := d.Uvarint() 1447 // The Postings offset table takes only 2 keys per entry (name and value of label), 1448 // and the LabelIndices offset table takes only 1 key per entry (a label name). 1449 // Hence setting the size to max of both, i.e. 2. 1450 keys := make([]string, 0, 2) 1451 1452 for i := 0; i < keyCount; i++ { 1453 keys = append(keys, d.UvarintStr()) 1454 } 1455 o := d.Uvarint64() 1456 if d.Err() != nil { 1457 break 1458 } 1459 if err := f(keys, o, offsetPos); err != nil { 1460 return err 1461 } 1462 cnt-- 1463 } 1464 return d.Err() 1465 } 1466 1467 func readFingerprintOffsetsTable(bs ByteSlice, off uint64) (index.FingerprintOffsets, error) { 1468 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable)) 1469 cnt := d.Be32() 1470 res := make(index.FingerprintOffsets, 0, int(cnt)) 1471 1472 for d.Err() == nil && d.Len() > 0 && cnt > 0 { 1473 res = append(res, [2]uint64{d.Be64(), d.Be64()}) 1474 cnt-- 1475 } 1476 1477 return res, d.Err() 1478 } 1479 1480 // Close the reader and its underlying resources. 1481 func (r *Reader) Close() error { 1482 return r.c.Close() 1483 } 1484 1485 func (r *Reader) lookupSymbol(o uint32) (string, error) { 1486 if s, ok := r.nameSymbols[o]; ok { 1487 return s, nil 1488 } 1489 return r.symbols.Lookup(o) 1490 } 1491 1492 func (r *Reader) Bounds() (int64, int64) { 1493 return r.toc.Metadata.From, r.toc.Metadata.Through 1494 } 1495 1496 func (r *Reader) Checksum() uint32 { 1497 return r.toc.Metadata.Checksum 1498 } 1499 1500 // Symbols returns an iterator over the symbols that exist within the index. 1501 func (r *Reader) Symbols() StringIter { 1502 return r.symbols.Iter() 1503 } 1504 1505 // SymbolTableSize returns the symbol table size in bytes. 1506 func (r *Reader) SymbolTableSize() uint64 { 1507 return uint64(r.symbols.Size()) 1508 } 1509 1510 // SortedLabelValues returns value tuples that exist for the given label name. 1511 // It is not safe to use the return value beyond the lifetime of the byte slice 1512 // passed into the Reader. 1513 func (r *Reader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { 1514 values, err := r.LabelValues(name, matchers...) 1515 if err == nil && r.version == FormatV1 { 1516 sort.Strings(values) 1517 } 1518 return values, err 1519 } 1520 1521 // LabelValues returns value tuples that exist for the given label name. 1522 // It is not safe to use the return value beyond the lifetime of the byte slice 1523 // passed into the Reader. 1524 // TODO(replay): Support filtering by matchers 1525 func (r *Reader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { 1526 if len(matchers) > 0 { 1527 return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers) 1528 } 1529 1530 if r.version == FormatV1 { 1531 e, ok := r.postingsV1[name] 1532 if !ok { 1533 return nil, nil 1534 } 1535 values := make([]string, 0, len(e)) 1536 for k := range e { 1537 values = append(values, k) 1538 } 1539 return values, nil 1540 1541 } 1542 e, ok := r.postings[name] 1543 if !ok { 1544 return nil, nil 1545 } 1546 if len(e) == 0 { 1547 return nil, nil 1548 } 1549 values := make([]string, 0, len(e)*symbolFactor) 1550 1551 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)) 1552 d.Skip(e[0].off) 1553 lastVal := e[len(e)-1].value 1554 1555 skip := 0 1556 for d.Err() == nil { 1557 if skip == 0 { 1558 // These are always the same number of bytes, 1559 // and it's faster to skip than parse. 1560 skip = d.Len() 1561 d.Uvarint() // Keycount. 1562 d.UvarintBytes() // Label name. 1563 skip -= d.Len() 1564 } else { 1565 d.Skip(skip) 1566 } 1567 s := yoloString(d.UvarintBytes()) // Label value. 1568 values = append(values, s) 1569 if s == lastVal { 1570 break 1571 } 1572 d.Uvarint64() // Offset. 1573 } 1574 if d.Err() != nil { 1575 return nil, errors.Wrap(d.Err(), "get postings offset entry") 1576 } 1577 return values, nil 1578 } 1579 1580 // LabelNamesFor returns all the label names for the series referred to by IDs. 1581 // The names returned are sorted. 1582 func (r *Reader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) { 1583 // Gather offsetsMap the name offsetsMap in the symbol table first 1584 offsetsMap := make(map[uint32]struct{}) 1585 for _, id := range ids { 1586 offset := id 1587 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1588 // and the ID is the multiple of 16 of the actual position. 1589 if r.version == FormatV2 { 1590 offset = id * 16 1591 } 1592 1593 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1594 buf := d.Get() 1595 if d.Err() != nil { 1596 return nil, errors.Wrap(d.Err(), "get buffer for series") 1597 } 1598 1599 offsets, err := r.dec.LabelNamesOffsetsFor(buf) 1600 if err != nil { 1601 return nil, errors.Wrap(err, "get label name offsets") 1602 } 1603 for _, off := range offsets { 1604 offsetsMap[off] = struct{}{} 1605 } 1606 } 1607 1608 // Lookup the unique symbols. 1609 names := make([]string, 0, len(offsetsMap)) 1610 for off := range offsetsMap { 1611 name, err := r.lookupSymbol(off) 1612 if err != nil { 1613 return nil, errors.Wrap(err, "lookup symbol in LabelNamesFor") 1614 } 1615 names = append(names, name) 1616 } 1617 1618 sort.Strings(names) 1619 1620 return names, nil 1621 } 1622 1623 // LabelValueFor returns label value for the given label name in the series referred to by ID. 1624 func (r *Reader) LabelValueFor(id storage.SeriesRef, label string) (string, error) { 1625 offset := id 1626 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1627 // and the ID is the multiple of 16 of the actual position. 1628 if r.version == FormatV2 { 1629 offset = id * 16 1630 } 1631 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1632 buf := d.Get() 1633 if d.Err() != nil { 1634 return "", errors.Wrap(d.Err(), "label values for") 1635 } 1636 1637 value, err := r.dec.LabelValueFor(buf, label) 1638 if err != nil { 1639 return "", storage.ErrNotFound 1640 } 1641 1642 if value == "" { 1643 return "", storage.ErrNotFound 1644 } 1645 1646 return value, nil 1647 } 1648 1649 // Series reads the series with the given ID and writes its labels and chunks into lbls and chks. 1650 func (r *Reader) Series(id storage.SeriesRef, lbls *phlaremodel.Labels, chks *[]index.ChunkMeta) (uint64, error) { 1651 offset := id 1652 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1653 // and the ID is the multiple of 16 of the actual position. 1654 if r.version == FormatV2 { 1655 offset = id * 16 1656 } 1657 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1658 if d.Err() != nil { 1659 return 0, d.Err() 1660 } 1661 1662 fprint, err := r.dec.Series(d.Get(), lbls, chks, false) 1663 if err != nil { 1664 return 0, errors.Wrap(err, "read series") 1665 } 1666 return fprint, nil 1667 } 1668 1669 // SeriesBy is like Series but allows to group labels by name. This avoid looking up all label symbols for requested series. 1670 func (r *Reader) SeriesBy(id storage.SeriesRef, lbls *phlaremodel.Labels, chks *[]index.ChunkMeta, by ...string) (uint64, error) { 1671 offset := id 1672 // In version 2 series IDs are no longer exact references but series are 16-byte padded 1673 // and the ID is the multiple of 16 of the actual position. 1674 if r.version == FormatV2 { 1675 offset = id * 16 1676 } 1677 d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)) 1678 if d.Err() != nil { 1679 return 0, d.Err() 1680 } 1681 1682 fprint, err := r.dec.Series(d.Get(), lbls, chks, true, by...) 1683 if err != nil { 1684 return 0, errors.Wrap(err, "read series") 1685 } 1686 return fprint, nil 1687 } 1688 1689 func (r *Reader) Postings(name string, shard *index.ShardAnnotation, values ...string) (index.Postings, error) { 1690 if r.version == FormatV1 { 1691 e, ok := r.postingsV1[name] 1692 if !ok { 1693 return index.EmptyPostings(), nil 1694 } 1695 res := make([]index.Postings, 0, len(values)) 1696 for _, v := range values { 1697 postingsOff, ok := e[v] 1698 if !ok { 1699 continue 1700 } 1701 // Read from the postings table. 1702 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)) 1703 _, p, err := r.dec.Postings(d.Get()) 1704 if err != nil { 1705 return nil, errors.Wrap(err, "decode postings") 1706 } 1707 res = append(res, p) 1708 } 1709 return index.Merge(res...), nil 1710 } 1711 1712 e, ok := r.postings[name] 1713 if !ok { 1714 return index.EmptyPostings(), nil 1715 } 1716 1717 if len(values) == 0 { 1718 return index.EmptyPostings(), nil 1719 } 1720 1721 res := make([]index.Postings, 0, len(values)) 1722 skip := 0 1723 valueIndex := 0 1724 for valueIndex < len(values) && values[valueIndex] < e[0].value { 1725 // Discard values before the start. 1726 valueIndex++ 1727 } 1728 for valueIndex < len(values) { 1729 value := values[valueIndex] 1730 1731 i := sort.Search(len(e), func(i int) bool { return e[i].value >= value }) 1732 if i == len(e) { 1733 // We're past the end. 1734 break 1735 } 1736 if i > 0 && e[i].value != value { 1737 // Need to look from previous entry. 1738 i-- 1739 } 1740 // Don't Crc32 the entire postings offset table, this is very slow 1741 // so hope any issues were caught at startup. 1742 d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil)) 1743 d.Skip(e[i].off) 1744 1745 // Iterate on the offset table. 1746 var postingsOff uint64 // The offset into the postings table. 1747 for d.Err() == nil { 1748 if skip == 0 { 1749 // These are always the same number of bytes, 1750 // and it's faster to skip than parse. 1751 skip = d.Len() 1752 d.Uvarint() // Keycount. 1753 d.UvarintBytes() // Label name. 1754 skip -= d.Len() 1755 } else { 1756 d.Skip(skip) 1757 } 1758 v := d.UvarintBytes() // Label value. 1759 postingsOff = d.Uvarint64() // Offset. 1760 for string(v) >= value { 1761 if string(v) == value { 1762 // Read from the postings table. 1763 d2 := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)) 1764 _, p, err := r.dec.Postings(d2.Get()) 1765 if err != nil { 1766 return nil, errors.Wrap(err, "decode postings") 1767 } 1768 res = append(res, p) 1769 } 1770 valueIndex++ 1771 if valueIndex == len(values) { 1772 break 1773 } 1774 value = values[valueIndex] 1775 } 1776 if i+1 == len(e) || value >= e[i+1].value || valueIndex == len(values) { 1777 // Need to go to a later postings offset entry, if there is one. 1778 break 1779 } 1780 } 1781 if d.Err() != nil { 1782 return nil, errors.Wrap(d.Err(), "get postings offset entry") 1783 } 1784 } 1785 1786 merged := index.Merge(res...) 1787 if shard != nil { 1788 return index.NewShardedPostings(merged, *shard, r.fingerprintOffsets), nil 1789 } 1790 1791 return merged, nil 1792 } 1793 1794 // Size returns the size of an index file. 1795 func (r *Reader) Size() int64 { 1796 return int64(r.b.Len()) 1797 } 1798 1799 // LabelNames returns all the unique label names present in the index. 1800 // TODO(twilkie) implement support for matchers 1801 func (r *Reader) LabelNames(matchers ...*labels.Matcher) ([]string, error) { 1802 if len(matchers) > 0 { 1803 return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers) 1804 } 1805 1806 labelNames := make([]string, 0, len(r.postings)) 1807 allPostingsKeyName, _ := index.AllPostingsKey() 1808 for name := range r.postings { 1809 //if name == index.allPostingsKey.Name { 1810 if name == allPostingsKeyName { 1811 // This is not from any metric. 1812 continue 1813 } 1814 labelNames = append(labelNames, name) 1815 } 1816 sort.Strings(labelNames) 1817 return labelNames, nil 1818 } 1819 1820 // Decoder provides decoding methods for the v1 and v2 index file format. 1821 // 1822 // It currently does not contain decoding methods for all entry types but can be extended 1823 // by them if there's demand. 1824 type Decoder struct { 1825 LookupSymbol func(uint32) (string, error) 1826 } 1827 1828 // Postings returns a postings list for b and its number of elements. 1829 func (dec *Decoder) Postings(b []byte) (int, index.Postings, error) { 1830 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1831 n := d.Be32int() 1832 l := d.Get() 1833 if d.Err() != nil { 1834 return 0, nil, d.Err() 1835 } 1836 if len(l) != 4*n { 1837 return 0, nil, fmt.Errorf("unexpected postings length, should be %d bytes for %d postings, got %d bytes", 4*n, n, len(l)) 1838 } 1839 return n, index.NewBigEndianPostings(l), nil 1840 } 1841 1842 // LabelNamesOffsetsFor decodes the offsets of the name symbols for a given series. 1843 // They are returned in the same order they're stored, which should be sorted lexicographically. 1844 func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) { 1845 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1846 _ = d.Be64() // skip fingerprint 1847 k := d.Uvarint() 1848 1849 offsets := make([]uint32, k) 1850 for i := 0; i < k; i++ { 1851 offsets[i] = uint32(d.Uvarint()) 1852 _ = d.Uvarint() // skip the label value 1853 1854 if d.Err() != nil { 1855 return nil, errors.Wrap(d.Err(), "read series label offsets") 1856 } 1857 } 1858 1859 return offsets, d.Err() 1860 } 1861 1862 // LabelValueFor decodes a label for a given series. 1863 func (dec *Decoder) LabelValueFor(b []byte, label string) (string, error) { 1864 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1865 _ = d.Be64() // skip fingerprint 1866 k := d.Uvarint() 1867 1868 for i := 0; i < k; i++ { 1869 lno := uint32(d.Uvarint()) 1870 lvo := uint32(d.Uvarint()) 1871 1872 if d.Err() != nil { 1873 return "", errors.Wrap(d.Err(), "read series label offsets") 1874 } 1875 1876 ln, err := dec.LookupSymbol(lno) 1877 if err != nil { 1878 return "", errors.Wrap(err, "lookup label name") 1879 } 1880 1881 if ln == label { 1882 lv, err := dec.LookupSymbol(lvo) 1883 if err != nil { 1884 return "", errors.Wrap(err, "lookup label value") 1885 } 1886 1887 return lv, nil 1888 } 1889 } 1890 1891 return "", d.Err() 1892 } 1893 1894 // Series decodes a series entry from the given byte slice into lset and chks. 1895 func (dec *Decoder) Series(b []byte, lbls *phlaremodel.Labels, chks *[]index.ChunkMeta, group bool, by ...string) (uint64, error) { 1896 if lbls != nil { 1897 *lbls = (*lbls)[:0] 1898 } 1899 *chks = (*chks)[:0] 1900 1901 d := encoding.DecWrap(tsdb_enc.Decbuf{B: b}) 1902 1903 fprint := d.Be64() 1904 k := d.Uvarint() 1905 1906 for i := 0; i < k; i++ { 1907 lno := uint32(d.Uvarint()) 1908 lvo := uint32(d.Uvarint()) 1909 1910 if d.Err() != nil { 1911 return 0, errors.Wrap(d.Err(), "read series label offsets") 1912 } 1913 if lbls == nil { 1914 continue 1915 } 1916 if group && len(by) == 0 { 1917 // If we're grouping by all labels, we don't need to decode them. 1918 continue 1919 } 1920 ln, err := dec.LookupSymbol(lno) 1921 if err != nil { 1922 return 0, errors.Wrap(err, "lookup label name") 1923 } 1924 if group { 1925 var found bool 1926 for _, b := range by { 1927 if b == ln { 1928 found = true 1929 break 1930 } 1931 } 1932 if !found { 1933 continue 1934 } 1935 } 1936 lv, err := dec.LookupSymbol(lvo) 1937 if err != nil { 1938 return 0, errors.Wrap(err, "lookup label value") 1939 } 1940 1941 *lbls = append(*lbls, &typesv1.LabelPair{Name: ln, Value: lv}) 1942 } 1943 1944 // Read the chunks meta data. 1945 k = d.Uvarint() 1946 1947 if k == 0 { 1948 return 0, d.Err() 1949 } 1950 1951 t0 := d.Varint64() 1952 maxt := int64(d.Uvarint64()) + t0 1953 kb := uint32(d.Uvarint()) 1954 entries := uint32(d.Uvarint64()) 1955 checksum := d.Be32() 1956 1957 *chks = append(*chks, index.ChunkMeta{ 1958 Checksum: checksum, 1959 MinTime: t0, 1960 MaxTime: maxt, 1961 KB: kb, 1962 SeriesIndex: entries, 1963 }) 1964 t0 = maxt 1965 1966 for i := 1; i < k; i++ { 1967 // Decode the diff against previous chunk as varint 1968 // instead of uvarint because chunks may overlap 1969 mint := d.Varint64() + t0 1970 maxt := int64(d.Uvarint64()) + mint 1971 kb := uint32(d.Uvarint()) 1972 entries := uint32(d.Uvarint64()) 1973 checksum := d.Be32() 1974 t0 = maxt 1975 1976 if d.Err() != nil { 1977 return 0, errors.Wrapf(d.Err(), "read meta for chunk %d", i) 1978 } 1979 1980 *chks = append(*chks, index.ChunkMeta{ 1981 Checksum: checksum, 1982 MinTime: mint, 1983 MaxTime: maxt, 1984 KB: kb, 1985 SeriesIndex: entries, 1986 }) 1987 } 1988 return fprint, d.Err() 1989 } 1990 1991 func yoloString(b []byte) string { 1992 return *((*string)(unsafe.Pointer(&b))) 1993 } 1994 1995 // todo better name, nicer api 1996 func (w *Writer) ReleaseIndexBuffer() *BufferWriter { 1997 res := w.f 1998 w.f = nil 1999 return res 2000 } 2001 2002 // todo better name, nicer api 2003 func (w *Writer) ReleaseIndex() []byte { 2004 bw := w.ReleaseIndexBuffer() 2005 defer PutBufferWriterToPool(bw) 2006 buffer, _, _ := bw.Buffer() 2007 res := make([]byte, len(buffer)) 2008 copy(res, buffer) 2009 return res 2010 }