// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package indexheader

import (
	"bufio"
	"bytes"
	"context"
	"encoding/binary"
	"hash"
	"hash/crc32"
	"io"
	"math"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"time"
	"unsafe"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/prometheus/tsdb/encoding"
	"github.com/prometheus/prometheus/tsdb/fileutil"
	"github.com/prometheus/prometheus/tsdb/index"
	"github.com/thanos-io/objstore"

	"github.com/thanos-io/thanos/pkg/block"
	"github.com/thanos-io/thanos/pkg/runutil"
)

const (
	// BinaryFormatV1 represents first version of index-header file.
	BinaryFormatV1 = 1

	// indexTOCLen is the length of the TOC at the end of the full TSDB index
	// file: six 8-byte section offsets plus a CRC32 checksum.
	indexTOCLen = 6*8 + crc32.Size
	// binaryTOCLen is the length of the TOC at the end of the index-header
	// file: two 8-byte section offsets plus a CRC32 checksum.
	binaryTOCLen = 2*8 + crc32.Size
	// headerLen represents number of bytes reserved of index header for header.
	// Layout: magic (4B) + index-header version (1B) + index version (1B) +
	// start of the posting offset table in the original index (8B).
	headerLen = 4 + 1 + 1 + 8

	// MagicIndex are 4 bytes at the head of an index-header file.
	MagicIndex = 0xBAAAD792

	// postingLengthFieldSize is the size of the length field that precedes
	// each postings list in the original index file.
	postingLengthFieldSize = 4
)

// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable *crc32.Table

func init() {
	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}

// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
// polynomial may be easily changed in one location at a later time, if necessary.
func newCRC32() hash.Hash32 {
	return crc32.New(castagnoliTable)
}

// BinaryTOC is a table of content for index-header file.
type BinaryTOC struct {
	// Symbols holds start to the same symbols section as index related to this index header.
	Symbols uint64
	// PostingsOffsetTable holds start to the same Postings Offset Table section as index related to this index header.
	PostingsOffsetTable uint64
}

// WriteBinary builds index header from the pieces of index in object storage, and cached in file if necessary.
// When filename is empty the header is built purely in memory and returned as a byte slice;
// otherwise it is written to filename (via a ".tmp" file + rename for atomicity) and nil bytes are returned.
func WriteBinary(ctx context.Context, bkt objstore.BucketReader, id ulid.ULID, filename string) ([]byte, error) {
	ir, indexVersion, err := newChunkedIndexReader(ctx, bkt, id)
	if err != nil {
		return nil, errors.Wrap(err, "new index reader")
	}
	tmpFilename := ""
	if filename != "" {
		tmpFilename = filename + ".tmp"
	}

	// Buffer for copying and encbuffers.
	// This also will control the size of file writer buffer.
	buf := make([]byte, 32*1024)
	bw, err := newBinaryWriter(id, tmpFilename, buf)
	if err != nil {
		return nil, errors.Wrap(err, "new binary index header writer")
	}
	// NOTE(review): err here is a local, not a named return, so an error captured
	// by this deferred close cannot change the function's return value — confirm intended.
	defer runutil.CloseWithErrCapture(&err, bw, "close binary writer for %s", tmpFilename)

	if err := bw.AddIndexMeta(indexVersion, ir.toc.PostingsTable); err != nil {
		return nil, errors.Wrap(err, "add index meta")
	}

	if err := ir.CopySymbols(bw.SymbolsWriter(), buf); err != nil {
		return nil, err
	}

	// Flush between sections so writer.Pos() reflects bytes actually written
	// before the next section offset is recorded.
	if err := bw.writer.Flush(); err != nil {
		return nil, errors.Wrap(err, "flush")
	}

	if err := ir.CopyPostingsOffsets(bw.PostingOffsetsWriter(), buf); err != nil {
		return nil, err
	}

	if err := bw.writer.Flush(); err != nil {
		return nil, errors.Wrap(err, "flush")
	}

	if err := bw.WriteTOC(); err != nil {
		return nil, errors.Wrap(err, "write index header TOC")
	}

	if err := bw.writer.Flush(); err != nil {
		return nil, errors.Wrap(err, "flush")
	}

	if err := bw.writer.Sync(); err != nil {
		return nil, errors.Wrap(err, "sync")
	}

	if tmpFilename != "" {
		// Create index-header in atomic way, to avoid partial writes (e.g during restart or crash of store GW).
		return nil, os.Rename(tmpFilename, filename)
	}

	return bw.Buffer(), nil
}

// chunkedIndexReader fetches pieces of a block's TSDB index file from object
// storage via range requests, without downloading the whole index.
type chunkedIndexReader struct {
	ctx  context.Context
	path string
	size uint64
	bkt  objstore.BucketReader
	toc  *index.TOC
}

// newChunkedIndexReader validates the remote index file's magic and version,
// reads its TOC and returns a reader plus the detected index format version.
func newChunkedIndexReader(ctx context.Context, bkt objstore.BucketReader, id ulid.ULID) (*chunkedIndexReader, int, error) {
	indexFilepath := filepath.Join(id.String(), block.IndexFilename)
	attrs, err := bkt.Attributes(ctx, indexFilepath)
	if err != nil {
		return nil, 0, errors.Wrapf(err, "get object attributes of %s", indexFilepath)
	}

	rc, err := bkt.GetRange(ctx, indexFilepath, 0, index.HeaderLen)
	if err != nil {
		return nil, 0, errors.Wrapf(err, "get TOC from object storage of %s", indexFilepath)
	}

	b, err := io.ReadAll(rc)
	if err != nil {
		runutil.CloseWithErrCapture(&err, rc, "close reader")
		return nil, 0, errors.Wrapf(err, "get header from object storage of %s", indexFilepath)
	}

	if err := rc.Close(); err != nil {
		return nil, 0, errors.Wrap(err, "close reader")
	}

	// Bytes 0-3: magic number of the TSDB index format.
	if m := binary.BigEndian.Uint32(b[0:4]); m != index.MagicIndex {
		return nil, 0, errors.Errorf("invalid magic number %x for %s", m, indexFilepath)
	}

	// Byte 4: index format version.
	version := int(b[4:5][0])

	if version != index.FormatV1 && version != index.FormatV2 {
		return nil, 0, errors.Errorf("not supported index file version %d of %s", version, indexFilepath)
	}

	ir := &chunkedIndexReader{
		ctx:  ctx,
		path: indexFilepath,
		size: uint64(attrs.Size),
		bkt:  bkt,
	}

	toc, err := ir.readTOC()
	if err != nil {
		return nil, 0, err
	}
	ir.toc = toc

	return ir, version, nil
}

// readTOC fetches and parses the TOC from the tail of the remote index file.
// NOTE(review): the range starts crc32.Size bytes earlier than strictly needed
// (indexTOCLen already includes the checksum); NewTOCFromByteSlice only looks at
// the last indexTOCLen bytes, so the extra prefix is a harmless over-fetch.
func (r *chunkedIndexReader) readTOC() (*index.TOC, error) {
	rc, err := r.bkt.GetRange(r.ctx, r.path, int64(r.size-indexTOCLen-crc32.Size), indexTOCLen+crc32.Size)
	if err != nil {
		return nil, errors.Wrapf(err, "get TOC from object storage of %s", r.path)
	}

	tocBytes, err := io.ReadAll(rc)
	if err != nil {
		runutil.CloseWithErrCapture(&err, rc, "close toc reader")
		return nil, errors.Wrapf(err, "get TOC from object storage of %s", r.path)
	}

	if err := rc.Close(); err != nil {
		return nil, errors.Wrap(err, "close toc reader")
	}

	toc, err := index.NewTOCFromByteSlice(realByteSlice(tocBytes))
	if err != nil {
		return nil, errors.Wrap(err, "new TOC")
	}
	return toc, nil
}

// CopySymbols streams the symbols section of the remote index (bytes
// [toc.Symbols, toc.Series)) into w, using buf as the copy buffer.
func (r *chunkedIndexReader) CopySymbols(w io.Writer, buf []byte) (err error) {
	rc, err := r.bkt.GetRange(r.ctx, r.path, int64(r.toc.Symbols), int64(r.toc.Series-r.toc.Symbols))
	if err != nil {
		return errors.Wrapf(err, "get symbols from object storage of %s", r.path)
	}
	defer runutil.CloseWithErrCapture(&err, rc, "close symbol reader")

	if _, err := io.CopyBuffer(w, rc, buf); err != nil {
		return errors.Wrap(err, "copy symbols")
	}

	return nil
}

// CopyPostingsOffsets streams the postings offset table section of the remote
// index (from toc.PostingsTable to the end of the file) into w.
func (r *chunkedIndexReader) CopyPostingsOffsets(w io.Writer, buf []byte) (err error) {
	rc, err := r.bkt.GetRange(r.ctx, r.path, int64(r.toc.PostingsTable), int64(r.size-r.toc.PostingsTable))
	if err != nil {
		return errors.Wrapf(err, "get posting offset table from object storage of %s", r.path)
	}
	defer runutil.CloseWithErrCapture(&err, rc, "close posting offsets reader")

	if _, err := io.CopyBuffer(w, rc, buf); err != nil {
		return errors.Wrap(err, "copy posting offsets")
	}

	return nil
}

// TODO(bwplotka): Add padding for efficient read.
// binaryWriter serializes an index-header either to a file or to memory,
// depending on the PosWriter implementation it wraps.
type binaryWriter struct {
	writer PosWriter

	// toc records the section start offsets, filled in as sections are written.
	toc BinaryTOC

	// Reusable memory.
	buf encoding.Encbuf

	crc32 hash.Hash
}

// newBinaryWriter creates a writer backed by cacheFilename (file-based) or, if
// cacheFilename is empty, by an in-memory buffer. It immediately writes the
// 5-byte prefix of the header: magic number + index-header format version.
func newBinaryWriter(id ulid.ULID, cacheFilename string, buf []byte) (w *binaryWriter, err error) {
	var binWriter PosWriter
	if cacheFilename != "" {
		dir := filepath.Dir(cacheFilename)

		// NOTE(review): `df, err :=` shadows the named return err inside this
		// branch, so the deferred CloseWithErrCapture below writes into the
		// shadowed variable and cannot surface a close error to the caller — confirm intended.
		df, err := fileutil.OpenDir(dir)
		if os.IsNotExist(err) {
			if err := os.MkdirAll(dir, os.ModePerm); err != nil {
				return nil, err
			}
			df, err = fileutil.OpenDir(dir)
		}
		if err != nil {
			return nil, err
		}

		defer runutil.CloseWithErrCapture(&err, df, "dir close")

		// Remove any stale/partial file from a previous run before writing.
		if err := os.RemoveAll(cacheFilename); err != nil {
			return nil, errors.Wrap(err, "remove any existing index at path")
		}

		var fileWriter *FileWriter
		fileWriter, err = NewFileWriter(cacheFilename, len(buf))
		if err != nil {
			return nil, err
		}
		// Sync the directory so the new file's dir entry is durable.
		if err := df.Sync(); err != nil {
			return nil, errors.Wrap(err, "sync dir")
		}
		binWriter = fileWriter
	} else {
		binWriter = NewMemoryWriter(id, len(buf))
	}

	w = &binaryWriter{
		writer: binWriter,

		// Reusable memory.
		buf:   encoding.Encbuf{B: buf},
		crc32: newCRC32(),
	}

	w.buf.Reset()
	w.buf.PutBE32(MagicIndex)
	w.buf.PutByte(BinaryFormatV1)

	return w, w.writer.Write(w.buf.Get())
}

// PosWriterWithBuffer is a PosWriter that can expose its accumulated bytes.
type PosWriterWithBuffer interface {
	PosWriter
	Buffer() []byte
}

// PosWriter is a writer that tracks the number of bytes written so far.
type PosWriter interface {
	Pos() uint64
	Write(bufs ...[]byte) error
	Flush() error
	Sync() error
	Close() error
}

// MemoryWriter is a PosWriter accumulating the index-header in memory.
type MemoryWriter struct {
	id  ulid.ULID
	buf *bytes.Buffer
	pos uint64
}

// NewMemoryWriter creates a MemoryWriter with an initial capacity of size bytes.
func NewMemoryWriter(id ulid.ULID, size int) *MemoryWriter {
	return &MemoryWriter{
		id:  id,
		buf: bytes.NewBuffer(make([]byte, 0, size)),
		pos: 0,
	}
}

// Pos returns the number of bytes written so far.
func (mw *MemoryWriter) Pos() uint64 {
	return mw.pos
}

// Write appends each buffer, enforcing the 64GiB size cap.
func (mw *MemoryWriter) Write(bufs ...[]byte) error {
	for _, b := range bufs {
		n, err := mw.buf.Write(b)
		mw.pos += uint64(n)
		if err != nil {
			return err
		}
		// For now the index file must not grow beyond 64GiB. Some of the fixed-sized
		// offset references in v1 are only 4 bytes large.
		// Once we move to compressed/varint representations in those areas, this limitation
		// can be lifted.
		if mw.pos > 16*math.MaxUint32 {
			return errors.Errorf("%q exceeding max size of 64GiB", mw.id)
		}
	}
	return nil
}

// Buffer returns the accumulated bytes (shared with the internal buffer).
func (mw *MemoryWriter) Buffer() []byte {
	return mw.buf.Bytes()
}

// Flush is a no-op for the in-memory writer.
func (mw *MemoryWriter) Flush() error {
	return nil
}

// Sync is a no-op for the in-memory writer.
func (mw *MemoryWriter) Sync() error {
	return nil
}

// Close is a no-op (delegates to Flush) for the in-memory writer.
func (mw *MemoryWriter) Close() error {
	return mw.Flush()
}

// FileWriter is a PosWriter writing the index-header to a buffered file.
type FileWriter struct {
	f          *os.File
	fileWriter *bufio.Writer
	name       string
	pos        uint64
}

// TODO(bwplotka): Added size to method, upstream this.
// NewFileWriter opens (creating if needed) name for writing with a bufio
// buffer of the given size.
func NewFileWriter(name string, size int) (*FileWriter, error) {
	f, err := os.OpenFile(filepath.Clean(name), os.O_CREATE|os.O_RDWR, 0600)
	if err != nil {
		return nil, err
	}
	return &FileWriter{
		f:          f,
		fileWriter: bufio.NewWriterSize(f, size),
		name:       name,
		pos:        0,
	}, nil
}

// Pos returns the number of bytes written so far (including buffered bytes).
func (fw *FileWriter) Pos() uint64 {
	return fw.pos
}

// Write appends each buffer via the buffered writer, enforcing the 64GiB cap.
func (fw *FileWriter) Write(bufs ...[]byte) error {
	for _, b := range bufs {
		n, err := fw.fileWriter.Write(b)
		fw.pos += uint64(n)
		if err != nil {
			return err
		}
		// For now the index file must not grow beyond 64GiB. Some of the fixed-sized
		// offset references in v1 are only 4 bytes large.
		// Once we move to compressed/varint representations in those areas, this limitation
		// can be lifted.
		if fw.pos > 16*math.MaxUint32 {
			return errors.Errorf("%q exceeding max size of 64GiB", fw.name)
		}
	}
	return nil
}

// Flush flushes buffered bytes to the underlying file.
func (fw *FileWriter) Flush() error {
	return fw.fileWriter.Flush()
}

// Close flushes, fsyncs and closes the underlying file.
func (fw *FileWriter) Close() error {
	if err := fw.Flush(); err != nil {
		return err
	}
	if err := fw.f.Sync(); err != nil {
		return err
	}
	return fw.f.Close()
}

// Sync fsyncs the underlying file. Note it does not flush the bufio layer;
// callers flush explicitly before syncing.
func (fw *FileWriter) Sync() error {
	return fw.f.Sync()
}

// Remove deletes the file from disk.
func (fw *FileWriter) Remove() error {
	return os.Remove(fw.name)
}

// AddIndexMeta writes the remaining header fields: the original index version
// and the start offset of its posting offset table (completing headerLen bytes).
func (w *binaryWriter) AddIndexMeta(indexVersion int, indexPostingOffsetTable uint64) error {
	w.buf.Reset()
	w.buf.PutByte(byte(indexVersion))
	w.buf.PutBE64(indexPostingOffsetTable)
	return w.writer.Write(w.buf.Get())
}

// SymbolsWriter records the current position as the symbols section start and
// returns a writer for the section's bytes.
func (w *binaryWriter) SymbolsWriter() io.Writer {
	w.toc.Symbols = w.writer.Pos()
	return w
}

// PostingOffsetsWriter records the current position as the postings offset
// table start and returns a writer for the section's bytes.
func (w *binaryWriter) PostingOffsetsWriter() io.Writer {
	w.toc.PostingsOffsetTable = w.writer.Pos()
	return w
}

// WriteTOC appends the index-header TOC (two BE64 offsets) plus its CRC32.
func (w *binaryWriter) WriteTOC() error {
	w.buf.Reset()

	w.buf.PutBE64(w.toc.Symbols)
	w.buf.PutBE64(w.toc.PostingsOffsetTable)

	w.buf.PutHash(w.crc32)

	return w.writer.Write(w.buf.Get())
}

// Write implements io.Writer; the returned count is derived from the position
// delta of the underlying PosWriter.
func (w *binaryWriter) Write(p []byte) (int, error) {
	n := w.writer.Pos()
	err := w.writer.Write(p)
	return int(w.writer.Pos() - n), err
}

// Buffer returns the in-memory bytes if the underlying writer supports it
// (MemoryWriter); nil for file-backed writers.
func (w *binaryWriter) Buffer() []byte {
	pwb, ok := w.writer.(PosWriterWithBuffer)
	if ok {
		return pwb.Buffer()
	}
	return nil
}

// Close closes the underlying PosWriter.
func (w *binaryWriter) Close() error {
	return w.writer.Close()
}

// postingValueOffsets holds the sampled offsets for one label name.
type postingValueOffsets struct {
	// offsets is a sorted subset of this label's value entries in the offset table.
	offsets []postingOffset
	// lastValOffset is the end offset of the last value's postings list.
	lastValOffset int64
}

type postingOffset struct {
	// label value.
	value string
	// offset of this entry in posting offset table in index-header file.
	tableOff int
}

// valueSymbolsCacheSize is the number of slots in the direct-mapped symbol cache.
const valueSymbolsCacheSize = 1024

// BinaryReader reads an index-header (from mmap'ed file or memory) and serves
// postings offsets, label names/values and symbol lookups from it.
type BinaryReader struct {
	b   index.ByteSlice
	toc *BinaryTOC

	// Close that releases the underlying resources of the byte slice.
	c io.Closer

	// Map of LabelName to a list of some LabelValues's position in the offset table.
	// The first and last values for each name are always present, we keep only 1/postingOffsetsInMemSampling of the rest.
	postings map[string]*postingValueOffsets
	// For the v1 format, labelname -> labelvalue -> offset.
	postingsV1 map[string]map[string]index.Range

	// Symbols struct that keeps only 1/postingOffsetsInMemSampling in the memory, then looks up the rest via mmap.
	symbols *index.Symbols
	// Cache of the label name symbol lookups,
	// as there are not many and they are half of all lookups.
	nameSymbols map[uint32]string
	// Direct cache of values. This is much faster than an LRU cache and still provides
	// a reasonable cache hit ratio.
	valueSymbolsMx sync.Mutex
	valueSymbols   [valueSymbolsCacheSize]struct {
		index  uint32
		symbol string
	}

	dec *index.Decoder

	version             int
	indexVersion        int
	indexLastPostingEnd int64

	postingOffsetsInMemSampling int
}

// NewBinaryReader loads or builds new index-header if not present on disk.
// With a non-empty dir the header is cached on disk and mmap'ed; otherwise it
// is built and kept purely in memory.
func NewBinaryReader(ctx context.Context, logger log.Logger, bkt objstore.BucketReader, dir string, id ulid.ULID, postingOffsetsInMemSampling int) (*BinaryReader, error) {
	if dir != "" {
		binfn := filepath.Join(dir, id.String(), block.IndexHeaderFilename)
		br, err := newFileBinaryReader(binfn, postingOffsetsInMemSampling)
		if err == nil {
			return br, nil
		}

		// Any read failure (missing, corrupt, version mismatch) triggers a rebuild.
		level.Debug(logger).Log("msg", "failed to read index-header from disk; recreating", "path", binfn, "err", err)

		start := time.Now()
		if _, err := WriteBinary(ctx, bkt, id, binfn); err != nil {
			return nil, errors.Wrap(err, "write index header")
		}

		level.Debug(logger).Log("msg", "built index-header file", "path", binfn, "elapsed", time.Since(start))
		return newFileBinaryReader(binfn, postingOffsetsInMemSampling)
	} else {
		buf, err := WriteBinary(ctx, bkt, id, "")
		if err != nil {
			return nil, errors.Wrap(err, "generate index header")
		}

		return newMemoryBinaryReader(buf, postingOffsetsInMemSampling)
	}
}

// newMemoryBinaryReader parses an in-memory index-header byte slice.
func newMemoryBinaryReader(buf []byte, postingOffsetsInMemSampling int) (bw *BinaryReader, err error) {
	r := &BinaryReader{
		b:                           realByteSlice(buf),
		c:                           nil,
		postings:                    map[string]*postingValueOffsets{},
		postingOffsetsInMemSampling: postingOffsetsInMemSampling,
	}

	if err := r.init(); err != nil {
		return nil, err
	}

	return r, nil
}

// newFileBinaryReader mmaps an index-header file from disk and parses it.
func newFileBinaryReader(path string, postingOffsetsInMemSampling int) (bw *BinaryReader, err error) {
	f, err := fileutil.OpenMmapFile(path)
	if err != nil {
		return nil, err
	}

	// Release the mmap if init fails; on success the reader owns it via r.c.
	defer func() {
		if err != nil {
			runutil.CloseWithErrCapture(&err, f, "index header close")
		}
	}()

	r := &BinaryReader{
		b:                           realByteSlice(f.Bytes()),
		c:                           f,
		postings:                    map[string]*postingValueOffsets{},
		postingOffsetsInMemSampling: postingOffsetsInMemSampling,
	}

	if err := r.init(); err != nil {
		return nil, err
	}

	return r, nil
}

// newBinaryTOCFromByteSlice return parsed TOC from given index header byte slice.
// The TOC occupies the last binaryTOCLen bytes; its CRC32 is verified first.
func newBinaryTOCFromByteSlice(bs index.ByteSlice) (*BinaryTOC, error) {
	if bs.Len() < binaryTOCLen {
		return nil, encoding.ErrInvalidSize
	}
	b := bs.Range(bs.Len()-binaryTOCLen, bs.Len())

	expCRC := binary.BigEndian.Uint32(b[len(b)-4:])
	d := encoding.Decbuf{B: b[:len(b)-4]}

	if d.Crc32(castagnoliTable) != expCRC {
		return nil, errors.Wrap(encoding.ErrInvalidChecksum, "read index header TOC")
	}

	if err := d.Err(); err != nil {
		return nil, err
	}

	return &BinaryTOC{
		Symbols:             d.Be64(),
		PostingsOffsetTable: d.Be64(),
	}, nil
}

// init validates the header, parses the TOC and symbols, and builds the
// in-memory (sampled) postings offset lookup structures.
func (r *BinaryReader) init() (err error) {
	// Verify header.
	if r.b.Len() < headerLen {
		return errors.Wrap(encoding.ErrInvalidSize, "index header's header")
	}
	if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
		return errors.Errorf("invalid magic number %x", m)
	}
	// Byte 4: index-header format version; byte 5: original index format version.
	r.version = int(r.b.Range(4, 5)[0])
	r.indexVersion = int(r.b.Range(5, 6)[0])

	// Bytes 6-13: start of the posting offset table in the original index,
	// which equals the end of the last postings list.
	r.indexLastPostingEnd = int64(binary.BigEndian.Uint64(r.b.Range(6, headerLen)))

	if r.version != BinaryFormatV1 {
		return errors.Errorf("unknown index header file version %d", r.version)
	}

	r.toc, err = newBinaryTOCFromByteSlice(r.b)
	if err != nil {
		return errors.Wrap(err, "read index header TOC")
	}

	// TODO(bwplotka): Consider contributing to Prometheus to allow specifying custom number for symbolsFactor.
	r.symbols, err = index.NewSymbols(r.b, r.indexVersion, int(r.toc.Symbols))
	if err != nil {
		return errors.Wrap(err, "read symbols")
	}

	var lastName, lastValue []byte
	if r.indexVersion == index.FormatV1 {
		// Earlier V1 formats don't have a sorted postings offset table, so
		// load the whole offset table into memory.
		r.postingsV1 = map[string]map[string]index.Range{}

		var prevRng index.Range
		if err := index.ReadPostingsOffsetTable(r.b, r.toc.PostingsOffsetTable, func(name, value []byte, postingsOffset uint64, _ int) error {
			// Close the previous entry's range now that the next offset is known.
			if lastName != nil {
				prevRng.End = int64(postingsOffset - crc32.Size)
				r.postingsV1[string(lastName)][string(lastValue)] = prevRng
			}

			if _, ok := r.postingsV1[string(name)]; !ok {
				r.postingsV1[string(name)] = map[string]index.Range{}
				r.postings[string(name)] = nil // Used to get a list of labelnames in places.
			}

			lastName = name
			lastValue = value
			prevRng = index.Range{Start: int64(postingsOffset + postingLengthFieldSize)}
			return nil
		}); err != nil {
			return errors.Wrap(err, "read postings table")
		}
		if string(lastName) != "" {
			// Close the final entry's range against the end of the postings section.
			prevRng.End = r.indexLastPostingEnd - crc32.Size
			r.postingsV1[string(lastName)][string(lastValue)] = prevRng
		}
	} else {
		lastTableOff := 0
		valueCount := 0

		// For the postings offset table we keep every label name but only every nth
		// label value (plus the first and last one), to save memory.
		if err := index.ReadPostingsOffsetTable(r.b, r.toc.PostingsOffsetTable, func(name, value []byte, postingsOffset uint64, labelOffset int) error {
			if _, ok := r.postings[string(name)]; !ok {
				// Not seen before label name.
				r.postings[string(name)] = &postingValueOffsets{}
				if lastName != nil {
					// Always include last value for each label name, unless it was just added in previous iteration based
					// on valueCount.
					if (valueCount-1)%r.postingOffsetsInMemSampling != 0 {
						r.postings[string(lastName)].offsets = append(r.postings[string(lastName)].offsets, postingOffset{value: string(lastValue), tableOff: lastTableOff})
					}
					r.postings[string(lastName)].lastValOffset = int64(postingsOffset - crc32.Size)
					lastName = nil
					lastValue = nil
				}
				valueCount = 0
			}

			lastName = name
			lastValue = value
			lastTableOff = labelOffset
			valueCount++

			// Sample every nth value (the first of each name always matches).
			if (valueCount-1)%r.postingOffsetsInMemSampling == 0 {
				r.postings[string(name)].offsets = append(r.postings[string(name)].offsets, postingOffset{value: string(value), tableOff: labelOffset})
			}

			return nil
		}); err != nil {
			return errors.Wrap(err, "read postings table")
		}
		if lastName != nil {
			if (valueCount-1)%r.postingOffsetsInMemSampling != 0 {
				// Always include last value for each label name if not included already based on valueCount.
				r.postings[string(lastName)].offsets = append(r.postings[string(lastName)].offsets, postingOffset{value: string(lastValue), tableOff: lastTableOff})
			}
			// In any case lastValOffset is unknown as don't have next posting anymore. Guess from TOC table.
			// In worst case we will overfetch a few bytes.
			r.postings[string(lastName)].lastValOffset = r.indexLastPostingEnd - crc32.Size
		}
		// Trim any extra space in the slices.
		for k, v := range r.postings {
			l := make([]postingOffset, len(v.offsets))
			copy(l, v.offsets)
			r.postings[k].offsets = l
		}
	}

	// Pre-resolve symbol refs for all label names; these are a large share of lookups.
	r.nameSymbols = make(map[uint32]string, len(r.postings))
	for k := range r.postings {
		if k == "" {
			continue
		}
		off, err := r.symbols.ReverseLookup(k)
		if err != nil {
			return errors.Wrap(err, "reverse symbol lookup")
		}
		r.nameSymbols[off] = k
	}

	r.dec = &index.Decoder{LookupSymbol: r.LookupSymbol}

	return nil
}

// IndexVersion returns the format version of the original TSDB index.
func (r *BinaryReader) IndexVersion() (int, error) {
	return r.indexVersion, nil
}

// TODO(bwplotka): Get advantage of multi value offset fetch.
// PostingsOffset returns the byte range of the postings list for the given
// label name/value pair, or NotFoundRangeErr when absent.
func (r *BinaryReader) PostingsOffset(name, value string) (index.Range, error) {
	rngs, err := r.postingsOffset(name, value)
	if err != nil {
		return index.Range{}, err
	}
	if len(rngs) != 1 {
		return index.Range{}, NotFoundRangeErr
	}
	return rngs[0], nil
}

// skipNAndName skips the entry-count and label-name fields of a postings
// offset table entry. On first use it measures their byte length into buf;
// subsequent calls skip that many bytes directly (within one name the fields
// are identical, so the length is constant).
func skipNAndName(d *encoding.Decbuf, buf *int) {
	if *buf == 0 {
		// Keycount+LabelName are always the same number of bytes,
		// and it's faster to skip than parse.
		*buf = d.Len()
		d.Uvarint()      // Keycount.
		d.UvarintBytes() // Label name.
		*buf -= d.Len()
		return
	}
	d.Skip(*buf)
}

// postingsOffset returns the postings ranges for the given (sorted) values of
// a label name. For v2 it binary-searches the sampled offsets, then scans the
// on-disk offset table forward from the nearest sample.
func (r *BinaryReader) postingsOffset(name string, values ...string) ([]index.Range, error) {
	rngs := make([]index.Range, 0, len(values))
	if r.indexVersion == index.FormatV1 {
		// v1: everything is in memory already.
		e, ok := r.postingsV1[name]
		if !ok {
			return nil, nil
		}
		for _, v := range values {
			rng, ok := e[v]
			if !ok {
				continue
			}
			rngs = append(rngs, rng)
		}
		return rngs, nil
	}

	e, ok := r.postings[name]
	if !ok {
		return nil, nil
	}

	if len(values) == 0 {
		return nil, nil
	}

	buf := 0
	valueIndex := 0
	for valueIndex < len(values) && values[valueIndex] < e.offsets[0].value {
		// Discard values before the start.
		valueIndex++
	}

	var newSameRngs []index.Range // The start, end offsets in the postings table in the original index file.
	for valueIndex < len(values) {
		wantedValue := values[valueIndex]

		i := sort.Search(len(e.offsets), func(i int) bool { return e.offsets[i].value >= wantedValue })
		if i == len(e.offsets) {
			// We're past the end.
			break
		}
		if i > 0 && e.offsets[i].value != wantedValue {
			// Need to look from previous entry.
			i--
		}

		// Don't Crc32 the entire postings offset table, this is very slow
		// so hope any issues were caught at startup.
		d := encoding.NewDecbufAt(r.b, int(r.toc.PostingsOffsetTable), nil)
		d.Skip(e.offsets[i].tableOff)

		// Iterate on the offset table.
		newSameRngs = newSameRngs[:0]
		for d.Err() == nil {
			// Posting format entry is as follows:
			// │ ┌────────────────────────────────────────┐ │
			// │ │ n = 2 <1b>                             │ │
			// │ ├──────────────────────┬─────────────────┤ │
			// │ │ len(name) <uvarint>  │ name <bytes>    │ │
			// │ ├──────────────────────┼─────────────────┤ │
			// │ │ len(value) <uvarint> │ value <bytes>   │ │
			// │ ├──────────────────────┴─────────────────┤ │
			// │ │ offset <uvarint64>                     │ │
			// │ └────────────────────────────────────────┘ │
			// First, let's skip n and name.
			skipNAndName(&d, &buf)
			value := d.UvarintBytes() // Label value.
			postingOffset := int64(d.Uvarint64())

			if len(newSameRngs) > 0 {
				// We added some ranges in previous iteration. Use next posting offset as end of all our new ranges.
				for j := range newSameRngs {
					newSameRngs[j].End = postingOffset - crc32.Size
				}
				rngs = append(rngs, newSameRngs...)
				newSameRngs = newSameRngs[:0]
			}

			for string(value) >= wantedValue {
				// If wantedValue is equals of greater than current value, loop over all given wanted values in the values until
				// this is no longer true or there are no more values wanted.
				// This ensures we cover case when someone asks for postingsOffset(name, value1, value1, value1).

				// Record on the way if wanted value is equal to the current value.
				if string(value) == wantedValue {
					newSameRngs = append(newSameRngs, index.Range{Start: postingOffset + postingLengthFieldSize})
				}
				valueIndex++
				if valueIndex == len(values) {
					break
				}
				wantedValue = values[valueIndex]
			}

			if i+1 == len(e.offsets) {
				// No more offsets for this name.
				// Break this loop and record lastOffset on the way for ranges we just added if any.
				for j := range newSameRngs {
					newSameRngs[j].End = e.lastValOffset
				}
				rngs = append(rngs, newSameRngs...)
				break
			}

			if valueIndex != len(values) && wantedValue <= e.offsets[i+1].value {
				// wantedValue is smaller or same as the next offset we know about, let's iterate further to add those.
				continue
			}

			// Nothing wanted or wantedValue is larger than next offset we know about.
			// Let's exit and do binary search again / exit if nothing wanted.

			if len(newSameRngs) > 0 {
				// We added some ranges in this iteration. Use next posting offset as the end of our ranges.
				// We know it exists as we never go further in this loop than e.offsets[i, i+1].

				skipNAndName(&d, &buf)
				d.UvarintBytes() // Label value.
				postingOffset := int64(d.Uvarint64())

				for j := range newSameRngs {
					newSameRngs[j].End = postingOffset - crc32.Size
				}
				rngs = append(rngs, newSameRngs...)
			}
			break
		}
		if d.Err() != nil {
			return nil, errors.Wrap(d.Err(), "get postings offset entry")
		}
	}

	return rngs, nil
}

// LookupSymbol resolves a symbol reference to its string, consulting the
// label-name map and a direct-mapped value cache before hitting the symbols table.
func (r *BinaryReader) LookupSymbol(o uint32) (string, error) {
	cacheIndex := o % valueSymbolsCacheSize
	r.valueSymbolsMx.Lock()
	if cached := r.valueSymbols[cacheIndex]; cached.index == o && cached.symbol != "" {
		v := cached.symbol
		r.valueSymbolsMx.Unlock()
		return v, nil
	}
	r.valueSymbolsMx.Unlock()

	if s, ok := r.nameSymbols[o]; ok {
		return s, nil
	}

	if r.indexVersion == index.FormatV1 {
		// For v1 little trick is needed. Refs are actual offset inside index, not index-header. This is different
		// of the header length difference between two files.
		o += headerLen - index.HeaderLen
	}

	s, err := r.symbols.Lookup(o)
	if err != nil {
		return s, err
	}

	r.valueSymbolsMx.Lock()
	r.valueSymbols[cacheIndex].index = o
	r.valueSymbols[cacheIndex].symbol = s
	r.valueSymbolsMx.Unlock()

	return s, nil
}

// LabelValues returns all (sorted) values of the given label name, scanning the
// offset table from the first sampled entry until the known last value.
func (r *BinaryReader) LabelValues(name string) ([]string, error) {
	if r.indexVersion == index.FormatV1 {
		e, ok := r.postingsV1[name]
		if !ok {
			return nil, nil
		}
		values := make([]string, 0, len(e))
		for k := range e {
			values = append(values, k)
		}
		sort.Strings(values)
		return values, nil

	}
	e, ok := r.postings[name]
	if !ok {
		return nil, nil
	}
	if len(e.offsets) == 0 {
		return nil, nil
	}
	values := make([]string, 0, len(e.offsets)*r.postingOffsetsInMemSampling)

	d := encoding.NewDecbufAt(r.b, int(r.toc.PostingsOffsetTable), nil)
	d.Skip(e.offsets[0].tableOff)
	lastVal := e.offsets[len(e.offsets)-1].value

	skip := 0
	for d.Err() == nil {
		if skip == 0 {
			// These are always the same number of bytes,
			// and it's faster to skip than parse.
			skip = d.Len()
			d.Uvarint()      // Keycount.
			d.UvarintBytes() // Label name.
			skip -= d.Len()
		} else {
			d.Skip(skip)
		}
		s := yoloString(d.UvarintBytes()) // Label value.
		values = append(values, s)
		if s == lastVal {
			break
		}
		d.Uvarint64() // Offset.
	}
	if d.Err() != nil {
		return nil, errors.Wrap(d.Err(), "get postings offset entry")
	}
	return values, nil
}

// yoloString converts a byte slice to a string without copying. The result
// aliases b and must not outlive or be used after b's backing memory changes.
func yoloString(b []byte) string {
	return *((*string)(unsafe.Pointer(&b)))
}

// LabelNames returns all (sorted) label names present in the index, excluding
// the synthetic all-postings key.
func (r *BinaryReader) LabelNames() ([]string, error) {
	allPostingsKeyName, _ := index.AllPostingsKey()
	labelNames := make([]string, 0, len(r.postings))
	for name := range r.postings {
		if name == allPostingsKeyName {
			// This is not from any metric.
			continue
		}
		labelNames = append(labelNames, name)
	}
	sort.Strings(labelNames)
	return labelNames, nil
}

// Close releases the underlying byte slice resources (mmap), if any.
func (r *BinaryReader) Close() error {
	if r.c == nil {
		return nil
	}
	return r.c.Close()
}

// realByteSlice adapts a plain []byte to the index.ByteSlice interface.
type realByteSlice []byte

func (b realByteSlice) Len() int {
	return len(b)
}

func (b realByteSlice) Range(start, end int) []byte {
	return b[start:end]
}

func (b realByteSlice) Sub(start, end int) index.ByteSlice {
	return b[start:end]
}