github.com/thanos-io/thanos@v0.32.5/pkg/store/postings_codec.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package store 5 6 import ( 7 "bytes" 8 "encoding/binary" 9 "fmt" 10 "hash/crc32" 11 "io" 12 13 "github.com/golang/snappy" 14 "github.com/klauspost/compress/s2" 15 "github.com/pkg/errors" 16 "github.com/prometheus/prometheus/storage" 17 "github.com/prometheus/prometheus/tsdb/encoding" 18 "github.com/prometheus/prometheus/tsdb/index" 19 extsnappy "github.com/thanos-io/thanos/pkg/extgrpc/snappy" 20 "github.com/thanos-io/thanos/pkg/pool" 21 ) 22 23 // This file implements encoding and decoding of postings using diff (or delta) + varint 24 // number encoding. On top of that, we apply Snappy compression. 25 // 26 // On its own, Snappy compressing raw postings doesn't really help, because there is no 27 // repetition in raw data. Using diff (delta) between postings entries makes values small, 28 // and Varint is very efficient at encoding small values (values < 128 are encoded as 29 // single byte, values < 16384 are encoded as two bytes). Diff + varint reduces postings size 30 // significantly (to about 20% of original), snappy then halves it to ~10% of the original. 31 32 const ( 33 codecHeaderSnappy = "dvs" // As in "diff+varint+snappy". 34 codecHeaderStreamedSnappy = "dss" // As in "diffvarint+streamed snappy". 35 ) 36 37 func decodePostings(input []byte) (closeablePostings, error) { 38 var df func([]byte, bool) (closeablePostings, error) 39 40 switch { 41 case isDiffVarintSnappyEncodedPostings(input): 42 df = diffVarintSnappyDecode 43 case isDiffVarintSnappyStreamedEncodedPostings(input): 44 df = diffVarintSnappyStreamedDecode 45 default: 46 return nil, fmt.Errorf("unrecognize postings format") 47 } 48 49 return df(input, false) 50 } 51 52 // isDiffVarintSnappyEncodedPostings returns true, if input looks like it has been encoded by diff+varint+snappy codec. 53 func isDiffVarintSnappyEncodedPostings(input []byte) bool { 54 return bytes.HasPrefix(input, []byte(codecHeaderSnappy)) 55 } 56 57 // isDiffVarintSnappyStreamedEncodedPostings returns true, if input looks like it has been encoded by diff+varint+snappy streamed codec. 58 func isDiffVarintSnappyStreamedEncodedPostings(input []byte) bool { 59 return bytes.HasPrefix(input, []byte(codecHeaderStreamedSnappy)) 60 } 61 62 // estimateSnappyStreamSize estimates the number of bytes 63 // needed for encoding length postings. Note that in reality 64 // the number of bytes needed could be much bigger if postings 65 // different by a lot. Practically, stddev=64 is used. 66 func estimateSnappyStreamSize(length int) int { 67 // Snappy stream writes data in chunks up to 65536 in size. 68 // The stream begins with bytes 0xff 0x06 0x00 0x00 's' 'N' 'a' 'P' 'p' 'Y'. 69 // Our encoded data also needs a header. 70 // Each encoded (or uncompressed) chunk needs tag (chunk type 1B + chunk len 3B) + checksum 4B. 71 72 // Mark for encoded data. 73 ret := len(codecHeaderStreamedSnappy) 74 // Magic snappy stream start. 75 ret += 10 76 77 const maxBlockSize = 65536 78 79 length = 5 * length / 4 // estimate 1.25B per posting. 80 81 blocks := length / maxBlockSize 82 83 ret += blocks * snappy.MaxEncodedLen(maxBlockSize) 84 length -= blocks * maxBlockSize 85 if length > 0 { 86 ret += snappy.MaxEncodedLen(length) 87 } 88 89 return ret 90 } 91 92 func diffVarintSnappyStreamedEncode(p index.Postings, length int) ([]byte, error) { 93 compressedBuf := bytes.NewBuffer(make([]byte, 0, estimateSnappyStreamSize(length))) 94 if n, err := compressedBuf.WriteString(codecHeaderStreamedSnappy); err != nil { 95 return nil, fmt.Errorf("writing streamed snappy header") 96 } else if n != len(codecHeaderStreamedSnappy) { 97 return nil, fmt.Errorf("short-write streamed snappy header") 98 } 99 100 uvarintEncodeBuf := make([]byte, binary.MaxVarintLen64) 101 102 sw, err := extsnappy.Compressor.Compress(compressedBuf) 103 if err != nil { 104 return nil, fmt.Errorf("creating snappy compressor: %w", err) 105 } 106 107 prev := storage.SeriesRef(0) 108 for p.Next() { 109 v := p.At() 110 if v < prev { 111 return nil, errors.Errorf("postings entries must be in increasing order, current: %d, previous: %d", v, prev) 112 } 113 114 uvarintSize := binary.PutUvarint(uvarintEncodeBuf, uint64(v-prev)) 115 if written, err := sw.Write(uvarintEncodeBuf[:uvarintSize]); err != nil { 116 return nil, errors.Wrap(err, "writing uvarint encoded byte") 117 } else if written != uvarintSize { 118 return nil, errors.Wrap(err, "short-write for uvarint encoded byte") 119 } 120 121 prev = v 122 } 123 if p.Err() != nil { 124 return nil, p.Err() 125 } 126 if err := sw.Close(); err != nil { 127 return nil, errors.Wrap(err, "closing snappy stream writer") 128 } 129 130 return compressedBuf.Bytes(), nil 131 } 132 133 func diffVarintSnappyStreamedDecode(input []byte, disablePooling bool) (closeablePostings, error) { 134 if !isDiffVarintSnappyStreamedEncodedPostings(input) { 135 return nil, errors.New("header not found") 136 } 137 138 return newStreamedDiffVarintPostings(input[len(codecHeaderStreamedSnappy):], disablePooling) 139 } 140 141 type streamedDiffVarintPostings struct { 142 curSeries storage.SeriesRef 143 144 err error 145 input, buf []byte 146 maximumDecodedLen int 147 148 db *encoding.Decbuf 149 150 readSnappyIdentifier bool 151 disablePooling bool 152 } 153 154 const ( 155 chunkTypeCompressedData = 0x00 156 chunkTypeUncompressedData = 0x01 157 chunkTypeStreamIdentifier = 0xff 158 chunkTypePadding = 0xfe 159 checksumSize = 4 160 ) 161 162 func maximumDecodedLenSnappyStreamed(in []byte) (int, error) { 163 maxDecodedLen := -1 164 165 for len(in) > 0 { 166 // Chunk type. 167 chunkType := in[0] 168 in = in[1:] 169 chunkLen := int(in[0]) | int(in[1])<<8 | int(in[2])<<16 170 in = in[3:] 171 172 switch chunkType { 173 case chunkTypeCompressedData: 174 bl := in[:chunkLen] 175 // NOTE: checksum will be checked later on. 176 decodedLen, err := s2.DecodedLen(bl[checksumSize:]) 177 if err != nil { 178 return 0, err 179 } 180 if decodedLen > maxDecodedLen { 181 maxDecodedLen = decodedLen 182 } 183 case chunkTypeUncompressedData: 184 // NOTE: checksum will be checked later on. 185 n := chunkLen - checksumSize 186 if n > maxDecodedLen { 187 maxDecodedLen = n 188 } 189 } 190 in = in[chunkLen:] 191 } 192 return maxDecodedLen, nil 193 } 194 195 var decodedBufPool = pool.MustNewBucketedBytes(1024, 65536, 2, 0) 196 197 func newStreamedDiffVarintPostings(input []byte, disablePooling bool) (closeablePostings, error) { 198 // We can't use the regular s2.Reader because it assumes a stream. 199 // We already everything in memory so let's avoid copying. 200 // Algorithm: 201 // 1. Step through all chunks all get maximum decoded len. 202 // 2. Read into decoded step by step. For decoding call s2.Decode(r.decoded, buf). 203 maximumDecodedLen, err := maximumDecodedLenSnappyStreamed(input) 204 if err != nil { 205 return nil, err 206 } 207 208 return &streamedDiffVarintPostings{ 209 input: input, 210 maximumDecodedLen: maximumDecodedLen, 211 db: &encoding.Decbuf{}, 212 disablePooling: disablePooling, 213 }, nil 214 } 215 216 func (it *streamedDiffVarintPostings) close() { 217 if it.buf == nil { 218 return 219 } 220 if it.disablePooling { 221 return 222 } 223 decodedBufPool.Put(&it.buf) 224 } 225 226 func (it *streamedDiffVarintPostings) At() storage.SeriesRef { 227 return it.curSeries 228 } 229 230 func (it *streamedDiffVarintPostings) readNextChunk(remainder []byte) bool { 231 // Normal EOF. 232 if len(it.input) == 0 { 233 return false 234 } 235 236 // Read next chunk into it.db.B. 237 chunkType := it.input[0] 238 it.input = it.input[1:] 239 240 if len(it.input) < 3 { 241 it.err = io.ErrUnexpectedEOF 242 return false 243 } 244 245 chunkLen := int(it.input[0]) | int(it.input[1])<<8 | int(it.input[2])<<16 246 it.input = it.input[3:] 247 248 switch chunkType { 249 case chunkTypeStreamIdentifier: 250 const magicBody = "sNaPpY" 251 if chunkLen != len(magicBody) { 252 it.err = fmt.Errorf("corrupted identifier") 253 return false 254 } 255 if string(it.input[:len(magicBody)]) != magicBody { 256 it.err = fmt.Errorf("got bad identifier %s", string(it.input[:6])) 257 return false 258 } 259 it.input = it.input[6:] 260 it.readSnappyIdentifier = true 261 return it.readNextChunk(nil) 262 case chunkTypeCompressedData: 263 if !it.readSnappyIdentifier { 264 it.err = fmt.Errorf("missing magic snappy marker") 265 return false 266 } 267 if len(it.input) < 4 { 268 it.err = io.ErrUnexpectedEOF 269 return false 270 } 271 checksum := uint32(it.input[0]) | uint32(it.input[1])<<8 | uint32(it.input[2])<<16 | uint32(it.input[3])<<24 272 if len(it.input) < chunkLen { 273 it.err = io.ErrUnexpectedEOF 274 return false 275 } 276 277 if it.buf == nil { 278 if it.disablePooling { 279 it.buf = make([]byte, it.maximumDecodedLen) 280 } else { 281 b, err := decodedBufPool.Get(it.maximumDecodedLen) 282 if err != nil { 283 it.err = err 284 return false 285 } 286 it.buf = *b 287 } 288 } 289 290 encodedBuf := it.input[:chunkLen] 291 292 // NOTE(GiedriusS): we can probably optimize this better but this should be rare enough 293 // and not cause any problems. 294 if len(remainder) > 0 { 295 remainderCopy := make([]byte, 0, len(remainder)) 296 remainderCopy = append(remainderCopy, remainder...) 297 remainder = remainderCopy 298 } 299 decoded, err := s2.Decode(it.buf, encodedBuf[checksumSize:]) 300 if err != nil { 301 it.err = err 302 return false 303 } 304 if crc(decoded) != checksum { 305 it.err = fmt.Errorf("mismatched checksum (got %v, expected %v)", crc(decoded), checksum) 306 return false 307 } 308 if len(remainder) > 0 { 309 it.db.B = append(remainder, decoded...) 310 } else { 311 it.db.B = decoded 312 } 313 case chunkTypeUncompressedData: 314 if !it.readSnappyIdentifier { 315 it.err = fmt.Errorf("missing magic snappy marker") 316 return false 317 } 318 if len(it.input) < 4 { 319 it.err = io.ErrUnexpectedEOF 320 return false 321 } 322 checksum := uint32(it.input[0]) | uint32(it.input[1])<<8 | uint32(it.input[2])<<16 | uint32(it.input[3])<<24 323 if len(it.input) < chunkLen { 324 it.err = io.ErrUnexpectedEOF 325 return false 326 } 327 uncompressedData := it.input[checksumSize:chunkLen] 328 if crc(uncompressedData) != checksum { 329 it.err = fmt.Errorf("mismatched checksum (got %v, expected %v)", crc(uncompressedData), checksum) 330 return false 331 } 332 333 // NOTE(GiedriusS): we can probably optimize this better but this should be rare enough 334 // and not cause any problems. 335 if len(remainder) > 0 { 336 remainderCopy := make([]byte, 0, len(remainder)) 337 remainderCopy = append(remainderCopy, remainder...) 338 remainder = remainderCopy 339 } 340 341 if len(remainder) > 0 { 342 it.db.B = append(remainder, uncompressedData...) 343 } else { 344 it.db.B = uncompressedData 345 } 346 default: 347 if chunkType <= 0x7f { 348 it.err = fmt.Errorf("unsupported chunk type %v", chunkType) 349 return false 350 } 351 if chunkType > 0xfd { 352 it.err = fmt.Errorf("invalid chunk type %v", chunkType) 353 return false 354 } 355 } 356 it.input = it.input[chunkLen:] 357 358 return true 359 } 360 361 func (it *streamedDiffVarintPostings) Next() bool { 362 // Continue reading next chunks until there is at least binary.MaxVarintLen64. 363 // If we cannot add any more chunks then return false. 364 for { 365 val := it.db.Uvarint64() 366 if it.db.Err() != nil { 367 if !it.readNextChunk(it.db.B) { 368 return false 369 } 370 it.db.E = nil 371 continue 372 } 373 374 it.curSeries = it.curSeries + storage.SeriesRef(val) 375 return true 376 } 377 } 378 379 func (it *streamedDiffVarintPostings) Err() error { 380 return it.err 381 } 382 383 func (it *streamedDiffVarintPostings) Seek(x storage.SeriesRef) bool { 384 if it.curSeries >= x { 385 return true 386 } 387 388 // We cannot do any search due to how values are stored, 389 // so we simply advance until we find the right value. 390 for it.Next() { 391 if it.At() >= x { 392 return true 393 } 394 } 395 396 return false 397 } 398 399 // diffVarintSnappyEncode encodes postings into diff+varint representation, 400 // and applies snappy compression on the result. 401 // Returned byte slice starts with codecHeaderSnappy header. 402 // Length argument is expected number of postings, used for preallocating buffer. 403 // TODO(GiedriusS): remove for v1.0. 404 func diffVarintSnappyEncode(p index.Postings, length int) ([]byte, error) { 405 buf, err := diffVarintEncodeNoHeader(p, length) 406 if err != nil { 407 return nil, err 408 } 409 410 // Make result buffer large enough to hold our header and compressed block. 411 result := make([]byte, len(codecHeaderSnappy)+snappy.MaxEncodedLen(len(buf))) 412 copy(result, codecHeaderSnappy) 413 414 compressed := snappy.Encode(result[len(codecHeaderSnappy):], buf) 415 416 // Slice result buffer based on compressed size. 417 result = result[:len(codecHeaderSnappy)+len(compressed)] 418 return result, nil 419 } 420 421 // diffVarintEncodeNoHeader encodes postings into diff+varint representation. 422 // It doesn't add any header to the output bytes. 423 // Length argument is expected number of postings, used for preallocating buffer. 424 func diffVarintEncodeNoHeader(p index.Postings, length int) ([]byte, error) { 425 buf := encoding.Encbuf{} 426 427 // This encoding uses around ~1 bytes per posting, but let's use 428 // conservative 1.25 bytes per posting to avoid extra allocations. 429 if length > 0 { 430 buf.B = make([]byte, 0, 5*length/4) 431 } 432 433 prev := storage.SeriesRef(0) 434 for p.Next() { 435 v := p.At() 436 if v < prev { 437 return nil, errors.Errorf("postings entries must be in increasing order, current: %d, previous: %d", v, prev) 438 } 439 440 // This is the 'diff' part -- compute difference from previous value. 441 buf.PutUvarint64(uint64(v - prev)) 442 prev = v 443 } 444 if p.Err() != nil { 445 return nil, p.Err() 446 } 447 448 return buf.B, nil 449 } 450 451 // Creating 15 buckets from 1k to 32mb. 452 var snappyDecodePool = pool.MustNewBucketedBytes(1024, 32*1024*1024, 2, 0) 453 454 type closeablePostings interface { 455 index.Postings 456 close() 457 } 458 459 // alias returns true if given slices have the same both backing array. 460 // See: https://groups.google.com/g/golang-nuts/c/C6ufGl73Uzk. 461 func alias(x, y []byte) bool { 462 return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1] 463 } 464 465 // TODO(GiedriusS): remove for v1.0. 466 func diffVarintSnappyDecode(input []byte, disablePooling bool) (closeablePostings, error) { 467 if !isDiffVarintSnappyEncodedPostings(input) { 468 return nil, errors.New("header not found") 469 } 470 471 toFree := make([][]byte, 0, 2) 472 473 var dstBuf []byte 474 if !disablePooling { 475 if len, err := s2.DecodedLen(input[len(codecHeaderSnappy):]); err == nil { 476 if decodeBuf, err := snappyDecodePool.Get(len); err == nil && decodeBuf != nil { 477 dstBuf = *decodeBuf 478 toFree = append(toFree, dstBuf) 479 } 480 } 481 } 482 483 raw, err := s2.Decode(dstBuf, input[len(codecHeaderSnappy):]) 484 if err != nil { 485 return nil, errors.Wrap(err, "snappy decode") 486 } 487 488 if !alias(raw, dstBuf) && !disablePooling { 489 toFree = append(toFree, raw) 490 } 491 492 return newDiffVarintPostings(raw, toFree), nil 493 } 494 495 func newDiffVarintPostings(input []byte, freeSlices [][]byte) *diffVarintPostings { 496 return &diffVarintPostings{freeSlices: freeSlices, buf: &encoding.Decbuf{B: input}} 497 } 498 499 // diffVarintPostings is an implementation of index.Postings based on diff+varint encoded data. 500 type diffVarintPostings struct { 501 buf *encoding.Decbuf 502 cur storage.SeriesRef 503 freeSlices [][]byte 504 } 505 506 func (it *diffVarintPostings) close() { 507 for i := range it.freeSlices { 508 snappyDecodePool.Put(&it.freeSlices[i]) 509 } 510 } 511 512 func (it *diffVarintPostings) At() storage.SeriesRef { 513 return it.cur 514 } 515 516 func (it *diffVarintPostings) Next() bool { 517 if it.buf.Err() != nil || it.buf.Len() == 0 { 518 return false 519 } 520 521 val := it.buf.Uvarint64() 522 if it.buf.Err() != nil { 523 return false 524 } 525 526 it.cur = it.cur + storage.SeriesRef(val) 527 return true 528 } 529 530 func (it *diffVarintPostings) Seek(x storage.SeriesRef) bool { 531 if it.cur >= x { 532 return true 533 } 534 535 // We cannot do any search due to how values are stored, 536 // so we simply advance until we find the right value. 537 for it.Next() { 538 if it.At() >= x { 539 return true 540 } 541 } 542 543 return false 544 } 545 546 func (it *diffVarintPostings) Err() error { 547 return it.buf.Err() 548 } 549 550 func snappyStreamedEncode(postingsLength int, diffVarintPostings []byte) ([]byte, error) { 551 compressedBuf := bytes.NewBuffer(make([]byte, 0, estimateSnappyStreamSize(postingsLength))) 552 if n, err := compressedBuf.WriteString(codecHeaderStreamedSnappy); err != nil { 553 return nil, fmt.Errorf("writing streamed snappy header") 554 } else if n != len(codecHeaderStreamedSnappy) { 555 return nil, fmt.Errorf("short-write streamed snappy header") 556 } 557 558 sw, err := extsnappy.Compressor.Compress(compressedBuf) 559 if err != nil { 560 return nil, fmt.Errorf("creating snappy compressor: %w", err) 561 } 562 _, err = sw.Write(diffVarintPostings) 563 if err != nil { 564 return nil, err 565 } 566 if err := sw.Close(); err != nil { 567 return nil, errors.Wrap(err, "closing snappy stream writer") 568 } 569 570 return compressedBuf.Bytes(), nil 571 } 572 573 var crcTable = crc32.MakeTable(crc32.Castagnoli) 574 575 // crc implements the checksum specified in section 3 of 576 // https://github.com/google/snappy/blob/master/framing_format.txt 577 func crc(b []byte) uint32 { 578 c := crc32.Update(0, crcTable, b) 579 return c>>15 | c<<17 + 0xa282ead8 580 }