// github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/block_property.go

// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"encoding/binary"
	"fmt"
	"math"
	"sync"
	"unsafe"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/rangekey"
)

// Block properties are an optional user-facing feature that can be used to
// filter data blocks (and whole sstables) from an Iterator before they are
// loaded. They do not apply to range delete blocks. These are expected to
// very concisely represent a set of some attribute value contained within the
// key or value, such that the set includes all the attribute values in the
// block. This has some similarities with OLAP pruning approaches that
// maintain min-max attribute values for some column (which concisely
// represent a set), that is then used to prune at query time. In Pebble's
// case, data blocks are small, typically 25-50KB, so these properties should
// reduce their precision in order to be concise -- a good rule of thumb is to
// not consume more than 50-100 bytes across all properties maintained for a
// block, i.e., a 500x reduction compared to loading the data block.
//
// A block property must be assigned a unique name, which is encoded and
// stored in the sstable. This name must be unique among all user-properties
// encoded in an sstable.
//
// A property is represented as a []byte. A nil value or empty byte slice are
// considered semantically identical. The caller is free to choose the
// semantics of an empty byte slice e.g. they could use it to represent the
// empty set or the universal set, whichever they think is more common and
// therefore better to encode more concisely. The serialization of the
// property for the various Finish*() calls in a BlockPropertyCollector
// implementation should be identical, since the corresponding
// BlockPropertyFilter implementation is not told the context in which it is
// deserializing the property.
//
// Block properties are more general than table properties and should be
// preferred over using table properties. A BlockPropertyCollector can achieve
// identical behavior to table properties by returning the nil slice from
// FinishDataBlock and FinishIndexBlock, and interpret them as the universal
// set in BlockPropertyFilter, and return a non-universal set in FinishTable.
//
// Block property filtering is nondeterministic because the separation of keys
// into blocks is nondeterministic. Clients use block-property filters to
// implement efficient application of a filter F that applies to key-value pairs
// (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
// the same key-value pairs that would be surfaced if one applied the filter F
// above normal iteration. With this correctness definition, block property
// filtering may introduce two kinds of errors:
//
//	a) Block property filtering that uses a kv-filter may produce additional
//	   key-value pairs that don't satisfy the filter because of the separation
//	   of keys into blocks. Clients may remove these extra key-value pairs by
//	   re-applying the kv filter while reading results back from Pebble.
//
//	b) Block property filtering may surface deleted key-value pairs if the
//	   kv filter is not a strict function of the key's user key. A block
//	   containing k.DEL may be filtered, while a block containing the deleted
//	   key k.SET may not be filtered, if the kv filter applies to one but not
//	   the other.
//
//	   This error may be avoided trivially by using a kv filter that is a pure
//	   function of the user key.
//	   A filter that examines values or key kinds requires care to ensure
//	   F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
//
// The combination of range deletions and filtering by table-level properties
// adds another opportunity for deleted point keys to be surfaced. The pebble
// Iterator stack takes care to correctly apply filtered tables' range deletions
// to lower tables, preventing this form of nondeterministic error.
//
// In addition to the non-determinism discussed in (b), which limits the use
// of properties over values, we now have support for values that are not
// stored together with the key, and may not even be retrieved during
// compactions. If Pebble is configured with such value separation, block
// properties must only apply to the key, and will be provided a nil value.

// BlockPropertyCollector is used when writing a sstable.
//
//   - All calls to Add are included in the next FinishDataBlock, after which
//     the next data block is expected to start.
//
//   - The index entry generated for the data block, which contains the return
//     value from FinishDataBlock, is not immediately included in the current
//     index block. It is included when AddPrevDataBlockToIndexBlock is called.
//     An alternative would be to return an opaque handle from FinishDataBlock
//     and pass it to a new AddToIndexBlock method, which requires more
//     plumbing, and passing of an interface{} results in an undesirable heap
//     allocation. AddPrevDataBlockToIndexBlock must be called before keys are
//     added to the new data block.
//
// NOTE(review): the BlockIntervalCollector implementation in this file treats
// the buf argument of the Finish* methods as a destination buffer: it appends
// the encoded property to buf and returns the extended slice. Other
// implementations presumably follow the same convention -- confirm against the
// sstable writer.
type BlockPropertyCollector interface {
	// Name returns the name of the block property collector.
	Name() string
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the property value for the finished block.
	FinishDataBlock(buf []byte) ([]byte, error)
	// AddPrevDataBlockToIndexBlock adds the entry corresponding to the
	// previous FinishDataBlock to the current index block.
	AddPrevDataBlockToIndexBlock()
	// FinishIndexBlock is called when an index block, containing all the
	// key-value pairs since the last FinishIndexBlock, will no longer see new
	// entries. It returns the property value for the index block.
	FinishIndexBlock(buf []byte) ([]byte, error)
	// FinishTable is called when the sstable is finished, and returns the
	// property value for the sstable.
	FinishTable(buf []byte) ([]byte, error)
}

// SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector
// interface that allows a block property collector to indicate that it supports
// being *updated* during suffix replacement, i.e. when an existing SST in which
// all keys have the same key suffix is updated to have a new suffix.
//
// A collector which supports being updated in such cases must be able to derive
// its updated value from its old value and the change being made to the suffix,
// without needing to be passed each updated K/V.
//
// For example, a collector that only inspects values can simply copy its
// previously computed property as-is, since key-suffix replacement does not
// change values, while a collector that depends only on key suffixes, like one
// which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
// set its new bounds from the new suffix, as it is common to all keys, without
// needing to recompute it from every key.
//
// An implementation of DataBlockIntervalCollector can also implement this
// interface, in which case the BlockPropertyCollector returned by passing it to
// NewBlockIntervalCollector will also implement this interface automatically.
// Note that NewBlockIntervalCollector only inspects the point-key collector
// when deciding whether to expose this interface.
type SuffixReplaceableBlockCollector interface {
	// UpdateKeySuffixes is called when a block is updated to change the suffix of
	// all keys in the block, and is passed the old value for that prop, if any,
	// for that block as well as the old and new suffix.
	UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
}

// BlockPropertyFilter is used in an Iterator to filter sstables and blocks
// within the sstable. It should not maintain any per-sstable state, and must
// be thread-safe.
//
// It is a type alias for base.BlockPropertyFilter, so the two names are
// interchangeable.
type BlockPropertyFilter = base.BlockPropertyFilter

// BoundLimitedBlockPropertyFilter implements the block-property filter but
// imposes an additional constraint on its usage, requiring that only blocks
// containing exclusively keys between its lower and upper bounds may be
// filtered. The bounds may change during iteration, so the filter doesn't
// expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
// for performing bound comparisons.
//
// To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
// through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
// the ordinary slice of block property filters, this filter's bounds will be
// ignored.
//
// The current [lower,upper) bounds of the filter are unknown, because they may
// be changing. During forward iteration the lower bound is externally
// guaranteed, meaning Intersects only returns false if the sstable iterator is
// already known to be positioned at a key ≥ lower.
The sstable iterator is then 167 // only responsible for ensuring filtered blocks also meet the upper bound, and 168 // should only allow a block to be filtered if all its keys are < upper. The 169 // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check, 170 // where key is an inclusive upper bound on the block's keys. 171 // 172 // During backward iteration the upper bound is externally guaranteed, and 173 // Intersects only returns false if the sstable iterator is already known to be 174 // positioned at a key < upper. The sstable iterator is responsible for ensuring 175 // filtered blocks also meet the lower bound, enforcing that a block is only 176 // filtered if all its keys are ≥ lower. This check is made through passing the 177 // block's inclusive lower bound to KeyIsWithinLowerBound. 178 // 179 // Implementations may become active or inactive through implementing Intersects 180 // to return true whenever the filter is disabled. 181 // 182 // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers 183 // should not implement this interface directly. This interface is an internal 184 // detail in the implementation of block-property range-key masking. 185 type BoundLimitedBlockPropertyFilter interface { 186 BlockPropertyFilter 187 188 // KeyIsWithinLowerBound tests whether the provided internal key falls 189 // within the current lower bound of the filter. A true return value 190 // indicates that the filter may be used to filter blocks that exclusively 191 // contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper 192 // bound. 193 KeyIsWithinLowerBound(key []byte) bool 194 // KeyIsWithinUpperBound tests whether the provided internal key falls 195 // within the current upper bound of the filter. A true return value 196 // indicates that the filter may be used to filter blocks that exclusively 197 // contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower 198 // bound. 
199 KeyIsWithinUpperBound(key []byte) bool 200 } 201 202 // BlockIntervalCollector is a helper implementation of BlockPropertyCollector 203 // for users who want to represent a set of the form [lower,upper) where both 204 // lower and upper are uint64, and lower <= upper. 205 // 206 // The set is encoded as: 207 // - Two varint integers, (lower,upper-lower), when upper-lower > 0 208 // - Nil, when upper-lower=0 209 // 210 // Users must not expect this to preserve differences between empty sets -- 211 // they will all get turned into the semantically equivalent [0,0). 212 // 213 // A BlockIntervalCollector that collects over point and range keys needs to 214 // have both the point and range DataBlockIntervalCollector specified, since 215 // point and range keys are fed to the BlockIntervalCollector in an interleaved 216 // fashion, independently of one another. This also implies that the 217 // DataBlockIntervalCollectors for point and range keys should be references to 218 // independent instances, rather than references to the same collector, as point 219 // and range keys are tracked independently. 220 type BlockIntervalCollector struct { 221 name string 222 points DataBlockIntervalCollector 223 ranges DataBlockIntervalCollector 224 225 blockInterval interval 226 indexInterval interval 227 tableInterval interval 228 } 229 230 var _ BlockPropertyCollector = &BlockIntervalCollector{} 231 232 // DataBlockIntervalCollector is the interface used by BlockIntervalCollector 233 // that contains the actual logic pertaining to the property. It only 234 // maintains state for the current data block, and resets that state in 235 // FinishDataBlock. This interface can be used to reduce parsing costs. 236 type DataBlockIntervalCollector interface { 237 // Add is called with each new entry added to a data block in the sstable. 238 // The callee can assume that these are in sorted order. 
239 Add(key InternalKey, value []byte) error 240 // FinishDataBlock is called when all the entries have been added to a 241 // data block. Subsequent Add calls will be for the next data block. It 242 // returns the [lower, upper) for the finished block. 243 FinishDataBlock() (lower uint64, upper uint64, err error) 244 } 245 246 // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given 247 // name. The BlockIntervalCollector makes use of the given point and range key 248 // DataBlockIntervalCollectors when encountering point and range keys, 249 // respectively. 250 // 251 // The caller may pass a nil DataBlockIntervalCollector for one of the point or 252 // range key collectors, in which case keys of those types will be ignored. This 253 // allows for flexible construction of BlockIntervalCollectors that operate on 254 // just point keys, just range keys, or both point and range keys. 255 // 256 // If both point and range keys are to be tracked, two independent collectors 257 // should be provided, rather than the same collector passed in twice (see the 258 // comment on BlockIntervalCollector for more detail) 259 func NewBlockIntervalCollector( 260 name string, pointCollector, rangeCollector DataBlockIntervalCollector, 261 ) BlockPropertyCollector { 262 if pointCollector == nil && rangeCollector == nil { 263 panic("sstable: at least one interval collector must be provided") 264 } 265 bic := BlockIntervalCollector{ 266 name: name, 267 points: pointCollector, 268 ranges: rangeCollector, 269 } 270 if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok { 271 return &suffixReplacementBlockCollectorWrapper{bic} 272 } 273 return &bic 274 } 275 276 // Name implements the BlockPropertyCollector interface. 277 func (b *BlockIntervalCollector) Name() string { 278 return b.name 279 } 280 281 // Add implements the BlockPropertyCollector interface. 
282 func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error { 283 if rangekey.IsRangeKey(key.Kind()) { 284 if b.ranges != nil { 285 return b.ranges.Add(key, value) 286 } 287 } else if b.points != nil { 288 return b.points.Add(key, value) 289 } 290 return nil 291 } 292 293 // FinishDataBlock implements the BlockPropertyCollector interface. 294 func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) { 295 if b.points == nil { 296 return buf, nil 297 } 298 var err error 299 b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock() 300 if err != nil { 301 return buf, err 302 } 303 buf = b.blockInterval.encode(buf) 304 b.tableInterval.union(b.blockInterval) 305 return buf, nil 306 } 307 308 // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector 309 // interface. 310 func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() { 311 b.indexInterval.union(b.blockInterval) 312 b.blockInterval = interval{} 313 } 314 315 // FinishIndexBlock implements the BlockPropertyCollector interface. 316 func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) { 317 buf = b.indexInterval.encode(buf) 318 b.indexInterval = interval{} 319 return buf, nil 320 } 321 322 // FinishTable implements the BlockPropertyCollector interface. 323 func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) { 324 // If the collector is tracking range keys, the range key interval is union-ed 325 // with the point key interval for the table. 
326 if b.ranges != nil { 327 var rangeInterval interval 328 var err error 329 rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock() 330 if err != nil { 331 return buf, err 332 } 333 b.tableInterval.union(rangeInterval) 334 } 335 return b.tableInterval.encode(buf), nil 336 } 337 338 type interval struct { 339 lower uint64 340 upper uint64 341 } 342 343 func (i interval) encode(buf []byte) []byte { 344 if i.lower < i.upper { 345 var encoded [binary.MaxVarintLen64 * 2]byte 346 n := binary.PutUvarint(encoded[:], i.lower) 347 n += binary.PutUvarint(encoded[n:], i.upper-i.lower) 348 buf = append(buf, encoded[:n]...) 349 } 350 return buf 351 } 352 353 func (i *interval) decode(buf []byte) error { 354 if len(buf) == 0 { 355 *i = interval{} 356 return nil 357 } 358 var n int 359 i.lower, n = binary.Uvarint(buf) 360 if n <= 0 || n >= len(buf) { 361 return base.CorruptionErrorf("cannot decode interval from buf %x", buf) 362 } 363 pos := n 364 i.upper, n = binary.Uvarint(buf[pos:]) 365 pos += n 366 if pos != len(buf) || n <= 0 { 367 return base.CorruptionErrorf("cannot decode interval from buf %x", buf) 368 } 369 // Delta decode. 370 i.upper += i.lower 371 if i.upper < i.lower { 372 return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower) 373 } 374 return nil 375 } 376 377 func (i *interval) union(x interval) { 378 if x.lower >= x.upper { 379 // x is the empty set. 380 return 381 } 382 if i.lower >= i.upper { 383 // i is the empty set. 384 *i = x 385 return 386 } 387 // Both sets are non-empty. 388 if x.lower < i.lower { 389 i.lower = x.lower 390 } 391 if x.upper > i.upper { 392 i.upper = x.upper 393 } 394 } 395 396 func (i interval) intersects(x interval) bool { 397 if i.lower >= i.upper || x.lower >= x.upper { 398 // At least one of the sets is empty. 399 return false 400 } 401 // Neither set is empty. 
402 return i.upper > x.lower && i.lower < x.upper 403 } 404 405 type suffixReplacementBlockCollectorWrapper struct { 406 BlockIntervalCollector 407 } 408 409 // UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface. 410 func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes( 411 oldProp []byte, from, to []byte, 412 ) error { 413 return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to) 414 } 415 416 // BlockIntervalFilter is an implementation of BlockPropertyFilter when the 417 // corresponding collector is a BlockIntervalCollector. That is, the set is of 418 // the form [lower, upper). 419 type BlockIntervalFilter struct { 420 name string 421 filterInterval interval 422 } 423 424 var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil) 425 426 // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks 427 // based on an interval property collected by BlockIntervalCollector and the 428 // given [lower, upper) bounds. The given name specifies the 429 // BlockIntervalCollector's properties to read. 430 func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter { 431 b := new(BlockIntervalFilter) 432 b.Init(name, lower, upper) 433 return b 434 } 435 436 // Init initializes (or re-initializes, clearing previous state) an existing 437 // BLockPropertyFilter to filter blocks based on an interval property collected 438 // by BlockIntervalCollector and the given [lower, upper) bounds. The given name 439 // specifies the BlockIntervalCollector's properties to read. 440 func (b *BlockIntervalFilter) Init(name string, lower, upper uint64) { 441 *b = BlockIntervalFilter{ 442 name: name, 443 filterInterval: interval{lower: lower, upper: upper}, 444 } 445 } 446 447 // Name implements the BlockPropertyFilter interface. 
448 func (b *BlockIntervalFilter) Name() string { 449 return b.name 450 } 451 452 // Intersects implements the BlockPropertyFilter interface. 453 func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) { 454 var i interval 455 if err := i.decode(prop); err != nil { 456 return false, err 457 } 458 return i.intersects(b.filterInterval), nil 459 } 460 461 // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not 462 // generally safe to alter the filter while it's in use, except as part of the 463 // implementation of BlockPropertyFilterMask.SetSuffix used for range-key 464 // masking. 465 func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) { 466 b.filterInterval = interval{lower: lower, upper: upper} 467 } 468 469 // When encoding block properties for each block, we cannot afford to encode 470 // the name. Instead, the name is mapped to a shortID, in the scope of that 471 // sstable, and the shortID is encoded. Since we use a uint8, there is a limit 472 // of 256 block property collectors per sstable. 
473 type shortID uint8 474 475 type blockPropertiesEncoder struct { 476 propsBuf []byte 477 scratch []byte 478 } 479 480 func (e *blockPropertiesEncoder) getScratchForProp() []byte { 481 return e.scratch[:0] 482 } 483 484 func (e *blockPropertiesEncoder) resetProps() { 485 e.propsBuf = e.propsBuf[:0] 486 } 487 488 func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) { 489 const lenID = 1 490 lenProp := uvarintLen(uint32(len(scratch))) 491 n := lenID + lenProp + len(scratch) 492 if cap(e.propsBuf)-len(e.propsBuf) < n { 493 size := len(e.propsBuf) + 2*n 494 if size < 2*cap(e.propsBuf) { 495 size = 2 * cap(e.propsBuf) 496 } 497 buf := make([]byte, len(e.propsBuf), size) 498 copy(buf, e.propsBuf) 499 e.propsBuf = buf 500 } 501 pos := len(e.propsBuf) 502 b := e.propsBuf[pos : pos+lenID] 503 b[0] = byte(id) 504 pos += lenID 505 b = e.propsBuf[pos : pos+lenProp] 506 n = binary.PutUvarint(b, uint64(len(scratch))) 507 pos += n 508 b = e.propsBuf[pos : pos+len(scratch)] 509 pos += len(scratch) 510 copy(b, scratch) 511 e.propsBuf = e.propsBuf[0:pos] 512 e.scratch = scratch 513 } 514 515 func (e *blockPropertiesEncoder) unsafeProps() []byte { 516 return e.propsBuf 517 } 518 519 func (e *blockPropertiesEncoder) props() []byte { 520 buf := make([]byte, len(e.propsBuf)) 521 copy(buf, e.propsBuf) 522 return buf 523 } 524 525 type blockPropertiesDecoder struct { 526 props []byte 527 } 528 529 func (d *blockPropertiesDecoder) done() bool { 530 return len(d.props) == 0 531 } 532 533 // REQUIRES: !done() 534 func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) { 535 const lenID = 1 536 id = shortID(d.props[0]) 537 propLen, m := binary.Uvarint(d.props[lenID:]) 538 n := lenID + m 539 if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) { 540 return 0, nil, base.CorruptionErrorf("corrupt block property length") 541 } 542 prop = d.props[n : n+int(propLen)] 543 d.props = d.props[n+int(propLen):] 544 return id, prop, nil 545 } 546 547 // 
BlockPropertiesFilterer provides filtering support when reading an sstable 548 // in the context of an iterator that has a slice of BlockPropertyFilters. 549 // After the call to NewBlockPropertiesFilterer, the caller must call 550 // IntersectsUserPropsAndFinishInit to check if the sstable intersects with 551 // the filters. If it does intersect, this function also finishes initializing 552 // the BlockPropertiesFilterer using the shortIDs for the relevant filters. 553 // Subsequent checks for relevance of a block should use the intersects 554 // method. 555 type BlockPropertiesFilterer struct { 556 filters []BlockPropertyFilter 557 // Maps shortID => index in filters. This can be sparse, and shortIDs for 558 // which there is no filter are represented with an index of -1. The 559 // length of this can be shorter than the shortIDs allocated in the 560 // sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator 561 // has two filters, corresponding to shortIDs 2, 0, this would be: 562 // len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0. 563 shortIDToFiltersIndex []int 564 565 // boundLimitedFilter, if non-nil, holds a single block-property filter with 566 // additional constraints on its filtering. A boundLimitedFilter may only 567 // filter blocks that are wholly contained within its bounds. During forward 568 // iteration the lower bound (and during backward iteration the upper bound) 569 // must be externally guaranteed, with Intersects only returning false if 570 // that bound is met. The opposite bound is verified during iteration by the 571 // sstable iterator. 572 // 573 // boundLimitedFilter is permitted to be defined on a property (`Name()`) 574 // for which another filter exists in filters. In this case both filters 575 // will be consulted, and either filter may exclude block(s). Only a single 576 // bound-limited block-property filter may be set. 
577 // 578 // The boundLimitedShortID field contains the shortID of the filter's 579 // property within the sstable. It's set to -1 if the property was not 580 // collected when the table was built. 581 boundLimitedFilter BoundLimitedBlockPropertyFilter 582 boundLimitedShortID int 583 } 584 585 var blockPropertiesFiltererPool = sync.Pool{ 586 New: func() interface{} { 587 return &BlockPropertiesFilterer{} 588 }, 589 } 590 591 // newBlockPropertiesFilterer returns a partially initialized filterer. To complete 592 // initialization, call IntersectsUserPropsAndFinishInit. 593 func newBlockPropertiesFilterer( 594 filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter, 595 ) *BlockPropertiesFilterer { 596 filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer) 597 *filterer = BlockPropertiesFilterer{ 598 filters: filters, 599 shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0], 600 boundLimitedFilter: limited, 601 boundLimitedShortID: -1, 602 } 603 return filterer 604 } 605 606 func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) { 607 *filterer = BlockPropertiesFilterer{ 608 shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0], 609 } 610 blockPropertiesFiltererPool.Put(filterer) 611 } 612 613 // IntersectsTable evaluates the provided block-property filter against the 614 // provided set of table-level properties. If there is no intersection between 615 // the filters and the table or an error is encountered, IntersectsTable returns 616 // a nil filterer (and possibly an error). If there is an intersection, 617 // IntersectsTable returns a non-nil filterer that may be used by an iterator 618 // reading the table. 
619 func IntersectsTable( 620 filters []BlockPropertyFilter, 621 limited BoundLimitedBlockPropertyFilter, 622 userProperties map[string]string, 623 ) (*BlockPropertiesFilterer, error) { 624 f := newBlockPropertiesFilterer(filters, limited) 625 ok, err := f.intersectsUserPropsAndFinishInit(userProperties) 626 if !ok || err != nil { 627 releaseBlockPropertiesFilterer(f) 628 return nil, err 629 } 630 return f, nil 631 } 632 633 // intersectsUserPropsAndFinishInit is called with the user properties map for 634 // the sstable and returns whether the sstable intersects the filters. It 635 // additionally initializes the shortIDToFiltersIndex for the filters that are 636 // relevant to this sstable. 637 func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit( 638 userProperties map[string]string, 639 ) (bool, error) { 640 for i := range f.filters { 641 props, ok := userProperties[f.filters[i].Name()] 642 if !ok { 643 // Collector was not used when writing this file, so it is 644 // considered intersecting. 645 continue 646 } 647 if len(props) < 1 { 648 return false, base.CorruptionErrorf( 649 "block properties for %s is corrupted", f.filters[i].Name()) 650 } 651 shortID := shortID(props[0]) 652 { 653 // Use an unsafe conversion to avoid allocating. Intersects() is not 654 // supposed to modify the given slice. 655 // Note that unsafe.StringData only works if the string is not empty 656 // (which we already checked). 657 byteProps := unsafe.Slice(unsafe.StringData(props), len(props)) 658 intersects, err := f.filters[i].Intersects(byteProps[1:]) 659 if err != nil || !intersects { 660 return false, err 661 } 662 } 663 // Intersects the sstable, so need to use this filter when 664 // deciding whether to read blocks. 
665 n := len(f.shortIDToFiltersIndex) 666 if n <= int(shortID) { 667 if cap(f.shortIDToFiltersIndex) <= int(shortID) { 668 index := make([]int, shortID+1, 2*(shortID+1)) 669 copy(index, f.shortIDToFiltersIndex) 670 f.shortIDToFiltersIndex = index 671 } else { 672 f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1] 673 } 674 for j := n; j < int(shortID); j++ { 675 f.shortIDToFiltersIndex[j] = -1 676 } 677 } 678 f.shortIDToFiltersIndex[shortID] = i 679 } 680 if f.boundLimitedFilter == nil { 681 return true, nil 682 } 683 684 // There's a bound-limited filter. Find its shortID. It's possible that 685 // there's an existing filter in f.filters on the same property. That's 686 // okay. Both filters will be consulted whenever a relevant prop is decoded. 687 props, ok := userProperties[f.boundLimitedFilter.Name()] 688 if !ok { 689 // The collector was not used when writing this file, so it's 690 // intersecting. We leave f.boundLimitedShortID=-1, so the filter will 691 // be unused within this file. 692 return true, nil 693 } 694 if len(props) < 1 { 695 return false, base.CorruptionErrorf( 696 "block properties for %s is corrupted", f.boundLimitedFilter.Name()) 697 } 698 f.boundLimitedShortID = int(props[0]) 699 700 // We don't check for table-level intersection for the bound-limited filter. 701 // The bound-limited filter is treated as vacuously intersecting. 702 // 703 // NB: If a block-property filter needs to be toggled inactive/active, it 704 // should be implemented within the Intersects implementation. 705 // 706 // TODO(jackson): We could filter at the table-level by threading the table 707 // smallest and largest bounds here. 708 709 // The bound-limited filter isn't included in shortIDToFiltersIndex. 710 // 711 // When determining intersection, we decode props only up to the shortID 712 // len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of 713 // the existing filters' shortIDs, we need to grow shortIDToFiltersIndex. 
714 // Growing the index with -1s ensures we're able to consult the index 715 // without length checks. 716 if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID { 717 if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID { 718 index := make([]int, f.boundLimitedShortID+1) 719 copy(index, f.shortIDToFiltersIndex) 720 f.shortIDToFiltersIndex = index 721 } else { 722 f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1] 723 } 724 for j := n; j <= f.boundLimitedShortID; j++ { 725 f.shortIDToFiltersIndex[j] = -1 726 } 727 } 728 return true, nil 729 } 730 731 type intersectsResult int8 732 733 const ( 734 blockIntersects intersectsResult = iota 735 blockExcluded 736 // blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when 737 // no filters unconditionally exclude the block, but the bound-limited block 738 // property filter will exclude it if the block's bounds fall within the 739 // filter's current bounds. See the reader's 740 // {single,two}LevelIterator.resolveMaybeExcluded methods. 741 blockMaybeExcluded 742 ) 743 744 func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) { 745 i := 0 746 decoder := blockPropertiesDecoder{props: props} 747 ret = blockIntersects 748 for i < len(f.shortIDToFiltersIndex) { 749 var id int 750 var prop []byte 751 if !decoder.done() { 752 var shortID shortID 753 var err error 754 shortID, prop, err = decoder.next() 755 if err != nil { 756 return ret, err 757 } 758 id = int(shortID) 759 } else { 760 id = math.MaxUint8 + 1 761 } 762 for i < len(f.shortIDToFiltersIndex) && id > i { 763 // The property for this id is not encoded for this block, but there 764 // may still be a filter for this id. 
765 if intersects, err := f.intersectsFilter(i, nil); err != nil { 766 return ret, err 767 } else if intersects == blockExcluded { 768 return blockExcluded, nil 769 } else if intersects == blockMaybeExcluded { 770 ret = blockMaybeExcluded 771 } 772 i++ 773 } 774 if i >= len(f.shortIDToFiltersIndex) { 775 return ret, nil 776 } 777 // INVARIANT: id <= i. And since i is always incremented by 1, id==i. 778 if id != i { 779 panic(fmt.Sprintf("%d != %d", id, i)) 780 } 781 if intersects, err := f.intersectsFilter(i, prop); err != nil { 782 return ret, err 783 } else if intersects == blockExcluded { 784 return blockExcluded, nil 785 } else if intersects == blockMaybeExcluded { 786 ret = blockMaybeExcluded 787 } 788 i++ 789 } 790 // ret == blockIntersects || ret == blockMaybeExcluded 791 return ret, nil 792 } 793 794 func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) { 795 if f.shortIDToFiltersIndex[i] >= 0 { 796 intersects, err := f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop) 797 if err != nil { 798 return blockIntersects, err 799 } 800 if !intersects { 801 return blockExcluded, nil 802 } 803 } 804 if i == f.boundLimitedShortID { 805 // The bound-limited filter uses this id. 806 // 807 // The bound-limited filter only applies within a keyspan interval. We 808 // expect the Intersects call to be cheaper than bounds checks. If 809 // Intersects determines that there is no intersection, we return 810 // `blockMaybeExcluded` if no other bpf unconditionally excludes the 811 // block. 812 intersects, err := f.boundLimitedFilter.Intersects(prop) 813 if err != nil { 814 return blockIntersects, err 815 } else if !intersects { 816 return blockMaybeExcluded, nil 817 } 818 } 819 return blockIntersects, nil 820 }