github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/block_property.go

github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/block_property.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"encoding/binary"
     9  	"fmt"
    10  	"math"
    11  	"sync"
    12  
    13  	"github.com/zuoyebang/bitalostable/internal/base"
    14  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    15  )
    16  
    17  // Block properties are an optional user-facing feature that can be used to
    18  // filter data blocks (and whole sstables) from an Iterator before they are
    19  // loaded. They do not apply to range delete blocks. These are expected to
    20  // very concisely represent a set of some attribute value contained within the
    21  // key or value, such that the set includes all the attribute values in the
    22  // block. This has some similarities with OLAP pruning approaches that
    23  // maintain min-max attribute values for some column (which concisely
    24  // represent a set), that is then used to prune at query time. In Pebble's
    25  // case, data blocks are small, typically 25-50KB, so these properties should
    26  // reduce their precision in order to be concise -- a good rule of thumb is to
    27  // not consume more than 50-100 bytes across all properties maintained for a
    28  // block, i.e., a 500x reduction compared to loading the data block.
    29  //
    30  // A block property must be assigned a unique name, which is encoded and
    31  // stored in the sstable. This name must be unique among all user-properties
    32  // encoded in an sstable.
    33  //
    34  // A property is represented as a []byte. A nil value or empty byte slice are
    35  // considered semantically identical. The caller is free to choose the
    36  // semantics of an empty byte slice e.g. they could use it to represent the
    37  // empty set or the universal set, whichever they think is more common and
    38  // therefore better to encode more concisely. The serialization of the
    39  // property for the various Finish*() calls in a BlockPropertyCollector
    40  // implementation should be identical, since the corresponding
    41  // BlockPropertyFilter implementation is not told the context in which it is
    42  // deserializing the property.
    43  //
    44  // Block properties are more general than table properties and should be
    45  // preferred over using table properties. A BlockPropertyCollector can achieve
    46  // identical behavior to table properties by returning the nil slice from
    47  // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
    48  // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
    49  //
    50  // Block property filtering is nondeterministic because the separation of keys
    51  // into blocks is nondeterministic. Clients use block-property filters to
    52  // implement efficient application of a filter F that applies to key-value pairs
    53  // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
    54  // the same key-value pairs that would be surfaced if one applied the filter F
    55  // above normal iteration. With this correctness definition, block property
    56  // filtering may introduce two kinds of errors:
    57  //
    58  //   a) Block property filtering that uses a kv-filter may produce additional
    59  //      key-value pairs that don't satisfy the filter because of the separation
    60  //      of keys into blocks. Clients may remove these extra key-value pairs by
    61  //      re-applying the kv filter while reading results back from Pebble.
    62  //
    63  //   b) Block property filtering may surface deleted key-value pairs if the
    64  //      kv filter is not a strict function of the key's user key. A block
    65  //      containing k.DEL may be filtered, while a block containing the deleted
    66  //      key k.SET may not be filtered, if the kv filter applies to one but not
    67  //      the other.
    68  //
    69  //      This error may be avoided trivially by using a kv filter that is a pure
    70  //      function of the user key. A filter that examines values or key kinds
    71  //      requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
    72  //
    73  // The combination of range deletions and filtering by table-level properties
    74  // add another opportunity for deleted point keys to be surfaced. The bitalostable
    75  // Iterator stack takes care to correctly apply filtered tables' range deletions
    76  // to lower tables, preventing this form of nondeterministic error.
    77  
// BlockPropertyCollector is used when writing a sstable.
//
//   - All calls to Add are included in the next FinishDataBlock, after which
//     the next data block is expected to start.
//
//   - The index entry generated for the data block, which contains the return
//     value from FinishDataBlock, is not immediately included in the current
//     index block. It is included when AddPrevDataBlockToIndexBlock is called.
//     An alternative would be to return an opaque handle from FinishDataBlock
//     and pass it to a new AddToIndexBlock method, which requires more
//     plumbing, and passing of an interface{} results in an undesirable heap
//     allocation. AddPrevDataBlockToIndexBlock must be called before keys are
//     added to the new data block.
//
// Each Finish* method receives a buf and returns a []byte. The
// BlockIntervalCollector implementation in this file appends the encoded
// property to buf and returns the extended slice; other implementations
// presumably follow the same convention (confirm against the writer that
// invokes them).
type BlockPropertyCollector interface {
	// Name returns the name of the block property collector.
	Name() string
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the property value for the finished block.
	FinishDataBlock(buf []byte) ([]byte, error)
	// AddPrevDataBlockToIndexBlock adds the entry corresponding to the
	// previous FinishDataBlock to the current index block.
	AddPrevDataBlockToIndexBlock()
	// FinishIndexBlock is called when an index block, containing all the
	// key-value pairs since the last FinishIndexBlock, will no longer see new
	// entries. It returns the property value for the index block.
	FinishIndexBlock(buf []byte) ([]byte, error)
	// FinishTable is called when the sstable is finished, and returns the
	// property value for the sstable.
	FinishTable(buf []byte) ([]byte, error)
}
   112  
// SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector
// interface that allows a block property collector to indicate that it
// supports being *updated* during suffix replacement, i.e. when an existing SST
// in which all keys have the same key suffix is updated to have a new suffix.
//
// A collector which supports being updated in such cases must be able to derive
// its updated value from its old value and the change being made to the suffix,
// without needing to be passed each updated K/V.
//
// For example, a collector that only inspects values can simply copy its
// previously computed property as-is, since key-suffix replacement does not
// change values, while a collector that depends only on key suffixes, like one
// which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
// set its new bounds from the new suffix, as it is common to all keys, without
// needing to recompute it from every key.
//
// An implementation of DataBlockIntervalCollector can also implement this
// interface, in which case the BlockPropertyCollector returned by passing it to
// NewBlockIntervalCollector will also implement this interface automatically.
// Note that NewBlockIntervalCollector only inspects the point-key collector
// when deciding whether to expose this interface.
type SuffixReplaceableBlockCollector interface {
	// UpdateKeySuffixes is called when a block is updated to change the suffix of
	// all keys in the block, and is passed the old value for that prop, if any,
	// for that block as well as the old and new suffix.
	UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
}
   138  
// BlockPropertyFilter is used in an Iterator to filter sstables and blocks
// within the sstable. It should not maintain any per-sstable state, and must
// be thread-safe.
//
// It is a type alias for base.BlockPropertyFilter, so the two names are
// interchangeable. Within this file the methods Name and Intersects are the
// ones invoked on values of this type.
type BlockPropertyFilter = base.BlockPropertyFilter
   143  
// BoundLimitedBlockPropertyFilter implements the block-property filter but
// imposes an additional constraint on its usage, requiring that only blocks
// containing exclusively keys between its lower and upper bounds may be
// filtered. The bounds may change during iteration, so the filter doesn't
// expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
// for performing bound comparisons.
//
// To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
// through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
// the ordinary slice of block property filters, this filter's bounds will be
// ignored.
//
// The current [lower,upper) bounds of the filter are unknown, because they may
// be changing. During forward iteration the lower bound is externally
// guaranteed, meaning Intersects only returns false if the sstable iterator is
// already known to be positioned at a key ≥ lower. The sstable iterator is then
// only responsible for ensuring filtered blocks also meet the upper bound, and
// should only allow a block to be filtered if all its keys are < upper. The
// sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
// where key is an inclusive upper bound on the block's keys.
//
// During backward iteration the upper bound is externally guaranteed, and
// Intersects only returns false if the sstable iterator is already known to be
// positioned at a key < upper. The sstable iterator is responsible for ensuring
// filtered blocks also meet the lower bound, enforcing that a block is only
// filtered if all its keys are ≥ lower. This check is made through passing the
// block's inclusive lower bound to KeyIsWithinLowerBound.
//
// Implementations may become active or inactive through implementing Intersects
// to return true whenever the filter is disabled.
//
// Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
// should not implement this interface directly. This interface is an internal
// detail in the implementation of block-property range-key masking.
type BoundLimitedBlockPropertyFilter interface {
	BlockPropertyFilter

	// KeyIsWithinLowerBound tests whether the provided internal key falls
	// within the current lower bound of the filter. A true return value
	// indicates that the filter may be used to filter blocks that exclusively
	// contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
	// bound.
	KeyIsWithinLowerBound(key *InternalKey) bool
	// KeyIsWithinUpperBound tests whether the provided internal key falls
	// within the current upper bound of the filter. A true return value
	// indicates that the filter may be used to filter blocks that exclusively
	// contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
	// bound.
	KeyIsWithinUpperBound(key *InternalKey) bool
}
   194  
// BlockIntervalCollector is a helper implementation of BlockPropertyCollector
// for users who want to represent a set of the form [lower,upper) where both
// lower and upper are uint64, and lower <= upper.
//
// The set is encoded as:
// - Two varint integers, (lower,upper-lower), when upper-lower > 0
// - Nil, when upper-lower=0
//
// Users must not expect this to preserve differences between empty sets --
// they will all get turned into the semantically equivalent [0,0).
//
// A BlockIntervalCollector that collects over point and range keys needs to
// have both the point and range DataBlockIntervalCollector specified, since
// point and range keys are fed to the BlockIntervalCollector in an interleaved
// fashion, independently of one another. This also implies that the
// DataBlockIntervalCollectors for point and range keys should be references to
// independent instances, rather than references to the same collector, as point
// and range keys are tracked independently.
type BlockIntervalCollector struct {
	// name is the collector name reported by Name.
	name string
	// points receives every non-range key passed to Add and is finished once
	// per data block. It may be nil, in which case point keys are ignored.
	points DataBlockIntervalCollector
	// ranges receives every range key passed to Add and is finished only in
	// FinishTable. It may be nil, in which case range keys are ignored.
	ranges DataBlockIntervalCollector

	// blockInterval is the interval of the most recently finished data block.
	// It is cleared by AddPrevDataBlockToIndexBlock.
	blockInterval interval
	// indexInterval is the union of the data block intervals added to the
	// current index block. It is cleared by FinishIndexBlock.
	indexInterval interval
	// tableInterval is the union of every finished data block interval, plus
	// the range key interval once FinishTable runs.
	tableInterval interval
}

// Compile-time check that *BlockIntervalCollector satisfies
// BlockPropertyCollector.
var _ BlockPropertyCollector = &BlockIntervalCollector{}
   224  
// DataBlockIntervalCollector is the interface used by BlockIntervalCollector
// that contains the actual logic pertaining to the property. It only
// maintains state for the current data block, and resets that state in
// FinishDataBlock. This interface can be used to reduce parsing costs.
//
// When used as the range-key collector of a BlockIntervalCollector,
// FinishDataBlock is invoked once, from FinishTable, rather than once per
// data block.
type DataBlockIntervalCollector interface {
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the [lower, upper) for the finished block. An interval with
	// lower >= upper is treated as the empty set by BlockIntervalCollector.
	FinishDataBlock() (lower uint64, upper uint64, err error)
}
   238  
   239  // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
   240  // name. The BlockIntervalCollector makes use of the given point and range key
   241  // DataBlockIntervalCollectors when encountering point and range keys,
   242  // respectively.
   243  //
   244  // The caller may pass a nil DataBlockIntervalCollector for one of the point or
   245  // range key collectors, in which case keys of those types will be ignored. This
   246  // allows for flexible construction of BlockIntervalCollectors that operate on
   247  // just point keys, just range keys, or both point and range keys.
   248  //
   249  // If both point and range keys are to be tracked, two independent collectors
   250  // should be provided, rather than the same collector passed in twice (see the
   251  // comment on BlockIntervalCollector for more detail)
   252  func NewBlockIntervalCollector(
   253  	name string, pointCollector, rangeCollector DataBlockIntervalCollector,
   254  ) BlockPropertyCollector {
   255  	if pointCollector == nil && rangeCollector == nil {
   256  		panic("sstable: at least one interval collector must be provided")
   257  	}
   258  	bic := BlockIntervalCollector{
   259  		name:   name,
   260  		points: pointCollector,
   261  		ranges: rangeCollector,
   262  	}
   263  	if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok {
   264  		return &suffixReplacementBlockCollectorWrapper{bic}
   265  	}
   266  	return &bic
   267  }
   268  
// Name implements the BlockPropertyCollector interface. It returns the name
// supplied when the collector was constructed.
func (b *BlockIntervalCollector) Name() string {
	return b.name
}
   273  
   274  // Add implements the BlockPropertyCollector interface.
   275  func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error {
   276  	if rangekey.IsRangeKey(key.Kind()) {
   277  		if b.ranges != nil {
   278  			return b.ranges.Add(key, value)
   279  		}
   280  	} else if b.points != nil {
   281  		return b.points.Add(key, value)
   282  	}
   283  	return nil
   284  }
   285  
   286  // FinishDataBlock implements the BlockPropertyCollector interface.
   287  func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
   288  	if b.points == nil {
   289  		return buf, nil
   290  	}
   291  	var err error
   292  	b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock()
   293  	if err != nil {
   294  		return buf, err
   295  	}
   296  	buf = b.blockInterval.encode(buf)
   297  	b.tableInterval.union(b.blockInterval)
   298  	return buf, nil
   299  }
   300  
// AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
// interface. It merges the interval of the most recently finished data block
// into the current index block's interval.
func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
	b.indexInterval.union(b.blockInterval)
	// Clear the block interval so it cannot be merged into an index block a
	// second time.
	b.blockInterval = interval{}
}
   307  
   308  // FinishIndexBlock implements the BlockPropertyCollector interface.
   309  func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
   310  	buf = b.indexInterval.encode(buf)
   311  	b.indexInterval = interval{}
   312  	return buf, nil
   313  }
   314  
   315  // FinishTable implements the BlockPropertyCollector interface.
   316  func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
   317  	// If the collector is tracking range keys, the range key interval is union-ed
   318  	// with the point key interval for the table.
   319  	if b.ranges != nil {
   320  		var rangeInterval interval
   321  		var err error
   322  		rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock()
   323  		if err != nil {
   324  			return buf, err
   325  		}
   326  		b.tableInterval.union(rangeInterval)
   327  	}
   328  	return b.tableInterval.encode(buf), nil
   329  }
   330  
   331  type interval struct {
   332  	lower uint64
   333  	upper uint64
   334  }
   335  
   336  func (i interval) encode(buf []byte) []byte {
   337  	if i.lower < i.upper {
   338  		var encoded [binary.MaxVarintLen64 * 2]byte
   339  		n := binary.PutUvarint(encoded[:], i.lower)
   340  		n += binary.PutUvarint(encoded[n:], i.upper-i.lower)
   341  		buf = append(buf, encoded[:n]...)
   342  	}
   343  	return buf
   344  }
   345  
   346  func (i *interval) decode(buf []byte) error {
   347  	if len(buf) == 0 {
   348  		*i = interval{}
   349  		return nil
   350  	}
   351  	var n int
   352  	i.lower, n = binary.Uvarint(buf)
   353  	if n <= 0 || n >= len(buf) {
   354  		return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
   355  	}
   356  	pos := n
   357  	i.upper, n = binary.Uvarint(buf[pos:])
   358  	pos += n
   359  	if pos != len(buf) || n <= 0 {
   360  		return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
   361  	}
   362  	// Delta decode.
   363  	i.upper += i.lower
   364  	if i.upper < i.lower {
   365  		return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower)
   366  	}
   367  	return nil
   368  }
   369  
   370  func (i *interval) union(x interval) {
   371  	if x.lower >= x.upper {
   372  		// x is the empty set.
   373  		return
   374  	}
   375  	if i.lower >= i.upper {
   376  		// i is the empty set.
   377  		*i = x
   378  		return
   379  	}
   380  	// Both sets are non-empty.
   381  	if x.lower < i.lower {
   382  		i.lower = x.lower
   383  	}
   384  	if x.upper > i.upper {
   385  		i.upper = x.upper
   386  	}
   387  }
   388  
   389  func (i interval) intersects(x interval) bool {
   390  	if i.lower >= i.upper || x.lower >= x.upper {
   391  		// At least one of the sets is empty.
   392  		return false
   393  	}
   394  	// Neither set is empty.
   395  	return i.upper > x.lower && i.lower < x.upper
   396  }
   397  
   398  type suffixReplacementBlockCollectorWrapper struct {
   399  	BlockIntervalCollector
   400  }
   401  
   402  // UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface.
   403  func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes(
   404  	oldProp []byte, from, to []byte,
   405  ) error {
   406  	return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to)
   407  }
   408  
// BlockIntervalFilter is an implementation of BlockPropertyFilter when the
// corresponding collector is a BlockIntervalCollector. That is, the set is of
// the form [lower, upper).
type BlockIntervalFilter struct {
	// name is the property name reported by Name; it selects which collector's
	// properties this filter reads.
	name string
	// filterInterval is the [lower, upper) range that a decoded property must
	// intersect for Intersects to return true.
	filterInterval interval
}

// Compile-time check that *BlockIntervalFilter satisfies BlockPropertyFilter.
var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
   418  
   419  // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
   420  // based on an interval property collected by BlockIntervalCollector and the
   421  // given [lower, upper) bounds. The given name specifies the
   422  // BlockIntervalCollector's properties to read.
   423  func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter {
   424  	b := new(BlockIntervalFilter)
   425  	b.Init(name, lower, upper)
   426  	return b
   427  }
   428  
// Init initializes (or re-initializes, clearing previous state) an existing
// BlockIntervalFilter to filter blocks based on an interval property collected
// by BlockIntervalCollector and the given [lower, upper) bounds. The given name
// specifies the BlockIntervalCollector's properties to read.
func (b *BlockIntervalFilter) Init(name string, lower, upper uint64) {
	// Assign the whole struct so that no field from a previous use survives.
	*b = BlockIntervalFilter{
		name:           name,
		filterInterval: interval{lower: lower, upper: upper},
	}
}
   439  
// Name implements the BlockPropertyFilter interface. It returns the property
// name the filter was initialized with.
func (b *BlockIntervalFilter) Name() string {
	return b.name
}
   444  
   445  // Intersects implements the BlockPropertyFilter interface.
   446  func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
   447  	var i interval
   448  	if err := i.decode(prop); err != nil {
   449  		return false, err
   450  	}
   451  	return i.intersects(b.filterInterval), nil
   452  }
   453  
   454  // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
   455  // generally safe to alter the filter while it's in use, except as part of the
   456  // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
   457  // masking.
   458  func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
   459  	b.filterInterval = interval{lower: lower, upper: upper}
   460  }
   461  
// shortID is the compact per-sstable identifier of a block property
// collector.
//
// When encoding block properties for each block, we cannot afford to encode
// the name. Instead, the name is mapped to a shortID, in the scope of that
// sstable, and the shortID is encoded. Since we use a uint8, there is a limit
// of 256 block property collectors per sstable.
type shortID uint8
   467  
// blockPropertiesEncoder accumulates the encoded properties of a single block
// as a sequence of (shortID, length, value) entries.
type blockPropertiesEncoder struct {
	// propsBuf holds the entries encoded so far; it is truncated, not freed,
	// by resetProps so its capacity is reused.
	propsBuf []byte
	// scratch is the slice most recently passed to addProp, kept so that
	// getScratchForProp can hand its backing storage out again.
	scratch []byte
}
   472  
// getScratchForProp returns the retained scratch slice truncated to zero
// length, so a caller can build a property value in it without allocating.
func (e *blockPropertiesEncoder) getScratchForProp() []byte {
	return e.scratch[:0]
}
   476  
// resetProps discards all encoded entries while keeping propsBuf's capacity
// for reuse.
func (e *blockPropertiesEncoder) resetProps() {
	e.propsBuf = e.propsBuf[:0]
}
   480  
// addProp appends one property entry to propsBuf. The entry layout is a
// single byte holding id, the uvarint-encoded length of scratch, and then the
// bytes of scratch. The scratch slice is retained in e.scratch so that
// getScratchForProp can offer it for reuse.
func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
	const lenID = 1
	// uvarintLen is defined outside this view; presumably it returns the
	// number of bytes the uvarint encoding of its argument occupies.
	lenProp := uvarintLen(uint32(len(scratch)))
	n := lenID + lenProp + len(scratch)
	if cap(e.propsBuf)-len(e.propsBuf) < n {
		// Not enough spare capacity: grow to whichever is larger, room for
		// twice the new entry or double the current capacity.
		size := len(e.propsBuf) + 2*n
		if size < 2*cap(e.propsBuf) {
			size = 2 * cap(e.propsBuf)
		}
		buf := make([]byte, len(e.propsBuf), size)
		copy(buf, e.propsBuf)
		e.propsBuf = buf
	}
	// Write the entry into the spare capacity past the current length, then
	// extend propsBuf to cover it.
	pos := len(e.propsBuf)
	b := e.propsBuf[pos : pos+lenID]
	b[0] = byte(id)
	pos += lenID
	b = e.propsBuf[pos : pos+lenProp]
	n = binary.PutUvarint(b, uint64(len(scratch)))
	pos += n
	b = e.propsBuf[pos : pos+len(scratch)]
	pos += len(scratch)
	copy(b, scratch)
	e.propsBuf = e.propsBuf[0:pos]
	e.scratch = scratch
}
   507  
// unsafeProps returns the encoder's internal buffer without copying. The
// returned slice aliases propsBuf, so later resetProps or addProp calls may
// overwrite its contents.
func (e *blockPropertiesEncoder) unsafeProps() []byte {
	return e.propsBuf
}
   511  
   512  func (e *blockPropertiesEncoder) props() []byte {
   513  	buf := make([]byte, len(e.propsBuf))
   514  	copy(buf, e.propsBuf)
   515  	return buf
   516  }
   517  
// blockPropertiesDecoder iterates over the property entries encoded for a
// block.
type blockPropertiesDecoder struct {
	// props is the not-yet-decoded remainder of the encoded entries; next
	// advances it past each entry it returns.
	props []byte
}
   521  
// done reports whether every encoded entry has been consumed.
func (d *blockPropertiesDecoder) done() bool {
	return len(d.props) == 0
}
   525  
   526  // REQUIRES: !done()
   527  func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) {
   528  	const lenID = 1
   529  	id = shortID(d.props[0])
   530  	propLen, m := binary.Uvarint(d.props[lenID:])
   531  	n := lenID + m
   532  	if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
   533  		return 0, nil, base.CorruptionErrorf("corrupt block property length")
   534  	}
   535  	prop = d.props[n : n+int(propLen)]
   536  	d.props = d.props[n+int(propLen):]
   537  	return id, prop, nil
   538  }
   539  
// BlockPropertiesFilterer provides filtering support when reading an sstable
// in the context of an iterator that has a slice of BlockPropertyFilters.
// After the call to NewBlockPropertiesFilterer, the caller must call
// IntersectsUserPropsAndFinishInit to check if the sstable intersects with
// the filters. If it does intersect, this function also finishes initializing
// the BlockPropertiesFilterer using the shortIDs for the relevant filters.
// Subsequent checks for relevance of a block should use the intersects
// method.
//
// Instances are recycled through blockPropertiesFiltererPool: they are
// obtained in NewBlockPropertiesFilterer and returned by
// releaseBlockPropertiesFilterer.
type BlockPropertiesFilterer struct {
	// filters is the caller-supplied slice of ordinary block property
	// filters.
	filters []BlockPropertyFilter
	// Maps shortID => index in filters. This can be sparse, and shortIDs for
	// which there is no filter are represented with an index of -1. The
	// length of this can be shorter than the shortIDs allocated in the
	// sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
	// has two filters, corresponding to shortIDs 2, 0, this would be:
	// len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
	shortIDToFiltersIndex []int

	// boundLimitedFilter, if non-nil, holds a single block-property filter with
	// additional constraints on its filtering. A boundLimitedFilter may only
	// filter blocks that are wholly contained within its bounds. During forward
	// iteration the lower bound (and during backward iteration the upper bound)
	// must be externally guaranteed, with Intersects only returning false if
	// that bound is met. The opposite bound is verified during iteration by the
	// sstable iterator.
	//
	// boundLimitedFilter is permitted to be defined on a property (`Name()`)
	// for which another filter exists in filters. In this case both filters
	// will be consulted, and either filter may exclude block(s). Only a single
	// bound-limited block-property filter may be set.
	//
	// The boundLimitedShortID field contains the shortID of the filter's
	// property within the sstable. It's set to -1 if the property was not
	// collected when the table was built.
	boundLimitedFilter  BoundLimitedBlockPropertyFilter
	boundLimitedShortID int
}
   577  
// blockPropertiesFiltererPool recycles BlockPropertiesFilterer values between
// NewBlockPropertiesFilterer and releaseBlockPropertiesFilterer. When the pool
// is empty, New supplies a zero-valued filterer.
var blockPropertiesFiltererPool = sync.Pool{
	New: func() interface{} {
		return &BlockPropertiesFilterer{}
	},
}
   583  
   584  // NewBlockPropertiesFilterer returns a partially initialized filterer. To complete
   585  // initialization, call IntersectsUserPropsAndFinishInit.
   586  func NewBlockPropertiesFilterer(
   587  	filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter,
   588  ) *BlockPropertiesFilterer {
   589  	filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
   590  	*filterer = BlockPropertiesFilterer{
   591  		filters:               filters,
   592  		shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
   593  		boundLimitedFilter:    limited,
   594  		boundLimitedShortID:   -1,
   595  	}
   596  	return filterer
   597  }
   598  
   599  func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
   600  	*filterer = BlockPropertiesFilterer{
   601  		shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
   602  	}
   603  	blockPropertiesFiltererPool.Put(filterer)
   604  }
   605  
   606  // IntersectsUserPropsAndFinishInit is called with the user properties map for
   607  // the sstable and returns whether the sstable intersects the filters. It
   608  // additionally initializes the shortIDToFiltersIndex for the filters that are
   609  // relevant to this sstable.
   610  func (f *BlockPropertiesFilterer) IntersectsUserPropsAndFinishInit(
   611  	userProperties map[string]string,
   612  ) (bool, error) {
   613  	for i := range f.filters {
   614  		props, ok := userProperties[f.filters[i].Name()]
   615  		if !ok {
   616  			// Collector was not used when writing this file, so it is
   617  			// considered intersecting.
   618  			continue
   619  		}
   620  		byteProps := []byte(props)
   621  		if len(byteProps) < 1 {
   622  			return false, base.CorruptionErrorf(
   623  				"block properties for %s is corrupted", f.filters[i].Name())
   624  		}
   625  		shortID := shortID(byteProps[0])
   626  		intersects, err := f.filters[i].Intersects(byteProps[1:])
   627  		if err != nil || !intersects {
   628  			return false, err
   629  		}
   630  		// Intersects the sstable, so need to use this filter when
   631  		// deciding whether to read blocks.
   632  		n := len(f.shortIDToFiltersIndex)
   633  		if n <= int(shortID) {
   634  			if cap(f.shortIDToFiltersIndex) <= int(shortID) {
   635  				index := make([]int, shortID+1, 2*(shortID+1))
   636  				copy(index, f.shortIDToFiltersIndex)
   637  				f.shortIDToFiltersIndex = index
   638  			} else {
   639  				f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
   640  			}
   641  			for j := n; j < int(shortID); j++ {
   642  				f.shortIDToFiltersIndex[j] = -1
   643  			}
   644  		}
   645  		f.shortIDToFiltersIndex[shortID] = i
   646  	}
   647  	if f.boundLimitedFilter == nil {
   648  		return true, nil
   649  	}
   650  
   651  	// There's a bound-limited filter. Find its shortID. It's possible that
   652  	// there's an existing filter in f.filters on the same property. That's
   653  	// okay. Both filters will be consulted whenever a relevant prop is decoded.
   654  	props, ok := userProperties[f.boundLimitedFilter.Name()]
   655  	if !ok {
   656  		// The collector was not used when writing this file, so it's
   657  		// intersecting. We leave f.boundLimitedShortID=-1, so the filter will
   658  		// be unused within this file.
   659  		return true, nil
   660  	}
   661  	byteProps := []byte(props)
   662  	if len(byteProps) < 1 {
   663  		return false, base.CorruptionErrorf(
   664  			"block properties for %s is corrupted", f.boundLimitedFilter.Name())
   665  	}
   666  	f.boundLimitedShortID = int(byteProps[0])
   667  
   668  	// We don't check for table-level intersection for the bound-limited filter.
   669  	// The bound-limited filter is treated as vacuously intersecting.
   670  	//
   671  	// NB: If a block-property filter needs to be toggled inactive/active, it
   672  	// should be implemented within the Intersects implementation.
   673  	//
   674  	// TODO(jackson): We could filter at the table-level by threading the table
   675  	// smallest and largest bounds here.
   676  
   677  	// The bound-limited filter isn't included in shortIDToFiltersIndex.
   678  	//
   679  	// When determining intersection, we decode props only up to the shortID
   680  	// len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
   681  	// the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
   682  	// Growing the index with -1s ensures we're able to consult the index
   683  	// without length checks.
   684  	if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
   685  		if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
   686  			index := make([]int, f.boundLimitedShortID+1)
   687  			copy(index, f.shortIDToFiltersIndex)
   688  			f.shortIDToFiltersIndex = index
   689  		} else {
   690  			f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
   691  		}
   692  		for j := n; j <= f.boundLimitedShortID; j++ {
   693  			f.shortIDToFiltersIndex[j] = -1
   694  		}
   695  	}
   696  	return true, nil
   697  }
   698  
   699  type intersectsResult int8
   700  
   701  const (
   702  	blockIntersects intersectsResult = iota
   703  	blockExcluded
   704  	// blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
   705  	// no filters unconditionally exclude the block, but the bound-limited block
   706  	// property filter will exclude it if the block's bounds fall within the
   707  	// filter's current bounds. See the reader's
   708  	// {single,two}LevelIterator.resolveMaybeExcluded methods.
   709  	blockMaybeExcluded
   710  )
   711  
   712  func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) {
   713  	i := 0
   714  	decoder := blockPropertiesDecoder{props: props}
   715  	ret = blockIntersects
   716  	for i < len(f.shortIDToFiltersIndex) {
   717  		var id int
   718  		var prop []byte
   719  		if !decoder.done() {
   720  			var shortID shortID
   721  			var err error
   722  			shortID, prop, err = decoder.next()
   723  			if err != nil {
   724  				return ret, err
   725  			}
   726  			id = int(shortID)
   727  		} else {
   728  			id = math.MaxUint8 + 1
   729  		}
   730  		for i < len(f.shortIDToFiltersIndex) && id > i {
   731  			// The property for this id is not encoded for this block, but there
   732  			// may still be a filter for this id.
   733  			if intersects, err := f.intersectsFilter(i, nil); err != nil {
   734  				return ret, err
   735  			} else if intersects == blockExcluded {
   736  				return blockExcluded, nil
   737  			} else if intersects == blockMaybeExcluded {
   738  				ret = blockMaybeExcluded
   739  			}
   740  			i++
   741  		}
   742  		if i >= len(f.shortIDToFiltersIndex) {
   743  			return ret, nil
   744  		}
   745  		// INVARIANT: id <= i. And since i is always incremented by 1, id==i.
   746  		if id != i {
   747  			panic(fmt.Sprintf("%d != %d", id, i))
   748  		}
   749  		if intersects, err := f.intersectsFilter(i, prop); err != nil {
   750  			return ret, err
   751  		} else if intersects == blockExcluded {
   752  			return blockExcluded, nil
   753  		} else if intersects == blockMaybeExcluded {
   754  			ret = blockMaybeExcluded
   755  		}
   756  		i++
   757  	}
   758  	// ret == blockIntersects || ret == blockMaybeExcluded
   759  	return ret, nil
   760  }
   761  
   762  func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) {
   763  	if f.shortIDToFiltersIndex[i] >= 0 {
   764  		intersects, err := f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop)
   765  		if err != nil {
   766  			return blockIntersects, err
   767  		}
   768  		if !intersects {
   769  			return blockExcluded, nil
   770  		}
   771  	}
   772  	if i == f.boundLimitedShortID {
   773  		// The bound-limited filter uses this id.
   774  		//
   775  		// The bound-limited filter only applies within a keyspan interval. We
   776  		// expect the Intersects call to be cheaper than bounds checks. If
   777  		// Intersects determines that there is no intersection, we return
   778  		// `blockMaybeExcluded` if no other bpf unconditionally excludes the
   779  		// block.
   780  		intersects, err := f.boundLimitedFilter.Intersects(prop)
   781  		if err != nil {
   782  			return blockIntersects, err
   783  		} else if !intersects {
   784  			return blockMaybeExcluded, nil
   785  		}
   786  	}
   787  	return blockIntersects, nil
   788  }