github.com/cockroachdb/pebble@v1.1.2/sstable/block_property.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"encoding/binary"
     9  	"fmt"
    10  	"math"
    11  	"sync"
    12  	"unsafe"
    13  
    14  	"github.com/cockroachdb/pebble/internal/base"
    15  	"github.com/cockroachdb/pebble/internal/rangekey"
    16  )
    17  
    18  // Block properties are an optional user-facing feature that can be used to
    19  // filter data blocks (and whole sstables) from an Iterator before they are
    20  // loaded. They do not apply to range delete blocks. These are expected to
    21  // very concisely represent a set of some attribute value contained within the
    22  // key or value, such that the set includes all the attribute values in the
    23  // block. This has some similarities with OLAP pruning approaches that
    24  // maintain min-max attribute values for some column (which concisely
    25  // represent a set), that is then used to prune at query time. In Pebble's
    26  // case, data blocks are small, typically 25-50KB, so these properties should
    27  // reduce their precision in order to be concise -- a good rule of thumb is to
    28  // not consume more than 50-100 bytes across all properties maintained for a
    29  // block, i.e., a 500x reduction compared to loading the data block.
    30  //
    31  // A block property must be assigned a unique name, which is encoded and
    32  // stored in the sstable. This name must be unique among all user-properties
    33  // encoded in an sstable.
    34  //
    35  // A property is represented as a []byte. A nil value or empty byte slice are
    36  // considered semantically identical. The caller is free to choose the
    37  // semantics of an empty byte slice e.g. they could use it to represent the
    38  // empty set or the universal set, whichever they think is more common and
    39  // therefore better to encode more concisely. The serialization of the
    40  // property for the various Finish*() calls in a BlockPropertyCollector
    41  // implementation should be identical, since the corresponding
    42  // BlockPropertyFilter implementation is not told the context in which it is
    43  // deserializing the property.
    44  //
    45  // Block properties are more general than table properties and should be
    46  // preferred over using table properties. A BlockPropertyCollector can achieve
    47  // identical behavior to table properties by returning the nil slice from
    48  // FinishDataBlock and FinishIndexBlock, and interpret them as the universal
    49  // set in BlockPropertyFilter, and return a non-universal set in FinishTable.
    50  //
    51  // Block property filtering is nondeterministic because the separation of keys
    52  // into blocks is nondeterministic. Clients use block-property filters to
    53  // implement efficient application of a filter F that applies to key-value pairs
    54  // (abbreviated as kv-filter). Consider correctness defined as surfacing exactly
    55  // the same key-value pairs that would be surfaced if one applied the filter F
    56  // above normal iteration. With this correctness definition, block property
    57  // filtering may introduce two kinds of errors:
    58  //
    59  //   a) Block property filtering that uses a kv-filter may produce additional
    60  //      key-value pairs that don't satisfy the filter because of the separation
    61  //      of keys into blocks. Clients may remove these extra key-value pairs by
    62  //      re-applying the kv filter while reading results back from Pebble.
    63  //
    64  //   b) Block property filtering may surface deleted key-value pairs if the
    65  //      kv filter is not a strict function of the key's user key. A block
    66  //      containing k.DEL may be filtered, while a block containing the deleted
    67  //      key k.SET may not be filtered, if the kv filter applies to one but not
    68  //      the other.
    69  //
    70  //      This error may be avoided trivially by using a kv filter that is a pure
    71  //      function of the user key. A filter that examines values or key kinds
    72  //      requires care to ensure F(k.SET, <value>) = F(k.DEL) = F(k.SINGLEDEL).
    73  //
    74  // The combination of range deletions and filtering by table-level properties
    75  // add another opportunity for deleted point keys to be surfaced. The pebble
    76  // Iterator stack takes care to correctly apply filtered tables' range deletions
    77  // to lower tables, preventing this form of nondeterministic error.
    78  //
    79  // In addition to the non-determinism discussed in (b), which limits the use
    80  // of properties over values, we now have support for values that are not
    81  // stored together with the key, and may not even be retrieved during
    82  // compactions. If Pebble is configured with such value separation, block
    83  // properties must only apply to the key, and will be provided a nil value.
    84  
    85  // BlockPropertyCollector is used when writing a sstable.
    86  //
    87  //   - All calls to Add are included in the next FinishDataBlock, after which
    88  //     the next data block is expected to start.
    89  //
    90  //   - The index entry generated for the data block, which contains the return
    91  //     value from FinishDataBlock, is not immediately included in the current
    92  //     index block. It is included when AddPrevDataBlockToIndexBlock is called.
    93  //     An alternative would be to return an opaque handle from FinishDataBlock
    94  //     and pass it to a new AddToIndexBlock method, which requires more
     95  //     plumbing, and passing of an interface{} results in an undesirable heap
    96  //     allocation. AddPrevDataBlockToIndexBlock must be called before keys are
    97  //     added to the new data block.
type BlockPropertyCollector interface {
	// Name returns the name of the block property collector.
	Name() string
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the property value for the finished block.
	//
	// The BlockIntervalCollector implementation in this file appends the
	// encoded property to buf and returns the extended slice.
	FinishDataBlock(buf []byte) ([]byte, error)
	// AddPrevDataBlockToIndexBlock adds the entry corresponding to the
	// previous FinishDataBlock to the current index block.
	AddPrevDataBlockToIndexBlock()
	// FinishIndexBlock is called when an index block, containing all the
	// key-value pairs since the last FinishIndexBlock, will no longer see new
	// entries. It returns the property value for the index block.
	//
	// buf is used the same way as in FinishDataBlock.
	FinishIndexBlock(buf []byte) ([]byte, error)
	// FinishTable is called when the sstable is finished, and returns the
	// property value for the sstable.
	//
	// buf is used the same way as in FinishDataBlock.
	FinishTable(buf []byte) ([]byte, error)
}
   119  
   120  // SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector
   121  // interface that allows a block property collector to indicate that it supports
   122  // being *updated* during suffix replacement, i.e. when an existing SST in which
   123  // all keys have the same key suffix is updated to have a new suffix.
   124  //
   125  // A collector which supports being updated in such cases must be able to derive
   126  // its updated value from its old value and the change being made to the suffix,
   127  // without needing to be passed each updated K/V.
   128  //
    129  // For example, a collector that only inspects values can simply copy its
   130  // previously computed property as-is, since key-suffix replacement does not
   131  // change values, while a collector that depends only on key suffixes, like one
   132  // which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just
   133  // set its new bounds from the new suffix, as it is common to all keys, without
   134  // needing to recompute it from every key.
   135  //
   136  // An implementation of DataBlockIntervalCollector can also implement this
   137  // interface, in which case the BlockPropertyCollector returned by passing it to
   138  // NewBlockIntervalCollector will also implement this interface automatically.
type SuffixReplaceableBlockCollector interface {
	// UpdateKeySuffixes is called when a block is updated to change the suffix of
	// all keys in the block, and is passed the old value for that prop, if any,
	// for that block as well as the old and new suffix.
	//
	// A non-nil error reports that the property could not be updated.
	UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error
}
   145  
   146  // BlockPropertyFilter is used in an Iterator to filter sstables and blocks
   147  // within the sstable. It should not maintain any per-sstable state, and must
   148  // be thread-safe.
//
// It is a type alias for base.BlockPropertyFilter. Code in this file calls its
// Name and Intersects methods.
type BlockPropertyFilter = base.BlockPropertyFilter
   150  
   151  // BoundLimitedBlockPropertyFilter implements the block-property filter but
   152  // imposes an additional constraint on its usage, requiring that only blocks
   153  // containing exclusively keys between its lower and upper bounds may be
    154  // filtered. The bounds may change during iteration, so the filter doesn't
   155  // expose the bounds, instead implementing KeyIsWithin[Lower,Upper]Bound methods
   156  // for performing bound comparisons.
   157  //
   158  // To be used, a BoundLimitedBlockPropertyFilter must be supplied directly
   159  // through NewBlockPropertiesFilterer's dedicated parameter. If supplied through
   160  // the ordinary slice of block property filters, this filter's bounds will be
   161  // ignored.
   162  //
   163  // The current [lower,upper) bounds of the filter are unknown, because they may
   164  // be changing. During forward iteration the lower bound is externally
   165  // guaranteed, meaning Intersects only returns false if the sstable iterator is
   166  // already known to be positioned at a key ≥ lower. The sstable iterator is then
   167  // only responsible for ensuring filtered blocks also meet the upper bound, and
   168  // should only allow a block to be filtered if all its keys are < upper. The
   169  // sstable iterator may invoke KeyIsWithinUpperBound(key) to perform this check,
   170  // where key is an inclusive upper bound on the block's keys.
   171  //
   172  // During backward iteration the upper bound is externally guaranteed, and
   173  // Intersects only returns false if the sstable iterator is already known to be
   174  // positioned at a key < upper. The sstable iterator is responsible for ensuring
   175  // filtered blocks also meet the lower bound, enforcing that a block is only
   176  // filtered if all its keys are ≥ lower. This check is made through passing the
   177  // block's inclusive lower bound to KeyIsWithinLowerBound.
   178  //
   179  // Implementations may become active or inactive through implementing Intersects
   180  // to return true whenever the filter is disabled.
   181  //
   182  // Usage of BoundLimitedBlockPropertyFilter is subtle, and Pebble consumers
   183  // should not implement this interface directly. This interface is an internal
   184  // detail in the implementation of block-property range-key masking.
type BoundLimitedBlockPropertyFilter interface {
	// BlockPropertyFilter is embedded, so every bound-limited filter is also
	// usable as an ordinary block-property filter.
	BlockPropertyFilter

	// KeyIsWithinLowerBound tests whether the provided internal key falls
	// within the current lower bound of the filter. A true return value
	// indicates that the filter may be used to filter blocks that exclusively
	// contain keys ≥ `key`, so long as the blocks' keys also satisfy the upper
	// bound.
	//
	// NOTE(review): the parameter is a raw []byte although the text above
	// says "internal key" -- confirm the expected key encoding with callers.
	KeyIsWithinLowerBound(key []byte) bool
	// KeyIsWithinUpperBound tests whether the provided internal key falls
	// within the current upper bound of the filter. A true return value
	// indicates that the filter may be used to filter blocks that exclusively
	// contain keys ≤ `key`, so long as the blocks' keys also satisfy the lower
	// bound.
	//
	// NOTE(review): same key-encoding question as KeyIsWithinLowerBound.
	KeyIsWithinUpperBound(key []byte) bool
}
   201  
   202  // BlockIntervalCollector is a helper implementation of BlockPropertyCollector
   203  // for users who want to represent a set of the form [lower,upper) where both
   204  // lower and upper are uint64, and lower <= upper.
   205  //
   206  // The set is encoded as:
   207  // - Two varint integers, (lower,upper-lower), when upper-lower > 0
   208  // - Nil, when upper-lower=0
   209  //
   210  // Users must not expect this to preserve differences between empty sets --
   211  // they will all get turned into the semantically equivalent [0,0).
   212  //
   213  // A BlockIntervalCollector that collects over point and range keys needs to
   214  // have both the point and range DataBlockIntervalCollector specified, since
   215  // point and range keys are fed to the BlockIntervalCollector in an interleaved
   216  // fashion, independently of one another. This also implies that the
   217  // DataBlockIntervalCollectors for point and range keys should be references to
   218  // independent instances, rather than references to the same collector, as point
   219  // and range keys are tracked independently.
type BlockIntervalCollector struct {
	// name is the value returned by Name.
	name string
	// points receives the non-range keys passed to Add. It may be nil, in
	// which case those keys are ignored and FinishDataBlock returns its
	// buffer unchanged.
	points DataBlockIntervalCollector
	// ranges receives the range keys passed to Add. It may be nil. Its
	// interval is only collected in FinishTable.
	ranges DataBlockIntervalCollector

	// blockInterval is the interval reported for the most recently finished
	// data block. AddPrevDataBlockToIndexBlock folds it into indexInterval
	// and resets it.
	blockInterval interval
	// indexInterval is the union of the data block intervals added to the
	// current index block. FinishIndexBlock encodes and resets it.
	indexInterval interval
	// tableInterval is the union of every data block interval seen so far;
	// FinishTable additionally unions in the range-key interval.
	tableInterval interval
}

// Compile-time check that *BlockIntervalCollector satisfies
// BlockPropertyCollector.
var _ BlockPropertyCollector = &BlockIntervalCollector{}
   231  
   232  // DataBlockIntervalCollector is the interface used by BlockIntervalCollector
   233  // that contains the actual logic pertaining to the property. It only
   234  // maintains state for the current data block, and resets that state in
   235  // FinishDataBlock. This interface can be used to reduce parsing costs.
type DataBlockIntervalCollector interface {
	// Add is called with each new entry added to a data block in the sstable.
	// The callee can assume that these are in sorted order.
	Add(key InternalKey, value []byte) error
	// FinishDataBlock is called when all the entries have been added to a
	// data block. Subsequent Add calls will be for the next data block. It
	// returns the [lower, upper) for the finished block.
	//
	// BlockIntervalCollector treats any result with lower >= upper as the
	// empty set. For the range-key collector this method is invoked once,
	// from BlockIntervalCollector.FinishTable.
	FinishDataBlock() (lower uint64, upper uint64, err error)
}
   245  
   246  // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given
   247  // name. The BlockIntervalCollector makes use of the given point and range key
   248  // DataBlockIntervalCollectors when encountering point and range keys,
   249  // respectively.
   250  //
   251  // The caller may pass a nil DataBlockIntervalCollector for one of the point or
   252  // range key collectors, in which case keys of those types will be ignored. This
   253  // allows for flexible construction of BlockIntervalCollectors that operate on
   254  // just point keys, just range keys, or both point and range keys.
   255  //
   256  // If both point and range keys are to be tracked, two independent collectors
   257  // should be provided, rather than the same collector passed in twice (see the
   258  // comment on BlockIntervalCollector for more detail)
   259  func NewBlockIntervalCollector(
   260  	name string, pointCollector, rangeCollector DataBlockIntervalCollector,
   261  ) BlockPropertyCollector {
   262  	if pointCollector == nil && rangeCollector == nil {
   263  		panic("sstable: at least one interval collector must be provided")
   264  	}
   265  	bic := BlockIntervalCollector{
   266  		name:   name,
   267  		points: pointCollector,
   268  		ranges: rangeCollector,
   269  	}
   270  	if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok {
   271  		return &suffixReplacementBlockCollectorWrapper{bic}
   272  	}
   273  	return &bic
   274  }
   275  
   276  // Name implements the BlockPropertyCollector interface.
func (b *BlockIntervalCollector) Name() string {
	// The name is fixed at construction time by NewBlockIntervalCollector.
	return b.name
}
   280  
   281  // Add implements the BlockPropertyCollector interface.
   282  func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error {
   283  	if rangekey.IsRangeKey(key.Kind()) {
   284  		if b.ranges != nil {
   285  			return b.ranges.Add(key, value)
   286  		}
   287  	} else if b.points != nil {
   288  		return b.points.Add(key, value)
   289  	}
   290  	return nil
   291  }
   292  
   293  // FinishDataBlock implements the BlockPropertyCollector interface.
   294  func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) {
   295  	if b.points == nil {
   296  		return buf, nil
   297  	}
   298  	var err error
   299  	b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock()
   300  	if err != nil {
   301  		return buf, err
   302  	}
   303  	buf = b.blockInterval.encode(buf)
   304  	b.tableInterval.union(b.blockInterval)
   305  	return buf, nil
   306  }
   307  
   308  // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector
   309  // interface.
   310  func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() {
   311  	b.indexInterval.union(b.blockInterval)
   312  	b.blockInterval = interval{}
   313  }
   314  
   315  // FinishIndexBlock implements the BlockPropertyCollector interface.
   316  func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) {
   317  	buf = b.indexInterval.encode(buf)
   318  	b.indexInterval = interval{}
   319  	return buf, nil
   320  }
   321  
   322  // FinishTable implements the BlockPropertyCollector interface.
   323  func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) {
   324  	// If the collector is tracking range keys, the range key interval is union-ed
   325  	// with the point key interval for the table.
   326  	if b.ranges != nil {
   327  		var rangeInterval interval
   328  		var err error
   329  		rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock()
   330  		if err != nil {
   331  			return buf, err
   332  		}
   333  		b.tableInterval.union(rangeInterval)
   334  	}
   335  	return b.tableInterval.encode(buf), nil
   336  }
   337  
// interval is the half-open range [lower, upper) over uint64 values. Any
// interval with lower >= upper is treated as the empty set by encode, union
// and intersects.
type interval struct {
	// lower is the inclusive lower bound.
	lower uint64
	// upper is the exclusive upper bound.
	upper uint64
}
   342  
   343  func (i interval) encode(buf []byte) []byte {
   344  	if i.lower < i.upper {
   345  		var encoded [binary.MaxVarintLen64 * 2]byte
   346  		n := binary.PutUvarint(encoded[:], i.lower)
   347  		n += binary.PutUvarint(encoded[n:], i.upper-i.lower)
   348  		buf = append(buf, encoded[:n]...)
   349  	}
   350  	return buf
   351  }
   352  
   353  func (i *interval) decode(buf []byte) error {
   354  	if len(buf) == 0 {
   355  		*i = interval{}
   356  		return nil
   357  	}
   358  	var n int
   359  	i.lower, n = binary.Uvarint(buf)
   360  	if n <= 0 || n >= len(buf) {
   361  		return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
   362  	}
   363  	pos := n
   364  	i.upper, n = binary.Uvarint(buf[pos:])
   365  	pos += n
   366  	if pos != len(buf) || n <= 0 {
   367  		return base.CorruptionErrorf("cannot decode interval from buf %x", buf)
   368  	}
   369  	// Delta decode.
   370  	i.upper += i.lower
   371  	if i.upper < i.lower {
   372  		return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower)
   373  	}
   374  	return nil
   375  }
   376  
   377  func (i *interval) union(x interval) {
   378  	if x.lower >= x.upper {
   379  		// x is the empty set.
   380  		return
   381  	}
   382  	if i.lower >= i.upper {
   383  		// i is the empty set.
   384  		*i = x
   385  		return
   386  	}
   387  	// Both sets are non-empty.
   388  	if x.lower < i.lower {
   389  		i.lower = x.lower
   390  	}
   391  	if x.upper > i.upper {
   392  		i.upper = x.upper
   393  	}
   394  }
   395  
   396  func (i interval) intersects(x interval) bool {
   397  	if i.lower >= i.upper || x.lower >= x.upper {
   398  		// At least one of the sets is empty.
   399  		return false
   400  	}
   401  	// Neither set is empty.
   402  	return i.upper > x.lower && i.lower < x.upper
   403  }
   404  
// suffixReplacementBlockCollectorWrapper embeds a BlockIntervalCollector and
// adds UpdateKeySuffixes. NewBlockIntervalCollector returns it when the point
// collector implements SuffixReplaceableBlockCollector.
type suffixReplacementBlockCollectorWrapper struct {
	BlockIntervalCollector
}
   408  
   409  // UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface.
   410  func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes(
   411  	oldProp []byte, from, to []byte,
   412  ) error {
   413  	return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to)
   414  }
   415  
   416  // BlockIntervalFilter is an implementation of BlockPropertyFilter when the
   417  // corresponding collector is a BlockIntervalCollector. That is, the set is of
   418  // the form [lower, upper).
type BlockIntervalFilter struct {
	// name is the value returned by Name.
	name string
	// filterInterval is the [lower, upper) range that decoded block
	// properties are tested against in Intersects.
	filterInterval interval
}

// Compile-time check that *BlockIntervalFilter satisfies BlockPropertyFilter.
var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil)
   425  
   426  // NewBlockIntervalFilter constructs a BlockPropertyFilter that filters blocks
   427  // based on an interval property collected by BlockIntervalCollector and the
   428  // given [lower, upper) bounds. The given name specifies the
   429  // BlockIntervalCollector's properties to read.
   430  func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter {
   431  	b := new(BlockIntervalFilter)
   432  	b.Init(name, lower, upper)
   433  	return b
   434  }
   435  
   436  // Init initializes (or re-initializes, clearing previous state) an existing
    437  // BlockIntervalFilter to filter blocks based on an interval property collected
   438  // by BlockIntervalCollector and the given [lower, upper) bounds. The given name
   439  // specifies the BlockIntervalCollector's properties to read.
   440  func (b *BlockIntervalFilter) Init(name string, lower, upper uint64) {
   441  	*b = BlockIntervalFilter{
   442  		name:           name,
   443  		filterInterval: interval{lower: lower, upper: upper},
   444  	}
   445  }
   446  
   447  // Name implements the BlockPropertyFilter interface.
func (b *BlockIntervalFilter) Name() string {
	// The name is set by Init.
	return b.name
}
   451  
   452  // Intersects implements the BlockPropertyFilter interface.
   453  func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) {
   454  	var i interval
   455  	if err := i.decode(prop); err != nil {
   456  		return false, err
   457  	}
   458  	return i.intersects(b.filterInterval), nil
   459  }
   460  
   461  // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not
   462  // generally safe to alter the filter while it's in use, except as part of the
   463  // implementation of BlockPropertyFilterMask.SetSuffix used for range-key
   464  // masking.
   465  func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) {
   466  	b.filterInterval = interval{lower: lower, upper: upper}
   467  }
   468  
   469  // When encoding block properties for each block, we cannot afford to encode
   470  // the name. Instead, the name is mapped to a shortID, in the scope of that
   471  // sstable, and the shortID is encoded. Since we use a uint8, there is a limit
   472  // of 256 block property collectors per sstable.
// shortID is the one-byte, per-sstable identifier written in place of a
// collector's name.
type shortID uint8
   474  
// blockPropertiesEncoder accumulates encoded (shortID, length, property)
// entries in a reusable buffer.
type blockPropertiesEncoder struct {
	// propsBuf holds the entries appended by addProp since the last
	// resetProps.
	propsBuf []byte
	// scratch is the most recent property slice handed to addProp; its
	// storage is offered again by getScratchForProp.
	scratch []byte
}
   479  
// getScratchForProp returns the scratch buffer truncated to zero length, so
// that a property can be appended into its existing capacity.
func (e *blockPropertiesEncoder) getScratchForProp() []byte {
	return e.scratch[:0]
}
   483  
// resetProps discards the accumulated entries while keeping the buffer's
// capacity for reuse.
func (e *blockPropertiesEncoder) resetProps() {
	e.propsBuf = e.propsBuf[:0]
}
   487  
   488  func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) {
   489  	const lenID = 1
   490  	lenProp := uvarintLen(uint32(len(scratch)))
   491  	n := lenID + lenProp + len(scratch)
   492  	if cap(e.propsBuf)-len(e.propsBuf) < n {
   493  		size := len(e.propsBuf) + 2*n
   494  		if size < 2*cap(e.propsBuf) {
   495  			size = 2 * cap(e.propsBuf)
   496  		}
   497  		buf := make([]byte, len(e.propsBuf), size)
   498  		copy(buf, e.propsBuf)
   499  		e.propsBuf = buf
   500  	}
   501  	pos := len(e.propsBuf)
   502  	b := e.propsBuf[pos : pos+lenID]
   503  	b[0] = byte(id)
   504  	pos += lenID
   505  	b = e.propsBuf[pos : pos+lenProp]
   506  	n = binary.PutUvarint(b, uint64(len(scratch)))
   507  	pos += n
   508  	b = e.propsBuf[pos : pos+len(scratch)]
   509  	pos += len(scratch)
   510  	copy(b, scratch)
   511  	e.propsBuf = e.propsBuf[0:pos]
   512  	e.scratch = scratch
   513  }
   514  
// unsafeProps returns the internal buffer itself, not a copy. The returned
// slice shares storage with the encoder, so later resetProps/addProp calls
// may overwrite its contents.
func (e *blockPropertiesEncoder) unsafeProps() []byte {
	return e.propsBuf
}
   518  
   519  func (e *blockPropertiesEncoder) props() []byte {
   520  	buf := make([]byte, len(e.propsBuf))
   521  	copy(buf, e.propsBuf)
   522  	return buf
   523  }
   524  
// blockPropertiesDecoder iterates over the (shortID, length, property)
// entries in an encoded properties buffer.
type blockPropertiesDecoder struct {
	// props is the not-yet-decoded remainder of the buffer; next advances it
	// past each entry it returns.
	props []byte
}
   528  
// done reports whether every entry has been consumed, i.e. no bytes remain.
func (d *blockPropertiesDecoder) done() bool {
	return len(d.props) == 0
}
   532  
   533  // REQUIRES: !done()
   534  func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) {
   535  	const lenID = 1
   536  	id = shortID(d.props[0])
   537  	propLen, m := binary.Uvarint(d.props[lenID:])
   538  	n := lenID + m
   539  	if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) {
   540  		return 0, nil, base.CorruptionErrorf("corrupt block property length")
   541  	}
   542  	prop = d.props[n : n+int(propLen)]
   543  	d.props = d.props[n+int(propLen):]
   544  	return id, prop, nil
   545  }
   546  
   547  // BlockPropertiesFilterer provides filtering support when reading an sstable
   548  // in the context of an iterator that has a slice of BlockPropertyFilters.
   549  // After the call to NewBlockPropertiesFilterer, the caller must call
   550  // IntersectsUserPropsAndFinishInit to check if the sstable intersects with
   551  // the filters. If it does intersect, this function also finishes initializing
   552  // the BlockPropertiesFilterer using the shortIDs for the relevant filters.
   553  // Subsequent checks for relevance of a block should use the intersects
   554  // method.
type BlockPropertiesFilterer struct {
	// filters is the caller-supplied slice of block property filters. The
	// filterer stores the slice as given, without copying it.
	filters []BlockPropertyFilter
	// Maps shortID => index in filters. This can be sparse, and shortIDs for
	// which there is no filter are represented with an index of -1. The
	// length of this can be shorter than the shortIDs allocated in the
	// sstable. e.g. if the sstable used shortIDs 0, 1, 2, 3, and the iterator
	// has two filters, corresponding to shortIDs 2, 0, this would be:
	// len(shortIDToFiltersIndex)==3, 0=>1, 1=>-1, 2=>0.
	//
	// The backing array is retained across uses when the filterer is
	// recycled through blockPropertiesFiltererPool.
	shortIDToFiltersIndex []int

	// boundLimitedFilter, if non-nil, holds a single block-property filter with
	// additional constraints on its filtering. A boundLimitedFilter may only
	// filter blocks that are wholly contained within its bounds. During forward
	// iteration the lower bound (and during backward iteration the upper bound)
	// must be externally guaranteed, with Intersects only returning false if
	// that bound is met. The opposite bound is verified during iteration by the
	// sstable iterator.
	//
	// boundLimitedFilter is permitted to be defined on a property (`Name()`)
	// for which another filter exists in filters. In this case both filters
	// will be consulted, and either filter may exclude block(s). Only a single
	// bound-limited block-property filter may be set.
	//
	// The boundLimitedShortID field contains the shortID of the filter's
	// property within the sstable. It's set to -1 if the property was not
	// collected when the table was built.
	boundLimitedFilter  BoundLimitedBlockPropertyFilter
	boundLimitedShortID int
}
   584  
   585  var blockPropertiesFiltererPool = sync.Pool{
   586  	New: func() interface{} {
   587  		return &BlockPropertiesFilterer{}
   588  	},
   589  }
   590  
   591  // newBlockPropertiesFilterer returns a partially initialized filterer. To complete
   592  // initialization, call IntersectsUserPropsAndFinishInit.
   593  func newBlockPropertiesFilterer(
   594  	filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter,
   595  ) *BlockPropertiesFilterer {
   596  	filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer)
   597  	*filterer = BlockPropertiesFilterer{
   598  		filters:               filters,
   599  		shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
   600  		boundLimitedFilter:    limited,
   601  		boundLimitedShortID:   -1,
   602  	}
   603  	return filterer
   604  }
   605  
   606  func releaseBlockPropertiesFilterer(filterer *BlockPropertiesFilterer) {
   607  	*filterer = BlockPropertiesFilterer{
   608  		shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0],
   609  	}
   610  	blockPropertiesFiltererPool.Put(filterer)
   611  }
   612  
   613  // IntersectsTable evaluates the provided block-property filter against the
   614  // provided set of table-level properties. If there is no intersection between
   615  // the filters and the table or an error is encountered, IntersectsTable returns
   616  // a nil filterer (and possibly an error). If there is an intersection,
   617  // IntersectsTable returns a non-nil filterer that may be used by an iterator
   618  // reading the table.
   619  func IntersectsTable(
   620  	filters []BlockPropertyFilter,
   621  	limited BoundLimitedBlockPropertyFilter,
   622  	userProperties map[string]string,
   623  ) (*BlockPropertiesFilterer, error) {
   624  	f := newBlockPropertiesFilterer(filters, limited)
   625  	ok, err := f.intersectsUserPropsAndFinishInit(userProperties)
   626  	if !ok || err != nil {
   627  		releaseBlockPropertiesFilterer(f)
   628  		return nil, err
   629  	}
   630  	return f, nil
   631  }
   632  
   633  // intersectsUserPropsAndFinishInit is called with the user properties map for
   634  // the sstable and returns whether the sstable intersects the filters. It
   635  // additionally initializes the shortIDToFiltersIndex for the filters that are
   636  // relevant to this sstable.
   637  func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit(
   638  	userProperties map[string]string,
   639  ) (bool, error) {
   640  	for i := range f.filters {
   641  		props, ok := userProperties[f.filters[i].Name()]
   642  		if !ok {
   643  			// Collector was not used when writing this file, so it is
   644  			// considered intersecting.
   645  			continue
   646  		}
   647  		if len(props) < 1 {
   648  			return false, base.CorruptionErrorf(
   649  				"block properties for %s is corrupted", f.filters[i].Name())
   650  		}
   651  		shortID := shortID(props[0])
   652  		{
   653  			// Use an unsafe conversion to avoid allocating. Intersects() is not
   654  			// supposed to modify the given slice.
   655  			// Note that unsafe.StringData only works if the string is not empty
   656  			// (which we already checked).
   657  			byteProps := unsafe.Slice(unsafe.StringData(props), len(props))
   658  			intersects, err := f.filters[i].Intersects(byteProps[1:])
   659  			if err != nil || !intersects {
   660  				return false, err
   661  			}
   662  		}
   663  		// Intersects the sstable, so need to use this filter when
   664  		// deciding whether to read blocks.
   665  		n := len(f.shortIDToFiltersIndex)
   666  		if n <= int(shortID) {
   667  			if cap(f.shortIDToFiltersIndex) <= int(shortID) {
   668  				index := make([]int, shortID+1, 2*(shortID+1))
   669  				copy(index, f.shortIDToFiltersIndex)
   670  				f.shortIDToFiltersIndex = index
   671  			} else {
   672  				f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:shortID+1]
   673  			}
   674  			for j := n; j < int(shortID); j++ {
   675  				f.shortIDToFiltersIndex[j] = -1
   676  			}
   677  		}
   678  		f.shortIDToFiltersIndex[shortID] = i
   679  	}
   680  	if f.boundLimitedFilter == nil {
   681  		return true, nil
   682  	}
   683  
   684  	// There's a bound-limited filter. Find its shortID. It's possible that
   685  	// there's an existing filter in f.filters on the same property. That's
   686  	// okay. Both filters will be consulted whenever a relevant prop is decoded.
   687  	props, ok := userProperties[f.boundLimitedFilter.Name()]
   688  	if !ok {
   689  		// The collector was not used when writing this file, so it's
   690  		// intersecting. We leave f.boundLimitedShortID=-1, so the filter will
   691  		// be unused within this file.
   692  		return true, nil
   693  	}
   694  	if len(props) < 1 {
   695  		return false, base.CorruptionErrorf(
   696  			"block properties for %s is corrupted", f.boundLimitedFilter.Name())
   697  	}
   698  	f.boundLimitedShortID = int(props[0])
   699  
   700  	// We don't check for table-level intersection for the bound-limited filter.
   701  	// The bound-limited filter is treated as vacuously intersecting.
   702  	//
   703  	// NB: If a block-property filter needs to be toggled inactive/active, it
   704  	// should be implemented within the Intersects implementation.
   705  	//
   706  	// TODO(jackson): We could filter at the table-level by threading the table
   707  	// smallest and largest bounds here.
   708  
   709  	// The bound-limited filter isn't included in shortIDToFiltersIndex.
   710  	//
   711  	// When determining intersection, we decode props only up to the shortID
   712  	// len(shortIDToFiltersIndex). If f.limitedShortID is greater than any of
   713  	// the existing filters' shortIDs, we need to grow shortIDToFiltersIndex.
   714  	// Growing the index with -1s ensures we're able to consult the index
   715  	// without length checks.
   716  	if n := len(f.shortIDToFiltersIndex); n <= f.boundLimitedShortID {
   717  		if cap(f.shortIDToFiltersIndex) <= f.boundLimitedShortID {
   718  			index := make([]int, f.boundLimitedShortID+1)
   719  			copy(index, f.shortIDToFiltersIndex)
   720  			f.shortIDToFiltersIndex = index
   721  		} else {
   722  			f.shortIDToFiltersIndex = f.shortIDToFiltersIndex[:f.boundLimitedShortID+1]
   723  		}
   724  		for j := n; j <= f.boundLimitedShortID; j++ {
   725  			f.shortIDToFiltersIndex[j] = -1
   726  		}
   727  	}
   728  	return true, nil
   729  }
   730  
   731  type intersectsResult int8
   732  
   733  const (
   734  	blockIntersects intersectsResult = iota
   735  	blockExcluded
   736  	// blockMaybeExcluded is returned by BlockPropertiesFilterer.intersects when
   737  	// no filters unconditionally exclude the block, but the bound-limited block
   738  	// property filter will exclude it if the block's bounds fall within the
   739  	// filter's current bounds. See the reader's
   740  	// {single,two}LevelIterator.resolveMaybeExcluded methods.
   741  	blockMaybeExcluded
   742  )
   743  
   744  func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) {
   745  	i := 0
   746  	decoder := blockPropertiesDecoder{props: props}
   747  	ret = blockIntersects
   748  	for i < len(f.shortIDToFiltersIndex) {
   749  		var id int
   750  		var prop []byte
   751  		if !decoder.done() {
   752  			var shortID shortID
   753  			var err error
   754  			shortID, prop, err = decoder.next()
   755  			if err != nil {
   756  				return ret, err
   757  			}
   758  			id = int(shortID)
   759  		} else {
   760  			id = math.MaxUint8 + 1
   761  		}
   762  		for i < len(f.shortIDToFiltersIndex) && id > i {
   763  			// The property for this id is not encoded for this block, but there
   764  			// may still be a filter for this id.
   765  			if intersects, err := f.intersectsFilter(i, nil); err != nil {
   766  				return ret, err
   767  			} else if intersects == blockExcluded {
   768  				return blockExcluded, nil
   769  			} else if intersects == blockMaybeExcluded {
   770  				ret = blockMaybeExcluded
   771  			}
   772  			i++
   773  		}
   774  		if i >= len(f.shortIDToFiltersIndex) {
   775  			return ret, nil
   776  		}
   777  		// INVARIANT: id <= i. And since i is always incremented by 1, id==i.
   778  		if id != i {
   779  			panic(fmt.Sprintf("%d != %d", id, i))
   780  		}
   781  		if intersects, err := f.intersectsFilter(i, prop); err != nil {
   782  			return ret, err
   783  		} else if intersects == blockExcluded {
   784  			return blockExcluded, nil
   785  		} else if intersects == blockMaybeExcluded {
   786  			ret = blockMaybeExcluded
   787  		}
   788  		i++
   789  	}
   790  	// ret == blockIntersects || ret == blockMaybeExcluded
   791  	return ret, nil
   792  }
   793  
   794  func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) {
   795  	if f.shortIDToFiltersIndex[i] >= 0 {
   796  		intersects, err := f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop)
   797  		if err != nil {
   798  			return blockIntersects, err
   799  		}
   800  		if !intersects {
   801  			return blockExcluded, nil
   802  		}
   803  	}
   804  	if i == f.boundLimitedShortID {
   805  		// The bound-limited filter uses this id.
   806  		//
   807  		// The bound-limited filter only applies within a keyspan interval. We
   808  		// expect the Intersects call to be cheaper than bounds checks. If
   809  		// Intersects determines that there is no intersection, we return
   810  		// `blockMaybeExcluded` if no other bpf unconditionally excludes the
   811  		// block.
   812  		intersects, err := f.boundLimitedFilter.Intersects(prop)
   813  		if err != nil {
   814  			return blockIntersects, err
   815  		} else if !intersects {
   816  			return blockMaybeExcluded, nil
   817  		}
   818  	}
   819  	return blockIntersects, nil
   820  }