github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/properties.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"math"
    12  	"reflect"
    13  	"sort"
    14  	"unsafe"
    15  
    16  	"github.com/cockroachdb/pebble/internal/intern"
    17  )
    18  
    19  const propertiesBlockRestartInterval = math.MaxInt32
    20  const propGlobalSeqnumName = "rocksdb.external_sst_file.global_seqno"
    21  
    22  var propTagMap = make(map[string]reflect.StructField)
    23  var propBoolTrue = []byte{'1'}
    24  var propBoolFalse = []byte{'0'}
    25  
    26  var propOffsetTagMap = make(map[uintptr]string)
    27  
    28  func generateTagMaps(t reflect.Type, indexPrefix []int) {
    29  	for i := 0; i < t.NumField(); i++ {
    30  		f := t.Field(i)
    31  		if f.Type.Kind() == reflect.Struct {
    32  			if tag := f.Tag.Get("prop"); i == 0 && tag == "pebble.embbeded_common_properties" {
    33  				// CommonProperties struct embedded in Properties. Note that since
    34  				// CommonProperties is placed at the top of properties we can use
    35  				// the offsets of the fields within CommonProperties to determine
    36  				// the offsets of those fields within Properties.
    37  				generateTagMaps(f.Type, []int{i})
    38  				continue
    39  			}
    40  			panic("pebble: unknown struct type in Properties")
    41  		}
    42  		if tag := f.Tag.Get("prop"); tag != "" {
    43  			switch f.Type.Kind() {
    44  			case reflect.Bool:
    45  			case reflect.Uint32:
    46  			case reflect.Uint64:
    47  			case reflect.String:
    48  			default:
    49  				panic(fmt.Sprintf("unsupported property field type: %s %s", f.Name, f.Type))
    50  			}
    51  			if len(indexPrefix) > 0 {
    52  				// Prepend the index prefix so that we can use FieldByIndex on the top-level struct.
    53  				f.Index = append(indexPrefix[:len(indexPrefix):len(indexPrefix)], f.Index...)
    54  			}
    55  			propTagMap[tag] = f
    56  			propOffsetTagMap[f.Offset] = tag
    57  		}
    58  	}
    59  }
    60  
    61  func init() {
    62  	generateTagMaps(reflect.TypeOf(Properties{}), nil)
    63  }
    64  
// CommonProperties holds properties for either a virtual or a physical sstable. This
// can be used by code which doesn't care to make the distinction between physical
// and virtual sstables' properties.
//
// For virtual sstables, fields are constructed through extrapolation upon virtual
// reader construction. See MakeVirtualReader for implementation details.
//
// Each field's `prop` tag is the name under which the property is stored in
// the properties block.
//
// NB: The values of these properties can affect correctness. For example,
// if NumRangeKeySets == 0, but the sstable actually contains range keys, then
// the iterators will behave incorrectly.
type CommonProperties struct {
	// The number of entries in this table.
	NumEntries uint64 `prop:"rocksdb.num.entries"`
	// Total raw key size.
	RawKeySize uint64 `prop:"rocksdb.raw.key.size"`
	// Total raw value size.
	RawValueSize uint64 `prop:"rocksdb.raw.value.size"`
	// Total raw key size of point deletion tombstones. This value is comparable
	// to RawKeySize.
	RawPointTombstoneKeySize uint64 `prop:"pebble.raw.point-tombstone.key.size"`
	// Sum of the raw value sizes carried by point deletion tombstones
	// containing size estimates. See the DeleteSized key kind. This value is
	// comparable to Raw{Key,Value}Size.
	RawPointTombstoneValueSize uint64 `prop:"pebble.raw.point-tombstone.value.size"`
	// The number of point deletion entries ("tombstones") in this table that
	// carry a size hint indicating the size of the value the tombstone deletes.
	NumSizedDeletions uint64 `prop:"pebble.num.deletions.sized"`
	// The number of deletion entries in this table, including both point and
	// range deletions.
	NumDeletions uint64 `prop:"rocksdb.deleted.keys"`
	// The number of range deletions in this table.
	NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"`
	// The number of RANGEKEYDELs in this table.
	NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"`
	// The number of RANGEKEYSETs in this table.
	NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"`
	// Total size of value blocks and value index block. Only serialized if > 0.
	ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"`
}
   104  
   105  // String is only used for testing purposes.
   106  func (c *CommonProperties) String() string {
   107  	var buf bytes.Buffer
   108  	v := reflect.ValueOf(*c)
   109  	loaded := make(map[uintptr]struct{})
   110  	writeProperties(loaded, v, &buf)
   111  	return buf.String()
   112  }
   113  
   114  // NumPointDeletions is the number of point deletions in the sstable. For virtual
   115  // sstables, this is an estimate.
   116  func (c *CommonProperties) NumPointDeletions() uint64 {
   117  	return c.NumDeletions - c.NumRangeDeletions
   118  }
   119  
// Properties holds the sstable property values. The properties are
// automatically populated during sstable creation and loaded from the
// properties meta block when an sstable is opened.
//
// Fields carrying a `prop` tag are stored in the properties block under the
// tag's name. UserProperties and Loaded carry no tag and are handled
// separately by load, save and String.
type Properties struct {
	// CommonProperties needs to be at the top of the Properties struct so that the
	// offsets of the fields in CommonProperties match the offsets of the embedded
	// fields of CommonProperties in Properties.
	CommonProperties `prop:"pebble.embbeded_common_properties"`

	// The name of the comparer used in this table.
	ComparerName string `prop:"rocksdb.comparator"`
	// The compression algorithm used to compress blocks.
	CompressionName string `prop:"rocksdb.compression"`
	// The compression options used to compress blocks.
	CompressionOptions string `prop:"rocksdb.compression_options"`
	// The total size of all data blocks.
	DataSize uint64 `prop:"rocksdb.data.size"`
	// The external sstable version format. Version 2 is the one RocksDB has been
	// using since 5.13. RocksDB only uses the global sequence number for an
	// sstable if this property has been set.
	ExternalFormatVersion uint32 `prop:"rocksdb.external_sst_file.version"`
	// The name of the filter policy used in this table. Empty if no filter
	// policy is used.
	FilterPolicyName string `prop:"rocksdb.filter.policy"`
	// The size of filter block.
	FilterSize uint64 `prop:"rocksdb.filter.size"`
	// The global sequence number to use for all entries in the table. Present if
	// the table was created externally and ingested whole.
	GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"`
	// Total number of index partitions if kTwoLevelIndexSearch is used.
	IndexPartitions uint64 `prop:"rocksdb.index.partitions"`
	// The size of index block.
	IndexSize uint64 `prop:"rocksdb.index.size"`
	// The index type. TODO(peter): add a more detailed description.
	IndexType uint32 `prop:"rocksdb.block.based.table.index.type"`
	// For formats >= TableFormatPebblev4, this is set to true if the obsolete
	// bit is strict for all the point keys.
	IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"`
	// The name of the merger used in this table. Empty if no merger is used.
	MergerName string `prop:"rocksdb.merge.operator"`
	// The number of blocks in this table.
	NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"`
	// The number of merge operands in the table.
	NumMergeOperands uint64 `prop:"rocksdb.merge.operands"`
	// The number of RANGEKEYUNSETs in this table.
	NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"`
	// The number of value blocks in this table. Only serialized if > 0.
	NumValueBlocks uint64 `prop:"pebble.num.value-blocks"`
	// The number of values stored in value blocks. Only serialized if > 0.
	NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"`
	// The name of the prefix extractor used in this table. Empty if no prefix
	// extractor is used.
	PrefixExtractorName string `prop:"rocksdb.prefix.extractor.name"`
	// If filtering is enabled, was the filter created on the key prefix.
	PrefixFiltering bool `prop:"rocksdb.block.based.table.prefix.filtering"`
	// A comma separated list of names of the property collectors used in this
	// table.
	PropertyCollectorNames string `prop:"rocksdb.property.collectors"`
	// Total raw rangekey key size.
	RawRangeKeyKeySize uint64 `prop:"pebble.raw.range-key.key.size"`
	// Total raw rangekey value size.
	RawRangeKeyValueSize uint64 `prop:"pebble.raw.range-key.value.size"`
	// The total number of keys in this table that were pinned by open snapshots.
	SnapshotPinnedKeys uint64 `prop:"pebble.num.snapshot-pinned-keys"`
	// The cumulative bytes of keys in this table that were pinned by
	// open snapshots. This value is comparable to RawKeySize.
	SnapshotPinnedKeySize uint64 `prop:"pebble.raw.snapshot-pinned-keys.size"`
	// The cumulative bytes of values in this table that were pinned by
	// open snapshots. This value is comparable to RawValueSize.
	SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"`
	// Size of the top-level index if kTwoLevelIndexSearch is used.
	TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"`
	// User collected properties. When loading, every property whose name does
	// not match a `prop` tag above ends up here, unless it is on the caller's
	// deny list.
	UserProperties map[string]string
	// If filtering is enabled, was the filter created on the whole key.
	WholeKeyFiltering bool `prop:"rocksdb.block.based.table.whole.key.filtering"`

	// Loaded set indicating which fields have been loaded from disk. Indexed by
	// the field's byte offset within the struct
	// (reflect.StructField.Offset). Only set if the properties have been loaded
	// from a file. Only exported for testing purposes.
	Loaded map[uintptr]struct{}
}
   203  
   204  // NumPointDeletions returns the number of point deletions in this table.
   205  func (p *Properties) NumPointDeletions() uint64 {
   206  	return p.NumDeletions - p.NumRangeDeletions
   207  }
   208  
   209  // NumRangeKeys returns a count of the number of range keys in this table.
   210  func (p *Properties) NumRangeKeys() uint64 {
   211  	return p.NumRangeKeyDels + p.NumRangeKeySets + p.NumRangeKeyUnsets
   212  }
   213  
   214  func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Buffer) {
   215  	vt := v.Type()
   216  	for i := 0; i < v.NumField(); i++ {
   217  		ft := vt.Field(i)
   218  		if ft.Type.Kind() == reflect.Struct {
   219  			// Embedded struct within the properties.
   220  			writeProperties(loaded, v.Field(i), buf)
   221  			continue
   222  		}
   223  		tag := ft.Tag.Get("prop")
   224  		if tag == "" {
   225  			continue
   226  		}
   227  
   228  		f := v.Field(i)
   229  		// TODO(peter): Use f.IsZero() when we can rely on go1.13.
   230  		if zero := reflect.Zero(f.Type()); zero.Interface() == f.Interface() {
   231  			// Skip printing of zero values which were not loaded from disk.
   232  			if _, ok := loaded[ft.Offset]; !ok {
   233  				continue
   234  			}
   235  		}
   236  
   237  		fmt.Fprintf(buf, "%s: ", tag)
   238  		switch ft.Type.Kind() {
   239  		case reflect.Bool:
   240  			fmt.Fprintf(buf, "%t\n", f.Bool())
   241  		case reflect.Uint32:
   242  			fmt.Fprintf(buf, "%d\n", f.Uint())
   243  		case reflect.Uint64:
   244  			fmt.Fprintf(buf, "%d\n", f.Uint())
   245  		case reflect.String:
   246  			fmt.Fprintf(buf, "%s\n", f.String())
   247  		default:
   248  			panic("not reached")
   249  		}
   250  	}
   251  }
   252  
   253  func (p *Properties) String() string {
   254  	var buf bytes.Buffer
   255  	v := reflect.ValueOf(*p)
   256  	writeProperties(p.Loaded, v, &buf)
   257  
   258  	// Write the UserProperties.
   259  	keys := make([]string, 0, len(p.UserProperties))
   260  	for key := range p.UserProperties {
   261  		keys = append(keys, key)
   262  	}
   263  	sort.Strings(keys)
   264  	for _, key := range keys {
   265  		fmt.Fprintf(&buf, "%s: %s\n", key, p.UserProperties[key])
   266  	}
   267  	return buf.String()
   268  }
   269  
   270  func (p *Properties) load(
   271  	b block, blockOffset uint64, deniedUserProperties map[string]struct{},
   272  ) error {
   273  	i, err := newRawBlockIter(bytes.Compare, b)
   274  	if err != nil {
   275  		return err
   276  	}
   277  	p.Loaded = make(map[uintptr]struct{})
   278  	v := reflect.ValueOf(p).Elem()
   279  
   280  	for valid := i.First(); valid; valid = i.Next() {
   281  		if f, ok := propTagMap[string(i.Key().UserKey)]; ok {
   282  			p.Loaded[f.Offset] = struct{}{}
   283  			field := v.FieldByIndex(f.Index)
   284  			switch f.Type.Kind() {
   285  			case reflect.Bool:
   286  				field.SetBool(bytes.Equal(i.Value(), propBoolTrue))
   287  			case reflect.Uint32:
   288  				field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value())))
   289  			case reflect.Uint64:
   290  				var n uint64
   291  				if string(i.Key().UserKey) == propGlobalSeqnumName {
   292  					n = binary.LittleEndian.Uint64(i.Value())
   293  				} else {
   294  					n, _ = binary.Uvarint(i.Value())
   295  				}
   296  				field.SetUint(n)
   297  			case reflect.String:
   298  				field.SetString(intern.Bytes(i.Value()))
   299  			default:
   300  				panic("not reached")
   301  			}
   302  			continue
   303  		}
   304  		if p.UserProperties == nil {
   305  			p.UserProperties = make(map[string]string)
   306  		}
   307  
   308  		if _, denied := deniedUserProperties[string(i.Key().UserKey)]; !denied {
   309  			p.UserProperties[intern.Bytes(i.Key().UserKey)] = string(i.Value())
   310  		}
   311  	}
   312  	return nil
   313  }
   314  
   315  func (p *Properties) saveBool(m map[string][]byte, offset uintptr, value bool) {
   316  	tag := propOffsetTagMap[offset]
   317  	if value {
   318  		m[tag] = propBoolTrue
   319  	} else {
   320  		m[tag] = propBoolFalse
   321  	}
   322  }
   323  
   324  func (p *Properties) saveUint32(m map[string][]byte, offset uintptr, value uint32) {
   325  	var buf [4]byte
   326  	binary.LittleEndian.PutUint32(buf[:], value)
   327  	m[propOffsetTagMap[offset]] = buf[:]
   328  }
   329  
   330  func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint64) {
   331  	var buf [8]byte
   332  	binary.LittleEndian.PutUint64(buf[:], value)
   333  	m[propOffsetTagMap[offset]] = buf[:]
   334  }
   335  
   336  func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) {
   337  	var buf [10]byte
   338  	n := binary.PutUvarint(buf[:], value)
   339  	m[propOffsetTagMap[offset]] = buf[:n]
   340  }
   341  
   342  func (p *Properties) saveString(m map[string][]byte, offset uintptr, value string) {
   343  	m[propOffsetTagMap[offset]] = []byte(value)
   344  }
   345  
// save serializes the properties into w for a table of format tblFormat.
//
// All entries are first collected in a map keyed by property name and then
// added to w in sorted key order. The user properties are inserted first, so
// a built-in property written afterwards under the same name replaces the
// user-supplied value.
//
// Some properties are always written; others are only written when non-zero
// or non-empty, or when a related property is set (see the individual
// conditions below). uint64 properties are encoded as uvarints, except
// GlobalSeqNum which is a fixed-width little-endian uint64. uint32 properties
// are fixed-width little-endian.
func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) {
	m := make(map[string][]byte)
	for k, v := range p.UserProperties {
		m[k] = []byte(v)
	}

	if p.ComparerName != "" {
		p.saveString(m, unsafe.Offsetof(p.ComparerName), p.ComparerName)
	}
	if p.CompressionName != "" {
		p.saveString(m, unsafe.Offsetof(p.CompressionName), p.CompressionName)
	}
	if p.CompressionOptions != "" {
		p.saveString(m, unsafe.Offsetof(p.CompressionOptions), p.CompressionOptions)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize)
	// The global sequence number is only written together with the external
	// format version.
	if p.ExternalFormatVersion != 0 {
		p.saveUint32(m, unsafe.Offsetof(p.ExternalFormatVersion), p.ExternalFormatVersion)
		p.saveUint64(m, unsafe.Offsetof(p.GlobalSeqNum), p.GlobalSeqNum)
	}
	if p.FilterPolicyName != "" {
		p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.FilterSize), p.FilterSize)
	// The top-level index size is only written when there are index
	// partitions.
	if p.IndexPartitions != 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.IndexPartitions), p.IndexPartitions)
		p.saveUvarint(m, unsafe.Offsetof(p.TopLevelIndexSize), p.TopLevelIndexSize)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize)
	p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType)
	if p.IsStrictObsolete {
		p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete)
	}
	if p.MergerName != "" {
		p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.NumDataBlocks), p.NumDataBlocks)
	p.saveUvarint(m, unsafe.Offsetof(p.NumEntries), p.NumEntries)
	p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions)
	if p.NumSizedDeletions > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumSizedDeletions), p.NumSizedDeletions)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands)
	p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions)
	// NB: We only write out some properties for Pebble formats. This isn't
	// strictly necessary because unrecognized properties are interpreted as
	// user-defined properties, however writing them prevents byte-for-byte
	// equivalence with RocksDB files that some of our testing requires.
	if p.RawPointTombstoneKeySize > 0 && tblFormat >= TableFormatPebblev1 {
		p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneKeySize), p.RawPointTombstoneKeySize)
	}
	if p.RawPointTombstoneValueSize > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneValueSize), p.RawPointTombstoneValueSize)
	}
	// All range key properties are written as a group as soon as the table
	// contains at least one range key of any kind.
	if p.NumRangeKeys() > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels)
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets)
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets)
		p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize)
		p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize)
	}
	if p.NumValueBlocks > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumValueBlocks), p.NumValueBlocks)
	}
	if p.NumValuesInValueBlocks > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInValueBlocks), p.NumValuesInValueBlocks)
	}
	if p.PrefixExtractorName != "" {
		p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName)
	}
	p.saveBool(m, unsafe.Offsetof(p.PrefixFiltering), p.PrefixFiltering)
	if p.PropertyCollectorNames != "" {
		p.saveString(m, unsafe.Offsetof(p.PropertyCollectorNames), p.PropertyCollectorNames)
	}
	// The snapshot-pinned sizes are only written when there is at least one
	// snapshot-pinned key.
	if p.SnapshotPinnedKeys > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeys), p.SnapshotPinnedKeys)
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeySize), p.SnapshotPinnedKeySize)
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedValueSize), p.SnapshotPinnedValueSize)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.RawKeySize), p.RawKeySize)
	p.saveUvarint(m, unsafe.Offsetof(p.RawValueSize), p.RawValueSize)
	if p.ValueBlocksSize > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize)
	}
	p.saveBool(m, unsafe.Offsetof(p.WholeKeyFiltering), p.WholeKeyFiltering)

	// For table formats older than TableFormatPebblev1, additionally write a
	// fixed set of rocksdb.* properties with constant values. These
	// assignments come last and therefore replace any same-named entry
	// already in m.
	if tblFormat < TableFormatPebblev1 {
		m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32)
		m["rocksdb.fixed.key.length"] = []byte{0x00}
		m["rocksdb.index.key.is.user.key"] = []byte{0x00}
		m["rocksdb.index.value.is.delta.encoded"] = []byte{0x00}
		m["rocksdb.oldest.key.time"] = []byte{0x00}
		m["rocksdb.creation.time"] = []byte{0x00}
		m["rocksdb.format.version"] = []byte{0x00}
	}

	// Add the entries to the block in sorted key order.
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		w.add(InternalKey{UserKey: []byte(key)}, m[key])
	}
}