github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/properties.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"reflect"
	"sort"
	"unsafe"

	"github.com/cockroachdb/pebble/internal/intern"
)

const propertiesBlockRestartInterval = math.MaxInt32
const propGlobalSeqnumName = "rocksdb.external_sst_file.global_seqno"

var propTagMap = make(map[string]reflect.StructField)
var propBoolTrue = []byte{'1'}
var propBoolFalse = []byte{'0'}

var propOffsetTagMap = make(map[uintptr]string)

func generateTagMaps(t reflect.Type, indexPrefix []int) {
	for i := 0; i < t.NumField(); i++ {
		f := t.Field(i)
		if f.Type.Kind() == reflect.Struct {
			if tag := f.Tag.Get("prop"); i == 0 && tag == "pebble.embbeded_common_properties" {
				// CommonProperties struct embedded in Properties. Note that since
				// CommonProperties is placed at the top of properties we can use
				// the offsets of the fields within CommonProperties to determine
				// the offsets of those fields within Properties.
				generateTagMaps(f.Type, []int{i})
				continue
			}
			panic("pebble: unknown struct type in Properties")
		}
		if tag := f.Tag.Get("prop"); tag != "" {
			switch f.Type.Kind() {
			case reflect.Bool:
			case reflect.Uint32:
			case reflect.Uint64:
			case reflect.String:
			default:
				panic(fmt.Sprintf("unsupported property field type: %s %s", f.Name, f.Type))
			}
			if len(indexPrefix) > 0 {
				// Prepend the index prefix so that we can use FieldByIndex on the
				// top-level struct.
				f.Index = append(indexPrefix[:len(indexPrefix):len(indexPrefix)], f.Index...)
			}
			propTagMap[tag] = f
			propOffsetTagMap[f.Offset] = tag
		}
	}
}

func init() {
	generateTagMaps(reflect.TypeOf(Properties{}), nil)
}
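
// For illustration (assuming the structs defined below): after init runs, a
// serialized property name can be resolved to its struct field, and a field
// offset can be resolved back to its property name:
//
//	f := propTagMap["rocksdb.num.entries"] // StructField for NumEntries
//	_ = propOffsetTagMap[f.Offset]         // "rocksdb.num.entries"
//
// load relies on the tag-to-field map, while the save helpers key off
// unsafe.Offsetof via the offset-to-tag map.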

// CommonProperties holds properties for either a virtual or a physical sstable.
// This can be used by code which doesn't care to make the distinction between
// physical and virtual sstable properties.
//
// For virtual sstables, fields are constructed through extrapolation upon
// virtual reader construction. See MakeVirtualReader for implementation
// details.
//
// NB: The values of these properties can affect correctness. For example,
// if NumRangeKeySets == 0, but the sstable actually contains range keys, then
// the iterators will behave incorrectly.
type CommonProperties struct {
	// The number of entries in this table.
	NumEntries uint64 `prop:"rocksdb.num.entries"`
	// Total raw key size.
	RawKeySize uint64 `prop:"rocksdb.raw.key.size"`
	// Total raw value size.
	RawValueSize uint64 `prop:"rocksdb.raw.value.size"`
	// Total raw key size of point deletion tombstones. This value is comparable
	// to RawKeySize.
	RawPointTombstoneKeySize uint64 `prop:"pebble.raw.point-tombstone.key.size"`
	// Sum of the raw value sizes carried by point deletion tombstones
	// containing size estimates. See the DeleteSized key kind. This value is
	// comparable to Raw{Key,Value}Size.
	RawPointTombstoneValueSize uint64 `prop:"pebble.raw.point-tombstone.value.size"`
	// The number of point deletion entries ("tombstones") in this table that
	// carry a size hint indicating the size of the value the tombstone deletes.
	NumSizedDeletions uint64 `prop:"pebble.num.deletions.sized"`
	// The number of deletion entries in this table, including both point and
	// range deletions.
	NumDeletions uint64 `prop:"rocksdb.deleted.keys"`
	// The number of range deletions in this table.
	NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"`
	// The number of RANGEKEYDELs in this table.
	NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"`
	// The number of RANGEKEYSETs in this table.
	NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"`
	// Total size of value blocks and value index block. Only serialized if > 0.
	ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"`
}

// String is only used for testing purposes.
func (c *CommonProperties) String() string {
	var buf bytes.Buffer
	v := reflect.ValueOf(*c)
	loaded := make(map[uintptr]struct{})
	writeProperties(loaded, v, &buf)
	return buf.String()
}

// NumPointDeletions is the number of point deletions in the sstable. For
// virtual sstables, this is an estimate.
func (c *CommonProperties) NumPointDeletions() uint64 {
	return c.NumDeletions - c.NumRangeDeletions
}

// Properties holds the sstable property values. The properties are
// automatically populated during sstable creation and loaded from the
// properties meta block when an sstable is opened.
type Properties struct {
	// CommonProperties needs to be at the top of the Properties struct so that
	// the offsets of the fields in CommonProperties match the offsets of the
	// embedded fields of CommonProperties in Properties.
	CommonProperties `prop:"pebble.embbeded_common_properties"`

	// The name of the comparer used in this table.
	ComparerName string `prop:"rocksdb.comparator"`
	// The compression algorithm used to compress blocks.
	CompressionName string `prop:"rocksdb.compression"`
	// The compression options used to compress blocks.
	CompressionOptions string `prop:"rocksdb.compression_options"`
	// The total size of all data blocks.
	DataSize uint64 `prop:"rocksdb.data.size"`
	// The external sstable version format. Version 2 is the one RocksDB has been
	// using since 5.13. RocksDB only uses the global sequence number for an
	// sstable if this property has been set.
	ExternalFormatVersion uint32 `prop:"rocksdb.external_sst_file.version"`
	// The name of the filter policy used in this table. Empty if no filter
	// policy is used.
	FilterPolicyName string `prop:"rocksdb.filter.policy"`
	// The size of filter block.
	FilterSize uint64 `prop:"rocksdb.filter.size"`
	// The global sequence number to use for all entries in the table. Present if
	// the table was created externally and ingested whole.
	GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"`
	// Total number of index partitions if kTwoLevelIndexSearch is used.
	IndexPartitions uint64 `prop:"rocksdb.index.partitions"`
	// The size of index block.
	IndexSize uint64 `prop:"rocksdb.index.size"`
	// The index type. TODO(peter): add a more detailed description.
	IndexType uint32 `prop:"rocksdb.block.based.table.index.type"`
	// For formats >= TableFormatPebblev4, this is set to true if the obsolete
	// bit is strict for all the point keys.
	IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"`
	// The name of the merger used in this table. Empty if no merger is used.
	MergerName string `prop:"rocksdb.merge.operator"`
	// The number of data blocks in this table.
	NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"`
	// The number of merge operands in the table.
	NumMergeOperands uint64 `prop:"rocksdb.merge.operands"`
	// The number of RANGEKEYUNSETs in this table.
	NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"`
	// The number of value blocks in this table. Only serialized if > 0.
	NumValueBlocks uint64 `prop:"pebble.num.value-blocks"`
	// The number of values stored in value blocks. Only serialized if > 0.
	NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"`
	// The name of the prefix extractor used in this table. Empty if no prefix
	// extractor is used.
	PrefixExtractorName string `prop:"rocksdb.prefix.extractor.name"`
	// If filtering is enabled, was the filter created on the key prefix.
	PrefixFiltering bool `prop:"rocksdb.block.based.table.prefix.filtering"`
	// A comma separated list of names of the property collectors used in this
	// table.
	PropertyCollectorNames string `prop:"rocksdb.property.collectors"`
	// Total raw rangekey key size.
	RawRangeKeyKeySize uint64 `prop:"pebble.raw.range-key.key.size"`
	// Total raw rangekey value size.
	RawRangeKeyValueSize uint64 `prop:"pebble.raw.range-key.value.size"`
	// The total number of keys in this table that were pinned by open snapshots.
	SnapshotPinnedKeys uint64 `prop:"pebble.num.snapshot-pinned-keys"`
	// The cumulative bytes of keys in this table that were pinned by
	// open snapshots. This value is comparable to RawKeySize.
	SnapshotPinnedKeySize uint64 `prop:"pebble.raw.snapshot-pinned-keys.size"`
	// The cumulative bytes of values in this table that were pinned by
	// open snapshots. This value is comparable to RawValueSize.
	SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"`
	// Size of the top-level index if kTwoLevelIndexSearch is used.
	TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"`
	// User collected properties.
	UserProperties map[string]string
	// If filtering is enabled, was the filter created on the whole key.
	WholeKeyFiltering bool `prop:"rocksdb.block.based.table.whole.key.filtering"`

	// Loaded is a set indicating which fields have been loaded from disk.
	// Indexed by the field's byte offset within the struct
	// (reflect.StructField.Offset). Only set if the properties have been loaded
	// from a file. Only exported for testing purposes.
	Loaded map[uintptr]struct{}
}

// NumPointDeletions returns the number of point deletions in this table.
func (p *Properties) NumPointDeletions() uint64 {
	return p.NumDeletions - p.NumRangeDeletions
}

// NumRangeKeys returns a count of the number of range keys in this table.
func (p *Properties) NumRangeKeys() uint64 {
	return p.NumRangeKeyDels + p.NumRangeKeySets + p.NumRangeKeyUnsets
}
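
// For illustration (hypothetical values): writeProperties below renders each
// non-zero (or explicitly loaded) tagged field one per line, in struct field
// order, as "<tag>: <value>". A table with two entries, 24 bytes of raw key
// data, and one deletion would print roughly:
//
//	rocksdb.num.entries: 2
//	rocksdb.raw.key.size: 24
//	rocksdb.deleted.keys: 1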

func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Buffer) {
	vt := v.Type()
	for i := 0; i < v.NumField(); i++ {
		ft := vt.Field(i)
		if ft.Type.Kind() == reflect.Struct {
			// Embedded struct within the properties.
			writeProperties(loaded, v.Field(i), buf)
			continue
		}
		tag := ft.Tag.Get("prop")
		if tag == "" {
			continue
		}

		f := v.Field(i)
		// TODO(peter): Use f.IsZero() when we can rely on go1.13.
		if zero := reflect.Zero(f.Type()); zero.Interface() == f.Interface() {
			// Skip printing of zero values which were not loaded from disk.
			if _, ok := loaded[ft.Offset]; !ok {
				continue
			}
		}

		fmt.Fprintf(buf, "%s: ", tag)
		switch ft.Type.Kind() {
		case reflect.Bool:
			fmt.Fprintf(buf, "%t\n", f.Bool())
		case reflect.Uint32:
			fmt.Fprintf(buf, "%d\n", f.Uint())
		case reflect.Uint64:
			fmt.Fprintf(buf, "%d\n", f.Uint())
		case reflect.String:
			fmt.Fprintf(buf, "%s\n", f.String())
		default:
			panic("not reached")
		}
	}
}

func (p *Properties) String() string {
	var buf bytes.Buffer
	v := reflect.ValueOf(*p)
	writeProperties(p.Loaded, v, &buf)

	// Write the UserProperties.
	keys := make([]string, 0, len(p.UserProperties))
	for key := range p.UserProperties {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		fmt.Fprintf(&buf, "%s: %s\n", key, p.UserProperties[key])
	}
	return buf.String()
}

func (p *Properties) load(
	b block, blockOffset uint64, deniedUserProperties map[string]struct{},
) error {
	i, err := newRawBlockIter(bytes.Compare, b)
	if err != nil {
		return err
	}
	p.Loaded = make(map[uintptr]struct{})
	v := reflect.ValueOf(p).Elem()

	for valid := i.First(); valid; valid = i.Next() {
		if f, ok := propTagMap[string(i.Key().UserKey)]; ok {
			p.Loaded[f.Offset] = struct{}{}
			field := v.FieldByIndex(f.Index)
			switch f.Type.Kind() {
			case reflect.Bool:
				field.SetBool(bytes.Equal(i.Value(), propBoolTrue))
			case reflect.Uint32:
				field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value())))
			case reflect.Uint64:
				var n uint64
				if string(i.Key().UserKey) == propGlobalSeqnumName {
					n = binary.LittleEndian.Uint64(i.Value())
				} else {
					n, _ = binary.Uvarint(i.Value())
				}
				field.SetUint(n)
			case reflect.String:
				field.SetString(intern.Bytes(i.Value()))
			default:
				panic("not reached")
			}
			continue
		}
		if p.UserProperties == nil {
			p.UserProperties = make(map[string]string)
		}

		if _, denied := deniedUserProperties[string(i.Key().UserKey)]; !denied {
			p.UserProperties[intern.Bytes(i.Key().UserKey)] = string(i.Value())
		}
	}
	return nil
}
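
// Encoding note (informal summary of load above and the save helpers below):
// the properties block stores raw bytes keyed by property name. Booleans are
// the single byte '1' or '0', strings are stored verbatim, uint32 fields and
// the global sequence number use fixed-width little-endian encoding, and all
// other integer properties are varint-encoded. For example, a hypothetical
// rocksdb.num.entries value of 300 is stored as the uvarint bytes 0xac 0x02.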

func (p *Properties) saveBool(m map[string][]byte, offset uintptr, value bool) {
	tag := propOffsetTagMap[offset]
	if value {
		m[tag] = propBoolTrue
	} else {
		m[tag] = propBoolFalse
	}
}

func (p *Properties) saveUint32(m map[string][]byte, offset uintptr, value uint32) {
	var buf [4]byte
	binary.LittleEndian.PutUint32(buf[:], value)
	m[propOffsetTagMap[offset]] = buf[:]
}

func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint64) {
	var buf [8]byte
	binary.LittleEndian.PutUint64(buf[:], value)
	m[propOffsetTagMap[offset]] = buf[:]
}

func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) {
	var buf [10]byte
	n := binary.PutUvarint(buf[:], value)
	m[propOffsetTagMap[offset]] = buf[:n]
}

func (p *Properties) saveString(m map[string][]byte, offset uintptr, value string) {
	m[propOffsetTagMap[offset]] = []byte(value)
}

func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) {
	m := make(map[string][]byte)
	for k, v := range p.UserProperties {
		m[k] = []byte(v)
	}

	if p.ComparerName != "" {
		p.saveString(m, unsafe.Offsetof(p.ComparerName), p.ComparerName)
	}
	if p.CompressionName != "" {
		p.saveString(m, unsafe.Offsetof(p.CompressionName), p.CompressionName)
	}
	if p.CompressionOptions != "" {
		p.saveString(m, unsafe.Offsetof(p.CompressionOptions), p.CompressionOptions)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize)
	if p.ExternalFormatVersion != 0 {
		p.saveUint32(m, unsafe.Offsetof(p.ExternalFormatVersion), p.ExternalFormatVersion)
		p.saveUint64(m, unsafe.Offsetof(p.GlobalSeqNum), p.GlobalSeqNum)
	}
	if p.FilterPolicyName != "" {
		p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.FilterSize), p.FilterSize)
	if p.IndexPartitions != 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.IndexPartitions), p.IndexPartitions)
		p.saveUvarint(m, unsafe.Offsetof(p.TopLevelIndexSize), p.TopLevelIndexSize)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize)
	p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType)
	if p.IsStrictObsolete {
		p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete)
	}
	if p.MergerName != "" {
		p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.NumDataBlocks), p.NumDataBlocks)
	p.saveUvarint(m, unsafe.Offsetof(p.NumEntries), p.NumEntries)
	p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions)
	if p.NumSizedDeletions > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumSizedDeletions), p.NumSizedDeletions)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands)
	p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions)
	// NB: We only write out some properties for Pebble formats. This isn't
	// strictly necessary because unrecognized properties are interpreted as
	// user-defined properties, however writing them prevents byte-for-byte
	// equivalence with RocksDB files that some of our testing requires.
	if p.RawPointTombstoneKeySize > 0 && tblFormat >= TableFormatPebblev1 {
		p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneKeySize), p.RawPointTombstoneKeySize)
	}
	if p.RawPointTombstoneValueSize > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneValueSize), p.RawPointTombstoneValueSize)
	}
	if p.NumRangeKeys() > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels)
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets)
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets)
		p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize)
		p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize)
	}
	if p.NumValueBlocks > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumValueBlocks), p.NumValueBlocks)
	}
	if p.NumValuesInValueBlocks > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInValueBlocks), p.NumValuesInValueBlocks)
	}
	if p.PrefixExtractorName != "" {
		p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName)
	}
	p.saveBool(m, unsafe.Offsetof(p.PrefixFiltering), p.PrefixFiltering)
	if p.PropertyCollectorNames != "" {
		p.saveString(m, unsafe.Offsetof(p.PropertyCollectorNames), p.PropertyCollectorNames)
	}
	if p.SnapshotPinnedKeys > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeys), p.SnapshotPinnedKeys)
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeySize), p.SnapshotPinnedKeySize)
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedValueSize), p.SnapshotPinnedValueSize)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.RawKeySize), p.RawKeySize)
	p.saveUvarint(m, unsafe.Offsetof(p.RawValueSize), p.RawValueSize)
	if p.ValueBlocksSize > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize)
	}
	p.saveBool(m, unsafe.Offsetof(p.WholeKeyFiltering), p.WholeKeyFiltering)

	if tblFormat < TableFormatPebblev1 {
		m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32)
		m["rocksdb.fixed.key.length"] = []byte{0x00}
		m["rocksdb.index.key.is.user.key"] = []byte{0x00}
		m["rocksdb.index.value.is.delta.encoded"] = []byte{0x00}
		m["rocksdb.oldest.key.time"] = []byte{0x00}
		m["rocksdb.creation.time"] = []byte{0x00}
		m["rocksdb.format.version"] = []byte{0x00}
	}

	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		w.add(InternalKey{UserKey: []byte(key)}, m[key])
	}
}
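
// propertyByTag is an illustrative sketch: a hypothetical helper, unused by
// the rest of the package, showing how the reflection tag map built in init
// resolves a serialized property name to the corresponding Properties field.
// This is the same lookup load performs for each key it reads from the
// properties block.
func propertyByTag(p *Properties, tag string) (interface{}, bool) {
	f, ok := propTagMap[tag]
	if !ok {
		// Unknown tags are treated as user-defined properties by load.
		return nil, false
	}
	// FieldByIndex handles fields promoted from the embedded CommonProperties.
	return reflect.ValueOf(p).Elem().FieldByIndex(f.Index).Interface(), true
}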