github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/layout.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"cmp"
    10  	"context"
    11  	"encoding/binary"
    12  	"fmt"
    13  	"io"
    14  	"slices"
    15  	"sort"
    16  	"unsafe"
    17  
    18  	"github.com/cockroachdb/pebble/internal/base"
    19  )
    20  
// Layout describes the block organization of an sstable.
//
// Each field records where one kind of block lives in the file, as a
// BlockHandle (the Offset and Length of the block). Slice fields hold one
// handle per block of that kind; the remaining handle fields hold a single
// handle. Describe treats a single-handle field whose Length is zero as
// "block not present" and omits it from its output.
//
// The names Describe prints for each field are:
//
//	Data       "data"
//	Index      "index"
//	TopIndex   "top-index"
//	Filter     "filter"
//	RangeDel   "range-del"
//	RangeKey   "range-key"
//	ValueBlock "value-block"
//	ValueIndex "value-index"
//	Properties "properties"
//	MetaIndex  "meta-index"
//	Footer     "footer", or "leveldb-footer" when Footer.Length equals
//	           levelDBFooterLen
//
// Format is the table format of the sstable. Describe compares it against
// TableFormatPebblev3 to decide how the values stored in data blocks are to
// be interpreted.
type Layout struct {
	// NOTE: changes to fields in this struct should also be reflected in
	// ValidateBlockChecksums, which validates a static list of BlockHandles
	// referenced in this struct.

	Data       []BlockHandleWithProperties
	Index      []BlockHandle
	TopIndex   BlockHandle
	Filter     BlockHandle
	RangeDel   BlockHandle
	RangeKey   BlockHandle
	ValueBlock []BlockHandle
	ValueIndex BlockHandle
	Properties BlockHandle
	MetaIndex  BlockHandle
	Footer     BlockHandle
	Format     TableFormat
}
    40  
    41  // Describe returns a description of the layout. If the verbose parameter is
    42  // true, details of the structure of each block are returned as well.
    43  func (l *Layout) Describe(
    44  	w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte),
    45  ) {
    46  	ctx := context.TODO()
    47  	type block struct {
    48  		BlockHandle
    49  		name string
    50  	}
    51  	var blocks []block
    52  
    53  	for i := range l.Data {
    54  		blocks = append(blocks, block{l.Data[i].BlockHandle, "data"})
    55  	}
    56  	for i := range l.Index {
    57  		blocks = append(blocks, block{l.Index[i], "index"})
    58  	}
    59  	if l.TopIndex.Length != 0 {
    60  		blocks = append(blocks, block{l.TopIndex, "top-index"})
    61  	}
    62  	if l.Filter.Length != 0 {
    63  		blocks = append(blocks, block{l.Filter, "filter"})
    64  	}
    65  	if l.RangeDel.Length != 0 {
    66  		blocks = append(blocks, block{l.RangeDel, "range-del"})
    67  	}
    68  	if l.RangeKey.Length != 0 {
    69  		blocks = append(blocks, block{l.RangeKey, "range-key"})
    70  	}
    71  	for i := range l.ValueBlock {
    72  		blocks = append(blocks, block{l.ValueBlock[i], "value-block"})
    73  	}
    74  	if l.ValueIndex.Length != 0 {
    75  		blocks = append(blocks, block{l.ValueIndex, "value-index"})
    76  	}
    77  	if l.Properties.Length != 0 {
    78  		blocks = append(blocks, block{l.Properties, "properties"})
    79  	}
    80  	if l.MetaIndex.Length != 0 {
    81  		blocks = append(blocks, block{l.MetaIndex, "meta-index"})
    82  	}
    83  	if l.Footer.Length != 0 {
    84  		if l.Footer.Length == levelDBFooterLen {
    85  			blocks = append(blocks, block{l.Footer, "leveldb-footer"})
    86  		} else {
    87  			blocks = append(blocks, block{l.Footer, "footer"})
    88  		}
    89  	}
    90  
    91  	slices.SortFunc(blocks, func(a, b block) int {
    92  		return cmp.Compare(a.Offset, b.Offset)
    93  	})
    94  	for i := range blocks {
    95  		b := &blocks[i]
    96  		fmt.Fprintf(w, "%10d  %s (%d)\n", b.Offset, b.name, b.Length)
    97  
    98  		if !verbose {
    99  			continue
   100  		}
   101  		if b.name == "filter" {
   102  			continue
   103  		}
   104  
   105  		if b.name == "footer" || b.name == "leveldb-footer" {
   106  			trailer, offset := make([]byte, b.Length), b.Offset
   107  			_ = r.readable.ReadAt(ctx, trailer, int64(offset))
   108  
   109  			if b.name == "footer" {
   110  				checksumType := ChecksumType(trailer[0])
   111  				fmt.Fprintf(w, "%10d    checksum type: %s\n", offset, checksumType)
   112  				trailer, offset = trailer[1:], offset+1
   113  			}
   114  
   115  			metaHandle, n := binary.Uvarint(trailer)
   116  			metaLen, m := binary.Uvarint(trailer[n:])
   117  			fmt.Fprintf(w, "%10d    meta: offset=%d, length=%d\n", offset, metaHandle, metaLen)
   118  			trailer, offset = trailer[n+m:], offset+uint64(n+m)
   119  
   120  			indexHandle, n := binary.Uvarint(trailer)
   121  			indexLen, m := binary.Uvarint(trailer[n:])
   122  			fmt.Fprintf(w, "%10d    index: offset=%d, length=%d\n", offset, indexHandle, indexLen)
   123  			trailer, offset = trailer[n+m:], offset+uint64(n+m)
   124  
   125  			fmt.Fprintf(w, "%10d    [padding]\n", offset)
   126  
   127  			trailing := 12
   128  			if b.name == "leveldb-footer" {
   129  				trailing = 8
   130  			}
   131  
   132  			offset += uint64(len(trailer) - trailing)
   133  			trailer = trailer[len(trailer)-trailing:]
   134  
   135  			if b.name == "footer" {
   136  				version := trailer[:4]
   137  				fmt.Fprintf(w, "%10d    version: %d\n", offset, binary.LittleEndian.Uint32(version))
   138  				trailer, offset = trailer[4:], offset+4
   139  			}
   140  
   141  			magicNumber := trailer
   142  			fmt.Fprintf(w, "%10d    magic number: 0x%x\n", offset, magicNumber)
   143  
   144  			continue
   145  		}
   146  
   147  		h, err := r.readBlock(
   148  			context.Background(), b.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
   149  		if err != nil {
   150  			fmt.Fprintf(w, "  [err: %s]\n", err)
   151  			continue
   152  		}
   153  
   154  		getRestart := func(data []byte, restarts, i int32) int32 {
   155  			return decodeRestart(data[restarts+4*i:])
   156  		}
   157  
   158  		formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) {
   159  			i := sort.Search(int(numRestarts), func(i int) bool {
   160  				return getRestart(data, restarts, int32(i)) >= offset
   161  			})
   162  			if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset {
   163  				fmt.Fprintf(w, " [restart]\n")
   164  			} else {
   165  				fmt.Fprintf(w, "\n")
   166  			}
   167  		}
   168  
   169  		formatRestarts := func(data []byte, restarts, numRestarts int32) {
   170  			for i := int32(0); i < numRestarts; i++ {
   171  				offset := getRestart(data, restarts, i)
   172  				fmt.Fprintf(w, "%10d    [restart %d]\n",
   173  					b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset))
   174  			}
   175  		}
   176  
   177  		formatTrailer := func() {
   178  			trailer := make([]byte, blockTrailerLen)
   179  			offset := int64(b.Offset + b.Length)
   180  			_ = r.readable.ReadAt(ctx, trailer, offset)
   181  			bt := blockType(trailer[0])
   182  			checksum := binary.LittleEndian.Uint32(trailer[1:])
   183  			fmt.Fprintf(w, "%10d    [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum)
   184  		}
   185  
   186  		var lastKey InternalKey
   187  		switch b.name {
   188  		case "data", "range-del", "range-key":
   189  			iter, _ := newBlockIter(r.Compare, h.Get())
   190  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   191  				ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset))
   192  				shared, ptr := decodeVarint(ptr)
   193  				unshared, ptr := decodeVarint(ptr)
   194  				value2, _ := decodeVarint(ptr)
   195  
   196  				total := iter.nextOffset - iter.offset
   197  				// The format of the numbers in the record line is:
   198  				//
   199  				//   (<total> = <length> [<shared>] + <unshared> + <value>)
   200  				//
   201  				// <total>    is the total number of bytes for the record.
   202  				// <length>   is the size of the 3 varint encoded integers for <shared>,
   203  				//            <unshared>, and <value>.
   204  				// <shared>   is the number of key bytes shared with the previous key.
   205  				// <unshared> is the number of unshared key bytes.
   206  				// <value>    is the number of value bytes.
   207  				fmt.Fprintf(w, "%10d    record (%d = %d [%d] + %d + %d)",
   208  					b.Offset+uint64(iter.offset), total,
   209  					total-int32(unshared+value2), shared, unshared, value2)
   210  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   211  				if fmtRecord != nil {
   212  					fmt.Fprintf(w, "              ")
   213  					if l.Format < TableFormatPebblev3 {
   214  						fmtRecord(key, value.InPlaceValue())
   215  					} else {
   216  						// InPlaceValue() will succeed even for data blocks where the
   217  						// actual value is in a different location, since this value was
   218  						// fetched from a blockIter which does not know about value
   219  						// blocks.
   220  						v := value.InPlaceValue()
   221  						if base.TrailerKind(key.Trailer) != InternalKeyKindSet {
   222  							fmtRecord(key, v)
   223  						} else if !isValueHandle(valuePrefix(v[0])) {
   224  							fmtRecord(key, v[1:])
   225  						} else {
   226  							vh := decodeValueHandle(v[1:])
   227  							fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh)))
   228  						}
   229  					}
   230  				}
   231  
   232  				if base.InternalCompare(r.Compare, lastKey, *key) >= 0 {
   233  					fmt.Fprintf(w, "              WARNING: OUT OF ORDER KEYS!\n")
   234  				}
   235  				lastKey.Trailer = key.Trailer
   236  				lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...)
   237  			}
   238  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   239  			formatTrailer()
   240  		case "index", "top-index":
   241  			iter, _ := newBlockIter(r.Compare, h.Get())
   242  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   243  				bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   244  				if err != nil {
   245  					fmt.Fprintf(w, "%10d    [err: %s]\n", b.Offset+uint64(iter.offset), err)
   246  					continue
   247  				}
   248  				fmt.Fprintf(w, "%10d    block:%d/%d",
   249  					b.Offset+uint64(iter.offset), bh.Offset, bh.Length)
   250  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   251  			}
   252  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   253  			formatTrailer()
   254  		case "properties":
   255  			iter, _ := newRawBlockIter(r.Compare, h.Get())
   256  			for valid := iter.First(); valid; valid = iter.Next() {
   257  				fmt.Fprintf(w, "%10d    %s (%d)",
   258  					b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset)
   259  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   260  			}
   261  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   262  			formatTrailer()
   263  		case "meta-index":
   264  			iter, _ := newRawBlockIter(r.Compare, h.Get())
   265  			for valid := iter.First(); valid; valid = iter.Next() {
   266  				value := iter.Value()
   267  				var bh BlockHandle
   268  				var n int
   269  				var vbih valueBlocksIndexHandle
   270  				isValueBlocksIndexHandle := false
   271  				if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) {
   272  					vbih, n, err = decodeValueBlocksIndexHandle(value)
   273  					bh = vbih.h
   274  					isValueBlocksIndexHandle = true
   275  				} else {
   276  					bh, n = decodeBlockHandle(value)
   277  				}
   278  				if n == 0 || n != len(value) {
   279  					fmt.Fprintf(w, "%10d    [err: %s]\n", b.Offset+uint64(iter.offset), err)
   280  					continue
   281  				}
   282  				var vbihStr string
   283  				if isValueBlocksIndexHandle {
   284  					vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)",
   285  						vbih.blockNumByteLength, vbih.blockOffsetByteLength, vbih.blockLengthByteLength)
   286  				}
   287  				fmt.Fprintf(w, "%10d    %s block:%d/%d%s",
   288  					b.Offset+uint64(iter.offset), iter.Key().UserKey,
   289  					bh.Offset, bh.Length, vbihStr)
   290  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   291  			}
   292  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   293  			formatTrailer()
   294  		case "value-block":
   295  			// We don't peer into the value-block since it can't be interpreted
   296  			// without the valueHandles.
   297  		case "value-index":
   298  			// We have already read the value-index to construct the list of
   299  			// value-blocks, so no need to do it again.
   300  		}
   301  
   302  		h.Release()
   303  	}
   304  
   305  	last := blocks[len(blocks)-1]
   306  	fmt.Fprintf(w, "%10d  EOF\n", last.Offset+last.Length)
   307  }