github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/layout.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"io"
    13  	"sort"
    14  	"unsafe"
    15  
    16  	"github.com/cockroachdb/pebble/internal/base"
    17  )
    18  
// Layout describes the block organization of an sstable.
//
// Every field other than Format holds the handle(s) locating one kind of block
// in the file. For the single-handle fields, Describe treats a zero Length as
// "block not present" and leaves the block out of its output.
type Layout struct {
	// NOTE: changes to fields in this struct should also be reflected in
	// ValidateBlockChecksums, which validates a static list of BlockHandles
	// referenced in this struct.

	// Data holds one handle (with its properties) per block that Describe
	// reports as "data".
	Data       []BlockHandleWithProperties
	// Index holds one handle per block that Describe reports as "index".
	Index      []BlockHandle
	// TopIndex is the block Describe reports as "top-index".
	TopIndex   BlockHandle
	// Filter is the block Describe reports as "filter"; Describe never prints
	// its contents, even in verbose mode.
	Filter     BlockHandle
	// RangeDel is the block Describe reports as "range-del".
	RangeDel   BlockHandle
	// RangeKey is the block Describe reports as "range-key".
	RangeKey   BlockHandle
	// ValueBlock holds one handle per block that Describe reports as
	// "value-block".
	ValueBlock []BlockHandle
	// ValueIndex is the block Describe reports as "value-index".
	ValueIndex BlockHandle
	// Properties is the block Describe reports as "properties".
	Properties BlockHandle
	// MetaIndex is the block Describe reports as "meta-index".
	MetaIndex  BlockHandle
	// Footer is the table footer. Describe reports it as "leveldb-footer" when
	// its Length equals levelDBFooterLen and as "footer" otherwise.
	Footer     BlockHandle
	// Format is the table format. Describe compares it against
	// TableFormatPebblev3 to decide how the values stored in data blocks are
	// decoded before being handed to the record formatter.
	Format     TableFormat
}
    38  
    39  // Describe returns a description of the layout. If the verbose parameter is
    40  // true, details of the structure of each block are returned as well.
    41  func (l *Layout) Describe(
    42  	w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte),
    43  ) {
    44  	ctx := context.TODO()
    45  	type block struct {
    46  		BlockHandle
    47  		name string
    48  	}
    49  	var blocks []block
    50  
    51  	for i := range l.Data {
    52  		blocks = append(blocks, block{l.Data[i].BlockHandle, "data"})
    53  	}
    54  	for i := range l.Index {
    55  		blocks = append(blocks, block{l.Index[i], "index"})
    56  	}
    57  	if l.TopIndex.Length != 0 {
    58  		blocks = append(blocks, block{l.TopIndex, "top-index"})
    59  	}
    60  	if l.Filter.Length != 0 {
    61  		blocks = append(blocks, block{l.Filter, "filter"})
    62  	}
    63  	if l.RangeDel.Length != 0 {
    64  		blocks = append(blocks, block{l.RangeDel, "range-del"})
    65  	}
    66  	if l.RangeKey.Length != 0 {
    67  		blocks = append(blocks, block{l.RangeKey, "range-key"})
    68  	}
    69  	for i := range l.ValueBlock {
    70  		blocks = append(blocks, block{l.ValueBlock[i], "value-block"})
    71  	}
    72  	if l.ValueIndex.Length != 0 {
    73  		blocks = append(blocks, block{l.ValueIndex, "value-index"})
    74  	}
    75  	if l.Properties.Length != 0 {
    76  		blocks = append(blocks, block{l.Properties, "properties"})
    77  	}
    78  	if l.MetaIndex.Length != 0 {
    79  		blocks = append(blocks, block{l.MetaIndex, "meta-index"})
    80  	}
    81  	if l.Footer.Length != 0 {
    82  		if l.Footer.Length == levelDBFooterLen {
    83  			blocks = append(blocks, block{l.Footer, "leveldb-footer"})
    84  		} else {
    85  			blocks = append(blocks, block{l.Footer, "footer"})
    86  		}
    87  	}
    88  
    89  	sort.Slice(blocks, func(i, j int) bool {
    90  		return blocks[i].Offset < blocks[j].Offset
    91  	})
    92  
    93  	for i := range blocks {
    94  		b := &blocks[i]
    95  		fmt.Fprintf(w, "%10d  %s (%d)\n", b.Offset, b.name, b.Length)
    96  
    97  		if !verbose {
    98  			continue
    99  		}
   100  		if b.name == "filter" {
   101  			continue
   102  		}
   103  
   104  		if b.name == "footer" || b.name == "leveldb-footer" {
   105  			trailer, offset := make([]byte, b.Length), b.Offset
   106  			_ = r.readable.ReadAt(ctx, trailer, int64(offset))
   107  
   108  			if b.name == "footer" {
   109  				checksumType := ChecksumType(trailer[0])
   110  				fmt.Fprintf(w, "%10d    checksum type: %s\n", offset, checksumType)
   111  				trailer, offset = trailer[1:], offset+1
   112  			}
   113  
   114  			metaHandle, n := binary.Uvarint(trailer)
   115  			metaLen, m := binary.Uvarint(trailer[n:])
   116  			fmt.Fprintf(w, "%10d    meta: offset=%d, length=%d\n", offset, metaHandle, metaLen)
   117  			trailer, offset = trailer[n+m:], offset+uint64(n+m)
   118  
   119  			indexHandle, n := binary.Uvarint(trailer)
   120  			indexLen, m := binary.Uvarint(trailer[n:])
   121  			fmt.Fprintf(w, "%10d    index: offset=%d, length=%d\n", offset, indexHandle, indexLen)
   122  			trailer, offset = trailer[n+m:], offset+uint64(n+m)
   123  
   124  			fmt.Fprintf(w, "%10d    [padding]\n", offset)
   125  
   126  			trailing := 12
   127  			if b.name == "leveldb-footer" {
   128  				trailing = 8
   129  			}
   130  
   131  			offset += uint64(len(trailer) - trailing)
   132  			trailer = trailer[len(trailer)-trailing:]
   133  
   134  			if b.name == "footer" {
   135  				version := trailer[:4]
   136  				fmt.Fprintf(w, "%10d    version: %d\n", offset, binary.LittleEndian.Uint32(version))
   137  				trailer, offset = trailer[4:], offset+4
   138  			}
   139  
   140  			magicNumber := trailer
   141  			fmt.Fprintf(w, "%10d    magic number: 0x%x\n", offset, magicNumber)
   142  
   143  			continue
   144  		}
   145  
   146  		h, err := r.readBlock(
   147  			context.Background(), b.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
   148  		if err != nil {
   149  			fmt.Fprintf(w, "  [err: %s]\n", err)
   150  			continue
   151  		}
   152  
   153  		getRestart := func(data []byte, restarts, i int32) int32 {
   154  			return decodeRestart(data[restarts+4*i:])
   155  		}
   156  
   157  		formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) {
   158  			i := sort.Search(int(numRestarts), func(i int) bool {
   159  				return getRestart(data, restarts, int32(i)) >= offset
   160  			})
   161  			if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset {
   162  				fmt.Fprintf(w, " [restart]\n")
   163  			} else {
   164  				fmt.Fprintf(w, "\n")
   165  			}
   166  		}
   167  
   168  		formatRestarts := func(data []byte, restarts, numRestarts int32) {
   169  			for i := int32(0); i < numRestarts; i++ {
   170  				offset := getRestart(data, restarts, i)
   171  				fmt.Fprintf(w, "%10d    [restart %d]\n",
   172  					b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset))
   173  			}
   174  		}
   175  
   176  		formatTrailer := func() {
   177  			trailer := make([]byte, blockTrailerLen)
   178  			offset := int64(b.Offset + b.Length)
   179  			_ = r.readable.ReadAt(ctx, trailer, offset)
   180  			bt := blockType(trailer[0])
   181  			checksum := binary.LittleEndian.Uint32(trailer[1:])
   182  			fmt.Fprintf(w, "%10d    [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum)
   183  		}
   184  
   185  		var lastKey InternalKey
   186  		switch b.name {
   187  		case "data", "range-del", "range-key":
   188  			iter, _ := newBlockIter(r.Compare, h.Get())
   189  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   190  				ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset))
   191  				shared, ptr := decodeVarint(ptr)
   192  				unshared, ptr := decodeVarint(ptr)
   193  				value2, _ := decodeVarint(ptr)
   194  
   195  				total := iter.nextOffset - iter.offset
   196  				// The format of the numbers in the record line is:
   197  				//
   198  				//   (<total> = <length> [<shared>] + <unshared> + <value>)
   199  				//
   200  				// <total>    is the total number of bytes for the record.
   201  				// <length>   is the size of the 3 varint encoded integers for <shared>,
   202  				//            <unshared>, and <value>.
   203  				// <shared>   is the number of key bytes shared with the previous key.
   204  				// <unshared> is the number of unshared key bytes.
   205  				// <value>    is the number of value bytes.
   206  				fmt.Fprintf(w, "%10d    record (%d = %d [%d] + %d + %d)",
   207  					b.Offset+uint64(iter.offset), total,
   208  					total-int32(unshared+value2), shared, unshared, value2)
   209  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   210  				if fmtRecord != nil {
   211  					fmt.Fprintf(w, "              ")
   212  					if l.Format < TableFormatPebblev3 {
   213  						fmtRecord(key, value.InPlaceValue())
   214  					} else {
   215  						// InPlaceValue() will succeed even for data blocks where the
   216  						// actual value is in a different location, since this value was
   217  						// fetched from a blockIter which does not know about value
   218  						// blocks.
   219  						v := value.InPlaceValue()
   220  						if base.TrailerKind(key.Trailer) != InternalKeyKindSet {
   221  							fmtRecord(key, v)
   222  						} else if !isValueHandle(valuePrefix(v[0])) {
   223  							fmtRecord(key, v[1:])
   224  						} else {
   225  							vh := decodeValueHandle(v[1:])
   226  							fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh)))
   227  						}
   228  					}
   229  				}
   230  
   231  				if base.InternalCompare(r.Compare, lastKey, *key) >= 0 {
   232  					fmt.Fprintf(w, "              WARNING: OUT OF ORDER KEYS!\n")
   233  				}
   234  				lastKey.Trailer = key.Trailer
   235  				lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...)
   236  			}
   237  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   238  			formatTrailer()
   239  		case "index", "top-index":
   240  			iter, _ := newBlockIter(r.Compare, h.Get())
   241  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   242  				bh, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   243  				if err != nil {
   244  					fmt.Fprintf(w, "%10d    [err: %s]\n", b.Offset+uint64(iter.offset), err)
   245  					continue
   246  				}
   247  				fmt.Fprintf(w, "%10d    block:%d/%d",
   248  					b.Offset+uint64(iter.offset), bh.Offset, bh.Length)
   249  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   250  			}
   251  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   252  			formatTrailer()
   253  		case "properties":
   254  			iter, _ := newRawBlockIter(r.Compare, h.Get())
   255  			for valid := iter.First(); valid; valid = iter.Next() {
   256  				fmt.Fprintf(w, "%10d    %s (%d)",
   257  					b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset)
   258  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   259  			}
   260  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   261  			formatTrailer()
   262  		case "meta-index":
   263  			iter, _ := newRawBlockIter(r.Compare, h.Get())
   264  			for valid := iter.First(); valid; valid = iter.Next() {
   265  				value := iter.Value()
   266  				var bh BlockHandle
   267  				var n int
   268  				var vbih valueBlocksIndexHandle
   269  				isValueBlocksIndexHandle := false
   270  				if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) {
   271  					vbih, n, err = decodeValueBlocksIndexHandle(value)
   272  					bh = vbih.h
   273  					isValueBlocksIndexHandle = true
   274  				} else {
   275  					bh, n = decodeBlockHandle(value)
   276  				}
   277  				if n == 0 || n != len(value) {
   278  					fmt.Fprintf(w, "%10d    [err: %s]\n", b.Offset+uint64(iter.offset), err)
   279  					continue
   280  				}
   281  				var vbihStr string
   282  				if isValueBlocksIndexHandle {
   283  					vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)",
   284  						vbih.blockNumByteLength, vbih.blockOffsetByteLength, vbih.blockLengthByteLength)
   285  				}
   286  				fmt.Fprintf(w, "%10d    %s block:%d/%d%s",
   287  					b.Offset+uint64(iter.offset), iter.Key().UserKey,
   288  					bh.Offset, bh.Length, vbihStr)
   289  				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
   290  			}
   291  			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
   292  			formatTrailer()
   293  		case "value-block":
   294  			// We don't peer into the value-block since it can't be interpreted
   295  			// without the valueHandles.
   296  		case "value-index":
   297  			// We have already read the value-index to construct the list of
   298  			// value-blocks, so no need to do it again.
   299  		}
   300  
   301  		h.Release()
   302  	}
   303  
   304  	last := blocks[len(blocks)-1]
   305  	fmt.Fprintf(w, "%10d  EOF\n", last.Offset+last.Length)
   306  }