// github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/layout.go (about)

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bytes"
	"cmp"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"slices"
	"sort"
	"unsafe"

	"github.com/cockroachdb/pebble/internal/base"
)

// Layout describes the block organization of an sstable.
//
// Each field holds the handle (file offset and length) of one kind of block.
// For the single-handle fields, Describe treats a zero Length as "this block
// is not present" and omits it from its output.
type Layout struct {
	// NOTE: changes to fields in this struct should also be reflected in
	// ValidateBlockChecksums, which validates a static list of BlockHandles
	// referenced in this struct.

	// Data lists the blocks Describe reports as "data" and decodes as
	// key/value records.
	Data []BlockHandleWithProperties
	// Index lists the blocks Describe reports as "index"; their entries are
	// decoded as block handles with properties.
	Index []BlockHandle
	// TopIndex is reported as "top-index" and decoded the same way as Index.
	TopIndex BlockHandle
	// Filter is reported as "filter"; Describe never decodes its contents.
	Filter BlockHandle
	// RangeDel is reported as "range-del" and decoded as key/value records.
	RangeDel BlockHandle
	// RangeKey is reported as "range-key" and decoded as key/value records.
	RangeKey BlockHandle
	// ValueBlock lists the blocks reported as "value-block"; Describe does
	// not decode them.
	ValueBlock []BlockHandle
	// ValueIndex is reported as "value-index"; Describe does not decode it.
	ValueIndex BlockHandle
	// Properties is reported as "properties" and decoded with a raw block
	// iterator.
	Properties BlockHandle
	// MetaIndex is reported as "meta-index"; its values are decoded as block
	// handles.
	MetaIndex BlockHandle
	// Footer is reported as "leveldb-footer" when its Length equals
	// levelDBFooterLen and as "footer" otherwise.
	Footer BlockHandle
	// Format is the table format. Describe compares it against
	// TableFormatPebblev3 to decide whether record values start with a value
	// prefix byte.
	Format TableFormat
}

// Describe returns a description of the layout. If the verbose parameter is
// true, details of the structure of each block are returned as well.
43 func (l *Layout) Describe( 44 w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte), 45 ) { 46 ctx := context.TODO() 47 type block struct { 48 BlockHandle 49 name string 50 } 51 var blocks []block 52 53 for i := range l.Data { 54 blocks = append(blocks, block{l.Data[i].BlockHandle, "data"}) 55 } 56 for i := range l.Index { 57 blocks = append(blocks, block{l.Index[i], "index"}) 58 } 59 if l.TopIndex.Length != 0 { 60 blocks = append(blocks, block{l.TopIndex, "top-index"}) 61 } 62 if l.Filter.Length != 0 { 63 blocks = append(blocks, block{l.Filter, "filter"}) 64 } 65 if l.RangeDel.Length != 0 { 66 blocks = append(blocks, block{l.RangeDel, "range-del"}) 67 } 68 if l.RangeKey.Length != 0 { 69 blocks = append(blocks, block{l.RangeKey, "range-key"}) 70 } 71 for i := range l.ValueBlock { 72 blocks = append(blocks, block{l.ValueBlock[i], "value-block"}) 73 } 74 if l.ValueIndex.Length != 0 { 75 blocks = append(blocks, block{l.ValueIndex, "value-index"}) 76 } 77 if l.Properties.Length != 0 { 78 blocks = append(blocks, block{l.Properties, "properties"}) 79 } 80 if l.MetaIndex.Length != 0 { 81 blocks = append(blocks, block{l.MetaIndex, "meta-index"}) 82 } 83 if l.Footer.Length != 0 { 84 if l.Footer.Length == levelDBFooterLen { 85 blocks = append(blocks, block{l.Footer, "leveldb-footer"}) 86 } else { 87 blocks = append(blocks, block{l.Footer, "footer"}) 88 } 89 } 90 91 slices.SortFunc(blocks, func(a, b block) int { 92 return cmp.Compare(a.Offset, b.Offset) 93 }) 94 for i := range blocks { 95 b := &blocks[i] 96 fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length) 97 98 if !verbose { 99 continue 100 } 101 if b.name == "filter" { 102 continue 103 } 104 105 if b.name == "footer" || b.name == "leveldb-footer" { 106 trailer, offset := make([]byte, b.Length), b.Offset 107 _ = r.readable.ReadAt(ctx, trailer, int64(offset)) 108 109 if b.name == "footer" { 110 checksumType := ChecksumType(trailer[0]) 111 fmt.Fprintf(w, "%10d checksum type: 
%s\n", offset, checksumType) 112 trailer, offset = trailer[1:], offset+1 113 } 114 115 metaHandle, n := binary.Uvarint(trailer) 116 metaLen, m := binary.Uvarint(trailer[n:]) 117 fmt.Fprintf(w, "%10d meta: offset=%d, length=%d\n", offset, metaHandle, metaLen) 118 trailer, offset = trailer[n+m:], offset+uint64(n+m) 119 120 indexHandle, n := binary.Uvarint(trailer) 121 indexLen, m := binary.Uvarint(trailer[n:]) 122 fmt.Fprintf(w, "%10d index: offset=%d, length=%d\n", offset, indexHandle, indexLen) 123 trailer, offset = trailer[n+m:], offset+uint64(n+m) 124 125 fmt.Fprintf(w, "%10d [padding]\n", offset) 126 127 trailing := 12 128 if b.name == "leveldb-footer" { 129 trailing = 8 130 } 131 132 offset += uint64(len(trailer) - trailing) 133 trailer = trailer[len(trailer)-trailing:] 134 135 if b.name == "footer" { 136 version := trailer[:4] 137 fmt.Fprintf(w, "%10d version: %d\n", offset, binary.LittleEndian.Uint32(version)) 138 trailer, offset = trailer[4:], offset+4 139 } 140 141 magicNumber := trailer 142 fmt.Fprintf(w, "%10d magic number: 0x%x\n", offset, magicNumber) 143 144 continue 145 } 146 147 h, err := r.readBlock( 148 context.Background(), b.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */) 149 if err != nil { 150 fmt.Fprintf(w, " [err: %s]\n", err) 151 continue 152 } 153 154 getRestart := func(data []byte, restarts, i int32) int32 { 155 return decodeRestart(data[restarts+4*i:]) 156 } 157 158 formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) { 159 i := sort.Search(int(numRestarts), func(i int) bool { 160 return getRestart(data, restarts, int32(i)) >= offset 161 }) 162 if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset { 163 fmt.Fprintf(w, " [restart]\n") 164 } else { 165 fmt.Fprintf(w, "\n") 166 } 167 } 168 169 formatRestarts := func(data []byte, restarts, numRestarts int32) { 170 for i := int32(0); i < numRestarts; i++ { 171 offset := 
getRestart(data, restarts, i) 172 fmt.Fprintf(w, "%10d [restart %d]\n", 173 b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset)) 174 } 175 } 176 177 formatTrailer := func() { 178 trailer := make([]byte, blockTrailerLen) 179 offset := int64(b.Offset + b.Length) 180 _ = r.readable.ReadAt(ctx, trailer, offset) 181 bt := blockType(trailer[0]) 182 checksum := binary.LittleEndian.Uint32(trailer[1:]) 183 fmt.Fprintf(w, "%10d [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum) 184 } 185 186 var lastKey InternalKey 187 switch b.name { 188 case "data", "range-del", "range-key": 189 iter, _ := newBlockIter(r.Compare, h.Get()) 190 for key, value := iter.First(); key != nil; key, value = iter.Next() { 191 ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset)) 192 shared, ptr := decodeVarint(ptr) 193 unshared, ptr := decodeVarint(ptr) 194 value2, _ := decodeVarint(ptr) 195 196 total := iter.nextOffset - iter.offset 197 // The format of the numbers in the record line is: 198 // 199 // (<total> = <length> [<shared>] + <unshared> + <value>) 200 // 201 // <total> is the total number of bytes for the record. 202 // <length> is the size of the 3 varint encoded integers for <shared>, 203 // <unshared>, and <value>. 204 // <shared> is the number of key bytes shared with the previous key. 205 // <unshared> is the number of unshared key bytes. 206 // <value> is the number of value bytes. 
207 fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)", 208 b.Offset+uint64(iter.offset), total, 209 total-int32(unshared+value2), shared, unshared, value2) 210 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 211 if fmtRecord != nil { 212 fmt.Fprintf(w, " ") 213 if l.Format < TableFormatPebblev3 { 214 fmtRecord(key, value.InPlaceValue()) 215 } else { 216 // InPlaceValue() will succeed even for data blocks where the 217 // actual value is in a different location, since this value was 218 // fetched from a blockIter which does not know about value 219 // blocks. 220 v := value.InPlaceValue() 221 if base.TrailerKind(key.Trailer) != InternalKeyKindSet { 222 fmtRecord(key, v) 223 } else if !isValueHandle(valuePrefix(v[0])) { 224 fmtRecord(key, v[1:]) 225 } else { 226 vh := decodeValueHandle(v[1:]) 227 fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh))) 228 } 229 } 230 } 231 232 if base.InternalCompare(r.Compare, lastKey, *key) >= 0 { 233 fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n") 234 } 235 lastKey.Trailer = key.Trailer 236 lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...) 
237 } 238 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 239 formatTrailer() 240 case "index", "top-index": 241 iter, _ := newBlockIter(r.Compare, h.Get()) 242 for key, value := iter.First(); key != nil; key, value = iter.Next() { 243 bh, err := decodeBlockHandleWithProperties(value.InPlaceValue()) 244 if err != nil { 245 fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) 246 continue 247 } 248 fmt.Fprintf(w, "%10d block:%d/%d", 249 b.Offset+uint64(iter.offset), bh.Offset, bh.Length) 250 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 251 } 252 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 253 formatTrailer() 254 case "properties": 255 iter, _ := newRawBlockIter(r.Compare, h.Get()) 256 for valid := iter.First(); valid; valid = iter.Next() { 257 fmt.Fprintf(w, "%10d %s (%d)", 258 b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset) 259 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 260 } 261 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 262 formatTrailer() 263 case "meta-index": 264 iter, _ := newRawBlockIter(r.Compare, h.Get()) 265 for valid := iter.First(); valid; valid = iter.Next() { 266 value := iter.Value() 267 var bh BlockHandle 268 var n int 269 var vbih valueBlocksIndexHandle 270 isValueBlocksIndexHandle := false 271 if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) { 272 vbih, n, err = decodeValueBlocksIndexHandle(value) 273 bh = vbih.h 274 isValueBlocksIndexHandle = true 275 } else { 276 bh, n = decodeBlockHandle(value) 277 } 278 if n == 0 || n != len(value) { 279 fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) 280 continue 281 } 282 var vbihStr string 283 if isValueBlocksIndexHandle { 284 vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)", 285 vbih.blockNumByteLength, vbih.blockOffsetByteLength, vbih.blockLengthByteLength) 286 } 287 fmt.Fprintf(w, "%10d 
%s block:%d/%d%s", 288 b.Offset+uint64(iter.offset), iter.Key().UserKey, 289 bh.Offset, bh.Length, vbihStr) 290 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 291 } 292 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 293 formatTrailer() 294 case "value-block": 295 // We don't peer into the value-block since it can't be interpreted 296 // without the valueHandles. 297 case "value-index": 298 // We have already read the value-index to construct the list of 299 // value-blocks, so no need to do it again. 300 } 301 302 h.Release() 303 } 304 305 last := blocks[len(blocks)-1] 306 fmt.Fprintf(w, "%10d EOF\n", last.Offset+last.Length) 307 }