// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"sort"
	"unsafe"

	"github.com/cockroachdb/pebble/internal/base"
)

// Layout describes the block organization of an sstable.
//
// Each field records where one kind of block lives in the file. Describe
// treats a single-block handle whose Length is zero as absent and omits it
// from its listing.
type Layout struct {
	// NOTE: changes to fields in this struct should also be reflected in
	// ValidateBlockChecksums, which validates a static list of BlockHandles
	// referenced in this struct.

	// Data holds one handle per data block; Describe lists them as "data".
	Data []BlockHandleWithProperties
	// Index holds the index block handles; Describe lists them as "index".
	Index []BlockHandle
	// TopIndex is listed by Describe as "top-index".
	TopIndex BlockHandle
	// Filter is listed by Describe as "filter"; its contents are never
	// expanded, even in verbose mode.
	Filter BlockHandle
	// RangeDel is listed by Describe as "range-del".
	RangeDel BlockHandle
	// RangeKey is listed by Describe as "range-key".
	RangeKey BlockHandle
	// ValueBlock holds one handle per value block; Describe lists them as
	// "value-block".
	ValueBlock []BlockHandle
	// ValueIndex is listed by Describe as "value-index".
	ValueIndex BlockHandle
	// Properties is listed by Describe as "properties".
	Properties BlockHandle
	// MetaIndex is listed by Describe as "meta-index".
	MetaIndex BlockHandle
	// Footer is listed by Describe as "leveldb-footer" when its Length equals
	// levelDBFooterLen, and as "footer" otherwise.
	Footer BlockHandle
	// Format is the table format. Describe compares it against
	// TableFormatPebblev3 to decide how record values are passed to the
	// record formatter.
	Format TableFormat
}

// Describe returns a description of the layout. If the verbose parameter is
// true, details of the structure of each block are returned as well.
41 func (l *Layout) Describe( 42 w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte), 43 ) { 44 ctx := context.TODO() 45 type block struct { 46 BlockHandle 47 name string 48 } 49 var blocks []block 50 51 for i := range l.Data { 52 blocks = append(blocks, block{l.Data[i].BlockHandle, "data"}) 53 } 54 for i := range l.Index { 55 blocks = append(blocks, block{l.Index[i], "index"}) 56 } 57 if l.TopIndex.Length != 0 { 58 blocks = append(blocks, block{l.TopIndex, "top-index"}) 59 } 60 if l.Filter.Length != 0 { 61 blocks = append(blocks, block{l.Filter, "filter"}) 62 } 63 if l.RangeDel.Length != 0 { 64 blocks = append(blocks, block{l.RangeDel, "range-del"}) 65 } 66 if l.RangeKey.Length != 0 { 67 blocks = append(blocks, block{l.RangeKey, "range-key"}) 68 } 69 for i := range l.ValueBlock { 70 blocks = append(blocks, block{l.ValueBlock[i], "value-block"}) 71 } 72 if l.ValueIndex.Length != 0 { 73 blocks = append(blocks, block{l.ValueIndex, "value-index"}) 74 } 75 if l.Properties.Length != 0 { 76 blocks = append(blocks, block{l.Properties, "properties"}) 77 } 78 if l.MetaIndex.Length != 0 { 79 blocks = append(blocks, block{l.MetaIndex, "meta-index"}) 80 } 81 if l.Footer.Length != 0 { 82 if l.Footer.Length == levelDBFooterLen { 83 blocks = append(blocks, block{l.Footer, "leveldb-footer"}) 84 } else { 85 blocks = append(blocks, block{l.Footer, "footer"}) 86 } 87 } 88 89 sort.Slice(blocks, func(i, j int) bool { 90 return blocks[i].Offset < blocks[j].Offset 91 }) 92 93 for i := range blocks { 94 b := &blocks[i] 95 fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length) 96 97 if !verbose { 98 continue 99 } 100 if b.name == "filter" { 101 continue 102 } 103 104 if b.name == "footer" || b.name == "leveldb-footer" { 105 trailer, offset := make([]byte, b.Length), b.Offset 106 _ = r.readable.ReadAt(ctx, trailer, int64(offset)) 107 108 if b.name == "footer" { 109 checksumType := ChecksumType(trailer[0]) 110 fmt.Fprintf(w, "%10d checksum type: 
%s\n", offset, checksumType) 111 trailer, offset = trailer[1:], offset+1 112 } 113 114 metaHandle, n := binary.Uvarint(trailer) 115 metaLen, m := binary.Uvarint(trailer[n:]) 116 fmt.Fprintf(w, "%10d meta: offset=%d, length=%d\n", offset, metaHandle, metaLen) 117 trailer, offset = trailer[n+m:], offset+uint64(n+m) 118 119 indexHandle, n := binary.Uvarint(trailer) 120 indexLen, m := binary.Uvarint(trailer[n:]) 121 fmt.Fprintf(w, "%10d index: offset=%d, length=%d\n", offset, indexHandle, indexLen) 122 trailer, offset = trailer[n+m:], offset+uint64(n+m) 123 124 fmt.Fprintf(w, "%10d [padding]\n", offset) 125 126 trailing := 12 127 if b.name == "leveldb-footer" { 128 trailing = 8 129 } 130 131 offset += uint64(len(trailer) - trailing) 132 trailer = trailer[len(trailer)-trailing:] 133 134 if b.name == "footer" { 135 version := trailer[:4] 136 fmt.Fprintf(w, "%10d version: %d\n", offset, binary.LittleEndian.Uint32(version)) 137 trailer, offset = trailer[4:], offset+4 138 } 139 140 magicNumber := trailer 141 fmt.Fprintf(w, "%10d magic number: 0x%x\n", offset, magicNumber) 142 143 continue 144 } 145 146 h, err := r.readBlock( 147 context.Background(), b.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) 148 if err != nil { 149 fmt.Fprintf(w, " [err: %s]\n", err) 150 continue 151 } 152 153 getRestart := func(data []byte, restarts, i int32) int32 { 154 return decodeRestart(data[restarts+4*i:]) 155 } 156 157 formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) { 158 i := sort.Search(int(numRestarts), func(i int) bool { 159 return getRestart(data, restarts, int32(i)) >= offset 160 }) 161 if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset { 162 fmt.Fprintf(w, " [restart]\n") 163 } else { 164 fmt.Fprintf(w, "\n") 165 } 166 } 167 168 formatRestarts := func(data []byte, restarts, numRestarts int32) { 169 for i := int32(0); i < numRestarts; i++ { 170 offset := getRestart(data, restarts, i) 171 
fmt.Fprintf(w, "%10d [restart %d]\n", 172 b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset)) 173 } 174 } 175 176 formatTrailer := func() { 177 trailer := make([]byte, blockTrailerLen) 178 offset := int64(b.Offset + b.Length) 179 _ = r.readable.ReadAt(ctx, trailer, offset) 180 bt := blockType(trailer[0]) 181 checksum := binary.LittleEndian.Uint32(trailer[1:]) 182 fmt.Fprintf(w, "%10d [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum) 183 } 184 185 var lastKey InternalKey 186 switch b.name { 187 case "data", "range-del", "range-key": 188 iter, _ := newBlockIter(r.Compare, h.Get()) 189 for key, value := iter.First(); key != nil; key, value = iter.Next() { 190 ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset)) 191 shared, ptr := decodeVarint(ptr) 192 unshared, ptr := decodeVarint(ptr) 193 value2, _ := decodeVarint(ptr) 194 195 total := iter.nextOffset - iter.offset 196 // The format of the numbers in the record line is: 197 // 198 // (<total> = <length> [<shared>] + <unshared> + <value>) 199 // 200 // <total> is the total number of bytes for the record. 201 // <length> is the size of the 3 varint encoded integers for <shared>, 202 // <unshared>, and <value>. 203 // <shared> is the number of key bytes shared with the previous key. 204 // <unshared> is the number of unshared key bytes. 205 // <value> is the number of value bytes. 206 fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)", 207 b.Offset+uint64(iter.offset), total, 208 total-int32(unshared+value2), shared, unshared, value2) 209 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 210 if fmtRecord != nil { 211 fmt.Fprintf(w, " ") 212 if l.Format < TableFormatPebblev3 { 213 fmtRecord(key, value.InPlaceValue()) 214 } else { 215 // InPlaceValue() will succeed even for data blocks where the 216 // actual value is in a different location, since this value was 217 // fetched from a blockIter which does not know about value 218 // blocks. 
219 v := value.InPlaceValue() 220 if base.TrailerKind(key.Trailer) != InternalKeyKindSet { 221 fmtRecord(key, v) 222 } else if !isValueHandle(valuePrefix(v[0])) { 223 fmtRecord(key, v[1:]) 224 } else { 225 vh := decodeValueHandle(v[1:]) 226 fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh))) 227 } 228 } 229 } 230 231 if base.InternalCompare(r.Compare, lastKey, *key) >= 0 { 232 fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n") 233 } 234 lastKey.Trailer = key.Trailer 235 lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...) 236 } 237 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 238 formatTrailer() 239 case "index", "top-index": 240 iter, _ := newBlockIter(r.Compare, h.Get()) 241 for key, value := iter.First(); key != nil; key, value = iter.Next() { 242 bh, err := decodeBlockHandleWithProperties(value.InPlaceValue()) 243 if err != nil { 244 fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) 245 continue 246 } 247 fmt.Fprintf(w, "%10d block:%d/%d", 248 b.Offset+uint64(iter.offset), bh.Offset, bh.Length) 249 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 250 } 251 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 252 formatTrailer() 253 case "properties": 254 iter, _ := newRawBlockIter(r.Compare, h.Get()) 255 for valid := iter.First(); valid; valid = iter.Next() { 256 fmt.Fprintf(w, "%10d %s (%d)", 257 b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset) 258 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 259 } 260 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 261 formatTrailer() 262 case "meta-index": 263 iter, _ := newRawBlockIter(r.Compare, h.Get()) 264 for valid := iter.First(); valid; valid = iter.Next() { 265 value := iter.Value() 266 var bh BlockHandle 267 var n int 268 var vbih valueBlocksIndexHandle 269 isValueBlocksIndexHandle := false 270 if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) { 271 vbih, n, 
err = decodeValueBlocksIndexHandle(value) 272 bh = vbih.h 273 isValueBlocksIndexHandle = true 274 } else { 275 bh, n = decodeBlockHandle(value) 276 } 277 if n == 0 || n != len(value) { 278 fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) 279 continue 280 } 281 var vbihStr string 282 if isValueBlocksIndexHandle { 283 vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)", 284 vbih.blockNumByteLength, vbih.blockOffsetByteLength, vbih.blockLengthByteLength) 285 } 286 fmt.Fprintf(w, "%10d %s block:%d/%d%s", 287 b.Offset+uint64(iter.offset), iter.Key().UserKey, 288 bh.Offset, bh.Length, vbihStr) 289 formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) 290 } 291 formatRestarts(iter.data, iter.restarts, iter.numRestarts) 292 formatTrailer() 293 case "value-block": 294 // We don't peer into the value-block since it can't be interpreted 295 // without the valueHandles. 296 case "value-index": 297 // We have already read the value-index to construct the list of 298 // value-blocks, so no need to do it again. 299 } 300 301 h.Release() 302 } 303 304 last := blocks[len(blocks)-1] 305 fmt.Fprintf(w, "%10d EOF\n", last.Offset+last.Length) 306 }