github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/manifest/version_edit.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync/atomic"

	"github.com/petermattis/pebble/internal/base"
)

// TODO(peter): describe the MANIFEST file format, independently of the C++
// project.

var errCorruptManifest = errors.New("pebble: corrupt manifest")

type byteReader interface {
	io.ByteReader
	io.Reader
}

// Tags for the versionEdit disk format.
// Tag 8 is no longer used.
const (
	// LevelDB tags.
	tagComparator     = 1
	tagLogNumber      = 2
	tagNextFileNumber = 3
	tagLastSequence   = 4
	tagCompactPointer = 5
	tagDeletedFile    = 6
	tagNewFile        = 7
	tagPrevLogNumber  = 9

	// RocksDB tags.
	tagNewFile2         = 100
	tagNewFile3         = 102
	tagNewFile4         = 103
	tagColumnFamily     = 200
	tagColumnFamilyAdd  = 201
	tagColumnFamilyDrop = 202
	tagMaxColumnFamily  = 203

	// The custom tags sub-format used by tagNewFile4.
	customTagTerminate         = 1
	customTagNeedsCompaction   = 2
	customTagPathID            = 65
	customTagNonSafeIgnoreMask = 1 << 6
)

// DeletedFileEntry holds the state for a file deletion from a level. The file
// itself might still be referenced by another level.
type DeletedFileEntry struct {
	Level   int
	FileNum uint64
}

// NewFileEntry holds the state for a new file or one moved from a different
// level.
type NewFileEntry struct {
	Level int
	Meta  FileMetadata
}

// VersionEdit holds the state for an edit to a Version along with other
// on-disk state (log numbers, next file number, and the last sequence number).
type VersionEdit struct {
	ComparerName string
	LogNum       uint64
	PrevLogNum   uint64
	NextFileNum  uint64
	LastSeqNum   uint64
	DeletedFiles map[DeletedFileEntry]bool // set of DeletedFileEntry values
	NewFiles     []NewFileEntry
}

// Decode decodes an edit from the specified reader.
func (v *VersionEdit) Decode(r io.Reader) error {
	br, ok := r.(byteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	d := versionEditDecoder{br}
	for {
		tag, err := binary.ReadUvarint(br)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		switch tag {
		case tagComparator:
			s, err := d.readBytes()
			if err != nil {
				return err
			}
			v.ComparerName = string(s)

		case tagLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LogNum = n

		case tagNextFileNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.NextFileNum = n

		case tagLastSequence:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LastSeqNum = n

		case tagCompactPointer:
			if _, err := d.readLevel(); err != nil {
				return err
			}
			if _, err := d.readBytes(); err != nil {
				return err
			}
			// NB: RocksDB does not use compaction pointers anymore.

		case tagDeletedFile:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readUvarint()
			if err != nil {
				return err
			}
			if v.DeletedFiles == nil {
				v.DeletedFiles = make(map[DeletedFileEntry]bool)
			}
			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = true

		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readUvarint()
			if err != nil {
				return err
			}
			if tag == tagNewFile3 {
				// The pathID field appears unused in RocksDB.
				_ /* pathID */, err := d.readUvarint()
				if err != nil {
					return err
				}
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			smallest, err := d.readBytes()
			if err != nil {
				return err
			}
			largest, err := d.readBytes()
			if err != nil {
				return err
			}
			var smallestSeqNum uint64
			var largestSeqNum uint64
			if tag != tagNewFile {
				smallestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
				largestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
			}
			var markedForCompaction bool
			if tag == tagNewFile4 {
				for {
					customTag, err := d.readUvarint()
					if err != nil {
						return err
					}
					if customTag == customTagTerminate {
						break
					}
					field, err := d.readBytes()
					if err != nil {
						return err
					}
					switch customTag {
					case customTagNeedsCompaction:
						if len(field) != 1 {
							return fmt.Errorf("new-file4: need-compaction field wrong size")
						}
						markedForCompaction = (field[0] == 1)

					case customTagPathID:
						return fmt.Errorf("new-file4: path-id field not supported")

					default:
						if (customTag & customTagNonSafeIgnoreMask) != 0 {
							return fmt.Errorf("new-file4: custom field not supported: %d", customTag)
						}
					}
				}
			}
			v.NewFiles = append(v.NewFiles, NewFileEntry{
				Level: level,
				Meta: FileMetadata{
					FileNum:             fileNum,
					Size:                size,
					Smallest:            base.DecodeInternalKey(smallest),
					Largest:             base.DecodeInternalKey(largest),
					SmallestSeqNum:      smallestSeqNum,
					LargestSeqNum:       largestSeqNum,
					MarkedForCompaction: markedForCompaction,
				},
			})

		case tagPrevLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.PrevLogNum = n

		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
			return fmt.Errorf("column families are not supported")

		default:
			return errCorruptManifest
		}
	}
	return nil
}
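// The layout parsed above mirrors what Encode (below) produces: a VersionEdit
// record is a concatenation of (tag, payload) pairs in which every tag and
// integer payload is a uvarint, and byte-string payloads (the comparator name
// and internal keys) are length-prefixed. As an illustration, a new-file entry
// written with tagNewFile2 is laid out as
//
//	uvarint(tagNewFile2) uvarint(level) uvarint(fileNum) uvarint(size)
//	uvarint(len(smallest)) smallest uvarint(len(largest)) largest
//	uvarint(smallestSeqNum) uvarint(largestSeqNum)
//
// where smallest and largest are internal keys in their standard encoding
// (user key followed by the 8-byte trailer).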

// Encode encodes an edit to the specified writer.
func (v *VersionEdit) Encode(w io.Writer) error {
	e := versionEditEncoder{new(bytes.Buffer)}
	if v.ComparerName != "" {
		e.writeUvarint(tagComparator)
		e.writeString(v.ComparerName)
	}
	if v.LogNum != 0 {
		e.writeUvarint(tagLogNumber)
		e.writeUvarint(v.LogNum)
	}
	if v.PrevLogNum != 0 {
		e.writeUvarint(tagPrevLogNumber)
		e.writeUvarint(v.PrevLogNum)
	}
	if v.NextFileNum != 0 {
		e.writeUvarint(tagNextFileNumber)
		e.writeUvarint(v.NextFileNum)
	}
	if v.LastSeqNum != 0 {
		e.writeUvarint(tagLastSequence)
		e.writeUvarint(v.LastSeqNum)
	}
	for x := range v.DeletedFiles {
		e.writeUvarint(tagDeletedFile)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(x.FileNum)
	}
	for _, x := range v.NewFiles {
		var customFields bool
		if x.Meta.MarkedForCompaction {
			customFields = true
			e.writeUvarint(tagNewFile4)
		} else {
			e.writeUvarint(tagNewFile2)
		}
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(x.Meta.FileNum)
		e.writeUvarint(x.Meta.Size)
		e.writeKey(x.Meta.Smallest)
		e.writeKey(x.Meta.Largest)
		e.writeUvarint(x.Meta.SmallestSeqNum)
		e.writeUvarint(x.Meta.LargestSeqNum)
		if customFields {
			if x.Meta.MarkedForCompaction {
				e.writeUvarint(customTagNeedsCompaction)
				e.writeBytes([]byte{1})
			}
			e.writeUvarint(customTagTerminate)
		}
	}
	_, err := w.Write(e.Bytes())
	return err
}

type versionEditDecoder struct {
	byteReader
}

func (d versionEditDecoder) readBytes() ([]byte, error) {
	n, err := d.readUvarint()
	if err != nil {
		return nil, err
	}
	s := make([]byte, n)
	_, err = io.ReadFull(d, s)
	if err != nil {
		if err == io.ErrUnexpectedEOF {
			return nil, errCorruptManifest
		}
		return nil, err
	}
	return s, nil
}

func (d versionEditDecoder) readLevel() (int, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	if u >= NumLevels {
		return 0, errCorruptManifest
	}
	return int(u), nil
}

func (d versionEditDecoder) readUvarint() (uint64, error) {
	u, err := binary.ReadUvarint(d)
	if err != nil {
		if err == io.EOF {
			return 0, errCorruptManifest
		}
		return 0, err
	}
	return u, nil
}

type versionEditEncoder struct {
	*bytes.Buffer
}

func (e versionEditEncoder) writeBytes(p []byte) {
	e.writeUvarint(uint64(len(p)))
	e.Write(p)
}

func (e versionEditEncoder) writeKey(k InternalKey) {
	e.writeUvarint(uint64(k.Size()))
	e.Write(k.UserKey)
	buf := k.EncodeTrailer()
	e.Write(buf[:])
}

func (e versionEditEncoder) writeString(s string) {
	e.writeUvarint(uint64(len(s)))
	e.WriteString(s)
}

func (e versionEditEncoder) writeUvarint(u uint64) {
	var buf [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(buf[:], u)
	e.Write(buf[:n])
}

// BulkVersionEdit summarizes the files added and deleted from a set of version
// edits.
type BulkVersionEdit struct {
	Added   [NumLevels][]FileMetadata
	Deleted [NumLevels]map[uint64]bool // map[uint64]bool is a set of fileNums
}
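// A typical use of BulkVersionEdit is to replay a MANIFEST: every VersionEdit
// read from the log is accumulated into a single BulkVersionEdit, which is
// then applied once to produce the current Version. A minimal sketch (the
// edits slice, opts, base version, and cmp are assumed to be supplied by the
// caller):
//
//	var bve BulkVersionEdit
//	for _, ve := range edits {
//		bve.Accumulate(ve)
//	}
//	v, err := bve.Apply(opts, base, cmp)
//	if err != nil {
//		// The accumulated edits produced an inconsistent version.
//	}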

// Accumulate adds the file additions and deletions in the specified version
// edit to the bulk edit's internal state.
func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) {
	for df := range ve.DeletedFiles {
		dmap := b.Deleted[df.Level]
		if dmap == nil {
			dmap = make(map[uint64]bool)
			b.Deleted[df.Level] = dmap
		}
		dmap[df.FileNum] = true
	}

	for _, nf := range ve.NewFiles {
		if dmap := b.Deleted[nf.Level]; dmap != nil {
			delete(dmap, nf.Meta.FileNum)
		}
		b.Added[nf.Level] = append(b.Added[nf.Level], nf.Meta)
	}
}

// Apply applies the delta b to a base version to produce a new version. The
// new version is consistent with respect to the comparer cmp.
//
// base may be nil, which is equivalent to a pointer to a zero version.
func (b *BulkVersionEdit) Apply(
	opts *Options, base *Version, cmp Compare,
) (*Version, error) {
	v := new(Version)
	for level := range v.Files {
		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
			// There are no edits on this level.
			if base == nil {
				continue
			}
			files := base.Files[level]
			v.Files[level] = files
			// We still have to bump the ref count for all files.
			for i := range files {
				atomic.AddInt32(files[i].refs, 1)
			}
			continue
		}

		combined := [2][]FileMetadata{
			nil,
			b.Added[level],
		}
		if base != nil {
			combined[0] = base.Files[level]
		}
		n := len(combined[0]) + len(combined[1])
		if n == 0 {
			continue
		}
		v.Files[level] = make([]FileMetadata, 0, n)
		dmap := b.Deleted[level]

		for _, ff := range combined {
			for _, f := range ff {
				if dmap != nil && dmap[f.FileNum] {
					continue
				}
				if f.refs == nil {
					f.refs = new(int32)
				}
				atomic.AddInt32(f.refs, 1)
				v.Files[level] = append(v.Files[level], f)
			}
		}

		// TODO(peter): base.files[level] is already sorted. Instead of appending
		// b.addFiles[level] to the end and sorting afterwards, it might be more
		// efficient to sort b.addFiles[level] and then merge the two sorted
		// slices.
		if level == 0 {
			SortBySeqNum(v.Files[level])
		} else {
			SortBySmallest(v.Files[level], cmp)
		}
	}
	if err := v.CheckOrdering(cmp); err != nil {
		return nil, fmt.Errorf("pebble: internal error: %v", err)
	}
	return v, nil
}
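
// Encode and Decode are symmetric, so an edit can be round-tripped through an
// in-memory buffer. A minimal sketch (the field values below are illustrative
// only, not part of the original file):
//
//	ve := VersionEdit{
//		ComparerName: "example-comparer",
//		NextFileNum:  5,
//		LastSeqNum:   100,
//	}
//	var buf bytes.Buffer
//	if err := ve.Encode(&buf); err != nil {
//		// handle encode error
//	}
//	var decoded VersionEdit
//	if err := decoded.Decode(&buf); err != nil {
//		// handle decode error
//	}
//	// decoded now holds the same comparer name, next file number, and last
//	// sequence number as ve.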