github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/manifest.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package badger 18 19 import ( 20 "bufio" 21 "bytes" 22 "encoding/binary" 23 "fmt" 24 "hash/crc32" 25 "io" 26 "os" 27 "path/filepath" 28 "sync" 29 30 "github.com/pingcap/badger/options" 31 "github.com/pingcap/badger/protos" 32 "github.com/pingcap/badger/y" 33 "github.com/pingcap/errors" 34 ) 35 36 // Manifest represents the contents of the MANIFEST file in a Badger store. 37 // 38 // The MANIFEST file describes the startup state of the db -- all LSM files and what level they're 39 // at. 40 // 41 // It consists of a sequence of ManifestChangeSet objects. Each of these is treated atomically, 42 // and contains a sequence of ManifestChange's (file creations/deletions) which we use to 43 // reconstruct the manifest at startup. 44 type Manifest struct { 45 Levels []levelManifest 46 Tables map[uint64]tableManifest 47 48 // Contains total number of creation and deletion changes in the manifest -- used to compute 49 // whether it'd be useful to rewrite the manifest. 50 Creations int 51 Deletions int 52 53 Head *protos.HeadInfo 54 } 55 56 func createManifest() Manifest { 57 levels := make([]levelManifest, 0) 58 return Manifest{ 59 Levels: levels, 60 Tables: make(map[uint64]tableManifest), 61 } 62 } 63 64 // levelManifest contains information about LSM tree levels 65 // in the MANIFEST file. 66 type levelManifest struct { 67 Tables map[uint64]struct{} // Set of table id's 68 } 69 70 // tableManifest contains information about a specific level 71 // in the LSM tree. 72 type tableManifest struct { 73 Level uint8 74 Compression options.CompressionType 75 } 76 77 // manifestFile holds the file pointer (and other info) about the manifest file, which is a log 78 // file we append to. 79 type manifestFile struct { 80 fp *os.File 81 directory string 82 // We make this configurable so that unit tests can hit rewrite() code quickly 83 deletionsRewriteThreshold int 84 85 // Guards appends, which includes access to the manifest field. 86 appendLock sync.Mutex 87 88 // Used to track the current state of the manifest, used when rewriting. 89 manifest Manifest 90 } 91 92 const ( 93 // ManifestFilename is the filename for the manifest file. 94 ManifestFilename = "MANIFEST" 95 manifestRewriteFilename = "MANIFEST-REWRITE" 96 manifestDeletionsRewriteThreshold = 10000 97 manifestDeletionsRatio = 10 98 ) 99 100 // asChanges returns a sequence of changes that could be used to recreate the Manifest in its 101 // present state. 102 func (m *Manifest) asChanges() []*protos.ManifestChange { 103 changes := make([]*protos.ManifestChange, 0, len(m.Tables)) 104 for id, tm := range m.Tables { 105 changes = append(changes, newCreateChange(id, int(tm.Level))) 106 } 107 return changes 108 } 109 110 func (m *Manifest) clone() Manifest { 111 changeSet := protos.ManifestChangeSet{Changes: m.asChanges()} 112 ret := createManifest() 113 y.Check(applyChangeSet(&ret, &changeSet)) 114 return ret 115 } 116 117 // openOrCreateManifestFile opens a Badger manifest file if it exists, or creates on if 118 // one doesn’t. 119 func openOrCreateManifestFile(dir string, readOnly bool) (ret *manifestFile, result Manifest, err error) { 120 return helpOpenOrCreateManifestFile(dir, readOnly, manifestDeletionsRewriteThreshold) 121 } 122 123 func helpOpenOrCreateManifestFile(dir string, readOnly bool, deletionsThreshold int) (ret *manifestFile, result Manifest, err error) { 124 path := filepath.Join(dir, ManifestFilename) 125 var flags uint32 126 if readOnly { 127 flags |= y.ReadOnly 128 } 129 fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock. 130 if err != nil { 131 if !os.IsNotExist(err) { 132 return nil, Manifest{}, err 133 } 134 if readOnly { 135 return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db") 136 } 137 m := createManifest() 138 fp, netCreations, err := helpRewrite(dir, &m) 139 if err != nil { 140 return nil, Manifest{}, err 141 } 142 y.Assert(netCreations == 0) 143 mf := &manifestFile{ 144 fp: fp, 145 directory: dir, 146 manifest: m.clone(), 147 deletionsRewriteThreshold: deletionsThreshold, 148 } 149 return mf, m, nil 150 } 151 152 manifest, truncOffset, err := ReplayManifestFile(fp) 153 if err != nil { 154 _ = fp.Close() 155 return nil, Manifest{}, err 156 } 157 158 if !readOnly { 159 // Truncate file so we don't have a half-written entry at the end. 160 if err := fp.Truncate(truncOffset); err != nil { 161 _ = fp.Close() 162 return nil, Manifest{}, err 163 } 164 } 165 if _, err = fp.Seek(0, io.SeekEnd); err != nil { 166 _ = fp.Close() 167 return nil, Manifest{}, err 168 } 169 170 mf := &manifestFile{ 171 fp: fp, 172 directory: dir, 173 manifest: manifest.clone(), 174 deletionsRewriteThreshold: deletionsThreshold, 175 } 176 return mf, manifest, nil 177 } 178 179 func (mf *manifestFile) close() error { 180 return mf.fp.Close() 181 } 182 183 // addChanges writes a batch of changes, atomically, to the file. By "atomically" that means when 184 // we replay the MANIFEST file, we'll either replay all the changes or none of them. (The truth of 185 // this depends on the filesystem -- some might append garbage data if a system crash happens at 186 // the wrong time.) 187 func (mf *manifestFile) addChanges(changesParam []*protos.ManifestChange, head *protos.HeadInfo) error { 188 changes := protos.ManifestChangeSet{Changes: changesParam, Head: head} 189 buf, err := changes.Marshal() 190 if err != nil { 191 return err 192 } 193 194 // Maybe we could use O_APPEND instead (on certain file systems) 195 mf.appendLock.Lock() 196 if err := applyChangeSet(&mf.manifest, &changes); err != nil { 197 mf.appendLock.Unlock() 198 return err 199 } 200 // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care 201 if mf.manifest.Deletions > mf.deletionsRewriteThreshold && 202 mf.manifest.Deletions > manifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) { 203 if err := mf.rewrite(); err != nil { 204 mf.appendLock.Unlock() 205 return err 206 } 207 } else { 208 var lenCrcBuf [8]byte 209 binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf))) 210 binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, y.CastagnoliCrcTable)) 211 buf = append(lenCrcBuf[:], buf...) 212 if _, err := mf.fp.Write(buf); err != nil { 213 mf.appendLock.Unlock() 214 return err 215 } 216 } 217 218 mf.appendLock.Unlock() 219 return mf.fp.Sync() 220 } 221 222 // Has to be 4 bytes. The value can never change, ever, anyway. 223 var magicText = [4]byte{'B', 'd', 'g', 'r'} 224 225 // The magic version number. 226 const magicVersion = 4 227 228 func helpRewrite(dir string, m *Manifest) (*os.File, int, error) { 229 rewritePath := filepath.Join(dir, manifestRewriteFilename) 230 // We explicitly sync. 231 fp, err := y.OpenTruncFile(rewritePath, false) 232 if err != nil { 233 return nil, 0, err 234 } 235 236 buf := make([]byte, 8) 237 copy(buf[0:4], magicText[:]) 238 binary.BigEndian.PutUint32(buf[4:8], magicVersion) 239 240 netCreations := len(m.Tables) 241 changes := m.asChanges() 242 set := protos.ManifestChangeSet{Changes: changes, Head: m.Head} 243 244 changeBuf, err := set.Marshal() 245 if err != nil { 246 fp.Close() 247 return nil, 0, err 248 } 249 var lenCrcBuf [8]byte 250 binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf))) 251 binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, y.CastagnoliCrcTable)) 252 buf = append(buf, lenCrcBuf[:]...) 253 buf = append(buf, changeBuf...) 254 if _, err := fp.Write(buf); err != nil { 255 fp.Close() 256 return nil, 0, err 257 } 258 if err := fp.Sync(); err != nil { 259 fp.Close() 260 return nil, 0, err 261 } 262 263 // In Windows the files should be closed before doing a Rename. 264 if err = fp.Close(); err != nil { 265 return nil, 0, err 266 } 267 manifestPath := filepath.Join(dir, ManifestFilename) 268 if err := os.Rename(rewritePath, manifestPath); err != nil { 269 return nil, 0, err 270 } 271 fp, err = y.OpenExistingFile(manifestPath, 0) 272 if err != nil { 273 return nil, 0, err 274 } 275 if _, err := fp.Seek(0, io.SeekEnd); err != nil { 276 fp.Close() 277 return nil, 0, err 278 } 279 if err := syncDir(dir); err != nil { 280 fp.Close() 281 return nil, 0, err 282 } 283 284 return fp, netCreations, nil 285 } 286 287 // Must be called while appendLock is held. 288 func (mf *manifestFile) rewrite() error { 289 // In Windows the files should be closed before doing a Rename. 290 if err := mf.fp.Close(); err != nil { 291 return err 292 } 293 fp, netCreations, err := helpRewrite(mf.directory, &mf.manifest) 294 if err != nil { 295 return err 296 } 297 mf.fp = fp 298 mf.manifest.Creations = netCreations 299 mf.manifest.Deletions = 0 300 301 return nil 302 } 303 304 type countingReader struct { 305 wrapped *bufio.Reader 306 count int64 307 } 308 309 func (r *countingReader) Read(p []byte) (n int, err error) { 310 n, err = r.wrapped.Read(p) 311 r.count += int64(n) 312 return 313 } 314 315 func (r *countingReader) ReadByte() (b byte, err error) { 316 b, err = r.wrapped.ReadByte() 317 if err == nil { 318 r.count++ 319 } 320 return 321 } 322 323 var ( 324 errBadMagic = errors.New("manifest has bad magic") 325 ) 326 327 // ReplayManifestFile reads the manifest file and constructs two manifest objects. (We need one 328 // immutable copy and one mutable copy of the manifest. Easiest way is to construct two of them.) 329 // Also, returns the last offset after a completely read manifest entry -- the file must be 330 // truncated at that point before further appends are made (if there is a partial entry after 331 // that). In normal conditions, truncOffset is the file size. 332 func ReplayManifestFile(fp *os.File) (ret Manifest, truncOffset int64, err error) { 333 r := countingReader{wrapped: bufio.NewReader(fp)} 334 335 var magicBuf [8]byte 336 if _, err := io.ReadFull(&r, magicBuf[:]); err != nil { 337 return Manifest{}, 0, errBadMagic 338 } 339 if !bytes.Equal(magicBuf[0:4], magicText[:]) { 340 return Manifest{}, 0, errBadMagic 341 } 342 version := binary.BigEndian.Uint32(magicBuf[4:8]) 343 if version != magicVersion { 344 return Manifest{}, 0, 345 fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, magicVersion) 346 } 347 348 build := createManifest() 349 var offset int64 350 for { 351 offset = r.count 352 var lenCrcBuf [8]byte 353 _, err := io.ReadFull(&r, lenCrcBuf[:]) 354 if err != nil { 355 if err == io.EOF || err == io.ErrUnexpectedEOF { 356 break 357 } 358 return Manifest{}, 0, err 359 } 360 length := binary.BigEndian.Uint32(lenCrcBuf[0:4]) 361 var buf = make([]byte, length) 362 if _, err := io.ReadFull(&r, buf); err != nil { 363 if err == io.EOF || err == io.ErrUnexpectedEOF { 364 break 365 } 366 return Manifest{}, 0, err 367 } 368 if crc32.Checksum(buf, y.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) { 369 break 370 } 371 372 var changeSet protos.ManifestChangeSet 373 if err := changeSet.Unmarshal(buf); err != nil { 374 return Manifest{}, 0, err 375 } 376 377 if err := applyChangeSet(&build, &changeSet); err != nil { 378 return Manifest{}, 0, err 379 } 380 } 381 382 return build, offset, err 383 } 384 385 func addNewToManifest(build *Manifest, tc *protos.ManifestChange) { 386 build.Tables[tc.Id] = tableManifest{ 387 Level: uint8(tc.Level), 388 } 389 for len(build.Levels) <= int(tc.Level) { 390 build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})}) 391 } 392 build.Levels[tc.Level].Tables[tc.Id] = struct{}{} 393 build.Creations++ 394 } 395 396 func applyManifestChange(build *Manifest, tc *protos.ManifestChange) error { 397 switch tc.Op { 398 case protos.ManifestChange_CREATE: 399 if _, ok := build.Tables[tc.Id]; ok { 400 return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id) 401 } 402 addNewToManifest(build, tc) 403 case protos.ManifestChange_DELETE: 404 tm, ok := build.Tables[tc.Id] 405 if !ok { 406 return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id) 407 } 408 delete(build.Levels[tm.Level].Tables, tc.Id) 409 delete(build.Tables, tc.Id) 410 build.Deletions++ 411 case protos.ManifestChange_MOVE_DOWN: 412 tm, ok := build.Tables[tc.Id] 413 if !ok { 414 return fmt.Errorf("MANIFEST moves down non-exisitng table %d", tc.Id) 415 } 416 delete(build.Levels[tm.Level].Tables, tc.Id) 417 delete(build.Tables, tc.Id) 418 build.Deletions++ 419 addNewToManifest(build, tc) 420 default: 421 return fmt.Errorf("MANIFEST file has invalid manifestChange op") 422 } 423 return nil 424 } 425 426 // This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is 427 // just plain broken. 428 func applyChangeSet(build *Manifest, changeSet *protos.ManifestChangeSet) error { 429 for _, change := range changeSet.Changes { 430 if err := applyManifestChange(build, change); err != nil { 431 return err 432 } 433 } 434 if changeSet.Head != nil { 435 build.Head = changeSet.Head 436 } 437 return nil 438 } 439 440 func newCreateChange( 441 id uint64, level int) *protos.ManifestChange { 442 return &protos.ManifestChange{ 443 Id: id, 444 Op: protos.ManifestChange_CREATE, 445 Level: uint32(level), 446 } 447 } 448 449 func newDeleteChange(id uint64) *protos.ManifestChange { 450 return &protos.ManifestChange{ 451 Id: id, 452 Op: protos.ManifestChange_DELETE, 453 } 454 } 455 456 func newMoveDownChange(id uint64, moveToLevel int) *protos.ManifestChange { 457 return &protos.ManifestChange{ 458 Id: id, 459 Op: protos.ManifestChange_MOVE_DOWN, 460 Level: uint32(moveToLevel), 461 } 462 }