github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/open.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"sync"

	"github.com/petermattis/pebble/internal/arenaskl"
	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/internal/rate"
	"github.com/petermattis/pebble/internal/record"
	"github.com/petermattis/pebble/vfs"
)

// dbNumAlloc hands out process-wide unique DB numbers, used to keep the
// state of independently opened DBs (such as table cache entries) distinct.
var dbNumAlloc = struct {
	sync.Mutex
	seq uint64
}{seq: 1}

func allocDBNum() uint64 {
	dbNumAlloc.Lock()
	num := dbNumAlloc.seq
	dbNumAlloc.seq++
	dbNumAlloc.Unlock()
	return num
}

// createDB initializes a new database in dirname by writing an initial
// MANIFEST containing a bare version edit and pointing CURRENT at it.
func createDB(dirname string, opts *Options) (retErr error) {
	const manifestFileNum = 1
	ve := versionEdit{
		ComparerName: opts.Comparer.Name,
		NextFileNum:  manifestFileNum + 1,
	}
	manifestFilename := base.MakeFilename(dirname, fileTypeManifest, manifestFileNum)
	f, err := opts.FS.Create(manifestFilename)
	if err != nil {
		return fmt.Errorf("pebble: could not create %q: %v", manifestFilename, err)
	}
	defer func() {
		if retErr != nil {
			opts.FS.Remove(manifestFilename)
		}
	}()
	defer f.Close()

	recWriter := record.NewWriter(f)
	w, err := recWriter.Next()
	if err != nil {
		return err
	}
	err = ve.Encode(w)
	if err != nil {
		return err
	}
	err = recWriter.Close()
	if err != nil {
		return err
	}
	return setCurrentFile(dirname, opts.FS, manifestFileNum)
}

// Open opens a DB whose files live in the given directory.
func Open(dirname string, opts *Options) (*DB, error) {
	// Make a copy of the options so that we don't mutate the passed-in options.
	opts = opts.Clone()
	opts = opts.EnsureDefaults()

	d := &DB{
		dbNum:          allocDBNum(),
		dirname:        dirname,
		walDirname:     opts.WALDir,
		opts:           opts,
		cmp:            opts.Comparer.Compare,
		equal:          opts.Comparer.Equal,
		merge:          opts.Merger.Merge,
		split:          opts.Comparer.Split,
		abbreviatedKey: opts.Comparer.AbbreviatedKey,
		logRecycler:    logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
	}
	if d.equal == nil {
		d.equal = bytes.Equal
	}
	tableCacheSize := opts.MaxOpenFiles - numNonTableCacheFiles
	if tableCacheSize < minTableCacheSize {
		tableCacheSize = minTableCacheSize
	}
	d.tableCache.init(d.dbNum, dirname, opts.FS, d.opts, tableCacheSize, defaultTableCacheHitBuffer)
	d.newIters = d.tableCache.newIters
	d.commit = newCommitPipeline(commitEnv{
		logSeqNum:     &d.mu.versions.logSeqNum,
		visibleSeqNum: &d.mu.versions.visibleSeqNum,
		apply:         d.commitApply,
		write:         d.commitWrite,
	})
	d.compactionLimiter = rate.NewLimiter(rate.Limit(d.opts.MinCompactionRate), d.opts.MinCompactionRate)
	d.flushLimiter = rate.NewLimiter(rate.Limit(d.opts.MinFlushRate), d.opts.MinFlushRate)
	d.mu.nextJobID = 1
	d.mu.mem.cond.L = &d.mu.Mutex
	d.mu.mem.mutable = newMemTable(d.opts)
	d.mu.mem.queue = append(d.mu.mem.queue, d.mu.mem.mutable)
	d.mu.cleaner.cond.L = &d.mu.Mutex
	d.mu.compact.cond.L = &d.mu.Mutex
	d.mu.compact.pendingOutputs = make(map[uint64]struct{})
	d.mu.snapshots.init()
	d.largeBatchThreshold = (d.opts.MemTableSize - int(d.mu.mem.mutable.emptySize)) / 2

	d.mu.Lock()
	defer d.mu.Unlock()
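	// A minimal sketch of how a caller typically reaches this point (the
	// path and the explicit vfs.Default are illustrative assumptions;
	// EnsureDefaults supplies a default FS when none is set):
	//
	//	db, err := pebble.Open("/var/data/mydb", &pebble.Options{FS: vfs.Default})
	//	if err != nil {
	//		// e.g. the LOCK file is held by another process, or an I/O error.
	//	}
	//	defer db.Close()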
	// Lock the database directory.
	if !d.opts.ReadOnly {
		err := opts.FS.MkdirAll(dirname, 0755)
		if err != nil {
			return nil, err
		}
	}
	fileLock, err := opts.FS.Lock(base.MakeFilename(dirname, fileTypeLock, 0))
	if err != nil {
		return nil, err
	}
	defer func() {
		if fileLock != nil {
			fileLock.Close()
		}
	}()

	d.dataDir, err = opts.FS.OpenDir(dirname)
	if err != nil {
		return nil, err
	}
	if d.walDirname == "" {
		d.walDirname = d.dirname
	}
	if d.walDirname == d.dirname {
		d.walDir = d.dataDir
	} else {
		if !d.opts.ReadOnly {
			err := opts.FS.MkdirAll(d.walDirname, 0755)
			if err != nil {
				return nil, err
			}
		}
		d.walDir, err = opts.FS.OpenDir(d.walDirname)
		if err != nil {
			return nil, err
		}
	}

	if _, err := opts.FS.Stat(base.MakeFilename(dirname, fileTypeCurrent, 0)); os.IsNotExist(err) && !d.opts.ReadOnly {
		// Create the DB if it did not already exist.
		if err := createDB(dirname, opts); err != nil {
			return nil, err
		}
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	} else if err != nil {
		return nil, fmt.Errorf("pebble: database %q: %v", dirname, err)
	} else if opts.ErrorIfDBExists {
		return nil, fmt.Errorf("pebble: database %q already exists", dirname)
	}

	// Load the version set.
	err = d.mu.versions.load(dirname, opts, &d.mu.Mutex)
	if err != nil {
		return nil, err
	}

	ls, err := opts.FS.List(d.walDirname)
	if err != nil {
		return nil, err
	}
	if d.dirname != d.walDirname {
		ls2, err := opts.FS.List(d.dirname)
		if err != nil {
			return nil, err
		}
		ls = append(ls, ls2...)
	}

	// Replay any log files that are newer than the ones named in the manifest.
	type fileNumAndName struct {
		num  uint64
		name string
	}
	var logFiles []fileNumAndName
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(filename)
		if !ok {
			continue
		}
		switch ft {
		case fileTypeLog:
			if fn >= d.mu.versions.logNum || fn == d.mu.versions.prevLogNum {
				logFiles = append(logFiles, fileNumAndName{fn, filename})
			}
		case fileTypeOptions:
			if err := checkOptions(opts, filepath.Join(dirname, filename)); err != nil {
				return nil, err
			}
		}
	}
	sort.Slice(logFiles, func(i, j int) bool {
		return logFiles[i].num < logFiles[j].num
	})

	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	var ve versionEdit
	for _, lf := range logFiles {
		maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS, filepath.Join(d.walDirname, lf.name), lf.num)
		if err != nil {
			return nil, err
		}
		d.mu.versions.markFileNumUsed(lf.num)
		if d.mu.versions.logSeqNum < maxSeqNum {
			d.mu.versions.logSeqNum = maxSeqNum
		}
	}
	d.mu.versions.visibleSeqNum = d.mu.versions.logSeqNum

	if !d.opts.ReadOnly {
		// Create an empty .log file.
		ve.LogNum = d.mu.versions.getNextFileNum()
		d.mu.log.queue = append(d.mu.log.queue, ve.LogNum)
		logFile, err := opts.FS.Create(base.MakeFilename(d.walDirname, fileTypeLog, ve.LogNum))
		if err != nil {
			return nil, err
		}
		if err := d.walDir.Sync(); err != nil {
			return nil, err
		}
		logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
			BytesPerSync:    d.opts.BytesPerSync,
			PreallocateSize: d.walPreallocateSize(),
		})
		d.mu.log.LogWriter = record.NewLogWriter(logFile, ve.LogNum)
		d.mu.versions.metrics.WAL.Files++
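		// At this point ve carries everything recovery produced: the file
		// number of the fresh WAL (ve.LogNum) and any tables created while
		// replaying older WALs (ve.NewFiles). Logging and applying it below
		// persists that state. Roughly (a sketch of the flow, not the exact
		// logAndApply implementation):
		//
		//	encode ve -> append as a record to the MANIFEST -> sync
		//	          -> merge ve into the current version -> install result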
		// Write a new manifest to disk.
		if err := d.mu.versions.logAndApply(0, &ve, nil, d.dataDir); err != nil {
			return nil, err
		}
	}
	d.updateReadStateLocked()

	if !d.opts.ReadOnly {
		// Write the current options to disk.
		d.optionsFileNum = d.mu.versions.getNextFileNum()
		optionsFile, err := opts.FS.Create(base.MakeFilename(dirname, fileTypeOptions, d.optionsFileNum))
		if err != nil {
			return nil, err
		}
		if _, err := optionsFile.Write([]byte(opts.String())); err != nil {
			return nil, err
		}
		optionsFile.Close()
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		d.scanObsoleteFiles(ls)
		d.deleteObsoleteFiles(jobID)
	}
	d.maybeScheduleFlush()
	d.maybeScheduleCompaction()

	d.fileLock, fileLock = fileLock, nil
	return d, nil
}

// replayWAL replays the edits in the specified log file.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) replayWAL(
	jobID int,
	ve *versionEdit,
	fs vfs.FS,
	filename string,
	logNum uint64,
) (maxSeqNum uint64, err error) {
	file, err := fs.Open(filename)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	var (
		b   Batch
		buf bytes.Buffer
		mem *memTable
		rr  = record.NewReader(file, logNum)
	)

	// In read-only mode, we replay directly into the mutable memtable, which
	// will never be flushed.
	if d.opts.ReadOnly {
		mem = d.mu.mem.mutable
	}

	for {
		r, err := rr.Next()
		if err == nil {
			_, err = io.Copy(&buf, r)
		}
		if err != nil {
			// It is common to encounter a zeroed or invalid chunk due to WAL
			// preallocation and WAL recycling. We need to distinguish these errors
			// from EOF in order to recognize that the record was truncated, but want
			// to otherwise treat them like EOF.
			if err == io.EOF || err == record.ErrZeroedChunk || err == record.ErrInvalidChunk {
				break
			}
			return 0, err
		}

		if buf.Len() < batchHeaderLen {
			return 0, fmt.Errorf("pebble: corrupt log file %q", filename)
		}

		// TODO(peter): If the batch is too large to fit in the memtable, flush the
		// existing memtable and write the batch as a separate L0 table.
		b = Batch{}
		b.SetRepr(buf.Bytes())
		seqNum := b.SeqNum()
		maxSeqNum = seqNum + uint64(b.Count())

		if mem == nil {
			mem = newMemTable(d.opts)
		}

		if err := mem.prepare(&b); err != nil {
			if err == arenaskl.ErrArenaFull {
				// TODO(peter): write the memtable to disk.
				panic(err)
			}
			return 0, err
		}

		if err := mem.apply(&b, seqNum); err != nil {
			return 0, err
		}
		mem.unref()

		buf.Reset()
	}
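	// Each WAL record replayed above is the serialized form of a Batch: a
	// 12-byte header (an 8-byte little-endian sequence number followed by a
	// 4-byte little-endian entry count) and then the entries themselves,
	// which is why maxSeqNum advances by b.Count() per batch. For example, a
	// batch with seqNum 100 and 3 entries occupies sequence numbers 100-102,
	// leaving maxSeqNum at 103.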
	if d.opts.ReadOnly {
		// In read-only mode, each WAL file is replayed into its own memtable.
		// This is done so that WAL metrics can be reported accurately.
		mem.logSize = uint64(rr.Offset())
		d.mu.mem.mutable = newMemTable(d.opts)
		d.mu.mem.queue = append(d.mu.mem.queue, d.mu.mem.mutable)
		d.mu.versions.metrics.WAL.Files++
	} else if mem != nil && !mem.empty() {
		c := newFlush(d.opts, d.mu.versions.currentVersion(),
			1 /* base level */, []flushable{mem}, &d.bytesFlushed)
		newVE, pendingOutputs, err := d.runCompaction(jobID, c, nilPacer)
		if err != nil {
			return 0, err
		}
		ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
		// Strictly speaking, it's too early to delete from
		// d.mu.compact.pendingOutputs, but we are replaying the log file, which
		// happens before Open returns, so there is no possibility of
		// deleteObsoleteFiles being called concurrently here.
		for _, fileNum := range pendingOutputs {
			delete(d.mu.compact.pendingOutputs, fileNum)
		}
	}

	return maxSeqNum, nil
}

// checkOptions reads the OPTIONS file at path and verifies that its contents
// are compatible with opts.
func checkOptions(opts *Options, path string) error {
	f, err := opts.FS.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	data, err := ioutil.ReadAll(f)
	if err != nil {
		return err
	}
	return opts.Check(string(data))
}
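// A sketch of the failure checkOptions guards against: reopening a database
// with options incompatible with those it was created with (the comparer
// name and the error wording below are illustrative assumptions, not the
// exact messages produced by opts.Check):
//
//	// DB previously created with a custom comparer named "custom.v1",
//	// then reopened with the default comparer:
//	_, err := pebble.Open(dir, &pebble.Options{})
//	// err != nil: the persisted OPTIONS file records comparer "custom.v1",
//	// which does not match the default "leveldb.BytewiseComparator".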