github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/ingest.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "fmt" 9 "sort" 10 11 "github.com/petermattis/pebble/internal/base" 12 "github.com/petermattis/pebble/sstable" 13 "github.com/petermattis/pebble/vfs" 14 ) 15 16 func sstableKeyCompare(userCmp Compare, a, b InternalKey) int { 17 c := userCmp(a.UserKey, b.UserKey) 18 if c != 0 { 19 return c 20 } 21 if a.Trailer == InternalKeyRangeDeleteSentinel { 22 if b.Trailer != InternalKeyRangeDeleteSentinel { 23 return -1 24 } 25 } else if b.Trailer == InternalKeyRangeDeleteSentinel { 26 return 1 27 } 28 return 0 29 } 30 31 func ingestLoad1(opts *Options, path string, dbNum, fileNum uint64) (*fileMetadata, error) { 32 stat, err := opts.FS.Stat(path) 33 if err != nil { 34 return nil, err 35 } 36 37 f, err := opts.FS.Open(path) 38 if err != nil { 39 return nil, err 40 } 41 42 r, err := sstable.NewReader(f, dbNum, fileNum, opts) 43 defer r.Close() 44 if err != nil { 45 return nil, err 46 } 47 48 meta := &fileMetadata{} 49 meta.FileNum = fileNum 50 meta.Size = uint64(stat.Size()) 51 meta.Smallest = InternalKey{} 52 meta.Largest = InternalKey{} 53 smallestSet, largestSet := false, false 54 55 { 56 iter := r.NewIter(nil /* lower */, nil /* upper */) 57 defer iter.Close() 58 if key, _ := iter.First(); key != nil { 59 meta.Smallest = key.Clone() 60 smallestSet = true 61 } 62 if key, _ := iter.Last(); key != nil { 63 meta.Largest = key.Clone() 64 largestSet = true 65 } 66 if err := iter.Error(); err != nil { 67 return nil, err 68 } 69 } 70 71 if iter := r.NewRangeDelIter(); iter != nil { 72 defer iter.Close() 73 if key, _ := iter.First(); key != nil { 74 if !smallestSet || 75 base.InternalCompare(opts.Comparer.Compare, meta.Smallest, *key) > 0 { 76 meta.Smallest = key.Clone() 77 } 78 } 79 if key, val := iter.Last(); key != nil { 80 end := base.MakeRangeDeleteSentinelKey(val) 81 if !largestSet || 82 base.InternalCompare(opts.Comparer.Compare, meta.Largest, end) < 0 { 83 meta.Largest = end.Clone() 84 } 85 } 86 } 87 88 return meta, nil 89 } 90 91 func ingestLoad( 92 opts *Options, paths []string, dbNum uint64, pending []uint64, 93 ) ([]*fileMetadata, error) { 94 meta := make([]*fileMetadata, len(paths)) 95 for i := range paths { 96 var err error 97 meta[i], err = ingestLoad1(opts, paths[i], dbNum, pending[i]) 98 if err != nil { 99 return nil, err 100 } 101 } 102 return meta, nil 103 } 104 105 func ingestSortAndVerify(cmp Compare, meta []*fileMetadata) error { 106 if len(meta) <= 1 { 107 return nil 108 } 109 110 sort.Slice(meta, func(i, j int) bool { 111 return cmp(meta[i].Smallest.UserKey, meta[j].Smallest.UserKey) < 0 112 }) 113 114 for i := 1; i < len(meta); i++ { 115 if sstableKeyCompare(cmp, meta[i-1].Largest, meta[i].Smallest) >= 0 { 116 return fmt.Errorf("files have overlapping ranges") 117 } 118 } 119 return nil 120 } 121 122 func ingestCleanup(fs vfs.FS, dirname string, meta []*fileMetadata) error { 123 var firstErr error 124 for i := range meta { 125 target := base.MakeFilename(dirname, fileTypeTable, meta[i].FileNum) 126 if err := fs.Remove(target); err != nil { 127 if firstErr != nil { 128 firstErr = err 129 } 130 } 131 } 132 return firstErr 133 } 134 135 func ingestLink(opts *Options, dirname string, paths []string, meta []*fileMetadata) error { 136 for i := range paths { 137 target := base.MakeFilename(dirname, fileTypeTable, meta[i].FileNum) 138 err := opts.FS.Link(paths[i], target) 139 if err != nil { 140 if err2 := ingestCleanup(opts.FS, dirname, meta[:i]); err2 != nil { 141 opts.Logger.Infof("ingest cleanup failed: %v", err2) 142 } 143 return err 144 } 145 } 146 147 return nil 148 } 149 150 func ingestMemtableOverlaps(cmp Compare, mem flushable, meta []*fileMetadata) bool { 151 { 152 // Check overlap with point operations. 153 iter := mem.newIter(nil) 154 defer iter.Close() 155 156 for _, m := range meta { 157 key, _ := iter.SeekGE(m.Smallest.UserKey) 158 if key == nil { 159 continue 160 } 161 if cmp(key.UserKey, m.Largest.UserKey) <= 0 { 162 return true 163 } 164 } 165 } 166 167 // Check overlap with range deletions. 168 if iter := mem.newRangeDelIter(nil); iter != nil { 169 defer iter.Close() 170 for _, m := range meta { 171 key, val := iter.SeekLT(m.Smallest.UserKey) 172 if key == nil { 173 key, val = iter.Next() 174 } 175 for ; key != nil; key, val = iter.Next() { 176 if cmp(key.UserKey, m.Largest.UserKey) > 0 { 177 // The start of the tombstone is after the largest key in the 178 // ingested table. 179 break 180 } 181 if cmp(val, m.Smallest.UserKey) > 0 { 182 // The end of the tombstone is greater than the smallest in the 183 // table. Note that the tombstone end key is exclusive, thus ">0" 184 // instead of ">=0". 185 return true 186 } 187 } 188 } 189 } 190 191 return false 192 } 193 194 func ingestUpdateSeqNum(opts *Options, dirname string, seqNum uint64, meta []*fileMetadata) error { 195 for _, m := range meta { 196 m.Smallest = base.MakeInternalKey(m.Smallest.UserKey, seqNum, m.Smallest.Kind()) 197 m.Largest = base.MakeInternalKey(m.Largest.UserKey, seqNum, m.Largest.Kind()) 198 // Setting smallestSeqNum == largestSeqNum triggers the setting of 199 // Properties.GlobalSeqNum when an sstable is loaded. 200 m.SmallestSeqNum = seqNum 201 m.LargestSeqNum = seqNum 202 seqNum++ 203 204 // TODO(peter): Update the global sequence number property. This is only 205 // necessary for compatibility with RocksDB. 206 } 207 return nil 208 } 209 210 func ingestTargetLevel(cmp Compare, v *version, meta *fileMetadata) int { 211 // Find the lowest level which does not have any files which overlap meta. 212 if len(v.Overlaps(0, cmp, meta.Smallest.UserKey, meta.Largest.UserKey)) != 0 { 213 return 0 214 } 215 216 level := 1 217 for ; level < numLevels; level++ { 218 if len(v.Overlaps(level, cmp, meta.Smallest.UserKey, meta.Largest.UserKey)) != 0 { 219 break 220 } 221 } 222 return level - 1 223 } 224 225 // Ingest ingests a set of sstables into the DB. Ingestion of the files is 226 // atomic and semantically equivalent to creating a single batch containing all 227 // of the mutations in the sstables. Ingestion may require the memtable to be 228 // flushed. The ingested sstable files are moved into the DB and must reside on 229 // the same filesystem as the DB. Sstables can be created for ingestion using 230 // sstable.Writer. 231 // 232 // Ingestion loads each sstable into the lowest level of the LSM which it 233 // doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable, 234 // ingestion forces the memtable to flush, and then waits for the flush to 235 // occur. 236 // 237 // The steps for ingestion are: 238 // 239 // 1. Allocate file numbers for every sstable beign ingested. 240 // 2. Load the metadata for all sstables being ingest. 241 // 3. Sort the sstables by smallest key, verifying non overlap. 242 // 4. Hard link the sstables into the DB directory. 243 // 5. Allocate a sequence number to use for all of the entries in the 244 // sstables. This is the step where overlap with memtables is 245 // determined. If there is overlap, we remember the most recent memtable 246 // that overlaps. 247 // 6. Update the sequence number in the ingested sstables. 248 // 7. Wait for the most recent memtable that overlaps to flush (if any). 249 // 8. Add the ingested sstables to the version (DB.ingestApply). 250 // 9. Publish the ingestion sequence number. 251 // 252 // Note that if the mutable memtable overlaps with ingestion, a flush of the 253 // memtable is forced equivalent to DB.Flush. Additionally, subsequent 254 // mutations that get sequence numbers larger than the ingestion sequence 255 // number get queued up behind the ingestion waiting for it to complete. This 256 // can produce a noticeable hiccup in performance. See 257 // https://github.com/petermattis/pebble/issues/25 for an idea for how to fix 258 // this hiccup. 259 func (d *DB) Ingest(paths []string) error { 260 // Allocate file numbers for all of the files being ingested and mark them as 261 // pending in order to prevent them from being deleted. Note that this causes 262 // the file number ordering to be out of alignment with sequence number 263 // ordering. The sorting of L0 tables by sequence number avoids relying on 264 // that (busted) invariant. 265 d.mu.Lock() 266 pendingOutputs := make([]uint64, len(paths)) 267 for i := range paths { 268 pendingOutputs[i] = d.mu.versions.getNextFileNum() 269 } 270 for _, fileNum := range pendingOutputs { 271 d.mu.compact.pendingOutputs[fileNum] = struct{}{} 272 } 273 jobID := d.mu.nextJobID 274 d.mu.nextJobID++ 275 d.mu.Unlock() 276 277 defer func() { 278 d.mu.Lock() 279 for _, fileNum := range pendingOutputs { 280 delete(d.mu.compact.pendingOutputs, fileNum) 281 } 282 d.mu.Unlock() 283 }() 284 285 // Load the metadata for all of the files being ingested. 286 meta, err := ingestLoad(d.opts, paths, d.dbNum, pendingOutputs) 287 if err != nil { 288 return err 289 } 290 291 // Verify the sstables do not overlap. 292 if err := ingestSortAndVerify(d.cmp, meta); err != nil { 293 return err 294 } 295 296 // Hard link the sstables into the DB directory. Since the sstables aren't 297 // referenced by a version, they won't be used. If the hard linking fails 298 // (e.g. because the files reside on a different filesystem) we undo our work 299 // and return an error. 300 if err := ingestLink(d.opts, d.dirname, paths, meta); err != nil { 301 return err 302 } 303 // Fsync the directory we added the tables to. We need to do this at some 304 // point before we update the MANIFEST (via logAndApply), otherwise a crash 305 // can have the tables referenced in the MANIFEST, but not present in the 306 // directory. 307 if err := d.dataDir.Sync(); err != nil { 308 return err 309 } 310 311 var mem flushable 312 prepare := func() { 313 d.mu.Lock() 314 defer d.mu.Unlock() 315 316 // If the mutable memtable contains keys which overlap any of the sstables 317 // then flush the memtable. Note that apply will wait for the flushing to 318 // finish. 319 if ingestMemtableOverlaps(d.cmp, d.mu.mem.mutable, meta) { 320 mem = d.mu.mem.mutable 321 err = d.makeRoomForWrite(nil) 322 return 323 } 324 325 // Check to see if any files overlap with any of the immutable 326 // memtables. The queue is ordered from oldest to newest. We want to wait 327 // for the newest table that overlaps. 328 for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { 329 m := d.mu.mem.queue[i] 330 if ingestMemtableOverlaps(d.cmp, m, meta) { 331 mem = m 332 return 333 } 334 } 335 } 336 337 var ve *versionEdit 338 apply := func(seqNum uint64) { 339 if err != nil { 340 // An error occurred during prepare. 341 return 342 } 343 344 // Update the sequence number for all of the sstables, both in the metadata 345 // and the global sequence number property on disk. 346 if err = ingestUpdateSeqNum(d.opts, d.dirname, seqNum, meta); err != nil { 347 return 348 } 349 350 // If we flushed the mutable memtable in prepare wait for the flush to 351 // finish. 352 if mem != nil { 353 <-mem.flushed() 354 } 355 356 // Assign the sstables to the correct level in the LSM and apply the 357 // version edit. 358 ve, err = d.ingestApply(jobID, meta) 359 } 360 361 d.commit.AllocateSeqNum(len(meta), prepare, apply) 362 363 if err != nil { 364 if err2 := ingestCleanup(d.opts.FS, d.dirname, meta); err2 != nil { 365 d.opts.Logger.Infof("ingest cleanup failed: %v", err2) 366 } 367 } 368 369 if d.opts.EventListener.TableIngested != nil { 370 info := TableIngestInfo{ 371 JobID: jobID, 372 GlobalSeqNum: meta[0].SmallestSeqNum, 373 Err: err, 374 } 375 if ve != nil { 376 info.Tables = make([]struct { 377 TableInfo 378 Level int 379 }, len(ve.NewFiles)) 380 for i := range ve.NewFiles { 381 e := &ve.NewFiles[i] 382 info.Tables[i].Level = e.Level 383 info.Tables[i].TableInfo = e.Meta.TableInfo(d.dirname) 384 } 385 } 386 d.opts.EventListener.TableIngested(info) 387 } 388 389 return err 390 } 391 392 func (d *DB) ingestApply(jobID int, meta []*fileMetadata) (*versionEdit, error) { 393 d.mu.Lock() 394 defer d.mu.Unlock() 395 396 ve := &versionEdit{ 397 NewFiles: make([]newFileEntry, len(meta)), 398 } 399 metrics := make(map[int]*LevelMetrics) 400 current := d.mu.versions.currentVersion() 401 for i := range meta { 402 // Determine the lowest level in the LSM for which the sstable doesn't 403 // overlap any existing files in the level. 404 m := meta[i] 405 f := &ve.NewFiles[i] 406 f.Level = ingestTargetLevel(d.cmp, current, m) 407 f.Meta = *m 408 levelMetrics := metrics[f.Level] 409 if levelMetrics == nil { 410 levelMetrics = &LevelMetrics{} 411 metrics[f.Level] = levelMetrics 412 } 413 levelMetrics.BytesIngested += m.Size 414 } 415 if err := d.mu.versions.logAndApply(jobID, ve, metrics, d.dataDir); err != nil { 416 return nil, err 417 } 418 d.updateReadStateLocked() 419 return ve, nil 420 }