github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/keyspan/fragmenter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package keyspan 6 7 import ( 8 "fmt" 9 "sort" 10 11 "github.com/cockroachdb/pebble/internal/base" 12 "github.com/cockroachdb/pebble/internal/invariants" 13 ) 14 15 type spansByStartKey struct { 16 cmp base.Compare 17 buf []Span 18 } 19 20 func (v *spansByStartKey) Len() int { return len(v.buf) } 21 func (v *spansByStartKey) Less(i, j int) bool { 22 return v.cmp(v.buf[i].Start, v.buf[j].Start) < 0 23 } 24 func (v *spansByStartKey) Swap(i, j int) { 25 v.buf[i], v.buf[j] = v.buf[j], v.buf[i] 26 } 27 28 type spansByEndKey struct { 29 cmp base.Compare 30 buf []Span 31 } 32 33 func (v *spansByEndKey) Len() int { return len(v.buf) } 34 func (v *spansByEndKey) Less(i, j int) bool { 35 return v.cmp(v.buf[i].End, v.buf[j].End) < 0 36 } 37 func (v *spansByEndKey) Swap(i, j int) { 38 v.buf[i], v.buf[j] = v.buf[j], v.buf[i] 39 } 40 41 // keysBySeqNumKind sorts spans by the start key's sequence number in 42 // descending order. If two spans have equal sequence number, they're compared 43 // by key kind in descending order. This ordering matches the ordering of 44 // base.InternalCompare among keys with matching user keys. 45 type keysBySeqNumKind []Key 46 47 func (v *keysBySeqNumKind) Len() int { return len(*v) } 48 func (v *keysBySeqNumKind) Less(i, j int) bool { return (*v)[i].Trailer > (*v)[j].Trailer } 49 func (v *keysBySeqNumKind) Swap(i, j int) { (*v)[i], (*v)[j] = (*v)[j], (*v)[i] } 50 51 // Sort the spans by start key. This is the ordering required by the 52 // Fragmenter. Usually spans are naturally sorted by their start key, 53 // but that isn't true for range deletion tombstones in the legacy 54 // range-del-v1 block format. 55 func Sort(cmp base.Compare, spans []Span) { 56 sorter := spansByStartKey{ 57 cmp: cmp, 58 buf: spans, 59 } 60 sort.Sort(&sorter) 61 } 62 63 // Fragmenter fragments a set of spans such that overlapping spans are 64 // split at their overlap points. The fragmented spans are output to the 65 // supplied Output function. 66 type Fragmenter struct { 67 Cmp base.Compare 68 Format base.FormatKey 69 // Emit is called to emit a fragmented span and its keys. Every key defined 70 // within the emitted Span applies to the entirety of the Span's key span. 71 // Keys are ordered in decreasing order of their sequence numbers, and if 72 // equal, decreasing order of key kind. 73 Emit func(Span) 74 // pending contains the list of pending fragments that have not been 75 // flushed to the block writer. Note that the spans have not been 76 // fragmented on the end keys yet. That happens as the spans are 77 // flushed. All pending spans have the same Start. 78 pending []Span 79 // doneBuf is used to buffer completed span fragments when flushing to a 80 // specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to 81 // allow reuse. 82 doneBuf []Span 83 // sortBuf is used to sort fragments by end key when flushing. 84 sortBuf spansByEndKey 85 // flushBuf is used to sort keys by (seqnum,kind) before emitting. 86 flushBuf keysBySeqNumKind 87 // flushedKey is the key that fragments have been flushed up to. Any 88 // additional spans added to the fragmenter must have a start key >= 89 // flushedKey. A nil value indicates flushedKey has not been set. 90 flushedKey []byte 91 finished bool 92 } 93 94 func (f *Fragmenter) checkInvariants(buf []Span) { 95 for i := 1; i < len(buf); i++ { 96 if f.Cmp(buf[i].Start, buf[i].End) >= 0 { 97 panic(fmt.Sprintf("pebble: empty pending span invariant violated: %s", buf[i])) 98 } 99 if f.Cmp(buf[i-1].Start, buf[i].Start) != 0 { 100 panic(fmt.Sprintf("pebble: pending span invariant violated: %s %s", 101 f.Format(buf[i-1].Start), f.Format(buf[i].Start))) 102 } 103 } 104 } 105 106 // Add adds a span to the fragmenter. Spans may overlap and the 107 // fragmenter will internally split them. The spans must be presented in 108 // increasing start key order. That is, Add must be called with a series 109 // of spans like: 110 // 111 // a---e 112 // c---g 113 // c-----i 114 // j---n 115 // j-l 116 // 117 // We need to fragment the spans at overlap points. In the above 118 // example, we'd create: 119 // 120 // a-c-e 121 // c-e-g 122 // c-e-g-i 123 // j-l-n 124 // j-l 125 // 126 // The fragments need to be output sorted by start key, and for equal start 127 // keys, sorted by descending sequence number. This last part requires a mild 128 // bit of care as the fragments are not created in descending sequence number 129 // order. 130 // 131 // Once a start key has been seen, we know that we'll never see a smaller 132 // start key and can thus flush all of the fragments that lie before that 133 // start key. 134 // 135 // Walking through the example above, we start with: 136 // 137 // a---e 138 // 139 // Next we add [c,g) resulting in: 140 // 141 // a-c-e 142 // c---g 143 // 144 // The fragment [a,c) is flushed leaving the pending spans as: 145 // 146 // c-e 147 // c---g 148 // 149 // The next span is [c,i): 150 // 151 // c-e 152 // c---g 153 // c-----i 154 // 155 // No fragments are flushed. The next span is [j,n): 156 // 157 // c-e 158 // c---g 159 // c-----i 160 // j---n 161 // 162 // The fragments [c,e), [c,g) and [c,i) are flushed. We sort these fragments 163 // by their end key, then split the fragments on the end keys: 164 // 165 // c-e 166 // c-e-g 167 // c-e---i 168 // 169 // The [c,e) fragments all get flushed leaving: 170 // 171 // e-g 172 // e---i 173 // 174 // This process continues until there are no more fragments to flush. 175 // 176 // WARNING: the slices backing Start, End, Keys, Key.Suffix and Key.Value are 177 // all retained after this method returns and should not be modified. This is 178 // safe for spans that are added from a memtable or batch. It is partially 179 // unsafe for a span read from an sstable. Specifically, the Keys slice of a 180 // Span returned during sstable iteration is only valid until the next iterator 181 // operation. The stability of the user keys depend on whether the block is 182 // prefix compressed, and in practice Pebble never prefix compresses range 183 // deletion and range key blocks, so these keys are stable. Because of this key 184 // stability, typically callers only need to perform a shallow clone of the Span 185 // before Add-ing it to the fragmenter. 186 // 187 // Add requires the provided span's keys are sorted in Trailer descending order. 188 func (f *Fragmenter) Add(s Span) { 189 if f.finished { 190 panic("pebble: span fragmenter already finished") 191 } else if s.KeysOrder != ByTrailerDesc { 192 panic("pebble: span keys unexpectedly not in trailer descending order") 193 } 194 if f.flushedKey != nil { 195 switch c := f.Cmp(s.Start, f.flushedKey); { 196 case c < 0: 197 panic(fmt.Sprintf("pebble: start key (%s) < flushed key (%s)", 198 f.Format(s.Start), f.Format(f.flushedKey))) 199 } 200 } 201 if f.Cmp(s.Start, s.End) >= 0 { 202 // An empty span, we can ignore it. 203 return 204 } 205 if invariants.RaceEnabled { 206 f.checkInvariants(f.pending) 207 defer func() { f.checkInvariants(f.pending) }() 208 } 209 210 if len(f.pending) > 0 { 211 // Since all of the pending spans have the same start key, we only need 212 // to compare against the first one. 213 switch c := f.Cmp(f.pending[0].Start, s.Start); { 214 case c > 0: 215 panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s", 216 f.Format(f.pending[0].Start), f.Format(s.Start))) 217 case c == 0: 218 // The new span has the same start key as the existing pending 219 // spans. Add it to the pending buffer. 220 f.pending = append(f.pending, s) 221 return 222 } 223 224 // At this point we know that the new start key is greater than the pending 225 // spans start keys. 226 f.truncateAndFlush(s.Start) 227 } 228 229 f.pending = append(f.pending, s) 230 } 231 232 // Cover is returned by Framenter.Covers and describes a span's relationship to 233 // a key at a particular snapshot. 234 type Cover int8 235 236 const ( 237 // NoCover indicates the tested key does not fall within the span's bounds, 238 // or the span contains no keys with sequence numbers higher than the key's. 239 NoCover Cover = iota 240 // CoversInvisibly indicates the tested key does fall within the span's 241 // bounds and the span contains at least one key with a higher sequence 242 // number, but none visible at the provided snapshot. 243 CoversInvisibly 244 // CoversVisibly indicates the tested key does fall within the span's 245 // bounds, and the span constains at least one key with a sequence number 246 // higher than the key's sequence number that is visible at the provided 247 // snapshot. 248 CoversVisibly 249 ) 250 251 // Covers returns an enum indicating whether the specified key is covered by one 252 // of the pending keys. The provided key must be consistent with the ordering of 253 // the spans. That is, it is invalid to specify a key here that is out of order 254 // with the span start keys passed to Add. 255 func (f *Fragmenter) Covers(key base.InternalKey, snapshot uint64) Cover { 256 if f.finished { 257 panic("pebble: span fragmenter already finished") 258 } 259 if len(f.pending) == 0 { 260 return NoCover 261 } 262 263 if f.Cmp(f.pending[0].Start, key.UserKey) > 0 { 264 panic(fmt.Sprintf("pebble: keys must be in order: %s > %s", 265 f.Format(f.pending[0].Start), key.Pretty(f.Format))) 266 } 267 268 cover := NoCover 269 seqNum := key.SeqNum() 270 for _, s := range f.pending { 271 if f.Cmp(key.UserKey, s.End) < 0 { 272 // NB: A range deletion tombstone does not delete a point operation 273 // at the same sequence number, and broadly a span is not considered 274 // to cover a point operation at the same sequence number. 275 276 for i := range s.Keys { 277 if kseq := s.Keys[i].SeqNum(); kseq > seqNum { 278 // This key from the span has a higher sequence number than 279 // `key`. It covers `key`, although the span's key might not 280 // be visible if its snapshot is too high. 281 // 282 // Batch keys are always be visible. 283 if kseq < snapshot || kseq&base.InternalKeySeqNumBatch != 0 { 284 return CoversVisibly 285 } 286 // s.Keys[i] is not visible. 287 cover = CoversInvisibly 288 } 289 } 290 } 291 } 292 return cover 293 } 294 295 // Empty returns true if all fragments added so far have finished flushing. 296 func (f *Fragmenter) Empty() bool { 297 return f.finished || len(f.pending) == 0 298 } 299 300 // TruncateAndFlushTo flushes all of the fragments with a start key <= key, 301 // truncating spans to the specified end key. Used during compaction to force 302 // emitting of spans which straddle an sstable boundary. Consider 303 // the scenario: 304 // 305 // a---------k#10 306 // f#8 307 // f#7 308 // 309 // Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will 310 // flush this span: 311 // 312 // a-------g#10 313 // f#8 314 // f#7 315 // 316 // And leave this one in f.pending: 317 // 318 // g----k#10 319 // 320 // WARNING: The fragmenter could hold on to the specified end key. Ensure it's 321 // a safe byte slice that could outlast the current sstable output, and one 322 // that will never be modified. 323 func (f *Fragmenter) TruncateAndFlushTo(key []byte) { 324 if f.finished { 325 panic("pebble: span fragmenter already finished") 326 } 327 if f.flushedKey != nil { 328 switch c := f.Cmp(key, f.flushedKey); { 329 case c < 0: 330 panic(fmt.Sprintf("pebble: start key (%s) < flushed key (%s)", 331 f.Format(key), f.Format(f.flushedKey))) 332 } 333 } 334 if invariants.RaceEnabled { 335 f.checkInvariants(f.pending) 336 defer func() { f.checkInvariants(f.pending) }() 337 } 338 if len(f.pending) > 0 { 339 // Since all of the pending spans have the same start key, we only need 340 // to compare against the first one. 341 switch c := f.Cmp(f.pending[0].Start, key); { 342 case c > 0: 343 panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s", 344 f.Format(f.pending[0].Start), f.Format(key))) 345 case c == 0: 346 return 347 } 348 } 349 f.truncateAndFlush(key) 350 } 351 352 // Start returns the start key of the first span in the pending buffer, or nil 353 // if there are no pending spans. The start key of all pending spans is the same 354 // as that of the first one. 355 func (f *Fragmenter) Start() []byte { 356 if len(f.pending) > 0 { 357 return f.pending[0].Start 358 } 359 return nil 360 } 361 362 // Flushes all pending spans up to key (exclusive). 363 // 364 // WARNING: The specified key is stored without making a copy, so all callers 365 // must ensure it is safe. 366 func (f *Fragmenter) truncateAndFlush(key []byte) { 367 f.flushedKey = append(f.flushedKey[:0], key...) 368 done := f.doneBuf[:0] 369 pending := f.pending 370 f.pending = f.pending[:0] 371 372 // pending and f.pending share the same underlying storage. As we iterate 373 // over pending we append to f.pending, but only one entry is appended in 374 // each iteration, after we have read the entry being overwritten. 375 for _, s := range pending { 376 if f.Cmp(key, s.End) < 0 { 377 // s: a--+--e 378 // new: c------ 379 if f.Cmp(s.Start, key) < 0 { 380 done = append(done, Span{ 381 Start: s.Start, 382 End: key, 383 Keys: s.Keys, 384 }) 385 } 386 f.pending = append(f.pending, Span{ 387 Start: key, 388 End: s.End, 389 Keys: s.Keys, 390 }) 391 } else { 392 // s: a-----e 393 // new: e---- 394 done = append(done, s) 395 } 396 } 397 398 f.doneBuf = done[:0] 399 f.flush(done, nil) 400 } 401 402 // flush a group of range spans to the block. The spans are required to all have 403 // the same start key. We flush all span fragments until startKey > lastKey. If 404 // lastKey is nil, all span fragments are flushed. The specification of a 405 // non-nil lastKey occurs for range deletion tombstones during compaction where 406 // we want to flush (but not truncate) all range tombstones that start at or 407 // before the first key in the next sstable. Consider: 408 // 409 // a---e#10 410 // a------h#9 411 // 412 // If a compaction splits the sstables at key c we want the first sstable to 413 // contain the tombstones [a,e)#10 and [a,e)#9. Fragmentation would naturally 414 // produce a tombstone [e,h)#9, but we don't need to output that tombstone to 415 // the first sstable. 416 func (f *Fragmenter) flush(buf []Span, lastKey []byte) { 417 if invariants.RaceEnabled { 418 f.checkInvariants(buf) 419 } 420 421 // Sort the spans by end key. This will allow us to walk over the spans and 422 // easily determine the next split point (the smallest end-key). 423 f.sortBuf.cmp = f.Cmp 424 f.sortBuf.buf = buf 425 sort.Sort(&f.sortBuf) 426 427 // Loop over the spans, splitting by end key. 428 for len(buf) > 0 { 429 // A prefix of spans will end at split. remove represents the count of 430 // that prefix. 431 remove := 1 432 split := buf[0].End 433 f.flushBuf = append(f.flushBuf[:0], buf[0].Keys...) 434 435 for i := 1; i < len(buf); i++ { 436 if f.Cmp(split, buf[i].End) == 0 { 437 remove++ 438 } 439 f.flushBuf = append(f.flushBuf, buf[i].Keys...) 440 } 441 442 sort.Sort(&f.flushBuf) 443 444 f.Emit(Span{ 445 Start: buf[0].Start, 446 End: split, 447 // Copy the sorted keys to a new slice. 448 // 449 // This allocation is an unfortunate side effect of the Fragmenter and 450 // the expectation that the spans it produces are available in-memory 451 // indefinitely. 452 // 453 // Eventually, we should be able to replace the fragmenter with the 454 // keyspan.MergingIter which will perform just-in-time 455 // fragmentation, and only guaranteeing the memory lifetime for the 456 // current span. The MergingIter fragments while only needing to 457 // access one Span per level. It only accesses the Span at the 458 // current position for each level. During compactions, we can write 459 // these spans to sstables without retaining previous Spans. 460 Keys: append([]Key(nil), f.flushBuf...), 461 }) 462 463 if lastKey != nil && f.Cmp(split, lastKey) > 0 { 464 break 465 } 466 467 // Adjust the start key for every remaining span. 468 buf = buf[remove:] 469 for i := range buf { 470 buf[i].Start = split 471 } 472 } 473 } 474 475 // Finish flushes any remaining fragments to the output. It is an error to call 476 // this if any other spans will be added. 477 func (f *Fragmenter) Finish() { 478 if f.finished { 479 panic("pebble: span fragmenter already finished") 480 } 481 f.flush(f.pending, nil) 482 f.finished = true 483 }