github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/keyspan/defragment.go (about) 1 // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package keyspan 6 7 import ( 8 "bytes" 9 10 "github.com/cockroachdb/pebble/internal/base" 11 "github.com/cockroachdb/pebble/internal/bytealloc" 12 "github.com/cockroachdb/pebble/internal/invariants" 13 ) 14 15 // bufferReuseMaxCapacity is the maximum capacity of a DefragmentingIter buffer 16 // that DefragmentingIter will reuse. Buffers larger than this will be 17 // discarded and reallocated as necessary. 18 const bufferReuseMaxCapacity = 10 << 10 // 10 KB 19 20 // keysReuseMaxCapacity is the maximum capacity of a []keyspan.Key buffer that 21 // DefragmentingIter will reuse. Buffers larger than this will be discarded and 22 // reallocated as necessary. 23 const keysReuseMaxCapacity = 100 24 25 // DefragmentMethod configures the defragmentation performed by the 26 // DefragmentingIter. 27 type DefragmentMethod interface { 28 // ShouldDefragment takes two abutting spans and returns whether the two 29 // spans should be combined into a single, defragmented Span. 30 ShouldDefragment(equal base.Equal, left, right *Span) bool 31 } 32 33 // The DefragmentMethodFunc type is an adapter to allow the use of ordinary 34 // functions as DefragmentMethods. If f is a function with the appropriate 35 // signature, DefragmentMethodFunc(f) is a DefragmentMethod that calls f. 36 type DefragmentMethodFunc func(equal base.Equal, left, right *Span) bool 37 38 // ShouldDefragment calls f(equal, left, right). 39 func (f DefragmentMethodFunc) ShouldDefragment(equal base.Equal, left, right *Span) bool { 40 return f(equal, left, right) 41 } 42 43 // DefragmentInternal configures a DefragmentingIter to defragment spans 44 // only if they have identical keys. It requires spans' keys to be sorted in 45 // trailer descending order. 46 // 47 // This defragmenting method is intended for use in compactions that may see 48 // internal range keys fragments that may now be joined, because the state that 49 // required their fragmentation has been dropped. 50 var DefragmentInternal DefragmentMethod = DefragmentMethodFunc(func(equal base.Equal, a, b *Span) bool { 51 if a.KeysOrder != ByTrailerDesc || b.KeysOrder != ByTrailerDesc { 52 panic("pebble: span keys unexpectedly not in trailer descending order") 53 } 54 if len(a.Keys) != len(b.Keys) { 55 return false 56 } 57 for i := range a.Keys { 58 if a.Keys[i].Trailer != b.Keys[i].Trailer { 59 return false 60 } 61 if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) { 62 return false 63 } 64 if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) { 65 return false 66 } 67 } 68 return true 69 }) 70 71 // DefragmentReducer merges the current and next Key slices, returning a new Key 72 // slice. 73 // 74 // Implementations should modify and return `cur` to save on allocations, or 75 // consider allocating a new slice, as the `cur` slice may be retained by the 76 // DefragmentingIter and mutated. The `next` slice must not be mutated. 77 // 78 // The incoming slices are sorted by (SeqNum, Kind) descending. The output slice 79 // must also have this sort order. 80 type DefragmentReducer func(cur, next []Key) []Key 81 82 // StaticDefragmentReducer is a no-op DefragmentReducer that simply returns the 83 // current key slice, effectively retaining the first set of keys encountered 84 // for a defragmented span. 85 // 86 // This reducer can be used, for example, when the set of Keys for each Span 87 // being reduced is not expected to change, and therefore the keys from the 88 // first span encountered can be used without considering keys in subsequent 89 // spans. 90 var StaticDefragmentReducer DefragmentReducer = func(cur, _ []Key) []Key { 91 return cur 92 } 93 94 // iterPos is an enum indicating the position of the defragmenting iter's 95 // wrapped iter. The defragmenting iter must look ahead or behind when 96 // defragmenting forward or backwards respectively, and this enum records that 97 // current position. 98 type iterPos int8 99 100 const ( 101 iterPosPrev iterPos = -1 102 iterPosCurr iterPos = 0 103 iterPosNext iterPos = +1 104 ) 105 106 // DefragmentingIter wraps a key span iterator, defragmenting physical 107 // fragmentation during iteration. 108 // 109 // During flushes and compactions, keys applied over a span may be split at 110 // sstable boundaries. This fragmentation can produce internal key bounds that 111 // do not match any of the bounds ever supplied to a user operation. This 112 // physical fragmentation is necessary to avoid excessively wide sstables. 113 // 114 // The defragmenting iterator undoes this physical fragmentation, joining spans 115 // with abutting bounds and equal state. The defragmenting iterator takes a 116 // DefragmentMethod to determine what is "equal state" for a span. The 117 // DefragmentMethod is a function type, allowing arbitrary comparisons between 118 // Span keys. 119 // 120 // Seeking (SeekGE, SeekLT) poses an obstacle to defragmentation. A seek may 121 // land on a physical fragment in the middle of several fragments that must be 122 // defragmented. A seek that lands in a fragment straddling the seek key must 123 // first degfragment in the opposite direction of iteration to find the 124 // beginning of the defragmented span, and then defragments in the iteration 125 // direction, ensuring it's found a whole defragmented span. 126 type DefragmentingIter struct { 127 // DefragmentingBuffers holds buffers used for copying iterator state. 128 *DefragmentingBuffers 129 comparer *base.Comparer 130 equal base.Equal 131 iter FragmentIterator 132 iterSpan *Span 133 iterPos iterPos 134 135 // curr holds the span at the current iterator position. 136 curr Span 137 138 // method is a comparison function for two spans. method is called when two 139 // spans are abutting to determine whether they may be defragmented. 140 // method does not itself check for adjacency for the two spans. 141 method DefragmentMethod 142 143 // reduce is the reducer function used to collect Keys across all spans that 144 // constitute a defragmented span. 145 reduce DefragmentReducer 146 } 147 148 // DefragmentingBuffers holds buffers used for copying iterator state. 149 type DefragmentingBuffers struct { 150 // currBuf is a buffer for use when copying user keys for curr. currBuf is 151 // cleared between positioning methods. 152 currBuf bytealloc.A 153 // keysBuf is a buffer for use when copying Keys for DefragmentingIter.curr. 154 keysBuf []Key 155 // keyBuf is a buffer specifically for the defragmented start key when 156 // defragmenting backwards or the defragmented end key when defragmenting 157 // forwards. These bounds are overwritten repeatedly during defragmentation, 158 // and the defragmentation routines overwrite keyBuf repeatedly to store 159 // these extended bounds. 160 keyBuf []byte 161 } 162 163 // PrepareForReuse discards any excessively large buffers. 164 func (bufs *DefragmentingBuffers) PrepareForReuse() { 165 if cap(bufs.currBuf) > bufferReuseMaxCapacity { 166 bufs.currBuf = nil 167 } 168 if cap(bufs.keyBuf) > bufferReuseMaxCapacity { 169 bufs.keyBuf = nil 170 } 171 if cap(bufs.keysBuf) > keysReuseMaxCapacity { 172 bufs.keysBuf = nil 173 } 174 } 175 176 // Assert that *DefragmentingIter implements the FragmentIterator interface. 177 var _ FragmentIterator = (*DefragmentingIter)(nil) 178 179 // Init initializes the defragmenting iter using the provided defragment 180 // method. 181 func (i *DefragmentingIter) Init( 182 comparer *base.Comparer, 183 iter FragmentIterator, 184 equal DefragmentMethod, 185 reducer DefragmentReducer, 186 bufs *DefragmentingBuffers, 187 ) { 188 *i = DefragmentingIter{ 189 DefragmentingBuffers: bufs, 190 comparer: comparer, 191 equal: comparer.Equal, 192 iter: iter, 193 method: equal, 194 reduce: reducer, 195 } 196 } 197 198 // Error returns any accumulated error. 199 func (i *DefragmentingIter) Error() error { 200 return i.iter.Error() 201 } 202 203 // Close closes the underlying iterators. 204 func (i *DefragmentingIter) Close() error { 205 return i.iter.Close() 206 } 207 208 // SeekGE moves the iterator to the first span covering a key greater than or 209 // equal to the given key. This is equivalent to seeking to the first span with 210 // an end key greater than the given key. 211 func (i *DefragmentingIter) SeekGE(key []byte) *Span { 212 i.iterSpan = i.iter.SeekGE(key) 213 if i.iterSpan == nil { 214 i.iterPos = iterPosCurr 215 return nil 216 } else if i.iterSpan.Empty() { 217 i.iterPos = iterPosCurr 218 return i.iterSpan 219 } 220 // If the span starts strictly after key, we know there mustn't be an 221 // earlier span that ends at i.iterSpan.Start, otherwise i.iter would've 222 // returned that span instead. 223 if i.comparer.Compare(i.iterSpan.Start, key) > 0 { 224 return i.defragmentForward() 225 } 226 227 // The span we landed on has a Start bound ≤ key. There may be additional 228 // fragments before this span. Defragment backward to find the start of the 229 // defragmented span. 230 i.defragmentBackward() 231 232 // Defragmenting backward may have stopped because it encountered an error. 233 // If so, we must not continue so that i.iter.Error() (and thus i.Error()) 234 // yields the error. 235 if i.iterSpan == nil && i.iter.Error() != nil { 236 return nil 237 } 238 239 if i.iterPos == iterPosPrev { 240 // Next once back onto the span. 241 i.iterSpan = i.iter.Next() 242 } 243 // Defragment the full span from its start. 244 return i.defragmentForward() 245 } 246 247 // SeekLT moves the iterator to the last span covering a key less than the 248 // given key. This is equivalent to seeking to the last span with a start 249 // key less than the given key. 250 func (i *DefragmentingIter) SeekLT(key []byte) *Span { 251 i.iterSpan = i.iter.SeekLT(key) 252 if i.iterSpan == nil { 253 i.iterPos = iterPosCurr 254 return nil 255 } else if i.iterSpan.Empty() { 256 i.iterPos = iterPosCurr 257 return i.iterSpan 258 } 259 // If the span ends strictly before key, we know there mustn't be a later 260 // span that starts at i.iterSpan.End, otherwise i.iter would've returned 261 // that span instead. 262 if i.comparer.Compare(i.iterSpan.End, key) < 0 { 263 return i.defragmentBackward() 264 } 265 266 // The span we landed on has a End bound ≥ key. There may be additional 267 // fragments after this span. Defragment forward to find the end of the 268 // defragmented span. 269 i.defragmentForward() 270 271 // Defragmenting forward may have stopped because it encountered an error. 272 // If so, we must not continue so that i.iter.Error() (and thus i.Error()) 273 // yields the error. 274 if i.iterSpan == nil && i.iter.Error() != nil { 275 return nil 276 } 277 278 if i.iterPos == iterPosNext { 279 // Prev once back onto the span. 280 i.iterSpan = i.iter.Prev() 281 } 282 // Defragment the full span from its end. 283 return i.defragmentBackward() 284 } 285 286 // First seeks the iterator to the first span and returns it. 287 func (i *DefragmentingIter) First() *Span { 288 i.iterSpan = i.iter.First() 289 if i.iterSpan == nil { 290 i.iterPos = iterPosCurr 291 return nil 292 } 293 return i.defragmentForward() 294 } 295 296 // Last seeks the iterator to the last span and returns it. 297 func (i *DefragmentingIter) Last() *Span { 298 i.iterSpan = i.iter.Last() 299 if i.iterSpan == nil { 300 i.iterPos = iterPosCurr 301 return nil 302 } 303 return i.defragmentBackward() 304 } 305 306 // Next advances to the next span and returns it. 307 func (i *DefragmentingIter) Next() *Span { 308 switch i.iterPos { 309 case iterPosPrev: 310 // Switching directions; The iterator is currently positioned over the 311 // last span of the previous set of fragments. In the below diagram, 312 // the iterator is positioned over the last span that contributes to 313 // the defragmented x position. We want to be positioned over the first 314 // span that contributes to the z position. 315 // 316 // x x x y y y z z z 317 // ^ ^ 318 // old new 319 // 320 // Next once to move onto y, defragment forward to land on the first z 321 // position. 322 i.iterSpan = i.iter.Next() 323 if invariants.Enabled && i.iterSpan == nil && i.iter.Error() == nil { 324 panic("pebble: invariant violation: no next span while switching directions") 325 } 326 // We're now positioned on the first span that was defragmented into the 327 // current iterator position. Skip over the rest of the current iterator 328 // position's constitutent fragments. In the above example, this would 329 // land on the first 'z'. 330 i.defragmentForward() 331 if i.iterSpan == nil { 332 i.iterPos = iterPosCurr 333 return nil 334 } 335 336 // Now that we're positioned over the first of the next set of 337 // fragments, defragment forward. 338 return i.defragmentForward() 339 case iterPosCurr: 340 // iterPosCurr is only used when the iter is exhausted or when the iterator 341 // is at an empty span. 342 if invariants.Enabled && i.iterSpan != nil && !i.iterSpan.Empty() { 343 panic("pebble: invariant violation: iterPosCurr with valid iterSpan") 344 } 345 346 i.iterSpan = i.iter.Next() 347 if i.iterSpan == nil { 348 return nil 349 } 350 return i.defragmentForward() 351 case iterPosNext: 352 // Already at the next span. 353 if i.iterSpan == nil { 354 i.iterPos = iterPosCurr 355 return nil 356 } 357 return i.defragmentForward() 358 default: 359 panic("unreachable") 360 } 361 } 362 363 // Prev steps back to the previous span and returns it. 364 func (i *DefragmentingIter) Prev() *Span { 365 switch i.iterPos { 366 case iterPosPrev: 367 // Already at the previous span. 368 if i.iterSpan == nil { 369 i.iterPos = iterPosCurr 370 return nil 371 } 372 return i.defragmentBackward() 373 case iterPosCurr: 374 // iterPosCurr is only used when the iter is exhausted or when the iterator 375 // is at an empty span. 376 if invariants.Enabled && i.iterSpan != nil && !i.iterSpan.Empty() { 377 panic("pebble: invariant violation: iterPosCurr with valid iterSpan") 378 } 379 380 i.iterSpan = i.iter.Prev() 381 if i.iterSpan == nil { 382 return nil 383 } 384 return i.defragmentBackward() 385 case iterPosNext: 386 // Switching directions; The iterator is currently positioned over the 387 // first fragment of the next set of fragments. In the below diagram, 388 // the iterator is positioned over the first span that contributes to 389 // the defragmented z position. We want to be positioned over the last 390 // span that contributes to the x position. 391 // 392 // x x x y y y z z z 393 // ^ ^ 394 // new old 395 // 396 // Prev once to move onto y, defragment backward to land on the last x 397 // position. 398 i.iterSpan = i.iter.Prev() 399 if invariants.Enabled && i.iterSpan == nil && i.iter.Error() == nil { 400 panic("pebble: invariant violation: no previous span while switching directions") 401 } 402 // We're now positioned on the last span that was defragmented into the 403 // current iterator position. Skip over the rest of the current iterator 404 // position's constitutent fragments. In the above example, this would 405 // land on the last 'x'. 406 i.defragmentBackward() 407 408 // Now that we're positioned over the last of the prev set of 409 // fragments, defragment backward. 410 if i.iterSpan == nil { 411 i.iterPos = iterPosCurr 412 return nil 413 } 414 return i.defragmentBackward() 415 default: 416 panic("unreachable") 417 } 418 } 419 420 // checkEqual checks the two spans for logical equivalence. It uses the passed-in 421 // DefragmentMethod and ensures both spans are NOT empty; not defragmenting empty 422 // spans is an optimization that lets us load fewer sstable blocks. 423 func (i *DefragmentingIter) checkEqual(left, right *Span) bool { 424 return (!left.Empty() && !right.Empty()) && i.method.ShouldDefragment(i.equal, i.iterSpan, &i.curr) 425 } 426 427 // defragmentForward defragments spans in the forward direction, starting from 428 // i.iter's current position. The span at the current position must be non-nil, 429 // but may be Empty(). 430 func (i *DefragmentingIter) defragmentForward() *Span { 431 if i.iterSpan.Empty() { 432 // An empty span will never be equal to another span; see checkEqual for 433 // why. To avoid loading non-empty range keys further ahead by calling Next, 434 // return early. 435 i.iterPos = iterPosCurr 436 return i.iterSpan 437 } 438 i.saveCurrent() 439 440 i.iterPos = iterPosNext 441 i.iterSpan = i.iter.Next() 442 for i.iterSpan != nil { 443 if !i.equal(i.curr.End, i.iterSpan.Start) { 444 // Not a continuation. 445 break 446 } 447 if !i.checkEqual(i.iterSpan, &i.curr) { 448 // Not a continuation. 449 break 450 } 451 i.keyBuf = append(i.keyBuf[:0], i.iterSpan.End...) 452 i.curr.End = i.keyBuf 453 i.keysBuf = i.reduce(i.keysBuf, i.iterSpan.Keys) 454 i.iterSpan = i.iter.Next() 455 } 456 // i.iterSpan == nil 457 // 458 // The inner iterator may return nil when it encounters an error. If there 459 // was an error, we don't know whether there is another span we should 460 // defragment or not. Return nil so that the caller knows they should check 461 // Error(). 462 if i.iter.Error() != nil { 463 return nil 464 } 465 i.curr.Keys = i.keysBuf 466 return &i.curr 467 } 468 469 // defragmentBackward defragments spans in the backward direction, starting from 470 // i.iter's current position. The span at the current position must be non-nil, 471 // but may be Empty(). 472 func (i *DefragmentingIter) defragmentBackward() *Span { 473 if i.iterSpan.Empty() { 474 // An empty span will never be equal to another span; see checkEqual for 475 // why. To avoid loading non-empty range keys further ahead by calling Next, 476 // return early. 477 i.iterPos = iterPosCurr 478 return i.iterSpan 479 } 480 i.saveCurrent() 481 482 i.iterPos = iterPosPrev 483 i.iterSpan = i.iter.Prev() 484 for i.iterSpan != nil { 485 if !i.equal(i.curr.Start, i.iterSpan.End) { 486 // Not a continuation. 487 break 488 } 489 if !i.checkEqual(i.iterSpan, &i.curr) { 490 // Not a continuation. 491 break 492 } 493 i.keyBuf = append(i.keyBuf[:0], i.iterSpan.Start...) 494 i.curr.Start = i.keyBuf 495 i.keysBuf = i.reduce(i.keysBuf, i.iterSpan.Keys) 496 i.iterSpan = i.iter.Prev() 497 } 498 // i.iterSpan == nil 499 // 500 // The inner iterator may return nil when it encounters an error. If there 501 // was an error, we don't know whether there is another span we should 502 // defragment or not. Return nil so that the caller knows they should check 503 // Error(). 504 if i.iter.Error() != nil { 505 return nil 506 } 507 i.curr.Keys = i.keysBuf 508 return &i.curr 509 } 510 511 func (i *DefragmentingIter) saveCurrent() { 512 i.currBuf.Reset() 513 i.keysBuf = i.keysBuf[:0] 514 i.keyBuf = i.keyBuf[:0] 515 if i.iterSpan == nil { 516 return 517 } 518 i.curr = Span{ 519 Start: i.saveBytes(i.iterSpan.Start), 520 End: i.saveBytes(i.iterSpan.End), 521 KeysOrder: i.iterSpan.KeysOrder, 522 } 523 for j := range i.iterSpan.Keys { 524 i.keysBuf = append(i.keysBuf, Key{ 525 Trailer: i.iterSpan.Keys[j].Trailer, 526 Suffix: i.saveBytes(i.iterSpan.Keys[j].Suffix), 527 Value: i.saveBytes(i.iterSpan.Keys[j].Value), 528 }) 529 } 530 i.curr.Keys = i.keysBuf 531 } 532 533 func (i *DefragmentingIter) saveBytes(b []byte) []byte { 534 if b == nil { 535 return nil 536 } 537 i.currBuf, b = i.currBuf.Copy(b) 538 return b 539 }