// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package keyspan

import (
	"fmt"
	"sort"

	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/invariants"
)

// spansByStartKey sorts spans in ascending order of their Start key using the
// configured comparator. It implements sort.Interface.
type spansByStartKey struct {
	cmp base.Compare
	buf []Span
}

func (v *spansByStartKey) Len() int { return len(v.buf) }
func (v *spansByStartKey) Less(i, j int) bool {
	return v.cmp(v.buf[i].Start, v.buf[j].Start) < 0
}
func (v *spansByStartKey) Swap(i, j int) {
	v.buf[i], v.buf[j] = v.buf[j], v.buf[i]
}

// spansByEndKey sorts spans in ascending order of their End key using the
// configured comparator. It implements sort.Interface.
type spansByEndKey struct {
	cmp base.Compare
	buf []Span
}

func (v *spansByEndKey) Len() int { return len(v.buf) }
func (v *spansByEndKey) Less(i, j int) bool {
	return v.cmp(v.buf[i].End, v.buf[j].End) < 0
}
func (v *spansByEndKey) Swap(i, j int) {
	v.buf[i], v.buf[j] = v.buf[j], v.buf[i]
}

// keysBySeqNumKind sorts keys by sequence number in descending order. If two
// keys have equal sequence numbers, they're compared by key kind in
// descending order. A single Trailer comparison captures both orderings. This
// ordering matches the ordering of base.InternalCompare among keys with
// matching user keys.
type keysBySeqNumKind []Key

func (v *keysBySeqNumKind) Len() int           { return len(*v) }
func (v *keysBySeqNumKind) Less(i, j int) bool { return (*v)[i].Trailer > (*v)[j].Trailer }
func (v *keysBySeqNumKind) Swap(i, j int)      { (*v)[i], (*v)[j] = (*v)[j], (*v)[i] }

// Sort sorts the spans by start key. This is the ordering required by the
// Fragmenter. Usually spans are naturally sorted by their start key, but that
// isn't true for range deletion tombstones in the legacy range-del-v1 block
// format.
func Sort(cmp base.Compare, spans []Span) {
	sorter := spansByStartKey{
		cmp: cmp,
		buf: spans,
	}
	sort.Sort(&sorter)
}

// Fragmenter fragments a set of spans such that overlapping spans are
// split at their overlap points. The fragmented spans are output to the
// supplied Output function.
type Fragmenter struct {
	// Cmp compares user keys.
	Cmp base.Compare
	// Format formats user keys; it is used only to construct panic messages.
	Format base.FormatKey
	// Emit is called to emit a fragmented span and its keys. Every key defined
	// within the emitted Span applies to the entirety of the Span's key span.
	// Keys are ordered in decreasing order of their sequence numbers, and if
	// equal, decreasing order of key kind.
	Emit func(Span)
	// pending contains the list of pending fragments that have not been
	// flushed to the block writer. Note that the spans have not been
	// fragmented on the end keys yet. That happens as the spans are
	// flushed. All pending spans have the same Start.
	pending []Span
	// doneBuf is used to buffer completed span fragments when flushing to a
	// specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to
	// allow reuse.
	doneBuf []Span
	// sortBuf is used to sort fragments by end key when flushing.
	sortBuf spansByEndKey
	// flushBuf is used to sort keys by (seqnum,kind) before emitting.
	flushBuf keysBySeqNumKind
	// flushedKey is the key that fragments have been flushed up to. Any
	// additional spans added to the fragmenter must have a start key >=
	// flushedKey. A nil value indicates flushedKey has not been set.
	flushedKey []byte
	// finished is set by Finish; any subsequent Add, Covers,
	// TruncateAndFlushTo or Finish call panics.
	finished bool
}

// checkInvariants panics unless every span in buf is non-empty (Start < End)
// and all spans share the same start key. It is only invoked when
// invariants.RaceEnabled is true.
//
// NOTE(review): the loop starts at index 1, so for a single-element buf the
// emptiness of buf[0] is never checked — confirm callers cannot produce an
// empty span at index 0.
func (f *Fragmenter) checkInvariants(buf []Span) {
	for i := 1; i < len(buf); i++ {
		if f.Cmp(buf[i].Start, buf[i].End) >= 0 {
			panic(fmt.Sprintf("bitalostable: empty pending span invariant violated: %s", buf[i]))
		}
		if f.Cmp(buf[i-1].Start, buf[i].Start) != 0 {
			panic(fmt.Sprintf("bitalostable: pending span invariant violated: %s %s",
				f.Format(buf[i-1].Start), f.Format(buf[i].Start)))
		}
	}
}

// Add adds a span to the fragmenter. Spans may overlap and the
// fragmenter will internally split them. The spans must be presented in
// increasing start key order. That is, Add must be called with a series
// of spans like:
//
//	a---e
//	  c---g
//	  c-----i
//	         j---n
//	         j-l
//
// We need to fragment the spans at overlap points. In the above
// example, we'd create:
//
//	a-c-e
//	  c-e-g
//	  c-e-g-i
//	         j-l-n
//	         j-l
//
// The fragments need to be output sorted by start key, and for equal start
// keys, sorted by descending sequence number. This last part requires a mild
// bit of care as the fragments are not created in descending sequence number
// order.
//
// Once a start key has been seen, we know that we'll never see a smaller
// start key and can thus flush all of the fragments that lie before that
// start key.
//
// Walking through the example above, we start with:
//
//	a---e
//
// Next we add [c,g) resulting in:
//
//	a-c-e
//	  c---g
//
// The fragment [a,c) is flushed leaving the pending spans as:
//
//	c-e
//	c---g
//
// The next span is [c,i):
//
//	c-e
//	c---g
//	c-----i
//
// No fragments are flushed. The next span is [j,n):
//
//	c-e
//	c---g
//	c-----i
//	       j---n
//
// The fragments [c,e), [c,g) and [c,i) are flushed. We sort these fragments
// by their end key, then split the fragments on the end keys:
//
//	c-e
//	c-e-g
//	c-e---i
//
// The [c,e) fragments all get flushed leaving:
//
//	e-g
//	e---i
//
// This process continues until there are no more fragments to flush.
//
// WARNING: the slices backing Start, End, Keys, Key.Suffix and Key.Value are
// all retained after this method returns and should not be modified. This is
// safe for spans that are added from a memtable or batch. It is partially
// unsafe for a span read from an sstable. Specifically, the Keys slice of a
// Span returned during sstable iteration is only valid until the next iterator
// operation. The stability of the user keys depend on whether the block is
// prefix compressed, and in practice Pebble never prefix compresses range
// deletion and range key blocks, so these keys are stable. Because of this key
// stability, typically callers only need to perform a shallow clone of the Span
// before Add-ing it to the fragmenter.
//
// Add requires the provided span's keys are sorted in Trailer descending order.
func (f *Fragmenter) Add(s Span) {
	if f.finished {
		panic("bitalostable: span fragmenter already finished")
	} else if s.KeysOrder != ByTrailerDesc {
		panic("bitalostable: span keys unexpectedly not in trailer descending order")
	}
	if f.flushedKey != nil {
		switch c := f.Cmp(s.Start, f.flushedKey); {
		case c < 0:
			panic(fmt.Sprintf("bitalostable: start key (%s) < flushed key (%s)",
				f.Format(s.Start), f.Format(f.flushedKey)))
		}
	}
	if f.Cmp(s.Start, s.End) >= 0 {
		// An empty span, we can ignore it.
		return
	}
	if invariants.RaceEnabled {
		f.checkInvariants(f.pending)
		defer func() { f.checkInvariants(f.pending) }()
	}

	if len(f.pending) > 0 {
		// Since all of the pending spans have the same start key, we only need
		// to compare against the first one.
		switch c := f.Cmp(f.pending[0].Start, s.Start); {
		case c > 0:
			panic(fmt.Sprintf("bitalostable: keys must be added in order: %s > %s",
				f.Format(f.pending[0].Start), f.Format(s.Start)))
		case c == 0:
			// The new span has the same start key as the existing pending
			// spans. Add it to the pending buffer.
			f.pending = append(f.pending, s)
			return
		}

		// At this point we know that the new start key is greater than the pending
		// spans start keys.
		f.truncateAndFlush(s.Start)
	}

	f.pending = append(f.pending, s)
}

// Covers returns true if the specified key is covered by one of the pending
// spans. The key must be consistent with the ordering of the
// spans. That is, it is invalid to specify a key here that is out of
// order with the span start keys passed to Add. The snapshot sequence number
// is forwarded to Span.CoversAt to decide whether a span's keys are visible.
func (f *Fragmenter) Covers(key base.InternalKey, snapshot uint64) bool {
	if f.finished {
		panic("bitalostable: span fragmenter already finished")
	}
	if len(f.pending) == 0 {
		return false
	}

	if f.Cmp(f.pending[0].Start, key.UserKey) > 0 {
		panic(fmt.Sprintf("bitalostable: keys must be in order: %s > %s",
			f.Format(f.pending[0].Start), key.Pretty(f.Format)))
	}

	seqNum := key.SeqNum()
	for _, s := range f.pending {
		if f.Cmp(key.UserKey, s.End) < 0 {
			// NB: A range deletion tombstone does not delete a point operation
			// at the same sequence number, and broadly a span is not considered
			// to cover a point operation at the same sequence number.
			if s.CoversAt(snapshot, seqNum) {
				return true
			}
		}
	}
	return false
}

// Empty returns true if all fragments added so far have finished flushing.
func (f *Fragmenter) Empty() bool {
	return f.finished || len(f.pending) == 0
}

// TruncateAndFlushTo flushes all of the fragments with a start key <= key,
// truncating spans to the specified end key. Used during compaction to force
// emitting of spans which straddle an sstable boundary. Consider
// the scenario:
//
//	a---------k#10
//	      f#8
//	      f#7
//
// Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will
// flush this span:
//
//	a-------g#10
//	      f#8
//	      f#7
//
// And leave this one in f.pending:
//
//	g----k#10
//
// WARNING: The fragmenter could hold on to the specified end key. Ensure it's
// a safe byte slice that could outlast the current sstable output, and one
// that will never be modified.
func (f *Fragmenter) TruncateAndFlushTo(key []byte) {
	if f.finished {
		panic("bitalostable: span fragmenter already finished")
	}
	if f.flushedKey != nil {
		switch c := f.Cmp(key, f.flushedKey); {
		case c < 0:
			panic(fmt.Sprintf("bitalostable: start key (%s) < flushed key (%s)",
				f.Format(key), f.Format(f.flushedKey)))
		}
	}
	if invariants.RaceEnabled {
		f.checkInvariants(f.pending)
		defer func() { f.checkInvariants(f.pending) }()
	}
	if len(f.pending) > 0 {
		// Since all of the pending spans have the same start key, we only need
		// to compare against the first one.
		switch c := f.Cmp(f.pending[0].Start, key); {
		case c > 0:
			panic(fmt.Sprintf("bitalostable: keys must be added in order: %s > %s",
				f.Format(f.pending[0].Start), f.Format(key)))
		case c == 0:
			// The pending spans already start exactly at key; nothing to
			// truncate or flush.
			return
		}
	}
	f.truncateAndFlush(key)
}

// Start returns the start key of the first span in the pending buffer, or nil
// if there are no pending spans. The start key of all pending spans is the same
// as that of the first one.
func (f *Fragmenter) Start() []byte {
	if len(f.pending) > 0 {
		return f.pending[0].Start
	}
	return nil
}

// truncateAndFlush flushes all pending spans up to key (exclusive).
//
// WARNING: although flushedKey keeps its own copy of key, the key slice
// itself is retained — as the End of the flushed fragments and as the Start of
// the spans kept in f.pending — so all callers must ensure it is safe.
func (f *Fragmenter) truncateAndFlush(key []byte) {
	f.flushedKey = append(f.flushedKey[:0], key...)
	done := f.doneBuf[:0]
	pending := f.pending
	f.pending = f.pending[:0]

	// pending and f.pending share the same underlying storage. As we iterate
	// over pending we append to f.pending, but only one entry is appended in
	// each iteration, after we have read the entry being overwritten.
	for _, s := range pending {
		if f.Cmp(key, s.End) < 0 {
			//   s: a--+--e
			// new:    c------
			if f.Cmp(s.Start, key) < 0 {
				done = append(done, Span{
					Start: s.Start,
					End:   key,
					Keys:  s.Keys,
				})
			}
			f.pending = append(f.pending, Span{
				Start: key,
				End:   s.End,
				Keys:  s.Keys,
			})
		} else {
			//   s: a-----e
			// new:       e----
			done = append(done, s)
		}
	}

	// Return done's storage to doneBuf for reuse on the next call before
	// handing the filled slice to flush.
	f.doneBuf = done[:0]
	f.flush(done, nil)
}

// flush a group of range spans to the block. The spans are required to all have
// the same start key. We flush all span fragments until startKey > lastKey. If
// lastKey is nil, all span fragments are flushed. The specification of a
// non-nil lastKey occurs for range deletion tombstones during compaction where
// we want to flush (but not truncate) all range tombstones that start at or
// before the first key in the next sstable. Consider:
//
//	a---e#10
//	a------h#9
//
// If a compaction splits the sstables at key c we want the first sstable to
// contain the tombstones [a,e)#10 and [a,e)#9. Fragmentation would naturally
// produce a tombstone [e,h)#9, but we don't need to output that tombstone to
// the first sstable.
func (f *Fragmenter) flush(buf []Span, lastKey []byte) {
	if invariants.RaceEnabled {
		f.checkInvariants(buf)
	}

	// Sort the spans by end key. This will allow us to walk over the spans and
	// easily determine the next split point (the smallest end-key).
	f.sortBuf.cmp = f.Cmp
	f.sortBuf.buf = buf
	sort.Sort(&f.sortBuf)

	// Loop over the spans, splitting by end key.
	for len(buf) > 0 {
		// A prefix of spans will end at split. remove represents the count of
		// that prefix. Because buf is sorted ascending by end key, any span
		// whose end equals split must lie in that prefix, so a simple equality
		// count over the whole slice is sufficient.
		remove := 1
		split := buf[0].End
		f.flushBuf = append(f.flushBuf[:0], buf[0].Keys...)

		for i := 1; i < len(buf); i++ {
			if f.Cmp(split, buf[i].End) == 0 {
				remove++
			}
			f.flushBuf = append(f.flushBuf, buf[i].Keys...)
		}

		sort.Sort(&f.flushBuf)

		f.Emit(Span{
			Start: buf[0].Start,
			End:   split,
			// Copy the sorted keys to a new slice.
			//
			// This allocation is an unfortunate side effect of the Fragmenter and
			// the expectation that the spans it produces are available in-memory
			// indefinitely.
			//
			// Eventually, we should be able to replace the fragmenter with the
			// keyspan.MergingIter which will perform just-in-time
			// fragmentation, and only guaranteeing the memory lifetime for the
			// current span. The MergingIter fragments while only needing to
			// access one Span per level. It only accesses the Span at the
			// current position for each level. During compactions, we can write
			// these spans to sstables without retaining previous Spans.
			Keys: append([]Key(nil), f.flushBuf...),
		})

		if lastKey != nil && f.Cmp(split, lastKey) > 0 {
			break
		}

		// Adjust the start key for every remaining span.
		buf = buf[remove:]
		for i := range buf {
			buf[i].Start = split
		}
	}
}

// Finish flushes any remaining fragments to the output. It is an error to call
// this if any other spans will be added.
func (f *Fragmenter) Finish() {
	if f.finished {
		panic("bitalostable: span fragmenter already finished")
	}
	f.flush(f.pending, nil)
	f.finished = true
}