github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/rangedel/fragmenter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package rangedel 6 7 import ( 8 "fmt" 9 "sort" 10 11 "github.com/petermattis/pebble/internal/base" 12 ) 13 14 type tombstonesByStartKey struct { 15 cmp base.Compare 16 buf []Tombstone 17 } 18 19 func (v *tombstonesByStartKey) Len() int { return len(v.buf) } 20 func (v *tombstonesByStartKey) Less(i, j int) bool { 21 return base.InternalCompare(v.cmp, v.buf[i].Start, v.buf[j].Start) < 0 22 } 23 func (v *tombstonesByStartKey) Swap(i, j int) { 24 v.buf[i], v.buf[j] = v.buf[j], v.buf[i] 25 } 26 27 type tombstonesByEndKey struct { 28 cmp base.Compare 29 buf []Tombstone 30 } 31 32 func (v *tombstonesByEndKey) Len() int { return len(v.buf) } 33 func (v *tombstonesByEndKey) Less(i, j int) bool { 34 return v.cmp(v.buf[i].End, v.buf[j].End) < 0 35 } 36 func (v *tombstonesByEndKey) Swap(i, j int) { 37 v.buf[i], v.buf[j] = v.buf[j], v.buf[i] 38 } 39 40 type tombstonesBySeqNum []Tombstone 41 42 func (v *tombstonesBySeqNum) Len() int { return len(*v) } 43 func (v *tombstonesBySeqNum) Less(i, j int) bool { 44 return (*v)[i].Start.SeqNum() > (*v)[j].Start.SeqNum() 45 } 46 func (v *tombstonesBySeqNum) Swap(i, j int) { 47 (*v)[i], (*v)[j] = (*v)[j], (*v)[i] 48 } 49 50 // Sort the tombstones by start key. This is the ordering required by the 51 // Fragmenter. Usually tombstones are naturally sorted by their start key, but 52 // that isn't true for tombstones in the legacy range-del-v1 block format. 53 func Sort(cmp base.Compare, tombstones []Tombstone) { 54 sorter := tombstonesByStartKey{ 55 cmp: cmp, 56 buf: tombstones, 57 } 58 sort.Sort(&sorter) 59 } 60 61 // Fragmenter fragments a set of range tombstones such that overlapping 62 // tombstones are split at their overlap points. The fragmented tombstones are 63 // output to the supplied Output function. 64 type Fragmenter struct { 65 Cmp base.Compare 66 // Emit is called to emit a chunk of tombstone fragments. Every tombstone 67 // within the chunk has the same start and end key, and differ only by 68 // sequence number. 69 Emit func([]Tombstone) 70 // pending contains the list of pending range tombstone fragments that have 71 // not been flushed to the block writer. Note that the tombstones have not 72 // been fragmented on the end keys yet. That happens as the tombstones are 73 // flushed. 74 pending []Tombstone 75 doneBuf []Tombstone 76 sortBuf tombstonesByEndKey 77 flushBuf tombstonesBySeqNum 78 finished bool 79 } 80 81 func (f *Fragmenter) checkSameStart(buf []Tombstone) { 82 for i := 1; i < len(buf); i++ { 83 if f.Cmp(buf[i-1].Start.UserKey, buf[i].Start.UserKey) != 0 { 84 panic(fmt.Sprintf("pebble: pending tombstone invariant violated: %s %s", 85 buf[i-1].Start, buf[i].Start)) 86 } 87 } 88 } 89 90 func (f *Fragmenter) checkInvariants() { 91 f.checkSameStart(f.pending) 92 } 93 94 // Add adds a tombstone to the fragmenter. Tombstones may overlap and the 95 // fragmenter will internally split them. The tombstones must be presented in 96 // increasing start key order. That is, Add must be called with a series of 97 // tombstones like: 98 // 99 // a---e 100 // c---g 101 // c-----i 102 // j---n 103 // j-l 104 // 105 // We need to fragment the tombstones at overlap points. In the above 106 // example, we'd create: 107 // 108 // a-c-e 109 // c-e-g 110 // c-e-g-i 111 // j-l-n 112 // j-l 113 // 114 // The fragments need to be output sorted by start key, and for equal start 115 // keys, sorted by descending sequence number. This last part requires a mild 116 // bit of care as the fragments are not created in descending sequence number 117 // order. 118 // 119 // Once a start key has been seen, we know that we'll never see a smaller 120 // start key and can thus flush all of the fragments that lie before that 121 // start key. 122 // 123 // Walking through the example above, we start with: 124 // 125 // a---e 126 // 127 // Next we add [c,g) resulting in: 128 // 129 // a-c-e 130 // c---g 131 // 132 // The fragment [a,c) is flushed leaving the pending tombstones as: 133 // 134 // c-e 135 // c---g 136 // 137 // The next tombstone is [c,i): 138 // 139 // c-e 140 // c---g 141 // c-----i 142 // 143 // No fragments are flushed. The next tombstone is [j,n): 144 // 145 // c-e 146 // c---g 147 // c-----i 148 // j---n 149 // 150 // The fragments [c,e), [c,g) and [c,i) are flushed. We sort these fragments 151 // by their end key, then split the fragments on the end keys: 152 // 153 // c-e 154 // c-e-g 155 // c-e---i 156 // 157 // The [c,e) fragments all get flushed leaving: 158 // 159 // e-g 160 // e---i 161 // 162 // This process continues until there are no more fragments to flush. 163 // 164 // WARNING: the slices backing start.UserKey and end are retained after this 165 // method returns and should not be modified. This is safe for tombstones that 166 // are added from a memtable or batch. It is not safe for a tombstone added 167 // from an sstable where the range-del block has been prefix compressed. 168 func (f *Fragmenter) Add(start base.InternalKey, end []byte) { 169 if f.finished { 170 panic("pebble: tombstone fragmenter already finished") 171 } 172 if raceEnabled { 173 f.checkInvariants() 174 defer f.checkInvariants() 175 } 176 177 if len(f.pending) > 0 { 178 // Since all of the pending tombstones have the same start key, we only need 179 // to compare against the first one. 180 switch c := f.Cmp(f.pending[0].Start.UserKey, start.UserKey); { 181 case c > 0: 182 panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s", 183 f.pending[0].Start, start)) 184 case c == 0: 185 // The new tombstone has the same start key as the existing pending 186 // tombstones. Add it to the pending buffer. 187 f.pending = append(f.pending, Tombstone{ 188 Start: start, 189 End: end, 190 }) 191 return 192 } 193 194 // At this point we know that the new start key is greater than the pending 195 // tombstones start keys. 196 f.truncateAndFlush(start.UserKey) 197 } 198 199 f.pending = append(f.pending, Tombstone{ 200 Start: start, 201 End: end, 202 }) 203 } 204 205 // Deleted returns true if the specified key is covered by one of the pending 206 // tombstones. The key must be consistent with the ordering of the 207 // tombstones. That is, it is invalid to specify a key here that is out of 208 // order with the tombstone start keys passed to Add. 209 func (f *Fragmenter) Deleted(key base.InternalKey, snapshot uint64) bool { 210 if f.finished { 211 panic("pebble: tombstone fragmenter already finished") 212 } 213 if len(f.pending) == 0 { 214 return false 215 } 216 217 if f.Cmp(f.pending[0].Start.UserKey, key.UserKey) > 0 { 218 panic(fmt.Sprintf("pebble: keys must be in order: %s > %s", 219 f.pending[0].Start, key)) 220 } 221 222 seqNum := key.SeqNum() 223 flush := true 224 for _, t := range f.pending { 225 if f.Cmp(key.UserKey, t.End) < 0 { 226 // NB: A range deletion tombstone deletes a point operation at the same 227 // sequence number. 228 if t.Start.Visible(snapshot) && t.Start.SeqNum() > seqNum { 229 return true 230 } 231 flush = false 232 } 233 } 234 235 if flush { 236 // All of the pending tombstones ended before the specified key which means 237 // we can flush them without causing fragmentation at key. This is an 238 // optimization to allow flushing the pending tombstones as early as 239 // possible so that we don't have to continually reconsider them in 240 // Deleted. 241 f.flush(f.pending, true /* all */) 242 f.pending = f.pending[:0] 243 } 244 return false 245 } 246 247 // FlushTo flushes all of the fragments before key. Used internally by Add to 248 // flush tombstone fragments, and can be used externally to fragment tombstones 249 // during compaction when a tombstone straddles an sstable boundary. 250 func (f *Fragmenter) FlushTo(key []byte) { 251 if f.finished { 252 panic("pebble: tombstone fragmenter already finished") 253 } 254 if len(f.pending) == 0 { 255 return 256 } 257 // Since all of the pending tombstones have the same start key, we only need 258 // to compare against the first one. 259 switch c := f.Cmp(f.pending[0].Start.UserKey, key); { 260 case c > 0: 261 panic(fmt.Sprintf("pebble: keys must be in order: %s > %s", 262 f.pending[0].Start, key)) 263 } 264 265 // At this point we know that the new start key is greater than the pending 266 // tombstones start keys. We flush the pending first set of fragments for the 267 // pending tombstones. 268 f.flush(f.pending, false /* all */) 269 270 for i := range f.pending { 271 f.pending[i].Start.UserKey = key 272 } 273 } 274 275 func (f *Fragmenter) truncateAndFlush(key []byte) { 276 done := f.doneBuf[:0] 277 pending := f.pending 278 f.pending = f.pending[:0] 279 280 for _, t := range pending { 281 if f.Cmp(key, t.End) < 0 { 282 // t: a--+--e 283 // new: c------ 284 done = append(done, Tombstone{Start: t.Start, End: key}) 285 f.pending = append(f.pending, Tombstone{ 286 Start: base.MakeInternalKey(key, t.Start.SeqNum(), t.Start.Kind()), 287 End: t.End, 288 }) 289 } else { 290 // t: a-----e 291 // new: e---- 292 done = append(done, t) 293 } 294 } 295 296 f.doneBuf = done[:0] 297 f.flush(done, true /* all */) 298 } 299 300 // flush a group of range tombstones to the block. The tombstones are required 301 // to all have the same start key. 302 func (f *Fragmenter) flush(buf []Tombstone, all bool) { 303 if raceEnabled { 304 f.checkSameStart(buf) 305 } 306 307 // Sort the tombstones by end key. This will allow us to walk over the 308 // tombstones and easily determine the next split point (the smallest 309 // end-key). 310 f.sortBuf.cmp = f.Cmp 311 f.sortBuf.buf = buf 312 sort.Sort(&f.sortBuf) 313 314 // Loop over the range tombstones, splitting by end key. 315 for len(buf) > 0 { 316 remove := 1 317 split := buf[0].End 318 f.flushBuf = append(f.flushBuf[:0], buf[0]) 319 320 for i := 1; i < len(buf); i++ { 321 if f.Cmp(split, buf[i].End) == 0 { 322 remove++ 323 } 324 f.flushBuf = append(f.flushBuf, Tombstone{ 325 Start: buf[i].Start, 326 End: split, 327 }) 328 } 329 330 buf = buf[remove:] 331 332 sort.Sort(&f.flushBuf) 333 f.Emit(f.flushBuf) 334 335 if !all { 336 break 337 } 338 339 // Adjust the start key for every remaining tombstone. 340 for i := range buf { 341 buf[i].Start.UserKey = split 342 } 343 } 344 } 345 346 // Finish flushes any remaining fragments to the output. It is an error to call 347 // this if any other tombstones will be added. 348 func (f *Fragmenter) Finish() { 349 if f.finished { 350 panic("pebble: tombstone fragmenter already finished") 351 } 352 f.flush(f.pending, true /* all */) 353 f.finished = true 354 }