github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/metamorphic/key_manager.go

package metamorphic

import (
    "fmt"
    "sort"

    "github.com/cockroachdb/pebble"
    "github.com/cockroachdb/pebble/internal/base"
    "github.com/cockroachdb/pebble/internal/testkeys"
    "github.com/stretchr/testify/require"
)

// objKey is a tuple of (objID, key). This struct is used primarily as a map
// key for keyManager. Only writer objTags can occur here, i.e., dbTag and
// batchTag, since this is used for tracking the keys in a writer.
type objKey struct {
    id  objID
    key []byte
}

// makeObjKey returns a new objKey given an id and key.
func makeObjKey(id objID, key []byte) objKey {
    if id.tag() != dbTag && id.tag() != batchTag {
        panic("unexpected non-writer tag")
    }
    return objKey{id, key}
}

// String implements fmt.Stringer, returning a stable string representation of
// the objKey. This string is used as a map key.
func (o objKey) String() string {
    return fmt.Sprintf("%s:%s", o.id, o.key)
}

type keyUpdate struct {
    deleted bool
    // metaTimestamp at which the write or delete op occurred.
    metaTimestamp int
}

// keyMeta is metadata associated with an (objID, key) pair, where objID is
// a writer containing the key.
type keyMeta struct {
    objKey

    // The number of Sets of the key in this writer.
    sets int
    // The number of Merges of the key in this writer.
    merges int
    // singleDel can be true only if sets <= 1 && merges == 0 and the
    // SingleDelete was added to this writer after the set.
    singleDel bool
    // The number of Deletes of the key in this writer.
    dels int
    // del can be true only if a Delete was added to this writer after the
    // Sets and Merges counted above.
    del bool

    // updateOps should always be ordered by non-decreasing metaTimestamp.
    // updateOps will not be updated if the key is range deleted. Therefore, it
    // is a best-effort sequence of updates to the key. updateOps is used to
    // determine if an iterator created on the DB can read a certain key.
    updateOps []keyUpdate
}

func (m *keyMeta) clear() {
    m.sets = 0
    m.merges = 0
    m.singleDel = false
    m.del = false
    m.dels = 0
    m.updateOps = nil
}

// mergeInto merges this metadata into the metadata for other.
func (m *keyMeta) mergeInto(keyManager *keyManager, other *keyMeta) {
    if other.del && !m.del {
        // m's Sets and Merges are later.
        if m.sets > 0 || m.merges > 0 {
            other.del = false
        }
    } else {
        other.del = m.del
    }
    // Sets, merges, dels are additive.
    other.sets += m.sets
    other.merges += m.merges
    other.dels += m.dels

    // Single deletes are preserved. This is valid since we are also
    // maintaining a global invariant that SingleDelete will only be added for
    // a key that has no inflight Sets or Merges (Sets have made their way to
    // the DB), and no subsequent Sets or Merges will happen until the
    // SingleDelete makes its way to the DB.
    other.singleDel = other.singleDel || m.singleDel
    if other.singleDel {
        if other.sets > 1 || other.merges > 0 || other.dels > 0 {
            panic(fmt.Sprintf("invalid sets %d or merges %d or dels %d",
                other.sets, other.merges, other.dels))
        }
    }

    // Determine if the key is visible or not after the keyMetas are merged.
    // TODO(bananabrick): We currently only care about key updates which make it
    // to the DB, since we only use key updates to determine if an iterator
    // can read a key in the DB. We could extend the timestamp system to add
    // support for iterators created on batches.
    if other.del || other.singleDel {
        other.updateOps = append(
            other.updateOps, keyUpdate{true, keyManager.nextMetaTimestamp()},
        )
    } else {
        other.updateOps = append(
            other.updateOps, keyUpdate{false, keyManager.nextMetaTimestamp()},
        )
    }
}

// keyManager tracks the write operations performed on keys in the generation
// phase of the metamorphic test. It makes the assumption that write
// operations do not fail, since that can cause the keyManager state to be out
// of sync with the actual state of the writers. This assumption is needed to
// correctly decide when it is safe to generate a SingleDelete. This
// assumption is violated in a single place in the metamorphic test: ingestion
// of multiple batches. We sidestep this issue in a narrow way in
// generator.writerIngest by not ingesting multiple batches that contain
// deletes or single deletes, since loss of those specific operations on a key
// is what we cannot tolerate (doing SingleDelete on a key that has not been
// written to because the Set was lost is harmless).
type keyManager struct {
    comparer *base.Comparer

    // metaTimestamp is used to provide an ordering over certain operations,
    // like iter creation and updates to keys. Keeping track of the timestamp
    // allows us to make determinations such as whether a key will be visible
    // to an iterator.
    metaTimestamp int

    // byObjKey tracks the state for each (writer, key) pair. It refers to the
    // same *keyMeta as in the byObj slices. Using a map allows for fast state
    // lookups when changing the state based on a writer operation on the key.
    byObjKey map[string]*keyMeta
    // byObj holds, per writer, the list of keys and what has happened to each
    // key in that writer. Entries are transferred to another writer when
    // needed (e.g., on batch commit, apply, or ingest).
    byObj map[objID][]*keyMeta

    // globalKeys represents all the keys that have been generated so far. Not
    // all these keys have been written to. globalKeys is sorted.
    globalKeys [][]byte
    // globalKeysMap contains the same keys as globalKeys. It ensures no
    // duplication, and contains the aggregate state of the key across all
    // writers, including inflight state that has not made its way to the DB
    // yet. The keyMeta.objKey is uninitialized.
    globalKeysMap map[string]*keyMeta
    // globalKeyPrefixes contains all the key prefixes (as defined by the
    // comparer's Split) generated so far. globalKeyPrefixes is sorted.
    globalKeyPrefixes [][]byte
    // globalKeyPrefixesMap contains the same keys as globalKeyPrefixes. It
    // ensures no duplication.
    globalKeyPrefixesMap map[string]struct{}

    // Using SingleDeletes imposes some constraints on the above state, and
    // causes some state transitions that help with generating complex but
    // correct sequences involving SingleDeletes.
    // - Generating a SingleDelete requires for that key: global.merges==0 &&
    //   global.sets==1 && global.dels==0 && !global.singleDel && (db.sets==1
    //   || writer.sets==1), where global represents the entry in
    //   globalKeysMap[key], db represents the entry in
    //   byObjKey[makeObjKey(makeObjID(dbTag, 0), key)], and writer is the
    //   entry in byObjKey[makeObjKey(writerID, key)].
    //
    // - We do not track state changes due to range deletes, so one should
    //   think of these counts as upper bounds. Also we are not preventing
    //   interactions caused by concurrently in-flight range deletes and
    //   SingleDelete. This is acceptable since it does not cause
    //   non-determinism.
    //
    // - When the SingleDelete is generated, it is recorded as
    //   writer.singleDel=true and global.singleDel=true. No more write
    //   operations are permitted on this key until db.singleDel transitions
    //   to true.
    //
    // - When db.singleDel transitions to true, we are guaranteed that no
    //   writer other than the DB has any writes for this key. We set
    //   db.singleDel and global.singleDel to false and the corresponding sets
    //   and merges counts in global and db also to 0. This allows this key to
    //   fully participate again in write operations. This means we can
    //   generate sequences of the form:
    //   SET => SINGLEDEL => SET* => MERGE* => DEL
    //   SET => SINGLEDEL => SET => SINGLEDEL, among others. (See the
    //   exampleSingleDeleteLifecycle sketch after newKeyManager below.)
    //
    // - The above logic is insufficient to generate sequences of the form
    //   SET => DEL => SET => SINGLEDEL
    //   To do this we need to track Deletes. When db.del transitions to true,
    //   we check if db.sets==global.sets && db.merges==global.merges &&
    //   db.dels==global.dels. If true, there are no in-flight
    //   sets/merges/deletes to this key. We then default initialize the
    //   global and db entries since one can behave as if this key was never
    //   written in this system. This enables the above sequence, among
    //   others. (See the exampleDeleteTransition sketch after update below.)
}

func (k *keyManager) nextMetaTimestamp() int {
    ret := k.metaTimestamp
    k.metaTimestamp++
    return ret
}

var dbObjID objID = makeObjID(dbTag, 0)

// newKeyManager returns a pointer to a new keyManager. Callers should
// interact with it using the addNewKey, eligible*Keys, update, and
// canTolerateApplyFailure methods only.
func newKeyManager() *keyManager {
    m := &keyManager{
        comparer:             testkeys.Comparer,
        byObjKey:             make(map[string]*keyMeta),
        byObj:                make(map[objID][]*keyMeta),
        globalKeysMap:        make(map[string]*keyMeta),
        globalKeyPrefixesMap: make(map[string]struct{}),
    }
    m.byObj[dbObjID] = []*keyMeta{}
    return m
}

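// exampleSingleDeleteLifecycle is a hypothetical sketch; the function name,
// key, and op literals below are illustrative only and are not part of the
// metamorphic test itself. Assuming a setOp/singleDeleteOp constructed with
// only writerID and key populated, it walks the SET => SINGLEDEL transition
// described in the keyManager comment above, using only the supported entry
// points.
func exampleSingleDeleteLifecycle() {
    km := newKeyManager()
    key := []byte("foo@1")
    km.addNewKey(key)

    // A single Set applied directly to the DB makes the key eligible for
    // SingleDelete: global.sets==1, no merges or dels, and db.sets==1.
    km.update(&setOp{writerID: dbObjID, key: key})
    _ = km.eligibleSingleDeleteKeys(dbObjID) // contains "foo@1"

    // The SingleDelete sets db.singleDel and global.singleDel. Because the
    // writer is the DB itself, checkForDelOrSingleDelTransition fires
    // immediately and clears both metas, so the key may fully participate in
    // write operations (including another SET => SINGLEDEL pair) again.
    km.update(&singleDeleteOp{writerID: dbObjID, key: key})
    _ = km.eligibleWriteKeys() // "foo@1" is writable again
}
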
// addNewKey adds the given key to the key manager for global key tracking.
// Returns false iff this is not a new key.
func (k *keyManager) addNewKey(key []byte) bool {
    _, ok := k.globalKeysMap[string(key)]
    if ok {
        return false
    }
    keyString := string(key)
    insertSorted(k.comparer.Compare, &k.globalKeys, key)
    k.globalKeysMap[keyString] = &keyMeta{objKey: objKey{key: key}}

    prefixLen := k.comparer.Split(key)
    if _, ok := k.globalKeyPrefixesMap[keyString[:prefixLen]]; !ok {
        insertSorted(k.comparer.Compare, &k.globalKeyPrefixes, key[:prefixLen])
        k.globalKeyPrefixesMap[keyString[:prefixLen]] = struct{}{}
    }
    return true
}

// getOrInit returns the keyMeta for the (objID, key) pair, if it exists, else
// allocates, initializes and returns a new value.
func (k *keyManager) getOrInit(id objID, key []byte) *keyMeta {
    o := makeObjKey(id, key)
    m, ok := k.byObjKey[o.String()]
    if ok {
        return m
    }
    m = &keyMeta{objKey: makeObjKey(id, key)}
    // Initialize the key-to-meta index.
    k.byObjKey[o.String()] = m
    // Add to the id-to-metas slice.
    k.byObj[o.id] = append(k.byObj[o.id], m)
    return m
}

// contains returns true if the (objID, key) pair is tracked by the keyManager.
func (k *keyManager) contains(id objID, key []byte) bool {
    _, ok := k.byObjKey[makeObjKey(id, key).String()]
    return ok
}

// mergeKeysInto merges all metadata for all keys associated with the "from" ID
// with the metadata for keys associated with the "to" ID.
func (k *keyManager) mergeKeysInto(from, to objID) {
    msFrom, ok := k.byObj[from]
    if !ok {
        msFrom = []*keyMeta{}
        k.byObj[from] = msFrom
    }

    msTo, ok := k.byObj[to]
    if !ok {
        msTo = []*keyMeta{}
        k.byObj[to] = msTo
    }

    // Sort to facilitate a merge.
    sort.Slice(msFrom, func(i, j int) bool {
        return msFrom[i].String() < msFrom[j].String()
    })
    sort.Slice(msTo, func(i, j int) bool {
        return msTo[i].String() < msTo[j].String()
    })

    var msNew []*keyMeta
    var iTo int
    for _, m := range msFrom {
        // Move the cursor on msTo forward.
        for iTo < len(msTo) && string(msTo[iTo].key) < string(m.key) {
            msNew = append(msNew, msTo[iTo])
            iTo++
        }

        var mTo *keyMeta
        if iTo < len(msTo) && string(msTo[iTo].key) == string(m.key) {
            mTo = msTo[iTo]
            iTo++
        } else {
            mTo = &keyMeta{objKey: makeObjKey(to, m.key)}
            k.byObjKey[mTo.String()] = mTo
        }

        m.mergeInto(k, mTo)
        msNew = append(msNew, mTo)

        delete(k.byObjKey, m.String()) // Unlink "from".
    }

    // Add any remaining items from the "to" set.
    for iTo < len(msTo) {
        msNew = append(msNew, msTo[iTo])
        iTo++
    }

    k.byObj[to] = msNew   // Update "to".
    delete(k.byObj, from) // Unlink "from".
}

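// exampleBatchCommitMerge is a hypothetical sketch; the function name, batch
// ID, key, and op literals are illustrative only. It shows how a batch's
// per-key state is folded into the DB's state via mergeKeysInto when the
// batch commits, assuming ops are constructed with only the fields used by
// update below.
func exampleBatchCommitMerge() {
    km := newKeyManager()
    key := []byte("qux@4")
    km.addNewKey(key)

    batchID := makeObjID(batchTag, 1)
    km.update(&setOp{writerID: batchID, key: key})
    // Before the commit, the Set is tracked under the batch, not the DB.
    _ = km.contains(batchID, key) // true
    _ = km.contains(dbObjID, key) // false

    // Committing the batch merges its keyMetas into the DB's keyMetas and
    // unlinks the batch.
    km.update(&batchCommitOp{batchID: batchID})
    _ = km.contains(dbObjID, key) // true
}
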
func (k *keyManager) checkForDelOrSingleDelTransition(dbMeta *keyMeta, globalMeta *keyMeta) {
    if dbMeta.singleDel {
        if !globalMeta.singleDel {
            panic("inconsistency with globalMeta")
        }
        if dbMeta.del || globalMeta.del || dbMeta.dels > 0 || globalMeta.dels > 0 ||
            dbMeta.merges > 0 || globalMeta.merges > 0 || dbMeta.sets != 1 || globalMeta.sets != 1 {
            panic("inconsistency in metas when SingleDelete applied to DB")
        }
        dbMeta.clear()
        globalMeta.clear()
        return
    }
    if dbMeta.del && globalMeta.sets == dbMeta.sets && globalMeta.merges == dbMeta.merges &&
        globalMeta.dels == dbMeta.dels {
        if dbMeta.singleDel || globalMeta.singleDel {
            panic("Delete should not have happened given SingleDelete")
        }
        dbMeta.clear()
        globalMeta.clear()
    }
}

func (k *keyManager) checkForDelOrSingleDelTransitionInDB() {
    keys := k.byObj[dbObjID]
    for _, dbMeta := range keys {
        globalMeta := k.globalKeysMap[string(dbMeta.key)]
        k.checkForDelOrSingleDelTransition(dbMeta, globalMeta)
    }
}

// update updates the internal state of the keyManager according to the given
// op.
func (k *keyManager) update(o op) {
    switch s := o.(type) {
    case *setOp:
        meta := k.getOrInit(s.writerID, s.key)
        globalMeta := k.globalKeysMap[string(s.key)]
        meta.sets++ // Update the set count on this specific (id, key) pair.
        meta.del = false
        globalMeta.sets++
        meta.updateOps = append(meta.updateOps, keyUpdate{false, k.nextMetaTimestamp()})
        if meta.singleDel || globalMeta.singleDel {
            panic("setting a key that has in-flight SingleDelete")
        }
    case *mergeOp:
        meta := k.getOrInit(s.writerID, s.key)
        globalMeta := k.globalKeysMap[string(s.key)]
        meta.merges++
        meta.del = false
        globalMeta.merges++
        meta.updateOps = append(meta.updateOps, keyUpdate{false, k.nextMetaTimestamp()})
        if meta.singleDel || globalMeta.singleDel {
            panic("merging a key that has in-flight SingleDelete")
        }
    case *deleteOp:
        meta := k.getOrInit(s.writerID, s.key)
        globalMeta := k.globalKeysMap[string(s.key)]
        meta.del = true
        globalMeta.del = true
        meta.dels++
        globalMeta.dels++
        meta.updateOps = append(meta.updateOps, keyUpdate{true, k.nextMetaTimestamp()})
        if s.writerID == dbObjID {
            k.checkForDelOrSingleDelTransition(meta, globalMeta)
        }
    case *singleDeleteOp:
        if !k.globalStateIndicatesEligibleForSingleDelete(s.key) {
            panic("key ineligible for SingleDelete")
        }
        meta := k.getOrInit(s.writerID, s.key)
        globalMeta := k.globalKeysMap[string(s.key)]
        meta.singleDel = true
        globalMeta.singleDel = true
        meta.updateOps = append(meta.updateOps, keyUpdate{true, k.nextMetaTimestamp()})
        if s.writerID == dbObjID {
            k.checkForDelOrSingleDelTransition(meta, globalMeta)
        }
    case *ingestOp:
        // For each batch, merge all keys with the keys in the DB.
        for _, batchID := range s.batchIDs {
            k.mergeKeysInto(batchID, dbObjID)
        }
        k.checkForDelOrSingleDelTransitionInDB()
    case *applyOp:
        // Merge the keys from this writer into the parent writer.
        k.mergeKeysInto(s.batchID, s.writerID)
        if s.writerID == dbObjID {
            k.checkForDelOrSingleDelTransitionInDB()
        }
    case *batchCommitOp:
        // Merge the keys from the batch with the keys from the DB.
        k.mergeKeysInto(s.batchID, dbObjID)
        k.checkForDelOrSingleDelTransitionInDB()
    }
}

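// exampleDeleteTransition is a hypothetical sketch; the function name, key,
// and op literals are illustrative only. It walks the Delete-based transition
// implemented by checkForDelOrSingleDelTransition: once a Delete reaches the
// DB with no in-flight sets/merges/deletes, both metas are cleared, which is
// what permits SET => DEL => SET => SINGLEDEL sequences.
func exampleDeleteTransition() {
    km := newKeyManager()
    key := []byte("bar@2")
    km.addNewKey(key)

    // SET then DEL, both applied directly to the DB. When the Delete is
    // processed, db.sets==global.sets, db.merges==global.merges and
    // db.dels==global.dels, so checkForDelOrSingleDelTransition clears the
    // db and global metas.
    km.update(&setOp{writerID: dbObjID, key: key})
    km.update(&deleteOp{writerID: dbObjID, key: key})

    // The key now behaves as if it had never been written, so a fresh Set
    // makes it eligible for SingleDelete again.
    km.update(&setOp{writerID: dbObjID, key: key})
    _ = km.eligibleSingleDeleteKeys(dbObjID) // contains "bar@2"
    km.update(&singleDeleteOp{writerID: dbObjID, key: key})
}
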
func (k *keyManager) eligibleReadKeys() (keys [][]byte) {
    return k.globalKeys
}

// eligibleReadKeysInRange returns all eligible read keys within the range
// [start,end). The returned slice is owned by the keyManager and must not be
// retained.
func (k *keyManager) eligibleReadKeysInRange(kr pebble.KeyRange) (keys [][]byte) {
    s := sort.Search(len(k.globalKeys), func(i int) bool {
        return k.comparer.Compare(k.globalKeys[i], kr.Start) >= 0
    })
    e := sort.Search(len(k.globalKeys), func(i int) bool {
        return k.comparer.Compare(k.globalKeys[i], kr.End) >= 0
    })
    if s >= e {
        return nil
    }
    return k.globalKeys[s:e]
}

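// exampleEligibleReadKeysInRange is a hypothetical usage sketch of
// eligibleReadKeysInRange; the function name and keys are illustrative only.
// The range is half-open, so a key equal to kr.End is excluded.
func exampleEligibleReadKeysInRange() {
    km := newKeyManager()
    for _, s := range []string{"a@1", "b@1", "c@1"} {
        km.addNewKey([]byte(s))
    }
    // Returns ["a@1", "b@1"]; "c@1" is excluded because it equals kr.End.
    _ = km.eligibleReadKeysInRange(pebble.KeyRange{Start: []byte("a@1"), End: []byte("c@1")})
}
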
func (k *keyManager) prefixes() (prefixes [][]byte) {
    return k.globalKeyPrefixes
}

// prefixExists returns true if a key has been generated with the provided
// prefix before.
func (k *keyManager) prefixExists(prefix []byte) bool {
    _, exists := k.globalKeyPrefixesMap[string(prefix)]
    return exists
}

func (k *keyManager) eligibleWriteKeys() (keys [][]byte) {
    // Creating and sorting this slice of keys is wasteful given that the
    // caller will pick one, but makes it simpler for unit testing.
    for _, v := range k.globalKeysMap {
        if v.singleDel {
            continue
        }
        keys = append(keys, v.key)
    }
    sort.Slice(keys, func(i, j int) bool {
        return k.comparer.Compare(keys[i], keys[j]) < 0
    })
    return keys
}

// eligibleSingleDeleteKeys returns a slice of keys that can be safely single
// deleted, given the writer id.
func (k *keyManager) eligibleSingleDeleteKeys(id objID) (keys [][]byte) {
    // Creating and sorting this slice of keys is wasteful given that the
    // caller will pick one, but makes it simpler for unit testing.
    addForObjID := func(id objID) {
        for _, m := range k.byObj[id] {
            if m.sets == 1 && k.globalStateIndicatesEligibleForSingleDelete(m.key) {
                keys = append(keys, m.key)
            }
        }
    }
    addForObjID(id)
    if id != dbObjID {
        addForObjID(dbObjID)
    }
    sort.Slice(keys, func(i, j int) bool {
        return k.comparer.Compare(keys[i], keys[j]) < 0
    })
    return keys
}

func (k *keyManager) globalStateIndicatesEligibleForSingleDelete(key []byte) bool {
    m := k.globalKeysMap[string(key)]
    return m.merges == 0 && m.sets == 1 && m.dels == 0 && !m.singleDel
}

// canTolerateApplyFailure is called with a batch ID and returns true iff a
// failure to apply this batch to the DB can be tolerated. (See the
// exampleCanTolerateApplyFailure sketch at the end of this file.)
func (k *keyManager) canTolerateApplyFailure(id objID) bool {
    if id.tag() != batchTag {
        panic("called with an objID that is not a batch")
    }
    ms, ok := k.byObj[id]
    if !ok {
        return true
    }
    for _, m := range ms {
        if m.singleDel || m.del {
            return false
        }
    }
    return true
}

func opWrittenKeys(untypedOp op) [][]byte {
    switch t := untypedOp.(type) {
    case *applyOp:
    case *batchCommitOp:
    case *checkpointOp:
    case *closeOp:
    case *compactOp:
    case *dbRestartOp:
    case *deleteOp:
        return [][]byte{t.key}
    case *deleteRangeOp:
        return [][]byte{t.start, t.end}
    case *flushOp:
    case *getOp:
    case *ingestOp:
    case *initOp:
    case *iterFirstOp:
    case *iterLastOp:
    case *iterNextOp:
    case *iterNextPrefixOp:
    case *iterCanSingleDelOp:
    case *iterPrevOp:
    case *iterSeekGEOp:
    case *iterSeekLTOp:
    case *iterSeekPrefixGEOp:
    case *iterSetBoundsOp:
    case *iterSetOptionsOp:
    case *mergeOp:
        return [][]byte{t.key}
    case *newBatchOp:
    case *newIndexedBatchOp:
    case *newIterOp:
    case *newIterUsingCloneOp:
    case *newSnapshotOp:
    case *rangeKeyDeleteOp:
    case *rangeKeySetOp:
    case *rangeKeyUnsetOp:
    case *setOp:
        return [][]byte{t.key}
    case *singleDeleteOp:
        return [][]byte{t.key}
    }
    return nil
}

func loadPrecedingKeys(t TestingT, ops []op, cfg *config, m *keyManager) {
    for _, op := range ops {
        // Pretend we're generating all the operation's keys as potential new
        // keys, so that we update the key manager's keys and prefix sets.
        for _, k := range opWrittenKeys(op) {
            m.addNewKey(k)

            // If the key has a suffix, ratchet up the suffix distribution if
            // necessary.
            if s := m.comparer.Split(k); s < len(k) {
                suffix, err := testkeys.ParseSuffix(k[s:])
                require.NoError(t, err)
                if uint64(suffix) > cfg.writeSuffixDist.Max() {
                    diff := int(uint64(suffix) - cfg.writeSuffixDist.Max())
                    cfg.writeSuffixDist.IncMax(diff)
                }
            }
        }

        // Update key tracking state.
        m.update(op)
    }
}

func insertSorted(cmp base.Compare, dst *[][]byte, k []byte) {
    s := *dst
    i := sort.Search(len(*dst), func(i int) bool {
        return cmp((*dst)[i], k) >= 0
    })
    if i == len(s) {
        *dst = append(*dst, k)
        return
    }
    *dst = append((*dst)[:i+1], (*dst)[i:]...)
    (*dst)[i] = k
}

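// exampleCanTolerateApplyFailure is a hypothetical sketch; the function name,
// batch ID, key, and op literals are illustrative only. It shows why a batch
// containing only Sets may be lost without violating the SingleDelete
// bookkeeping, whereas a batch containing a Delete (or SingleDelete) may not.
func exampleCanTolerateApplyFailure() {
    km := newKeyManager()
    key := []byte("baz@3")
    km.addNewKey(key)

    batchID := makeObjID(batchTag, 2)
    km.update(&setOp{writerID: batchID, key: key})
    _ = km.canTolerateApplyFailure(batchID) // true: losing a Set is harmless

    km.update(&deleteOp{writerID: batchID, key: key})
    _ = km.canTolerateApplyFailure(batchID) // false: the Delete must not be lost
}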