github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/metamorphic/ops.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package metamorphic

import (
	"bytes"
	"context"
	"crypto/rand"
	"encoding/binary"
	"fmt"
	"io"
	"path"
	"path/filepath"
	"strings"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/private"
	"github.com/cockroachdb/pebble/internal/rangekey"
	"github.com/cockroachdb/pebble/internal/testkeys"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
	"github.com/cockroachdb/pebble/sstable"
	"github.com/cockroachdb/pebble/vfs/errorfs"
)

// op defines the interface for a single operation, such as creating a batch,
// or advancing an iterator.
type op interface {
	String() string
	run(t *test, h historyRecorder)

	// receiver returns the object ID of the object the operation is performed
	// on. Every operation has a receiver (eg, batch0.Set(...) has `batch0` as
	// its receiver). Receivers are used for synchronization when running with
	// concurrency.
	receiver() objID

	// syncObjs returns an additional set of object IDs—excluding the
	// receiver—that the operation must synchronize with. At execution time,
	// the operation will run serially with respect to all other operations
	// that return these objects from their own syncObjs or receiver methods.
	syncObjs() objIDSlice
}

// initOp performs test initialization.
type initOp struct {
	dbSlots       uint32
	batchSlots    uint32
	iterSlots     uint32
	snapshotSlots uint32
}

func (o *initOp) run(t *test, h historyRecorder) {
	t.batches = make([]*pebble.Batch, o.batchSlots)
	t.iters = make([]*retryableIter, o.iterSlots)
	t.snapshots = make([]readerCloser, o.snapshotSlots)
	h.Recordf("%s", o)
}

func (o *initOp) String() string {
	return fmt.Sprintf("Init(%d /* dbs */, %d /* batches */, %d /* iters */, %d /* snapshots */)",
		o.dbSlots, o.batchSlots, o.iterSlots, o.snapshotSlots)
}

func (o *initOp) receiver() objID { return makeObjID(dbTag, 1) }
func (o *initOp) syncObjs() objIDSlice {
	syncObjs := make([]objID, 0)
	// Add any additional DBs to syncObjs.
	for i := uint32(2); i < o.dbSlots+1; i++ {
		syncObjs = append(syncObjs, makeObjID(dbTag, i))
	}
	return syncObjs
}

// applyOp models a Writer.Apply operation.
type applyOp struct {
	writerID objID
	batchID  objID
}

func (o *applyOp) run(t *test, h historyRecorder) {
	b := t.getBatch(o.batchID)
	w := t.getWriter(o.writerID)
	var err error
	if o.writerID.tag() == dbTag && t.testOpts.asyncApplyToDB && t.writeOpts.Sync {
		err = w.(*pebble.DB).ApplyNoSyncWait(b, t.writeOpts)
		if err == nil {
			err = b.SyncWait()
		}
	} else {
		err = w.Apply(b, t.writeOpts)
	}
	h.Recordf("%s // %v", o, err)
	// The batch will be closed by a closeOp, which is guaranteed to be
	// generated.
}

func (o *applyOp) String() string  { return fmt.Sprintf("%s.Apply(%s)", o.writerID, o.batchID) }
func (o *applyOp) receiver() objID { return o.writerID }
func (o *applyOp) syncObjs() objIDSlice {
	// Apply should not be concurrent with operations that are mutating the
	// batch.
	return []objID{o.batchID}
}
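
// Illustrative sketch (hypothetical helper, not part of this package): per the
// op interface contract above, a concurrent runner may execute two ops in
// parallel only when the sets formed by {receiver()} plus syncObjs() of the
// two ops are disjoint. Assuming a per-object lock table, serialization could
// look roughly like the following; the real runner in this package may
// implement the same contract differently.
//
//	func runSerialized(locks map[objID]*sync.Mutex, o op, t *test, h historyRecorder) {
//		ids := append(objIDSlice{o.receiver()}, o.syncObjs()...)
//		// Lock in a fixed order to avoid deadlock between concurrent ops.
//		sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] })
//		for _, id := range ids {
//			locks[id].Lock()
//			defer locks[id].Unlock() // held until the op finishes running
//		}
//		o.run(t, h)
//	}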

// checkpointOp models a DB.Checkpoint operation.
type checkpointOp struct {
	dbID objID
	// If non-empty, the checkpoint is restricted to these spans.
	spans []pebble.CheckpointSpan
}

func (o *checkpointOp) run(t *test, h historyRecorder) {
	// TODO(josh): db.Checkpoint does not work with shared storage yet.
	// It would be better to filter out ahead of calling run on the op,
	// by setting the weight that generator.go uses to zero, or similar.
	// But IIUC the ops are shared for ALL the metamorphic test runs, so
	// not sure how to do that easily:
	// https://github.com/cockroachdb/pebble/blob/master/metamorphic/meta.go#L177
	if t.testOpts.sharedStorageEnabled {
		h.Recordf("%s // %v", o, nil)
		return
	}
	var opts []pebble.CheckpointOption
	if len(o.spans) > 0 {
		opts = append(opts, pebble.WithRestrictToSpans(o.spans))
	}
	db := t.getDB(o.dbID)
	err := withRetries(func() error {
		return db.Checkpoint(o.dir(t.dir, h.op), opts...)
	})
	h.Recordf("%s // %v", o, err)
}

func (o *checkpointOp) dir(dataDir string, idx int) string {
	return filepath.Join(dataDir, "checkpoints", fmt.Sprintf("op-%06d", idx))
}

func (o *checkpointOp) String() string {
	var spanStr bytes.Buffer
	for i, span := range o.spans {
		if i > 0 {
			spanStr.WriteString(",")
		}
		fmt.Fprintf(&spanStr, "%q,%q", span.Start, span.End)
	}
	return fmt.Sprintf("%s.Checkpoint(%s)", o.dbID, spanStr.String())
}

func (o *checkpointOp) receiver() objID      { return o.dbID }
func (o *checkpointOp) syncObjs() objIDSlice { return nil }

// closeOp models a {Batch,Iterator,Snapshot}.Close operation.
type closeOp struct {
	objID       objID
	derivedDBID objID
}

func (o *closeOp) run(t *test, h historyRecorder) {
	c := t.getCloser(o.objID)
	if o.objID.tag() == dbTag && t.opts.DisableWAL {
		// Special case: If WAL is disabled, do a flush right before DB Close. This
		// allows us to reuse this run's data directory as initial state for
		// future runs without losing any mutations.
		_ = t.getDB(o.objID).Flush()
	}
	t.clearObj(o.objID)
	err := c.Close()
	h.Recordf("%s // %v", o, err)
}

func (o *closeOp) String() string  { return fmt.Sprintf("%s.Close()", o.objID) }
func (o *closeOp) receiver() objID { return o.objID }
func (o *closeOp) syncObjs() objIDSlice {
	// Synchronize on the database so that we don't close the database before
	// all its iterators, snapshots and batches are closed.
	// TODO(jackson): It would be nice to relax this so that Close calls can
	// execute in parallel.
	if o.objID.tag() == dbTag {
		return nil
	}
	if o.derivedDBID != 0 {
		return []objID{o.derivedDBID}
	}
	return nil
}

// compactOp models a DB.Compact operation.
type compactOp struct {
	dbID        objID
	start       []byte
	end         []byte
	parallelize bool
}

func (o *compactOp) run(t *test, h historyRecorder) {
	err := withRetries(func() error {
		return t.getDB(o.dbID).Compact(o.start, o.end, o.parallelize)
	})
	h.Recordf("%s // %v", o, err)
}

func (o *compactOp) String() string {
	return fmt.Sprintf("%s.Compact(%q, %q, %t /* parallelize */)", o.dbID, o.start, o.end, o.parallelize)
}

func (o *compactOp) receiver() objID      { return o.dbID }
func (o *compactOp) syncObjs() objIDSlice { return nil }

// deleteOp models a Write.Delete operation.
type deleteOp struct {
	writerID objID
	key      []byte

	derivedDBID objID
}

func (o *deleteOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	var err error
	if t.testOpts.deleteSized && t.isFMV(o.derivedDBID, pebble.FormatDeleteSizedAndObsolete) {
		// Call DeleteSized with a deterministic size derived from the index.
		// The size does not need to be accurate for correctness.
		err = w.DeleteSized(o.key, hashSize(t.idx), t.writeOpts)
	} else {
		err = w.Delete(o.key, t.writeOpts)
	}
	h.Recordf("%s // %v", o, err)
}

func hashSize(index int) uint32 {
	// Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
	return uint32((11400714819323198485 * uint64(index)) % maxValueSize)
}

func (o *deleteOp) String() string {
	return fmt.Sprintf("%s.Delete(%q)", o.writerID, o.key)
}
func (o *deleteOp) receiver() objID      { return o.writerID }
func (o *deleteOp) syncObjs() objIDSlice { return nil }

// singleDeleteOp models a Write.SingleDelete operation.
type singleDeleteOp struct {
	writerID           objID
	key                []byte
	maybeReplaceDelete bool
}

func (o *singleDeleteOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	var err error
	if t.testOpts.replaceSingleDelete && o.maybeReplaceDelete {
		err = w.Delete(o.key, t.writeOpts)
	} else {
		err = w.SingleDelete(o.key, t.writeOpts)
	}
	// NOTE: even if the SINGLEDEL was replaced with a DELETE, we must still
	// write the former to the history log. The log line will indicate whether
	// or not the delete *could* have been replaced. The OPTIONS file should
	// also be consulted to determine what happened at runtime (i.e. by taking
	// the logical AND).
	h.Recordf("%s // %v", o, err)
}

func (o *singleDeleteOp) String() string {
	return fmt.Sprintf("%s.SingleDelete(%q, %v /* maybeReplaceDelete */)", o.writerID, o.key, o.maybeReplaceDelete)
}

func (o *singleDeleteOp) receiver() objID      { return o.writerID }
func (o *singleDeleteOp) syncObjs() objIDSlice { return nil }

// deleteRangeOp models a Write.DeleteRange operation.
type deleteRangeOp struct {
	writerID objID
	start    []byte
	end      []byte
}

func (o *deleteRangeOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	err := w.DeleteRange(o.start, o.end, t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *deleteRangeOp) String() string {
	return fmt.Sprintf("%s.DeleteRange(%q, %q)", o.writerID, o.start, o.end)
}

func (o *deleteRangeOp) receiver() objID      { return o.writerID }
func (o *deleteRangeOp) syncObjs() objIDSlice { return nil }

// flushOp models a DB.Flush operation.
type flushOp struct {
	db objID
}

func (o *flushOp) run(t *test, h historyRecorder) {
	db := t.getDB(o.db)
	err := db.Flush()
	h.Recordf("%s // %v", o, err)
}

func (o *flushOp) String() string       { return fmt.Sprintf("%s.Flush()", o.db) }
func (o *flushOp) receiver() objID      { return o.db }
func (o *flushOp) syncObjs() objIDSlice { return nil }

// mergeOp models a Write.Merge operation.
type mergeOp struct {
	writerID objID
	key      []byte
	value    []byte
}

func (o *mergeOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	err := w.Merge(o.key, o.value, t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *mergeOp) String() string       { return fmt.Sprintf("%s.Merge(%q, %q)", o.writerID, o.key, o.value) }
func (o *mergeOp) receiver() objID      { return o.writerID }
func (o *mergeOp) syncObjs() objIDSlice { return nil }

// setOp models a Write.Set operation.
type setOp struct {
	writerID objID
	key      []byte
	value    []byte
}

func (o *setOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	err := w.Set(o.key, o.value, t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *setOp) String() string       { return fmt.Sprintf("%s.Set(%q, %q)", o.writerID, o.key, o.value) }
func (o *setOp) receiver() objID      { return o.writerID }
func (o *setOp) syncObjs() objIDSlice { return nil }

// rangeKeyDeleteOp models a Write.RangeKeyDelete operation.
type rangeKeyDeleteOp struct {
	writerID objID
	start    []byte
	end      []byte
}

func (o *rangeKeyDeleteOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	err := w.RangeKeyDelete(o.start, o.end, t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *rangeKeyDeleteOp) String() string {
	return fmt.Sprintf("%s.RangeKeyDelete(%q, %q)", o.writerID, o.start, o.end)
}

func (o *rangeKeyDeleteOp) receiver() objID      { return o.writerID }
func (o *rangeKeyDeleteOp) syncObjs() objIDSlice { return nil }

// rangeKeySetOp models a Write.RangeKeySet operation.
type rangeKeySetOp struct {
	writerID objID
	start    []byte
	end      []byte
	suffix   []byte
	value    []byte
}

func (o *rangeKeySetOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	err := w.RangeKeySet(o.start, o.end, o.suffix, o.value, t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *rangeKeySetOp) String() string {
	return fmt.Sprintf("%s.RangeKeySet(%q, %q, %q, %q)",
		o.writerID, o.start, o.end, o.suffix, o.value)
}

func (o *rangeKeySetOp) receiver() objID      { return o.writerID }
func (o *rangeKeySetOp) syncObjs() objIDSlice { return nil }

// rangeKeyUnsetOp models a Write.RangeKeyUnset operation.
type rangeKeyUnsetOp struct {
	writerID objID
	start    []byte
	end      []byte
	suffix   []byte
}

func (o *rangeKeyUnsetOp) run(t *test, h historyRecorder) {
	w := t.getWriter(o.writerID)
	err := w.RangeKeyUnset(o.start, o.end, o.suffix, t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *rangeKeyUnsetOp) String() string {
	return fmt.Sprintf("%s.RangeKeyUnset(%q, %q, %q)",
		o.writerID, o.start, o.end, o.suffix)
}

func (o *rangeKeyUnsetOp) receiver() objID      { return o.writerID }
func (o *rangeKeyUnsetOp) syncObjs() objIDSlice { return nil }

// newBatchOp models a Write.NewBatch operation.
type newBatchOp struct {
	dbID    objID
	batchID objID
}

func (o *newBatchOp) run(t *test, h historyRecorder) {
	b := t.getDB(o.dbID).NewBatch()
	t.setBatch(o.batchID, b)
	h.Recordf("%s", o)
}

func (o *newBatchOp) String() string  { return fmt.Sprintf("%s = %s.NewBatch()", o.batchID, o.dbID) }
func (o *newBatchOp) receiver() objID { return o.dbID }
func (o *newBatchOp) syncObjs() objIDSlice {
	// NewBatch should not be concurrent with operations that interact with that
	// same batch.
	return []objID{o.batchID}
}

// newIndexedBatchOp models a Write.NewIndexedBatch operation.
type newIndexedBatchOp struct {
	dbID    objID
	batchID objID
}

func (o *newIndexedBatchOp) run(t *test, h historyRecorder) {
	b := t.getDB(o.dbID).NewIndexedBatch()
	t.setBatch(o.batchID, b)
	h.Recordf("%s", o)
}

func (o *newIndexedBatchOp) String() string {
	return fmt.Sprintf("%s = %s.NewIndexedBatch()", o.batchID, o.dbID)
}
func (o *newIndexedBatchOp) receiver() objID { return o.dbID }
func (o *newIndexedBatchOp) syncObjs() objIDSlice {
	// NewIndexedBatch should not be concurrent with operations that interact
	// with that same batch.
	return []objID{o.batchID}
}

// batchCommitOp models a Batch.Commit operation.
type batchCommitOp struct {
	dbID    objID
	batchID objID
}

func (o *batchCommitOp) run(t *test, h historyRecorder) {
	b := t.getBatch(o.batchID)
	err := b.Commit(t.writeOpts)
	h.Recordf("%s // %v", o, err)
}

func (o *batchCommitOp) String() string  { return fmt.Sprintf("%s.Commit()", o.batchID) }
func (o *batchCommitOp) receiver() objID { return o.batchID }
func (o *batchCommitOp) syncObjs() objIDSlice {
	// Synchronize on the database so that NewIters wait for the commit.
	return []objID{o.dbID}
}

// ingestOp models a DB.Ingest operation.
type ingestOp struct {
	dbID     objID
	batchIDs []objID

	derivedDBIDs []objID
}

func (o *ingestOp) run(t *test, h historyRecorder) {
	// We can only use apply as an alternative for ingestion if we are ingesting
	// a single batch. If we are ingesting multiple batches, the batches may
	// overlap which would cause ingestion to fail but apply would succeed.
	if t.testOpts.ingestUsingApply && len(o.batchIDs) == 1 && o.derivedDBIDs[0] == o.dbID {
		id := o.batchIDs[0]
		b := t.getBatch(id)
		iter, rangeDelIter, rangeKeyIter := private.BatchSort(b)
		db := t.getDB(o.dbID)
		c, err := o.collapseBatch(t, db, iter, rangeDelIter, rangeKeyIter, b)
		if err == nil {
			err = db.Apply(c, t.writeOpts)
		}
		_ = b.Close()
		_ = c.Close()
		t.clearObj(id)
		h.Recordf("%s // %v", o, err)
		return
	}

	var paths []string
	var err error
	for i, id := range o.batchIDs {
		b := t.getBatch(id)
		t.clearObj(id)
		path, err2 := o.build(t, h, b, i)
		if err2 != nil {
			h.Recordf("Build(%s) // %v", id, err2)
		}
		err = firstError(err, err2)
		if err2 == nil {
			paths = append(paths, path)
		}
		err = firstError(err, b.Close())
	}

	err = firstError(err, withRetries(func() error {
		return t.getDB(o.dbID).Ingest(paths)
	}))

	h.Recordf("%s // %v", o, err)
}

func buildForIngest(
	t *test, dbID objID, h historyRecorder, b *pebble.Batch, i int,
) (string, *sstable.WriterMetadata, error) {
	path := t.opts.FS.PathJoin(t.tmpDir, fmt.Sprintf("ext%d-%d", dbID.slot(), i))
	f, err := t.opts.FS.Create(path)
	if err != nil {
		return "", nil, err
	}
	db := t.getDB(dbID)

	iter, rangeDelIter, rangeKeyIter := private.BatchSort(b)
	defer closeIters(iter, rangeDelIter, rangeKeyIter)

	equal := t.opts.Comparer.Equal
	tableFormat := db.FormatMajorVersion().MaxTableFormat()
	w := sstable.NewWriter(
		objstorageprovider.NewFileWritable(f),
		t.opts.MakeWriterOptions(0, tableFormat),
	)

	var lastUserKey []byte
	for key, value := iter.First(); key != nil; key, value = iter.Next() {
		// Ignore duplicate keys.
		if equal(lastUserKey, key.UserKey) {
			continue
		}
		// NB: We don't have to copy the key or value since we're reading from a
		// batch which doesn't do prefix compression.
		lastUserKey = key.UserKey

		key.SetSeqNum(base.SeqNumZero)
		// It's possible that we wrote the key on a batch from a db that supported
		// DeleteSized, but are now ingesting into a db that does not. Detect
		// this case and translate the key to an InternalKeyKindDelete.
		if key.Kind() == pebble.InternalKeyKindDeleteSized && !t.isFMV(dbID, pebble.FormatDeleteSizedAndObsolete) {
			value = pebble.LazyValue{}
			key.SetKind(pebble.InternalKeyKindDelete)
		}
		if err := w.Add(*key, value.InPlaceValue()); err != nil {
			return "", nil, err
		}
	}
	if err := iter.Close(); err != nil {
		return "", nil, err
	}
	iter = nil

	if rangeDelIter != nil {
		// NB: The range tombstones have already been fragmented by the Batch.
		for t := rangeDelIter.First(); t != nil; t = rangeDelIter.Next() {
			// NB: We don't have to copy the key or value since we're reading from a
			// batch which doesn't do prefix compression.
			if err := w.DeleteRange(t.Start, t.End); err != nil {
				return "", nil, err
			}
		}
		if err := rangeDelIter.Close(); err != nil {
			return "", nil, err
		}
		rangeDelIter = nil
	}

	if rangeKeyIter != nil {
		for span := rangeKeyIter.First(); span != nil; span = rangeKeyIter.Next() {
			// Coalesce the keys of this span and then zero the sequence
			// numbers. This is necessary in order to make the range keys within
			// the ingested sstable internally consistent at the sequence number
			// it's ingested at. The individual keys within a batch are
			// committed at unique sequence numbers, whereas all the keys of an
			// ingested sstable are given the same sequence number. A span
			// containing keys that both set and unset the same suffix at the
			// same sequence number is nonsensical, so we "coalesce" or collapse
			// the keys.
			collapsed := keyspan.Span{
				Start: span.Start,
				End:   span.End,
				Keys:  make([]keyspan.Key, 0, len(span.Keys)),
			}
			err = rangekey.Coalesce(t.opts.Comparer.Compare, equal, span.Keys, &collapsed.Keys)
			if err != nil {
				return "", nil, err
			}
			for i := range collapsed.Keys {
				collapsed.Keys[i].Trailer = base.MakeTrailer(0, collapsed.Keys[i].Kind())
			}
			keyspan.SortKeysByTrailer(&collapsed.Keys)
			if err := rangekey.Encode(&collapsed, w.AddRangeKey); err != nil {
				return "", nil, err
			}
		}
		if err := rangeKeyIter.Error(); err != nil {
			return "", nil, err
		}
		if err := rangeKeyIter.Close(); err != nil {
			return "", nil, err
		}
		rangeKeyIter = nil
	}

	if err := w.Close(); err != nil {
		return "", nil, err
	}
	meta, err := w.Metadata()
	return path, meta, err
}

func (o *ingestOp) build(t *test, h historyRecorder, b *pebble.Batch, i int) (string, error) {
	path, _, err := buildForIngest(t, o.dbID, h, b, i)
	return path, err
}

func (o *ingestOp) receiver() objID { return o.dbID }
func (o *ingestOp) syncObjs() objIDSlice {
	// Ingest should not be concurrent with mutating the batches that will be
	// ingested as sstables.
	objs := make([]objID, 0, len(o.batchIDs)+1)
	objs = append(objs, o.batchIDs...)
	addedDBs := make(map[objID]struct{})
	for i := range o.derivedDBIDs {
		_, ok := addedDBs[o.derivedDBIDs[i]]
		if !ok && o.derivedDBIDs[i] != o.dbID {
			objs = append(objs, o.derivedDBIDs[i])
			addedDBs[o.derivedDBIDs[i]] = struct{}{}
		}
	}
	return objs
}

func closeIters(
	pointIter base.InternalIterator,
	rangeDelIter keyspan.FragmentIterator,
	rangeKeyIter keyspan.FragmentIterator,
) {
	if pointIter != nil {
		pointIter.Close()
	}
	if rangeDelIter != nil {
		rangeDelIter.Close()
	}
	if rangeKeyIter != nil {
		rangeKeyIter.Close()
	}
}

// collapseBatch collapses the mutations in a batch to be equivalent to an
// sstable ingesting those mutations. Duplicate updates to a key are collapsed
// so that only the latest update is performed. All range deletions are
// performed first in the batch to match the semantics of ingestion where a
// range deletion does not delete a point record contained in the sstable.
func (o *ingestOp) collapseBatch(
	t *test,
	db *pebble.DB,
	pointIter base.InternalIterator,
	rangeDelIter, rangeKeyIter keyspan.FragmentIterator,
	b *pebble.Batch,
) (*pebble.Batch, error) {
	defer closeIters(pointIter, rangeDelIter, rangeKeyIter)
	equal := t.opts.Comparer.Equal
	collapsed := db.NewBatch()

	if rangeDelIter != nil {
		// NB: The range tombstones have already been fragmented by the Batch.
		for t := rangeDelIter.First(); t != nil; t = rangeDelIter.Next() {
			// NB: We don't have to copy the key or value since we're reading from a
			// batch which doesn't do prefix compression.
			if err := collapsed.DeleteRange(t.Start, t.End, nil); err != nil {
				return nil, err
			}
		}
		if err := rangeDelIter.Close(); err != nil {
			return nil, err
		}
		rangeDelIter = nil
	}

	if pointIter != nil {
		var lastUserKey []byte
		for key, value := pointIter.First(); key != nil; key, value = pointIter.Next() {
			// Ignore duplicate keys.
			//
			// Note: this is necessary due to MERGE keys, otherwise it would be
			// fine to include all the keys in the batch and let the normal
			// sequence number precedence determine which of the keys "wins".
			// But the code to build the ingested sstable will only keep the
			// most recent internal key and will not merge across internal keys.
			if equal(lastUserKey, key.UserKey) {
				continue
			}
			// NB: We don't have to copy the key or value since we're reading from a
			// batch which doesn't do prefix compression.
			lastUserKey = key.UserKey

			var err error
			switch key.Kind() {
			case pebble.InternalKeyKindDelete:
				err = collapsed.Delete(key.UserKey, nil)
			case pebble.InternalKeyKindDeleteSized:
				v, _ := binary.Uvarint(value.InPlaceValue())
				// Batch.DeleteSized takes just the length of the value being
				// deleted and adds the key's length to derive the overall entry
				// size of the value being deleted. This has already been done
				// to the key we're reading from the batch, so we must subtract
				// the key length from the encoded value before calling
				// collapsed.DeleteSized, which will again add the key length
				// before encoding.
				err = collapsed.DeleteSized(key.UserKey, uint32(v-uint64(len(key.UserKey))), nil)
			case pebble.InternalKeyKindSingleDelete:
				err = collapsed.SingleDelete(key.UserKey, nil)
			case pebble.InternalKeyKindSet:
				err = collapsed.Set(key.UserKey, value.InPlaceValue(), nil)
			case pebble.InternalKeyKindMerge:
				err = collapsed.Merge(key.UserKey, value.InPlaceValue(), nil)
			case pebble.InternalKeyKindLogData:
				err = collapsed.LogData(key.UserKey, nil)
			default:
				err = errors.Errorf("unknown batch record kind: %d", key.Kind())
			}
			if err != nil {
				return nil, err
			}
		}
		if err := pointIter.Close(); err != nil {
			return nil, err
		}
		pointIter = nil
	}

	// There's no equivalent of a MERGE operator for range keys, so there's no
	// need to collapse the range keys here. Rather than reading the range keys
	// from `rangeKeyIter`, which will already be fragmented, read the range
	// keys from the batch and copy them verbatim. This marginally improves our
	// test coverage over the alternative approach of pre-fragmenting and
	// pre-coalescing before writing to the batch.
	//
	// The `rangeKeyIter` is used only to determine if there are any range keys
	// in the batch at all, and only because we already have it handy from
	// private.BatchSort.
	if rangeKeyIter != nil {
		for r := b.Reader(); ; {
			kind, key, value, ok, err := r.Next()
			if !ok {
				if err != nil {
					return nil, err
				}
				break
			} else if !rangekey.IsRangeKey(kind) {
				continue
			}
			ik := base.MakeInternalKey(key, 0, kind)
			if err := collapsed.AddInternalKey(&ik, value, nil); err != nil {
				return nil, err
			}
		}
		if err := rangeKeyIter.Close(); err != nil {
			return nil, err
		}
		rangeKeyIter = nil
	}

	return collapsed, nil
}
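
// Illustrative example of the collapseBatch transformation above (hypothetical
// keys, not part of the framework): given a batch containing
// {Set("a","1"), DeleteRange("a","z"), Set("a","2")}, the collapsed batch
// emits the range deletion first and keeps only the most recent point update
// for each user key, yielding the equivalent of
// {DeleteRange("a","z"), Set("a","2")}. Applying that collapsed batch leaves
// "a"="2" live, mirroring ingestion semantics where a range deletion inside an
// sstable does not delete point keys carried in the same sstable.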

func (o *ingestOp) String() string {
	var buf strings.Builder
	buf.WriteString(o.dbID.String())
	buf.WriteString(".Ingest(")
	for i, id := range o.batchIDs {
		if i > 0 {
			buf.WriteString(", ")
		}
		buf.WriteString(id.String())
	}
	buf.WriteString(")")
	return buf.String()
}

type ingestAndExciseOp struct {
	dbID                   objID
	batchID                objID
	derivedDBID            objID
	exciseStart, exciseEnd []byte
}

func (o *ingestAndExciseOp) run(t *test, h historyRecorder) {
	var err error
	b := t.getBatch(o.batchID)
	t.clearObj(o.batchID)
	if t.testOpts.Opts.Comparer.Compare(o.exciseEnd, o.exciseStart) <= 0 {
		panic("non-well-formed excise span")
	}
	if b.Empty() {
		// No-op.
		h.Recordf("%s // %v", o, err)
		return
	}
	path, writerMeta, err2 := o.build(t, h, b, 0 /* i */)
	if err2 != nil {
		h.Recordf("Build(%s) // %v", o.batchID, err2)
		return
	}
	err = firstError(err, err2)
	err = firstError(err, b.Close())

	if writerMeta.Properties.NumEntries == 0 && writerMeta.Properties.NumRangeKeys() == 0 {
		// No-op.
		h.Recordf("%s // %v", o, err)
		return
	}
	db := t.getDB(o.dbID)
	if !t.testOpts.useExcise {
		// Do a rangedel and rangekeydel before the ingestion. This mimics the
		// behaviour of an excise.
		err = firstError(err, db.DeleteRange(o.exciseStart, o.exciseEnd, t.writeOpts))
		err = firstError(err, db.RangeKeyDelete(o.exciseStart, o.exciseEnd, t.writeOpts))
	}

	if t.testOpts.useExcise {
		err = firstError(err, withRetries(func() error {
			_, err := t.getDB(o.dbID).IngestAndExcise([]string{path}, nil /* sharedSSTs */, pebble.KeyRange{
				Start: o.exciseStart,
				End:   o.exciseEnd,
			})
			return err
		}))
	} else {
		err = firstError(err, withRetries(func() error {
			return t.getDB(o.dbID).Ingest([]string{path})
		}))
	}

	h.Recordf("%s // %v", o, err)
}

func (o *ingestAndExciseOp) build(
	t *test, h historyRecorder, b *pebble.Batch, i int,
) (string, *sstable.WriterMetadata, error) {
	return buildForIngest(t, o.dbID, h, b, i)
}

func (o *ingestAndExciseOp) receiver() objID { return o.dbID }
func (o *ingestAndExciseOp) syncObjs() objIDSlice {
	// Ingest should not be concurrent with mutating the batches that will be
	// ingested as sstables.
	objs := []objID{o.batchID}
	if o.derivedDBID != o.dbID {
		objs = append(objs, o.derivedDBID)
	}
	return objs
}

func (o *ingestAndExciseOp) String() string {
	return fmt.Sprintf("%s.IngestAndExcise(%s, %q, %q)", o.dbID, o.batchID, o.exciseStart, o.exciseEnd)
}

// getOp models a Reader.Get operation.
type getOp struct {
	readerID    objID
	key         []byte
	derivedDBID objID
}

func (o *getOp) run(t *test, h historyRecorder) {
	r := t.getReader(o.readerID)
	var val []byte
	var closer io.Closer
	err := withRetries(func() (err error) {
		val, closer, err = r.Get(o.key)
		return err
	})
	h.Recordf("%s // [%q] %v", o, val, err)
	if closer != nil {
		closer.Close()
	}
}

func (o *getOp) String() string  { return fmt.Sprintf("%s.Get(%q)", o.readerID, o.key) }
func (o *getOp) receiver() objID { return o.readerID }
func (o *getOp) syncObjs() objIDSlice {
	if o.readerID.tag() == dbTag {
		return nil
	}
	// batch.Get reads through to the current database state.
	if o.derivedDBID != 0 {
		return []objID{o.derivedDBID}
	}
	return nil
}

// newIterOp models a Reader.NewIter operation.
type newIterOp struct {
	readerID objID
	iterID   objID
	iterOpts
	derivedDBID objID
}

func (o *newIterOp) run(t *test, h historyRecorder) {
	r := t.getReader(o.readerID)
	opts := iterOptions(o.iterOpts)

	var i *pebble.Iterator
	for {
		i, _ = r.NewIter(opts)
		if err := i.Error(); !errors.Is(err, errorfs.ErrInjected) {
			break
		}
		// Close this iter and retry NewIter.
		_ = i.Close()
	}
	t.setIter(o.iterID, i)

	// Trash the bounds to ensure that Pebble doesn't rely on the stability of
	// the user-provided bounds.
	if opts != nil {
		rand.Read(opts.LowerBound[:])
		rand.Read(opts.UpperBound[:])
	}
	h.Recordf("%s // %v", o, i.Error())
}

func (o *newIterOp) String() string {
	return fmt.Sprintf("%s = %s.NewIter(%q, %q, %d /* key types */, %d, %d, %t /* use L6 filters */, %q /* masking suffix */)",
		o.iterID, o.readerID, o.lower, o.upper, o.keyTypes, o.filterMin, o.filterMax, o.useL6Filters, o.maskSuffix)
}

func (o *newIterOp) receiver() objID { return o.readerID }
func (o *newIterOp) syncObjs() objIDSlice {
	// Prevent o.iterID ops from running before it exists.
	objs := []objID{o.iterID}
	// If reading through a batch or snapshot, the new iterator will also
	// observe database state, and we must synchronize on the database state
	// for a consistent view.
	if o.readerID.tag() == batchTag || o.readerID.tag() == snapTag {
		objs = append(objs, o.derivedDBID)
	}
	return objs
}

// newIterUsingCloneOp models an Iterator.Clone operation.
type newIterUsingCloneOp struct {
	existingIterID objID
	iterID         objID
	refreshBatch   bool
	iterOpts

	// derivedReaderID is the ID of the underlying reader that backs both the
	// existing iterator and the new iterator. The derivedReaderID is NOT
	// serialized by String and is derived from other operations during parse.
	derivedReaderID objID
}

func (o *newIterUsingCloneOp) run(t *test, h historyRecorder) {
	iter := t.getIter(o.existingIterID)
	cloneOpts := pebble.CloneOptions{
		IterOptions:      iterOptions(o.iterOpts),
		RefreshBatchView: o.refreshBatch,
	}
	i, err := iter.iter.Clone(cloneOpts)
	if err != nil {
		panic(err)
	}
	t.setIter(o.iterID, i)
	h.Recordf("%s // %v", o, i.Error())
}

func (o *newIterUsingCloneOp) String() string {
	return fmt.Sprintf("%s = %s.Clone(%t, %q, %q, %d /* key types */, %d, %d, %t /* use L6 filters */, %q /* masking suffix */)",
		o.iterID, o.existingIterID, o.refreshBatch, o.lower, o.upper,
		o.keyTypes, o.filterMin, o.filterMax, o.useL6Filters, o.maskSuffix)
}

func (o *newIterUsingCloneOp) receiver() objID { return o.existingIterID }

func (o *newIterUsingCloneOp) syncObjs() objIDSlice {
	objIDs := []objID{o.iterID}
	// If the underlying reader is a batch, we must synchronize with the batch.
	// If refreshBatch=true, synchronizing is necessary to observe all the
	// mutations up until this op and no more. Even when refreshBatch=false,
	// we must synchronize because iterator construction may access state cached
	// on the indexed batch to avoid refragmenting range tombstones or range
	// keys.
	if o.derivedReaderID.tag() == batchTag {
		objIDs = append(objIDs, o.derivedReaderID)
	}
	return objIDs
}

// iterSetBoundsOp models an Iterator.SetBounds operation.
type iterSetBoundsOp struct {
	iterID objID
	lower  []byte
	upper  []byte
}

func (o *iterSetBoundsOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	var lower, upper []byte
	if o.lower != nil {
		lower = append(lower, o.lower...)
	}
	if o.upper != nil {
		upper = append(upper, o.upper...)
	}
	i.SetBounds(lower, upper)

	// Trash the bounds to ensure that Pebble doesn't rely on the stability of
	// the user-provided bounds.
	rand.Read(lower[:])
	rand.Read(upper[:])

	h.Recordf("%s // %v", o, i.Error())
}

func (o *iterSetBoundsOp) String() string {
	return fmt.Sprintf("%s.SetBounds(%q, %q)", o.iterID, o.lower, o.upper)
}

func (o *iterSetBoundsOp) receiver() objID      { return o.iterID }
func (o *iterSetBoundsOp) syncObjs() objIDSlice { return nil }

// iterSetOptionsOp models an Iterator.SetOptions operation.
type iterSetOptionsOp struct {
	iterID objID
	iterOpts

	// derivedReaderID is the ID of the underlying reader that backs the
	// iterator. The derivedReaderID is NOT serialized by String and is derived
	// from other operations during parse.
	derivedReaderID objID
}

func (o *iterSetOptionsOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)

	opts := iterOptions(o.iterOpts)
	if opts == nil {
		opts = &pebble.IterOptions{}
	}
	i.SetOptions(opts)

	// Trash the bounds to ensure that Pebble doesn't rely on the stability of
	// the user-provided bounds.
	rand.Read(opts.LowerBound[:])
	rand.Read(opts.UpperBound[:])

	h.Recordf("%s // %v", o, i.Error())
}

func (o *iterSetOptionsOp) String() string {
	return fmt.Sprintf("%s.SetOptions(%q, %q, %d /* key types */, %d, %d, %t /* use L6 filters */, %q /* masking suffix */)",
		o.iterID, o.lower, o.upper, o.keyTypes, o.filterMin, o.filterMax, o.useL6Filters, o.maskSuffix)
}

func iterOptions(o iterOpts) *pebble.IterOptions {
	if o.IsZero() {
		return nil
	}
	var lower, upper []byte
	if o.lower != nil {
		lower = append(lower, o.lower...)
	}
	if o.upper != nil {
		upper = append(upper, o.upper...)
	}
	opts := &pebble.IterOptions{
		LowerBound: lower,
		UpperBound: upper,
		KeyTypes:   pebble.IterKeyType(o.keyTypes),
		RangeKeyMasking: pebble.RangeKeyMasking{
			Suffix: o.maskSuffix,
		},
		UseL6Filters: o.useL6Filters,
	}
	if opts.RangeKeyMasking.Suffix != nil {
		opts.RangeKeyMasking.Filter = func() pebble.BlockPropertyFilterMask {
			return sstable.NewTestKeysMaskingFilter()
		}
	}
	if o.filterMax > 0 {
		opts.PointKeyFilters = []pebble.BlockPropertyFilter{
			sstable.NewTestKeysBlockPropertyFilter(o.filterMin, o.filterMax),
		}
		// Enforce the timestamp bounds in SkipPoint, so that the iterator never
		// returns a key outside the filterMin, filterMax bounds. This provides
		// deterministic iteration.
		opts.SkipPoint = func(k []byte) (skip bool) {
			n := testkeys.Comparer.Split(k)
			if n == len(k) {
				// No suffix, don't skip it.
				return false
			}
			v, err := testkeys.ParseSuffix(k[n:])
			if err != nil {
				panic(err)
			}
			ts := uint64(v)
			return ts < o.filterMin || ts >= o.filterMax
		}
	}
	return opts
}
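
// Illustrative note for the SkipPoint filter above (hypothetical values; the
// key@timestamp layout is the testkeys convention assumed here): a block
// property filter admits whole blocks, so blocks mixing in-range and
// out-of-range suffixes can still surface out-of-range keys. With filterMin=5
// and filterMax=10, SkipPoint additionally drops any point key whose suffix
// timestamp falls outside [5, 10): "a@7" would be returned, while "a@12" and
// "a@3" would be skipped, keeping iteration deterministic across runs.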

func (o *iterSetOptionsOp) receiver() objID { return o.iterID }

func (o *iterSetOptionsOp) syncObjs() objIDSlice {
	if o.derivedReaderID.tag() == batchTag {
		// If the underlying reader is a batch, we must synchronize with the
		// batch so that we observe all the mutations up until this operation
		// and no more.
		return []objID{o.derivedReaderID}
	}
	return nil
}

// iterSeekGEOp models an Iterator.SeekGE[WithLimit] operation.
type iterSeekGEOp struct {
	iterID objID
	key    []byte
	limit  []byte

	derivedReaderID objID
}

func iteratorPos(i *retryableIter) string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "%q", i.Key())
	hasPoint, hasRange := i.HasPointAndRange()
	if hasPoint {
		fmt.Fprintf(&buf, ",%q", i.Value())
	} else {
		fmt.Fprint(&buf, ",<no point>")
	}
	if hasRange {
		start, end := i.RangeBounds()
		fmt.Fprintf(&buf, ",[%q,%q)=>{", start, end)
		for i, rk := range i.RangeKeys() {
			if i > 0 {
				fmt.Fprint(&buf, ",")
			}
			fmt.Fprintf(&buf, "%q=%q", rk.Suffix, rk.Value)
		}
		fmt.Fprint(&buf, "}")
	} else {
		fmt.Fprint(&buf, ",<no range>")
	}
	if i.RangeKeyChanged() {
		fmt.Fprint(&buf, "*")
	}
	return buf.String()
}

func validBoolToStr(valid bool) string {
	return fmt.Sprintf("%t", valid)
}

func validityStateToStr(validity pebble.IterValidityState) (bool, string) {
	// We can't distinguish between IterExhausted and IterAtLimit in a
	// deterministic manner.
	switch validity {
	case pebble.IterExhausted, pebble.IterAtLimit:
		return false, "invalid"
	case pebble.IterValid:
		return true, "valid"
	default:
		panic("unknown validity")
	}
}

func (o *iterSeekGEOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	var valid bool
	var validStr string
	if o.limit == nil {
		valid = i.SeekGE(o.key)
		validStr = validBoolToStr(valid)
	} else {
		valid, validStr = validityStateToStr(i.SeekGEWithLimit(o.key, o.limit))
	}
	if valid {
		h.Recordf("%s // [%s,%s] %v", o, validStr, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%s] %v", o, validStr, i.Error())
	}
}

func (o *iterSeekGEOp) String() string {
	return fmt.Sprintf("%s.SeekGE(%q, %q)", o.iterID, o.key, o.limit)
}
func (o *iterSeekGEOp) receiver() objID      { return o.iterID }
func (o *iterSeekGEOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

func onlyBatchIDs(ids ...objID) objIDSlice {
	var ret objIDSlice
	for _, id := range ids {
		if id.tag() == batchTag {
			ret = append(ret, id)
		}
	}
	return ret
}

// iterSeekPrefixGEOp models an Iterator.SeekPrefixGE operation.
type iterSeekPrefixGEOp struct {
	iterID objID
	key    []byte

	derivedReaderID objID
}

func (o *iterSeekPrefixGEOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	valid := i.SeekPrefixGE(o.key)
	if valid {
		h.Recordf("%s // [%t,%s] %v", o, valid, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%t] %v", o, valid, i.Error())
	}
}

func (o *iterSeekPrefixGEOp) String() string {
	return fmt.Sprintf("%s.SeekPrefixGE(%q)", o.iterID, o.key)
}
func (o *iterSeekPrefixGEOp) receiver() objID      { return o.iterID }
func (o *iterSeekPrefixGEOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterSeekLTOp models an Iterator.SeekLT[WithLimit] operation.
type iterSeekLTOp struct {
	iterID objID
	key    []byte
	limit  []byte

	derivedReaderID objID
}

func (o *iterSeekLTOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	var valid bool
	var validStr string
	if o.limit == nil {
		valid = i.SeekLT(o.key)
		validStr = validBoolToStr(valid)
	} else {
		valid, validStr = validityStateToStr(i.SeekLTWithLimit(o.key, o.limit))
	}
	if valid {
		h.Recordf("%s // [%s,%s] %v", o, validStr, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%s] %v", o, validStr, i.Error())
	}
}

func (o *iterSeekLTOp) String() string {
	return fmt.Sprintf("%s.SeekLT(%q, %q)", o.iterID, o.key, o.limit)
}

func (o *iterSeekLTOp) receiver() objID      { return o.iterID }
func (o *iterSeekLTOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterFirstOp models an Iterator.First operation.
type iterFirstOp struct {
	iterID objID

	derivedReaderID objID
}

func (o *iterFirstOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	valid := i.First()
	if valid {
		h.Recordf("%s // [%t,%s] %v", o, valid, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%t] %v", o, valid, i.Error())
	}
}

func (o *iterFirstOp) String() string       { return fmt.Sprintf("%s.First()", o.iterID) }
func (o *iterFirstOp) receiver() objID      { return o.iterID }
func (o *iterFirstOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterLastOp models an Iterator.Last operation.
type iterLastOp struct {
	iterID objID

	derivedReaderID objID
}

func (o *iterLastOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	valid := i.Last()
	if valid {
		h.Recordf("%s // [%t,%s] %v", o, valid, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%t] %v", o, valid, i.Error())
	}
}

func (o *iterLastOp) String() string       { return fmt.Sprintf("%s.Last()", o.iterID) }
func (o *iterLastOp) receiver() objID      { return o.iterID }
func (o *iterLastOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterNextOp models an Iterator.Next[WithLimit] operation.
type iterNextOp struct {
	iterID objID
	limit  []byte

	derivedReaderID objID
}

func (o *iterNextOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	var valid bool
	var validStr string
	if o.limit == nil {
		valid = i.Next()
		validStr = validBoolToStr(valid)
	} else {
		valid, validStr = validityStateToStr(i.NextWithLimit(o.limit))
	}
	if valid {
		h.Recordf("%s // [%s,%s] %v", o, validStr, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%s] %v", o, validStr, i.Error())
	}
}

func (o *iterNextOp) String() string       { return fmt.Sprintf("%s.Next(%q)", o.iterID, o.limit) }
func (o *iterNextOp) receiver() objID      { return o.iterID }
func (o *iterNextOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterNextPrefixOp models an Iterator.NextPrefix operation.
type iterNextPrefixOp struct {
	iterID objID

	derivedReaderID objID
}

func (o *iterNextPrefixOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	valid := i.NextPrefix()
	validStr := validBoolToStr(valid)
	if valid {
		h.Recordf("%s // [%s,%s] %v", o, validStr, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%s] %v", o, validStr, i.Error())
	}
}

func (o *iterNextPrefixOp) String() string       { return fmt.Sprintf("%s.NextPrefix()", o.iterID) }
func (o *iterNextPrefixOp) receiver() objID      { return o.iterID }
func (o *iterNextPrefixOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterCanSingleDelOp models a call to CanDeterministicallySingleDelete with an
// Iterator.
type iterCanSingleDelOp struct {
	iterID objID

	derivedReaderID objID
}

func (o *iterCanSingleDelOp) run(t *test, h historyRecorder) {
	// TODO(jackson): When we perform error injection, we'll need to rethink
	// this.
	_, err := pebble.CanDeterministicallySingleDelete(t.getIter(o.iterID).iter)
	// The return value of CanDeterministicallySingleDelete is dependent on
	// internal LSM state and non-deterministic, so we don't record it.
	// Including the operation within the metamorphic test at all helps ensure
	// that it does not change the result of any other Iterator operation that
	// should be deterministic, regardless of its own outcome.
	//
	// We still record the value of the error because it's deterministic, at
	// least for now. The possible error cases are:
	// - The iterator was already in an error state when the operation ran.
	// - The operation is deterministically invalid (like using an InternalNext
	//   to change directions.)
	h.Recordf("%s // %v", o, err)
}

func (o *iterCanSingleDelOp) String() string       { return fmt.Sprintf("%s.InternalNext()", o.iterID) }
func (o *iterCanSingleDelOp) receiver() objID      { return o.iterID }
func (o *iterCanSingleDelOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// iterPrevOp models an Iterator.Prev[WithLimit] operation.
type iterPrevOp struct {
	iterID objID
	limit  []byte

	derivedReaderID objID
}

func (o *iterPrevOp) run(t *test, h historyRecorder) {
	i := t.getIter(o.iterID)
	var valid bool
	var validStr string
	if o.limit == nil {
		valid = i.Prev()
		validStr = validBoolToStr(valid)
	} else {
		valid, validStr = validityStateToStr(i.PrevWithLimit(o.limit))
	}
	if valid {
		h.Recordf("%s // [%s,%s] %v", o, validStr, iteratorPos(i), i.Error())
	} else {
		h.Recordf("%s // [%s] %v", o, validStr, i.Error())
	}
}

func (o *iterPrevOp) String() string       { return fmt.Sprintf("%s.Prev(%q)", o.iterID, o.limit) }
func (o *iterPrevOp) receiver() objID      { return o.iterID }
func (o *iterPrevOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }

// newSnapshotOp models a DB.NewSnapshot operation.
type newSnapshotOp struct {
	dbID   objID
	snapID objID
	// If nonempty, this snapshot must not be used to read any keys outside of
	// the provided bounds. This allows some implementations to use 'Eventually
	// file-only snapshots,' which require bounds.
	bounds []pebble.KeyRange
}

func (o *newSnapshotOp) run(t *test, h historyRecorder) {
	bounds := o.bounds
	if len(bounds) == 0 {
		panic("bounds unexpectedly unset for newSnapshotOp")
	}
	// Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
	createEfos := ((11400714819323198485 * uint64(t.idx) * t.testOpts.seedEFOS) >> 63) == 1
	// If either of these options is true, an EFOS _must_ be created, regardless
	// of what the fibonacci hash returned.
	excisePossible := t.testOpts.useSharedReplicate || t.testOpts.useExcise
	if createEfos || excisePossible {
		s := t.getDB(o.dbID).NewEventuallyFileOnlySnapshot(bounds)
		t.setSnapshot(o.snapID, s)
		// If the EFOS isn't guaranteed to always create iterators, we must force
		// a flush on this DB so it transitions this EFOS into a file-only snapshot.
		if excisePossible && !t.testOpts.efosAlwaysCreatesIters {
			err := t.getDB(o.dbID).Flush()
			if err != nil {
				h.Recordf("%s // %v", o, err)
				panic(errors.Wrap(err, "newSnapshotOp"))
			}
		}
	} else {
		s := t.getDB(o.dbID).NewSnapshot()
		t.setSnapshot(o.snapID, s)
	}
	h.Recordf("%s", o)
}

func (o *newSnapshotOp) String() string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "%s = %s.NewSnapshot(", o.snapID, o.dbID)
	for i := range o.bounds {
		if i > 0 {
			fmt.Fprint(&buf, ", ")
		}
		fmt.Fprintf(&buf, "%q, %q", o.bounds[i].Start, o.bounds[i].End)
	}
	fmt.Fprint(&buf, ")")
	return buf.String()
}
func (o *newSnapshotOp) receiver() objID      { return o.dbID }
func (o *newSnapshotOp) syncObjs() objIDSlice { return []objID{o.snapID} }

type dbRatchetFormatMajorVersionOp struct {
	dbID objID
	vers pebble.FormatMajorVersion
}

func (o *dbRatchetFormatMajorVersionOp) run(t *test, h historyRecorder) {
	var err error
	// NB: We no-op the operation if we're already at or above the provided
	// format major version. Different runs start at different format major
	// versions, making the presence of an error and the error message itself
	// non-deterministic if we attempt to upgrade to an older version.
	//
	// Regardless, subsequent operations should behave identically, which is what
	// we're really aiming to test by including this format major version ratchet
	// operation.
	if t.getDB(o.dbID).FormatMajorVersion() < o.vers {
		err = t.getDB(o.dbID).RatchetFormatMajorVersion(o.vers)
	}
	h.Recordf("%s // %v", o, err)
}

func (o *dbRatchetFormatMajorVersionOp) String() string {
	return fmt.Sprintf("%s.RatchetFormatMajorVersion(%s)", o.dbID, o.vers)
}
func (o *dbRatchetFormatMajorVersionOp) receiver() objID      { return o.dbID }
func (o *dbRatchetFormatMajorVersionOp) syncObjs() objIDSlice { return nil }

type dbRestartOp struct {
	dbID objID
}

func (o *dbRestartOp) run(t *test, h historyRecorder) {
	if err := t.restartDB(o.dbID); err != nil {
		h.Recordf("%s // %v", o, err)
		h.history.err.Store(errors.Wrap(err, "dbRestartOp"))
	} else {
		h.Recordf("%s", o)
	}
}

func (o *dbRestartOp) String() string       { return fmt.Sprintf("%s.Restart()", o.dbID) }
func (o *dbRestartOp) receiver() objID      { return o.dbID }
func (o *dbRestartOp) syncObjs() objIDSlice { return nil }

func formatOps(ops []op) string {
	var buf strings.Builder
	for _, op := range ops {
		fmt.Fprintf(&buf, "%s\n", op)
	}
	return buf.String()
}

// replicateOp models an operation that could copy keys from one db to
// another through either an IngestAndExcise, or an Ingest.
type replicateOp struct {
	source, dest objID
	start, end   []byte
}

func (r *replicateOp) runSharedReplicate(
	t *test, h historyRecorder, source, dest *pebble.DB, w *sstable.Writer, sstPath string,
) {
	var sharedSSTs []pebble.SharedSSTMeta
	var err error
	err = source.ScanInternal(context.TODO(), sstable.CategoryAndQoS{}, r.start, r.end,
		func(key *pebble.InternalKey, value pebble.LazyValue, _ pebble.IteratorLevel) error {
			val, _, err := value.Value(nil)
			if err != nil {
				panic(err)
			}
			return w.Add(base.MakeInternalKey(key.UserKey, 0, key.Kind()), val)
		},
		func(start, end []byte, seqNum uint64) error {
			return w.DeleteRange(start, end)
		},
		func(start, end []byte, keys []keyspan.Key) error {
			s := keyspan.Span{
				Start: start,
				End:   end,
				Keys:  keys,
			}
			return rangekey.Encode(&s, w.AddRangeKey)
		},
		func(sst *pebble.SharedSSTMeta) error {
			sharedSSTs = append(sharedSSTs, *sst)
			return nil
		},
	)
	if err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}

	err = w.Close()
	if err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}
	meta, err := w.Metadata()
	if err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}
	if len(sharedSSTs) == 0 && meta.Properties.NumEntries == 0 && meta.Properties.NumRangeKeys() == 0 {
		// IngestAndExcise below will be a no-op. We should do a
		// DeleteRange+RangeKeyDel to mimic the behaviour of the non-shared-replicate
		// case.
		//
		// TODO(bilal): Remove this when we support excises with no matching ingests.
		if err := dest.RangeKeyDelete(r.start, r.end, t.writeOpts); err != nil {
			h.Recordf("%s // %v", r, err)
			return
		}
		err := dest.DeleteRange(r.start, r.end, t.writeOpts)
		h.Recordf("%s // %v", r, err)
		return
	}

	_, err = dest.IngestAndExcise([]string{sstPath}, sharedSSTs, pebble.KeyRange{Start: r.start, End: r.end})
	h.Recordf("%s // %v", r, err)
}

func (r *replicateOp) run(t *test, h historyRecorder) {
	// Shared replication only works if shared storage is enabled.
	useSharedIngest := t.testOpts.useSharedReplicate
	if !t.testOpts.sharedStorageEnabled {
		useSharedIngest = false
	}

	source := t.getDB(r.source)
	dest := t.getDB(r.dest)
	sstPath := path.Join(t.tmpDir, fmt.Sprintf("ext-replicate%d.sst", t.idx))
	f, err := t.opts.FS.Create(sstPath)
	if err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}
	w := sstable.NewWriter(objstorageprovider.NewFileWritable(f), t.opts.MakeWriterOptions(0, dest.FormatMajorVersion().MaxTableFormat()))

	if useSharedIngest {
		r.runSharedReplicate(t, h, source, dest, w, sstPath)
		return
	}

	// First, do a RangeKeyDelete and DeleteRange on the whole span.
	if err := dest.RangeKeyDelete(r.start, r.end, t.writeOpts); err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}
	if err := dest.DeleteRange(r.start, r.end, t.writeOpts); err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}
	iter, err := source.NewIter(&pebble.IterOptions{
		LowerBound: r.start,
		UpperBound: r.end,
		KeyTypes:   pebble.IterKeyTypePointsAndRanges,
	})
	if err != nil {
		panic(err)
	}
	defer iter.Close()

	for ok := iter.SeekGE(r.start); ok && iter.Error() == nil; ok = iter.Next() {
		hasPoint, hasRange := iter.HasPointAndRange()
		if hasPoint {
			val, err := iter.ValueAndErr()
			if err != nil {
				panic(err)
			}
			if err := w.Set(iter.Key(), val); err != nil {
				panic(err)
			}
		}
		if hasRange && iter.RangeKeyChanged() {
			rangeKeys := iter.RangeKeys()
			rkStart, rkEnd := iter.RangeBounds()

			span := &keyspan.Span{Start: rkStart, End: rkEnd, Keys: make([]keyspan.Key, len(rangeKeys))}
			for i := range rangeKeys {
				span.Keys[i] = keyspan.Key{
					Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet),
					Suffix:  rangeKeys[i].Suffix,
					Value:   rangeKeys[i].Value,
				}
			}
			keyspan.SortKeysByTrailer(&span.Keys)
			if err := rangekey.Encode(span, w.AddRangeKey); err != nil {
				panic(err)
			}
		}
	}
	if err := iter.Error(); err != nil {
		h.Recordf("%s // %v", r, err)
		return
	}
	if err := w.Close(); err != nil {
		panic(err)
	}

	err = dest.Ingest([]string{sstPath})
	h.Recordf("%s // %v", r, err)
}

func (r *replicateOp) String() string {
	return fmt.Sprintf("%s.Replicate(%s, %q, %q)", r.source, r.dest, r.start, r.end)
}

func (r *replicateOp) receiver() objID      { return r.source }
func (r *replicateOp) syncObjs() objIDSlice { return objIDSlice{r.dest} }
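
// Illustrative sketch (hypothetical, not part of the framework): a new
// operation only needs to satisfy the op interface defined at the top of this
// file. A minimal no-op against a DB might look roughly like the following;
// the generator and parser would also need to learn about such an op before it
// could appear in a real test run.
//
//	type noopOp struct {
//		dbID objID
//	}
//
//	func (o *noopOp) run(t *test, h historyRecorder) { h.Recordf("%s", o) }
//	func (o *noopOp) String() string                 { return fmt.Sprintf("%s.Noop()", o.dbID) }
//	func (o *noopOp) receiver() objID                { return o.dbID }
//	func (o *noopOp) syncObjs() objIDSlice           { return nil }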