github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvnemesis/generator.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvnemesis

import (
	"math/rand"
	"strconv"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

// GeneratorConfig contains all the tunable knobs necessary to run a Generator.
type GeneratorConfig struct {
	Ops                   OperationConfig
	NumNodes, NumReplicas int
}

// OperationConfig configures the relative probabilities of producing various
// operations.
//
// In this struct and all sub-configurations, wording such as "likely exists"
// or "definitely doesn't exist" is according to previously generated steps.
// "likely" is a result of non-determinism due to concurrent execution of the
// generated operations.
type OperationConfig struct {
	DB             ClientOperationConfig
	Batch          BatchOperationConfig
	ClosureTxn     ClosureTxnConfig
	Split          SplitConfig
	Merge          MergeConfig
	ChangeReplicas ChangeReplicasConfig
}

// ClosureTxnConfig configures the relative probability of running some
// operations in a transaction by using the closure-based kv.DB.Txn method.
// This family of operations mainly varies in how it commits (or doesn't
// commit). The composition of the operations in the txn is controlled by
// TxnClientOps and TxnBatchOps.
type ClosureTxnConfig struct {
	TxnClientOps ClientOperationConfig
	TxnBatchOps  BatchOperationConfig

	// Commit is a transaction that commits normally.
	Commit int
	// Rollback is a transaction that encounters an error at the end and has to
	// roll back.
	Rollback int
	// CommitInBatch is a transaction that commits via the CommitInBatch
	// method. This is an important part of the 1PC txn fast path.
	CommitInBatch int
	// When CommitInBatch is selected, CommitBatchOps controls the composition
	// of the kv.Batch used.
	CommitBatchOps ClientOperationConfig
}

// ClientOperationConfig configures the relative probabilities of the
// bread-and-butter kv operations such as Get/Put/Delete/etc. These can all be
// run on a DB, a Txn, or a Batch.
type ClientOperationConfig struct {
	// GetMissing is an operation that Gets a key that definitely doesn't exist.
	GetMissing int
	// GetExisting is an operation that Gets a key that likely exists.
	GetExisting int
	// PutMissing is an operation that Puts a key that definitely doesn't exist.
	PutMissing int
	// PutExisting is an operation that Puts a key that likely exists.
	PutExisting int
}
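// Editorial sketch, not part of the original file: the integer fields in the
// configs above are relative weights, not percentages. In the hypothetical
// mix below, whenever both Gets are contextually valid, GetExisting is drawn
// three times as often as GetMissing, because selectOp (later in this file)
// picks proportionally to weight. `readHeavyOps` is an illustrative name that
// does not exist upstream.
var readHeavyOps = ClientOperationConfig{
	GetMissing:  1,
	GetExisting: 3,
	PutMissing:  1,
	PutExisting: 1,
}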
// BatchOperationConfig configures the relative probability of generating a
// kv.Batch of some number of operations as well as the composition of the
// operations in the batch itself. These can be run in various ways including
// kv.DB.Run or kv.Txn.Run.
type BatchOperationConfig struct {
	Batch int
	Ops   ClientOperationConfig
}

// SplitConfig configures the relative probability of generating a Split
// operation.
type SplitConfig struct {
	// SplitNew is an operation that Splits at a key that has never previously
	// been a split point.
	SplitNew int
	// SplitAgain is an operation that Splits at a key that likely has
	// previously been a split point, though it may or may not have been merged
	// since.
	SplitAgain int
}

// MergeConfig configures the relative probability of generating a Merge
// operation.
type MergeConfig struct {
	// MergeNotSplit is an operation that Merges at a key that has never been
	// split at (meaning this should be a no-op).
	MergeNotSplit int
	// MergeIsSplit is an operation that Merges at a key that is likely to
	// currently be split.
	MergeIsSplit int
}

// ChangeReplicasConfig configures the relative probability of generating a
// ChangeReplicas operation.
type ChangeReplicasConfig struct {
	// AddReplica adds a single replica.
	AddReplica int
	// RemoveReplica removes a single replica.
	RemoveReplica int
	// AtomicSwapReplica adds 1 replica and removes 1 replica in a single
	// ChangeReplicas call.
	AtomicSwapReplica int
}

// newAllOperationsConfig returns a GeneratorConfig that exercises *all*
// options. You probably want NewDefaultConfig. Most of the time, these will be
// the same, but having both allows us to merge code for operations that do not
// yet pass (for example, if the new operation finds a kv bug or edge case).
func newAllOperationsConfig() GeneratorConfig {
	clientOpConfig := ClientOperationConfig{
		GetMissing:  1,
		GetExisting: 1,
		PutMissing:  1,
		PutExisting: 1,
	}
	batchOpConfig := BatchOperationConfig{
		Batch: 4,
		Ops:   clientOpConfig,
	}
	return GeneratorConfig{Ops: OperationConfig{
		DB:    clientOpConfig,
		Batch: batchOpConfig,
		ClosureTxn: ClosureTxnConfig{
			Commit:         5,
			Rollback:       5,
			CommitInBatch:  5,
			TxnClientOps:   clientOpConfig,
			TxnBatchOps:    batchOpConfig,
			CommitBatchOps: clientOpConfig,
		},
		Split: SplitConfig{
			SplitNew:   1,
			SplitAgain: 1,
		},
		Merge: MergeConfig{
			MergeNotSplit: 1,
			MergeIsSplit:  1,
		},
		ChangeReplicas: ChangeReplicasConfig{
			AddReplica:        1,
			RemoveReplica:     1,
			AtomicSwapReplica: 1,
		},
	}}
}
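// Editorial sketch, not part of the original file: a nemesis variant that
// wants to stress split/merge interplay could derive its config the same way
// NewDefaultConfig (below) does, by starting from newAllOperationsConfig and
// adjusting weights. `newSplitMergeHeavyConfig` is a hypothetical name used
// only for illustration.
func newSplitMergeHeavyConfig() GeneratorConfig {
	config := newAllOperationsConfig()
	// Make splits and merges an order of magnitude more likely than the
	// bread-and-butter client operations.
	config.Ops.Split.SplitNew = 10
	config.Ops.Split.SplitAgain = 10
	config.Ops.Merge.MergeIsSplit = 10
	return config
}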
// NewDefaultConfig returns a GeneratorConfig that is a reasonable default
// starting point for general KV usage. Nemesis test variants that want to
// stress particular areas may want to start with this and eliminate some
// operations/make some operations more likely.
func NewDefaultConfig() GeneratorConfig {
	config := newAllOperationsConfig()
	// TODO(dan): This fails with a WriteTooOld error if the same key is Put
	// twice in a single batch. However, if the same Batch is committed using
	// txn.Run, then it works and only the last one is materialized. We could
	// make the db.Run behavior match txn.Run by ensuring that all requests in
	// a nontransactional batch are disjoint and upgrading to a transactional
	// batch (see CrossRangeTxnWrapperSender) if they are. roachpb.SpanGroup
	// can be used to efficiently check this.
	//
	// TODO(dan): Make this `config.Ops.Batch.Ops.PutExisting = 0` once #46081
	// is fixed.
	config.Ops.Batch = BatchOperationConfig{}
	// TODO(dan): Remove when #45586 is addressed.
	config.Ops.ClosureTxn.CommitBatchOps.GetExisting = 0
	config.Ops.ClosureTxn.CommitBatchOps.GetMissing = 0
	return config
}

// GeneratorDataSpan returns a span that contains all of the operations created
// by this Generator.
func GeneratorDataSpan() roachpb.Span {
	return roachpb.Span{
		Key:    keys.SystemSQLCodec.TablePrefix(50),
		EndKey: keys.SystemSQLCodec.TablePrefix(51),
	}
}

// GetReplicasFn is a function that returns the current replicas for the range
// containing a key.
type GetReplicasFn func(roachpb.Key) []roachpb.ReplicationTarget

// Generator incrementally constructs KV traffic designed to maximally test
// edge cases.
//
// The expected usage is that a number of concurrent worker threads will each
// repeatedly ask for a Step, finish executing it, then ask for the next Step.
//
// A Step consists of a single Operation, which is a unit of work that must be
// done serially. It often corresponds 1:1 to a single call to some method on
// the KV API (such as Get or Put), but some Operations comprise a set of
// steps (such as using a transaction).
//
// Generator is itself deterministic, but its intended usage is that multiple
// worker goroutines take turns pulling steps (sequentially) which they then
// execute concurrently. To improve the efficiency of this pattern, Generator
// will track which splits and merges could possibly have taken place and
// takes this into account when generating operations. For example, Generator
// won't take an OpMergeIsSplit step if it has never previously emitted a
// split, but it may emit an OpMerge once it has produced an OpSplit even
// though the worker executing the split may find that the merge has not yet
// been executed.
type Generator struct {
	// TODO(dan): This is awkward, unify Generator and generator.
	mu struct {
		syncutil.Mutex
		generator
	}
}

// MakeGenerator constructs a Generator.
func MakeGenerator(config GeneratorConfig, replicasFn GetReplicasFn) (*Generator, error) {
	if config.NumNodes <= 0 {
		return nil, errors.Errorf(`NumNodes must be positive, got: %d`, config.NumNodes)
	}
	if config.NumReplicas <= 0 {
		return nil, errors.Errorf(`NumReplicas must be positive, got: %d`, config.NumReplicas)
	}
	if config.NumReplicas > config.NumNodes {
		return nil, errors.Errorf(`NumReplicas (%d) must be <= NumNodes (%d)`,
			config.NumReplicas, config.NumNodes)
	}
	g := &Generator{}
	g.mu.generator = generator{
		Config:           config,
		replicasFn:       replicasFn,
		keys:             make(map[string]struct{}),
		currentSplits:    make(map[string]struct{}),
		historicalSplits: make(map[string]struct{}),
	}
	return g, nil
}

// RandStep returns a single randomly generated next operation to execute.
//
// RandStep is concurrency safe.
func (g *Generator) RandStep(rng *rand.Rand) Step {
	g.mu.Lock()
	defer g.mu.Unlock()
	return g.mu.RandStep(rng)
}
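// Editorial sketch, not part of the original file: the intended concurrent
// usage described on Generator, in miniature. Each worker owns its own
// *rand.Rand (rand.Rand is not safe for concurrent use), while RandStep
// serializes access to the shared Generator internally. `runWorker` and the
// `execute` callback are hypothetical names for illustration.
func runWorker(g *Generator, seed int64, numSteps int, execute func(Step)) {
	rng := rand.New(rand.NewSource(seed))
	for i := 0; i < numSteps; i++ {
		// Pulling the step is serialized; executing it proceeds concurrently
		// with the other workers.
		execute(g.RandStep(rng))
	}
}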
type generator struct {
	Config     GeneratorConfig
	replicasFn GetReplicasFn

	nextValue int

	// keys is the set of every key that has been written to, including those
	// in rolled back transactions.
	keys map[string]struct{}

	// currentSplits is approximately the set of every split that has been made
	// within DataSpan. The exact accounting is hard because Generator can hand
	// out a concurrent split and merge for the same key, which is racy. These
	// races can result in a currentSplit that is not in fact a split at the KV
	// level. Luckily we don't need exact accounting.
	currentSplits map[string]struct{}
	// historicalSplits is the set of every key for which a split has been
	// emitted, regardless of whether the split has since been applied or been
	// merged away again.
	historicalSplits map[string]struct{}
}

// RandStep returns a single randomly generated next operation to execute.
//
// RandStep is not concurrency safe.
func (g *generator) RandStep(rng *rand.Rand) Step {
	var allowed []opGen
	g.registerClientOps(&allowed, &g.Config.Ops.DB)
	g.registerBatchOps(&allowed, &g.Config.Ops.Batch)
	g.registerClosureTxnOps(&allowed, &g.Config.Ops.ClosureTxn)

	addOpGen(&allowed, randSplitNew, g.Config.Ops.Split.SplitNew)
	if len(g.historicalSplits) > 0 {
		addOpGen(&allowed, randSplitAgain, g.Config.Ops.Split.SplitAgain)
	}

	addOpGen(&allowed, randMergeNotSplit, g.Config.Ops.Merge.MergeNotSplit)
	if len(g.currentSplits) > 0 {
		addOpGen(&allowed, randMergeIsSplit, g.Config.Ops.Merge.MergeIsSplit)
	}

	key := randKey(rng)
	current := g.replicasFn(roachpb.Key(key))
	if len(current) < g.Config.NumNodes {
		addReplicaFn := makeAddReplicaFn(key, current, false /* atomicSwap */)
		addOpGen(&allowed, addReplicaFn, g.Config.Ops.ChangeReplicas.AddReplica)
	}
	if len(current) == g.Config.NumReplicas && len(current) < g.Config.NumNodes {
		atomicSwapReplicaFn := makeAddReplicaFn(key, current, true /* atomicSwap */)
		addOpGen(&allowed, atomicSwapReplicaFn, g.Config.Ops.ChangeReplicas.AtomicSwapReplica)
	}
	if len(current) > g.Config.NumReplicas {
		removeReplicaFn := makeRemoveReplicaFn(key, current)
		addOpGen(&allowed, removeReplicaFn, g.Config.Ops.ChangeReplicas.RemoveReplica)
	}

	return step(g.selectOp(rng, allowed))
}

type opGenFunc func(*generator, *rand.Rand) Operation

type opGen struct {
	fn     opGenFunc
	weight int
}

func addOpGen(valid *[]opGen, fn opGenFunc, weight int) {
	*valid = append(*valid, opGen{fn: fn, weight: weight})
}

func (g *generator) selectOp(rng *rand.Rand, contextuallyValid []opGen) Operation {
	var total int
	for _, x := range contextuallyValid {
		total += x.weight
	}
	target := rng.Intn(total)
	var sum int
	for _, x := range contextuallyValid {
		sum += x.weight
		if sum > target {
			return x.fn(g, rng)
		}
	}
	panic(`unreachable`)
}

func (g *generator) registerClientOps(allowed *[]opGen, c *ClientOperationConfig) {
	addOpGen(allowed, randGetMissing, c.GetMissing)
	addOpGen(allowed, randPutMissing, c.PutMissing)
	if len(g.keys) > 0 {
		addOpGen(allowed, randGetExisting, c.GetExisting)
		addOpGen(allowed, randPutExisting, c.PutExisting)
	}
}

func (g *generator) registerBatchOps(allowed *[]opGen, c *BatchOperationConfig) {
	addOpGen(allowed, makeRandBatch(&c.Ops), c.Batch)
}

func randGetMissing(_ *generator, rng *rand.Rand) Operation {
	return get(randKey(rng))
}

func randGetExisting(g *generator, rng *rand.Rand) Operation {
	key := randMapKey(rng, g.keys)
	return get(key)
}
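// Editorial sketch, not part of the original file: a worked example of the
// weighted draw in selectOp above. With weights [1, 3], total is 4 and
// rng.Intn(4) yields a target in 0..3: target 0 selects randGetMissing and
// targets 1..3 select randPutMissing, i.e. a 3/4 chance.
// `exampleWeightedDraw` is a hypothetical name for illustration.
func exampleWeightedDraw(rng *rand.Rand) Operation {
	g := &generator{keys: make(map[string]struct{})}
	allowed := []opGen{
		{fn: randGetMissing, weight: 1},
		{fn: randPutMissing, weight: 3},
	}
	return g.selectOp(rng, allowed)
}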
func randPutMissing(g *generator, rng *rand.Rand) Operation {
	value := g.getNextValue()
	key := randKey(rng)
	g.keys[key] = struct{}{}
	return put(key, value)
}

func randPutExisting(g *generator, rng *rand.Rand) Operation {
	value := g.getNextValue()
	key := randMapKey(rng, g.keys)
	return put(key, value)
}

func randSplitNew(g *generator, rng *rand.Rand) Operation {
	key := randKey(rng)
	g.currentSplits[key] = struct{}{}
	g.historicalSplits[key] = struct{}{}
	return split(key)
}

func randSplitAgain(g *generator, rng *rand.Rand) Operation {
	key := randMapKey(rng, g.historicalSplits)
	g.currentSplits[key] = struct{}{}
	return split(key)
}

func randMergeNotSplit(g *generator, rng *rand.Rand) Operation {
	key := randKey(rng)
	return merge(key)
}

func randMergeIsSplit(g *generator, rng *rand.Rand) Operation {
	key := randMapKey(rng, g.currentSplits)
	// Assume that this split actually got merged, even though we may have
	// handed out a concurrent split for the same key.
	delete(g.currentSplits, key)
	return merge(key)
}

func makeRemoveReplicaFn(key string, current []roachpb.ReplicationTarget) opGenFunc {
	return func(g *generator, rng *rand.Rand) Operation {
		change := roachpb.ReplicationChange{
			ChangeType: roachpb.REMOVE_REPLICA,
			Target:     current[rng.Intn(len(current))],
		}
		return changeReplicas(key, change)
	}
}

func makeAddReplicaFn(key string, current []roachpb.ReplicationTarget, atomicSwap bool) opGenFunc {
	return func(g *generator, rng *rand.Rand) Operation {
		candidatesMap := make(map[roachpb.ReplicationTarget]struct{})
		for i := 0; i < g.Config.NumNodes; i++ {
			t := roachpb.ReplicationTarget{NodeID: roachpb.NodeID(i + 1), StoreID: roachpb.StoreID(i + 1)}
			candidatesMap[t] = struct{}{}
		}
		for _, replica := range current {
			delete(candidatesMap, replica)
		}
		var candidates []roachpb.ReplicationTarget
		for candidate := range candidatesMap {
			candidates = append(candidates, candidate)
		}
		candidate := candidates[rng.Intn(len(candidates))]
		changes := []roachpb.ReplicationChange{{
			ChangeType: roachpb.ADD_REPLICA,
			Target:     candidate,
		}}
		if atomicSwap {
			changes = append(changes, roachpb.ReplicationChange{
				ChangeType: roachpb.REMOVE_REPLICA,
				Target:     current[rng.Intn(len(current))],
			})
		}
		return changeReplicas(key, changes...)
	}
}
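// Editorial sketch, not part of the original file: makeAddReplicaFn assumes a
// one-store-per-node topology in which NodeID i+1 pairs with StoreID i+1. The
// hypothetical example below, assuming g.Config.NumNodes is 3, would add a
// replica on node 2 or node 3, since node 1 already holds one.
// `exampleAddReplica` is an illustrative name only.
func exampleAddReplica(g *generator, rng *rand.Rand) Operation {
	current := []roachpb.ReplicationTarget{{NodeID: 1, StoreID: 1}}
	fn := makeAddReplicaFn(`illustrative-key`, current, false /* atomicSwap */)
	return fn(g, rng)
}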
func makeRandBatch(c *ClientOperationConfig) opGenFunc {
	return func(g *generator, rng *rand.Rand) Operation {
		var allowed []opGen
		g.registerClientOps(&allowed, c)

		numOps := rng.Intn(4)
		ops := make([]Operation, numOps)
		for i := range ops {
			ops[i] = g.selectOp(rng, allowed)
		}
		return batch(ops...)
	}
}

func (g *generator) registerClosureTxnOps(allowed *[]opGen, c *ClosureTxnConfig) {
	addOpGen(allowed,
		makeClosureTxn(ClosureTxnType_Commit, &c.TxnClientOps, &c.TxnBatchOps, nil /* commitInBatch */), c.Commit)
	addOpGen(allowed,
		makeClosureTxn(ClosureTxnType_Rollback, &c.TxnClientOps, &c.TxnBatchOps, nil /* commitInBatch */), c.Rollback)
	addOpGen(allowed,
		makeClosureTxn(ClosureTxnType_Commit, &c.TxnClientOps, &c.TxnBatchOps, &c.CommitBatchOps), c.CommitInBatch)
}

func makeClosureTxn(
	txnType ClosureTxnType,
	txnClientOps *ClientOperationConfig,
	txnBatchOps *BatchOperationConfig,
	commitInBatch *ClientOperationConfig,
) opGenFunc {
	return func(g *generator, rng *rand.Rand) Operation {
		var allowed []opGen
		g.registerClientOps(&allowed, txnClientOps)
		g.registerBatchOps(&allowed, txnBatchOps)
		numOps := rng.Intn(4)
		ops := make([]Operation, numOps)
		for i := range ops {
			ops[i] = g.selectOp(rng, allowed)
		}
		op := closureTxn(txnType, ops...)
		if commitInBatch != nil {
			if txnType != ClosureTxnType_Commit {
				panic(errors.AssertionFailedf(`CommitInBatch must commit, got: %s`, txnType))
			}
			op.ClosureTxn.CommitInBatch = makeRandBatch(commitInBatch)(g, rng).Batch
		}
		return op
	}
}

func (g *generator) getNextValue() string {
	value := `v-` + strconv.Itoa(g.nextValue)
	g.nextValue++
	return value
}

func randKey(rng *rand.Rand) string {
	u, err := uuid.NewGenWithReader(rng).NewV4()
	if err != nil {
		panic(err)
	}
	key := GeneratorDataSpan().Key
	key = encoding.EncodeStringAscending(key, u.Short())
	return string(key)
}

func randMapKey(rng *rand.Rand, m map[string]struct{}) string {
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	if len(keys) == 0 {
		return randKey(rng)
	}
	return keys[rng.Intn(len(keys))]
}

func step(op Operation) Step {
	return Step{Op: op}
}

func batch(ops ...Operation) Operation {
	return Operation{Batch: &BatchOperation{Ops: ops}}
}

func opSlice(ops ...Operation) []Operation {
	return ops
}

func closureTxn(typ ClosureTxnType, ops ...Operation) Operation {
	return Operation{ClosureTxn: &ClosureTxnOperation{Ops: ops, Type: typ}}
}

func closureTxnCommitInBatch(commitInBatch []Operation, ops ...Operation) Operation {
	o := closureTxn(ClosureTxnType_Commit, ops...)
	if len(commitInBatch) > 0 {
		o.ClosureTxn.CommitInBatch = &BatchOperation{Ops: commitInBatch}
	}
	return o
}

func get(key string) Operation {
	return Operation{Get: &GetOperation{Key: []byte(key)}}
}

func put(key, value string) Operation {
	return Operation{Put: &PutOperation{Key: []byte(key), Value: []byte(value)}}
}

func split(key string) Operation {
	return Operation{Split: &SplitOperation{Key: []byte(key)}}
}

func merge(key string) Operation {
	return Operation{Merge: &MergeOperation{Key: []byte(key)}}
}

func changeReplicas(key string, changes ...roachpb.ReplicationChange) Operation {
	return Operation{ChangeReplicas: &ChangeReplicasOperation{Key: []byte(key), Changes: changes}}
}
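// Editorial sketch, not part of the original file: end-to-end wiring of the
// helpers above. `fixedReplicasFn` is a hypothetical stand-in for a real
// GetReplicasFn that would ask the cluster where the range containing the key
// currently lives.
func exampleGenerator() error {
	fixedReplicasFn := func(roachpb.Key) []roachpb.ReplicationTarget {
		return []roachpb.ReplicationTarget{{NodeID: 1, StoreID: 1}}
	}
	config := NewDefaultConfig()
	config.NumNodes, config.NumReplicas = 3, 1
	g, err := MakeGenerator(config, fixedReplicasFn)
	if err != nil {
		return err
	}
	rng := rand.New(rand.NewSource(1))
	s := g.RandStep(rng) // one Step, ready to be executed by a worker
	_ = s
	return nil
}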