github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvnemesis/generator.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvnemesis
    12  
import (
	"math/rand"
	"sort"
	"strconv"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)
    24  
    25  // GeneratorConfig contains all the tunable knobs necessary to run a Generator.
    26  type GeneratorConfig struct {
    27  	Ops                   OperationConfig
    28  	NumNodes, NumReplicas int
    29  }
    30  
    31  // OperationConfig configures the relative probabilities of producing various
    32  // operations.
    33  //
    34  // In this struct and all sub-configurations, wording such as "likely exists" or
    35  // "definitely doesn't exist" is according to previously generated steps.
    36  // "likely" is a result of non-determinism due to concurrent execution of the
    37  // generated operations.
    38  type OperationConfig struct {
    39  	DB             ClientOperationConfig
    40  	Batch          BatchOperationConfig
    41  	ClosureTxn     ClosureTxnConfig
    42  	Split          SplitConfig
    43  	Merge          MergeConfig
    44  	ChangeReplicas ChangeReplicasConfig
    45  }
    46  
    47  // ClosureTxnConfig configures the relative probability of running some
    48  // operations in a transaction by using the closure-based kv.DB.Txn method. This
    49  // family of operations mainly varies in how it commits (or doesn't commit). The
    50  // composition of the operations in the txn is controlled by TxnClientOps and
    51  // TxnBatchOps
    52  type ClosureTxnConfig struct {
    53  	TxnClientOps ClientOperationConfig
    54  	TxnBatchOps  BatchOperationConfig
    55  
    56  	// Commit is a transaction that commits normally.
    57  	Commit int
    58  	// Rollback is a transaction that encounters an error at the end and has to
    59  	// roll back.
    60  	Rollback int
    61  	// CommitInBatch is a transaction that commits via the CommitInBatchMethod.
    62  	// This is an important part of the 1pc txn fastpath.
    63  	CommitInBatch int
    64  	// When CommitInBatch is selected, CommitBatchOps controls the composition of
    65  	// the kv.Batch used.
    66  	CommitBatchOps ClientOperationConfig
    67  }
    68  
    69  // ClientOperationConfig configures the relative probabilities of the
    70  // bread-and-butter kv operations such as Get/Put/Delete/etc. These can all be
    71  // run on a DB, a Txn, or a Batch.
    72  type ClientOperationConfig struct {
    73  	// GetMissing is an operation that Gets a key that definitely doesn't exist.
    74  	GetMissing int
    75  	// GetExisting is an operation that Gets a key that likely exists.
    76  	GetExisting int
    77  	// PutMissing is an operation that Puts a key that definitely doesn't exist.
    78  	PutMissing int
    79  	// PutExisting is an operation that Puts a key that likely exists.
    80  	PutExisting int
    81  }
    82  
    83  // BatchOperationConfig configures the relative probability of generating a
    84  // kv.Batch of some number of operations as well as the composition of the
    85  // operations in the batch itself. These can be run in various ways including
    86  // kv.DB.Run or kv.Txn.Run.
    87  type BatchOperationConfig struct {
    88  	Batch int
    89  	Ops   ClientOperationConfig
    90  }
    91  
    92  // SplitConfig configures the relative probability of generating a Split
    93  // operation.
    94  type SplitConfig struct {
    95  	// SplitNew is an operation that Splits at a key that has never previously
    96  	// been a split point.
    97  	SplitNew int
    98  	// SplitAgain is an operation that Splits at a key that likely has
    99  	// previously been a split point, though it may or may not have been merged
   100  	// since.
   101  	SplitAgain int
   102  }
   103  
   104  // MergeConfig configures the relative probability of generating a Merge
   105  // operation.
   106  type MergeConfig struct {
   107  	// MergeNotSplit is an operation that Merges at a key that has never been
   108  	// split at (meaning this should be a no-op).
   109  	MergeNotSplit int
   110  	// MergeIsSplit is an operation that Merges at a key that is likely to
   111  	// currently be split.
   112  	MergeIsSplit int
   113  }
   114  
   115  // ChangeReplicasConfig configures the relative probability of generating a
   116  // ChangeReplicas operation.
   117  type ChangeReplicasConfig struct {
   118  	// AddReplica adds a single replica.
   119  	AddReplica int
   120  	// RemoveReplica removes a single replica.
   121  	RemoveReplica int
   122  	// AtomicSwapReplica adds 1 replica and removes 1 replica in a single
   123  	// ChangeReplicas call.
   124  	AtomicSwapReplica int
   125  }
   126  
   127  // newAllOperationsConfig returns a GeneratorConfig that exercises *all*
   128  // options. You probably want NewDefaultConfig. Most of the time, these will be
   129  // the same, but having both allows us to merge code for operations that do not
   130  // yet pass (for example, if the new operation finds a kv bug or edge case).
   131  func newAllOperationsConfig() GeneratorConfig {
   132  	clientOpConfig := ClientOperationConfig{
   133  		GetMissing:  1,
   134  		GetExisting: 1,
   135  		PutMissing:  1,
   136  		PutExisting: 1,
   137  	}
   138  	batchOpConfig := BatchOperationConfig{
   139  		Batch: 4,
   140  		Ops:   clientOpConfig,
   141  	}
   142  	return GeneratorConfig{Ops: OperationConfig{
   143  		DB:    clientOpConfig,
   144  		Batch: batchOpConfig,
   145  		ClosureTxn: ClosureTxnConfig{
   146  			Commit:         5,
   147  			Rollback:       5,
   148  			CommitInBatch:  5,
   149  			TxnClientOps:   clientOpConfig,
   150  			TxnBatchOps:    batchOpConfig,
   151  			CommitBatchOps: clientOpConfig,
   152  		},
   153  		Split: SplitConfig{
   154  			SplitNew:   1,
   155  			SplitAgain: 1,
   156  		},
   157  		Merge: MergeConfig{
   158  			MergeNotSplit: 1,
   159  			MergeIsSplit:  1,
   160  		},
   161  		ChangeReplicas: ChangeReplicasConfig{
   162  			AddReplica:        1,
   163  			RemoveReplica:     1,
   164  			AtomicSwapReplica: 1,
   165  		},
   166  	}}
   167  }
   168  
   169  // NewDefaultConfig returns a GeneratorConfig that is a reasonable default
   170  // starting point for general KV usage. Nemesis test variants that want to
   171  // stress particular areas may want to start with this and eliminate some
   172  // operations/make some operations more likely.
   173  func NewDefaultConfig() GeneratorConfig {
   174  	config := newAllOperationsConfig()
   175  	// TODO(dan): This fails with a WriteTooOld error if the same key is Put twice
   176  	// in a single batch. However, if the same Batch is committed using txn.Run,
   177  	// then it works and only the last one is materialized. We could make the
   178  	// db.Run behavior match txn.Run by ensuring that all requests in a
   179  	// nontransactional batch are disjoint and upgrading to a transactional batch
   180  	// (see CrossRangeTxnWrapperSender) if they are. roachpb.SpanGroup can be used
   181  	// to efficiently check this.
   182  	//
   183  	// TODO(dan): Make this `config.Ops.Batch.Ops.PutExisting = 0` once #46081 is
   184  	// fixed.
   185  	config.Ops.Batch = BatchOperationConfig{}
   186  	// TODO(dan): Remove when #45586 is addressed.
   187  	config.Ops.ClosureTxn.CommitBatchOps.GetExisting = 0
   188  	config.Ops.ClosureTxn.CommitBatchOps.GetMissing = 0
   189  	return config
   190  }
   191  
   192  // GeneratorDataSpan returns a span that contains all of the operations created
   193  // by this Generator.
   194  func GeneratorDataSpan() roachpb.Span {
   195  	return roachpb.Span{
   196  		Key:    keys.SystemSQLCodec.TablePrefix(50),
   197  		EndKey: keys.SystemSQLCodec.TablePrefix(51),
   198  	}
   199  }
   200  
   201  // GetReplicasFn is a function that returns the current replicas for the range
   202  // containing a key.
   203  type GetReplicasFn func(roachpb.Key) []roachpb.ReplicationTarget
   204  
   205  // Generator incrementally constructs KV traffic designed to maximally test edge
   206  // cases.
   207  //
   208  // The expected usage is that a number of concurrent worker threads will each
   209  // repeatedly ask for a Step, finish executing it, then ask for the next Step.
   210  //
   211  // A Step consists of a single Operation, which is a unit of work that must be
   212  // done serially. It often corresponds 1:1 to a single call to some method on
   213  // the KV api (such as Get or Put), but some Operations have a set of steps
   214  // (such as using a transaction).
   215  //
   216  // Generator in itself is deterministic, but it's intended usage is that
   217  // multiple worker goroutines take turns pulling steps (sequentially) which they
   218  // then execute concurrently. To improve the efficiency of this pattern,
   219  // Generator will track which splits and merges could possibly have taken place
   220  // and takes this into account when generating operations. For example,
   221  // Generator won't take a OpMergeIsSplit step if it has never previously emitted
   222  // a split, but it may emit an OpMerge once it has produced an OpSplit even
   223  // though the worker executing the split may find that the merge has not yet
   224  // been executed.
   225  type Generator struct {
   226  	// TODO(dan): This is awkward, unify Generator and generator.
   227  	mu struct {
   228  		syncutil.Mutex
   229  		generator
   230  	}
   231  }
   232  
   233  // MakeGenerator constructs a Generator.
   234  func MakeGenerator(config GeneratorConfig, replicasFn GetReplicasFn) (*Generator, error) {
   235  	if config.NumNodes <= 0 {
   236  		return nil, errors.Errorf(`NumNodes must be positive got: %d`, config.NumNodes)
   237  	}
   238  	if config.NumReplicas <= 0 {
   239  		return nil, errors.Errorf(`NumReplicas must be positive got: %d`, config.NumReplicas)
   240  	}
   241  	if config.NumReplicas > config.NumNodes {
   242  		return nil, errors.Errorf(`NumReplicas (%d) must <= NumNodes (%d)`,
   243  			config.NumReplicas, config.NumNodes)
   244  	}
   245  	g := &Generator{}
   246  	g.mu.generator = generator{
   247  		Config:           config,
   248  		replicasFn:       replicasFn,
   249  		keys:             make(map[string]struct{}),
   250  		currentSplits:    make(map[string]struct{}),
   251  		historicalSplits: make(map[string]struct{}),
   252  	}
   253  	return g, nil
   254  }
   255  
   256  // RandStep returns a single randomly generated next operation to execute.
   257  //
   258  // RandStep is concurrency safe.
   259  func (g *Generator) RandStep(rng *rand.Rand) Step {
   260  	g.mu.Lock()
   261  	defer g.mu.Unlock()
   262  	return g.mu.RandStep(rng)
   263  }
   264  
   265  type generator struct {
   266  	Config     GeneratorConfig
   267  	replicasFn GetReplicasFn
   268  
   269  	nextValue int
   270  
   271  	// keys is the set of every key that has been written to, including those in
   272  	// rolled back transactions.
   273  	keys map[string]struct{}
   274  
   275  	// currentSplits is approximately the set of every split that has been made
   276  	// within DataSpan. The exact accounting is hard because Generator can hand
   277  	// out a concurrent split and merge for the same key, which is racey. These
   278  	// races can result in a currentSplit that is not in fact a split at the KV
   279  	// level. Luckily we don't need exact accounting.
   280  	currentSplits map[string]struct{}
   281  	// historicalSplits is the set of every key for which a split has been
   282  	// emitted, regardless of whether the split has since been applied or been
   283  	// merged away again.
   284  	historicalSplits map[string]struct{}
   285  }
   286  
   287  // RandStep returns a single randomly generated next operation to execute.
   288  //
   289  // RandStep is not concurrency safe.
   290  func (g *generator) RandStep(rng *rand.Rand) Step {
   291  	var allowed []opGen
   292  	g.registerClientOps(&allowed, &g.Config.Ops.DB)
   293  	g.registerBatchOps(&allowed, &g.Config.Ops.Batch)
   294  	g.registerClosureTxnOps(&allowed, &g.Config.Ops.ClosureTxn)
   295  
   296  	addOpGen(&allowed, randSplitNew, g.Config.Ops.Split.SplitNew)
   297  	if len(g.historicalSplits) > 0 {
   298  		addOpGen(&allowed, randSplitAgain, g.Config.Ops.Split.SplitAgain)
   299  	}
   300  
   301  	addOpGen(&allowed, randMergeNotSplit, g.Config.Ops.Merge.MergeNotSplit)
   302  	if len(g.currentSplits) > 0 {
   303  		addOpGen(&allowed, randMergeIsSplit, g.Config.Ops.Merge.MergeIsSplit)
   304  	}
   305  
   306  	key := randKey(rng)
   307  	current := g.replicasFn(roachpb.Key(key))
   308  	if len(current) < g.Config.NumNodes {
   309  		addReplicaFn := makeAddReplicaFn(key, current, false /* atomicSwap */)
   310  		addOpGen(&allowed, addReplicaFn, g.Config.Ops.ChangeReplicas.AddReplica)
   311  	}
   312  	if len(current) == g.Config.NumReplicas && len(current) < g.Config.NumNodes {
   313  		atomicSwapReplicaFn := makeAddReplicaFn(key, current, true /* atomicSwap */)
   314  		addOpGen(&allowed, atomicSwapReplicaFn, g.Config.Ops.ChangeReplicas.AtomicSwapReplica)
   315  	}
   316  	if len(current) > g.Config.NumReplicas {
   317  		removeReplicaFn := makeRemoveReplicaFn(key, current)
   318  		addOpGen(&allowed, removeReplicaFn, g.Config.Ops.ChangeReplicas.RemoveReplica)
   319  	}
   320  
   321  	return step(g.selectOp(rng, allowed))
   322  }
   323  
   324  type opGenFunc func(*generator, *rand.Rand) Operation
   325  
   326  type opGen struct {
   327  	fn     opGenFunc
   328  	weight int
   329  }
   330  
   331  func addOpGen(valid *[]opGen, fn opGenFunc, weight int) {
   332  	*valid = append(*valid, opGen{fn: fn, weight: weight})
   333  }
   334  
   335  func (g *generator) selectOp(rng *rand.Rand, contextuallyValid []opGen) Operation {
   336  	var total int
   337  	for _, x := range contextuallyValid {
   338  		total += x.weight
   339  	}
   340  	target := rng.Intn(total)
   341  	var sum int
   342  	for _, x := range contextuallyValid {
   343  		sum += x.weight
   344  		if sum > target {
   345  			return x.fn(g, rng)
   346  		}
   347  	}
   348  	panic(`unreachable`)
   349  }
   350  
   351  func (g *generator) registerClientOps(allowed *[]opGen, c *ClientOperationConfig) {
   352  	addOpGen(allowed, randGetMissing, c.GetMissing)
   353  	addOpGen(allowed, randPutMissing, c.PutMissing)
   354  	if len(g.keys) > 0 {
   355  		addOpGen(allowed, randGetExisting, c.GetExisting)
   356  		addOpGen(allowed, randPutExisting, c.PutExisting)
   357  	}
   358  }
   359  
   360  func (g *generator) registerBatchOps(allowed *[]opGen, c *BatchOperationConfig) {
   361  	addOpGen(allowed, makeRandBatch(&c.Ops), c.Batch)
   362  }
   363  
   364  func randGetMissing(_ *generator, rng *rand.Rand) Operation {
   365  	return get(randKey(rng))
   366  }
   367  
   368  func randGetExisting(g *generator, rng *rand.Rand) Operation {
   369  	key := randMapKey(rng, g.keys)
   370  	return get(key)
   371  }
   372  
   373  func randPutMissing(g *generator, rng *rand.Rand) Operation {
   374  	value := g.getNextValue()
   375  	key := randKey(rng)
   376  	g.keys[key] = struct{}{}
   377  	return put(key, value)
   378  }
   379  
   380  func randPutExisting(g *generator, rng *rand.Rand) Operation {
   381  	value := g.getNextValue()
   382  	key := randMapKey(rng, g.keys)
   383  	return put(key, value)
   384  }
   385  
   386  func randSplitNew(g *generator, rng *rand.Rand) Operation {
   387  	key := randKey(rng)
   388  	g.currentSplits[key] = struct{}{}
   389  	g.historicalSplits[key] = struct{}{}
   390  	return split(key)
   391  }
   392  
   393  func randSplitAgain(g *generator, rng *rand.Rand) Operation {
   394  	key := randMapKey(rng, g.historicalSplits)
   395  	g.currentSplits[key] = struct{}{}
   396  	return split(key)
   397  }
   398  
   399  func randMergeNotSplit(g *generator, rng *rand.Rand) Operation {
   400  	key := randKey(rng)
   401  	return merge(key)
   402  }
   403  
   404  func randMergeIsSplit(g *generator, rng *rand.Rand) Operation {
   405  	key := randMapKey(rng, g.currentSplits)
   406  	// Assume that this split actually got merged, even though we may have handed
   407  	// out a concurrent split for the same key.
   408  	delete(g.currentSplits, key)
   409  	return merge(key)
   410  }
   411  
   412  func makeRemoveReplicaFn(key string, current []roachpb.ReplicationTarget) opGenFunc {
   413  	return func(g *generator, rng *rand.Rand) Operation {
   414  		change := roachpb.ReplicationChange{
   415  			ChangeType: roachpb.REMOVE_REPLICA,
   416  			Target:     current[rng.Intn(len(current))],
   417  		}
   418  		return changeReplicas(key, change)
   419  	}
   420  }
   421  
   422  func makeAddReplicaFn(key string, current []roachpb.ReplicationTarget, atomicSwap bool) opGenFunc {
   423  	return func(g *generator, rng *rand.Rand) Operation {
   424  		candidatesMap := make(map[roachpb.ReplicationTarget]struct{})
   425  		for i := 0; i < g.Config.NumNodes; i++ {
   426  			t := roachpb.ReplicationTarget{NodeID: roachpb.NodeID(i + 1), StoreID: roachpb.StoreID(i + 1)}
   427  			candidatesMap[t] = struct{}{}
   428  		}
   429  		for _, replica := range current {
   430  			delete(candidatesMap, replica)
   431  		}
   432  		var candidates []roachpb.ReplicationTarget
   433  		for candidate := range candidatesMap {
   434  			candidates = append(candidates, candidate)
   435  		}
   436  		candidate := candidates[rng.Intn(len(candidates))]
   437  		changes := []roachpb.ReplicationChange{{
   438  			ChangeType: roachpb.ADD_REPLICA,
   439  			Target:     candidate,
   440  		}}
   441  		if atomicSwap {
   442  			changes = append(changes, roachpb.ReplicationChange{
   443  				ChangeType: roachpb.REMOVE_REPLICA,
   444  				Target:     current[rng.Intn(len(current))],
   445  			})
   446  		}
   447  		return changeReplicas(key, changes...)
   448  	}
   449  }
   450  
   451  func makeRandBatch(c *ClientOperationConfig) opGenFunc {
   452  	return func(g *generator, rng *rand.Rand) Operation {
   453  		var allowed []opGen
   454  		g.registerClientOps(&allowed, c)
   455  
   456  		numOps := rng.Intn(4)
   457  		ops := make([]Operation, numOps)
   458  		for i := range ops {
   459  			ops[i] = g.selectOp(rng, allowed)
   460  		}
   461  		return batch(ops...)
   462  	}
   463  }
   464  
   465  func (g *generator) registerClosureTxnOps(allowed *[]opGen, c *ClosureTxnConfig) {
   466  	addOpGen(allowed,
   467  		makeClosureTxn(ClosureTxnType_Commit, &c.TxnClientOps, &c.TxnBatchOps, nil /* commitInBatch*/), c.Commit)
   468  	addOpGen(allowed,
   469  		makeClosureTxn(ClosureTxnType_Rollback, &c.TxnClientOps, &c.TxnBatchOps, nil /* commitInBatch*/), c.Rollback)
   470  	addOpGen(allowed,
   471  		makeClosureTxn(ClosureTxnType_Commit, &c.TxnClientOps, &c.TxnBatchOps, &c.CommitBatchOps), c.CommitInBatch)
   472  }
   473  
   474  func makeClosureTxn(
   475  	txnType ClosureTxnType,
   476  	txnClientOps *ClientOperationConfig,
   477  	txnBatchOps *BatchOperationConfig,
   478  	commitInBatch *ClientOperationConfig,
   479  ) opGenFunc {
   480  	return func(g *generator, rng *rand.Rand) Operation {
   481  		var allowed []opGen
   482  		g.registerClientOps(&allowed, txnClientOps)
   483  		g.registerBatchOps(&allowed, txnBatchOps)
   484  		numOps := rng.Intn(4)
   485  		ops := make([]Operation, numOps)
   486  		for i := range ops {
   487  			ops[i] = g.selectOp(rng, allowed)
   488  		}
   489  		op := closureTxn(txnType, ops...)
   490  		if commitInBatch != nil {
   491  			if txnType != ClosureTxnType_Commit {
   492  				panic(errors.AssertionFailedf(`CommitInBatch must commit got: %s`, txnType))
   493  			}
   494  			op.ClosureTxn.CommitInBatch = makeRandBatch(commitInBatch)(g, rng).Batch
   495  		}
   496  		return op
   497  	}
   498  }
   499  
   500  func (g *generator) getNextValue() string {
   501  	value := `v-` + strconv.Itoa(g.nextValue)
   502  	g.nextValue++
   503  	return value
   504  }
   505  
   506  func randKey(rng *rand.Rand) string {
   507  	u, err := uuid.NewGenWithReader(rng).NewV4()
   508  	if err != nil {
   509  		panic(err)
   510  	}
   511  	key := GeneratorDataSpan().Key
   512  	key = encoding.EncodeStringAscending(key, u.Short())
   513  	return string(key)
   514  }
   515  
   516  func randMapKey(rng *rand.Rand, m map[string]struct{}) string {
   517  	keys := make([]string, 0, len(m))
   518  	for key := range m {
   519  		keys = append(keys, key)
   520  	}
   521  	if len(keys) == 0 {
   522  		return randKey(rng)
   523  	}
   524  	return keys[rng.Intn(len(keys))]
   525  }
   526  
   527  func step(op Operation) Step {
   528  	return Step{Op: op}
   529  }
   530  
   531  func batch(ops ...Operation) Operation {
   532  	return Operation{Batch: &BatchOperation{Ops: ops}}
   533  }
   534  
   535  func opSlice(ops ...Operation) []Operation {
   536  	return ops
   537  }
   538  
   539  func closureTxn(typ ClosureTxnType, ops ...Operation) Operation {
   540  	return Operation{ClosureTxn: &ClosureTxnOperation{Ops: ops, Type: typ}}
   541  }
   542  
   543  func closureTxnCommitInBatch(commitInBatch []Operation, ops ...Operation) Operation {
   544  	o := closureTxn(ClosureTxnType_Commit, ops...)
   545  	if len(commitInBatch) > 0 {
   546  		o.ClosureTxn.CommitInBatch = &BatchOperation{Ops: commitInBatch}
   547  	}
   548  	return o
   549  }
   550  
   551  func get(key string) Operation {
   552  	return Operation{Get: &GetOperation{Key: []byte(key)}}
   553  }
   554  
   555  func put(key, value string) Operation {
   556  	return Operation{Put: &PutOperation{Key: []byte(key), Value: []byte(value)}}
   557  }
   558  
   559  func split(key string) Operation {
   560  	return Operation{Split: &SplitOperation{Key: []byte(key)}}
   561  }
   562  
   563  func merge(key string) Operation {
   564  	return Operation{Merge: &MergeOperation{Key: []byte(key)}}
   565  }
   566  
   567  func changeReplicas(key string, changes ...roachpb.ReplicationChange) Operation {
   568  	return Operation{ChangeReplicas: &ChangeReplicasOperation{Key: []byte(key), Changes: changes}}
   569  }