github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/workload/bulkingest/bulkingest.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  /*
    12  Package bulkingest defines a workload that is intended to stress some edge cases
    13  in our bulk-ingestion infrastructure.
    14  
    15  In both IMPORT and indexing, many readers scan through the source data (i.e. CSV
    16  files or PK rows, respectively) and produce KVs to be ingested. However a given
    17  range of that source data could produce any KVs -- i.e. in some schemas or
    18  workloads, the produced KVs could have the same ordering or in some they could
    19  be random and uniformly distributed in the keyspace. Additionally, both of the
    20  processes often include concurrent producers, each scanning their own input
    21  files or ranges of a table, and there the distribution could mean that
    22  concurrent producers all produce different keys or all produce similar keys at
    23  the same time, etc.
    24  
    25  This workload is intended to produce testdata that emphasizes these cases. The
    26  multi-column PK is intended to make it easy to independently control the prefix
    27  of keys. Adding an index on the same columns with the columns reordered can then
    28  control the flow of keys between prefixes, stressing any buffering, sorting or
    29  other steps in the middle. This can be particularly interesting when concurrent
    30  producers are a factor, as the distribution (or lack thereof) of their output
    31  prefixes at a given moment can cause hotspots.
    32  
    33  The workload's schema is a table with columns a, b, and c plus a padding payload
    34  string, with the primary key being (a,b,c).
    35  
    36  Creating indexes on the different columns in this schema can then trigger
    37  different distributions of produced index KVs -- i.e. an index on (b, c) would
    38  see each range of PK data produce tightly grouped output that overlaps with the
    39  output of A other ranges of the table.
    40  
    41  The workload's main parameters are number of distinct values of a, b and c.
    42  Initial data batches each correspond to one a/b pair containing c rows. By
    43  default, batches are ordered by a then b (a=1/b=1, a=1/b=2, a=1/b=3, ...) though
    44  this can optionally be inverted (a=1/b=1, a=2/b=1, a=3/b=1, ...).
    45  
    46  */
    47  package bulkingest
    48  
    49  import (
    50  	"context"
    51  	gosql "database/sql"
    52  	"math/rand"
    53  	"strings"
    54  
    55  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    56  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    57  	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
    58  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    59  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    60  	"github.com/cockroachdb/cockroach/pkg/workload"
    61  	"github.com/cockroachdb/cockroach/pkg/workload/histogram"
    62  	"github.com/cockroachdb/errors"
    63  	"github.com/spf13/pflag"
    64  )
    65  
const (
	// bulkingestSchemaPrefix is the opening of the bulkingest table definition:
	// the column list and the (a, b, c) primary key. It is intentionally left
	// without a closing parenthesis so that Tables can append optional clauses
	// before terminating the schema.
	bulkingestSchemaPrefix = `(
		a INT,
		b INT,
		c INT,
		payload STRING,
		PRIMARY KEY (a, b, c)`

	// indexOnBCA is the schema clause that Tables appends when the index-b-c-a
	// flag is set; it declares a secondary index on (b, c, a) that stores the
	// payload column.
	indexOnBCA = ",\n INDEX (b, c, a) STORING (payload)"

	// defaultPayloadBytes is the default value of the payload-bytes flag.
	defaultPayloadBytes = 100
)
    78  
    79  type bulkingest struct {
    80  	flags     workload.Flags
    81  	connFlags *workload.ConnFlags
    82  
    83  	seed                                 int64
    84  	aCount, bCount, cCount, payloadBytes int
    85  
    86  	generateBsFirst bool
    87  	indexBCA        bool
    88  }
    89  
// init registers the bulkingest workload's metadata with the workload package
// when this package is initialized.
func init() {
	workload.Register(bulkingestMeta)
}
    93  
    94  var bulkingestMeta = workload.Meta{
    95  	Name:        `bulkingest`,
    96  	Description: `bulkingest testdata is designed to produce a skewed distribution of KVs when ingested (in initial import or during later indexing)`,
    97  	Version:     `1.0.0`,
    98  	New: func() workload.Generator {
    99  		g := &bulkingest{}
   100  		g.flags.FlagSet = pflag.NewFlagSet(`bulkingest`, pflag.ContinueOnError)
   101  		g.flags.Int64Var(&g.seed, `seed`, 1, `Key hash seed.`)
   102  		g.flags.IntVar(&g.aCount, `a`, 10, `number of values of A (i.e. pk prefix)`)
   103  		g.flags.IntVar(&g.bCount, `b`, 10, `number of values of B (i.e. idx prefix)`)
   104  		g.flags.IntVar(&g.cCount, `c`, 1000, `number of values of C (i.e. rows per A/B pair)`)
   105  		g.flags.BoolVar(&g.generateBsFirst, `batches-by-b`, false, `generate all B batches for given A first`)
   106  		g.flags.BoolVar(&g.indexBCA, `index-b-c-a`, true, `include an index on (B, C, A)`)
   107  		g.flags.IntVar(&g.payloadBytes, `payload-bytes`, defaultPayloadBytes, `Size of the payload field in each row.`)
   108  		g.connFlags = workload.NewConnFlags(&g.flags)
   109  		return g
   110  	},
   111  }
   112  
   113  // Meta implements the Generator interface.
   114  func (*bulkingest) Meta() workload.Meta { return bulkingestMeta }
   115  
   116  // Flags implements the Flagser interface.
   117  func (w *bulkingest) Flags() workload.Flags { return w.flags }
   118  
   119  // Hooks implements the Hookser interface.
   120  func (w *bulkingest) Hooks() workload.Hooks {
   121  	return workload.Hooks{}
   122  }
   123  
   124  // Tables implements the Generator interface.
   125  func (w *bulkingest) Tables() []workload.Table {
   126  	schema := bulkingestSchemaPrefix
   127  	if w.indexBCA {
   128  		schema += indexOnBCA
   129  	}
   130  	schema += ")"
   131  
   132  	var bulkingestTypes = []*types.T{
   133  		types.Int,
   134  		types.Int,
   135  		types.Int,
   136  		types.Bytes,
   137  	}
   138  
   139  	table := workload.Table{
   140  		Name:   `bulkingest`,
   141  		Schema: schema,
   142  		InitialRows: workload.BatchedTuples{
   143  			NumBatches: w.aCount * w.bCount,
   144  			FillBatch: func(ab int, cb coldata.Batch, alloc *bufalloc.ByteAllocator) {
   145  				a := ab / w.bCount
   146  				b := ab % w.bCount
   147  				if w.generateBsFirst {
   148  					b = ab / w.aCount
   149  					a = ab % w.aCount
   150  				}
   151  
   152  				cb.Reset(bulkingestTypes, w.cCount, coldata.StandardColumnFactory)
   153  				aCol := cb.ColVec(0).Int64()
   154  				bCol := cb.ColVec(1).Int64()
   155  				cCol := cb.ColVec(2).Int64()
   156  				payloadCol := cb.ColVec(3).Bytes()
   157  
   158  				rng := rand.New(rand.NewSource(w.seed + int64(ab)))
   159  				var payload []byte
   160  				payload, *alloc = alloc.Alloc(w.cCount*w.payloadBytes, 0 /* extraCap */)
   161  				randutil.ReadTestdataBytes(rng, payload)
   162  				payloadCol.Reset()
   163  				for rowIdx := 0; rowIdx < w.cCount; rowIdx++ {
   164  					c := rowIdx
   165  					off := c * w.payloadBytes
   166  					aCol[rowIdx] = int64(a)
   167  					bCol[rowIdx] = int64(b)
   168  					cCol[rowIdx] = int64(c)
   169  					payloadCol.Set(rowIdx, payload[off:off+w.payloadBytes])
   170  				}
   171  			},
   172  		},
   173  	}
   174  	return []workload.Table{table}
   175  }
   176  
   177  // Ops implements the Opser interface.
   178  func (w *bulkingest) Ops(urls []string, reg *histogram.Registry) (workload.QueryLoad, error) {
   179  	sqlDatabase, err := workload.SanitizeUrls(w, w.connFlags.DBOverride, urls)
   180  	if err != nil {
   181  		return workload.QueryLoad{}, err
   182  	}
   183  	db, err := gosql.Open(`cockroach`, strings.Join(urls, ` `))
   184  	if err != nil {
   185  		return workload.QueryLoad{}, err
   186  	}
   187  	// Allow a maximum of concurrency+1 connections to the database.
   188  	db.SetMaxOpenConns(w.connFlags.Concurrency + 1)
   189  	db.SetMaxIdleConns(w.connFlags.Concurrency + 1)
   190  
   191  	updateStmt, err := db.Prepare(`
   192  		UPDATE bulkingest
   193  		SET payload = $4
   194  		WHERE a = $1 AND b = $2 AND c = $3
   195  	`)
   196  	if err != nil {
   197  		return workload.QueryLoad{}, err
   198  	}
   199  
   200  	ql := workload.QueryLoad{SQLDatabase: sqlDatabase}
   201  	for i := 0; i < w.connFlags.Concurrency; i++ {
   202  		rng := rand.New(rand.NewSource(w.seed))
   203  		hists := reg.GetHandle()
   204  		pad := make([]byte, w.payloadBytes)
   205  		workerFn := func(ctx context.Context) error {
   206  			a := rng.Intn(w.aCount)
   207  			b := rng.Intn(w.bCount)
   208  			c := rng.Intn(w.cCount)
   209  			randutil.ReadTestdataBytes(rng, pad)
   210  
   211  			start := timeutil.Now()
   212  			res, err := updateStmt.Exec(a, b, c, pad)
   213  			elapsed := timeutil.Since(start)
   214  			hists.Get(`update-payload`).Record(elapsed)
   215  			if err != nil {
   216  				return err
   217  			}
   218  			if affected, err := res.RowsAffected(); err != nil {
   219  				return err
   220  			} else if affected != 1 {
   221  				return errors.Errorf("expected 1 row affected, got %d", affected)
   222  			}
   223  			return nil
   224  		}
   225  		ql.WorkerFns = append(ql.WorkerFns, workerFn)
   226  	}
   227  	return ql, nil
   228  }