github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/workload/bulkingest/bulkingest.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

/*
Package bulkingest defines a workload that is intended to stress some edge
cases in our bulk-ingestion infrastructure.

In both IMPORT and indexing, many readers scan through the source data (i.e.
CSV files or PK rows, respectively) and produce KVs to be ingested. However, a
given range of that source data could produce KVs anywhere in the keyspace --
in some schemas or workloads the produced KVs preserve the ordering of the
source data, while in others they are random and uniformly distributed across
the keyspace. Additionally, both processes often include concurrent producers,
each scanning its own input files or ranges of a table, and depending on the
distribution, those concurrent producers may all emit different keys or may
all emit similar keys at the same time, etc.

This workload is intended to produce testdata that emphasizes these cases. The
multi-column PK is intended to make it easy to independently control the prefix
of keys. Adding an index on the same columns with the columns reordered can
then control the flow of keys between prefixes, stressing any buffering,
sorting or other steps in the middle. This can be particularly interesting when
concurrent producers are a factor, as the distribution (or lack thereof) of
their output prefixes at a given moment can cause hotspots.

The workload's schema is a table with columns a, b, and c plus a padding
payload string, with the primary key being (a, b, c).

Creating indexes on the different columns in this schema can then trigger
different distributions of produced index KVs -- e.g. an index on (b, c) would
see each range of PK data produce tightly grouped output that overlaps with
the output of the ranges for all other values of a.

The workload's main parameters are the number of distinct values of a, b, and
c. Initial data batches each correspond to one a/b pair containing c rows. By
default, batches are ordered by a then b (a=1/b=1, a=1/b=2, a=1/b=3, ...),
though this can optionally be inverted (a=1/b=1, a=2/b=1, a=3/b=1, ...).
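For reference, with the default index-b-c-a flag the resulting schema is
equivalent to the following sketch (the CREATE TABLE wrapper and table name
are supplied by the workload framework; only the parenthesized body is
defined in this file):

	CREATE TABLE bulkingest (
		a INT,
		b INT,
		c INT,
		payload STRING,
		PRIMARY KEY (a, b, c),
		INDEX (b, c, a) STORING (payload)
	)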
*/
package bulkingest

import (
	"context"
	gosql "database/sql"
	"math/rand"
	"strings"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
	"github.com/cockroachdb/cockroach/pkg/util/randutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/workload"
	"github.com/cockroachdb/cockroach/pkg/workload/histogram"
	"github.com/cockroachdb/errors"
	"github.com/spf13/pflag"
)

const (
	bulkingestSchemaPrefix = `(
		a INT,
		b INT,
		c INT,
		payload STRING,
		PRIMARY KEY (a, b, c)`

	indexOnBCA = ",\n INDEX (b, c, a) STORING (payload)"

	defaultPayloadBytes = 100
)

type bulkingest struct {
	flags     workload.Flags
	connFlags *workload.ConnFlags

	seed                                 int64
	aCount, bCount, cCount, payloadBytes int

	generateBsFirst bool
	indexBCA        bool
}

func init() {
	workload.Register(bulkingestMeta)
}

var bulkingestMeta = workload.Meta{
	Name:        `bulkingest`,
	Description: `bulkingest testdata is designed to produce a skewed distribution of KVs when ingested (in initial import or during later indexing)`,
	Version:     `1.0.0`,
	New: func() workload.Generator {
		g := &bulkingest{}
		g.flags.FlagSet = pflag.NewFlagSet(`bulkingest`, pflag.ContinueOnError)
		g.flags.Int64Var(&g.seed, `seed`, 1, `Key hash seed.`)
		g.flags.IntVar(&g.aCount, `a`, 10, `number of values of A (i.e. pk prefix)`)
		g.flags.IntVar(&g.bCount, `b`, 10, `number of values of B (i.e. idx prefix)`)
		g.flags.IntVar(&g.cCount, `c`, 1000, `number of values of C (i.e. rows per A/B pair)`)
		g.flags.BoolVar(&g.generateBsFirst, `batches-by-b`, false, `generate all B batches for given A first`)
		g.flags.BoolVar(&g.indexBCA, `index-b-c-a`, true, `include an index on (B, C, A)`)
		g.flags.IntVar(&g.payloadBytes, `payload-bytes`, defaultPayloadBytes, `Size of the payload field in each row.`)
		g.connFlags = workload.NewConnFlags(&g.flags)
		return g
	},
}

// Meta implements the Generator interface.
func (*bulkingest) Meta() workload.Meta { return bulkingestMeta }

// Flags implements the Flagser interface.
func (w *bulkingest) Flags() workload.Flags { return w.flags }

// Hooks implements the Hookser interface.
func (w *bulkingest) Hooks() workload.Hooks {
	return workload.Hooks{}
}

// Tables implements the Generator interface.
func (w *bulkingest) Tables() []workload.Table {
	schema := bulkingestSchemaPrefix
	if w.indexBCA {
		schema += indexOnBCA
	}
	schema += ")"

	var bulkingestTypes = []*types.T{
		types.Int,
		types.Int,
		types.Int,
		types.Bytes,
	}

	table := workload.Table{
		Name:   `bulkingest`,
		Schema: schema,
		InitialRows: workload.BatchedTuples{
			NumBatches: w.aCount * w.bCount,
			FillBatch: func(ab int, cb coldata.Batch, alloc *bufalloc.ByteAllocator) {
				a := ab / w.bCount
				b := ab % w.bCount
				if w.generateBsFirst {
					b = ab / w.aCount
					a = ab % w.aCount
				}

				cb.Reset(bulkingestTypes, w.cCount, coldata.StandardColumnFactory)
				aCol := cb.ColVec(0).Int64()
				bCol := cb.ColVec(1).Int64()
				cCol := cb.ColVec(2).Int64()
				payloadCol := cb.ColVec(3).Bytes()

				rng := rand.New(rand.NewSource(w.seed + int64(ab)))
				var payload []byte
				payload, *alloc = alloc.Alloc(w.cCount*w.payloadBytes, 0 /* extraCap */)
				randutil.ReadTestdataBytes(rng, payload)
				payloadCol.Reset()
				for rowIdx := 0; rowIdx < w.cCount; rowIdx++ {
					c := rowIdx
					off := c * w.payloadBytes
					aCol[rowIdx] = int64(a)
					bCol[rowIdx] = int64(b)
					cCol[rowIdx] = int64(c)
					payloadCol.Set(rowIdx, payload[off:off+w.payloadBytes])
				}
			},
		},
	}
	return []workload.Table{table}
}

// Ops implements the Opser interface.
func (w *bulkingest) Ops(urls []string, reg *histogram.Registry) (workload.QueryLoad, error) {
	sqlDatabase, err := workload.SanitizeUrls(w, w.connFlags.DBOverride, urls)
	if err != nil {
		return workload.QueryLoad{}, err
	}
	db, err := gosql.Open(`cockroach`, strings.Join(urls, ` `))
	if err != nil {
		return workload.QueryLoad{}, err
	}
	// Allow a maximum of concurrency+1 connections to the database.
	db.SetMaxOpenConns(w.connFlags.Concurrency + 1)
	db.SetMaxIdleConns(w.connFlags.Concurrency + 1)

	updateStmt, err := db.Prepare(`
		UPDATE bulkingest
		SET payload = $4
		WHERE a = $1 AND b = $2 AND c = $3
	`)
	if err != nil {
		return workload.QueryLoad{}, err
	}

	ql := workload.QueryLoad{SQLDatabase: sqlDatabase}
	for i := 0; i < w.connFlags.Concurrency; i++ {
		rng := rand.New(rand.NewSource(w.seed))
		hists := reg.GetHandle()
		pad := make([]byte, w.payloadBytes)
		workerFn := func(ctx context.Context) error {
			a := rng.Intn(w.aCount)
			b := rng.Intn(w.bCount)
			c := rng.Intn(w.cCount)
			randutil.ReadTestdataBytes(rng, pad)

			start := timeutil.Now()
			res, err := updateStmt.Exec(a, b, c, pad)
			elapsed := timeutil.Since(start)
			hists.Get(`update-payload`).Record(elapsed)
			if err != nil {
				return err
			}
			if affected, err := res.RowsAffected(); err != nil {
				return err
			} else if affected != 1 {
				return errors.Errorf("expected 1 row affected, got %d", affected)
			}
			return nil
		}
		ql.WorkerFns = append(ql.WorkerFns, workerFn)
	}
	return ql, nil
}
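// batchPair is a hypothetical helper, not part of the original file; it is
// included only to illustrate the batch-index-to-(a, b) mapping performed
// inline by FillBatch above. By default the b value varies fastest, so all
// batches for a given a are generated consecutively; with batches-by-b the
// iteration is inverted and consecutive batches share a b value instead.
func batchPair(ab, aCount, bCount int, bsFirst bool) (a, b int) {
	if bsFirst {
		// Inverted order: a=1/b=1, a=2/b=1, a=3/b=1, ...
		return ab % aCount, ab / aCount
	}
	// Default order: a=1/b=1, a=1/b=2, a=1/b=3, ...
	return ab / bCount, ab % bCount
}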