github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/workload/workload.go (about) 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 // Package workload provides an abstraction for generators of sql query loads 12 // (and requisite initial data) as well as tools for working with these 13 // generators. 14 package workload 15 16 import ( 17 "context" 18 gosql "database/sql" 19 "fmt" 20 "math" 21 "math/bits" 22 "sort" 23 "strings" 24 "sync" 25 "time" 26 27 "github.com/cockroachdb/cockroach/pkg/col/coldata" 28 "github.com/cockroachdb/cockroach/pkg/col/typeconv" 29 "github.com/cockroachdb/cockroach/pkg/sql/types" 30 "github.com/cockroachdb/cockroach/pkg/util/bufalloc" 31 "github.com/cockroachdb/cockroach/pkg/workload/histogram" 32 "github.com/cockroachdb/errors" 33 "github.com/spf13/pflag" 34 ) 35 36 // Generator represents one or more sql query loads and associated initial data. 37 type Generator interface { 38 // Meta returns meta information about this generator, including a name, 39 // description, and a function to create instances of it. 40 Meta() Meta 41 42 // Tables returns the set of tables for this generator, including schemas 43 // and initial data. 44 Tables() []Table 45 } 46 47 // FlagMeta is metadata about a workload flag. 48 type FlagMeta struct { 49 // RuntimeOnly may be set to true only if the corresponding flag has no 50 // impact on the behavior of any Tables in this workload. 51 RuntimeOnly bool 52 // CheckConsistencyOnly is expected to be true only if the corresponding 53 // flag only has an effect on the CheckConsistency hook. 54 CheckConsistencyOnly bool 55 } 56 57 // Flags is a container for flags and associated metadata. 58 type Flags struct { 59 *pflag.FlagSet 60 // Meta is keyed by flag name and may be nil if no metadata is needed. 61 Meta map[string]FlagMeta 62 } 63 64 // Flagser returns the flags this Generator is configured with. Any randomness 65 // in the Generator must be deterministic from these options so that table data 66 // initialization, query work, etc can be distributed by sending only these 67 // flags. 68 type Flagser interface { 69 Generator 70 Flags() Flags 71 } 72 73 // Opser returns the work functions for this generator. The tables are required 74 // to have been created and initialized before running these. 75 type Opser interface { 76 Generator 77 Ops(urls []string, reg *histogram.Registry) (QueryLoad, error) 78 } 79 80 // Hookser returns any hooks associated with the generator. 81 type Hookser interface { 82 Generator 83 Hooks() Hooks 84 } 85 86 // Hooks stores functions to be called at points in the workload lifecycle. 87 type Hooks struct { 88 // Validate is called after workload flags are parsed. It should return an 89 // error if the workload configuration is invalid. 90 Validate func() error 91 // PreLoad is called after workload tables are created and before workload 92 // data is loaded. It is not called when storing or loading a fixture. 93 // Implementations should be idempotent. 94 // 95 // TODO(dan): Deprecate the PreLoad hook, it doesn't play well with fixtures. 96 // It's only used in practice for zone configs, so it should be reasonably 97 // straightforward to make zone configs first class citizens of 98 // workload.Table. 99 PreLoad func(*gosql.DB) error 100 // PostLoad is called after workload tables are created workload data is 101 // loaded. It called after restoring a fixture. This, for example, is where 102 // creating foreign keys should go. Implementations should be idempotent. 103 PostLoad func(*gosql.DB) error 104 // PostRun is called after workload run has ended, with the duration of the 105 // run. This is where any post-run special printing or validation can be done. 106 PostRun func(time.Duration) error 107 // CheckConsistency is called to run generator-specific consistency checks. 108 // These are expected to pass after the initial data load as well as after 109 // running queryload. 110 CheckConsistency func(context.Context, *gosql.DB) error 111 // Partition is used to run a partitioning step on the data created by the workload. 112 // TODO (rohany): migrate existing partitioning steps (such as tpcc's) into here. 113 Partition func(*gosql.DB) error 114 } 115 116 // Meta is used to register a Generator at init time and holds meta information 117 // about this generator, including a name, description, and a function to create 118 // instances of it. 119 type Meta struct { 120 // Name is a unique name for this generator. 121 Name string 122 // Description is a short description of this generator. 123 Description string 124 // Details optionally allows specifying longer, more in-depth usage details. 125 Details string 126 // Version is a semantic version for this generator. It should be bumped 127 // whenever InitialRowFn or InitialRowCount change for any of the tables. 128 Version string 129 // PublicFacing indicates that this workload is also intended for use by 130 // users doing their own testing and evaluations. This allows hiding workloads 131 // that are only expected to be used in CockroachDB's internal development to 132 // avoid confusion. Workloads setting this to true should pay added attention 133 // to their documentation and help-text. 134 PublicFacing bool 135 // New returns an unconfigured instance of this generator. 136 New func() Generator 137 } 138 139 // Table represents a single table in a Generator. Included is a name, schema, 140 // and initial data. 141 type Table struct { 142 // Name is the unqualified table name, pre-escaped for use directly in SQL. 143 Name string 144 // Schema is the SQL formatted schema for this table, with the `CREATE TABLE 145 // <name>` prefix omitted. 146 Schema string 147 // InitialRows is the initial rows that will be present in the table after 148 // setup is completed. Note that the default value of NumBatches (zero) is 149 // special - such a Table will be skipped during `init`; non-zero NumBatches 150 // with a nil FillBatch function will trigger an error during `init`. 151 InitialRows BatchedTuples 152 // Splits is the initial splits that will be present in the table after 153 // setup is completed. 154 Splits BatchedTuples 155 // Stats is the pre-calculated set of statistics on this table. They can be 156 // injected using `ALTER TABLE <name> INJECT STATISTICS ...`. 157 Stats []JSONStatistic 158 } 159 160 // BatchedTuples is a generic generator of tuples (SQL rows, PKs to split at, 161 // etc). Tuples are generated in batches of arbitrary size. Each batch has an 162 // index in `[0,NumBatches)` and a batch can be generated given only its index. 163 type BatchedTuples struct { 164 // NumBatches is the number of batches of tuples. 165 NumBatches int 166 // FillBatch is a function to deterministically compute a columnar-batch of 167 // tuples given its index. 168 // 169 // To save allocations, the Vecs in the passed Batch are reused when possible, 170 // so the results of this call are invalidated the next time the same Batch is 171 // passed to FillBatch. Ditto the ByteAllocator, which can be reset in between 172 // calls. If a caller needs the Batch and its contents to be long lived, 173 // simply pass a new Batch to each call and don't reset the ByteAllocator. 174 FillBatch func(int, coldata.Batch, *bufalloc.ByteAllocator) 175 } 176 177 // Tuples is like TypedTuples except that it tries to guess the type of each 178 // datum. However, if the function ever returns nil for one of the datums, you 179 // need to use TypedTuples instead and specify the types. 180 func Tuples(count int, fn func(int) []interface{}) BatchedTuples { 181 return TypedTuples(count, nil /* typs */, fn) 182 } 183 184 const ( 185 // timestampOutputFormat is used to output all timestamps. 186 timestampOutputFormat = "2006-01-02 15:04:05.999999-07:00" 187 ) 188 189 // TypedTuples returns a BatchedTuples where each batch has size 1. It's 190 // intended to be easier to use than directly specifying a BatchedTuples, but 191 // the tradeoff is some bit of performance. If typs is nil, an attempt is 192 // made to infer them. 193 func TypedTuples(count int, typs []*types.T, fn func(int) []interface{}) BatchedTuples { 194 // The FillBatch we create has to be concurrency safe, so we can't let it do 195 // the one-time initialization of typs without this protection. 196 var typesOnce sync.Once 197 198 t := BatchedTuples{ 199 NumBatches: count, 200 } 201 if fn != nil { 202 t.FillBatch = func(batchIdx int, cb coldata.Batch, _ *bufalloc.ByteAllocator) { 203 row := fn(batchIdx) 204 205 typesOnce.Do(func() { 206 if typs == nil { 207 typs = make([]*types.T, len(row)) 208 for i, datum := range row { 209 if datum == nil { 210 panic(fmt.Sprintf( 211 `can't determine type of nil column; call TypedTuples directly: %v`, row)) 212 } else { 213 switch datum.(type) { 214 case time.Time: 215 typs[i] = types.Bytes 216 default: 217 typs[i] = typeconv.UnsafeFromGoType(datum) 218 } 219 } 220 } 221 } 222 }) 223 224 cb.Reset(typs, 1, coldata.StandardColumnFactory) 225 for colIdx, col := range cb.ColVecs() { 226 switch d := row[colIdx].(type) { 227 case nil: 228 col.Nulls().SetNull(0) 229 case bool: 230 col.Bool()[0] = d 231 case int: 232 col.Int64()[0] = int64(d) 233 case float64: 234 col.Float64()[0] = d 235 case string: 236 col.Bytes().Set(0, []byte(d)) 237 case []byte: 238 col.Bytes().Set(0, d) 239 case time.Time: 240 col.Bytes().Set(0, []byte(d.Round(time.Microsecond).UTC().Format(timestampOutputFormat))) 241 default: 242 panic(fmt.Sprintf(`unhandled datum type %T`, d)) 243 } 244 } 245 } 246 } 247 return t 248 } 249 250 // BatchRows is a function to deterministically compute a row-batch of tuples 251 // given its index. BatchRows doesn't attempt any reuse and so is allocation 252 // heavy. In performance-critical code, FillBatch should be used directly, 253 // instead. 254 func (b BatchedTuples) BatchRows(batchIdx int) [][]interface{} { 255 cb := coldata.NewMemBatchWithSize(nil, 0, coldata.StandardColumnFactory) 256 var a bufalloc.ByteAllocator 257 b.FillBatch(batchIdx, cb, &a) 258 return ColBatchToRows(cb) 259 } 260 261 // ColBatchToRows materializes the columnar data in a coldata.Batch into rows. 262 func ColBatchToRows(cb coldata.Batch) [][]interface{} { 263 numRows, numCols := cb.Length(), cb.Width() 264 // Allocate all the []interface{} row slices in one go. 265 datums := make([]interface{}, numRows*numCols) 266 for colIdx, col := range cb.ColVecs() { 267 nulls := col.Nulls() 268 switch col.CanonicalTypeFamily() { 269 case types.BoolFamily: 270 for rowIdx, datum := range col.Bool()[:numRows] { 271 if !nulls.NullAt(rowIdx) { 272 datums[rowIdx*numCols+colIdx] = datum 273 } 274 } 275 case types.IntFamily: 276 switch col.Type().Width() { 277 case 0, 64: 278 for rowIdx, datum := range col.Int64()[:numRows] { 279 if !nulls.NullAt(rowIdx) { 280 datums[rowIdx*numCols+colIdx] = datum 281 } 282 } 283 case 16: 284 for rowIdx, datum := range col.Int16()[:numRows] { 285 if !nulls.NullAt(rowIdx) { 286 datums[rowIdx*numCols+colIdx] = datum 287 } 288 } 289 default: 290 panic(fmt.Sprintf(`unhandled type %s`, col.Type())) 291 } 292 case types.FloatFamily: 293 for rowIdx, datum := range col.Float64()[:numRows] { 294 if !nulls.NullAt(rowIdx) { 295 datums[rowIdx*numCols+colIdx] = datum 296 } 297 } 298 case types.BytesFamily: 299 // HACK: workload's Table schemas are SQL schemas, but the initial data is 300 // returned as a coldata.Batch, which has a more limited set of types. 301 // (Or, in the case of simple workloads that return a []interface{}, it's 302 // roundtripped through coldata.Batch by the `Tuples` helper.) 303 // 304 // Notably, this means a SQL STRING column is represented the same as a 305 // BYTES column (ditto UUID, etc). We could get the fidelity back by 306 // parsing the SQL schema, which in fact we do in 307 // `importccl.makeDatumFromColOffset`. At the moment, the set of types 308 // used in workloads is limited enough that the users of initial 309 // data/splits are okay with the fidelity loss. So, to avoid the 310 // complexity and the undesirable pkg/sql/parser dep, we simply treat them 311 // all as bytes and let the caller deal with the ambiguity. 312 colBytes := col.Bytes() 313 for rowIdx := 0; rowIdx < numRows; rowIdx++ { 314 if !nulls.NullAt(rowIdx) { 315 datums[rowIdx*numCols+colIdx] = colBytes.Get(rowIdx) 316 } 317 } 318 default: 319 panic(fmt.Sprintf(`unhandled type %s`, col.Type())) 320 } 321 } 322 rows := make([][]interface{}, numRows) 323 for rowIdx := 0; rowIdx < numRows; rowIdx++ { 324 rows[rowIdx] = datums[rowIdx*numCols : (rowIdx+1)*numCols] 325 } 326 return rows 327 } 328 329 // InitialDataLoader loads the initial data for all tables in a workload. It 330 // returns a measure of how many bytes were loaded. 331 // 332 // TODO(dan): It would be lovely if the number of bytes loaded was comparable 333 // between implementations but this is sadly not the case right now. 334 type InitialDataLoader interface { 335 InitialDataLoad(context.Context, *gosql.DB, Generator) (int64, error) 336 } 337 338 // ImportDataLoader is a hook for binaries that include CCL code to inject an 339 // IMPORT-based InitialDataLoader implementation. 340 var ImportDataLoader InitialDataLoader = requiresCCLBinaryDataLoader(`IMPORT`) 341 342 type requiresCCLBinaryDataLoader string 343 344 func (l requiresCCLBinaryDataLoader) InitialDataLoad( 345 context.Context, *gosql.DB, Generator, 346 ) (int64, error) { 347 return 0, errors.Errorf(`loading initial data with %s requires a CCL binary`, l) 348 } 349 350 // QueryLoad represents some SQL query workload performable on a database 351 // initialized with the requisite tables. 352 type QueryLoad struct { 353 SQLDatabase string 354 355 // WorkerFns is one function per worker. It is to be called once per unit of 356 // work to be done. 357 WorkerFns []func(context.Context) error 358 359 // Close, if set, is called before the process exits, giving workloads a 360 // chance to print some information. 361 // It's guaranteed that the ctx passed to WorkerFns (if they're still running) 362 // has been canceled by the time this is called (so an implementer can 363 // synchronize with the WorkerFns if need be). 364 Close func(context.Context) 365 366 // ResultHist is the name of the NamedHistogram to use for the benchmark 367 // formatted results output at the end of `./workload run`. The empty string 368 // will use the sum of all histograms. 369 // 370 // TODO(dan): This will go away once more of run.go moves inside Operations. 371 ResultHist string 372 } 373 374 var registered = make(map[string]Meta) 375 376 // Register is a hook for init-time registration of Generator implementations. 377 // This allows only the necessary generators to be compiled into a given binary. 378 func Register(m Meta) { 379 if _, ok := registered[m.Name]; ok { 380 panic(m.Name + " is already registered") 381 } 382 registered[m.Name] = m 383 } 384 385 // Get returns the registered Generator with the given name, if it exists. 386 func Get(name string) (Meta, error) { 387 m, ok := registered[name] 388 if !ok { 389 return Meta{}, errors.Errorf("unknown generator: %s", name) 390 } 391 return m, nil 392 } 393 394 // Registered returns all registered Generators. 395 func Registered() []Meta { 396 gens := make([]Meta, 0, len(registered)) 397 for _, gen := range registered { 398 gens = append(gens, gen) 399 } 400 sort.Slice(gens, func(i, j int) bool { return strings.Compare(gens[i].Name, gens[j].Name) < 0 }) 401 return gens 402 } 403 404 // FromFlags returns a new validated generator with the given flags. If anything 405 // goes wrong, it panics. FromFlags is intended for use with unit test helpers 406 // in individual generators, see its callers for examples. 407 func FromFlags(meta Meta, flags ...string) Generator { 408 gen := meta.New() 409 if len(flags) > 0 { 410 f, ok := gen.(Flagser) 411 if !ok { 412 panic(fmt.Sprintf(`generator %s does not accept flags: %v`, meta.Name, flags)) 413 } 414 flagsStruct := f.Flags() 415 if err := flagsStruct.Parse(flags); err != nil { 416 panic(fmt.Sprintf(`generator %s parsing flags %v: %v`, meta.Name, flags, err)) 417 } 418 } 419 if h, ok := gen.(Hookser); ok { 420 if err := h.Hooks().Validate(); err != nil { 421 panic(fmt.Sprintf(`generator %s flags %s did not validate: %v`, meta.Name, flags, err)) 422 } 423 } 424 return gen 425 } 426 427 // ApproxDatumSize returns the canonical size of a datum as returned from a call 428 // to `Table.InitialRowFn`. NB: These datums end up getting serialized in 429 // different ways, which means there's no one size that will be correct for all 430 // of them. 431 func ApproxDatumSize(x interface{}) int64 { 432 if x == nil { 433 return 0 434 } 435 switch t := x.(type) { 436 case bool: 437 return 1 438 case int: 439 if t < 0 { 440 t = -t 441 } 442 // This and float64 are `+8` so a `0` results in `1`. This function is 443 // used to batch things by size and table of all `0`s should not get 444 // infinite size batches. 445 return int64(bits.Len(uint(t))+8) / 8 446 case int64: 447 return int64(bits.Len64(uint64(t))+8) / 8 448 case int16: 449 return int64(bits.Len64(uint64(t))+8) / 8 450 case uint64: 451 return int64(bits.Len64(t)+8) / 8 452 case float64: 453 return int64(bits.Len64(math.Float64bits(t))+8) / 8 454 case string: 455 return int64(len(t)) 456 case []byte: 457 return int64(len(t)) 458 case time.Time: 459 return 12 460 default: 461 panic(fmt.Sprintf("unsupported type %T: %v", x, x)) 462 } 463 }