github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/tpchvec.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"bufio"
	"bytes"
	"context"
	gosql "database/sql"
	"fmt"
	"math/rand"
	"regexp"
	"runtime"
	"sort"
	"strconv"
	"strings"

	"github.com/cockroachdb/cockroach/pkg/util/binfetcher"
	"github.com/cockroachdb/cockroach/pkg/util/randutil"
	"github.com/cockroachdb/cockroach/pkg/workload/tpch"
	"github.com/cockroachdb/errors"
)

const (
	tpchVecNodeCount = 3
)

type crdbVersion int

const (
	tpchVecVersion19_2 crdbVersion = iota
	tpchVecVersion20_1
	tpchVecVersion20_2
)

func toCRDBVersion(v string) (crdbVersion, error) {
	if strings.HasPrefix(v, "v19.2") {
		return tpchVecVersion19_2, nil
	} else if strings.HasPrefix(v, "v20.1") {
		return tpchVecVersion20_1, nil
	} else if strings.HasPrefix(v, "v20.2") {
		return tpchVecVersion20_2, nil
	} else {
		return 0, errors.Errorf("unrecognized version: %s", v)
	}
}

func vectorizeOptionToSetting(vectorize bool, version crdbVersion) string {
	if !vectorize {
		return "off"
	}
	switch version {
	case tpchVecVersion19_2:
		return "experimental_on"
	default:
		return "on"
	}
}

var (
	// queriesToSkipByVersion is a map keyed by version that contains the query
	// numbers to be skipped for the given version (as well as the reasons why
	// they are skipped).
	queriesToSkipByVersion = map[crdbVersion]map[int]string{
		tpchVecVersion19_2: {
			5:  "can cause OOM",
			7:  "can cause OOM",
			8:  "can cause OOM",
			9:  "can cause OOM",
			19: "can cause OOM",
		},
	}

	// slownessThresholdByVersion contains the threshold at which we fail the
	// test if vec ON is slower than vec OFF, meaning that if
	// vec_on_time > slownessThreshold * vec_off_time, the test is failed.
	// This will help catch any regressions.
	// Note that for the 19.2 version the threshold is higher in order to
	// reduce the noise.
	slownessThresholdByVersion = map[crdbVersion]float64{
		tpchVecVersion19_2: 1.5,
		tpchVecVersion20_1: 1.2,
		tpchVecVersion20_2: 1.2,
	}
)
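
// To illustrate the check driven by slownessThresholdByVersion with made-up
// numbers: on a 20.1 cluster (threshold 1.2), a query whose median runtime is
// 12.5s with vec ON and 10s with vec OFF fails the test, since 12.5 >= 1.2*10.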

var tpchTables = []string{
	"nation", "region", "part", "supplier",
	"partsupp", "customer", "orders", "lineitem",
}

type tpchVecTestCase interface {
	// TODO(asubiotto): Getting the queries we want to run given a version should
	// also be part of this.

	// vectorizeOptions are the vectorize options that each query will be run
	// with.
	vectorizeOptions() []bool
	// numRunsPerQuery is the number of times each tpch query should be run.
	numRunsPerQuery() int
	// preTestRunHook is called before any tpch query is run. Can be used to
	// perform setup.
	preTestRunHook(ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion)
	// postQueryRunHook is called after each tpch query is run with the output
	// and the vectorize mode it was run in.
	postQueryRunHook(t *test, output []byte, vectorized bool)
	// postTestRunHook is called after all tpch queries are run. Can be used to
	// perform teardown or general validation.
	postTestRunHook(t *test, conn *gosql.DB, version crdbVersion)
}

// tpchVecTestCaseBase is a default tpchVecTestCase implementation that can be
// embedded and extended.
type tpchVecTestCaseBase struct{}

func (b tpchVecTestCaseBase) vectorizeOptions() []bool {
	return []bool{true}
}

func (b tpchVecTestCaseBase) numRunsPerQuery() int {
	return 1
}

func (b tpchVecTestCaseBase) preTestRunHook(
	_ context.Context, t *test, _ *cluster, conn *gosql.DB, version crdbVersion,
) {
	if version != tpchVecVersion19_2 {
		t.Status("resetting sql.testing.vectorize.batch_size")
		if _, err := conn.Exec("RESET CLUSTER SETTING sql.testing.vectorize.batch_size"); err != nil {
			t.Fatal(err)
		}
	}
	t.Status("resetting workmem to default")
	if _, err := conn.Exec("RESET CLUSTER SETTING sql.distsql.temp_storage.workmem"); err != nil {
		t.Fatal(err)
	}
}

func (b tpchVecTestCaseBase) postQueryRunHook(_ *test, _ []byte, _ bool) {}

func (b tpchVecTestCaseBase) postTestRunHook(_ *test, _ *gosql.DB, _ crdbVersion) {}

const (
	tpchPerfTestNumRunsPerQuery = 3
	tpchPerfTestVecOnConfigIdx  = 0
	tpchPerfTestVecOffConfigIdx = 1
)

type tpchVecPerfTest struct {
	tpchVecTestCaseBase
	timeByQueryNum []map[int][]float64
}

var _ tpchVecTestCase = &tpchVecPerfTest{}

func newTpchVecPerfTest() *tpchVecPerfTest {
	return &tpchVecPerfTest{
		timeByQueryNum: []map[int][]float64{make(map[int][]float64), make(map[int][]float64)},
	}
}

func (p tpchVecPerfTest) vectorizeOptions() []bool {
	// Since this is a performance test, each query should be run with both
	// vectorize modes.
	return []bool{true, false}
}

func (p tpchVecPerfTest) numRunsPerQuery() int {
	return tpchPerfTestNumRunsPerQuery
}

func (p tpchVecPerfTest) preTestRunHook(
	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
) {
	p.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
	// TODO(yuzefovich): remove this once we figure out the issue with random
	// performance hits on query 7.
	for node := 1; node <= c.spec.NodeCount; node++ {
		nodeConn := c.Conn(ctx, node)
		if _, err := nodeConn.Exec(
			"SELECT crdb_internal.set_vmodule('vectorized_flow=1,spilling_queue=1,row_container=2,hash_row_container=2');",
		); err != nil {
			t.Fatal(err)
		}
	}
}
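
// postQueryRunHook scans the `workload run tpch` output for the reported
// query runtime and records it under the config (vec ON or OFF) that the
// query was run with. The regex below matches lines of roughly the following
// shape, where the numbers are made up for illustration:
//
//	[q1] returned 590 rows after 12.34 seconds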
func (p *tpchVecPerfTest) postQueryRunHook(t *test, output []byte, vectorized bool) {
	configIdx := tpchPerfTestVecOffConfigIdx
	if vectorized {
		configIdx = tpchPerfTestVecOnConfigIdx
	}
	runtimeRegex := regexp.MustCompile(`.*\[q([\d]+)\] returned \d+ rows after ([\d]+\.[\d]+) seconds.*`)
	scanner := bufio.NewScanner(bytes.NewReader(output))
	for scanner.Scan() {
		line := scanner.Bytes()
		match := runtimeRegex.FindSubmatch(line)
		if match != nil {
			queryNum, err := strconv.Atoi(string(match[1]))
			if err != nil {
				t.Fatalf("failed parsing %q as int with %s", match[1], err)
			}
			queryTime, err := strconv.ParseFloat(string(match[2]), 64)
			if err != nil {
				t.Fatalf("failed parsing %q as float with %s", match[2], err)
			}
			p.timeByQueryNum[configIdx][queryNum] = append(p.timeByQueryNum[configIdx][queryNum], queryTime)
		}
	}
}
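
// postTestRunHook compares the median runtime of each query under vec ON
// against the median under vec OFF and fails the test if the ON median
// exceeds the per-version slowness threshold times the OFF median. With
// tpchPerfTestNumRunsPerQuery = 3 recorded times, the median is simply the
// middle element of the sorted slice.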
func (p *tpchVecPerfTest) postTestRunHook(t *test, conn *gosql.DB, version crdbVersion) {
	queriesToSkip := queriesToSkipByVersion[version]
	t.Status("comparing the runtimes (only median values for each query are compared)")
	for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ {
		if _, skipped := queriesToSkip[queryNum]; skipped {
			continue
		}
		findMedian := func(times []float64) float64 {
			sort.Float64s(times)
			return times[len(times)/2]
		}
		vecOnTimes := p.timeByQueryNum[tpchPerfTestVecOnConfigIdx][queryNum]
		vecOffTimes := p.timeByQueryNum[tpchPerfTestVecOffConfigIdx][queryNum]
		if len(vecOnTimes) != tpchPerfTestNumRunsPerQuery {
			t.Fatal(fmt.Sprintf("[q%d] unexpectedly wrong number of run times "+
				"recorded with vec ON config: %v", queryNum, vecOnTimes))
		}
		if len(vecOffTimes) != tpchPerfTestNumRunsPerQuery {
			t.Fatal(fmt.Sprintf("[q%d] unexpectedly wrong number of run times "+
				"recorded with vec OFF config: %v", queryNum, vecOffTimes))
		}
		vecOnTime := findMedian(vecOnTimes)
		vecOffTime := findMedian(vecOffTimes)
		if vecOffTime < vecOnTime {
			t.l.Printf(
				fmt.Sprintf("[q%d] vec OFF was faster by %.2f%%: "+
					"%.2fs ON vs %.2fs OFF --- WARNING\n"+
					"vec ON times: %v\t vec OFF times: %v",
					queryNum, 100*(vecOnTime-vecOffTime)/vecOffTime,
					vecOnTime, vecOffTime, vecOnTimes, vecOffTimes))
		} else {
			t.l.Printf(
				fmt.Sprintf("[q%d] vec ON was faster by %.2f%%: "+
					"%.2fs ON vs %.2fs OFF\n"+
					"vec ON times: %v\t vec OFF times: %v",
					queryNum, 100*(vecOffTime-vecOnTime)/vecOnTime,
					vecOnTime, vecOffTime, vecOnTimes, vecOffTimes))
		}
		if vecOnTime >= slownessThresholdByVersion[version]*vecOffTime {
			// For some reason, the vectorized engine executed the query a lot
			// slower than the row-by-row engine, which is unexpected. In order
			// to understand where the slowness comes from, we will run EXPLAIN
			// ANALYZE of the query with all `vectorize` options
			// tpchPerfTestNumRunsPerQuery times (hoping at least one will
			// "catch" the slowness).
			for _, vectorize := range p.vectorizeOptions() {
				vectorizeSetting := vectorizeOptionToSetting(vectorize, version)
				if _, err := conn.Exec(fmt.Sprintf("SET vectorize=%s;", vectorizeSetting)); err != nil {
					t.Fatal(err)
				}
				for i := 0; i < tpchPerfTestNumRunsPerQuery; i++ {
					rows, err := conn.Query(fmt.Sprintf(
						"EXPLAIN ANALYZE %s;", tpch.QueriesByNumber[queryNum],
					))
					if err != nil {
						t.Fatal(err)
					}
					var (
						automatic bool
						url       string
					)
					rows.Next()
					if err = rows.Scan(&automatic, &url); err != nil {
						t.Fatal(err)
					}
					if err = rows.Close(); err != nil {
						t.Fatal(err)
					}
					t.Status(fmt.Sprintf("EXPLAIN ANALYZE with vectorize=%s url:\n%s", vectorizeSetting, url))
				}
			}
			t.Fatal(fmt.Sprintf(
				"[q%d] vec ON is slower by %.2f%% than vec OFF\n"+
					"vec ON times: %v\nvec OFF times: %v",
				queryNum, 100*(vecOnTime-vecOffTime)/vecOffTime, vecOnTimes, vecOffTimes))
		}
	}
}

type tpchVecDiskTest struct {
	tpchVecTestCaseBase
}

func (d tpchVecDiskTest) preTestRunHook(
	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
) {
	d.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
	// In order to stress the disk spilling of the vectorized engine, we will
	// set the workmem limit to a random value in the range [16KiB, 256KiB).
	rng, _ := randutil.NewPseudoRand()
	workmemInKiB := 16 + rng.Intn(240)
	workmem := fmt.Sprintf("%dKiB", workmemInKiB)
	t.Status(fmt.Sprintf("setting workmem='%s'", workmem))
	if _, err := conn.Exec(fmt.Sprintf("SET CLUSTER SETTING sql.distsql.temp_storage.workmem='%s'", workmem)); err != nil {
		t.Fatal(err)
	}
}

// setSmallBatchSize sets a cluster setting to override the batch size to be in
// the [1, 5) range.
func setSmallBatchSize(t *test, conn *gosql.DB, rng *rand.Rand) {
	batchSize := 1 + rng.Intn(4)
	t.Status(fmt.Sprintf("setting sql.testing.vectorize.batch_size to %d", batchSize))
	if _, err := conn.Exec(fmt.Sprintf("SET CLUSTER SETTING sql.testing.vectorize.batch_size=%d", batchSize)); err != nil {
		t.Fatal(err)
	}
}

type tpchVecSmallBatchSizeTest struct {
	tpchVecTestCaseBase
}

func (b tpchVecSmallBatchSizeTest) preTestRunHook(
	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
) {
	b.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
	rng, _ := randutil.NewPseudoRand()
	setSmallBatchSize(t, conn, rng)
}
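
// baseTestRun runs every TPCH query against the cluster, invoking
// `./workload run tpch` once per (query, vectorize option) pair with
// --max-ops set to the test case's numRunsPerQuery. For example, with the
// tpchVecTestCaseBase defaults the command for query 3 on a 20.1+ cluster
// would look roughly as follows (illustrative, not captured from a run):
//
//	./workload run tpch --concurrency=1 --db=tpch --max-ops=1 --queries=3 --vectorize=on {pgurl:1-3}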
func baseTestRun(
	ctx context.Context, t *test, c *cluster, version crdbVersion, tc tpchVecTestCase,
) {
	firstNode := c.Node(1)
	queriesToSkip := queriesToSkipByVersion[version]
	for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ {
		for _, vectorize := range tc.vectorizeOptions() {
			if reason, skip := queriesToSkip[queryNum]; skip {
				t.Status(fmt.Sprintf("skipping q%d because of %q", queryNum, reason))
				continue
			}
			vectorizeSetting := vectorizeOptionToSetting(vectorize, version)
			cmd := fmt.Sprintf("./workload run tpch --concurrency=1 --db=tpch "+
				"--max-ops=%d --queries=%d --vectorize=%s {pgurl:1-%d}",
				tc.numRunsPerQuery(), queryNum, vectorizeSetting, tpchVecNodeCount)
			workloadOutput, err := c.RunWithBuffer(ctx, t.l, firstNode, cmd)
			t.l.Printf("\n" + string(workloadOutput))
			if err != nil {
				// Note: if you see an error like "exit status 1", it is likely
				// caused by erroneous output from the query.
				t.Fatal(err)
			}
			tc.postQueryRunHook(t, workloadOutput, vectorize)
		}
	}
}

type tpchVecSmithcmpTest struct {
	tpchVecTestCaseBase
}

const tpchVecSmithcmp = "smithcmp"

func (s tpchVecSmithcmpTest) preTestRunHook(
	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
) {
	s.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
	const smithcmpSHA = "a3f41f5ba9273249c5ecfa6348ea8ee3ac4b77e3"
	node := c.Node(1)
	if local && runtime.GOOS != "linux" {
		t.Fatalf("must run on linux os, found %s", runtime.GOOS)
	}
	// This binary has been manually compiled using
	// './build/builder.sh go build ./pkg/cmd/smithcmp' and uploaded to the S3
	// bucket at cockroach/smithcmp. The binary shouldn't change much, so this
	// is acceptable.
	smithcmp, err := binfetcher.Download(ctx, binfetcher.Options{
		Component: tpchVecSmithcmp,
		Binary:    tpchVecSmithcmp,
		Version:   smithcmpSHA,
		GOOS:      "linux",
		GOARCH:    "amd64",
	})
	if err != nil {
		t.Fatal(err)
	}
	c.Put(ctx, smithcmp, "./"+tpchVecSmithcmp, node)
	// To increase test coverage, we will be randomizing the batch size in 50%
	// of the runs.
	rng, _ := randutil.NewPseudoRand()
	if rng.Float64() < 0.5 {
		setSmallBatchSize(t, conn, rng)
	}
}

func smithcmpTestRun(ctx context.Context, t *test, c *cluster, _ crdbVersion, _ tpchVecTestCase) {
	const (
		configFile = `tpchvec_smithcmp.toml`
		configURL  = `https://raw.githubusercontent.com/cockroachdb/cockroach/master/pkg/cmd/roachtest/` + configFile
	)
	firstNode := c.Node(1)
	if err := c.RunE(ctx, firstNode, fmt.Sprintf("curl %s > %s", configURL, configFile)); err != nil {
		t.Fatal(err)
	}
	cmd := fmt.Sprintf("./%s %s", tpchVecSmithcmp, configFile)
	if err := c.RunE(ctx, firstNode, cmd); err != nil {
		t.Fatal(err)
	}
}

func runTPCHVec(
	ctx context.Context,
	t *test,
	c *cluster,
	testCase tpchVecTestCase,
	testRun func(ctx context.Context, t *test, c *cluster, version crdbVersion, tc tpchVecTestCase),
) {
	firstNode := c.Node(1)
	c.Put(ctx, cockroach, "./cockroach", c.All())
	c.Put(ctx, workload, "./workload", firstNode)
	c.Start(ctx, t)

	conn := c.Conn(ctx, 1)
	disableAutoStats(t, conn)
	t.Status("restoring TPCH dataset for Scale Factor 1")
	if err := loadTPCHDataset(ctx, t, c, 1 /* sf */, newMonitor(ctx, c), c.All()); err != nil {
		t.Fatal(err)
	}

	if _, err := conn.Exec("USE tpch;"); err != nil {
		t.Fatal(err)
	}
	scatterTables(t, conn, tpchTables)
	t.Status("waiting for full replication")
	waitForFullReplication(t, conn)
	createStatsFromTables(t, conn, tpchTables)
	versionString, err := fetchCockroachVersion(ctx, c, c.Node(1)[0])
	if err != nil {
		t.Fatal(err)
	}
	version, err := toCRDBVersion(versionString)
	if err != nil {
		t.Fatal(err)
	}

	testCase.preTestRunHook(ctx, t, c, conn, version)
	testRun(ctx, t, c, version, testCase)
	testCase.postTestRunHook(t, conn, version)
}

func registerTPCHVec(r *testRegistry) {
	r.Add(testSpec{
		Name:       "tpchvec/perf",
		Owner:      OwnerSQLExec,
		Cluster:    makeClusterSpec(tpchVecNodeCount),
		MinVersion: "v19.2.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runTPCHVec(ctx, t, c, newTpchVecPerfTest(), baseTestRun)
		},
	})

	r.Add(testSpec{
		Name:    "tpchvec/disk",
		Owner:   OwnerSQLExec,
		Cluster: makeClusterSpec(tpchVecNodeCount),
		// The 19.2 version doesn't have disk spilling or memory monitoring, so
		// there is no point in running this config on that version.
		MinVersion: "v20.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runTPCHVec(ctx, t, c, tpchVecDiskTest{}, baseTestRun)
		},
	})

	r.Add(testSpec{
		Name:    "tpchvec/smallbatchsize",
		Owner:   OwnerSQLExec,
		Cluster: makeClusterSpec(tpchVecNodeCount),
		// The 19.2 version doesn't have the testing cluster setting to change
		// the batch size, so only run this config on versions >= 20.1.0.
		MinVersion: "v20.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runTPCHVec(ctx, t, c, tpchVecSmallBatchSizeTest{}, baseTestRun)
		},
	})

	r.Add(testSpec{
		Name:       "tpchvec/smithcmp",
		Owner:      OwnerSQLExec,
		Cluster:    makeClusterSpec(tpchVecNodeCount),
		MinVersion: "v20.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runTPCHVec(ctx, t, c, tpchVecSmithcmpTest{}, smithcmpTestRun)
		},
	})
}
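
// The following is an illustrative sketch (not part of the registered tests)
// of how a hypothetical new variant would be added: embed tpchVecTestCaseBase,
// override only the hooks it needs, and register a testSpec in
// registerTPCHVec, mirroring tpchVecDiskTest above:
//
//	type tpchVecExampleTest struct {
//		tpchVecTestCaseBase
//	}
//
//	func (e tpchVecExampleTest) preTestRunHook(
//		ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
//	) {
//		e.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
//		// Hypothetical extra setup (e.g. a cluster setting override) goes here.
//	}
//
//	r.Add(testSpec{
//		Name:       "tpchvec/example",
//		Owner:      OwnerSQLExec,
//		Cluster:    makeClusterSpec(tpchVecNodeCount),
//		MinVersion: "v20.1.0",
//		Run: func(ctx context.Context, t *test, c *cluster) {
//			runTPCHVec(ctx, t, c, tpchVecExampleTest{}, baseTestRun)
//		},
//	})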