github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/tpchvec.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"context"
    17  	gosql "database/sql"
    18  	"fmt"
    19  	"math/rand"
    20  	"regexp"
    21  	"runtime"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/util/binfetcher"
    27  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    28  	"github.com/cockroachdb/cockroach/pkg/workload/tpch"
    29  	"github.com/cockroachdb/errors"
    30  )
    31  
    32  const (
    33  	tpchVecNodeCount = 3
    34  )
    35  
    36  type crdbVersion int
    37  
    38  const (
    39  	tpchVecVersion19_2 crdbVersion = iota
    40  	tpchVecVersion20_1
    41  	tpchVecVersion20_2
    42  )
    43  
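        // toCRDBVersion maps a cockroach version string (e.g. "v19.2.1" or
        // "v20.1.3") to the corresponding crdbVersion constant; unrecognized
        // versions result in an error.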
    44  func toCRDBVersion(v string) (crdbVersion, error) {
    45  	if strings.HasPrefix(v, "v19.2") {
    46  		return tpchVecVersion19_2, nil
    47  	} else if strings.HasPrefix(v, "v20.1") {
    48  		return tpchVecVersion20_1, nil
    49  	} else if strings.HasPrefix(v, "v20.2") {
    50  		return tpchVecVersion20_2, nil
    51  	} else {
    52  		return 0, errors.Errorf("unrecognized version: %s", v)
    53  	}
    54  }
    55  
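        // vectorizeOptionToSetting returns the value of the vectorize setting that
        // corresponds to the given vectorize option: "off" when vectorize is false,
        // "experimental_on" on 19.2, and "on" on later versions.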
    56  func vectorizeOptionToSetting(vectorize bool, version crdbVersion) string {
    57  	if !vectorize {
    58  		return "off"
    59  	}
    60  	switch version {
    61  	case tpchVecVersion19_2:
    62  		return "experimental_on"
    63  	default:
    64  		return "on"
    65  	}
    66  }
    67  
    68  var (
    69  	// queriesToSkipByVersion maps a version to the query numbers that should
    70  	// be skipped on that version, along with the reasons for skipping them.
    71  	queriesToSkipByVersion = map[crdbVersion]map[int]string{
    72  		tpchVecVersion19_2: {
    73  			5:  "can cause OOM",
    74  			7:  "can cause OOM",
    75  			8:  "can cause OOM",
    76  			9:  "can cause OOM",
    77  			19: "can cause OOM",
    78  		},
    79  	}
    80  
    81  	// slownessThresholdByVersion contains the per-version threshold at which
    82  	// we fail the test if vec ON is slower than vec OFF, meaning that if
    83  	// vec_on_time > slownessThreshold * vec_off_time, the test fails. This
    84  	// helps catch regressions.
    85  	// Note that for the 19.2 version the threshold is higher in order to
    86  	// reduce the noise.
    87  	slownessThresholdByVersion = map[crdbVersion]float64{
    88  		tpchVecVersion19_2: 1.5,
    89  		tpchVecVersion20_1: 1.2,
    90  		tpchVecVersion20_2: 1.2,
    91  	}
    92  )
    93  
    94  var tpchTables = []string{
    95  	"nation", "region", "part", "supplier",
    96  	"partsupp", "customer", "orders", "lineitem",
    97  }
    98  
    99  type tpchVecTestCase interface {
   100  	// TODO(asubiotto): Getting the queries we want to run given a version should
   101  	//  also be part of this.
   102  	// vectorizeOptions are the vectorize options that each query will be run
   103  	// with.
   104  	vectorizeOptions() []bool
   105  	// numRunsPerQuery is the number of times each tpch query should be run.
   106  	numRunsPerQuery() int
   107  	// preTestRunHook is called before any tpch query is run. Can be used to
   108  	// perform setup.
   109  	preTestRunHook(ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion)
   110  	// postQueryRunHook is called after each tpch query is run, with the query's
   111  	// output and the vectorize mode it was run in.
   112  	postQueryRunHook(t *test, output []byte, vectorized bool)
   113  	// postTestRunHook is called after all tpch queries are run. Can be used to
   114  	// perform teardown or general validation.
   115  	postTestRunHook(t *test, conn *gosql.DB, version crdbVersion)
   116  }
   117  
   118  // tpchVecTestCaseBase is a default tpchVecTestCase implementation that can be
   119  // embedded and extended.
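        // The concrete test cases below (e.g. tpchVecPerfTest and tpchVecDiskTest)
        // embed it and override only the hooks they need.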
   120  type tpchVecTestCaseBase struct{}
   121  
   122  func (b tpchVecTestCaseBase) vectorizeOptions() []bool {
   123  	return []bool{true}
   124  }
   125  
   126  func (b tpchVecTestCaseBase) numRunsPerQuery() int {
   127  	return 1
   128  }
   129  
   130  func (b tpchVecTestCaseBase) preTestRunHook(
   131  	_ context.Context, t *test, _ *cluster, conn *gosql.DB, version crdbVersion,
   132  ) {
   133  	if version != tpchVecVersion19_2 {
   134  		t.Status("resetting sql.testing.vectorize.batch_size")
   135  		if _, err := conn.Exec("RESET CLUSTER SETTING sql.testing.vectorize.batch_size"); err != nil {
   136  			t.Fatal(err)
   137  		}
   138  	}
   139  	t.Status("resetting workmem to default")
   140  	if _, err := conn.Exec("RESET CLUSTER SETTING sql.distsql.temp_storage.workmem"); err != nil {
   141  		t.Fatal(err)
   142  	}
   143  }
   144  
   145  func (b tpchVecTestCaseBase) postQueryRunHook(_ *test, _ []byte, _ bool) {}
   146  
   147  func (b tpchVecTestCaseBase) postTestRunHook(_ *test, _ *gosql.DB, _ crdbVersion) {}
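
        // A new configuration can be added by embedding tpchVecTestCaseBase and
        // overriding only the hooks it needs. For example (an illustrative sketch
        // only; tpchVecExampleTest is a made-up name, not a config in this file):
        //
        //	type tpchVecExampleTest struct {
        //		tpchVecTestCaseBase
        //	}
        //
        //	func (e tpchVecExampleTest) preTestRunHook(
        //		ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
        //	) {
        //		e.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
        //		// Per-config setup (e.g. cluster settings) goes here.
        //	}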
   148  
   149  const (
   150  	tpchPerfTestNumRunsPerQuery = 3
   151  	tpchPerfTestVecOnConfigIdx  = 0
   152  	tpchPerfTestVecOffConfigIdx = 1
   153  )
   154  
   155  type tpchVecPerfTest struct {
   156  	tpchVecTestCaseBase
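        	// timeByQueryNum[configIdx][queryNum] accumulates the runtime in seconds
        	// of each run of the query under the given config (vec ON at index
        	// tpchPerfTestVecOnConfigIdx, vec OFF at tpchPerfTestVecOffConfigIdx).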
   157  	timeByQueryNum []map[int][]float64
   158  }
   159  
   160  var _ tpchVecTestCase = &tpchVecPerfTest{}
   161  
   162  func newTpchVecPerfTest() *tpchVecPerfTest {
   163  	return &tpchVecPerfTest{
   164  		timeByQueryNum: []map[int][]float64{make(map[int][]float64), make(map[int][]float64)},
   165  	}
   166  }
   167  
   168  func (p tpchVecPerfTest) vectorizeOptions() []bool {
   169  	// Since this is a performance test, each query should be run with both
   170  	// vectorize modes.
   171  	return []bool{true, false}
   172  }
   173  
   174  func (p tpchVecPerfTest) numRunsPerQuery() int {
   175  	return tpchPerfTestNumRunsPerQuery
   176  }
   177  
   178  func (p tpchVecPerfTest) preTestRunHook(
   179  	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
   180  ) {
   181  	p.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
   182  	// TODO(yuzefovich): remove this once we figure out the issue with random
   183  	// performance hits on query 7.
   184  	for node := 1; node <= c.spec.NodeCount; node++ {
   185  		nodeConn := c.Conn(ctx, node)
   186  		if _, err := nodeConn.Exec(
   187  			"SELECT crdb_internal.set_vmodule('vectorized_flow=1,spilling_queue=1,row_container=2,hash_row_container=2');",
   188  		); err != nil {
   189  			t.Fatal(err)
   190  		}
   191  	}
   192  }
   193  
   194  func (p *tpchVecPerfTest) postQueryRunHook(t *test, output []byte, vectorized bool) {
   195  	configIdx := tpchPerfTestVecOffConfigIdx
   196  	if vectorized {
   197  		configIdx = tpchPerfTestVecOnConfigIdx
   198  	}
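        	// Each per-query run of the workload is expected to print a line like
        	// "[q3] returned 10 rows after 1.23 seconds" (the numbers shown here are
        	// made up); extract the query number and the runtime in seconds from it.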
   199  	runtimeRegex := regexp.MustCompile(`.*\[q([\d]+)\] returned \d+ rows after ([\d]+\.[\d]+) seconds.*`)
   200  	scanner := bufio.NewScanner(bytes.NewReader(output))
   201  	for scanner.Scan() {
   202  		line := scanner.Bytes()
   203  		match := runtimeRegex.FindSubmatch(line)
   204  		if match != nil {
   205  			queryNum, err := strconv.Atoi(string(match[1]))
   206  			if err != nil {
   207  				t.Fatalf("failed parsing %q as int with %s", match[1], err)
   208  			}
   209  			queryTime, err := strconv.ParseFloat(string(match[2]), 64)
   210  			if err != nil {
   211  				t.Fatalf("failed parsing %q as float with %s", match[2], err)
   212  			}
   213  			p.timeByQueryNum[configIdx][queryNum] = append(p.timeByQueryNum[configIdx][queryNum], queryTime)
   214  		}
   215  	}
   216  }
   217  
   218  func (p *tpchVecPerfTest) postTestRunHook(t *test, conn *gosql.DB, version crdbVersion) {
   219  	queriesToSkip := queriesToSkipByVersion[version]
   220  	t.Status("comparing the runtimes (only median values for each query are compared)")
   221  	for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ {
   222  		if _, skipped := queriesToSkip[queryNum]; skipped {
   223  			continue
   224  		}
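        		// With tpchPerfTestNumRunsPerQuery = 3 runs per query, the middle
        		// element of the sorted slice is the median.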
   225  		findMedian := func(times []float64) float64 {
   226  			sort.Float64s(times)
   227  			return times[len(times)/2]
   228  		}
   229  		vecOnTimes := p.timeByQueryNum[tpchPerfTestVecOnConfigIdx][queryNum]
   230  		vecOffTimes := p.timeByQueryNum[tpchPerfTestVecOffConfigIdx][queryNum]
   231  		if len(vecOnTimes) != tpchPerfTestNumRunsPerQuery {
   232  			t.Fatal(fmt.Sprintf("[q%d] unexpected number of run times "+
   233  				"recorded with vec ON config: %v", queryNum, vecOnTimes))
   234  		}
   235  		if len(vecOffTimes) != tpchPerfTestNumRunsPerQuery {
   236  			t.Fatal(fmt.Sprintf("[q%d] unexpected number of run times "+
   237  				"recorded with vec OFF config: %v", queryNum, vecOffTimes))
   238  		}
   239  		vecOnTime := findMedian(vecOnTimes)
   240  		vecOffTime := findMedian(vecOffTimes)
   241  		if vecOffTime < vecOnTime {
   242  			t.l.Printf(
   243  				fmt.Sprintf("[q%d] vec OFF was faster by %.2f%%: "+
   244  					"%.2fs ON vs %.2fs OFF --- WARNING\n"+
   245  					"vec ON times: %v\t vec OFF times: %v",
   246  					queryNum, 100*(vecOnTime-vecOffTime)/vecOffTime,
   247  					vecOnTime, vecOffTime, vecOnTimes, vecOffTimes))
   248  		} else {
   249  			t.l.Printf(
   250  				fmt.Sprintf("[q%d] vec ON was faster by %.2f%%: "+
   251  					"%.2fs ON vs %.2fs OFF\n"+
   252  					"vec ON times: %v\t vec OFF times: %v",
   253  					queryNum, 100*(vecOffTime-vecOnTime)/vecOnTime,
   254  					vecOnTime, vecOffTime, vecOnTimes, vecOffTimes))
   255  		}
   256  		if vecOnTime >= slownessThresholdByVersion[version]*vecOffTime {
   257  			// For some reason, the vectorized engine executed the query a lot
   258  			// slower than the row-by-row engine, which is unexpected. In order
   259  			// to understand where the slowness comes from, we run EXPLAIN
   260  			// ANALYZE on the query with all `vectorize` options
   261  			// tpchPerfTestNumRunsPerQuery times (hoping that at least one run
   262  			// will "catch" the slowness).
   263  			for _, vectorize := range p.vectorizeOptions() {
   264  				vectorizeSetting := vectorizeOptionToSetting(vectorize, version)
   265  				if _, err := conn.Exec(fmt.Sprintf("SET vectorize=%s;", vectorizeSetting)); err != nil {
   266  					t.Fatal(err)
   267  				}
   268  				for i := 0; i < tpchPerfTestNumRunsPerQuery; i++ {
   269  					rows, err := conn.Query(fmt.Sprintf(
   270  						"EXPLAIN ANALYZE %s;", tpch.QueriesByNumber[queryNum],
   271  					))
   272  					if err != nil {
   273  						t.Fatal(err)
   274  					}
   275  					var (
   276  						automatic bool
   277  						url       string
   278  					)
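        					// EXPLAIN ANALYZE is expected to return a single row; its two
        					// columns (an "automatic" flag and a URL for the collected plan)
        					// are scanned so that the URL can be logged below.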
   279  					rows.Next()
   280  					if err = rows.Scan(&automatic, &url); err != nil {
   281  						t.Fatal(err)
   282  					}
   283  					if err = rows.Close(); err != nil {
   284  						t.Fatal(err)
   285  					}
   286  					t.Status(fmt.Sprintf("EXPLAIN ANALYZE with vectorize=%s url:\n%s", vectorizeSetting, url))
   287  				}
   288  			}
   289  			t.Fatal(fmt.Sprintf(
   290  				"[q%d] vec ON is slower by %.2f%% than vec OFF\n"+
   291  					"vec ON times: %v\nvec OFF times: %v",
   292  				queryNum, 100*(vecOnTime-vecOffTime)/vecOffTime, vecOnTimes, vecOffTimes))
   293  		}
   294  	}
   295  }
   296  
   297  type tpchVecDiskTest struct {
   298  	tpchVecTestCaseBase
   299  }
   300  
   301  func (d tpchVecDiskTest) preTestRunHook(
   302  	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
   303  ) {
   304  	d.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
   305  	// In order to stress the disk spilling of the vectorized
   306  	// engine, we set the workmem limit to a random value in the
   307  	// range [16KiB, 256KiB).
   308  	rng, _ := randutil.NewPseudoRand()
   309  	workmemInKiB := 16 + rng.Intn(240)
   310  	workmem := fmt.Sprintf("%dKiB", workmemInKiB)
   311  	t.Status(fmt.Sprintf("setting workmem='%s'", workmem))
   312  	if _, err := conn.Exec(fmt.Sprintf("SET CLUSTER SETTING sql.distsql.temp_storage.workmem='%s'", workmem)); err != nil {
   313  		t.Fatal(err)
   314  	}
   315  }
   316  
   317  // setSmallBatchSize sets a cluster setting that overrides the batch size to a
   318  // random value in the [1, 5) range.
   319  func setSmallBatchSize(t *test, conn *gosql.DB, rng *rand.Rand) {
   320  	batchSize := 1 + rng.Intn(4)
   321  	t.Status(fmt.Sprintf("setting sql.testing.vectorize.batch_size to %d", batchSize))
   322  	if _, err := conn.Exec(fmt.Sprintf("SET CLUSTER SETTING sql.testing.vectorize.batch_size=%d", batchSize)); err != nil {
   323  		t.Fatal(err)
   324  	}
   325  }
   326  
   327  type tpchVecSmallBatchSizeTest struct {
   328  	tpchVecTestCaseBase
   329  }
   330  
   331  func (b tpchVecSmallBatchSizeTest) preTestRunHook(
   332  	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
   333  ) {
   334  	b.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
   335  	rng, _ := randutil.NewPseudoRand()
   336  	setSmallBatchSize(t, conn, rng)
   337  }
   338  
   339  func baseTestRun(
   340  	ctx context.Context, t *test, c *cluster, version crdbVersion, tc tpchVecTestCase,
   341  ) {
   342  	firstNode := c.Node(1)
   343  	queriesToSkip := queriesToSkipByVersion[version]
   344  	for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ {
   345  		for _, vectorize := range tc.vectorizeOptions() {
   346  			if reason, skip := queriesToSkip[queryNum]; skip {
   347  				t.Status(fmt.Sprintf("skipping q%d because of %q", queryNum, reason))
   348  				continue
   349  			}
   350  			vectorizeSetting := vectorizeOptionToSetting(vectorize, version)
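        			// For the perf config, for example, this produces a command like
        			// "./workload run tpch --concurrency=1 --db=tpch --max-ops=3
        			// --queries=7 --vectorize=on {pgurl:1-3}".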
   351  			cmd := fmt.Sprintf("./workload run tpch --concurrency=1 --db=tpch "+
   352  				"--max-ops=%d --queries=%d --vectorize=%s {pgurl:1-%d}",
   353  				tc.numRunsPerQuery(), queryNum, vectorizeSetting, tpchVecNodeCount)
   354  			workloadOutput, err := c.RunWithBuffer(ctx, t.l, firstNode, cmd)
   355  			t.l.Printf("\n" + string(workloadOutput))
   356  			if err != nil {
   357  				// Note: if you see an error like "exit status 1", it is likely
   358  				// caused by erroneous output from the query.
   359  				t.Fatal(err)
   360  			}
   361  			tc.postQueryRunHook(t, workloadOutput, vectorize)
   362  		}
   363  	}
   364  }
   365  
   366  type tpchVecSmithcmpTest struct {
   367  	tpchVecTestCaseBase
   368  }
   369  
   370  const tpchVecSmithcmp = "smithcmp"
   371  
   372  func (s tpchVecSmithcmpTest) preTestRunHook(
   373  	ctx context.Context, t *test, c *cluster, conn *gosql.DB, version crdbVersion,
   374  ) {
   375  	s.tpchVecTestCaseBase.preTestRunHook(ctx, t, c, conn, version)
   376  	const smithcmpSHA = "a3f41f5ba9273249c5ecfa6348ea8ee3ac4b77e3"
   377  	node := c.Node(1)
   378  	if local && runtime.GOOS != "linux" {
   379  		t.Fatalf("must run on linux os, found %s", runtime.GOOS)
   380  	}
   381  	// This binary has been manually compiled using
   382  	// './build/builder.sh go build ./pkg/cmd/smithcmp' and uploaded to the S3
   383  	// bucket at cockroach/smithcmp. The binary shouldn't change much, so
   384  	// pinning it to a fixed SHA is acceptable.
   385  	smithcmp, err := binfetcher.Download(ctx, binfetcher.Options{
   386  		Component: tpchVecSmithcmp,
   387  		Binary:    tpchVecSmithcmp,
   388  		Version:   smithcmpSHA,
   389  		GOOS:      "linux",
   390  		GOARCH:    "amd64",
   391  	})
   392  	if err != nil {
   393  		t.Fatal(err)
   394  	}
   395  	c.Put(ctx, smithcmp, "./"+tpchVecSmithcmp, node)
   396  	// To increase test coverage, we randomize the batch size in 50% of the
   397  	// runs.
   398  	rng, _ := randutil.NewPseudoRand()
   399  	if rng.Float64() < 0.5 {
   400  		setSmallBatchSize(t, conn, rng)
   401  	}
   402  }
   403  
   404  func smithcmpTestRun(ctx context.Context, t *test, c *cluster, _ crdbVersion, _ tpchVecTestCase) {
   405  	const (
   406  		configFile = `tpchvec_smithcmp.toml`
   407  		configURL  = `https://raw.githubusercontent.com/cockroachdb/cockroach/master/pkg/cmd/roachtest/` + configFile
   408  	)
   409  	firstNode := c.Node(1)
   410  	if err := c.RunE(ctx, firstNode, fmt.Sprintf("curl %s > %s", configURL, configFile)); err != nil {
   411  		t.Fatal(err)
   412  	}
   413  	cmd := fmt.Sprintf("./%s %s", tpchVecSmithcmp, configFile)
   414  	if err := c.RunE(ctx, firstNode, cmd); err != nil {
   415  		t.Fatal(err)
   416  	}
   417  }
   418  
   419  func runTPCHVec(
   420  	ctx context.Context,
   421  	t *test,
   422  	c *cluster,
   423  	testCase tpchVecTestCase,
   424  	testRun func(ctx context.Context, t *test, c *cluster, version crdbVersion, tc tpchVecTestCase),
   425  ) {
   426  	firstNode := c.Node(1)
   427  	c.Put(ctx, cockroach, "./cockroach", c.All())
   428  	c.Put(ctx, workload, "./workload", firstNode)
   429  	c.Start(ctx, t)
   430  
   431  	conn := c.Conn(ctx, 1)
   432  	disableAutoStats(t, conn)
   433  	t.Status("restoring TPCH dataset for Scale Factor 1")
   434  	if err := loadTPCHDataset(ctx, t, c, 1 /* sf */, newMonitor(ctx, c), c.All()); err != nil {
   435  		t.Fatal(err)
   436  	}
   437  
   438  	if _, err := conn.Exec("USE tpch;"); err != nil {
   439  		t.Fatal(err)
   440  	}
   441  	scatterTables(t, conn, tpchTables)
   442  	t.Status("waiting for full replication")
   443  	waitForFullReplication(t, conn)
   444  	createStatsFromTables(t, conn, tpchTables)
   445  	versionString, err := fetchCockroachVersion(ctx, c, c.Node(1)[0])
   446  	if err != nil {
   447  		t.Fatal(err)
   448  	}
   449  	version, err := toCRDBVersion(versionString)
   450  	if err != nil {
   451  		t.Fatal(err)
   452  	}
   453  
   454  	testCase.preTestRunHook(ctx, t, c, conn, version)
   455  	testRun(ctx, t, c, version, testCase)
   456  	testCase.postTestRunHook(t, conn, version)
   457  }
   458  
   459  func registerTPCHVec(r *testRegistry) {
   460  	r.Add(testSpec{
   461  		Name:       "tpchvec/perf",
   462  		Owner:      OwnerSQLExec,
   463  		Cluster:    makeClusterSpec(tpchVecNodeCount),
   464  		MinVersion: "v19.2.0",
   465  		Run: func(ctx context.Context, t *test, c *cluster) {
   466  			runTPCHVec(ctx, t, c, newTpchVecPerfTest(), baseTestRun)
   467  		},
   468  	})
   469  
   470  	r.Add(testSpec{
   471  		Name:    "tpchvec/disk",
   472  		Owner:   OwnerSQLExec,
   473  		Cluster: makeClusterSpec(tpchVecNodeCount),
   474  		// The 19.2 version doesn't have disk spilling or memory monitoring, so
   475  		// there is no point in running this config on that version.
   476  		MinVersion: "v20.1.0",
   477  		Run: func(ctx context.Context, t *test, c *cluster) {
   478  			runTPCHVec(ctx, t, c, tpchVecDiskTest{}, baseTestRun)
   479  		},
   480  	})
   481  
   482  	r.Add(testSpec{
   483  		Name:    "tpchvec/smallbatchsize",
   484  		Owner:   OwnerSQLExec,
   485  		Cluster: makeClusterSpec(tpchVecNodeCount),
   486  		// The 19.2 version doesn't have the testing cluster setting to change
   487  		// the batch size, so this config only runs on versions >= 20.1.0.
   488  		MinVersion: "v20.1.0",
   489  		Run: func(ctx context.Context, t *test, c *cluster) {
   490  			runTPCHVec(ctx, t, c, tpchVecSmallBatchSizeTest{}, baseTestRun)
   491  		},
   492  	})
   493  
   494  	r.Add(testSpec{
   495  		Name:       "tpchvec/smithcmp",
   496  		Owner:      OwnerSQLExec,
   497  		Cluster:    makeClusterSpec(tpchVecNodeCount),
   498  		MinVersion: "v20.1.0",
   499  		Run: func(ctx context.Context, t *test, c *cluster) {
   500  			runTPCHVec(ctx, t, c, tpchVecSmithcmpTest{}, smithcmpTestRun)
   501  		},
   502  	})
   503  }