github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/versionupgrade.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	gosql "database/sql"
    16  	"fmt"
    17  	"math/rand"
    18  	"runtime"
    19  	"strconv"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/util/binfetcher"
    24  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    25  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/version"
    27  	_ "github.com/lib/pq"
    28  )
    29  
    30  var v201 = roachpb.Version{Major: 20, Minor: 1}
    31  
    32  // Feature tests that are invoked between each step of the version upgrade test.
    33  // Tests can use u.clusterVersion to determine which version is active at the
    34  // moment.
    35  //
     36  // A gotcha is that these feature tests are also invoked when the cluster is
     37  // in the middle of upgrading -- i.e. in a state where the cluster version has
     38  // already been bumped, but not all nodes are aware of it yet. This should be
     39  // considered a feature of this test, and feature tests that flake because of
     40  // it need to be fixed.
    41  var versionUpgradeTestFeatures = versionFeatureStep{
    42  	// NB: the next four tests are ancient and supported since v2.0. However,
    43  	// in 19.2 -> 20.1 we had a migration that disallowed most DDL in the
    44  	// mixed version state, and so for convenience we gate them on v20.1.
    45  	stmtFeatureTest("Object Access", v201, `
    46  -- We should be able to successfully select from objects created in ancient
    47  -- versions of CRDB using their FQNs. Prevents bugs such as #43141, where
    48  -- databases created before a migration were inaccessible after the
    49  -- migration.
    50  --
    51  -- NB: the data has been baked into the fixtures. Originally created via:
    52  --   create database persistent_db
     53  --   create table persistent_db.persistent_table(a int)
    54  -- on CRDB v1.0
    55  select * from persistent_db.persistent_table;
    56  show tables from persistent_db;
    57  `),
    58  	stmtFeatureTest("JSONB", v201, `
    59  CREATE DATABASE IF NOT EXISTS test;
    60  CREATE TABLE test.t (j JSONB);
    61  DROP TABLE test.t;
    62  	`),
    63  	stmtFeatureTest("Sequences", v201, `
    64  CREATE DATABASE IF NOT EXISTS test;
    65  CREATE SEQUENCE test.test_sequence;
    66  DROP SEQUENCE test.test_sequence;
    67  	`),
    68  	stmtFeatureTest("Computed Columns", v201, `
    69  CREATE DATABASE IF NOT EXISTS test;
    70  CREATE TABLE test.t (x INT AS (3) STORED);
    71  DROP TABLE test.t;
    72  	`),
    73  }
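
         // Illustrative sketch only (not part of the test suite): a feature gated on a
         // later version could be added to the list above in the same way. This assumes
         // a hypothetical v202 gate that is not declared in this file:
         //
         //   var v202 = roachpb.Version{Major: 20, Minor: 2}
         //
         //   stmtFeatureTest("Some v20.2 Feature", v202, `
         //   CREATE DATABASE IF NOT EXISTS test;
         //   -- statements exercising the new feature would go here
         //   `),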
    74  
    75  func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion version.Version) {
    76  	predecessorVersion, err := PredecessorVersion(buildVersion)
    77  	if err != nil {
    78  		t.Fatal(err)
    79  	}
    80  	// This test uses fixtures and we do not have encrypted fixtures right now.
    81  	c.encryptDefault = false
    82  
    83  	// Set the bool within to true to create a new fixture for this test. This
    84  	// is necessary after every release. For example, the day `master` becomes
    85  	// the 20.2 release, this test will fail because it is missing a fixture for
    86  	// 20.1; run the test (on 20.1) with the bool flipped to create the fixture.
     87  	// Check it in (makeVersionFixtureAndFatal below prints instructions on
     88  	// where to put the files) and off we go.
    89  	if false {
     90  		// The version to create/update the fixture for. Must be a released
     91  		// version (i.e. downloadable from the homepage); if that is not the
     92  		// case, use the empty string, which uses the local cockroach binary.
    93  		newV := "19.2.6"
    94  		predV, err := PredecessorVersion(*version.MustParse("v" + newV))
    95  		if err != nil {
    96  			t.Fatal(err)
    97  		}
    98  		makeVersionFixtureAndFatal(ctx, t, c, predV, newV)
    99  	}
   100  
   101  	testFeaturesStep := versionUpgradeTestFeatures.step(c.All())
   102  	schemaChangeStep := runSchemaChangeWorkloadStep(c.All().randNode()[0], 10 /* maxOps */, 2 /* concurrency */)
   103  
   104  	// The steps below start a cluster at predecessorVersion (from a fixture),
   105  	// then start an upgrade that is rolled back, and finally start and finalize
   106  	// the upgrade. Between each step, we run the feature tests defined in
   107  	// versionUpgradeTestFeatures.
   108  	u := newVersionUpgradeTest(c,
    109  		// Start the cluster from a fixture. That fixture's cluster version may
    110  		// be at the predecessor version (though in practice it's fully up to
    111  		// date, if it was created via the makeVersionFixtureAndFatal call
    112  		// above), so add a waitForUpgradeStep to make sure we're upgraded all
    113  		// the way before moving on.
    114  		//
    115  		// See the comment on makeVersionFixtureAndFatal for details on fixtures.
   116  		uploadAndStartFromCheckpointFixture(c.All(), predecessorVersion),
   117  		uploadAndInitSchemaChangeWorkload(),
   118  		waitForUpgradeStep(c.All()),
   119  		testFeaturesStep,
   120  
   121  		// NB: at this point, cluster and binary version equal predecessorVersion,
   122  		// and auto-upgrades are on.
   123  
   124  		// We use an empty string for the version below, which means to use the
   125  		// main ./cockroach binary (i.e. the one being tested in this run).
   126  		// We upgrade into this version more capriciously to ensure better
   127  		// coverage by first rolling the cluster into the new version with
   128  		// auto-upgrade disabled, then rolling back, and then rolling forward
   129  		// and finalizing on the auto-upgrade path.
   130  		preventAutoUpgradeStep(1),
   131  		// Roll nodes forward.
   132  		binaryUpgradeStep(c.All(), ""),
   133  		testFeaturesStep,
   134  		// Run a quick schemachange workload in between each upgrade.
   135  		// The maxOps is 10 to keep the test runtime under 1-2 minutes.
   136  		schemaChangeStep,
   137  		// Roll back again. Note that bad things would happen if the cluster had
   138  		// ignored our request to not auto-upgrade. The `autoupgrade` roachtest
   139  		// exercises this in more detail, so here we just rely on things working
   140  		// as they ought to.
   141  		binaryUpgradeStep(c.All(), predecessorVersion),
   142  		testFeaturesStep,
   143  		schemaChangeStep,
   144  		// Roll nodes forward, this time allowing them to upgrade, and waiting
   145  		// for it to happen.
   146  		binaryUpgradeStep(c.All(), ""),
   147  		allowAutoUpgradeStep(1),
   148  		testFeaturesStep,
   149  		schemaChangeStep,
   150  		waitForUpgradeStep(c.All()),
   151  		testFeaturesStep,
   152  		schemaChangeStep,
   153  	)
   154  
   155  	u.run(ctx, t)
   156  }
   157  
   158  func (u *versionUpgradeTest) run(ctx context.Context, t *test) {
   159  	defer func() {
   160  		for _, db := range u.conns {
   161  			_ = db.Close()
   162  		}
   163  	}()
   164  
   165  	for _, step := range u.steps {
   166  		if step != nil {
   167  			step(ctx, t, u)
   168  		}
   169  	}
   170  }
   171  
   172  type versionUpgradeTest struct {
   173  	goOS  string
   174  	c     *cluster
   175  	steps []versionStep
   176  
   177  	// Cache conns because opening one takes hundreds of ms, and we do it quite
   178  	// a lot.
   179  	conns []*gosql.DB
   180  }
   181  
   182  func newVersionUpgradeTest(c *cluster, steps ...versionStep) *versionUpgradeTest {
   183  	return &versionUpgradeTest{
   184  		goOS:  ifLocal(runtime.GOOS, "linux"),
   185  		c:     c,
   186  		steps: steps,
   187  	}
   188  }
   189  
   190  func checkpointName(binaryVersion string) string { return "checkpoint-v" + binaryVersion }
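
         // For example, checkpointName("20.1") yields "checkpoint-v20.1", which matches
         // the fixture archives checked in under pkg/cmd/roachtest/fixtures/<node>/.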
   191  
    192  // conn returns a cached connection to the given (one-indexed) node. Don't call
    193  // .Close(); the test harness will do it.
   194  func (u *versionUpgradeTest) conn(ctx context.Context, t *test, i int) *gosql.DB {
   195  	if u.conns == nil {
   196  		for _, i := range u.c.All() {
   197  			u.conns = append(u.conns, u.c.Conn(ctx, i))
   198  		}
   199  	}
   200  	return u.conns[i-1]
   201  }
   202  
   203  func (u *versionUpgradeTest) uploadVersion(
   204  	ctx context.Context, t *test, nodes nodeListOption, newVersion string,
   205  ) option {
   206  	var binary string
   207  	if newVersion == "" {
   208  		binary = cockroach
   209  	} else {
   210  		var err error
   211  		binary, err = binfetcher.Download(ctx, binfetcher.Options{
   212  			Binary:  "cockroach",
   213  			Version: "v" + newVersion,
   214  			GOOS:    u.goOS,
   215  			GOARCH:  "amd64",
   216  		})
   217  		if err != nil {
   218  			t.Fatal(err)
   219  		}
   220  	}
   221  
   222  	target := "./cockroach"
   223  	if newVersion != "" {
   224  		target += "-" + newVersion
   225  	}
   226  	u.c.Put(ctx, binary, target, nodes)
   227  	return startArgs("--binary=" + target)
   228  }
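
         // To illustrate uploadVersion: a call with newVersion "19.2.6" downloads the
         // released v19.2.6 binary via binfetcher and uploads it as ./cockroach-19.2.6,
         // returning startArgs("--binary=./cockroach-19.2.6"); a call with the empty
         // string uploads the locally built ./cockroach binary instead.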
   229  
    230  // binaryVersion returns the version of the binary running on the (one-indexed)
    231  // node. NB: version means major.minor[-unstable]; the patch level isn't
    232  // returned. For example, a binary of version 19.2.4 will return 19.2.
   233  func (u *versionUpgradeTest) binaryVersion(ctx context.Context, t *test, i int) roachpb.Version {
   234  	db := u.conn(ctx, t, i)
   235  
   236  	var sv string
   237  	if err := db.QueryRow(`SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil {
   238  		t.Fatal(err)
   239  	}
   240  
   241  	if len(sv) == 0 {
   242  		t.Fatal("empty version")
   243  	}
   244  
   245  	cv, err := roachpb.ParseVersion(sv)
   246  	if err != nil {
   247  		t.Fatal(err)
   248  	}
   249  	return cv
   250  }
   251  
    252  // clusterVersion returns the cluster version active on the (one-indexed) node.
    253  // Note that the returned value might become stale due to the cluster
    254  // auto-upgrading in the background plus gossip asynchronicity.
    255  // NB: cluster versions are always major.minor[-unstable]; there isn't a patch level.
   256  func (u *versionUpgradeTest) clusterVersion(ctx context.Context, t *test, i int) roachpb.Version {
   257  	db := u.conn(ctx, t, i)
   258  
   259  	var sv string
   260  	if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil {
   261  		t.Fatal(err)
   262  	}
   263  
   264  	cv, err := roachpb.ParseVersion(sv)
   265  	if err != nil {
   266  		t.Fatal(err)
   267  	}
   268  	return cv
   269  }
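
         // The distinction between the two accessors above matters throughout this test:
         // right after binaryUpgradeStep restarts a node on a new binary, binaryVersion
         // already reports the new major.minor, while clusterVersion keeps returning the
         // old version until the upgrade is finalized (waitForUpgradeStep waits for
         // exactly that).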
   270  
   271  // versionStep is an isolated version migration on a running cluster.
   272  type versionStep func(ctx context.Context, t *test, u *versionUpgradeTest)
   273  
   274  func uploadAndStartFromCheckpointFixture(nodes nodeListOption, v string) versionStep {
   275  	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
   276  		u.c.Run(ctx, nodes, "mkdir", "-p", "{store-dir}")
   277  		vv := version.MustParse("v" + v)
   278  		// The fixtures use cluster version (major.minor) but the input might be
   279  		// a patch release.
   280  		name := checkpointName(
   281  			roachpb.Version{Major: int32(vv.Major()), Minor: int32(vv.Minor())}.String(),
   282  		)
   283  		for _, i := range nodes {
   284  			u.c.Put(ctx,
   285  				"pkg/cmd/roachtest/fixtures/"+strconv.Itoa(i)+"/"+name+".tgz",
   286  				"{store-dir}/fixture.tgz", u.c.Node(i),
   287  			)
   288  		}
   289  		// Extract fixture. Fail if there's already an LSM in the store dir.
   290  		u.c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz")
   291  
   292  		// Put and start the binary.
   293  		args := u.uploadVersion(ctx, t, nodes, v)
   294  		// NB: can't start sequentially since cluster already bootstrapped.
   295  		u.c.Start(ctx, t, nodes, args, startArgsDontEncrypt, roachprodArgOption{"--sequential=false"})
   296  	}
   297  }
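
         // For example, with v = "19.2.6" the step above uploads
         // pkg/cmd/roachtest/fixtures/<node>/checkpoint-v19.2.tgz into each node's store
         // directory (the patch level is dropped when building the checkpoint name) and
         // then starts the v19.2.6 binary on top of the extracted fixture.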
   298  
    299  // binaryUpgradeStep performs a rolling restart of the given nodes into the new
    300  // binary version. Note that this does *not* wait for the cluster version to
    301  // upgrade. Use a waitForUpgradeStep() for that.
   302  func binaryUpgradeStep(nodes nodeListOption, newVersion string) versionStep {
   303  	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
   304  		c := u.c
   305  		args := u.uploadVersion(ctx, t, nodes, newVersion)
   306  
   307  		// Restart nodes in a random order; otherwise node 1 would be running all
   308  		// the migrations and it probably also has all the leases.
   309  		rand.Shuffle(len(nodes), func(i, j int) {
   310  			nodes[i], nodes[j] = nodes[j], nodes[i]
   311  		})
   312  		for _, node := range nodes {
   313  			t.l.Printf("restarting node %d", node)
   314  			c.Stop(ctx, c.Node(node))
   315  			c.Start(ctx, t, c.Node(node), args, startArgsDontEncrypt)
   316  			t.l.Printf("node %d now running binary version %s", node, u.binaryVersion(ctx, t, node))
   317  
   318  			// TODO(nvanbenschoten): add upgrade qualification step. What should we
   319  			// test? We could run logictests. We could add custom logic here. Maybe
   320  			// this should all be pushed to nightly migration tests instead.
   321  		}
   322  	}
   323  }
   324  
   325  func preventAutoUpgradeStep(node int) versionStep {
   326  	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
   327  		db := u.conn(ctx, t, node)
   328  		_, err := db.ExecContext(ctx, `SET CLUSTER SETTING cluster.preserve_downgrade_option = $1`, u.binaryVersion(ctx, t, node).String())
   329  		if err != nil {
   330  			t.Fatal(err)
   331  		}
   332  	}
   333  }
   334  
   335  func allowAutoUpgradeStep(node int) versionStep {
   336  	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
   337  		db := u.conn(ctx, t, node)
   338  		_, err := db.ExecContext(ctx, `RESET CLUSTER SETTING cluster.preserve_downgrade_option`)
   339  		if err != nil {
   340  			t.Fatal(err)
   341  		}
   342  	}
   343  }
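
         // For reference, with a v20.1 binary on the target node the two steps above
         // amount to roughly the following SQL (version shown is illustrative):
         //
         //   SET CLUSTER SETTING cluster.preserve_downgrade_option = '20.1';
         //   -- ... binaries are rolled forward and back ...
         //   RESET CLUSTER SETTING cluster.preserve_downgrade_option;
         //
         // While the setting is pinned to the current version, the cluster will not
         // auto-finalize the upgrade.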
   344  
   345  // waitForUpgradeStep waits for the cluster version to reach the first node's
   346  // binary version (which is assumed to be every node's binary version). We rely
   347  // on the cluster's internal self-upgrading mechanism.
   348  //
   349  // NB: this is intentionally kept separate from binaryUpgradeStep because we run
   350  // feature tests between the steps, and we want to expose them (at least
   351  // heuristically) to the real-world situation in which some nodes have already
    352  // learned of a cluster version bump (from Gossip) while others haven't. This
   353  // situation tends to exhibit unexpected behavior.
   354  func waitForUpgradeStep(nodes nodeListOption) versionStep {
   355  	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
   356  		newVersion := u.binaryVersion(ctx, t, nodes[0]).String()
   357  		t.l.Printf("%s: waiting for cluster to auto-upgrade\n", newVersion)
   358  
   359  		for _, i := range nodes {
   360  			err := retry.ForDuration(30*time.Second, func() error {
   361  				currentVersion := u.clusterVersion(ctx, t, i).String()
   362  				if currentVersion != newVersion {
   363  					return fmt.Errorf("%d: expected version %s, got %s", i, newVersion, currentVersion)
   364  				}
   365  				t.l.Printf("%s: acked by n%d", currentVersion, i)
   366  				return nil
   367  			})
   368  			if err != nil {
   369  				t.Fatal(err)
   370  			}
   371  		}
   372  
   373  		t.l.Printf("%s: nodes %v are upgraded\n", newVersion, nodes)
   374  
   375  		// TODO(nvanbenschoten): add upgrade qualification step.
   376  	}
   377  }
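
         // In SQL terms, the wait above polls each node until
         //
         //   SHOW CLUSTER SETTING version;
         //
         // reports the same major.minor as crdb_internal.node_executable_version() on
         // the first node, retrying for up to 30 seconds per node.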
   378  
   379  type versionFeatureTest struct {
   380  	name string
   381  	fn   func(context.Context, *test, *versionUpgradeTest, nodeListOption) (skipped bool)
   382  }
   383  
   384  type versionFeatureStep []versionFeatureTest
   385  
   386  func (vs versionFeatureStep) step(nodes nodeListOption) versionStep {
   387  	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
   388  		for _, feature := range vs {
   389  			t.l.Printf("checking %s", feature.name)
   390  			tBegin := timeutil.Now()
   391  			skipped := feature.fn(ctx, t, u, nodes)
   392  			dur := fmt.Sprintf("%.2fs", timeutil.Since(tBegin).Seconds())
   393  			if skipped {
   394  				t.l.Printf("^-- skip (%s)", dur)
   395  			} else {
   396  				t.l.Printf("^-- ok (%s)", dur)
   397  			}
   398  		}
   399  	}
   400  }
   401  
   402  func stmtFeatureTest(
   403  	name string, minVersion roachpb.Version, stmt string, args ...interface{},
   404  ) versionFeatureTest {
   405  	return versionFeatureTest{
   406  		name: name,
   407  		fn: func(ctx context.Context, t *test, u *versionUpgradeTest, nodes nodeListOption) (skipped bool) {
   408  			i := nodes.randNode()[0]
   409  			if u.clusterVersion(ctx, t, i).Less(minVersion) {
   410  				return true // skipped
   411  			}
   412  			db := u.conn(ctx, t, i)
   413  			if _, err := db.ExecContext(ctx, stmt, args...); err != nil {
   414  				t.Fatal(err)
   415  			}
   416  			return false
   417  		},
   418  	}
   419  }
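
         // The variadic args are passed through to ExecContext, so parametrized feature
         // tests are possible too. Hypothetical example, not wired into
         // versionUpgradeTestFeatures:
         //
         //   stmtFeatureTest("Placeholders", v201, `SELECT $1::INT`, 1),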
   420  
   421  // makeVersionFixtureAndFatal creates fixtures to "age out" old versions of CockroachDB.
   422  // We want to test data that was created at v1.0, but we don't actually want to
   423  // run a long chain of binaries starting all the way at v1.0. Instead, we
   424  // periodically bake a set of store directories that originally started out on
   425  // v1.0 and maintain it as a fixture for this test.
   426  //
   427  // The checkpoints will be created in the log directories downloaded as part of
    428  // the artifacts. The test will fail on purpose when it's done, printing
    429  // instructions on where to move the files.
   430  func makeVersionFixtureAndFatal(
   431  	ctx context.Context, t *test, c *cluster, predecessorVersion string, makeFixtureVersion string,
   432  ) {
   433  	c.l.Printf("making fixture for %s (starting at %s)", makeFixtureVersion, predecessorVersion)
   434  	c.encryptDefault = false
   435  	newVersionUpgradeTest(c,
    436  		// Start the cluster from a fixture. That fixture's cluster version may
    437  		// be at the predecessor version (though in practice it's fully up to
    438  		// date, if it was created via a previous run of this function), so add
    439  		// a waitForUpgradeStep to make sure we're upgraded all the way before
    440  		// moving on.
    441  		//
    442  		// See makeVersionFixtureAndFatal's doc comment for details on fixtures.
   443  		uploadAndStartFromCheckpointFixture(c.All(), predecessorVersion),
   444  		waitForUpgradeStep(c.All()),
   445  
   446  		// NB: at this point, cluster and binary version equal predecessorVersion,
   447  		// and auto-upgrades are on.
   448  
   449  		binaryUpgradeStep(c.All(), makeFixtureVersion),
   450  		waitForUpgradeStep(c.All()),
   451  
   452  		func(ctx context.Context, t *test, u *versionUpgradeTest) {
   453  			// If we're taking checkpoints, momentarily stop the cluster (we
   454  			// need to do that to get the checkpoints to reflect a
   455  			// consistent cluster state). The binary at this point will be
   456  			// the new one, but the cluster version was not explicitly
    457  			// bumped, though auto-upgrade may have taken place already.
   458  			// For example, if newVersion is 2.1, the cluster version in
   459  			// the store directories may be 2.0 on some stores and 2.1 on
   460  			// the others (though if any are on 2.1, then that's what's
   461  			// stored in system.settings).
   462  			// This means that when we restart from that version, we're
   463  			// going to want to use the binary mentioned in the checkpoint,
   464  			// or at least one compatible with the *predecessor* of the
   465  			// checkpoint version. For example, for checkpoint-2.1, the
   466  			// cluster version might be 2.0, so we can only use the 2.0 or
   467  			// 2.1 binary, but not the 19.1 binary (as 19.1 and 2.0 are not
   468  			// compatible).
   469  			name := checkpointName(u.binaryVersion(ctx, t, 1).String())
   470  			u.c.Stop(ctx, c.All())
   471  			c.Run(ctx, c.All(), cockroach, "debug", "rocksdb", "--db={store-dir}",
   472  				"checkpoint", "--checkpoint_dir={store-dir}/"+name)
   473  			c.Run(ctx, c.All(), "tar", "-C", "{store-dir}/"+name, "-czf", "{log-dir}/"+name+".tgz", ".")
   474  			t.Fatalf(`successfully created checkpoints; failing test on purpose.
   475  
   476  Invoke the following to move the archives to the right place and commit the
   477  result:
   478  
   479  for i in 1 2 3 4; do
   480    mkdir -p pkg/cmd/roachtest/fixtures/${i} && \
   481    mv artifacts/acceptance/version-upgrade/run_1/${i}.logs/checkpoint-*.tgz \
   482       pkg/cmd/roachtest/fixtures/${i}/
   483  done
   484  `)
   485  		}).run(ctx, t)
   486  }