github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/version_cluster_test.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server_test

import (
	"context"
	gosql "database/sql"
	"fmt"
	"path/filepath"
	"strconv"
	"sync/atomic"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

type testClusterWithHelpers struct {
	*testing.T
	*testcluster.TestCluster
	args func() map[int]base.TestServerArgs
}

func (th *testClusterWithHelpers) getVersionFromShow(i int) string {
	var version string
	if err := th.ServerConn(i).QueryRow("SHOW CLUSTER SETTING version").Scan(&version); err != nil {
		th.Fatalf("%d: %s", i, err)
	}
	return version
}

func (th *testClusterWithHelpers) getVersionFromSelect(i int) string {
	var version string
	if err := th.ServerConn(i).QueryRow("SELECT value FROM system.settings WHERE name = 'version'").Scan(&version); err != nil {
		if errors.Is(err, gosql.ErrNoRows) {
			return ""
		}
		th.Fatalf("%d: %s (%T)", i, err, err)
	}
	var v clusterversion.ClusterVersion
	if err := protoutil.Unmarshal([]byte(version), &v); err != nil {
		th.Fatalf("%d: %s", i, err)
	}
	return v.Version.String()
}
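
// The 'version' row in system.settings stores a protobuf-encoded
// clusterversion.ClusterVersion, which is why getVersionFromSelect above has
// to unmarshal the raw value it reads. A minimal sketch of that round trip
// (for illustration only; this helper is hypothetical and not used by the
// tests in this file):
func exampleVersionSettingRoundTrip() (string, error) {
	// Encode a cluster version the way it is persisted in system.settings.
	cv := clusterversion.ClusterVersion{Version: roachpb.Version{Major: 20, Minor: 1}}
	raw, err := protoutil.Marshal(&cv)
	if err != nil {
		return "", err
	}
	// Decode it again, mirroring what getVersionFromSelect does with the
	// value it scans out of the table.
	var decoded clusterversion.ClusterVersion
	if err := protoutil.Unmarshal(raw, &decoded); err != nil {
		return "", err
	}
	return decoded.Version.String(), nil // "20.1"
}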

func (th *testClusterWithHelpers) setVersion(i int, version string) error {
	_, err := th.ServerConn(i).Exec("SET CLUSTER SETTING version = $1", version)
	return err
}

func (th *testClusterWithHelpers) mustSetVersion(i int, version string) {
	th.Helper()
	if err := th.setVersion(i, version); err != nil {
		th.Fatalf("%d: %s", i, err)
	}
}

func (th *testClusterWithHelpers) setDowngrade(i int, version string) error {
	_, err := th.ServerConn(i).Exec("SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", version)
	return err
}

func (th *testClusterWithHelpers) resetDowngrade(i int) error {
	_, err := th.ServerConn(i).Exec("RESET CLUSTER SETTING cluster.preserve_downgrade_option")
	return err
}

// setupMixedCluster starts len(versions) servers, where server i runs at
// binary version versions[i][0] and minimum supported version versions[i][1]
// (i.e. it identifies as a binary that is itself v[0] and can run in a
// cluster that is at least at v[1]). The initial bootstrap version is
// controlled via the supplied testing knobs (BootstrapVersionOverride). A
// store directory can optionally be passed in.
func setupMixedCluster(
	t *testing.T, knobs base.TestingKnobs, versions [][2]string, dir string,
) testClusterWithHelpers {

	twh := testClusterWithHelpers{
		T: t,
		args: func() map[int]base.TestServerArgs {
			serverArgsPerNode := map[int]base.TestServerArgs{}
			for i, v := range versions {
				v0, v1 := roachpb.MustParseVersion(v[0]), roachpb.MustParseVersion(v[1])
				st := cluster.MakeTestingClusterSettingsWithVersions(v0, v1, false /* initializeVersion */)
				args := base.TestServerArgs{
					Settings: st,
					Knobs:    knobs,
				}
				if dir != "" {
					args.StoreSpecs = []base.StoreSpec{{Path: filepath.Join(dir, strconv.Itoa(i))}}
				}
				serverArgsPerNode[i] = args
			}
			return serverArgsPerNode
		}}

	tc := testcluster.StartTestCluster(t, len(versions), base.TestClusterArgs{
		ReplicationMode:   base.ReplicationManual, // speeds up test
		ServerArgsPerNode: twh.args(),
	})

	// We simulate crashes using this cluster, and having this setting enabled
	// (which a default migration turns on) causes leaktest to complain.
	if _, err := tc.ServerConn(0).Exec("SET CLUSTER SETTING diagnostics.reporting.enabled = 'false'"); err != nil {
		t.Fatal(err)
	}

	twh.TestCluster = tc
	return twh
}
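
// A minimal usage sketch for setupMixedCluster (hypothetical; not one of the
// tests below): two nodes whose binary is the current version but which still
// advertise compatibility with the previous release.
func exampleMixedClusterSetup(t *testing.T) {
	knobs := base.TestingKnobs{
		Server: &server.TestingKnobs{DisableAutomaticVersionUpgrade: 1},
	}
	bv := clusterversion.TestingBinaryVersion.String()
	mv := prev(clusterversion.TestingBinaryVersion).String()
	tc := setupMixedCluster(t, knobs, [][2]string{{bv, mv}, {bv, mv}}, "" /* dir */)
	defer tc.Stopper().Stop(context.Background())
	_ = tc.getVersionFromShow(0) // e.g. inspect node 0's active cluster version
}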

// prev returns the previous version of the given version.
// E.g. prev(20.1) = 19.2, prev(19.2) = 19.1, prev(19.1) = 2.1,
// prev(2.1) = 2.0, prev(2.0) = 1.0, prev(2.1-5) = 2.1.
func prev(version roachpb.Version) roachpb.Version {
	if version.Unstable != 0 {
		return roachpb.Version{Major: version.Major, Minor: version.Minor}
	}

	v19_1 := roachpb.Version{Major: 19, Minor: 1}

	if v19_1.Less(version) {
		if version.Minor > 1 {
			return roachpb.Version{Major: version.Major, Minor: version.Minor - 1}
		}
		// Here we assume that there are only going to be two releases per year.
		// Otherwise we'd need to keep some history of what releases we've had.
		return roachpb.Version{Major: version.Major - 1, Minor: 2}
	}

	if version == v19_1 {
		return roachpb.Version{Major: 2, Minor: 1}
	}

	// Logic for versions below 19.1.

	if version.Major > 2 {
		log.Fatalf(context.Background(), "can't compute previous version for %s", version)
	}

	if version.Minor != 0 {
		return roachpb.Version{Major: version.Major}
	}
	// version will be at least 2.0-X, so it's safe to set new Major to be
	// version.Major-1.
	return roachpb.Version{Major: version.Major - 1}
}
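
// checkPrevExamples is a hypothetical sanity-check sketch (not part of the
// original suite) exercising the mappings listed in prev's comment above.
func checkPrevExamples(t *testing.T) {
	for _, c := range []struct{ in, want string }{
		{"20.1", "19.2"},
		{"19.2", "19.1"},
		{"19.1", "2.1"},
		{"2.1", "2.0"},
		{"2.0", "1.0"},
		{"2.1-5", "2.1"},
	} {
		if got := prev(roachpb.MustParseVersion(c.in)); got.String() != c.want {
			t.Errorf("prev(%s) = %s, want %s", c.in, got, c.want)
		}
	}
}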

func TestClusterVersionPersistedOnJoin(t *testing.T) {
	defer leaktest.AfterTest(t)()

	var newVersion = clusterversion.TestingBinaryVersion
	var oldVersion = prev(newVersion)

	// Start three nodes whose self-declared binary version is newVersion and
	// whose minimum supported version is oldVersion, in a cluster running at
	// the new version (i.e. a completely regular setup). We want to check
	// that, after joining the cluster, the last two servers persist the new
	// version (and not the old one).
	versions := [][2]string{
		{newVersion.String(), oldVersion.String()},
		{newVersion.String(), oldVersion.String()},
		{newVersion.String(), oldVersion.String()},
	}

	knobs := base.TestingKnobs{
		Server: &server.TestingKnobs{
			DisableAutomaticVersionUpgrade: 1,
		},
	}

	ctx := context.Background()
	dir, finish := testutils.TempDir(t)
	defer finish()
	tc := setupMixedCluster(t, knobs, versions, dir)
	defer tc.TestCluster.Stopper().Stop(ctx)

	for i := 0; i < len(tc.TestCluster.Servers); i++ {
		for _, engine := range tc.TestCluster.Servers[i].Engines() {
			cv, err := kvserver.ReadClusterVersion(ctx, engine)
			if err != nil {
				t.Fatal(err)
			}
			if cv.Version != newVersion {
				t.Fatalf("n%d: expected version %v, got %v", i+1, newVersion, cv)
			}
		}
	}
}

func TestClusterVersionUpgrade(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	var newVersion = clusterversion.TestingBinaryVersion
	var oldVersion = prev(newVersion)

	knobs := base.TestingKnobs{
		Server: &server.TestingKnobs{
			BootstrapVersionOverride:       oldVersion,
			DisableAutomaticVersionUpgrade: 1,
		},
	}

	rawTC := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ReplicationMode: base.ReplicationManual, // speeds up test
		ServerArgs: base.TestServerArgs{
			Knobs: knobs,
		},
	})
	defer rawTC.Stopper().Stop(ctx)
	tc := testClusterWithHelpers{
		T:           t,
		TestCluster: rawTC,
	}

	{
		// Regression test for the fix for this issue:
		// https://github.com/cockroachdb/cockroach/pull/39640#pullrequestreview-275532068
		//
		// This can be removed when VersionLearnerReplicas is always-on.
		k := tc.ScratchRange(t)
		tc.AddReplicasOrFatal(t, k, tc.Target(2))
		_, err := tc.RemoveReplicas(k, tc.Target(2))
		require.NoError(t, err)
	}

	// Set CLUSTER SETTING cluster.preserve_downgrade_option to oldVersion to prevent upgrade.
	if err := tc.setDowngrade(0, oldVersion.String()); err != nil {
		t.Fatalf("error setting CLUSTER SETTING cluster.preserve_downgrade_option: %s", err)
	}
	atomic.StoreInt32(&knobs.Server.(*server.TestingKnobs).DisableAutomaticVersionUpgrade, 0)

	// Check that the cluster version is still oldVersion.
	curVersion := tc.getVersionFromSelect(0)
	if curVersion != oldVersion.String() {
		t.Fatalf("cluster version should still be %s, but got %s", oldVersion, curVersion)
	}

	// Reset cluster.preserve_downgrade_option to enable auto upgrade.
	if err := tc.resetDowngrade(0); err != nil {
		t.Fatalf("error resetting CLUSTER SETTING cluster.preserve_downgrade_option: %s", err)
	}

	// Check that the cluster version is bumped to newVersion.
	testutils.SucceedsSoon(t, func() error {
		if version := tc.getVersionFromSelect(0); version != newVersion.String() {
			return errors.Errorf("cluster version is still %s, should be %s", version, newVersion)
		}
		return nil
	})
	curVersion = tc.getVersionFromSelect(0)
	isNoopUpdate := curVersion == newVersion.String()

	testutils.SucceedsSoon(t, func() error {
		for i := 0; i < tc.NumServers(); i++ {
			st := tc.Servers[i].ClusterSettings()
			v := st.Version.ActiveVersion(ctx)
			wantActive := isNoopUpdate
			if isActive := v.IsActiveVersion(newVersion); isActive != wantActive {
				return errors.Errorf("%d: v%s active=%t (wanted %t)", i, newVersion, isActive, wantActive)
			}

			if tableV, curV := tc.getVersionFromSelect(i), v.String(); tableV != curV {
				return errors.Errorf("%d: read v%s from table, v%s from setting", i, tableV, curV)
			}
		}
		return nil
	})

	exp := newVersion.String()

	// Read the versions from the table on each node. Note that under the
	// hood, everything goes to the leaseholder, so it's pretty much
	// guaranteed that they all read the same value, but it doesn't hurt to
	// check.
	testutils.SucceedsSoon(t, func() error {
		for i := 0; i < tc.NumServers(); i++ {
			if version := tc.getVersionFromSelect(i); version != exp {
				return errors.Errorf("%d: incorrect version %q (wanted %s)", i, version, exp)
			}
			if version := tc.getVersionFromShow(i); version != exp {
				return errors.Errorf("%d: incorrect version %s (wanted %s)", i, version, exp)
			}
		}
		return nil
	})

	// Now check the Settings.Version variable. That is the tricky one for which
	// we "hold back" a gossip update until we've written to the engines. We may
	// have to wait a bit until we see the new version here, even though it's
	// already in the table.
	testutils.SucceedsSoon(t, func() error {
		for i := 0; i < tc.NumServers(); i++ {
			vers := tc.Servers[i].ClusterSettings().Version.ActiveVersion(ctx)
			if v := vers.String(); v == curVersion {
				if isNoopUpdate {
					continue
				}
				return errors.Errorf("%d: still waiting for %s (now at %s)", i, exp, v)
			} else if v != exp {
				t.Fatalf("%d: should never see version %s (wanted %s)", i, v, exp)
			}
		}
		return nil
	})

	// Since the wrapped version setting exposes the new versions, it must
	// definitely be present on all stores on the first try.
	if err := tc.Servers[1].GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error {
		cv, err := kvserver.ReadVersionFromEngineOrZero(ctx, s.Engine())
		if err != nil {
			return err
		}
		if act := cv.Version.String(); act != exp {
			t.Fatalf("%s: %s persisted, but should be %s", s, act, exp)
		}
		return nil
	}); err != nil {
		t.Fatal(err)
	}
}

// Test that, after cluster bootstrap, the different ways of getting the cluster
// version all agree.
func TestAllVersionsAgree(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	tcRaw := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{})
	defer tcRaw.Stopper().Stop(ctx)
	tc := testClusterWithHelpers{
		T:           t,
		TestCluster: tcRaw,
	}

	exp := clusterversion.TestingBinaryVersion.String()

	// The node bootstrapping the cluster starts at TestingBinaryVersion, the
	// others start at TestingBinaryMinSupportedVersion and it takes them a
	// gossip update to get to TestingBinaryVersion. Hence, we loop until that
	// gossip arrives.
	testutils.SucceedsSoon(tc, func() error {
		for i := 0; i < tc.NumServers(); i++ {
			if version := tc.Servers[i].ClusterSettings().Version.ActiveVersion(ctx); version.String() != exp {
				return fmt.Errorf("%d: incorrect version %s (wanted %s)", i, version, exp)
			}
			if version := tc.getVersionFromShow(i); version != exp {
				return fmt.Errorf("%d: incorrect version %s (wanted %s)", i, version, exp)
			}
			if version := tc.getVersionFromSelect(i); version != exp {
				return fmt.Errorf("%d: incorrect version %q (wanted %s)", i, version, exp)
			}
		}
		return nil
	})
}

// v0v1 returns two versions v0 and v1 that correspond to adjacent releases.
// v1 equals TestingBinaryMinSupportedVersion so that tests using this helper
// don't rot as we retire old versions.
func v0v1() (roachpb.Version, roachpb.Version) {
	v1 := clusterversion.TestingBinaryMinSupportedVersion
	v0 := clusterversion.TestingBinaryMinSupportedVersion
	if v0.Minor > 0 {
		v0.Minor--
	} else {
		v0.Major--
	}
	return v0, v1
}
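
// checkV0V1Ordered is a hypothetical sanity-check sketch (not part of the
// original suite): whatever the current minimum supported version is, v0v1
// should return an ordered pair with v1 equal to
// clusterversion.TestingBinaryMinSupportedVersion.
func checkV0V1Ordered(t *testing.T) {
	v0, v1 := v0v1()
	if !v0.Less(v1) {
		t.Errorf("expected v0 %s to be less than v1 %s", v0, v1)
	}
	if v1 != clusterversion.TestingBinaryMinSupportedVersion {
		t.Errorf("expected v1 to be %s, got %s", clusterversion.TestingBinaryMinSupportedVersion, v1)
	}
}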

func TestClusterVersionMixedVersionTooOld(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	// Prevent node crashes from generating several megabytes of stacks when
	// GOTRACEBACK=all, as it is on CI.
	defer log.DisableTracebacks()()

	exits := make(chan int, 100)

	log.SetExitFunc(true /* hideStack */, func(i int) { exits <- i })
	defer log.ResetExitFunc()

	v0, v1 := v0v1()
	v0s := v0.String()
	v1s := v1.String()

	// Three nodes at v1 and a fourth one at v0, but all operating at v0.
	versions := [][2]string{
		{v1s, v0s},
		{v1s, v0s},
		{v1s, v0s},
		{v0s, v0s},
	}

	// Start by running v0.
	knobs := base.TestingKnobs{
		Server: &server.TestingKnobs{
			DisableAutomaticVersionUpgrade: 1,
			BootstrapVersionOverride:       v0,
		},
	}
	tc := setupMixedCluster(t, knobs, versions, "")
	defer tc.Stopper().Stop(ctx)

	exp := v1s

	// The last node refuses to perform an upgrade that would risk its own life.
	if err := tc.setVersion(len(versions)-1, exp); !testutils.IsError(err,
		fmt.Sprintf("cannot upgrade to %s: node running %s", v1s, v0s),
	) {
		t.Fatal(err)
	}

	// The other nodes are less careful.
	tc.mustSetVersion(0, exp)

	<-exits // wait for fourth node to die

	// Check that we can still talk to the first three nodes.
	for i := 0; i < tc.NumServers()-1; i++ {
		testutils.SucceedsSoon(tc, func() error {
			if version := tc.Servers[i].ClusterSettings().Version.ActiveVersion(ctx).String(); version != exp {
				return errors.Errorf("%d: incorrect version %s (wanted %s)", i, version, exp)
			}
			if version := tc.getVersionFromShow(i); version != exp {
				return errors.Errorf("%d: incorrect version %s (wanted %s)", i, version, exp)
			}
			return nil
		})
	}
}