github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/versionupgrade.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	gosql "database/sql"
	"fmt"
	"math/rand"
	"runtime"
	"strconv"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/binfetcher"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/version"
	_ "github.com/lib/pq"
)

var v201 = roachpb.Version{Major: 20, Minor: 1}

// Feature tests that are invoked between each step of the version upgrade test.
// Tests can use u.clusterVersion to determine which version is active at the
// moment.
//
// A gotcha is that these feature tests are also invoked when the cluster is
// in the middle of upgrading (i.e. a state where the cluster version has
// already been bumped, but not all nodes are aware). This should be considered
// a feature of this test, and feature tests that flake because of it need to
// be fixed.
var versionUpgradeTestFeatures = versionFeatureStep{
	// NB: the next four tests are ancient and supported since v2.0. However,
	// in 19.2 -> 20.1 we had a migration that disallowed most DDL in the
	// mixed version state, and so for convenience we gate them on v20.1.
	stmtFeatureTest("Object Access", v201, `
-- We should be able to successfully select from objects created in ancient
-- versions of CRDB using their FQNs. Prevents bugs such as #43141, where
-- databases created before a migration were inaccessible after the
-- migration.
--
-- NB: the data has been baked into the fixtures. Originally created via:
--   create database persistent_db
--   create table persistent_db.persistent_table(a int)
-- on CRDB v1.0
select * from persistent_db.persistent_table;
show tables from persistent_db;
`),
	stmtFeatureTest("JSONB", v201, `
CREATE DATABASE IF NOT EXISTS test;
CREATE TABLE test.t (j JSONB);
DROP TABLE test.t;
`),
	stmtFeatureTest("Sequences", v201, `
CREATE DATABASE IF NOT EXISTS test;
CREATE SEQUENCE test.test_sequence;
DROP SEQUENCE test.test_sequence;
`),
	stmtFeatureTest("Computed Columns", v201, `
CREATE DATABASE IF NOT EXISTS test;
CREATE TABLE test.t (x INT AS (3) STORED);
DROP TABLE test.t;
`),
}

func runVersionUpgrade(ctx context.Context, t *test, c *cluster, buildVersion version.Version) {
	predecessorVersion, err := PredecessorVersion(buildVersion)
	if err != nil {
		t.Fatal(err)
	}
	// This test uses fixtures and we do not have encrypted fixtures right now.
	c.encryptDefault = false

	// Set the bool within to true to create a new fixture for this test. This
	// is necessary after every release. For example, the day `master` becomes
	// the 20.2 release, this test will fail because it is missing a fixture for
	// 20.1; run the test (on 20.1) with the bool flipped to create the fixture.
	// Check it in (instructions are in the comment on makeVersionFixtureAndFatal)
	// and off we go.
	if false {
		// The version to create/update the fixture for. Must be released (i.e.
		// can download it from the homepage); if that is not the case use the
		// empty string which uses the local cockroach binary.
		newV := "19.2.6"
		predV, err := PredecessorVersion(*version.MustParse("v" + newV))
		if err != nil {
			t.Fatal(err)
		}
		makeVersionFixtureAndFatal(ctx, t, c, predV, newV)
	}

	testFeaturesStep := versionUpgradeTestFeatures.step(c.All())
	schemaChangeStep := runSchemaChangeWorkloadStep(c.All().randNode()[0], 10 /* maxOps */, 2 /* concurrency */)

	// The steps below start a cluster at predecessorVersion (from a fixture),
	// then start an upgrade that is rolled back, and finally start and finalize
	// the upgrade. Between each step, we run the feature tests defined in
	// versionUpgradeTestFeatures.
	u := newVersionUpgradeTest(c,
		// Start the cluster from a fixture. That fixture's cluster version may
		// be at the predecessor version (though in practice it's fully up to
		// date, if it was created via makeVersionFixtureAndFatal above), so add
		// a waitForUpgradeStep to make sure we're upgraded all the way before
		// moving on.
		//
		// See the comment on makeVersionFixtureAndFatal for details on fixtures.
		uploadAndStartFromCheckpointFixture(c.All(), predecessorVersion),
		uploadAndInitSchemaChangeWorkload(),
		waitForUpgradeStep(c.All()),
		testFeaturesStep,

		// NB: at this point, cluster and binary version equal predecessorVersion,
		// and auto-upgrades are on.

		// We use an empty string for the version below, which means to use the
		// main ./cockroach binary (i.e. the one being tested in this run).
		// We upgrade into this version more capriciously to ensure better
		// coverage by first rolling the cluster into the new version with
		// auto-upgrade disabled, then rolling back, and then rolling forward
		// and finalizing on the auto-upgrade path.
		preventAutoUpgradeStep(1),
		// Roll nodes forward.
		binaryUpgradeStep(c.All(), ""),
		testFeaturesStep,
		// Run a quick schemachange workload in between each upgrade.
		// The maxOps is 10 to keep the test runtime under 1-2 minutes.
		schemaChangeStep,
		// Roll back again. Note that bad things would happen if the cluster had
		// ignored our request to not auto-upgrade. The `autoupgrade` roachtest
		// exercises this in more detail, so here we just rely on things working
		// as they ought to.
		binaryUpgradeStep(c.All(), predecessorVersion),
		testFeaturesStep,
		schemaChangeStep,
		// Roll nodes forward, this time allowing them to upgrade, and waiting
		// for it to happen.
		binaryUpgradeStep(c.All(), ""),
		allowAutoUpgradeStep(1),
		testFeaturesStep,
		schemaChangeStep,
		waitForUpgradeStep(c.All()),
		testFeaturesStep,
		schemaChangeStep,
	)

	u.run(ctx, t)
}

func (u *versionUpgradeTest) run(ctx context.Context, t *test) {
	defer func() {
		for _, db := range u.conns {
			_ = db.Close()
		}
	}()

	for _, step := range u.steps {
		if step != nil {
			step(ctx, t, u)
		}
	}
}

type versionUpgradeTest struct {
	goOS  string
	c     *cluster
	steps []versionStep

	// Cache conns because opening one takes hundreds of ms, and we do it quite
	// a lot.
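	// They are indexed by node: conns[i-1] holds the connection to node i, and
	// the deferred cleanup in (*versionUpgradeTest).run closes them.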
	conns []*gosql.DB
}

func newVersionUpgradeTest(c *cluster, steps ...versionStep) *versionUpgradeTest {
	return &versionUpgradeTest{
		goOS:  ifLocal(runtime.GOOS, "linux"),
		c:     c,
		steps: steps,
	}
}

func checkpointName(binaryVersion string) string { return "checkpoint-v" + binaryVersion }

// Return a cached conn to the given node. Don't call .Close(), the test harness
// will do it.
func (u *versionUpgradeTest) conn(ctx context.Context, t *test, i int) *gosql.DB {
	if u.conns == nil {
		for _, i := range u.c.All() {
			u.conns = append(u.conns, u.c.Conn(ctx, i))
		}
	}
	return u.conns[i-1]
}

func (u *versionUpgradeTest) uploadVersion(
	ctx context.Context, t *test, nodes nodeListOption, newVersion string,
) option {
	var binary string
	if newVersion == "" {
		binary = cockroach
	} else {
		var err error
		binary, err = binfetcher.Download(ctx, binfetcher.Options{
			Binary:  "cockroach",
			Version: "v" + newVersion,
			GOOS:    u.goOS,
			GOARCH:  "amd64",
		})
		if err != nil {
			t.Fatal(err)
		}
	}

	target := "./cockroach"
	if newVersion != "" {
		target += "-" + newVersion
	}
	u.c.Put(ctx, binary, target, nodes)
	return startArgs("--binary=" + target)
}

// binaryVersion returns the version of the binary running on the (one-indexed) node.
// NB: version means major.minor[-unstable]; the patch level isn't returned. For example, a binary
// of version 19.2.4 will return 19.2.
func (u *versionUpgradeTest) binaryVersion(ctx context.Context, t *test, i int) roachpb.Version {
	db := u.conn(ctx, t, i)

	var sv string
	if err := db.QueryRow(`SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil {
		t.Fatal(err)
	}

	if len(sv) == 0 {
		t.Fatal("empty version")
	}

	cv, err := roachpb.ParseVersion(sv)
	if err != nil {
		t.Fatal(err)
	}
	return cv
}

// clusterVersion returns the cluster version active on the (one-indexed) node. Note that the
// returned value might become stale due to the cluster auto-upgrading in the background plus
// gossip asynchronicity.
// NB: cluster versions are always major.minor[-unstable]; there isn't a patch level.
func (u *versionUpgradeTest) clusterVersion(ctx context.Context, t *test, i int) roachpb.Version {
	db := u.conn(ctx, t, i)

	var sv string
	if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil {
		t.Fatal(err)
	}

	cv, err := roachpb.ParseVersion(sv)
	if err != nil {
		t.Fatal(err)
	}
	return cv
}

// versionStep is an isolated version migration on a running cluster.
type versionStep func(ctx context.Context, t *test, u *versionUpgradeTest)

func uploadAndStartFromCheckpointFixture(nodes nodeListOption, v string) versionStep {
	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
		u.c.Run(ctx, nodes, "mkdir", "-p", "{store-dir}")
		vv := version.MustParse("v" + v)
		// The fixtures use cluster version (major.minor) but the input might be
		// a patch release.
		name := checkpointName(
			roachpb.Version{Major: int32(vv.Major()), Minor: int32(vv.Minor())}.String(),
		)
		for _, i := range nodes {
			u.c.Put(ctx,
				"pkg/cmd/roachtest/fixtures/"+strconv.Itoa(i)+"/"+name+".tgz",
				"{store-dir}/fixture.tgz", u.c.Node(i),
			)
		}
		// Extract fixture. Fail if there's already an LSM in the store dir.
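		// (The `[ ! -f {store-dir}/CURRENT ]` guard below makes the chained
		// command fail if a CURRENT file, and therefore an initialized store,
		// already exists in the store directory.)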
		u.c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz")

		// Put and start the binary.
		args := u.uploadVersion(ctx, t, nodes, v)
		// NB: can't start sequentially since cluster already bootstrapped.
		u.c.Start(ctx, t, nodes, args, startArgsDontEncrypt, roachprodArgOption{"--sequential=false"})
	}
}

// binaryUpgradeStep rolling-restarts the given nodes into the new binary
// version. Note that this does *not* wait for the cluster version to upgrade.
// Use a waitForUpgradeStep() for that.
func binaryUpgradeStep(nodes nodeListOption, newVersion string) versionStep {
	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
		c := u.c
		args := u.uploadVersion(ctx, t, nodes, newVersion)

		// Restart nodes in a random order; otherwise node 1 would be running all
		// the migrations and it probably also has all the leases.
		rand.Shuffle(len(nodes), func(i, j int) {
			nodes[i], nodes[j] = nodes[j], nodes[i]
		})
		for _, node := range nodes {
			t.l.Printf("restarting node %d", node)
			c.Stop(ctx, c.Node(node))
			c.Start(ctx, t, c.Node(node), args, startArgsDontEncrypt)
			t.l.Printf("node %d now running binary version %s", node, u.binaryVersion(ctx, t, node))

			// TODO(nvanbenschoten): add upgrade qualification step. What should we
			// test? We could run logictests. We could add custom logic here. Maybe
			// this should all be pushed to nightly migration tests instead.
		}
	}
}

func preventAutoUpgradeStep(node int) versionStep {
	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
		db := u.conn(ctx, t, node)
		_, err := db.ExecContext(ctx, `SET CLUSTER SETTING cluster.preserve_downgrade_option = $1`, u.binaryVersion(ctx, t, node).String())
		if err != nil {
			t.Fatal(err)
		}
	}
}

func allowAutoUpgradeStep(node int) versionStep {
	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
		db := u.conn(ctx, t, node)
		_, err := db.ExecContext(ctx, `RESET CLUSTER SETTING cluster.preserve_downgrade_option`)
		if err != nil {
			t.Fatal(err)
		}
	}
}

// waitForUpgradeStep waits for the cluster version to reach the first node's
// binary version (which is assumed to be every node's binary version). We rely
// on the cluster's internal self-upgrading mechanism.
//
// NB: this is intentionally kept separate from binaryUpgradeStep because we run
// feature tests between the steps, and we want to expose them (at least
// heuristically) to the real-world situation in which some nodes have already
// learned of a cluster version bump (from Gossip) while others haven't. This
// situation tends to exhibit unexpected behavior.
func waitForUpgradeStep(nodes nodeListOption) versionStep {
	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
		newVersion := u.binaryVersion(ctx, t, nodes[0]).String()
		t.l.Printf("%s: waiting for cluster to auto-upgrade\n", newVersion)

		for _, i := range nodes {
			err := retry.ForDuration(30*time.Second, func() error {
				currentVersion := u.clusterVersion(ctx, t, i).String()
				if currentVersion != newVersion {
					return fmt.Errorf("%d: expected version %s, got %s", i, newVersion, currentVersion)
				}
				t.l.Printf("%s: acked by n%d", currentVersion, i)
				return nil
			})
			if err != nil {
				t.Fatal(err)
			}
		}

		t.l.Printf("%s: nodes %v are upgraded\n", newVersion, nodes)

		// TODO(nvanbenschoten): add upgrade qualification step.
	}
}

type versionFeatureTest struct {
	name string
	fn   func(context.Context, *test, *versionUpgradeTest, nodeListOption) (skipped bool)
}

type versionFeatureStep []versionFeatureTest

func (vs versionFeatureStep) step(nodes nodeListOption) versionStep {
	return func(ctx context.Context, t *test, u *versionUpgradeTest) {
		for _, feature := range vs {
			t.l.Printf("checking %s", feature.name)
			tBegin := timeutil.Now()
			skipped := feature.fn(ctx, t, u, nodes)
			dur := fmt.Sprintf("%.2fs", timeutil.Since(tBegin).Seconds())
			if skipped {
				t.l.Printf("^-- skip (%s)", dur)
			} else {
				t.l.Printf("^-- ok (%s)", dur)
			}
		}
	}
}

func stmtFeatureTest(
	name string, minVersion roachpb.Version, stmt string, args ...interface{},
) versionFeatureTest {
	return versionFeatureTest{
		name: name,
		fn: func(ctx context.Context, t *test, u *versionUpgradeTest, nodes nodeListOption) (skipped bool) {
			i := nodes.randNode()[0]
			if u.clusterVersion(ctx, t, i).Less(minVersion) {
				return true // skipped
			}
			db := u.conn(ctx, t, i)
			if _, err := db.ExecContext(ctx, stmt, args...); err != nil {
				t.Fatal(err)
			}
			return false
		},
	}
}

// makeVersionFixtureAndFatal creates fixtures to "age out" old versions of CockroachDB.
// We want to test data that was created at v1.0, but we don't actually want to
// run a long chain of binaries starting all the way at v1.0. Instead, we
// periodically bake a set of store directories that originally started out on
// v1.0 and maintain it as a fixture for this test.
//
// The checkpoints will be created in the log directories downloaded as part of
// the artifacts. The test will fail on purpose when it's done with instructions
// on where to move the files.
func makeVersionFixtureAndFatal(
	ctx context.Context, t *test, c *cluster, predecessorVersion string, makeFixtureVersion string,
) {
	c.l.Printf("making fixture for %s (starting at %s)", makeFixtureVersion, predecessorVersion)
	c.encryptDefault = false
	newVersionUpgradeTest(c,
		// Start the cluster from a fixture. That fixture's cluster version may
		// be at the predecessor version (though in practice it's fully up to
		// date, if it was created via a previous run of this function), so add
		// a waitForUpgradeStep to make sure we're upgraded all the way before
		// moving on.
		//
		// See the comment on makeVersionFixtureAndFatal for details on fixtures.
		uploadAndStartFromCheckpointFixture(c.All(), predecessorVersion),
		waitForUpgradeStep(c.All()),

		// NB: at this point, cluster and binary version equal predecessorVersion,
		// and auto-upgrades are on.

		binaryUpgradeStep(c.All(), makeFixtureVersion),
		waitForUpgradeStep(c.All()),

		func(ctx context.Context, t *test, u *versionUpgradeTest) {
			// If we're taking checkpoints, momentarily stop the cluster (we
			// need to do that to get the checkpoints to reflect a
			// consistent cluster state). The binary at this point will be
			// the new one, but the cluster version was not explicitly
			// bumped, though auto-upgrade may have taken place already.
			// For example, if makeFixtureVersion is 2.1, the cluster version in
			// the store directories may be 2.0 on some stores and 2.1 on
			// the others (though if any are on 2.1, then that's what's
			// stored in system.settings).
			// This means that when we restart from that version, we're
			// going to want to use the binary mentioned in the checkpoint,
			// or at least one compatible with the *predecessor* of the
			// checkpoint version. For example, for checkpoint-2.1, the
			// cluster version might be 2.0, so we can only use the 2.0 or
			// 2.1 binary, but not the 19.1 binary (as 19.1 and 2.0 are not
			// compatible).
			name := checkpointName(u.binaryVersion(ctx, t, 1).String())
			u.c.Stop(ctx, c.All())
			c.Run(ctx, c.All(), cockroach, "debug", "rocksdb", "--db={store-dir}",
				"checkpoint", "--checkpoint_dir={store-dir}/"+name)
			c.Run(ctx, c.All(), "tar", "-C", "{store-dir}/"+name, "-czf", "{log-dir}/"+name+".tgz", ".")
			t.Fatalf(`successfully created checkpoints; failing test on purpose.

Invoke the following to move the archives to the right place and commit the
result:

for i in 1 2 3 4; do
  mkdir -p pkg/cmd/roachtest/fixtures/${i} && \
  mv artifacts/acceptance/version-upgrade/run_1/${i}.logs/checkpoint-*.tgz \
    pkg/cmd/roachtest/fixtures/${i}/
done
`)
		}).run(ctx, t)
}
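
// The sketch below is illustrative only and is not wired into any test: it
// shows how an additional mixed-version feature check could be composed from
// stmtFeatureTest and versionFeatureStep and then dropped into a
// newVersionUpgradeTest step list next to testFeaturesStep. The feature name
// and SQL statements are hypothetical placeholders, not checks this test
// actually runs.
func exampleExtraFeatureStep(c *cluster) versionStep {
	extraFeatures := versionFeatureStep{
		// Gate on v20.1, like the checks in versionUpgradeTestFeatures, so the
		// statement is skipped while the active cluster version is still older.
		stmtFeatureTest("Example DDL Smoke Test", v201, `
CREATE DATABASE IF NOT EXISTS test;
CREATE TABLE IF NOT EXISTS test.example_t (k INT PRIMARY KEY, v STRING);
DROP TABLE test.example_t;
`),
	}
	// step() returns a versionStep that runs each feature against a random node
	// and logs ok/skip, exactly as versionUpgradeTestFeatures.step does.
	return extraFeatures.step(c.All())
}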