github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/schema_change_migrations_test.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql_test

import (
	"context"
	gosql "database/sql"
	"fmt"
	"strconv"
	"strings"
	"sync/atomic"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/sqltestutils"
	"github.com/cockroachdb/cockroach/pkg/sql/tests"
	"github.com/cockroachdb/cockroach/pkg/sqlmigrations"
	"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

type BlockState int

// These are the states at which we want to block the 19.2-style schema change
// and ensure that it can be migrated properly when it is in that state.
const (
	BeforeBackfill BlockState = iota
	AfterBackfill
	AfterReversingMutations // Only used if the job was canceled.
	WaitingForGC            // Only applies to DROP INDEX, DROP TABLE, TRUNCATE TABLE.
)

type SchemaChangeType int

const (
	AddColumn SchemaChangeType = iota
	DropColumn
	CreateIndex
	DropIndex
	AddConstraint
	DropConstraint
	CreateTable
	DropTable
	TruncateTable
)

const setup = `
CREATE DATABASE t;
USE t;
CREATE TABLE test (k INT PRIMARY KEY, v INT, INDEX k_idx (k), CONSTRAINT k_cons CHECK (k > 0));
INSERT INTO test VALUES (1, 2);
`

// runsBackfill is a set of schema change types that run a backfill.
var runsBackfill = map[SchemaChangeType]bool{
	AddColumn:   true,
	DropColumn:  true,
	CreateIndex: true,
	DropIndex:   true,
}

func isDeletingTable(schemaChangeType SchemaChangeType) bool {
	return schemaChangeType == TruncateTable || schemaChangeType == DropTable
}

func checkBlockedSchemaChange(
	t *testing.T, runner *sqlutils.SQLRunner, testCase migrationTestCase,
) {
	if testCase.blockState == WaitingForGC {
		// Earlier we turned the 20.1 GC job into a 19.2 schema change job. Delete
		// the original schema change job, which has now succeeded, to avoid having
		// special cases later, since we rely heavily on the index of the job row
		// in the jobs table when verifying a job.
		//
		// First, though, we have to actually wait for the original job to become
		// Succeeded.
		runner.CheckQueryResultsRetry(t,
			"SELECT count(*) FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE' AND status = 'succeeded'",
			[][]string{{"1"}},
		)
		rows := runner.QueryStr(
			t,
			"SELECT * FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE' AND status = 'succeeded'",
		)
		jobID, _ := strconv.Atoi(rows[0][0])
		runner.Exec(t, "DELETE FROM system.jobs WHERE id = $1", jobID)
	}

	oldVersion := jobutils.GetJobFormatVersion(t, runner)
	require.Equal(t, jobspb.BaseFormatVersion, oldVersion)
	expStatus := jobs.StatusRunning
	if testCase.shouldCancel {
		expStatus = jobs.StatusReverting
	}
	if err := jobutils.VerifySystemJob(t, runner, 0, jobspb.TypeSchemaChange, expStatus, jobs.Record{
		Description:   testCase.schemaChange.query,
		Username:      security.RootUser,
		DescriptorIDs: getTableIDsUnderTest(testCase.schemaChange.kind),
	}); err != nil {
		t.Fatal(err)
	}

	if !hadJobInOldVersion(testCase.schemaChange.kind) {
		// Delete the job if this schema change type did not have a job in 19.2.
		rows := runner.QueryStr(t, "SELECT * FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE'")
		for _, job := range rows {
			jobID, _ := strconv.Atoi(job[0])
			runner.Exec(t, "DELETE FROM system.jobs WHERE id = $1", jobID)
		}
	}
}

type schemaChangeRequest struct {
	kind  SchemaChangeType
	query string
}

type migrationTestCase struct {
	blockState   BlockState
	shouldCancel bool
	schemaChange schemaChangeRequest
}

// testSchemaChangeMigrations tests that a schema change can be migrated after
// being blocked in a certain state.
//
// 1. Create a 20.1 schema change.
// 2. Block the schema change at a certain point in its execution.
// 3. Mutate the job descriptor and table descriptor such that it appears as a
// 19.2-format job. These jobs will not be resumed anymore, as 20.1 will refuse
// to run 19.2 jobs.
// 4. Verify that the job has been marked as a 19.2 job and is blocked.
// 5. Run the migration and wait for the migration to complete.
// 6. Ensure that the schema change completes.
func testSchemaChangeMigrations(t *testing.T, testCase migrationTestCase) {
	ctx := context.Background()
	shouldSignalMigration := int32(0)
	blockFnErrChan := make(chan error, 1)
	revMigrationDoneCh, signalRevMigrationDone := makeSignal()
	migrationDoneCh, signalMigrationDone := makeCondSignal(&shouldSignalMigration)
	runner, sqlDB, tc := setupServerAndStartSchemaChange(
		t,
		blockFnErrChan,
		testCase,
		signalRevMigrationDone,
		signalMigrationDone,
	)

	defer tc.Stopper().Stop(context.Background())
	defer sqltestutils.DisableGCTTLStrictEnforcement(t, sqlDB)()

	log.Info(ctx, "waiting for all schema changes to block")
	<-revMigrationDoneCh
	log.Info(ctx, "all schema changes have blocked")

	close(blockFnErrChan)
	for err := range blockFnErrChan {
		if err != nil {
			t.Fatalf("%+v", err)
		}
	}

	checkBlockedSchemaChange(t, runner, testCase)

	// Start the migrations.
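	// Flipping shouldSignalMigration before triggering the migration means the
	// AfterJobMigration testing knob (wired up in setupTestingKnobs) only
	// signals migrationDoneCh for the run started below, not for any migration
	// run during cluster startup.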
	log.Info(ctx, "starting job migration")
	atomic.StoreInt32(&shouldSignalMigration, 1)
	migMgr := tc.Server(0).MigrationManager().(*sqlmigrations.Manager)
	if err := migMgr.StartSchemaChangeJobMigration(ctx); err != nil {
		t.Fatal(err)
	}

	log.Info(ctx, "waiting for migration to complete")
	<-migrationDoneCh

	// TODO(pbardea): SHOW JOBS WHEN COMPLETE SELECT does not work on some schema
	// changes when canceling jobs, but querying until there are no jobs works.
	//runner.Exec(t, "SHOW JOBS WHEN COMPLETE SELECT job_id FROM [SHOW JOBS] WHERE (job_type = 'SCHEMA CHANGE' OR job_type = 'SCHEMA CHANGE GC')")
	// Wait until there are no more running schema changes.
	log.Info(ctx, "waiting for new schema change jobs to complete")
	runner.CheckQueryResultsRetry(t, "SELECT * FROM [SHOW JOBS] WHERE (job_type = 'SCHEMA CHANGE' OR job_type = 'SCHEMA CHANGE GC') AND NOT (status = 'succeeded' OR status = 'canceled')", [][]string{})
	log.Info(ctx, "done running new schema change jobs")

	verifySchemaChangeJobRan(t, runner, testCase)
}

func makeCondSignal(shouldSignal *int32) (chan struct{}, func()) {
	signalCh := make(chan struct{})
	signalFn := func() {
		if atomic.LoadInt32(shouldSignal) == 1 {
			signalCh <- struct{}{}
		}
	}
	return signalCh, signalFn
}

func makeSignal() (chan struct{}, func()) {
	alwaysSignal := int32(1)
	return makeCondSignal(&alwaysSignal)
}

func setupServerAndStartSchemaChange(
	t *testing.T,
	errCh chan error,
	testCase migrationTestCase,
	revMigrationDone, signalMigrationDone func(),
) (*sqlutils.SQLRunner, *gosql.DB, serverutils.TestClusterInterface) {
	clusterSize := 3
	params, _ := tests.CreateTestServerParams()

	var runner *sqlutils.SQLRunner
	var kvDB *kv.DB
	var registry *jobs.Registry

	blockSchemaChanges := false

	migrateJob := func(jobID int64) {
		if testCase.blockState == WaitingForGC {
			if err := migrateGCJobToOldFormat(kvDB, registry, jobID, testCase.schemaChange.kind); err != nil {
				errCh <- err
			}
		} else {
			if err := migrateJobToOldFormat(kvDB, registry, jobID, testCase.schemaChange.kind); err != nil {
				errCh <- err
			}
		}
	}
	cancelJob := func(jobID int64) {
		runner.Exec(t, `CANCEL JOB (
			SELECT job_id FROM [SHOW JOBS]
			WHERE
				job_id = $1
		)`, jobID)
	}

	setupTestingKnobs(t, testCase, &params, &blockSchemaChanges, revMigrationDone, signalMigrationDone, migrateJob, cancelJob)

	tc := serverutils.StartTestCluster(t, clusterSize,
		base.TestClusterArgs{
			ReplicationMode: base.ReplicationManual,
			ServerArgs:      params,
		})
	sqlDB := tc.ServerConn(0)
	kvDB = tc.Server(0).DB()
	runner = sqlutils.MakeSQLRunner(sqlDB)
	registry = tc.Server(0).JobRegistry().(*jobs.Registry)

	ctx, cancel := context.WithCancel(context.Background())

	if _, err := sqlDB.Exec(setup); err != nil {
		t.Fatal(err)
	}

	runner.CheckQueryResultsRetry(t, "SELECT count(*) FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE' AND NOT (status = 'succeeded' OR status = 'canceled')", [][]string{{"0"}})
	blockSchemaChanges = true

	bg := ctxgroup.WithContext(ctx)
	bg.Go(func() error {
		if _, err := sqlDB.ExecContext(ctx, testCase.schemaChange.query); err != nil {
			cancel()
			return err
		}
		return nil
	})
	// TODO(pbardea): Remove this magic 53.
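	// (The descriptor ID 53 corresponds to t.test created by the setup
	// statements; it matches the IDs hard-coded in getTableIDsUnderTest and
	// migrateGCJobToOldFormat below.)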
	if _, err := sqltestutils.AddImmediateGCZoneConfig(sqlDB, sqlbase.ID(53)); err != nil {
		t.Fatal(err)
	}
	return runner, sqlDB, tc
}

// migrateJobToOldFormat updates the state of a job and table descriptor from
// its 20.1 to its 19.2 representation. There is a separate implementation for
// GC jobs.
func migrateJobToOldFormat(
	kvDB *kv.DB, registry *jobs.Registry, jobID int64, schemaChangeType SchemaChangeType,
) error {
	ctx := context.Background()

	tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
	if schemaChangeType == CreateTable {
		tableDesc = sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "new_table")
	}

	if err := kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		job, err := registry.LoadJobWithTxn(ctx, jobID, txn)
		if err != nil {
			return err
		}
		return job.WithTxn(txn).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error {
			details := job.Details().(jobspb.SchemaChangeDetails)
			// Explicitly zero out these fields as they will be set to their 0 value
			// on 19.2 nodes.
			details.TableID = 0
			details.MutationID = 0
			details.FormatVersion = jobspb.BaseFormatVersion
			if isDeletingTable(schemaChangeType) {
				details.DroppedTables = []jobspb.DroppedTableDetails{
					{
						Name:   tableDesc.Name,
						ID:     tableDesc.ID,
						Status: jobspb.Status_DRAINING_NAMES,
					},
				}
			}

			progress := job.Progress()
			// TODO(pbardea): Probably want to change this to check on block state
			// being draining names.
			if isDeletingTable(schemaChangeType) {
				progress.RunningStatus = string(sql.RunningStatusDrainingNames)
			}

			md.Payload.Lease = nil
			md.Payload.Details = jobspb.WrapPayloadDetails(details)
			md.Progress = &progress
			ju.UpdatePayload(md.Payload)
			ju.UpdateProgress(md.Progress)
			return nil
		})
	}); err != nil {
		return err
	}

	// Update the table descriptor.
	tableDesc.Lease = &sqlbase.TableDescriptor_SchemaChangeLease{
		ExpirationTime: timeutil.Now().UnixNano(),
		NodeID:         roachpb.NodeID(0),
	}
	if schemaChangeType == TruncateTable {
		tableDesc.DropJobID = jobID
		// TODO(pbardea): When is drop time populated?
	}

	// Write the table descriptor back.
	return kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
			keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
		)
	})
}

// migrateGCJobToOldFormat converts a GC job created in 20.1 into a 19.2-style
// schema change job that is waiting for GC. This involves changing the type of
// the job details and progress.
//
// We could have gone back and set the original schema change job to Running,
// but then we'd have to update that job from inside the GC job testing knob
// function, which seems risky since we have no way of controlling that schema
// change job once it's eligible to be adopted.
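//
// Unlike migrateJobToOldFormat, this function starts from the GC job (the
// original schema change job has already succeeded), so it builds a fresh
// SchemaChangeDetails rather than modifying the job's existing details.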
func migrateGCJobToOldFormat(
	kvDB *kv.DB, registry *jobs.Registry, jobID int64, schemaChangeType SchemaChangeType,
) error {
	ctx := context.Background()

	if err := kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		job, err := registry.LoadJobWithTxn(ctx, jobID, txn)
		if err != nil {
			return err
		}
		return job.WithTxn(txn).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error {
			// Replace the details with an entirely new SchemaChangeDetails.
			details := jobspb.SchemaChangeDetails{
				FormatVersion: jobspb.BaseFormatVersion,
			}
			if isDeletingTable(schemaChangeType) {
				details.DroppedTables = []jobspb.DroppedTableDetails{
					{
						// TODO (lucy): Stop hard-coding these if possible. We can't get
						// these values from the table descriptor if we're dropping the
						// table, since at this point the table descriptor would have been
						// deleted.
						Name:   "test",
						ID:     53,
						Status: jobspb.Status_WAIT_FOR_GC_INTERVAL,
					},
				}
			}

			progress := jobspb.Progress{
				Details:       jobspb.WrapProgressDetails(jobspb.SchemaChangeProgress{}),
				RunningStatus: string(sql.RunningStatusWaitingGC),
			}

			md.Payload.Lease = nil
			md.Payload.Description = strings.TrimPrefix(md.Payload.Description, "GC for ")
			md.Payload.Details = jobspb.WrapPayloadDetails(details)
			md.Progress = &progress
			ju.UpdatePayload(md.Payload)
			ju.UpdateProgress(md.Progress)
			return nil
		})
	}); err != nil {
		return err
	}

	switch schemaChangeType {
	case DropTable:
		// There's no table descriptor to update, so we're done.
		return nil

	case DropIndex:
		tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
		if l := len(tableDesc.GCMutations); l != 1 {
			return errors.AssertionFailedf("expected exactly 1 GCMutation, found %d", l)
		}

		// Update the table descriptor.
		tableDesc.Lease = &sqlbase.TableDescriptor_SchemaChangeLease{
			ExpirationTime: timeutil.Now().UnixNano(),
			NodeID:         roachpb.NodeID(0),
		}

		tableDesc.GCMutations[0].JobID = jobID
		tableDesc.GCMutations[0].DropTime = timeutil.Now().UnixNano()

		// Write the table descriptor back.
		return kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			if err := txn.SetSystemConfigTrigger(); err != nil {
				return err
			}
			return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
				keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
			)
		})
	default:
		return errors.Errorf("invalid schema change type: %d", schemaChangeType)
	}
}

// Set up server testing args such that knobs are set to block and abandon any
// given schema change at a certain point. The "blocked" channel will be
// signaled when the schema change gets abandoned.
// The runner should only be used inside callback closures.
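// Note that blockFn and cancelFn below "block" a job by returning a retryable
// error from the testing knob rather than by blocking the goroutine.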
func setupTestingKnobs(
	t *testing.T,
	testCase migrationTestCase,
	args *base.TestServerArgs,
	blockSchemaChanges *bool,
	revMigrationDone, signalMigrationDone func(),
	migrateJob, cancelJob func(int64),
) {
	numJobs := 1
	if testCase.schemaChange.kind == CreateTable {
		numJobs = 2
	}
	var (
		mu                   syncutil.Mutex
		migratedCount        int
		doneReverseMigration bool
		ranCancelCommand     bool
		hasCanceled          bool
	)

	blockFn := func(jobID int64) error {
		mu.Lock()
		defer mu.Unlock()
		if !(*blockSchemaChanges) {
			return nil
		}

		// In the case we're canceling the job, this blockFn should only be called
		// after the OnFailOrCancel hook is called. At this point we know that the
		// job is actually canceled.
		hasCanceled = true

		if doneReverseMigration {
			// Already migrated all the jobs that we want to migrate to 19.2.
			// New jobs created after we migrated the original batch should be allowed
			// to continue.
			return nil
		} else {
			migrateJob(jobID)
			migratedCount++
		}

		if migratedCount == numJobs {
			doneReverseMigration = true
			revMigrationDone()
		}

		// Return a retryable error so that the job doesn't make any progress past
		// this point. It should not get adopted since it has been marked as a 19.2
		// job.
		return jobs.NewRetryJobError("stop this job until cluster upgrade")
	}

	cancelFn := func(jobID int64) error {
		mu.Lock()
		defer mu.Unlock()
		if hasCanceled {
			// The job has already been successfully canceled.
			return nil
		}

		if !ranCancelCommand {
			cancelJob(jobID)
			ranCancelCommand = true
		}

		// Don't allow the job to progress further than this knob until it has
		// actually been canceled.
		return jobs.NewRetryJobError("retry until canceled")
	}

	knobs := &sql.SchemaChangerTestingKnobs{}
	gcKnobs := &sql.GCJobTestingKnobs{}

	shouldCancel := testCase.shouldCancel
	if shouldCancel {
		if runsBackfill[testCase.schemaChange.kind] {
			knobs.RunAfterBackfill = cancelFn
		} else {
			knobs.RunBeforeResume = cancelFn
		}
	}

	switch testCase.blockState {
	case BeforeBackfill:
		if shouldCancel {
			knobs.RunBeforeOnFailOrCancel = blockFn
		} else {
			knobs.RunBeforeResume = blockFn
		}
	case AfterBackfill:
		if shouldCancel {
			// This is a special case where (1) RunAfterBackfill within Resume() needs
			// to call cancelFn() to cancel the job, (2) RunBeforeOnFailOrCancel needs
			// to set hasCanceled, and (3) RunAfterBackfill, running for the 2nd time
			// within OnFailOrCancel(), needs to read the value of hasCanceled (which
			// is true) and run blockFn().
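			// Note that the RunAfterBackfill closure below snapshots hasCanceled and
			// releases mu before delegating, since blockFn and cancelFn acquire mu
			// themselves.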
			knobs.RunBeforeOnFailOrCancel = func(jobID int64) error {
				mu.Lock()
				defer mu.Unlock()
				hasCanceled = true
				return nil
			}
			knobs.RunAfterBackfill = func(jobID int64) error {
				mu.Lock()
				hasCanceled := hasCanceled
				mu.Unlock()
				if hasCanceled {
					return blockFn(jobID)
				} else {
					return cancelFn(jobID)
				}
			}
		} else {
			knobs.RunAfterBackfill = blockFn
		}
	case AfterReversingMutations:
		if !shouldCancel {
			t.Fatal("can only block after reversing mutations if the job is expected to be canceled")
		}
		knobs.RunAfterBackfill = cancelFn
		knobs.RunAfterMutationReversal = blockFn
	case WaitingForGC:
		if shouldCancel {
			t.Fatal("cannot block on waiting for GC if the job should also be canceled")
		}
		gcKnobs.RunBeforeResume = blockFn
	}

	args.Knobs.SQLSchemaChanger = knobs
	args.Knobs.SQLMigrationManager = &sqlmigrations.MigrationManagerTestingKnobs{
		AfterJobMigration:     signalMigrationDone,
		AlwaysRunJobMigration: true,
	}
	args.Knobs.GCJob = gcKnobs
}

func getTestName(schemaChange SchemaChangeType, blockState BlockState, shouldCancel bool) string {
	stateNames := map[BlockState]string{
		BeforeBackfill:          "before-backfill",
		AfterBackfill:           "after-backfill",
		AfterReversingMutations: "after-reversing-mutations",
		WaitingForGC:            "waiting-for-gc",
	}
	schemaChangeName := map[SchemaChangeType]string{
		AddColumn:      "add-column",
		DropColumn:     "drop-column",
		CreateIndex:    "add-index",
		DropIndex:      "drop-index",
		AddConstraint:  "add-constraint",
		DropConstraint: "drop-constraint",
		CreateTable:    "create-table",
		TruncateTable:  "truncate-table",
		DropTable:      "drop-table",
	}

	testName := fmt.Sprintf("%s-blocked-at-%s", schemaChangeName[schemaChange], stateNames[blockState])
	if shouldCancel {
		testName += "-canceled"
	}
	return testName
}

func verifySchemaChangeJobRan(
	t *testing.T, runner *sqlutils.SQLRunner, testCase migrationTestCase,
) {
	expStatus := jobs.StatusSucceeded
	description := testCase.schemaChange.query
	if testCase.shouldCancel {
		expStatus = jobs.StatusCanceled
	}
	if testCase.schemaChange.kind == CreateTable {
		description = "adding table 54"
	} else {
		if err := jobutils.VerifySystemJob(t, runner, 0, jobspb.TypeSchemaChange, expStatus, jobs.Record{
			Description:   description,
			Username:      security.RootUser,
			DescriptorIDs: getTableIDsUnderTest(testCase.schemaChange.kind),
		}); err != nil {
			t.Fatal(err)
		}
	}

	// Verify that the GC job exists and is in the correct state, if applicable.
	if testCase.blockState == WaitingForGC {
		if err := jobutils.VerifySystemJob(t, runner, 0, jobspb.TypeSchemaChangeGC, jobs.StatusSucceeded, jobs.Record{
			Description:   "GC for " + description,
			Username:      security.RootUser,
			DescriptorIDs: getTableIDsUnderTest(testCase.schemaChange.kind),
		}); err != nil {
			t.Fatal(err)
		}
	} else {
		// For non-GC jobs, verify that the schema change job format version was
		// updated.
		newVersion := jobutils.GetJobFormatVersion(t, runner)
		require.Equal(t, jobspb.JobResumerFormatVersion, newVersion)
	}

	var expected [][]string
	didCancel := testCase.shouldCancel
	switch testCase.schemaChange.kind {
	case AddColumn:
		if didCancel {
			expected = [][]string{{"1", "2"}}
		} else {
			expected = [][]string{{"1", "2", "NULL"}}
		}
		rows := runner.QueryStr(t, "SELECT * FROM t.test")
		require.Equal(t, expected, rows)
	case DropColumn:
		if didCancel {
			expected = [][]string{{"1", "NULL"}}
		} else {
			expected = [][]string{{"1"}}
		}
		rows := runner.QueryStr(t, "SELECT * FROM t.test")
		require.Equal(t, expected, rows)
	case CreateIndex:
		if didCancel {
			expected = [][]string{{"primary"}, {"k_idx"}}
		} else {
			expected = [][]string{{"primary"}, {"k_idx"}, {"v_idx"}}
		}
		rows := runner.QueryStr(t, "SELECT DISTINCT index_name FROM [SHOW INDEXES FROM t.test]")
		require.Equal(t, expected, rows)
	case DropIndex:
		if didCancel {
			expected = [][]string{{"primary"}, {"k_idx"}}
		} else {
			expected = [][]string{{"primary"}}
		}
		rows := runner.QueryStr(t, "SELECT DISTINCT index_name FROM [SHOW INDEXES FROM t.test]")
		require.Equal(t, expected, rows)
	case AddConstraint:
		if didCancel {
			expected = [][]string{{"k_cons"}, {"primary"}}
		} else {
			expected = [][]string{{"k_cons"}, {"primary"}, {"v_unq"}}
		}
		rows := runner.QueryStr(t, "SELECT constraint_name FROM [SHOW CONSTRAINTS FROM t.test] ORDER BY constraint_name")
		require.Equal(t, expected, rows)
	case DropConstraint:
		if didCancel {
			expected = [][]string{{"k_cons"}, {"primary"}}
		} else {
			expected = [][]string{{"primary"}}
		}
		rows := runner.QueryStr(t, "SELECT constraint_name FROM [SHOW CONSTRAINTS FROM t.test] ORDER BY constraint_name")
		require.Equal(t, expected, rows)
	case CreateTable:
		if didCancel {
			t.Fatal("cannot cancel create table")
		} else {
			expected = [][]string{{"new_table"}, {"test"}}
		}
		rows := runner.QueryStr(t, "SELECT table_name FROM [SHOW TABLES FROM t] ORDER BY table_name")
		require.Equal(t, expected, rows)
	case TruncateTable:
		if didCancel {
			expected = [][]string{{"0"}}
		} else {
			expected = [][]string{{"0"}}
		}
		rows := runner.QueryStr(t, "SELECT count(*) FROM t.test")
		require.Equal(t, expected, rows)
	case DropTable:
		// Canceling after the backfill has no effect.
		expected = [][]string{}
		rows := runner.QueryStr(t, "SELECT table_name FROM [SHOW TABLES FROM t] ORDER BY table_name")
		require.Equal(t, expected, rows)
	}
}

func getTableIDsUnderTest(schemaChangeType SchemaChangeType) []sqlbase.ID {
	tableID := sqlbase.ID(53)
	if schemaChangeType == CreateTable {
		tableID = sqlbase.ID(54)
	}
	return []sqlbase.ID{tableID}
}

// Helpers used to determine valid test cases.

// canBlockIfCanceled returns whether a certain state (where we want to block
// the schema change) will be reached, given whether or not the job was
// canceled.
func canBlockIfCanceled(blockState BlockState, shouldCancel bool) bool {
	// Some states are only reachable depending on whether the job was canceled.
	if blockState == WaitingForGC {
		return !shouldCancel
	}
	if blockState == AfterReversingMutations {
		return shouldCancel
	}
	return true
}

// validBlockStateForSchemaChange ensures that the given schema change actually
// passes through the state where we're proposing to block.
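// For example, DROP CONSTRAINT does not run a backfill (see runsBackfill), so
// there is no after-backfill state at which to block it.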
func validBlockStateForSchemaChange(blockState BlockState, schemaChangeType SchemaChangeType) bool {
	switch blockState {
	case AfterBackfill:
		return runsBackfill[schemaChangeType]
	case WaitingForGC:
		return schemaChangeType == DropIndex || schemaChangeType == DropTable
	}
	return true
}

// hadJobInOldVersion returns whether a given schema change had a job in 19.2.
// Schema changes without a 19.2 job cannot be canceled in these tests.
func hadJobInOldVersion(schemaChangeType SchemaChangeType) bool {
	return schemaChangeType != CreateTable
}

func TestMigrateSchemaChanges(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer setTestJobsAdoptInterval()()

	blockStates := []BlockState{
		BeforeBackfill,
		AfterBackfill,
		AfterReversingMutations,
		WaitingForGC,
	}

	schemaChanges := []schemaChangeRequest{
		{
			CreateTable,
			"CREATE TABLE t.public.new_table (k INT8, FOREIGN KEY (k) REFERENCES t.public.test (k))",
		},
		{
			AddColumn,
			"ALTER TABLE t.public.test ADD COLUMN foo INT8",
		},
		{
			DropColumn,
			"ALTER TABLE t.public.test DROP COLUMN v",
		},
		{
			CreateIndex,
			"CREATE INDEX v_idx ON t.public.test (v)",
		},
		{
			DropIndex,
			"DROP INDEX t.public.test@k_idx",
		},
		{
			AddConstraint,
			"ALTER TABLE t.public.test ADD CONSTRAINT v_unq UNIQUE (v)",
		},
		{
			DropConstraint,
			"ALTER TABLE t.public.test DROP CONSTRAINT k_cons",
		},
		{
			TruncateTable,
			"TRUNCATE TABLE t.public.test",
		},
		{
			DropTable,
			"DROP TABLE t.public.test",
		},
	}

	for _, schemaChange := range schemaChanges {
		for _, blockState := range blockStates {
			for _, shouldCancel := range []bool{true, false} {
				blockState := blockState
				shouldCancel := shouldCancel

				// Rollbacks of DROP CONSTRAINT are broken. See #47323.
				if schemaChange.kind == DropConstraint && shouldCancel {
					continue
				}
				if !canBlockIfCanceled(blockState, shouldCancel) {
					continue
				}
				if !validBlockStateForSchemaChange(blockState, schemaChange.kind) {
					continue
				}
				if shouldCancel && !hadJobInOldVersion(schemaChange.kind) {
					continue
				}

				t.Run(getTestName(schemaChange.kind, blockState, shouldCancel), func(t *testing.T) {
					testCase := migrationTestCase{
						blockState:   blockState,
						shouldCancel: shouldCancel,
						schemaChange: schemaChange,
					}
					testSchemaChangeMigrations(t, testCase)
				})
			}
		}
	}
}

// TestGCJobCreated tests that a table descriptor in the DROP state with no
// running job has a GC job created for it.
func TestGCJobCreated(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer setTestJobsAdoptInterval()()
	params, _ := tests.CreateTestServerParams()
	params.Knobs.SQLMigrationManager = &sqlmigrations.MigrationManagerTestingKnobs{
		AlwaysRunJobMigration: true,
	}
	s, sqlDB, kvDB := serverutils.StartServer(t, params)
	defer s.Stopper().Stop(context.Background())
	ctx := context.Background()
	sqlRunner := sqlutils.MakeSQLRunner(sqlDB)

	// Create a table and then force it to be in the DROP state.
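	// The DROP state is forced by editing the descriptor directly rather than
	// by running DROP TABLE, so the dropped table ends up with no associated
	// job, which is the situation the migration is expected to repair.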
	if _, err := sqlDB.Exec(`CREATE DATABASE t; CREATE TABLE t.test();`); err != nil {
		t.Fatal(err)
	}
	tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
	tableDesc.State = sqlbase.TableDescriptor_DROP
	tableDesc.Version++
	tableDesc.DropTime = 1
	if err := kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		if err := txn.SetSystemConfigTrigger(); err != nil {
			return err
		}
		if err := sqlbase.RemoveObjectNamespaceEntry(
			ctx, txn, keys.SystemSQLCodec, tableDesc.ID, tableDesc.ParentID, tableDesc.Name, false, /* kvTrace */
		); err != nil {
			return err
		}
		return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
			keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
		)
	}); err != nil {
		t.Fatal(err)
	}

	// Run the migration.
	migMgr := s.MigrationManager().(*sqlmigrations.Manager)
	if err := migMgr.StartSchemaChangeJobMigration(ctx); err != nil {
		t.Fatal(err)
	}

	// Check that a GC job was created and completed successfully.
	sqlRunner.CheckQueryResultsRetry(t,
		"SELECT count(*) FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE GC' AND status = 'succeeded'",
		[][]string{{"1"}},
	)
}

// TestMissingMutation tests that a malformed table descriptor with a
// MutationJob but no Mutation for the given job causes the job to fail with an
// error. Regression test for #48786.
func TestMissingMutation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer setTestJobsAdoptInterval()()
	schemaChangeBlocked, descriptorUpdated := make(chan struct{}), make(chan struct{})
	migratedJob := false
	var schemaChangeJobID int64
	params, _ := tests.CreateTestServerParams()
	params.Knobs.SQLMigrationManager = &sqlmigrations.MigrationManagerTestingKnobs{
		AlwaysRunJobMigration: true,
	}
	params.Knobs.SQLSchemaChanger = &sql.SchemaChangerTestingKnobs{
		RunBeforeResume: func(jobID int64) error {
			if !migratedJob {
				migratedJob = true
				schemaChangeJobID = jobID
				close(schemaChangeBlocked)
			}

			<-descriptorUpdated
			return jobs.NewRetryJobError("stop this job until cluster upgrade")
		},
	}
	s, sqlDB, kvDB := serverutils.StartServer(t, params)
	ctx, cancel := context.WithCancel(context.Background())
	defer s.Stopper().Stop(ctx)
	registry := s.JobRegistry().(*jobs.Registry)

	_, err := sqlDB.Exec(`CREATE DATABASE t; CREATE TABLE t.test(k INT PRIMARY KEY, v INT);`)
	require.NoError(t, err)

	bg := ctxgroup.WithContext(ctx)
	// Start a schema change on the table in a separate goroutine.
	bg.Go(func() error {
		if _, err := sqlDB.ExecContext(ctx, `ALTER TABLE t.test ADD COLUMN a INT;`); err != nil {
			cancel()
			return err
		}
		return nil
	})

	<-schemaChangeBlocked

	// Rewrite the job to be a 19.2-style job.
	require.NoError(t, migrateJobToOldFormat(kvDB, registry, schemaChangeJobID, AddColumn))

	// To get the table descriptor into the (invalid) state we're trying to test,
	// clear the mutations on the table descriptor.
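	// The descriptor's MutationJobs entry for this job is left in place, so the
	// migrated schema change job should fail with the "mutation not found"
	// error asserted at the end of the test.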
	tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
	tableDesc.Mutations = nil
	require.NoError(
		t, kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			if err := txn.SetSystemConfigTrigger(); err != nil {
				return err
			}
			return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
				keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
			)
		}),
	)

	// Run the migration.
	migMgr := s.MigrationManager().(*sqlmigrations.Manager)
	require.NoError(t, migMgr.StartSchemaChangeJobMigration(ctx))

	close(descriptorUpdated)

	err = bg.Wait()
	require.Regexp(t, fmt.Sprintf("mutation %d not found for MutationJob %d", 1, schemaChangeJobID), err)
}