github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/schema_change_migrations_test.go

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql_test
    12  
    13  import (
    14  	"context"
    15  	gosql "database/sql"
    16  	"fmt"
    17  	"strconv"
    18  	"strings"
    19  	"sync/atomic"
    20  	"testing"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/base"
    23  	"github.com/cockroachdb/cockroach/pkg/jobs"
    24  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    25  	"github.com/cockroachdb/cockroach/pkg/keys"
    26  	"github.com/cockroachdb/cockroach/pkg/kv"
    27  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    28  	"github.com/cockroachdb/cockroach/pkg/security"
    29  	"github.com/cockroachdb/cockroach/pkg/sql"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/sqltestutils"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/tests"
    33  	"github.com/cockroachdb/cockroach/pkg/sqlmigrations"
    34  	"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
    35  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    36  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    37  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    38  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    39  	"github.com/cockroachdb/cockroach/pkg/util/log"
    40  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    41  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    42  	"github.com/cockroachdb/errors"
    43  	"github.com/stretchr/testify/require"
    44  )
    45  
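         // BlockState identifies the point in a schema change's execution at which
         // the test blocks the job.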
    46  type BlockState int
    47  
     48  // These are the states in which we want to block the 19.2-style schema change
     49  // so that we can ensure it is migrated properly from each of them.
    50  const (
    51  	BeforeBackfill BlockState = iota
    52  	AfterBackfill
    53  	AfterReversingMutations // Only used if the job was canceled.
    54  	WaitingForGC            // Only applies to DROP INDEX, DROP TABLE, TRUNCATE TABLE.
    55  )
    56  
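         // SchemaChangeType identifies the kind of schema change statement under test.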
    57  type SchemaChangeType int
    58  
    59  const (
    60  	AddColumn SchemaChangeType = iota
    61  	DropColumn
    62  	CreateIndex
    63  	DropIndex
    64  	AddConstraint
    65  	DropConstraint
    66  	CreateTable
    67  	DropTable
    68  	TruncateTable
    69  )
    70  
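         // setup is the initial SQL that every test case runs: it creates the test
         // table with a secondary index and a check constraint and inserts one row.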
    71  const setup = `
    72  CREATE DATABASE t;
    73  USE t;
    74  CREATE TABLE test (k INT PRIMARY KEY, v INT, INDEX k_idx (k), CONSTRAINT k_cons CHECK (k > 0));
    75  INSERT INTO test VALUES (1, 2);
    76  `
    77  
    78  // runsBackfill is a set of schema change types that run a backfill.
    79  var runsBackfill = map[SchemaChangeType]bool{
    80  	AddColumn:   true,
    81  	DropColumn:  true,
    82  	CreateIndex: true,
    83  	DropIndex:   true,
    84  }
    85  
    86  func isDeletingTable(schemaChangeType SchemaChangeType) bool {
    87  	return schemaChangeType == TruncateTable || schemaChangeType == DropTable
    88  }
    89  
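         // checkBlockedSchemaChange verifies that the blocked job has been converted
         // to the 19.2 format and is in the expected state (running, or reverting if
         // the test cancels the job).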
    90  func checkBlockedSchemaChange(
    91  	t *testing.T, runner *sqlutils.SQLRunner, testCase migrationTestCase,
    92  ) {
    93  	if testCase.blockState == WaitingForGC {
    94  		// Earlier we turned the 20.1 GC job into a 19.2 schema change job. Delete
     95  // the original schema change job, which has now succeeded, to avoid having
    96  		// special cases later, since we rely heavily on the index of the job row in
    97  		// the jobs table when verifying a job.
    98  		//
    99  		// First, though, we have to actually wait for the original job to become
   100  		// Succeeded.
   101  		runner.CheckQueryResultsRetry(t,
   102  			"SELECT count(*) FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE' AND status = 'succeeded'",
   103  			[][]string{{"1"}},
   104  		)
   105  		rows := runner.QueryStr(
   106  			t,
   107  			"SELECT * FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE' AND status = 'succeeded'",
   108  		)
   109  		jobID, _ := strconv.Atoi(rows[0][0])
   110  		runner.Exec(t, "DELETE FROM system.jobs WHERE id = $1", jobID)
   111  	}
   112  
   113  	oldVersion := jobutils.GetJobFormatVersion(t, runner)
   114  	require.Equal(t, jobspb.BaseFormatVersion, oldVersion)
   115  	expStatus := jobs.StatusRunning
   116  	if testCase.shouldCancel {
   117  		expStatus = jobs.StatusReverting
   118  	}
   119  	if err := jobutils.VerifySystemJob(t, runner, 0, jobspb.TypeSchemaChange, expStatus, jobs.Record{
   120  		Description:   testCase.schemaChange.query,
   121  		Username:      security.RootUser,
   122  		DescriptorIDs: getTableIDsUnderTest(testCase.schemaChange.kind),
   123  	}); err != nil {
   124  		t.Fatal(err)
   125  	}
   126  
   127  	if !hadJobInOldVersion(testCase.schemaChange.kind) {
    128  		// Delete the job if this schema change type didn't have a job in 19.2.
   129  		rows := runner.QueryStr(t, "SELECT * FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE'")
   130  		for _, job := range rows {
   131  			jobID, _ := strconv.Atoi(job[0])
   132  			runner.Exec(t, "DELETE FROM system.jobs WHERE id = $1", jobID)
   133  		}
   134  	}
   135  }
   136  
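         // schemaChangeRequest pairs a schema change type with the SQL statement that
         // performs it.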
   137  type schemaChangeRequest struct {
   138  	kind  SchemaChangeType
   139  	query string
   140  }
   141  
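         // migrationTestCase describes a single scenario: which schema change to run,
         // the state in which to block it, and whether to cancel it.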
   142  type migrationTestCase struct {
   143  	blockState   BlockState
   144  	shouldCancel bool
   145  	schemaChange schemaChangeRequest
   146  }
   147  
   148  // testSchemaChangeMigrations tests that a schema change can be migrated after
   149  // being blocked in a certain state.
   150  //
   151  // 1. Create a 20.1 schema change.
   152  // 2. Block the schema change at a certain point in its execution.
    153  // 3. Mutate the job record and table descriptor so that the job appears to be
    154  // a 19.2-format job. Such jobs will no longer be resumed, since 20.1 refuses
    155  // to run 19.2 jobs.
   156  // 4. Verify that the job has been marked as a 19.2 job and is blocked.
   157  // 5. Run the migration and wait for the migration to complete.
   158  // 6. Ensure that the schema change completes.
   159  func testSchemaChangeMigrations(t *testing.T, testCase migrationTestCase) {
   160  	ctx := context.Background()
   161  	shouldSignalMigration := int32(0)
   162  	blockFnErrChan := make(chan error, 1)
   163  	revMigrationDoneCh, signalRevMigrationDone := makeSignal()
   164  	migrationDoneCh, signalMigrationDone := makeCondSignal(&shouldSignalMigration)
   165  	runner, sqlDB, tc := setupServerAndStartSchemaChange(
   166  		t,
   167  		blockFnErrChan,
   168  		testCase,
   169  		signalRevMigrationDone,
   170  		signalMigrationDone,
   171  	)
   172  
   173  	defer tc.Stopper().Stop(context.Background())
   174  	defer sqltestutils.DisableGCTTLStrictEnforcement(t, sqlDB)()
   175  
   176  	log.Info(ctx, "waiting for all schema changes to block")
   177  	<-revMigrationDoneCh
   178  	log.Info(ctx, "all schema changes have blocked")
   179  
   180  	close(blockFnErrChan)
   181  	for err := range blockFnErrChan {
   182  		if err != nil {
   183  			t.Fatalf("%+v", err)
   184  		}
   185  	}
   186  
   187  	checkBlockedSchemaChange(t, runner, testCase)
   188  
   189  	// Start the migrations.
   190  	log.Info(ctx, "starting job migration")
   191  	atomic.StoreInt32(&shouldSignalMigration, 1)
   192  	migMgr := tc.Server(0).MigrationManager().(*sqlmigrations.Manager)
   193  	if err := migMgr.StartSchemaChangeJobMigration(ctx); err != nil {
   194  		t.Fatal(err)
   195  	}
   196  
   197  	log.Info(ctx, "waiting for migration to complete")
   198  	<-migrationDoneCh
   199  
   200  	// TODO(pbardea): SHOW JOBS WHEN COMPLETE SELECT does not work on some schema
   201  	// changes when canceling jobs, but querying until there are no jobs works.
   202  	//runner.Exec(t, "SHOW JOBS WHEN COMPLETE SELECT job_id FROM [SHOW JOBS] WHERE (job_type = 'SCHEMA CHANGE' OR job_type = 'SCHEMA CHANGE GC')")
   203  	// Wait until there are no more running schema changes.
   204  	log.Info(ctx, "waiting for new schema change jobs to complete")
   205  	runner.CheckQueryResultsRetry(t, "SELECT * FROM [SHOW JOBS] WHERE (job_type = 'SCHEMA CHANGE' OR job_type = 'SCHEMA CHANGE GC') AND NOT (status = 'succeeded' OR status = 'canceled')", [][]string{})
   206  	log.Info(ctx, "done running new schema change jobs")
   207  
   208  	verifySchemaChangeJobRan(t, runner, testCase)
   209  }
   210  
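         // makeCondSignal returns a channel and a signaling function that sends on the
         // channel only while *shouldSignal is set to 1.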
   211  func makeCondSignal(shouldSignal *int32) (chan struct{}, func()) {
   212  	signalCh := make(chan struct{})
   213  	signalFn := func() {
   214  		if atomic.LoadInt32(shouldSignal) == 1 {
   215  			signalCh <- struct{}{}
   216  		}
   217  	}
   218  	return signalCh, signalFn
   219  }
   220  
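         // makeSignal returns a channel and a signaling function that always sends on
         // the channel.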
   221  func makeSignal() (chan struct{}, func()) {
   222  	alwaysSignal := int32(1)
   223  	return makeCondSignal(&alwaysSignal)
   224  }
   225  
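         // setupServerAndStartSchemaChange starts a test cluster with the testing
         // knobs for the given test case, runs the setup statements, and launches the
         // test case's schema change in a background goroutine. Errors from the knob
         // callbacks are reported on errCh.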
   226  func setupServerAndStartSchemaChange(
   227  	t *testing.T,
   228  	errCh chan error,
   229  	testCase migrationTestCase,
   230  	revMigrationDone, signalMigrationDone func(),
   231  ) (*sqlutils.SQLRunner, *gosql.DB, serverutils.TestClusterInterface) {
   232  	clusterSize := 3
   233  	params, _ := tests.CreateTestServerParams()
   234  
   235  	var runner *sqlutils.SQLRunner
   236  	var kvDB *kv.DB
   237  	var registry *jobs.Registry
   238  
   239  	blockSchemaChanges := false
   240  
   241  	migrateJob := func(jobID int64) {
   242  		if testCase.blockState == WaitingForGC {
   243  			if err := migrateGCJobToOldFormat(kvDB, registry, jobID, testCase.schemaChange.kind); err != nil {
   244  				errCh <- err
   245  			}
   246  		} else {
   247  			if err := migrateJobToOldFormat(kvDB, registry, jobID, testCase.schemaChange.kind); err != nil {
   248  				errCh <- err
   249  			}
   250  		}
   251  	}
   252  	cancelJob := func(jobID int64) {
   253  		runner.Exec(t, `CANCEL JOB (
   254  					SELECT job_id FROM [SHOW JOBS]
   255  					WHERE
   256  						job_id = $1
   257  				)`, jobID)
   258  	}
   259  
   260  	setupTestingKnobs(t, testCase, &params, &blockSchemaChanges, revMigrationDone, signalMigrationDone, migrateJob, cancelJob)
   261  
   262  	tc := serverutils.StartTestCluster(t, clusterSize,
   263  		base.TestClusterArgs{
   264  			ReplicationMode: base.ReplicationManual,
   265  			ServerArgs:      params,
   266  		})
   267  	sqlDB := tc.ServerConn(0)
   268  	kvDB = tc.Server(0).DB()
   269  	runner = sqlutils.MakeSQLRunner(sqlDB)
   270  	registry = tc.Server(0).JobRegistry().(*jobs.Registry)
   271  
   272  	ctx, cancel := context.WithCancel(context.Background())
   273  
   274  	if _, err := sqlDB.Exec(setup); err != nil {
   275  		t.Fatal(err)
   276  	}
   277  
   278  	runner.CheckQueryResultsRetry(t, "SELECT count(*) FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE' AND NOT (status = 'succeeded' OR status = 'canceled')", [][]string{{"0"}})
   279  	blockSchemaChanges = true
   280  
   281  	bg := ctxgroup.WithContext(ctx)
   282  	bg.Go(func() error {
   283  		if _, err := sqlDB.ExecContext(ctx, testCase.schemaChange.query); err != nil {
   284  			cancel()
   285  			return err
   286  		}
   287  		return nil
   288  	})
   289  	// TODO(pbardea): Remove this magic 53.
   290  	if _, err := sqltestutils.AddImmediateGCZoneConfig(sqlDB, sqlbase.ID(53)); err != nil {
   291  		t.Fatal(err)
   292  	}
   293  	return runner, sqlDB, tc
   294  }
   295  
   296  // migrateJobToOldFormat updates the state of a job and table descriptor from
    297  // its 20.1 to its 19.2 representation. There is a separate implementation for
   298  // GC jobs.
   299  func migrateJobToOldFormat(
   300  	kvDB *kv.DB, registry *jobs.Registry, jobID int64, schemaChangeType SchemaChangeType,
   301  ) error {
   302  	ctx := context.Background()
   303  
   304  	tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
   305  	if schemaChangeType == CreateTable {
   306  		tableDesc = sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "new_table")
   307  	}
   308  
   309  	if err := kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   310  		job, err := registry.LoadJobWithTxn(ctx, jobID, txn)
   311  		if err != nil {
   312  			return err
   313  		}
   314  		return job.WithTxn(txn).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error {
   315  			details := job.Details().(jobspb.SchemaChangeDetails)
   316  			// Explicitly zero out these fields as they will be set to their 0 value
   317  			// on 19.2 nodes.
   318  			details.TableID = 0
   319  			details.MutationID = 0
   320  			details.FormatVersion = jobspb.BaseFormatVersion
   321  			if isDeletingTable(schemaChangeType) {
   322  				details.DroppedTables = []jobspb.DroppedTableDetails{
   323  					{
   324  						Name:   tableDesc.Name,
   325  						ID:     tableDesc.ID,
   326  						Status: jobspb.Status_DRAINING_NAMES,
   327  					},
   328  				}
   329  			}
   330  
   331  			progress := job.Progress()
   332  			// TODO(pbardea): Probably want to change this to check on block state
   333  			// being draining names.
   334  			if isDeletingTable(schemaChangeType) {
   335  				progress.RunningStatus = string(sql.RunningStatusDrainingNames)
   336  			}
   337  
   338  			md.Payload.Lease = nil
   339  			md.Payload.Details = jobspb.WrapPayloadDetails(details)
   340  			md.Progress = &progress
   341  			ju.UpdatePayload(md.Payload)
   342  			ju.UpdateProgress(md.Progress)
   343  			return nil
   344  		})
   345  	}); err != nil {
   346  		return err
   347  	}
   348  
   349  	// Update the table descriptor.
   350  	tableDesc.Lease = &sqlbase.TableDescriptor_SchemaChangeLease{
   351  		ExpirationTime: timeutil.Now().UnixNano(),
   352  		NodeID:         roachpb.NodeID(0),
   353  	}
   354  	if schemaChangeType == TruncateTable {
   355  		tableDesc.DropJobID = jobID
   356  		// TODO(pbardea): When is drop time populated?
   357  	}
   358  
   359  	// Write the table descriptor back.
   360  	return kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   361  		if err := txn.SetSystemConfigTrigger(); err != nil {
   362  			return err
   363  		}
   364  		return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
   365  			keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
   366  		)
   367  	})
   368  }
   369  
   370  // migrateGCJobToOldFormat converts a GC job created in 20.1 into a 19.2-style
   371  // schema change job that is waiting for GC. This involves changing the type of
   372  // the job details and progress.
   373  //
   374  // We could have gone back and set the original schema change job to Running,
   375  // but then we'd have to update that job from inside the GC job testing knob
   376  // function, which seems risky since we have no way of controlling that schema
   377  // change job once it's eligible to be adopted.
   378  func migrateGCJobToOldFormat(
   379  	kvDB *kv.DB, registry *jobs.Registry, jobID int64, schemaChangeType SchemaChangeType,
   380  ) error {
   381  	ctx := context.Background()
   382  
   383  	if err := kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   384  		job, err := registry.LoadJobWithTxn(ctx, jobID, txn)
   385  		if err != nil {
   386  			return err
   387  		}
   388  		return job.WithTxn(txn).Update(ctx, func(txn *kv.Txn, md jobs.JobMetadata, ju *jobs.JobUpdater) error {
   389  			// Replace the details with an entirely new SchemaChangeDetails.
   390  			details := jobspb.SchemaChangeDetails{
   391  				FormatVersion: jobspb.BaseFormatVersion,
   392  			}
   393  			if isDeletingTable(schemaChangeType) {
   394  				details.DroppedTables = []jobspb.DroppedTableDetails{
   395  					{
   396  						// TODO (lucy): Stop hard-coding these if possible. We can't get
   397  						// these values from the table descriptor if we're dropping the
   398  						// table, since at this point the table descriptor would have been
   399  						// deleted.
   400  						Name:   "test",
   401  						ID:     53,
   402  						Status: jobspb.Status_WAIT_FOR_GC_INTERVAL,
   403  					},
   404  				}
   405  			}
   406  
   407  			progress := jobspb.Progress{
   408  				Details:       jobspb.WrapProgressDetails(jobspb.SchemaChangeProgress{}),
   409  				RunningStatus: string(sql.RunningStatusWaitingGC),
   410  			}
   411  
   412  			md.Payload.Lease = nil
   413  			md.Payload.Description = strings.TrimPrefix(md.Payload.Description, "GC for ")
   414  			md.Payload.Details = jobspb.WrapPayloadDetails(details)
   415  			md.Progress = &progress
   416  			ju.UpdatePayload(md.Payload)
   417  			ju.UpdateProgress(md.Progress)
   418  			return nil
   419  		})
   420  	}); err != nil {
   421  		return err
   422  	}
   423  
   424  	switch schemaChangeType {
   425  	case DropTable:
   426  		// There's no table descriptor to update, so we're done.
   427  		return nil
   428  
   429  	case DropIndex:
   430  		tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
   431  		if l := len(tableDesc.GCMutations); l != 1 {
   432  			return errors.AssertionFailedf("expected exactly 1 GCMutation, found %d", l)
   433  		}
   434  
   435  		// Update the table descriptor.
   436  		tableDesc.Lease = &sqlbase.TableDescriptor_SchemaChangeLease{
   437  			ExpirationTime: timeutil.Now().UnixNano(),
   438  			NodeID:         roachpb.NodeID(0),
   439  		}
   440  
   441  		tableDesc.GCMutations[0].JobID = jobID
   442  		tableDesc.GCMutations[0].DropTime = timeutil.Now().UnixNano()
   443  
   444  		// Write the table descriptor back.
   445  		return kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   446  			if err := txn.SetSystemConfigTrigger(); err != nil {
   447  				return err
   448  			}
   449  			return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
   450  				keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
   451  			)
   452  		})
   453  	default:
   454  		return errors.Errorf("invalid schema change type: %d", schemaChangeType)
   455  	}
   456  }
   457  
    458  // setupTestingKnobs sets up server testing args so that the knobs block and
    459  // abandon the given schema change at a certain point. revMigrationDone is
    460  // called once the schema change's jobs have been abandoned and migrated to the
    461  // 19.2 format. The SQL runner should only be used inside callback closures.
   462  func setupTestingKnobs(
   463  	t *testing.T,
   464  	testCase migrationTestCase,
   465  	args *base.TestServerArgs,
   466  	blockSchemaChanges *bool,
   467  	revMigrationDone, signalMigrationDone func(),
   468  	migrateJob, cancelJob func(int64),
   469  ) {
   470  	numJobs := 1
   471  	if testCase.schemaChange.kind == CreateTable {
   472  		numJobs = 2
   473  	}
   474  	var (
   475  		mu                   syncutil.Mutex
   476  		migratedCount        int
   477  		doneReverseMigration bool
   478  		ranCancelCommand     bool
   479  		hasCanceled          bool
   480  	)
   481  
   482  	blockFn := func(jobID int64) error {
   483  		mu.Lock()
   484  		defer mu.Unlock()
   485  		if !(*blockSchemaChanges) {
   486  			return nil
   487  		}
   488  
   489  		// In the case we're canceling the job, this blockFn should only be called
   490  		// after the OnFailOrCancel hook is called. At this point we know that the
   491  		// job is actually canceled.
   492  		hasCanceled = true
   493  
   494  		if doneReverseMigration {
   495  			// Already migrated all the jobs that we want to migrate to 19.2.
   496  			// New jobs created after we migrated the original batch should be allowed
   497  			// to continue.
   498  			return nil
   499  		} else {
   500  			migrateJob(jobID)
   501  			migratedCount++
   502  		}
   503  
   504  		if migratedCount == numJobs {
   505  			doneReverseMigration = true
   506  			revMigrationDone()
   507  		}
   508  
   509  		// Return a retryable error so that the job doesn't make any progress past
   510  		// this point. It should not get adopted since it has been marked as a 19.2
   511  		// job.
   512  		return jobs.NewRetryJobError("stop this job until cluster upgrade")
   513  	}
   514  
   515  	cancelFn := func(jobID int64) error {
   516  		mu.Lock()
   517  		defer mu.Unlock()
   518  		if hasCanceled {
   519  			// The job has already been successfully canceled.
   520  			return nil
   521  		}
   522  
   523  		if !ranCancelCommand {
   524  			cancelJob(jobID)
   525  			ranCancelCommand = true
   526  		}
   527  
   528  		// Don't allow the job to progress further than this knob until it has
    529  		// actually been canceled.
   530  		return jobs.NewRetryJobError("retry until canceled")
   531  	}
   532  
   533  	knobs := &sql.SchemaChangerTestingKnobs{}
   534  	gcKnobs := &sql.GCJobTestingKnobs{}
   535  
   536  	shouldCancel := testCase.shouldCancel
   537  	if shouldCancel {
   538  		if runsBackfill[testCase.schemaChange.kind] {
   539  			knobs.RunAfterBackfill = cancelFn
   540  		} else {
   541  			knobs.RunBeforeResume = cancelFn
   542  		}
   543  	}
   544  
   545  	switch testCase.blockState {
   546  	case BeforeBackfill:
   547  		if shouldCancel {
   548  			knobs.RunBeforeOnFailOrCancel = blockFn
   549  		} else {
   550  			knobs.RunBeforeResume = blockFn
   551  		}
   552  	case AfterBackfill:
   553  		if shouldCancel {
   554  			// This is a special case where (1) RunAfterBackfill within Resume() needs
   555  			// to call cancelFn() to cancel the job, (2) RunBeforeOnFailOrCancel needs
   556  			// to set hasCanceled, and (3) RunAfterBackfill, running for the 2nd time
   557  			// within OnFailOrCancel(), needs to read the value of hasCanceled (which
    558  			// is true) and run blockFn().
   559  			knobs.RunBeforeOnFailOrCancel = func(jobID int64) error {
   560  				mu.Lock()
   561  				defer mu.Unlock()
   562  				hasCanceled = true
   563  				return nil
   564  			}
   565  			knobs.RunAfterBackfill = func(jobID int64) error {
   566  				mu.Lock()
   567  				hasCanceled := hasCanceled
   568  				mu.Unlock()
   569  				if hasCanceled {
   570  					return blockFn(jobID)
   571  				} else {
   572  					return cancelFn(jobID)
   573  				}
   574  			}
   575  		} else {
   576  			knobs.RunAfterBackfill = blockFn
   577  		}
   578  	case AfterReversingMutations:
   579  		if !shouldCancel {
   580  			t.Fatal("can only block after reversing mutations if the job is expected to be canceled")
   581  		}
   582  		knobs.RunAfterBackfill = cancelFn
   583  		knobs.RunAfterMutationReversal = blockFn
   584  	case WaitingForGC:
   585  		if shouldCancel {
   586  			t.Fatal("cannot block on waiting for GC if the job should also be canceled")
   587  		}
   588  		gcKnobs.RunBeforeResume = blockFn
   589  	}
   590  
   591  	args.Knobs.SQLSchemaChanger = knobs
   592  	args.Knobs.SQLMigrationManager = &sqlmigrations.MigrationManagerTestingKnobs{
   593  		AfterJobMigration:     signalMigrationDone,
   594  		AlwaysRunJobMigration: true,
   595  	}
   596  	args.Knobs.GCJob = gcKnobs
   597  }
   598  
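         // getTestName builds the subtest name from the schema change kind, the block
         // state, and whether the job is canceled.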
   599  func getTestName(schemaChange SchemaChangeType, blockState BlockState, shouldCancel bool) string {
   600  	stateNames := map[BlockState]string{
   601  		BeforeBackfill:          "before-backfill",
   602  		AfterBackfill:           "after-backfill",
   603  		AfterReversingMutations: "after-reversing-mutations",
   604  		WaitingForGC:            "waiting-for-gc",
   605  	}
   606  	schemaChangeName := map[SchemaChangeType]string{
   607  		AddColumn:      "add-column",
   608  		DropColumn:     "drop-column",
   609  		CreateIndex:    "add-index",
   610  		DropIndex:      "drop-index",
   611  		AddConstraint:  "add-constraint",
   612  		DropConstraint: "drop-constraint",
   613  		CreateTable:    "create-table",
   614  		TruncateTable:  "truncate-table",
   615  		DropTable:      "drop-table",
   616  	}
   617  
   618  	testName := fmt.Sprintf("%s-blocked-at-%s", schemaChangeName[schemaChange], stateNames[blockState])
   619  	if shouldCancel {
   620  		testName += "-canceled"
   621  	}
   622  	return testName
   623  }
   624  
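         // verifySchemaChangeJobRan checks that the migrated job reached the expected
         // terminal state and that the schema change (or its rollback, if canceled)
         // had the expected effect on the table.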
   625  func verifySchemaChangeJobRan(
   626  	t *testing.T, runner *sqlutils.SQLRunner, testCase migrationTestCase,
   627  ) {
   628  	expStatus := jobs.StatusSucceeded
   629  	description := testCase.schemaChange.query
   630  	if testCase.shouldCancel {
   631  		expStatus = jobs.StatusCanceled
   632  	}
   633  	if testCase.schemaChange.kind == CreateTable {
   634  		description = "adding table 54"
   635  	} else {
   636  		if err := jobutils.VerifySystemJob(t, runner, 0, jobspb.TypeSchemaChange, expStatus, jobs.Record{
   637  			Description:   description,
   638  			Username:      security.RootUser,
   639  			DescriptorIDs: getTableIDsUnderTest(testCase.schemaChange.kind),
   640  		}); err != nil {
   641  			t.Fatal(err)
   642  		}
   643  	}
   644  
   645  	// Verify that the GC job exists and is in the correct state, if applicable.
   646  	if testCase.blockState == WaitingForGC {
   647  		if err := jobutils.VerifySystemJob(t, runner, 0, jobspb.TypeSchemaChangeGC, jobs.StatusSucceeded, jobs.Record{
   648  			Description:   "GC for " + description,
   649  			Username:      security.RootUser,
   650  			DescriptorIDs: getTableIDsUnderTest(testCase.schemaChange.kind),
   651  		}); err != nil {
   652  			t.Fatal(err)
   653  		}
   654  	} else {
   655  		// For non-GC jobs, verify that the schema change job format version was
   656  		// updated.
   657  		newVersion := jobutils.GetJobFormatVersion(t, runner)
   658  		require.Equal(t, jobspb.JobResumerFormatVersion, newVersion)
   659  	}
   660  
   661  	var expected [][]string
   662  	didCancel := testCase.shouldCancel
   663  	switch testCase.schemaChange.kind {
   664  	case AddColumn:
   665  		if didCancel {
   666  			expected = [][]string{{"1", "2"}}
   667  		} else {
   668  			expected = [][]string{{"1", "2", "NULL"}}
   669  		}
   670  		rows := runner.QueryStr(t, "SELECT * FROM t.test")
   671  		require.Equal(t, expected, rows)
   672  	case DropColumn:
   673  		if didCancel {
   674  			expected = [][]string{{"1", "NULL"}}
   675  		} else {
   676  			expected = [][]string{{"1"}}
   677  		}
   678  		rows := runner.QueryStr(t, "SELECT * FROM t.test")
   679  		require.Equal(t, expected, rows)
   680  	case CreateIndex:
   681  		if didCancel {
   682  			expected = [][]string{{"primary"}, {"k_idx"}}
   683  		} else {
   684  			expected = [][]string{{"primary"}, {"k_idx"}, {"v_idx"}}
   685  		}
   686  		rows := runner.QueryStr(t, "SELECT DISTINCT index_name FROM [SHOW INDEXES FROM t.test]")
   687  		require.Equal(t, expected, rows)
   688  	case DropIndex:
   689  		if didCancel {
   690  			expected = [][]string{{"primary"}, {"k_idx"}}
   691  		} else {
   692  			expected = [][]string{{"primary"}}
   693  		}
   694  		rows := runner.QueryStr(t, "SELECT DISTINCT index_name FROM [SHOW INDEXES FROM t.test]")
   695  		require.Equal(t, expected, rows)
   696  	case AddConstraint:
   697  		if didCancel {
   698  			expected = [][]string{{"k_cons"}, {"primary"}}
   699  		} else {
   700  			expected = [][]string{{"k_cons"}, {"primary"}, {"v_unq"}}
   701  		}
   702  		rows := runner.QueryStr(t, "SELECT constraint_name FROM [SHOW CONSTRAINTS FROM t.test] ORDER BY constraint_name")
   703  		require.Equal(t, expected, rows)
   704  	case DropConstraint:
   705  		if didCancel {
   706  			expected = [][]string{{"k_cons"}, {"primary"}}
   707  		} else {
   708  			expected = [][]string{{"primary"}}
   709  		}
   710  		rows := runner.QueryStr(t, "SELECT constraint_name FROM [SHOW CONSTRAINTS FROM t.test] ORDER BY constraint_name")
   711  		require.Equal(t, expected, rows)
   712  	case CreateTable:
   713  		if didCancel {
   714  			t.Fatal("cannot cancel create table")
   715  		} else {
   716  			expected = [][]string{{"new_table"}, {"test"}}
   717  		}
   718  		rows := runner.QueryStr(t, "SELECT table_name FROM [SHOW TABLES FROM t] ORDER BY table_name")
   719  		require.Equal(t, expected, rows)
   720  	case TruncateTable:
   721  		if didCancel {
   722  			expected = [][]string{{"0"}}
   723  		} else {
   724  			expected = [][]string{{"0"}}
   725  		}
   726  		rows := runner.QueryStr(t, "SELECT count(*) FROM t.test")
   727  		require.Equal(t, expected, rows)
   728  	case DropTable:
   729  		// Canceling after the backfill has no effect.
   730  		expected = [][]string{}
   731  		rows := runner.QueryStr(t, "SELECT table_name FROM [SHOW TABLES FROM t] ORDER BY table_name")
   732  		require.Equal(t, expected, rows)
   733  	}
   734  }
   735  
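         // getTableIDsUnderTest returns the ID of the table descriptor affected by the
         // given schema change.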
   736  func getTableIDsUnderTest(schemaChangeType SchemaChangeType) []sqlbase.ID {
   737  	tableID := sqlbase.ID(53)
   738  	if schemaChangeType == CreateTable {
   739  		tableID = sqlbase.ID(54)
   740  	}
   741  	return []sqlbase.ID{tableID}
   742  }
   743  
   744  // Helpers used to determine valid test cases.
   745  
    746  // canBlockIfCanceled returns whether a given state (at which we want to block
    747  // the schema change) will be reached, depending on whether the job is canceled.
   748  func canBlockIfCanceled(blockState BlockState, shouldCancel bool) bool {
    749  	// WaitingForGC is only reached if the job is not canceled; AfterReversingMutations only if it is.
   750  	if blockState == WaitingForGC {
   751  		return !shouldCancel
   752  	}
   753  	if blockState == AfterReversingMutations {
   754  		return shouldCancel
   755  	}
   756  	return true
   757  }
   758  
    759  // validBlockStateForSchemaChange returns whether the given schema change
    760  // actually passes through the state at which we propose to block it.
   761  func validBlockStateForSchemaChange(blockState BlockState, schemaChangeType SchemaChangeType) bool {
   762  	switch blockState {
   763  	case AfterBackfill:
   764  		return runsBackfill[schemaChangeType]
   765  	case WaitingForGC:
   766  		return schemaChangeType == DropIndex || schemaChangeType == DropTable
   767  	}
   768  	return true
   769  }
   770  
    771  // hadJobInOldVersion returns whether a given schema change had a job in 19.2.
    772  // Schema changes without a job could not be canceled in 19.2.
   773  func hadJobInOldVersion(schemaChangeType SchemaChangeType) bool {
   774  	return schemaChangeType != CreateTable
   775  }
   776  
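         // TestMigrateSchemaChanges runs testSchemaChangeMigrations for every valid
         // combination of schema change, block state, and cancellation.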
   777  func TestMigrateSchemaChanges(t *testing.T) {
   778  	defer leaktest.AfterTest(t)()
   779  	defer setTestJobsAdoptInterval()()
   780  
   781  	blockStates := []BlockState{
   782  		BeforeBackfill,
   783  		AfterBackfill,
   784  		AfterReversingMutations,
   785  		WaitingForGC,
   786  	}
   787  
   788  	schemaChanges := []schemaChangeRequest{
   789  		{
   790  			CreateTable,
   791  			"CREATE TABLE t.public.new_table (k INT8, FOREIGN KEY (k) REFERENCES t.public.test (k))",
   792  		},
   793  		{
   794  			AddColumn,
   795  			"ALTER TABLE t.public.test ADD COLUMN foo INT8",
   796  		},
   797  		{
   798  			DropColumn,
   799  			"ALTER TABLE t.public.test DROP COLUMN v",
   800  		},
   801  		{
   802  			CreateIndex,
   803  			"CREATE INDEX v_idx ON t.public.test (v)",
   804  		},
   805  		{
   806  			DropIndex,
   807  			"DROP INDEX t.public.test@k_idx",
   808  		},
   809  		{
   810  			AddConstraint,
   811  			"ALTER TABLE t.public.test ADD CONSTRAINT v_unq UNIQUE (v)",
   812  		},
   813  		{
   814  			DropConstraint,
   815  			"ALTER TABLE t.public.test DROP CONSTRAINT k_cons",
   816  		},
   817  		{
   818  			TruncateTable,
   819  			"TRUNCATE TABLE t.public.test",
   820  		},
   821  		{
   822  			DropTable,
   823  			"DROP TABLE t.public.test",
   824  		},
   825  	}
   826  
   827  	for _, schemaChange := range schemaChanges {
   828  		for _, blockState := range blockStates {
   829  			for _, shouldCancel := range []bool{true, false} {
   830  				blockState := blockState
   831  				shouldCancel := shouldCancel
   832  
   833  				// Rollbacks of DROP CONSTRAINT are broken. See #47323.
   834  				if schemaChange.kind == DropConstraint && shouldCancel {
   835  					continue
   836  				}
   837  				if !canBlockIfCanceled(blockState, shouldCancel) {
   838  					continue
   839  				}
   840  				if !validBlockStateForSchemaChange(blockState, schemaChange.kind) {
   841  					continue
   842  				}
   843  				if shouldCancel && !hadJobInOldVersion(schemaChange.kind) {
   844  					continue
   845  				}
   846  
   847  				t.Run(getTestName(schemaChange.kind, blockState, shouldCancel), func(t *testing.T) {
   848  					testCase := migrationTestCase{
   849  						blockState:   blockState,
   850  						shouldCancel: shouldCancel,
   851  						schemaChange: schemaChange,
   852  					}
   853  					testSchemaChangeMigrations(t, testCase)
   854  				})
   855  			}
   856  		}
   857  	}
   858  }
   859  
   860  // TestGCJobCreated tests that a table descriptor in the DROP state with no
   861  // running job has a GC job created for it.
   862  func TestGCJobCreated(t *testing.T) {
   863  	defer leaktest.AfterTest(t)()
   864  	defer setTestJobsAdoptInterval()()
   865  	params, _ := tests.CreateTestServerParams()
   866  	params.Knobs.SQLMigrationManager = &sqlmigrations.MigrationManagerTestingKnobs{
   867  		AlwaysRunJobMigration: true,
   868  	}
   869  	s, sqlDB, kvDB := serverutils.StartServer(t, params)
   870  	defer s.Stopper().Stop(context.Background())
   871  	ctx := context.Background()
   872  	sqlRunner := sqlutils.MakeSQLRunner(sqlDB)
   873  
   874  	// Create a table and then force it to be in the DROP state.
   875  	if _, err := sqlDB.Exec(`CREATE DATABASE t; CREATE TABLE t.test();`); err != nil {
   876  		t.Fatal(err)
   877  	}
   878  	tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
   879  	tableDesc.State = sqlbase.TableDescriptor_DROP
   880  	tableDesc.Version++
   881  	tableDesc.DropTime = 1
   882  	if err := kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   883  		if err := txn.SetSystemConfigTrigger(); err != nil {
   884  			return err
   885  		}
   886  		if err := sqlbase.RemoveObjectNamespaceEntry(
   887  			ctx, txn, keys.SystemSQLCodec, tableDesc.ID, tableDesc.ParentID, tableDesc.Name, false, /* kvTrace */
   888  		); err != nil {
   889  			return err
   890  		}
   891  		return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
   892  			keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
   893  		)
   894  	}); err != nil {
   895  		t.Fatal(err)
   896  	}
   897  
   898  	// Run the migration.
   899  	migMgr := s.MigrationManager().(*sqlmigrations.Manager)
   900  	if err := migMgr.StartSchemaChangeJobMigration(ctx); err != nil {
   901  		t.Fatal(err)
   902  	}
   903  
   904  	// Check that a GC job was created and completed successfully.
   905  	sqlRunner.CheckQueryResultsRetry(t,
   906  		"SELECT count(*) FROM [SHOW JOBS] WHERE job_type = 'SCHEMA CHANGE GC' AND status = 'succeeded'",
   907  		[][]string{{"1"}},
   908  	)
   909  }
   910  
   911  // TestMissingMutation tests that a malformed table descriptor with a
   912  // MutationJob but no Mutation for the given job causes the job to fail with an
   913  // error. Regression test for #48786.
   914  func TestMissingMutation(t *testing.T) {
   915  	defer leaktest.AfterTest(t)()
   916  	defer setTestJobsAdoptInterval()()
   917  	schemaChangeBlocked, descriptorUpdated := make(chan struct{}), make(chan struct{})
   918  	migratedJob := false
   919  	var schemaChangeJobID int64
   920  	params, _ := tests.CreateTestServerParams()
   921  	params.Knobs.SQLMigrationManager = &sqlmigrations.MigrationManagerTestingKnobs{
   922  		AlwaysRunJobMigration: true,
   923  	}
   924  	params.Knobs.SQLSchemaChanger = &sql.SchemaChangerTestingKnobs{
   925  		RunBeforeResume: func(jobID int64) error {
   926  			if !migratedJob {
   927  				migratedJob = true
   928  				schemaChangeJobID = jobID
   929  				close(schemaChangeBlocked)
   930  			}
   931  
   932  			<-descriptorUpdated
   933  			return jobs.NewRetryJobError("stop this job until cluster upgrade")
   934  		},
   935  	}
   936  	s, sqlDB, kvDB := serverutils.StartServer(t, params)
   937  	ctx, cancel := context.WithCancel(context.Background())
   938  	defer s.Stopper().Stop(ctx)
   939  	registry := s.JobRegistry().(*jobs.Registry)
   940  
   941  	_, err := sqlDB.Exec(`CREATE DATABASE t; CREATE TABLE t.test(k INT PRIMARY KEY, v INT);`)
   942  	require.NoError(t, err)
   943  
   944  	bg := ctxgroup.WithContext(ctx)
   945  	// Start a schema change on the table in a separate goroutine.
   946  	bg.Go(func() error {
   947  		if _, err := sqlDB.ExecContext(ctx, `ALTER TABLE t.test ADD COLUMN a INT;`); err != nil {
   948  			cancel()
   949  			return err
   950  		}
   951  		return nil
   952  	})
   953  
   954  	<-schemaChangeBlocked
   955  
   956  	// Rewrite the job to be a 19.2-style job.
   957  	require.NoError(t, migrateJobToOldFormat(kvDB, registry, schemaChangeJobID, AddColumn))
   958  
   959  	// To get the table descriptor into the (invalid) state we're trying to test,
   960  	// clear the mutations on the table descriptor.
   961  	tableDesc := sqlbase.GetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
   962  	tableDesc.Mutations = nil
   963  	require.NoError(
   964  		t, kvDB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   965  			if err := txn.SetSystemConfigTrigger(); err != nil {
   966  				return err
   967  			}
   968  			return kvDB.Put(ctx, sqlbase.MakeDescMetadataKey(
   969  				keys.SystemSQLCodec, tableDesc.GetID()), sqlbase.WrapDescriptor(tableDesc),
   970  			)
   971  		}),
   972  	)
   973  
   974  	// Run the migration.
   975  	migMgr := s.MigrationManager().(*sqlmigrations.Manager)
   976  	require.NoError(t, migMgr.StartSchemaChangeJobMigration(ctx))
   977  
   978  	close(descriptorUpdated)
   979  
   980  	err = bg.Wait()
   981  	require.Regexp(t, fmt.Sprintf("mutation %d not found for MutationJob %d", 1, schemaChangeJobID), err)
   982  }