github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/schema_changer.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"strings"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/base"
    21  	"github.com/cockroachdb/cockroach/pkg/jobs"
    22  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    23  	"github.com/cockroachdb/cockroach/pkg/kv"
    24  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    25  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    26  	"github.com/cockroachdb/cockroach/pkg/security"
    27  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    28  	"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
    29  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/parser"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    36  	"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
    37  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    38  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    39  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    40  	"github.com/cockroachdb/cockroach/pkg/util/log"
    41  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    43  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    44  	"github.com/cockroachdb/errors"
    45  	"github.com/cockroachdb/logtags"
    46  )
    47  
    48  const (
    49  	// RunningStatusDrainingNames used to indicate that the job was draining
    50  	// names for dropped descriptors. This constant is now deprecated and
    51  	// exists only for migrating old jobs.
    52  	RunningStatusDrainingNames jobs.RunningStatus = "draining names"
    53  	// RunningStatusWaitingGC is for jobs that are currently in progress and
    54  	// are waiting for the GC TTL to expire.
    55  	RunningStatusWaitingGC jobs.RunningStatus = "waiting for GC TTL"
    56  	// RunningStatusDeleteOnly is for jobs that are currently waiting on
    57  	// the cluster to converge to seeing the schema element in the DELETE_ONLY
    58  	// state.
    59  	RunningStatusDeleteOnly jobs.RunningStatus = "waiting in DELETE-ONLY"
    60  	// RunningStatusDeleteAndWriteOnly is for jobs that are currently waiting on
    61  	// the cluster to converge to seeing the schema element in the
    62  	// DELETE_AND_WRITE_ONLY state.
    63  	RunningStatusDeleteAndWriteOnly jobs.RunningStatus = "waiting in DELETE-AND-WRITE_ONLY"
    64  	// RunningStatusBackfill is for jobs that are currently running a backfill
    65  	// for a schema element.
    66  	RunningStatusBackfill jobs.RunningStatus = "populating schema"
    67  	// RunningStatusValidation is for jobs that are currently validating
    68  	// a schema element.
    69  	RunningStatusValidation jobs.RunningStatus = "validating schema"
    70  )
    71  
    72  // SchemaChanger is used to change the schema on a table.
    73  type SchemaChanger struct {
    74  	tableID           sqlbase.ID
    75  	mutationID        sqlbase.MutationID
    76  	droppedDatabaseID sqlbase.ID
    77  	sqlInstanceID     base.SQLInstanceID
    78  	db                *kv.DB
    79  	leaseMgr          *lease.Manager
    80  
    81  	testingKnobs   *SchemaChangerTestingKnobs
    82  	distSQLPlanner *DistSQLPlanner
    83  	jobRegistry    *jobs.Registry
    84  	// Keep a reference to the job related to this schema change
    85  	// so that we don't need to read the job again while updating
    86  	// the status of the job.
    87  	job *jobs.Job
    88  	// Caches updated by DistSQL.
    89  	rangeDescriptorCache *kvcoord.RangeDescriptorCache
    90  	leaseHolderCache     *kvcoord.LeaseHolderCache
    91  	clock                *hlc.Clock
    92  	settings             *cluster.Settings
    93  	execCfg              *ExecutorConfig
    94  	ieFactory            sqlutil.SessionBoundInternalExecutorFactory
    95  }
    96  
    97  // NewSchemaChangerForTesting constructs a SchemaChanger for use in tests only.
    98  func NewSchemaChangerForTesting(
    99  	tableID sqlbase.ID,
   100  	mutationID sqlbase.MutationID,
   101  	sqlInstanceID base.SQLInstanceID,
   102  	db kv.DB,
   103  	leaseMgr *lease.Manager,
   104  	jobRegistry *jobs.Registry,
   105  	execCfg *ExecutorConfig,
   106  	settings *cluster.Settings,
   107  ) SchemaChanger {
   108  	return SchemaChanger{
   109  		tableID:       tableID,
   110  		mutationID:    mutationID,
   111  		sqlInstanceID: sqlInstanceID,
   112  		db:            &db,
   113  		leaseMgr:      leaseMgr,
   114  		jobRegistry:   jobRegistry,
   115  		settings:      settings,
   116  		execCfg:       execCfg,
   117  	}
   118  }
   119  
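// Illustrative, test-style usage of the constructor above (hypothetical
// wiring; real tests obtain these dependencies from a test server):
//
//	sc := NewSchemaChangerForTesting(
//		tableID, mutationID, instanceID, *kvDB, leaseMgr, jobRegistry, &execCfg, settings,
//	)
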
   120  // isPermanentSchemaChangeError returns true if the error results in
   121  // a permanent failure of a schema change. This function is a whitelist
   122  // instead of a blacklist: only known-safe errors are confirmed not to be
   123  // permanent errors. Anything unknown is assumed to be permanent.
   124  func isPermanentSchemaChangeError(err error) bool {
   125  	if err == nil {
   126  		return false
   127  	}
   128  
   129  	if grpcutil.IsClosedConnection(err) {
   130  		return false
   131  	}
   132  
   133  	// Ignore errors thrown because of a read at a very old timestamp.
   134  	// The backfill will grab a new timestamp to read at for the rest
   135  	// of the backfill.
   136  	// TODO(knz): this should really use errors.Is(). However, as long as
   137  	// we may still receive errors from 19.1 nodes, the string
   138  	// comparison must remain.
   139  	if strings.Contains(err.Error(), "must be after replica GC threshold") {
   140  		return false
   141  	}
   142  
   143  	if pgerror.IsSQLRetryableError(err) {
   144  		return false
   145  	}
   146  
   147  	if errors.IsAny(err,
   148  		context.Canceled,
   149  		context.DeadlineExceeded,
   150  		errExistingSchemaChangeLease,
   151  		errExpiredSchemaChangeLease,
   152  		errNotHitGCTTLDeadline,
   153  		errSchemaChangeDuringDrain,
   154  		errSchemaChangeNotFirstInLine,
   155  		errTableVersionMismatchSentinel,
   156  	) {
   157  		return false
   158  	}
   159  
   160  	switch pgerror.GetPGCode(err) {
   161  	case pgcode.SerializationFailure, pgcode.InternalConnectionFailure, pgcode.DeprecatedInternalConnectionFailure:
   162  		return false
   163  
   164  	case pgcode.Internal, pgcode.RangeUnavailable, pgcode.DeprecatedRangeUnavailable:
   165  		if strings.Contains(err.Error(), context.DeadlineExceeded.Error()) {
   166  			return false
   167  		}
   168  	}
   169  
   170  	return true
   171  }
   172  
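// The function below is a usage sketch added for illustration and is not part
// of the original file. It assumes a hypothetical caller that retries a
// schema change on transient errors and stops at the first permanent one,
// which is how isPermanentSchemaChangeError is meant to be consumed.
func exampleRetryUntilPermanentError(ctx context.Context, sc *SchemaChanger) error {
	opts := retry.Options{
		InitialBackoff: 20 * time.Millisecond,
		MaxBackoff:     time.Second,
	}
	var err error
	for r := retry.StartWithCtx(ctx, opts); r.Next(); {
		if err = sc.exec(ctx); err == nil || isPermanentSchemaChangeError(err) {
			break // success, or an error retrying cannot fix
		}
		log.Infof(ctx, "retrying schema change after transient error: %v", err)
	}
	return err
}
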
   173  var (
   174  	errExistingSchemaChangeLease  = errors.Newf("an outstanding schema change lease exists")
   175  	errExpiredSchemaChangeLease   = errors.Newf("the schema change lease has expired")
   176  	errSchemaChangeNotFirstInLine = errors.Newf("schema change not first in line")
   177  	errNotHitGCTTLDeadline        = errors.Newf("not hit gc ttl deadline")
   178  	errSchemaChangeDuringDrain    = errors.Newf("a schema change ran during the drain phase, re-increment")
   179  )
   180  
   181  type errTableVersionMismatch struct {
   182  	version  sqlbase.DescriptorVersion
   183  	expected sqlbase.DescriptorVersion
   184  }
   185  
   186  var errTableVersionMismatchSentinel = errTableVersionMismatch{}
   187  
   188  func makeErrTableVersionMismatch(version, expected sqlbase.DescriptorVersion) error {
   189  	return errors.Mark(errors.WithStack(errTableVersionMismatch{
   190  		version:  version,
   191  		expected: expected,
   192  	}), errTableVersionMismatchSentinel)
   193  }
   194  
   195  func (e errTableVersionMismatch) Error() string {
   196  	return fmt.Sprintf("table version mismatch: %d, expected: %d", e.version, e.expected)
   197  }
   198  
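// A minimal sketch (not in the original file) of how the sentinel above is
// consumed: errors.Mark in makeErrTableVersionMismatch lets callers match any
// version-mismatch error with errors.Is, regardless of the versions recorded.
func exampleIsTableVersionMismatch(err error) bool {
	return errors.Is(err, errTableVersionMismatchSentinel)
}
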
   199  // maybeBackfillCreateTableAs backfills a table created with CREATE TABLE AS
   200  // by executing the AS query. It returns nil if the backfill succeeds.
   201  //
   202  // Note that this does not connect to the tracing settings of the
   203  // surrounding SQL transaction. This should be OK as (at the time of
   204  // this writing) this code path is only used for standalone CREATE
   205  // TABLE AS statements, which cannot be traced.
   206  func (sc *SchemaChanger) maybeBackfillCreateTableAs(
   207  	ctx context.Context, table *sqlbase.TableDescriptor,
   208  ) error {
   209  	if !(table.Adding() && table.IsAs()) {
   210  		return nil
   211  	}
   212  
   213  	return sc.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   214  		txn.SetFixedTimestamp(ctx, table.CreateAsOfTime)
   215  
   216  		// Create an internal planner as the planner used to serve the user query
   217  		// would have committed by this point.
   218  		p, cleanup := NewInternalPlanner("ctasBackfill", txn, security.RootUser, &MemoryMetrics{}, sc.execCfg)
   219  		defer cleanup()
   220  		localPlanner := p.(*planner)
   221  		stmt, err := parser.ParseOne(table.CreateQuery)
   222  		if err != nil {
   223  			return err
   224  		}
   225  
   226  		// Construct an optimized logical plan of the AS source stmt.
   227  		localPlanner.stmt = &Statement{Statement: stmt}
   228  		localPlanner.optPlanningCtx.init(localPlanner)
   229  
   230  		localPlanner.runWithOptions(resolveFlags{skipCache: true}, func() {
   231  			err = localPlanner.makeOptimizerPlan(ctx)
   232  		})
   233  
   234  		if err != nil {
   235  			return err
   236  		}
   237  		defer localPlanner.curPlan.close(ctx)
   238  
   239  		res := roachpb.BulkOpSummary{}
   240  		rw := newCallbackResultWriter(func(ctx context.Context, row tree.Datums) error {
   241  			// TODO(adityamaru): Use the BulkOpSummary for either telemetry or to
   242  			// return to user.
   243  			var counts roachpb.BulkOpSummary
   244  			if err := protoutil.Unmarshal([]byte(*row[0].(*tree.DBytes)), &counts); err != nil {
   245  				return err
   246  			}
   247  			res.Add(counts)
   248  			return nil
   249  		})
   250  		recv := MakeDistSQLReceiver(
   251  			ctx,
   252  			rw,
   253  			tree.Rows,
   254  			sc.execCfg.RangeDescriptorCache,
   255  			sc.execCfg.LeaseHolderCache,
   256  			txn,
   257  			func(ts hlc.Timestamp) {
   258  				sc.clock.Update(ts)
   259  			},
   260  			// Make a session tracing object on-the-fly. This is OK
   261  			// because it sets "enabled: false" and thus none of the
   262  			// other fields are used.
   263  			&SessionTracing{},
   264  		)
   265  		defer recv.Release()
   266  
   267  		willDistribute := willDistributePlan(
   268  			ctx, localPlanner.execCfg.NodeID,
   269  			localPlanner.extendedEvalCtx.SessionData.DistSQLMode,
   270  			localPlanner.curPlan.main,
   271  		)
   272  		var planAndRunErr error
   273  		localPlanner.runWithOptions(resolveFlags{skipCache: true}, func() {
   274  			// Resolve subqueries before running the queries' physical plan.
   275  			if len(localPlanner.curPlan.subqueryPlans) != 0 {
   276  				if !sc.distSQLPlanner.PlanAndRunSubqueries(
   277  					ctx, localPlanner, localPlanner.ExtendedEvalContextCopy,
   278  					localPlanner.curPlan.subqueryPlans, recv, willDistribute,
   279  				) {
   280  					if planAndRunErr = rw.Err(); planAndRunErr != nil {
   281  						return
   282  					}
   283  					if planAndRunErr = recv.commErr; planAndRunErr != nil {
   284  						return
   285  					}
   286  				}
   287  			}
   288  
   289  			isLocal := !willDistribute
   290  			out := execinfrapb.ProcessorCoreUnion{BulkRowWriter: &execinfrapb.BulkRowWriterSpec{
   291  				Table: *table,
   292  			}}
   293  
   294  			PlanAndRunCTAS(ctx, sc.distSQLPlanner, localPlanner,
   295  				txn, isLocal, localPlanner.curPlan.main, out, recv)
   296  			if planAndRunErr = rw.Err(); planAndRunErr != nil {
   297  				return
   298  			}
   299  			if planAndRunErr = recv.commErr; planAndRunErr != nil {
   300  				return
   301  			}
   302  		})
   303  
   304  		return planAndRunErr
   305  	})
   306  }
   307  
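// The helper below is an illustrative sketch, not part of the original file.
// It isolates the fixed-timestamp transaction pattern used above: pinning all
// reads in a txn to the table's CreateAsOfTime. The key parameter is a
// hypothetical placeholder.
func exampleReadAtCreateAsOfTime(
	ctx context.Context, db *kv.DB, table *sqlbase.TableDescriptor, key roachpb.Key,
) error {
	return db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		// All reads in this txn now observe data as of CreateAsOfTime.
		txn.SetFixedTimestamp(ctx, table.CreateAsOfTime)
		_, err := txn.Get(ctx, key)
		return err
	})
}
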
   308  // maybeMakeAddTablePublic makes a table PUBLIC if it is in the ADD state.
   309  func (sc *SchemaChanger) maybeMakeAddTablePublic(
   310  	ctx context.Context, table *sqlbase.TableDescriptor,
   311  ) error {
   312  	if table.Adding() {
   313  		fks := table.AllActiveAndInactiveForeignKeys()
   314  		for _, fk := range fks {
   315  			if err := sc.waitToUpdateLeases(ctx, fk.ReferencedTableID); err != nil {
   316  				return err
   317  			}
   318  		}
   319  
   320  		if _, err := sc.leaseMgr.Publish(
   321  			ctx,
   322  			table.ID,
   323  			func(tbl *sqlbase.MutableTableDescriptor) error {
   324  				if !tbl.Adding() {
   325  					return lease.ErrDidntUpdateDescriptor
   326  				}
   327  				tbl.State = sqlbase.TableDescriptor_PUBLIC
   328  				return nil
   329  			},
   330  			func(txn *kv.Txn) error { return nil },
   331  		); err != nil {
   332  			return err
   333  		}
   334  	}
   335  
   336  	return nil
   337  }
   338  
   339  // drainNames drains old names from the cluster.
   340  func (sc *SchemaChanger) drainNames(ctx context.Context) error {
   341  	// Publish a new version with all the names drained after everyone
   342  	// has seen the version with the new name. All the draining names
   343  	// can be reused henceforth.
   344  	var namesToReclaim []sqlbase.TableDescriptor_NameInfo
   345  	_, err := sc.leaseMgr.Publish(
   346  		ctx,
   347  		sc.tableID,
   348  		func(desc *sqlbase.MutableTableDescriptor) error {
   349  			if sc.testingKnobs.OldNamesDrainedNotification != nil {
   350  				sc.testingKnobs.OldNamesDrainedNotification()
   351  			}
   352  			// Free up the old name(s) for reuse.
   353  			namesToReclaim = desc.DrainingNames
   354  			desc.DrainingNames = nil
   355  			return nil
   356  		},
   357  		// Reclaim all the old names.
   358  		func(txn *kv.Txn) error {
   359  			b := txn.NewBatch()
   360  			for _, drain := range namesToReclaim {
   361  				err := sqlbase.RemoveObjectNamespaceEntry(
   362  					ctx, txn, sc.execCfg.Codec, drain.ParentID, drain.ParentSchemaID, drain.Name, false, /* KVTrace */
   363  				)
   364  				if err != nil {
   365  					return err
   366  				}
   367  			}
   368  			return txn.Run(ctx, b)
   369  		},
   370  	)
   371  	return err
   372  }
   373  
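// A note on the two closures passed to Publish above (a summary of the code,
// not a change to it): the first bumps the descriptor version with
// DrainingNames cleared, and the second deletes the corresponding namespace
// entries in the same transaction, which makes the old names reusable.
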
   374  func startGCJob(
   375  	ctx context.Context,
   376  	db *kv.DB,
   377  	jobRegistry *jobs.Registry,
   378  	username string,
   379  	schemaChangeDescription string,
   380  	details jobspb.SchemaChangeGCDetails,
   381  ) error {
   382  	var sj *jobs.StartableJob
   383  	jobRecord := CreateGCJobRecord(schemaChangeDescription, username, details)
   384  	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   385  		var err error
   386  		if sj, err = jobRegistry.CreateStartableJobWithTxn(ctx, jobRecord, txn, nil /* resultCh */); err != nil {
   387  			return err
   388  		}
   389  		return nil
   390  	}); err != nil {
   391  		return err
   392  	}
   393  	if _, err := sj.Start(ctx); err != nil {
   394  		return err
   395  	}
   396  	return nil
   397  }
   398  
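// Hypothetical usage sketch of startGCJob for a dropped table (not in the
// original file); exec below performs the real equivalent. The username and
// description values here are illustrative placeholders.
func exampleStartTableGCJob(ctx context.Context, sc *SchemaChanger, tableID sqlbase.ID) error {
	details := jobspb.SchemaChangeGCDetails{
		Tables: []jobspb.SchemaChangeGCDetails_DroppedID{
			{ID: tableID, DropTime: timeutil.Now().UnixNano()},
		},
	}
	return startGCJob(ctx, sc.db, sc.jobRegistry, security.RootUser, "DROP TABLE t", details)
}
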
   399  // exec executes the entire schema change in steps. It is called both
   400  // synchronously and from the asynchronous schema change execution
   401  // path.
   402  //
   403  // If the txn that queued the schema changer did not commit, this will be a
   404  // no-op, as we'll fail to find the job for our mutation in the jobs registry.
   405  func (sc *SchemaChanger) exec(ctx context.Context) error {
   406  	ctx = logtags.AddTag(ctx, "scExec", nil)
   407  
   408  	// TODO (lucy): Now that marking a schema change job as succeeded doesn't
   409  	// happen in the same transaction as removing mutations from a table
   410  	// descriptor, it seems possible for a job to be resumed after the mutation
   411  	// has already been removed. If there's a mutation provided, we should check
   412  	// whether it actually exists on the table descriptor and exit the job if not.
   413  	tableDesc, notFirst, err := sc.notFirstInLine(ctx)
   414  	if err != nil {
   415  		return err
   416  	}
   417  	if notFirst {
   418  		log.Infof(ctx,
   419  			"schema change on %s (%d v%d) mutation %d: another change is still in progress",
   420  			tableDesc.Name, sc.tableID, tableDesc.Version, sc.mutationID,
   421  		)
   422  		return errSchemaChangeNotFirstInLine
   423  	}
   424  
   425  	log.Infof(ctx,
   426  		"schema change on %s (%d v%d) mutation %d starting execution...",
   427  		tableDesc.Name, sc.tableID, tableDesc.Version, sc.mutationID,
   428  	)
   429  
   430  	if tableDesc.HasDrainingNames() {
   431  		if err := sc.drainNames(ctx); err != nil {
   432  			return err
   433  		}
   434  	}
   435  
   436  	if tableDesc.Dropped() && sc.droppedDatabaseID == sqlbase.InvalidID {
   437  		// We've dropped this table, let's kick off a GC job.
   438  		dropTime := timeutil.Now().UnixNano()
   439  		if tableDesc.DropTime > 0 {
   440  			dropTime = tableDesc.DropTime
   441  		}
   442  		gcDetails := jobspb.SchemaChangeGCDetails{
   443  			Tables: []jobspb.SchemaChangeGCDetails_DroppedID{
   444  				{
   445  					ID:       tableDesc.ID,
   446  					DropTime: dropTime,
   447  				},
   448  			},
   449  		}
   450  		if err := startGCJob(
   451  			ctx, sc.db, sc.jobRegistry, sc.job.Payload().Username, sc.job.Payload().Description, gcDetails,
   452  		); err != nil {
   453  			return err
   454  		}
   455  	}
   456  
   457  	if err := sc.maybeBackfillCreateTableAs(ctx, tableDesc); err != nil {
   458  		return err
   459  	}
   460  
   461  	if err := sc.maybeMakeAddTablePublic(ctx, tableDesc); err != nil {
   462  		return err
   463  	}
   464  
   465  	// Wait for the schema change to propagate to all nodes after this function
   466  	// returns, so that the new schema is live everywhere. This is not needed for
   467  	// correctness but is done to make the UI experience/tests predictable.
   468  	waitToUpdateLeases := func(refreshStats bool) error {
   469  		if err := sc.waitToUpdateLeases(ctx, sc.tableID); err != nil {
   470  			if errors.Is(err, sqlbase.ErrDescriptorNotFound) {
   471  				return err
   472  			}
   473  			log.Warningf(ctx, "waiting to update leases: %+v", err)
   474  			// As we are dismissing the error, go through the recording motions.
   475  			// This ensures that any important error gets reported to Sentry, etc.
   476  			sqltelemetry.RecordError(ctx, err, &sc.settings.SV)
   477  		}
   478  		// We wait to trigger a stats refresh until we know the leases have been
   479  		// updated.
   480  		if refreshStats {
   481  			sc.refreshStats()
   482  		}
   483  		return nil
   484  	}
   485  
   486  	if sc.mutationID == sqlbase.InvalidMutationID {
   487  		// Nothing more to do.
   488  		isCreateTableAs := tableDesc.Adding() && tableDesc.IsAs()
   489  		return waitToUpdateLeases(isCreateTableAs /* refreshStats */)
   490  	}
   491  
   492  	if err := sc.initJobRunningStatus(ctx); err != nil {
   493  		if log.V(2) {
   494  			log.Infof(ctx, "failed to update job status: %+v", err)
   495  		}
   496  		// Go through the recording motions. See comment above.
   497  		sqltelemetry.RecordError(ctx, err, &sc.settings.SV)
   498  	}
   499  
   500  	// Run through mutation state machine and backfill.
   501  	err = sc.runStateMachineAndBackfill(ctx)
   502  	if err != nil {
   503  		return err
   504  	}
   505  
   506  	defer func() {
   507  		if err := waitToUpdateLeases(err == nil /* refreshStats */); err != nil && !errors.Is(err, sqlbase.ErrDescriptorNotFound) {
   508  			// We only expect ErrDescriptorNotFound to be returned. This happens
   509  			// when the table descriptor was deleted. We can ignore this error.
   510  
   511  			log.Warningf(ctx, "unexpected error while waiting for leases to update: %+v", err)
   512  			// As we are dismissing the error, go through the recording motions.
   513  			// This ensures that any important error gets reported to Sentry, etc.
   514  			sqltelemetry.RecordError(ctx, err, &sc.settings.SV)
   515  		}
   516  	}()
   517  
   518  	return err
   519  }
   520  
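// In outline, exec's happy path (as implemented above) is: drain old names,
// start a GC job if the table was dropped, backfill CREATE TABLE AS, make an
// ADDing table public, then run the mutation state machine plus backfill and
// finally wait for all leases to move to the new descriptor version.
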
   521  // handlePermanentSchemaChangeError cleans up schema changes that cannot
   522  // be completed successfully. For schema changes with mutations, it reverses the
   523  // direction of the mutations so that we can step through the state machine
   524  // backwards. Note that schema changes which don't have mutations are meant to
   525  // run quickly and aren't truly cancellable in the small window they require to
   526  // complete. In that case, cleanup consists of simply resuming the same schema
   527  // change.
   528  // TODO (lucy): This is how "rolling back" has always worked for non-mutation
   529  // schema change jobs, but it's unnatural for the job API and we should rethink
   530  // it.
   531  func (sc *SchemaChanger) handlePermanentSchemaChangeError(
   532  	ctx context.Context, err error, evalCtx *extendedEvalContext,
   533  ) error {
   534  	if rollbackErr := sc.rollbackSchemaChange(ctx, err); rollbackErr != nil {
   535  		// From now on, the original error will only be a secondary error of the
   536  		// rollback error we return, so we record the original error now.
   537  		secondary := errors.Wrap(err, "original error when rolling back mutations")
   538  		sqltelemetry.RecordError(ctx, secondary, &sc.settings.SV)
   539  		return errors.WithSecondaryError(rollbackErr, secondary)
   540  	}
   541  
   542  	// TODO (lucy): This is almost the same as in exec(), maybe refactor.
   543  	// Wait for the schema change to propagate to all nodes after this function
   544  	// returns, so that the new schema is live everywhere. This is not needed for
   545  	// correctness but is done to make the UI experience/tests predictable.
   546  	waitToUpdateLeases := func(refreshStats bool) error {
   547  		if err := sc.waitToUpdateLeases(ctx, sc.tableID); err != nil {
   548  			if errors.Is(err, sqlbase.ErrDescriptorNotFound) {
   549  				return err
   550  			}
   551  			log.Warningf(ctx, "waiting to update leases: %+v", err)
   552  			// As we are dismissing the error, go through the recording motions.
   553  			// This ensures that any important error gets reported to Sentry, etc.
   554  			sqltelemetry.RecordError(ctx, err, &sc.settings.SV)
   555  		}
   556  		// We wait to trigger a stats refresh until we know the leases have been
   557  		// updated.
   558  		if refreshStats {
   559  			sc.refreshStats()
   560  		}
   561  		return nil
   562  	}
   563  
   564  	defer func() {
   565  		if err := waitToUpdateLeases(false /* refreshStats */); err != nil && !errors.Is(err, sqlbase.ErrDescriptorNotFound) {
   566  			// We only expect ErrDescriptorNotFound to be returned. This happens
   567  			// when the table descriptor was deleted. We can ignore this error.
   568  
   569  			log.Warningf(ctx, "unexpected error while waiting for leases to update: %+v", err)
   570  			// As we are dismissing the error, go through the recording motions.
   571  			// This ensures that any important error gets reported to Sentry, etc.
   572  			sqltelemetry.RecordError(ctx, err, &sc.settings.SV)
   573  		}
   574  	}()
   575  
   576  	return nil
   577  }
   578  
   579  // initJobRunningStatus initializes the job's running status.
   580  func (sc *SchemaChanger) initJobRunningStatus(ctx context.Context) error {
   581  	return sc.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   582  		desc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.execCfg.Codec, sc.tableID)
   583  		if err != nil {
   584  			return err
   585  		}
   586  
   587  		var runStatus jobs.RunningStatus
   588  		for _, mutation := range desc.Mutations {
   589  			if mutation.MutationID != sc.mutationID {
   590  				// Mutations are applied in a FIFO order. Only apply the first set of
   591  				// mutations if they have the mutation ID we're looking for.
   592  				break
   593  			}
   594  
   595  			switch mutation.Direction {
   596  			case sqlbase.DescriptorMutation_ADD:
   597  				switch mutation.State {
   598  				case sqlbase.DescriptorMutation_DELETE_ONLY:
   599  					runStatus = RunningStatusDeleteOnly
   600  				}
   601  
   602  			case sqlbase.DescriptorMutation_DROP:
   603  				switch mutation.State {
   604  				case sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY:
   605  					runStatus = RunningStatusDeleteAndWriteOnly
   606  				}
   607  			}
   608  		}
   609  		if runStatus != "" && !desc.Dropped() {
   610  			if err := sc.job.WithTxn(txn).RunningStatus(
   611  				ctx, func(ctx context.Context, details jobspb.Details) (jobs.RunningStatus, error) {
   612  					return runStatus, nil
   613  				}); err != nil {
   614  				return errors.Wrapf(err, "failed to update job status")
   615  			}
   616  		}
   617  		return nil
   618  	})
   619  }
   620  
   621  func (sc *SchemaChanger) rollbackSchemaChange(ctx context.Context, err error) error {
   622  	log.Warningf(ctx, "reversing schema change %d due to irrecoverable error: %s", *sc.job.ID(), err)
   623  	if errReverse := sc.maybeReverseMutations(ctx, err); errReverse != nil {
   624  		return errReverse
   625  	}
   626  
   627  	if fn := sc.testingKnobs.RunAfterMutationReversal; fn != nil {
   628  		if err := fn(*sc.job.ID()); err != nil {
   629  			return err
   630  		}
   631  	}
   632  
   633  	// After this point the schema change has been reversed and any retry
   634  	// of the schema change will act upon the reversed schema change.
   635  	return sc.runStateMachineAndBackfill(ctx)
   636  }
   637  
   638  // RunStateMachineBeforeBackfill moves the state machine forward
   639  // and waits to ensure that all nodes are seeing the latest version
   640  // of the table.
   641  func (sc *SchemaChanger) RunStateMachineBeforeBackfill(ctx context.Context) error {
   642  	var runStatus jobs.RunningStatus
   643  	if _, err := sc.leaseMgr.Publish(ctx, sc.tableID, func(desc *sqlbase.MutableTableDescriptor) error {
   644  
   645  		runStatus = ""
   646  		// Apply mutations belonging to the same version.
   647  		for i, mutation := range desc.Mutations {
   648  			if mutation.MutationID != sc.mutationID {
   649  				// Mutations are applied in a FIFO order. Only apply the first set of
   650  				// mutations if they have the mutation ID we're looking for.
   651  				break
   652  			}
   653  			switch mutation.Direction {
   654  			case sqlbase.DescriptorMutation_ADD:
   655  				switch mutation.State {
   656  				case sqlbase.DescriptorMutation_DELETE_ONLY:
   657  					// TODO(vivek): while moving up the state is appropriate,
   658  					// it will be better to run the backfill of a unique index
   659  					// twice: once in the DELETE_ONLY state to confirm that
   660  					// the index can indeed be created, and subsequently in the
   661  					// DELETE_AND_WRITE_ONLY state to fill in the missing elements of the
   662  					// index (INSERT and UPDATE that happened in the interim).
   663  					desc.Mutations[i].State = sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY
   664  					runStatus = RunningStatusDeleteAndWriteOnly
   665  
   666  				case sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY:
   667  					// The state change has already moved forward.
   668  				}
   669  
   670  			case sqlbase.DescriptorMutation_DROP:
   671  				switch mutation.State {
   672  				case sqlbase.DescriptorMutation_DELETE_ONLY:
   673  					// The state change has already moved forward.
   674  
   675  				case sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY:
   676  					desc.Mutations[i].State = sqlbase.DescriptorMutation_DELETE_ONLY
   677  					runStatus = RunningStatusDeleteOnly
   678  				}
   679  			}
   680  		}
   681  		if doNothing := runStatus == "" || desc.Dropped(); doNothing {
   682  			// Return error so that Publish() doesn't increment the version.
   683  			return lease.ErrDidntUpdateDescriptor
   684  		}
   685  		return nil
   686  	}, func(txn *kv.Txn) error {
   687  		if sc.job != nil {
   688  			if err := sc.job.WithTxn(txn).RunningStatus(ctx, func(ctx context.Context, details jobspb.Details) (jobs.RunningStatus, error) {
   689  				return runStatus, nil
   690  			}); err != nil {
   691  				return errors.Wrap(err, "failed to update job status")
   692  			}
   693  		}
   694  		return nil
   695  	}); err != nil {
   696  		return err
   697  	}
   698  
   699  	// Wait for the state change to propagate to all leases.
   700  	return sc.waitToUpdateLeases(ctx, sc.tableID)
   701  }
   702  
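// To summarize the transitions RunStateMachineBeforeBackfill applies (derived
// from the code above), each mutation with the target mutation ID moves one
// step before the backfill runs:
//
//	ADD:  DELETE_ONLY           -> DELETE_AND_WRITE_ONLY
//	DROP: DELETE_AND_WRITE_ONLY -> DELETE_ONLY
//
// followed by a wait for every node to observe the bumped descriptor version.
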
   703  // waitToUpdateLeases waits until the entire cluster has been updated to the
   704  // latest version of the table descriptor.
   705  func (sc *SchemaChanger) waitToUpdateLeases(ctx context.Context, tableID sqlbase.ID) error {
   706  	// Aggressively retry because there might be a user waiting for the
   707  	// schema change to complete.
   708  	retryOpts := retry.Options{
   709  		InitialBackoff: 20 * time.Millisecond,
   710  		MaxBackoff:     200 * time.Millisecond,
   711  		Multiplier:     2,
   712  	}
   713  	log.Infof(ctx, "waiting for a single version of table %d...", tableID)
   714  	version, err := sc.leaseMgr.WaitForOneVersion(ctx, tableID, retryOpts)
   715  	log.Infof(ctx, "waiting for a single version of table %d... done (at v %d)", tableID, version)
   716  	return err
   717  }
   718  
   719  // done finalizes the mutations (adds new cols/indexes to the table).
   720  // It ensures that all nodes are on the current (pre-update) version of the
   721  // schema.
   722  // It also kicks off GC jobs as needed.
   723  // It returns the updated descriptor.
   724  func (sc *SchemaChanger) done(ctx context.Context) (*sqlbase.ImmutableTableDescriptor, error) {
   725  	isRollback := false
   726  
   727  	// Get the other tables whose foreign key backreferences or interleave parents
   728  	// need to be updated. We use PublishMultiple to apply all of the updates atomically.
   729  	var fksByBackrefTable map[sqlbase.ID][]*sqlbase.ConstraintToUpdate
   730  	var interleaveParents map[sqlbase.ID]struct{}
   731  	err := sc.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   732  		fksByBackrefTable = make(map[sqlbase.ID][]*sqlbase.ConstraintToUpdate)
   733  		interleaveParents = make(map[sqlbase.ID]struct{})
   734  
   735  		desc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.execCfg.Codec, sc.tableID)
   736  		if err != nil {
   737  			return err
   738  		}
   739  		for _, mutation := range desc.Mutations {
   740  			if mutation.MutationID != sc.mutationID {
   741  				break
   742  			}
   743  			if constraint := mutation.GetConstraint(); constraint != nil &&
   744  				constraint.ConstraintType == sqlbase.ConstraintToUpdate_FOREIGN_KEY &&
   745  				mutation.Direction == sqlbase.DescriptorMutation_ADD &&
   746  				constraint.ForeignKey.Validity == sqlbase.ConstraintValidity_Unvalidated {
   747  				// Add backref table to referenced table with an unvalidated foreign key constraint
   748  				fk := &constraint.ForeignKey
   749  				if fk.ReferencedTableID != desc.ID {
   750  					fksByBackrefTable[constraint.ForeignKey.ReferencedTableID] = append(fksByBackrefTable[constraint.ForeignKey.ReferencedTableID], constraint)
   751  				}
   752  			} else if swap := mutation.GetPrimaryKeySwap(); swap != nil {
   753  				// If any old indexes (including the old primary index) being rewritten are interleaved
   754  				// children, we will have to update their parents as well.
   755  				for _, idxID := range append([]sqlbase.IndexID{swap.OldPrimaryIndexId}, swap.OldIndexes...) {
   756  					oldIndex, err := desc.FindIndexByID(idxID)
   757  					if err != nil {
   758  						return err
   759  					}
   760  					if len(oldIndex.Interleave.Ancestors) != 0 {
   761  						ancestor := oldIndex.Interleave.Ancestors[len(oldIndex.Interleave.Ancestors)-1]
   762  						if ancestor.TableID != desc.ID {
   763  							interleaveParents[ancestor.TableID] = struct{}{}
   764  						}
   765  					}
   766  				}
   767  				// Because we are not currently supporting primary key changes on tables/indexes
   768  				// that are interleaved parents, we don't check oldPrimaryIndex.InterleavedBy.
   769  			}
   770  		}
   771  		return nil
   772  	})
   773  	if err != nil {
   774  		return nil, err
   775  	}
   776  	tableIDsToUpdate := make([]sqlbase.ID, 0, len(fksByBackrefTable)+1)
   777  	tableIDsToUpdate = append(tableIDsToUpdate, sc.tableID)
   778  	for id := range fksByBackrefTable {
   779  		tableIDsToUpdate = append(tableIDsToUpdate, id)
   780  	}
   781  	for id := range interleaveParents {
   782  		if _, ok := fksByBackrefTable[id]; !ok {
   783  			tableIDsToUpdate = append(tableIDsToUpdate, id)
   784  		}
   785  	}
   786  
   787  	// Jobs (for GC, etc.) that need to be started immediately after the table
   788  	// descriptor updates are published.
   789  	var childJobs []*jobs.StartableJob
   790  	update := func(txn *kv.Txn, descs map[sqlbase.ID]*sqlbase.MutableTableDescriptor) error {
   791  		// Reset the vars here because the update function can be called multiple times in a retry.
   792  		isRollback = false
   793  		childJobs = nil
   794  
   795  		i := 0
   796  		scDesc, ok := descs[sc.tableID]
   797  		if !ok {
   798  			return errors.AssertionFailedf("required table with ID %d not provided to update closure", sc.tableID)
   799  		}
   800  
   801  		for _, mutation := range scDesc.Mutations {
   802  			if mutation.MutationID != sc.mutationID {
   803  				// Mutations are applied in a FIFO order. Only apply the first set of
   804  				// mutations if they have the mutation ID we're looking for.
   805  				break
   806  			}
   807  			isRollback = mutation.Rollback
   808  			if indexDesc := mutation.GetIndex(); mutation.Direction == sqlbase.DescriptorMutation_DROP &&
   809  				indexDesc != nil {
   810  				if canClearRangeForDrop(indexDesc) {
   811  					// Appending to GCMutations is how we keep track of dropped index
   812  					// names (for, e.g., zone config lookups), even though in the absence
   813  					// of a GC job there would be nothing to clean them up.
   814  					scDesc.GCMutations = append(
   815  						scDesc.GCMutations,
   816  						sqlbase.TableDescriptor_GCDescriptorMutation{
   817  							IndexID: indexDesc.ID,
   818  						})
   819  
   820  					dropTime := timeutil.Now().UnixNano()
   821  					indexGCDetails := jobspb.SchemaChangeGCDetails{
   822  						Indexes: []jobspb.SchemaChangeGCDetails_DroppedIndex{
   823  							{
   824  								IndexID:  indexDesc.ID,
   825  								DropTime: dropTime,
   826  							},
   827  						},
   828  						ParentID: sc.tableID,
   829  					}
   830  
   831  					description := sc.job.Payload().Description
   832  					if isRollback {
   833  						description = "ROLLBACK of " + description
   834  					}
   835  					gcJobRecord := CreateGCJobRecord(description, sc.job.Payload().Username, indexGCDetails)
   836  					indexGCJob, err := sc.jobRegistry.CreateStartableJobWithTxn(ctx, gcJobRecord, txn, nil /* resultsCh */)
   837  					if err != nil {
   838  						return err
   839  					}
   840  					log.VEventf(ctx, 2, "created index GC job %d", *indexGCJob.ID())
   841  					childJobs = append(childJobs, indexGCJob)
   842  				}
   843  			}
   844  			if constraint := mutation.GetConstraint(); constraint != nil &&
   845  				constraint.ConstraintType == sqlbase.ConstraintToUpdate_FOREIGN_KEY &&
   846  				mutation.Direction == sqlbase.DescriptorMutation_ADD &&
   847  				constraint.ForeignKey.Validity == sqlbase.ConstraintValidity_Unvalidated {
   848  				// Add backreference on the referenced table (which could be the same table)
   849  				backrefTable, ok := descs[constraint.ForeignKey.ReferencedTableID]
   850  				if !ok {
   851  					return errors.AssertionFailedf("required table with ID %d not provided to update closure", constraint.ForeignKey.ReferencedTableID)
   852  				}
   853  				backrefTable.InboundFKs = append(backrefTable.InboundFKs, constraint.ForeignKey)
   854  			}
   855  
   856  			// Some primary key change specific operations need to happen before
   857  			// and after the index swap occurs.
   858  			if pkSwap := mutation.GetPrimaryKeySwap(); pkSwap != nil {
   859  				// We might have to update some zone configs for indexes that are
   860  				// being rewritten. It is important that this is done _before_ the
   861  				// index swap occurs. The logic that generates spans for subzone
   862  				// configurations removes spans for indexes in the dropping state,
   863  				// which we don't want. So, set up the zone configs before we swap.
   864  				if err := sc.maybeUpdateZoneConfigsForPKChange(
   865  					ctx, txn, sc.execCfg, scDesc.TableDesc(), pkSwap); err != nil {
   866  					return err
   867  				}
   868  			}
   869  
   870  			if err := scDesc.MakeMutationComplete(mutation); err != nil {
   871  				return err
   872  			}
   873  
   874  			if pkSwap := mutation.GetPrimaryKeySwap(); pkSwap != nil {
   875  				if fn := sc.testingKnobs.RunBeforePrimaryKeySwap; fn != nil {
   876  					fn()
   877  				}
   878  				// If any old index had an interleaved parent, remove the
   879  				// backreference from the parent.
   880  				// N.B. This logic needs to be kept up to date with the
   881  				// corresponding piece in runSchemaChangesInTxn.
   882  				for _, idxID := range append(
   883  					[]sqlbase.IndexID{pkSwap.OldPrimaryIndexId}, pkSwap.OldIndexes...) {
   884  					oldIndex, err := scDesc.FindIndexByID(idxID)
   885  					if err != nil {
   886  						return err
   887  					}
   888  					if len(oldIndex.Interleave.Ancestors) != 0 {
   889  						ancestorInfo := oldIndex.Interleave.Ancestors[len(oldIndex.Interleave.Ancestors)-1]
   890  						ancestor := descs[ancestorInfo.TableID]
   891  						ancestorIdx, err := ancestor.FindIndexByID(ancestorInfo.IndexID)
   892  						if err != nil {
   893  							return err
   894  						}
   895  						foundAncestor := false
   896  						for k, ref := range ancestorIdx.InterleavedBy {
   897  							if ref.Table == scDesc.ID && ref.Index == oldIndex.ID {
   898  								if foundAncestor {
   899  									return errors.AssertionFailedf(
   900  										"ancestor entry in %s for %s@%s found more than once",
   901  										ancestor.Name, scDesc.Name, oldIndex.Name)
   902  								}
   903  								ancestorIdx.InterleavedBy = append(
   904  									ancestorIdx.InterleavedBy[:k], ancestorIdx.InterleavedBy[k+1:]...)
   905  								foundAncestor = true
   906  							}
   907  						}
   908  					}
   909  				}
   910  				// If we performed MakeMutationComplete on a PrimaryKeySwap mutation, then we need to start
   911  				// a job for the index deletion mutations that the primary key swap mutation added, if any.
   912  				if childJobs, err = sc.queueCleanupJobs(ctx, scDesc, txn, childJobs); err != nil {
   913  					return err
   914  				}
   915  			}
   916  
   917  			if computedColumnSwap := mutation.GetComputedColumnSwap(); computedColumnSwap != nil {
   918  				if fn := sc.testingKnobs.RunBeforeComputedColumnSwap; fn != nil {
   919  					fn()
   920  				}
   921  
   922  				// If we performed MakeMutationComplete on a computed column swap, then
   923  				// we need to start a job for the column deletion that the swap mutation
   924  				// added if any.
   925  				if childJobs, err = sc.queueCleanupJobs(ctx, scDesc, txn, childJobs); err != nil {
   926  					return err
   927  				}
   928  			}
   929  			i++
   930  		}
   931  		if i == 0 {
   932  			// The table descriptor is unchanged. Don't let Publish() increment
   933  			// the version.
   934  			return lease.ErrDidntUpdateDescriptor
   935  		}
   936  		// Trim the executed mutations from the descriptor.
   937  		scDesc.Mutations = scDesc.Mutations[i:]
   938  
   939  		for i, g := range scDesc.MutationJobs {
   940  			if g.MutationID == sc.mutationID {
   941  				// Trim the executed mutation group from the descriptor.
   942  				scDesc.MutationJobs = append(scDesc.MutationJobs[:i], scDesc.MutationJobs[i+1:]...)
   943  				break
   944  			}
   945  		}
   946  		return nil
   947  	}
   948  
   949  	descs, err := sc.leaseMgr.PublishMultiple(ctx, tableIDsToUpdate, update, func(txn *kv.Txn) error {
   950  		schemaChangeEventType := EventLogFinishSchemaChange
   951  		if isRollback {
   952  			schemaChangeEventType = EventLogFinishSchemaRollback
   953  		}
   954  
   955  		// Log "Finish Schema Change" or "Finish Schema Change Rollback"
   956  		// event. Only the table ID and mutation ID are logged; this can
   957  		// be correlated with the DDL statement that initiated the change
   958  		// using the mutation id.
   959  		return MakeEventLogger(sc.execCfg).InsertEventRecord(
   960  			ctx,
   961  			txn,
   962  			schemaChangeEventType,
   963  			int32(sc.tableID),
   964  			int32(sc.sqlInstanceID),
   965  			struct {
   966  				MutationID uint32
   967  			}{uint32(sc.mutationID)},
   968  		)
   969  	})
   970  	if fn := sc.testingKnobs.RunBeforeChildJobs; fn != nil {
   971  		if len(childJobs) != 0 {
   972  			fn()
   973  		}
   974  	}
   975  	if err != nil {
   976  		for _, job := range childJobs {
   977  			if rollbackErr := job.CleanupOnRollback(ctx); rollbackErr != nil {
   978  				log.Warningf(ctx, "failed to clean up job: %v", rollbackErr)
   979  			}
   980  		}
   981  		return nil, err
   982  	}
   983  	for _, job := range childJobs {
   984  		if _, err := job.Start(ctx); err != nil {
   985  			log.Warningf(ctx, "starting job %d failed with error: %v", *job.ID(), err)
   986  		}
   987  		log.VEventf(ctx, 2, "started job %d", *job.ID())
   988  	}
   989  	return descs[sc.tableID], nil
   990  }
   991  
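// In short, done (above) publishes the completed mutations: it trims them from
// the descriptor, adds FK backreferences and updates interleave parents on the
// other affected tables in the same PublishMultiple call, queues GC jobs for
// dropped indexes, and starts those child jobs only after the publishing
// transaction has committed.
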
   992  // maybeUpdateZoneConfigsForPKChange moves zone configs for any rewritten
   993  // indexes from the old index over to the new index.
   994  func (sc *SchemaChanger) maybeUpdateZoneConfigsForPKChange(
   995  	ctx context.Context,
   996  	txn *kv.Txn,
   997  	execCfg *ExecutorConfig,
   998  	table *sqlbase.TableDescriptor,
   999  	swapInfo *sqlbase.PrimaryKeySwap,
  1000  ) error {
  1001  	zone, err := getZoneConfigRaw(ctx, txn, table.ID)
  1002  	if err != nil {
  1003  		return err
  1004  	}
  1005  
  1006  	// If this table doesn't have a zone attached to it, don't do anything.
  1007  	if zone == nil {
  1008  		return nil
  1009  	}
  1010  
  1011  	// For each rewritten index, point its subzones for the old index at the
  1012  	// new index.
  1013  	for i, oldID := range swapInfo.OldIndexes {
  1014  		for j := range zone.Subzones {
  1015  			subzone := &zone.Subzones[j]
  1016  			if subzone.IndexID == uint32(oldID) {
  1017  				// If we find a subzone matching an old index, copy its subzone
  1018  				// into a new subzone with the new index's ID.
  1019  				subzoneCopy := *subzone
  1020  				subzoneCopy.IndexID = uint32(swapInfo.NewIndexes[i])
  1021  				zone.SetSubzone(subzoneCopy)
  1022  			}
  1023  		}
  1024  	}
  1025  
  1026  	// Write the zone back. This call regenerates the index spans that apply
  1027  	// to each partition in the index.
  1028  	_, err = writeZoneConfig(ctx, txn, table.ID, table, zone, execCfg, false)
  1029  	if err != nil && !sqlbase.IsCCLRequiredError(err) {
  1030  		return err
  1031  	}
  1032  
  1033  	return nil
  1034  }
  1035  
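// As a concrete illustration of the loop above (values hypothetical): if old
// index 2 was rewritten as new index 5 and the zone config contains a subzone
// for IndexID 2, a copy of that subzone is installed with IndexID 5, so the
// rewritten index keeps its zone configuration.
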
  1036  // notFirstInLine returns the table descriptor, and true whenever the schema
  1037  // change has been queued up for execution behind another schema change.
  1038  func (sc *SchemaChanger) notFirstInLine(
  1039  	ctx context.Context,
  1040  ) (*sqlbase.TableDescriptor, bool, error) {
  1041  	var notFirst bool
  1042  	var desc *sqlbase.TableDescriptor
  1043  	err := sc.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1044  		notFirst = false
  1045  		var err error
  1046  		desc, err = sqlbase.GetTableDescFromID(ctx, txn, sc.execCfg.Codec, sc.tableID)
  1047  		if err != nil {
  1048  			return err
  1049  		}
  1050  		for i, mutation := range desc.Mutations {
  1051  			if mutation.MutationID == sc.mutationID {
  1052  				notFirst = i != 0
  1053  				break
  1054  			}
  1055  		}
  1056  		return nil
  1057  	})
  1058  	return desc, notFirst, err
  1059  }
  1060  
  1061  // runStateMachineAndBackfill runs the schema change state machine followed by
  1062  // the backfill.
  1063  func (sc *SchemaChanger) runStateMachineAndBackfill(ctx context.Context) error {
  1064  	if fn := sc.testingKnobs.RunBeforePublishWriteAndDelete; fn != nil {
  1065  		fn()
  1066  	}
  1067  	// Run through mutation state machine before backfill.
  1068  	if err := sc.RunStateMachineBeforeBackfill(ctx); err != nil {
  1069  		return err
  1070  	}
  1071  
  1072  	// Run backfill(s).
  1073  	if err := sc.runBackfill(ctx); err != nil {
  1074  		return err
  1075  	}
  1076  
  1077  	// Mark the mutations as completed.
  1078  	_, err := sc.done(ctx)
  1079  	return err
  1080  }
  1081  
  1082  func (sc *SchemaChanger) refreshStats() {
  1083  	// Initiate an asynchronous run of CREATE STATISTICS. We use a large number
  1084  	// for rowsAffected because we want to make sure that stats always get
  1085  	// created/refreshed here.
  1086  	sc.execCfg.StatsRefresher.NotifyMutation(sc.tableID, math.MaxInt32 /* rowsAffected */)
  1087  }
  1088  
  1089  // maybeReverseMutations reverses the direction of all the mutations with the
  1090  // mutationID. This is called after hitting an irrecoverable error while
  1091  // applying a schema change. If a column being added is reversed and dropped,
  1092  // all new indexes referencing the column will also be dropped.
  1093  func (sc *SchemaChanger) maybeReverseMutations(ctx context.Context, causingError error) error {
  1094  	if fn := sc.testingKnobs.RunBeforeMutationReversal; fn != nil {
  1095  		if err := fn(*sc.job.ID()); err != nil {
  1096  			return err
  1097  		}
  1098  	}
  1099  
  1100  	// Get the other tables whose foreign key backreferences need to be removed.
  1101  	var fksByBackrefTable map[sqlbase.ID][]*sqlbase.ConstraintToUpdate
  1102  	alreadyReversed := false
  1103  	err := sc.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1104  		fksByBackrefTable = make(map[sqlbase.ID][]*sqlbase.ConstraintToUpdate)
  1105  		var err error
  1106  		desc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.execCfg.Codec, sc.tableID)
  1107  		if err != nil {
  1108  			return err
  1109  		}
  1110  		for _, mutation := range desc.Mutations {
  1111  			if mutation.MutationID != sc.mutationID {
  1112  				break
  1113  			}
  1114  			if mutation.Rollback {
  1115  				// Mutation is already reversed, so we don't need to do any more work.
  1116  				// This can happen if the mutations were already reversed, but the
  1117  				// job was adopted again before the rollback completed.
  1118  				alreadyReversed = true
  1119  				return nil
  1120  			}
  1121  			if constraint := mutation.GetConstraint(); constraint != nil &&
  1122  				constraint.ConstraintType == sqlbase.ConstraintToUpdate_FOREIGN_KEY &&
  1123  				mutation.Direction == sqlbase.DescriptorMutation_ADD &&
  1124  				constraint.ForeignKey.Validity == sqlbase.ConstraintValidity_Validating {
  1125  				fk := &constraint.ForeignKey
  1126  				if fk.ReferencedTableID != desc.ID {
  1127  					fksByBackrefTable[constraint.ForeignKey.ReferencedTableID] = append(fksByBackrefTable[constraint.ForeignKey.ReferencedTableID], constraint)
  1128  				}
  1129  			}
  1130  		}
  1131  		return nil
  1132  	})
  1133  	if err != nil {
  1134  		return err
  1135  	}
  1136  	if alreadyReversed {
  1137  		return nil
  1138  	}
  1139  	tableIDsToUpdate := make([]sqlbase.ID, 0, len(fksByBackrefTable)+1)
  1140  	tableIDsToUpdate = append(tableIDsToUpdate, sc.tableID)
  1141  	for id := range fksByBackrefTable {
  1142  		tableIDsToUpdate = append(tableIDsToUpdate, id)
  1143  	}
  1144  
  1145  	// Create the update closure for the table and all other tables with backreferences.
  1146  	var droppedMutations map[sqlbase.MutationID]struct{}
  1147  	update := func(_ *kv.Txn, descs map[sqlbase.ID]*sqlbase.MutableTableDescriptor) error {
  1148  		scDesc, ok := descs[sc.tableID]
  1149  		if !ok {
  1150  			return errors.AssertionFailedf("required table with ID %d not provided to update closure", sc.tableID)
  1151  		}
  1152  		// Keep track of the column mutations being reversed so that indexes
  1153  		// referencing them can be dropped.
  1154  		columns := make(map[string]struct{})
  1155  		droppedMutations = nil
  1156  
  1157  		for i, mutation := range scDesc.Mutations {
  1158  			if mutation.MutationID != sc.mutationID {
  1159  				// Only reverse the first set of mutations if they have the
  1160  				// mutation ID we're looking for.
  1161  				if i == 0 {
  1162  					return lease.ErrDidntUpdateDescriptor
  1163  				}
  1164  				break
  1165  			}
  1166  
  1167  			if mutation.Rollback {
  1168  				// This can never actually happen, since we already checked for this
  1169  				// case above.
  1170  				return errors.AssertionFailedf("mutation already rolled back: %v", mutation)
  1171  			}
  1172  
  1173  			log.Warningf(ctx, "reverse schema change mutation: %+v", mutation)
  1174  			scDesc.Mutations[i], columns = sc.reverseMutation(mutation, false /*notStarted*/, columns)
  1175  
  1176  			// If the mutation is for validating a constraint that is being added,
  1177  			// drop the constraint because validation has failed.
  1178  			if constraint := mutation.GetConstraint(); constraint != nil &&
  1179  				mutation.Direction == sqlbase.DescriptorMutation_ADD {
  1180  				log.Warningf(ctx, "dropping constraint %+v", constraint)
  1181  				if err := sc.maybeDropValidatingConstraint(ctx, scDesc, constraint); err != nil {
  1182  					return err
  1183  				}
  1184  				// Get the foreign key backreferences to remove.
  1185  				if constraint.ConstraintType == sqlbase.ConstraintToUpdate_FOREIGN_KEY {
  1186  					fk := &constraint.ForeignKey
  1187  					backrefTable, ok := descs[fk.ReferencedTableID]
  1188  					if !ok {
  1189  						return errors.AssertionFailedf("required table with ID %d not provided to update closure", fk.ReferencedTableID)
  1190  					}
  1191  					if err := removeFKBackReferenceFromTable(backrefTable, fk.Name, scDesc.TableDesc()); err != nil {
  1192  						return err
  1193  					}
  1194  				}
  1195  			}
  1196  			scDesc.Mutations[i].Rollback = true
  1197  		}
  1198  
  1199  		// Delete all mutations that reference any of the reversed columns
  1200  		// by running a graph traversal of the mutations.
  1201  		if len(columns) > 0 {
  1202  			var err error
  1203  			droppedMutations, err = sc.deleteIndexMutationsWithReversedColumns(ctx, scDesc, columns)
  1204  			if err != nil {
  1205  				return err
  1206  			}
  1207  		}
  1208  
  1209  		// PublishMultiple() will increment the version.
  1210  		return nil
  1211  	}
  1212  
  1213  	_, err = sc.leaseMgr.PublishMultiple(ctx, tableIDsToUpdate, update, func(txn *kv.Txn) error {
  1214  		// Read the table descriptor from the store. The Version of the
  1215  		// descriptor has already been incremented in the transaction and
  1216  		// this descriptor can be modified without incrementing the version.
  1217  		tableDesc, err := sqlbase.GetTableDescFromID(ctx, txn, sc.execCfg.Codec, sc.tableID)
  1218  		if err != nil {
  1219  			return err
  1220  		}
  1221  
  1222  		// Mark the schema change job as failed and create a rollback job.
  1223  		err = sc.updateJobForRollback(ctx, txn, tableDesc)
  1224  		if err != nil {
  1225  			return err
  1226  		}
  1227  
  1228  		// Mark other reversed mutation jobs as failed.
  1229  		for m := range droppedMutations {
  1230  			jobID, err := getJobIDForMutationWithDescriptor(ctx, tableDesc, m)
  1231  			if err != nil {
  1232  				return err
  1233  			}
  1234  			if err := sc.jobRegistry.Failed(ctx, txn, jobID, causingError); err != nil {
  1235  				return err
  1236  			}
  1237  		}
  1238  
  1239  		// Log "Reverse Schema Change" event. Only the causing error and the
  1240  		// mutation ID are logged; this can be correlated with the DDL statement
  1241  		// that initiated the change using the mutation id.
  1242  		return MakeEventLogger(sc.execCfg).InsertEventRecord(
  1243  			ctx,
  1244  			txn,
  1245  			EventLogReverseSchemaChange,
  1246  			int32(sc.tableID),
  1247  			int32(sc.sqlInstanceID),
  1248  			struct {
  1249  				Error      string
  1250  				MutationID uint32
  1251  			}{fmt.Sprintf("%+v", causingError), uint32(sc.mutationID)},
  1252  		)
  1253  	})
  1254  	if err != nil {
  1255  		return err
  1256  	}
  1257  
  1258  	if err := sc.waitToUpdateLeases(ctx, sc.tableID); err != nil {
  1259  		return err
  1260  	}
  1261  	for id := range fksByBackrefTable {
  1262  		if err := sc.waitToUpdateLeases(ctx, id); err != nil {
  1263  			return err
  1264  		}
  1265  	}
  1266  
  1267  	return nil
  1268  }
  1269  
  1270  // updateJobForRollback updates the schema change job in the case of a rollback.
  1271  func (sc *SchemaChanger) updateJobForRollback(
  1272  	ctx context.Context, txn *kv.Txn, tableDesc *sqlbase.TableDescriptor,
  1273  ) error {
  1274  	// Initialize refresh spans to scan the entire table.
  1275  	span := tableDesc.PrimaryIndexSpan(sc.execCfg.Codec)
  1276  	var spanList []jobspb.ResumeSpanList
  1277  	for _, m := range tableDesc.Mutations {
  1278  		if m.MutationID == sc.mutationID {
  1279  			spanList = append(spanList,
  1280  				jobspb.ResumeSpanList{
  1281  					ResumeSpans: []roachpb.Span{span},
  1282  				},
  1283  			)
  1284  		}
  1285  	}
  1286  	if err := sc.job.WithTxn(txn).SetDetails(
  1287  		ctx, jobspb.SchemaChangeDetails{
  1288  			TableID:        sc.tableID,
  1289  			MutationID:     sc.mutationID,
  1290  			ResumeSpanList: spanList,
  1291  			FormatVersion:  jobspb.JobResumerFormatVersion,
  1292  		},
  1293  	); err != nil {
  1294  		return err
  1295  	}
  1296  	if err := sc.job.WithTxn(txn).SetProgress(ctx, jobspb.SchemaChangeProgress{}); err != nil {
  1297  		return err
  1298  	}
  1299  	// Set the transaction back to nil so that this job can be used in other
  1300  	// transactions.
  1301  	sc.job.WithTxn(nil)
  1302  
  1303  	return nil
  1304  }
  1305  
  1306  func (sc *SchemaChanger) maybeDropValidatingConstraint(
  1307  	ctx context.Context, desc *MutableTableDescriptor, constraint *sqlbase.ConstraintToUpdate,
  1308  ) error {
  1309  	switch constraint.ConstraintType {
  1310  	case sqlbase.ConstraintToUpdate_CHECK, sqlbase.ConstraintToUpdate_NOT_NULL:
  1311  		if constraint.Check.Validity == sqlbase.ConstraintValidity_Unvalidated {
  1312  			return nil
  1313  		}
  1314  		for j, c := range desc.Checks {
  1315  			if c.Name == constraint.Check.Name {
  1316  				desc.Checks = append(desc.Checks[:j], desc.Checks[j+1:]...)
  1317  				return nil
  1318  			}
  1319  		}
  1320  		if log.V(2) {
  1321  			log.Infof(
  1322  				ctx,
  1323  				"attempted to drop constraint %s, but it hadn't been added to the table descriptor yet",
  1324  				constraint.Check.Name,
  1325  			)
  1326  		}
  1327  	case sqlbase.ConstraintToUpdate_FOREIGN_KEY:
  1328  		for i, fk := range desc.OutboundFKs {
  1329  			if fk.Name == constraint.ForeignKey.Name {
  1330  				desc.OutboundFKs = append(desc.OutboundFKs[:i], desc.OutboundFKs[i+1:]...)
  1331  				return nil
  1332  			}
  1333  		}
  1334  		if log.V(2) {
  1335  			log.Infof(
  1336  				ctx,
  1337  				"attempted to drop constraint %s, but it hadn't been added to the table descriptor yet",
  1338  				constraint.ForeignKey.Name,
  1339  			)
  1340  		}
  1341  	default:
  1342  		return errors.AssertionFailedf("unsupported constraint type: %d", errors.Safe(constraint.ConstraintType))
  1343  	}
  1344  	return nil
  1345  }
  1346  
  1347  // deleteIndexMutationsWithReversedColumns deletes mutations with a
  1348  // different mutationID than the schema changer's and with an index that
  1349  // references one of the reversed columns. This is executed as a
  1350  // breadth-first-search graph traversal.
  1351  func (sc *SchemaChanger) deleteIndexMutationsWithReversedColumns(
  1352  	ctx context.Context, desc *sqlbase.MutableTableDescriptor, columns map[string]struct{},
  1353  ) (map[sqlbase.MutationID]struct{}, error) {
  1354  	dropMutations := make(map[sqlbase.MutationID]struct{})
  1355  	// Run a breadth-first-search traversal that reverses mutations.
  1356  	for {
  1357  		start := len(dropMutations)
  1358  		for _, mutation := range desc.Mutations {
  1359  			if mutation.MutationID != sc.mutationID {
  1360  				if idx := mutation.GetIndex(); idx != nil {
  1361  					for _, name := range idx.ColumnNames {
  1362  						if _, ok := columns[name]; ok {
  1363  						// Such an index mutation must have direction ADD and be in
  1364  						// the DELETE_ONLY state: live indexes referencing live
  1365  						// columns cannot be deleted and thus never have direction
  1366  						// DROP, and all mutations with the ADD direction start off
  1367  						// in the DELETE_ONLY state.
  1368  							if mutation.Direction != sqlbase.DescriptorMutation_ADD ||
  1369  								mutation.State != sqlbase.DescriptorMutation_DELETE_ONLY {
  1370  								panic(fmt.Sprintf("mutation in bad state: %+v", mutation))
  1371  							}
  1372  							log.Warningf(ctx, "drop schema change mutation: %+v", mutation)
  1373  							dropMutations[mutation.MutationID] = struct{}{}
  1374  							break
  1375  						}
  1376  					}
  1377  				}
  1378  			}
  1379  		}
  1380  
  1381  		if len(dropMutations) == start {
  1382  			// No more mutations to drop.
  1383  			break
  1384  		}
  1385  		// Drop mutations.
  1386  		newMutations := make([]sqlbase.DescriptorMutation, 0, len(desc.Mutations))
  1387  		for _, mutation := range desc.Mutations {
  1388  			if _, ok := dropMutations[mutation.MutationID]; ok {
  1389  				// Reverse mutation. Update columns to reflect additional
  1390  				// columns that have been purged. This mutation doesn't need
  1391  				// a rollback because it was not started.
  1392  				mutation, columns = sc.reverseMutation(mutation, true /*notStarted*/, columns)
  1393  				// Mark as complete because this mutation needs no backfill.
  1394  				if err := desc.MakeMutationComplete(mutation); err != nil {
  1395  					return nil, err
  1396  				}
  1397  			} else {
  1398  				newMutations = append(newMutations, mutation)
  1399  			}
  1400  		}
  1401  		// Reset mutations.
  1402  		desc.Mutations = newMutations
  1403  	}
  1404  	return dropMutations, nil
  1405  }
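
// A hypothetical walkthrough of the fixed point above: reversing mutation 1
// (ADD COLUMN c) places "c" in columns; the first pass then drops mutation 2
// (ADD INDEX idx (c)), which is still in DELETE_ONLY. If reversing mutation 2
// purged further columns referenced by yet another index mutation, a later
// pass would drop that one as well. The loop stops once a pass finds nothing
// new to drop.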
  1406  
  1407  // reverseMutation reverses a mutation, returning the updated mutation and
  1408  // the updated columns. notStarted is set to true only if the schema change
  1409  // state machine was not started for the mutation.
  1410  func (sc *SchemaChanger) reverseMutation(
  1411  	mutation sqlbase.DescriptorMutation, notStarted bool, columns map[string]struct{},
  1412  ) (sqlbase.DescriptorMutation, map[string]struct{}) {
  1413  	switch mutation.Direction {
  1414  	case sqlbase.DescriptorMutation_ADD:
  1415  		mutation.Direction = sqlbase.DescriptorMutation_DROP
  1416  		// A column ADD being reversed gets placed in the map.
  1417  		if col := mutation.GetColumn(); col != nil {
  1418  			columns[col.Name] = struct{}{}
  1419  		}
  1420  		// PrimaryKeySwap and ComputedColumnSwap don't have a concept of the state machine.
  1421  		if pkSwap, computedColumnsSwap :=
  1422  			mutation.GetPrimaryKeySwap(), mutation.GetComputedColumnSwap(); pkSwap != nil || computedColumnsSwap != nil {
  1423  			return mutation, columns
  1424  		}
  1425  
  1426  		if notStarted && mutation.State != sqlbase.DescriptorMutation_DELETE_ONLY {
  1427  			panic(fmt.Sprintf("mutation in bad state: %+v", mutation))
  1428  		}
  1429  
  1430  	case sqlbase.DescriptorMutation_DROP:
  1431  		mutation.Direction = sqlbase.DescriptorMutation_ADD
  1432  		if notStarted && mutation.State != sqlbase.DescriptorMutation_DELETE_AND_WRITE_ONLY {
  1433  			panic(fmt.Sprintf("mutation in bad state: %+v", mutation))
  1434  		}
  1435  	}
  1436  	return mutation, columns
  1437  }
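
// In summary, the transitions above are (sketched):
//
//	ADD  -> DROP   (must be in DELETE_ONLY if not yet started)
//	DROP -> ADD    (must be in DELETE_AND_WRITE_ONLY if not yet started)
//
// except that PrimaryKeySwap and ComputedColumnSwap mutations are flipped to
// DROP without any state assertion, as they have no state machine.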
  1438  
  1439  // CreateGCJobRecord creates the job record for a GC job, setting some
  1440  // properties which are common for all GC jobs.
  1441  func CreateGCJobRecord(
  1442  	originalDescription string, username string, details jobspb.SchemaChangeGCDetails,
  1443  ) jobs.Record {
  1444  	descriptorIDs := make([]sqlbase.ID, 0)
  1445  	if len(details.Indexes) > 0 {
  1446  		descriptorIDs = []sqlbase.ID{details.ParentID}
  1449  	} else {
  1450  		for _, table := range details.Tables {
  1451  			descriptorIDs = append(descriptorIDs, table.ID)
  1452  		}
  1453  	}
  1454  	return jobs.Record{
  1455  		Description:   fmt.Sprintf("GC for %s", originalDescription),
  1456  		Username:      username,
  1457  		DescriptorIDs: descriptorIDs,
  1458  		Details:       details,
  1459  		Progress:      jobspb.SchemaChangeGCProgress{},
  1460  		NonCancelable: true,
  1461  	}
  1462  }
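
// A minimal usage sketch (tableID and dropTime are hypothetical values):
//
//	record := CreateGCJobRecord(
//		"DROP TABLE t",
//		security.RootUser,
//		jobspb.SchemaChangeGCDetails{
//			Tables: []jobspb.SchemaChangeGCDetails_DroppedID{
//				{ID: tableID, DropTime: dropTime},
//			},
//		},
//	)
//
// The resulting record is non-cancelable and describes itself as
// "GC for DROP TABLE t".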
  1463  
  1464  // GCJobTestingKnobs is for testing the Schema Changer GC job.
  1465  // Note that it is defined in this package rather than alongside the GC job
  1466  // to avoid cyclic dependencies.
  1467  type GCJobTestingKnobs struct {
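	// RunBeforeResume is called at the start of the GC job's Resume hook.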
  1468  	RunBeforeResume func(jobID int64) error
  1469  }
  1470  
  1471  // ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.
  1472  func (*GCJobTestingKnobs) ModuleTestingKnobs() {}
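
// A sketch of how a test might install these knobs (assuming the usual
// base.TestingKnobs wiring; the synchronization channel is hypothetical):
//
//	params.Knobs.GCJob = &GCJobTestingKnobs{
//		RunBeforeResume: func(jobID int64) error {
//			<-allowGC // hold the GC job until the test releases it
//			return nil
//		},
//	}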
  1473  
  1474  // SchemaChangerTestingKnobs for testing the schema change execution path
  1475  // through both the synchronous and asynchronous paths.
  1476  type SchemaChangerTestingKnobs struct {
  1477  	// SchemaChangeJobNoOp returning true will cause the job to be a no-op.
  1478  	SchemaChangeJobNoOp func() bool
  1479  
  1480  	// RunBeforePublishWriteAndDelete is called just before publishing the
  1481  	// write+delete state for the schema change.
  1482  	RunBeforePublishWriteAndDelete func()
  1483  
  1484  	// RunBeforeBackfill is called just before starting the backfill.
  1485  	RunBeforeBackfill func() error
  1486  
  1487  	// RunAfterBackfill is called after completing a backfill.
  1488  	RunAfterBackfill func(jobID int64) error
  1489  
  1490  	// RunBeforeIndexBackfill is called just before starting the index backfill, after
  1491  	// fixing the index backfill scan timestamp.
  1492  	RunBeforeIndexBackfill func()
  1493  
  1494  	// RunBeforePrimaryKeySwap is called just before the primary key swap is committed.
  1495  	RunBeforePrimaryKeySwap func()
  1496  
  1497  	// RunBeforeComputedColumnSwap is called just before the computed column swap is committed.
  1498  	RunBeforeComputedColumnSwap func()
  1499  
  1500  	// RunBeforeChildJobs is called just before child jobs are run to clean up
  1501  	// dropped schema elements after a mutation.
  1502  	RunBeforeChildJobs func()
  1503  
  1504  	// RunBeforeIndexValidation is called just before starting the index validation,
  1505  	// after setting the job status to validating.
  1506  	RunBeforeIndexValidation func() error
  1507  
  1508  	// RunBeforeConstraintValidation is called just before starting the checks validation,
  1509  	// after setting the job status to validating.
  1510  	RunBeforeConstraintValidation func() error
  1511  
  1512  	// RunBeforeMutationReversal runs at the beginning of maybeReverseMutations.
  1513  	RunBeforeMutationReversal func(jobID int64) error
  1514  
  1515  	// RunAfterMutationReversal runs in OnFailOrCancel after the mutations have
  1516  	// been reversed.
  1517  	RunAfterMutationReversal func(jobID int64) error
  1518  
  1519  	// RunBeforeOnFailOrCancel runs at the start of the OnFailOrCancel hook.
  1520  	RunBeforeOnFailOrCancel func(jobID int64) error
  1521  
  1522  	// RunAfterOnFailOrCancel runs after the OnFailOrCancel hook.
  1523  	RunAfterOnFailOrCancel func(jobID int64) error
  1524  
  1525  	// RunBeforeResume runs at the start of the Resume hook.
  1526  	RunBeforeResume func(jobID int64) error
  1527  
  1528  	// OldNamesDrainedNotification is called during a schema change,
  1529  	// after all leases on the version of the descriptor with the old
  1530  	// names are gone, and just before the mapping of the old names to the
  1531  	// descriptor ID is deleted.
  1532  	OldNamesDrainedNotification func()
  1533  
  1534  	// WriteCheckpointInterval is the interval after which a checkpoint is
  1535  	// written.
  1536  	WriteCheckpointInterval time.Duration
  1537  
  1538  	// BackfillChunkSize is to be used for all backfill chunked operations.
  1539  	BackfillChunkSize int64
  1540  
  1541  	// TwoVersionLeaseViolation is called whenever a schema change
  1542  	// transaction is unable to commit because it is violating the two
  1543  	// version lease invariant.
  1544  	TwoVersionLeaseViolation func()
  1545  }
  1546  
  1547  // ModuleTestingKnobs is part of the base.ModuleTestingKnobs interface.
  1548  func (*SchemaChangerTestingKnobs) ModuleTestingKnobs() {}
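
// As with the GC knobs above, tests typically install these via
// base.TestingKnobs (a sketch; the hook body and channel are illustrative):
//
//	params.Knobs.SQLSchemaChanger = &SchemaChangerTestingKnobs{
//		BackfillChunkSize: 10,
//		RunBeforeBackfill: func() error {
//			backfillStarted <- struct{}{} // hypothetical test channel
//			return nil
//		},
//	}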
  1549  
  1550  // createSchemaChangeEvalCtx creates an extendedEvalContext to be used for backfills.
  1551  //
  1552  // TODO(andrei): This EvalContext() will be broken for backfills trying to use
  1553  // functions marked with distsqlBlacklist.
  1554  // Also, the SessionTracing inside the context is unrelated to the one
  1555  // used in the surrounding SQL session, so session tracing is unable
  1556  // to capture schema change activity.
  1557  func createSchemaChangeEvalCtx(
  1558  	ctx context.Context,
  1559  	execCfg *ExecutorConfig,
  1560  	ts hlc.Timestamp,
  1561  	ieFactory sqlutil.SessionBoundInternalExecutorFactory,
  1562  ) extendedEvalContext {
  1563  	dummyLocation := time.UTC
  1564  
  1565  	sd := &sessiondata.SessionData{
  1566  		SearchPath: sqlbase.DefaultSearchPath,
  1567  		// The database is not supposed to be needed in schema changes, as there
  1568  		// shouldn't be unqualified identifiers in backfills, and the pure functions
  1569  		// that need it should have already been evaluated.
  1570  		//
  1571  		// TODO(andrei): find a way to assert that this field is indeed not used.
  1572  		// And in fact it is used by `current_schemas()`, which, although it is a
  1573  		// pure function, takes arguments that might be impure (so it can't always
  1574  		// be pre-evaluated).
  1575  		Database:      "",
  1576  		SequenceState: sessiondata.NewSequenceState(),
  1577  		DataConversion: sessiondata.DataConversionConfig{
  1578  			Location: dummyLocation,
  1579  		},
  1580  		User: security.NodeUser,
  1581  	}
  1582  
  1583  	evalCtx := extendedEvalContext{
  1584  		// Make a session tracing object on-the-fly. This is OK
  1585  		// because it sets "enabled: false" and thus none of the
  1586  		// other fields are used.
  1587  		Tracing: &SessionTracing{},
  1588  		ExecCfg: execCfg,
  1589  		EvalContext: tree.EvalContext{
  1590  			SessionData:      sd,
  1591  			InternalExecutor: ieFactory(ctx, sd),
  1592  			// TODO(andrei): This is wrong (just like on the main code path on
  1593  			// setupFlow). Each processor should override Ctx with its own context.
  1594  			Context:            ctx,
  1595  			Planner:            &sqlbase.DummyEvalPlanner{},
  1596  			PrivilegedAccessor: &sqlbase.DummyPrivilegedAccessor{},
  1597  			SessionAccessor:    &sqlbase.DummySessionAccessor{},
  1598  			ClientNoticeSender: &sqlbase.DummyClientNoticeSender{},
  1599  			Sequence:           &sqlbase.DummySequenceOperators{},
  1600  			Tenant:             &sqlbase.DummyTenantOperator{},
  1601  			Settings:           execCfg.Settings,
  1602  			TestingKnobs:       execCfg.EvalContextTestingKnobs,
  1603  			ClusterID:          execCfg.ClusterID(),
  1604  			ClusterName:        execCfg.RPCContext.ClusterName(),
  1605  			NodeID:             execCfg.NodeID,
  1606  			Codec:              execCfg.Codec,
  1607  			Locality:           execCfg.Locality,
  1608  		},
  1609  	}
  1610  	// The backfill is going to use the current timestamp for the various
  1611  	// functions, like now(), that need it.  It's possible that the backfill has
  1612  	// been partially performed already by another SchemaChangeManager with
  1613  	// another timestamp.
  1614  	//
  1615  	// TODO(andrei): Figure out if this is what we want, and whether the
  1616  	// timestamp from the session that enqueued the schema change
  1617  	// is/should be used for impure functions like now().
  1618  	evalCtx.SetTxnTimestamp(timeutil.Unix(0 /* sec */, ts.WallTime))
  1619  	evalCtx.SetStmtTimestamp(timeutil.Unix(0 /* sec */, ts.WallTime))
  1620  
  1621  	return evalCtx
  1622  }
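
// For example, a backfill evaluating a DEFAULT expression that calls now()
// sees the single timestamp fixed above for the entire backfill, rather than
// a per-chunk time (readAsOf below is a hypothetical name):
//
//	evalCtx := createSchemaChangeEvalCtx(ctx, execCfg, readAsOf, ieFactory)
//	// now() resolves to timeutil.Unix(0, readAsOf.WallTime) everywhere.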
  1623  
  1624  type schemaChangeResumer struct {
  1625  	job *jobs.Job
  1626  }
  1627  
  1628  func (r schemaChangeResumer) Resume(
  1629  	ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
  1630  ) error {
  1631  	p := phs.(PlanHookState)
  1632  	details := r.job.Details().(jobspb.SchemaChangeDetails)
  1633  	if p.ExecCfg().SchemaChangerTestingKnobs.SchemaChangeJobNoOp != nil &&
  1634  		p.ExecCfg().SchemaChangerTestingKnobs.SchemaChangeJobNoOp() {
  1635  		return nil
  1636  	}
  1637  	if fn := p.ExecCfg().SchemaChangerTestingKnobs.RunBeforeResume; fn != nil {
  1638  		if err := fn(*r.job.ID()); err != nil {
  1639  			return err
  1640  		}
  1641  	}
  1642  
  1643  	execSchemaChange := func(tableID sqlbase.ID, mutationID sqlbase.MutationID, droppedDatabaseID sqlbase.ID) error {
  1644  		sc := SchemaChanger{
  1645  			tableID:              tableID,
  1646  			mutationID:           mutationID,
  1647  			droppedDatabaseID:    droppedDatabaseID,
  1648  			sqlInstanceID:        p.ExecCfg().NodeID.SQLInstanceID(),
  1649  			db:                   p.ExecCfg().DB,
  1650  			leaseMgr:             p.ExecCfg().LeaseManager,
  1651  			testingKnobs:         p.ExecCfg().SchemaChangerTestingKnobs,
  1652  			distSQLPlanner:       p.DistSQLPlanner(),
  1653  			jobRegistry:          p.ExecCfg().JobRegistry,
  1654  			job:                  r.job,
  1655  			rangeDescriptorCache: p.ExecCfg().RangeDescriptorCache,
  1656  			leaseHolderCache:     p.ExecCfg().LeaseHolderCache,
  1657  			clock:                p.ExecCfg().Clock,
  1658  			settings:             p.ExecCfg().Settings,
  1659  			execCfg:              p.ExecCfg(),
  1660  			ieFactory: func(ctx context.Context, sd *sessiondata.SessionData) sqlutil.InternalExecutor {
  1661  				return r.job.MakeSessionBoundInternalExecutor(ctx, sd)
  1662  			},
  1663  		}
  1664  		opts := retry.Options{
  1665  			InitialBackoff: 100 * time.Millisecond,
  1666  			MaxBackoff:     20 * time.Second,
  1667  			Multiplier:     1.5,
  1668  		}
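		// With these settings, the wait between attempts grows roughly as
		// 100ms, 150ms, 225ms, ... (x1.5 per attempt, with some jitter),
		// capped at 20s per wait.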
  1669  
  1670  		// The schema change may have to be retried if it is not first in line or
  1671  		// for other retriable reasons, so we run it in an exponential backoff retry
  1672  		// loop. The loop terminates only if the context is canceled.
  1673  		var scErr error
  1674  		for r := retry.StartWithCtx(ctx, opts); r.Next(); {
  1675  			// Note that r.Next always returns true on the first run, so exec will
  1676  			// be called at least once before this loop can exit.
  1677  			scErr = sc.exec(ctx)
  1678  			switch {
  1679  			case scErr == nil:
  1680  				return nil
  1681  			case errors.Is(scErr, sqlbase.ErrDescriptorNotFound):
  1682  				// If the table descriptor for the ID can't be found, we assume that
  1683  				// another job to drop the table got to it first, and consider this job
  1684  				// finished.
  1685  				log.Infof(
  1686  					ctx,
  1687  					"descriptor %d not found for schema change processing mutation %d;"+
  1688  						"assuming it was dropped, and exiting",
  1689  					tableID, mutationID,
  1690  				)
  1691  				return nil
  1692  			case !isPermanentSchemaChangeError(scErr):
  1693  				// Check if the error is on a whitelist of errors we should retry on,
  1694  				// including the schema change not having the first mutation in line.
  1695  			default:
  1696  				// All other errors lead to a failed job.
  1697  				return scErr
  1698  			}
  1699  		}
  1700  		// If the context was canceled, the job registry will retry the job. We can
  1701  		// just return the error without wrapping it in a retry error.
  1702  		return scErr
  1703  	}
  1704  
  1705  	// If the dropped database was empty, its zone config was already GC'ed and
  1706  	// there's nothing left to do.
  1707  	if details.DroppedDatabaseID != sqlbase.InvalidID && len(details.DroppedTables) == 0 {
  1708  		return nil
  1709  	}
  1710  
  1711  	// If a database is being dropped, handle this separately by draining names
  1712  	// for all the tables.
  1713  	//
  1714  	// This also covers other cases where we have a leftover 19.2 job that drops
  1715  	// multiple tables in a single job (e.g., TRUNCATE on multiple tables), so
  1716  	// it's possible for DroppedDatabaseID to be unset.
  1717  	if details.DroppedDatabaseID != sqlbase.InvalidID || len(details.DroppedTables) > 1 {
  1718  		for i := range details.DroppedTables {
  1719  			droppedTable := &details.DroppedTables[i]
  1720  			if err := execSchemaChange(droppedTable.ID, sqlbase.InvalidMutationID, details.DroppedDatabaseID); err != nil {
  1721  				return err
  1722  			}
  1723  		}
  1724  		dropTime := timeutil.Now().UnixNano()
  1725  		tablesToGC := make([]jobspb.SchemaChangeGCDetails_DroppedID, len(details.DroppedTables))
  1726  		for i, table := range details.DroppedTables {
  1727  			tablesToGC[i] = jobspb.SchemaChangeGCDetails_DroppedID{ID: table.ID, DropTime: dropTime}
  1728  		}
  1729  		multiTableGCDetails := jobspb.SchemaChangeGCDetails{
  1730  			Tables:   tablesToGC,
  1731  			ParentID: details.DroppedDatabaseID,
  1732  		}
  1733  
  1734  		return startGCJob(
  1735  			ctx,
  1736  			p.ExecCfg().DB,
  1737  			p.ExecCfg().JobRegistry,
  1738  			r.job.Payload().Username,
  1739  			r.job.Payload().Description,
  1740  			multiTableGCDetails,
  1741  		)
  1742  	}
  1743  	if details.TableID == sqlbase.InvalidID {
  1744  		return errors.AssertionFailedf("schema change has no specified database or table(s)")
  1745  	}
  1746  
  1747  	return execSchemaChange(details.TableID, details.MutationID, details.DroppedDatabaseID)
  1748  }
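
// As a concrete example of the dispatch above: a leftover 19.2-style job for
// DROP DATABASE db CASCADE with two tables arrives with DroppedDatabaseID set
// and two DroppedTables entries; each table's schema change is executed in
// turn, and then a single multi-table GC job is queued for both tables.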
  1749  
  1750  // OnFailOrCancel is part of the jobs.Resumer interface.
  1751  func (r schemaChangeResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
  1752  	p := phs.(PlanHookState)
  1753  	details := r.job.Details().(jobspb.SchemaChangeDetails)
  1754  
  1755  	if details.DroppedDatabaseID != sqlbase.InvalidID {
  1756  		// TODO (lucy): Do we need to do anything here?
  1757  		return nil
  1758  	}
  1759  	if details.TableID == sqlbase.InvalidID {
  1760  		return errors.AssertionFailedf("job has no database ID or table ID")
  1761  	}
  1762  	sc := SchemaChanger{
  1763  		tableID:              details.TableID,
  1764  		mutationID:           details.MutationID,
  1765  		sqlInstanceID:        p.ExecCfg().NodeID.SQLInstanceID(),
  1766  		db:                   p.ExecCfg().DB,
  1767  		leaseMgr:             p.ExecCfg().LeaseManager,
  1768  		testingKnobs:         p.ExecCfg().SchemaChangerTestingKnobs,
  1769  		distSQLPlanner:       p.DistSQLPlanner(),
  1770  		jobRegistry:          p.ExecCfg().JobRegistry,
  1771  		job:                  r.job,
  1772  		rangeDescriptorCache: p.ExecCfg().RangeDescriptorCache,
  1773  		leaseHolderCache:     p.ExecCfg().LeaseHolderCache,
  1774  		clock:                p.ExecCfg().Clock,
  1775  		settings:             p.ExecCfg().Settings,
  1776  		execCfg:              p.ExecCfg(),
  1777  		ieFactory: func(ctx context.Context, sd *sessiondata.SessionData) sqlutil.InternalExecutor {
  1778  			return r.job.MakeSessionBoundInternalExecutor(ctx, sd)
  1779  		},
  1780  	}
  1781  
  1782  	if fn := sc.testingKnobs.RunBeforeOnFailOrCancel; fn != nil {
  1783  		if err := fn(*r.job.ID()); err != nil {
  1784  			return err
  1785  		}
  1786  	}
  1787  
  1788  	if r.job.Payload().FinalResumeError == nil {
  1789  		return errors.AssertionFailedf("job failed but had no recorded error")
  1790  	}
  1791  	scErr := errors.DecodeError(ctx, *r.job.Payload().FinalResumeError)
  1792  
  1793  	if rollbackErr := sc.handlePermanentSchemaChangeError(ctx, scErr, p.ExtendedEvalContext()); rollbackErr != nil {
  1794  		switch {
  1795  		case errors.Is(rollbackErr, sqlbase.ErrDescriptorNotFound):
  1796  			// If the table descriptor for the ID can't be found, we assume that
  1797  			// another job to drop the table got to it first, and consider this job
  1798  			// finished.
  1799  			log.Infof(
  1800  				ctx,
  1801  				"descriptor %d not found for rollback of schema change processing mutation %d;"+
  1802  					"assuming it was dropped, and exiting",
  1803  				details.TableID, details.MutationID,
  1804  			)
  1805  		case ctx.Err() != nil:
  1806  			// If the context was canceled, the job registry will retry the job.
  1807  			// We check for this case so that we can just return the error without
  1808  			// wrapping it in a retry error.
  1809  			return rollbackErr
  1810  		case !isPermanentSchemaChangeError(rollbackErr):
  1811  			// Check if the error is on a whitelist of errors we should retry on, and
  1812  			// have the job registry retry.
  1813  			return jobs.NewRetryJobError(rollbackErr.Error())
  1814  		default:
  1815  			// All other errors lead to a failed job.
  1816  			//
  1817  			// TODO (lucy): We have a problem where some schema change rollbacks will
  1818  			// never succeed because the backfiller can't handle rolling back schema
  1819  			// changes that involve dropping a column; see #46541. (This is probably
  1820  			// not the only bug that could cause rollbacks to fail.) For historical
  1821  			// context: This was the case in 19.2 and probably earlier versions as
  1822  			// well, and in those earlier versions, the old async schema changer would
  1823  			// keep retrying the rollback and failing in the background because the
  1824  			// mutation would still be left on the table descriptor. In the present
  1825  			// schema change job, we return an error immediately and put the job in a
  1826  			// terminal state instead of retrying indefinitely, basically to make the
  1827  			// behavior similar to 19.2: If the rollback fails, we end up returning
  1828  			// immediately (instead of retrying and blocking indefinitely), and the
  1829  			// table descriptor is left in a bad state with some mutations that we
  1830  			// can't clean up.
  1831  			//
  1832  			// Ultimately, this is untenable, and we should figure out some better way
  1833  			// of dealing with failed rollbacks. Part of the solution is just making
  1834  			// rollbacks (especially of dropped columns) more robust, but part of it
  1835  			// will likely involve some sort of medium-term solution for cleaning up
  1836  			// mutations that we can't make any progress on (see #47456). In the long
  1837  			// term we'll hopefully be rethinking what it even means to "roll back" a
  1838  			// (transactional) schema change.
  1839  			return rollbackErr
  1840  		}
  1841  	}
  1842  
  1843  	if fn := sc.testingKnobs.RunAfterOnFailOrCancel; fn != nil {
  1844  		if err := fn(*r.job.ID()); err != nil {
  1845  			return err
  1846  		}
  1847  	}
  1848  	return nil
  1849  }
  1850  
  1851  func init() {
  1852  	createResumerFn := func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
  1853  		return &schemaChangeResumer{job: job}
  1854  	}
  1855  	jobs.RegisterConstructor(jobspb.TypeSchemaChange, createResumerFn)
  1856  }
  1857  
  1858  // queueCleanupJobs checks if the completed schema change needs to start a
  1859  // child job to clean up dropped schema elements.
  1860  func (sc *SchemaChanger) queueCleanupJobs(
  1861  	ctx context.Context, scDesc *MutableTableDescriptor, txn *kv.Txn, childJobs []*jobs.StartableJob,
  1862  ) ([]*jobs.StartableJob, error) {
  1863  	// Create jobs for dropped columns / indexes to be deleted.
  1864  	mutationID := scDesc.ClusterVersion.NextMutationID
  1865  	span := scDesc.PrimaryIndexSpan(sc.execCfg.Codec)
  1866  	var spanList []jobspb.ResumeSpanList
  1867  	for j := len(scDesc.ClusterVersion.Mutations); j < len(scDesc.Mutations); j++ {
  1868  		spanList = append(spanList,
  1869  			jobspb.ResumeSpanList{
  1870  				ResumeSpans: roachpb.Spans{span},
  1871  			},
  1872  		)
  1873  	}
  1874  	// Only start a job if spanList has any spans. If len(spanList) == 0, then
  1875  	// no mutations were enqueued by the primary key change.
  1876  	if len(spanList) > 0 {
  1877  		jobRecord := jobs.Record{
  1878  			Description:   fmt.Sprintf("CLEANUP JOB for '%s'", sc.job.Payload().Description),
  1879  			Username:      sc.job.Payload().Username,
  1880  			DescriptorIDs: sqlbase.IDs{scDesc.GetID()},
  1881  			Details: jobspb.SchemaChangeDetails{
  1882  				TableID:        sc.tableID,
  1883  				MutationID:     mutationID,
  1884  				ResumeSpanList: spanList,
  1885  				FormatVersion:  jobspb.JobResumerFormatVersion,
  1886  			},
  1887  			Progress:      jobspb.SchemaChangeProgress{},
  1888  			NonCancelable: true,
  1889  		}
  1890  		job, err := sc.jobRegistry.CreateStartableJobWithTxn(ctx, jobRecord, txn, nil /* resultsCh */)
  1891  		if err != nil {
  1892  			return nil, err
  1893  		}
  1894  		log.VEventf(ctx, 2, "created job %d to drop previous columns "+
  1895  			"and indexes.", *job.ID())
  1896  		childJobs = append(childJobs, job)
  1897  		scDesc.MutationJobs = append(scDesc.MutationJobs, sqlbase.TableDescriptor_MutationJob{
  1898  			MutationID: mutationID,
  1899  			JobID:      *job.ID(),
  1900  		})
  1901  	}
  1902  	return childJobs, nil
  1903  }
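
// A sketch of the intended calling pattern (simplified): the cleanup jobs are
// queued inside the same transaction that publishes the descriptor change, and
// the returned StartableJobs are only started once that transaction commits:
//
//	childJobs, err = sc.queueCleanupJobs(ctx, scDesc, txn, childJobs)
//	// ... commit txn, then start each job in childJobs ...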